diff --git a/nonlinear.tex b/nonlinear.tex
index 2b2400d..b14577b 100755
--- a/nonlinear.tex
+++ b/nonlinear.tex
@@ -1,1010 +1,1010 @@
 %%%%%%%%%%%%%%%%%%%%%%% file template.tex %%%%%%%%%%%%%%%%%%%%%%%%%
 %
 % This is a general template file for the LaTeX package SVJour3
 % for Springer journals.          Springer Heidelberg 2010/09/16
 %
 % Copy it to a new file with a new name and use it as the basis
 % for your article. Delete % signs as needed.
 %
 % This template includes a few options for different layouts and
 % content for various journals. Please consult a previous issue of
 % your journal as needed.
 %
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 %
 % First comes an example EPS file -- just ignore it and
 % proceed on the \documentclass line
 % your LaTeX will extract the file if required
 \begin{filecontents*}{example.eps}
 %!PS-Adobe-3.0 EPSF-3.0
 %%BoundingBox: 19 19 221 221
 %%CreationDate: Mon Sep 29 1997
 %%Creator: programmed by hand (JK)
 %%EndComments
 gsave
 newpath
   20 20 moveto
   20 220 lineto
   220 220 lineto
   220 20 lineto
 closepath
 2 setlinewidth
 gsave
   .4 setgray fill
 grestore
 stroke
 grestore
 \end{filecontents*}
 %
 \RequirePackage{fix-cm}
 %
 %\documentclass{svjour3}                     % onecolumn (standard format)
 %\documentclass[smallcondensed]{svjour3}     % onecolumn (ditto)
 \documentclass[smallextended]{svjour3}       % onecolumn (second format)
 %\documentclass[twocolumn]{svjour3}          % twocolumn
 %
 \smartqed  % flush right qed marks, e.g. at end of proof
 %
 \usepackage{amsmath}
 \usepackage{amssymb}
 \usepackage{graphicx}
 \usepackage{algorithm}
 \usepackage{color}
 
 \usepackage{hyperref}
 
 \hypersetup{
    colorlinks=true,%
    citecolor=black,%
    filecolor=black,%
    linkcolor=black,%
    urlcolor=blue
 }
 
 
 %
 % \usepackage{mathptmx}      % use Times fonts if available on your TeX system
 
 % insert here the call for the packages your document requires
 %\usepackage{latexsym}
 % etc.
 
 \PassOptionsToPackage{normalem}{ulem}
 \newcommand{\minimize}[2]{\ensuremath{\underset{\substack{{#1}}}%
 {\mathrm{minimize}}\;\;#2 }}
 \newcommand{\Argmind}[2]{\ensuremath{\underset{\substack{{#1}}}%
 {\mathrm{Argmin}}\;\;#2 }}
 \newcommand{\Frac}[2]{\displaystyle{\frac{#1}{#2}}} 
 \newcommand{\menge}[2]{\big\{{#1} \;|\; {#2}\big\}} 
 \newcommand{\Pair}[2]{{\big\langle{{#1},{#2}}\big\rangle}}
 \newcommand{\pair}[2]{{\langle{{#1},{#2}}\rangle}}
 \newcommand{\Scal}[2]{{\bigg\langle{{#1}\:\bigg |~{#2}}\bigg\rangle}}
 \newcommand{\Menge}[2]{\bigg\{{#1}~\bigg|~{#2}\bigg\}} 
 \newcommand{\lev}[1]{\ensuremath{\mathrm{lev}_{\leq #1}\:}}
 \newcommand{\enve}[1]{\ensuremath{\operatorname{env}#1}}
 \newcommand{\emp}{\ensuremath{{\varnothing}}}
 \newcommand{\bdry}{\ensuremath{\operatorname{bdry}}}
 \newcommand{\yosi}[2]{\ensuremath{\sideset{^{#2}}{}{\operatorname{}}\!\!#1}}
 \newcommand{\infconv}{\ensuremath{\mbox{\small$\,\square\,$}}}
 \newcommand{\scal}[2]{\left\langle{#1}\mid {#2} \right\rangle} 
 \newcommand{\pscal}[2]{\langle\langle{#1}\mid{#2}\rangle\rangle} 
 \newcommand{\norm}[1]{\|#1\|}
 \newcommand{\vuo}{\ensuremath{\mbox{\footnotesize$\square$}}}
 \newcommand{\exi}{\ensuremath{\exists\,}}
 \newcommand{\zeroun}{\ensuremath{\left]0,1\right[}}   
 \newcommand{\HH}{\ensuremath{\mathcal H}}
 \newcommand{\GG}{\ensuremath{\mathcal G}}
 \newcommand{\YY}{\ensuremath{\mathcal Y}}
 \newcommand{\XX}{\ensuremath{\mathcal X}}
 \newcommand{\bary}{\ensuremath{\widetilde{\boldsymbol{y}}}}
 \newcommand{\barp}{\ensuremath{\widetilde{\boldsymbol{p}}}}
 \newcommand{\barq}{\ensuremath{\widetilde{\boldsymbol{q}}}}
 \newcommand{\barx}{\ensuremath{\widetilde{\boldsymbol{x}}}}
 \newcommand{\BL}{\ensuremath{\EuScript B}\,}
 \newcommand{\BP}{\ensuremath{\EuScript P}}
 \newcommand{\HHH}{\ensuremath{\boldsymbol{\mathcal H}}}
 \newcommand{\WW}{\ensuremath{\boldsymbol{\mathcal W}}}
 \newcommand{\KKK}{\ensuremath{\boldsymbol{\mathcal K}}}
 \newcommand{\GGG}{\ensuremath{\boldsymbol{\mathcal G}}}
 \newcommand{\VV}{\ensuremath{\boldsymbol{V}}}
 \newcommand{\CCC}{\ensuremath{\boldsymbol{C}}}
 \newcommand{\MM}{\ensuremath{\boldsymbol{M}}}
 \newcommand{\SG}{\ensuremath{\boldsymbol{S}}}
 \newcommand{\EE}{\ensuremath{\boldsymbol{E}}}
 \newcommand{\XP}{\ensuremath{\mathcal X^*}}
 \newcommand{\sri}{\ensuremath{\operatorname{sri}}}
 \newcommand{\ri}{\ensuremath{\operatorname{ri}\,}}
 \newcommand{\RR}{\ensuremath{\mathbb R}}
 \newcommand{\KK}{\ensuremath{\mathcal K}}
 \newcommand{\RP}{\ensuremath{\left[0,+\infty\right[}}
 \newcommand{\RPP}{\ensuremath{\,\left]0,+\infty\right[}}
 \newcommand{\RX}{\ensuremath{\,\left]-\infty,+\infty\right]}}
 \newcommand{\NN}{\ensuremath{\mathbb N}}
 \newcommand{\SL}{\ensuremath{\EuScript S}\,}
 \newcommand{\dom}{\ensuremath{\operatorname{dom}}}
 \newcommand{\cont}{\ensuremath{\operatorname{cont}}}
 %\newcommand{\gr}{\ensuremath{\operatorname{gra}}}
 \newcommand{\prox}{\ensuremath{\operatorname{prox}}}
 \newcommand{\Prox}{\ensuremath{\operatorname{Prox}}}
 \newcommand{\intdom}{\ensuremath{\operatorname{int}\operatorname{dom}}\,}
 \newcommand{\inte}{\ensuremath{\operatorname{int}}}
 \newcommand{\cart}{\ensuremath{\mbox{\huge{$\times$}}}}
 \newcommand{\scart}{\ensuremath{\mbox{\LARGE{$\times$}}}}
 \newcommand{\WC}{\ensuremath{{\mathfrak W}}}
 \newcommand{\TT}{\ensuremath{{\mathbf T}}}
 \newcommand{\SC}{\ensuremath{{\mathfrak S}}}
 \newcommand{\rh}{\ensuremath{{\mathrm  a}}}
 \newcommand{\og}{\ensuremath{{\mathrm  b}}}
 \newcommand{\rk}{\ensuremath{{\mathsf {\mathbf  a}}}}
 \newcommand{\ck}{\ensuremath{{\mathsf {\mathbf  u}}}}
 \newcommand{\xk}{\ensuremath{{\mathsf{\mathbf  x}}}}
 \newcommand{\yk}{\ensuremath{{\mathsf{\mathbf  y}}}}
 \newcommand{\RPX}{\ensuremath{{[0,\pinf]}}}
 \newcommand{\card}{\ensuremath{\operatorname{card}}}
 \newcommand{\bd}{\ensuremath{\operatorname{bdry}}}
 \newcommand{\Argmin}{\ensuremath{\operatorname{Argmin}}}
 \newcommand{\argmin}{\ensuremath{\operatorname{argmin}}}
 \newcommand{\ran}{\ensuremath{\operatorname{ran}}}
 \newcommand{\zer}{\ensuremath{\operatorname{zer}}}
 \newcommand{\gra}{\ensuremath{\operatorname{gra}}}
 \newcommand{\conv}{\ensuremath{\operatorname{conv}}}
 \newcommand{\vv}{\ensuremath{\boldsymbol{v}}}
 \newcommand{\sss}{\ensuremath{\boldsymbol{s}}}
 \newcommand{\xx}{\ensuremath{\boldsymbol{x}}}
 \newcommand{\xs}{\ensuremath{\textsf{x}}}
 \newcommand{\xo}{\ensuremath{\overline{\boldsymbol{x}}}}
 \newcommand{\pp}{\ensuremath{\boldsymbol{p}}}
 \newcommand{\qq}{\ensuremath{\boldsymbol{q}}}
 \newcommand{\yy}{\ensuremath{\boldsymbol{y}}}
 \newcommand{\ff}{\ensuremath{\boldsymbol{f}}}
 \newcommand{\hh}{\ensuremath{\boldsymbol{h}}}
 \newcommand{\ttt}{\ensuremath{\boldsymbol{t}}}
 \newcommand{\ee}{\ensuremath{\boldsymbol{e}}}
 \newcommand{\rr}{\ensuremath{\boldsymbol{r}}}
 \newcommand{\gggg}{\ensuremath{\boldsymbol{g}}}
 \newcommand{\zz}{\ensuremath{\boldsymbol{z}}}
 \newcommand{\bb}{\ensuremath{\boldsymbol{b}}}
 \newcommand{\uu}{\ensuremath{\boldsymbol{u}}}
 \newcommand{\cc}{\ensuremath{\boldsymbol{c}}}
 \newcommand{\dd}{\ensuremath{\boldsymbol{d}}}
 \newcommand{\aaa}{\ensuremath{\boldsymbol{a}}}
 \newcommand{\ww}{\ensuremath{\boldsymbol{w}}}
 \newcommand{\BB}{\ensuremath{\boldsymbol{B}}}
 \newcommand{\LL}{\ensuremath{\boldsymbol{L}}}
 \newcommand{\PPP}{\ensuremath{\boldsymbol{\mathsf{P}}}}
 \newcommand{\UU}{\ensuremath{\boldsymbol{U}}}
 \newcommand{\EEE}{\ensuremath{\boldsymbol{E}}}
 \newcommand{\E}{\ensuremath{\mathbf{E}}}
 \newcommand{\D}{\ensuremath{\mathbf{D}}}
 \newcommand{\ep}{\ensuremath{\boldsymbol{\varepsilon}}}
 \newcommand{\RRR}{\ensuremath{\boldsymbol{R}}}
 %\newcommand{\RRR}{\ensuremath{\boldsymbol{R}}}
 \newcommand{\AAA}{\ensuremath{\boldsymbol{A}}}
 \newcommand{\BBB}{\ensuremath{\boldsymbol{B}}}
 \newcommand{\QQ}{\ensuremath{\boldsymbol{Q}}}
 \newcommand{\SSS}{\ensuremath{\boldsymbol{S}}}
 \newcommand{\DD}{\ensuremath{\boldsymbol{D}}}
 \newcommand{\PP}{\ensuremath{\boldsymbol{P}}}
 \newcommand{\FF}{\ensuremath{\boldsymbol{\mathcal{F}}}}
 \newcommand{\cone}{\ensuremath{\operatorname{cone}}}
 \newcommand{\Fix}{\ensuremath{\operatorname{Fix}}}
 \newcommand{\Id}{\ensuremath{\operatorname{Id}}}
 \newcommand{\diam}{\ensuremath{\operatorname{diam}}}
 \newcommand{\IId}{\ensuremath{\boldsymbol{\operatorname{Id}}}}
 \newcommand{\weakly}{\ensuremath{\rightharpoonup}}
 \newcommand{\minf}{\ensuremath{-\infty}}
 \newcommand{\pinf}{\ensuremath{+\infty}}
 \newcommand{\LLambda}{\ensuremath{\boldsymbol{\Lambda}}}
 \newcommand{\vva}{\ensuremath{\boldsymbol{\epsilon}}}
 \newcommand{\trace}{\operatorname{tr}}
 
 
 % AE's commands
 \newcommand{\edita}[1]{{\color{blue} #1}}
 \newcommand{\notea}[1]{{\color{magenta} \textbf{Note: #1}}}
 \newcommand{\ol}{\overline}
 
 
 % please place your own definitions here and don't use \def but
 % \newcommand{}{}
 %
 % Insert the name of "your journal" with
 % \journalname{myjournal}
 %
 \begin{document}
 
 \title{A relaxation of the augmented Lagrange method
 }
 %\subtitle{Do you have a subtitle?\\ If so, write it here}
 
 %\titlerunning{Short form of title}        % if too long for running head
 
 \author{Bang C. Vu\and Alacaoglu Ahmet\and   Sahin M. Fatih\and Alp Yurtsever\and Volkan Cevher \\[5mm]
 }
 
 %\authorrunning{Short form of author list} % if too long for running head
 
 \institute{Laboratory for Information and Inference Systems (LIONS), EPFL, Switzerland\\
               \email{bang.vu@epfl.ch; ahmet.alacaoglu@epfl.ch; mehmet.sahin@epfl.ch; alp.yurtsever@epfl.ch; volkan.cevher@epfl.ch}                  }
 
 \date{Received: date / Accepted: date}
 % The correct dates will be entered by the editor
 
 
 \maketitle
 
 \begin{abstract}
 We propose a splitting method for solving ....
 \keywords{Non-linear constraint \and Non-convex \and Smoothing\and Primal-dual}
 % \PACS{PACS code1 \and PACS code2 \and more}
  \subclass{47H05\and 49M29\and 49M27\and 90C25}
 \end{abstract}
 
 \section{Introduction \label{intro}}
 
 \edita{Various problems in engineering and computational sciences can be cast as non-linear optimization programs, and the design of efficient numerical algorithms to provably solve such problems is therefore of fundamental importance. \notea{cite?}
 %Non-linear programming is a broad discipline in applied mathematics. 
 In this paper, we are particularly interested in solving the optimization program
 \begin{equation}
 \label{prob:01}
 \begin{cases}
 \min_{u} h(u),\\
 A(u) = b,\\
 u\in C,
 \end{cases}
 \end{equation}where $h\in \mathbb{L}_{d}(\lambda_h)$ is a continuously-differentiable function from $\mathbb{R}^d$ to $\RR$ with $\lambda_h$ Lipschitz-continuous gradient, $A: \mathbb{R}^d\rightarrow\mathbb{R}^m$ is a non-linear function, with each component $A_i\in \mathbb{L}_d(\lambda_A) $ for $i\in \{1,\cdots,m\}$. Moreover, 
 $C\subset \RR^d$ is  non-empty, closed, and  convex.
 %\edita{Program \eqref{prob:01} is typically} non-convex and \edita{can be considered as a standard non-linear program, where the set $C$
 %is modeled by inequalities constraints, namely, $C = \menge{u}{g(u) \leq 0}$ for some $g\colon\RR^d\to \RR^s$.}
 
 
 Variants of Program~\eqref{prob:01} naturally arise in a broad range of applications in ?? \notea{Please add some representative applications above alongside some references. } For the sake of brevity, we showcase here one instance of Program~\eqref{prob:01}.}  
  
  \begin{example}\edita{{{\textbf{(Burer-Monteiro factorization)}}}
 Let $\mathbb{S}^{d'\times d'}$ be the space of $d'\times d'$ symmetric matrices, equipped with the standard inner product $\langle x|y\rangle = \text{tr}(x^*y)$.  In particular, when $x\in \mathbb{S}^{d'\times d'}$ is positive semi-definite, we write that $x\succeq 0$.}
 % Consider $\mathcal{C}'\subseteq \mathcal{X}$, and let $h_0\colon\mathcal{X}\to \RR$ be a differentiable 
 %convex function, with $\mathbb{L}_{0}$ Lipschitz-continuous gradient.} 
 % 
 \edita{Consider the program 
 \begin{equation}
 \label{e:fac}
 \begin{cases}
 \min_x h'(x) \\
 A'(x) = b'\\
 x\in C'\\
 x \succeq 0 ,
 \end{cases}
 \end{equation}
 where $h': \mathbb{S}^{d'\times d'} \to \RR$, $A'\colon\mathbb{S}^{d'\times d'}\to\RR^m$, $b'\in\RR^m$, and  $C' \subseteq \mathbb{R}^{d'\times d'}$. 
 
 Variants of Program \eqref{e:fac} are popular in matrix completion and sensing \cite{park2016provable}, with a broad range of applications to problems in collaborative filtering, geophysics, and imaging, among others~\cite{Burer2005,Burer2003,tu2014practical}. Two common choices for $C'$ in Program \eqref{e:fac} are 
 $C' =\{x:  x \ge 0\}$ and $C' = \{x: \text{tr}(x) \le 1\}$ \cite{mixon2016clustering}.} 
 
 \edita{Solving Program \eqref{e:fac} with semi-definite programming is not scalable,  becoming increasingly cumbersome as the dimension $d'$ grows.
 To overcome this {computational bottleneck}, the factorized technique sets $x = uu^\top$ for $u\in \mathbb{R}^{d'\times r}$ and a sufficiently large $r$. The resulting non-convex program is then solved with respect to the much lower-dimensional variable $u$. If we also replace the constraint $uu^\top \in C'$ with $u\in C$ for a properly chosen convex set, the new problem in $u$ matches Program \eqref{prob:01} with $h(u) = h'(uu^\top)$ and $A(u) = A'(uu^\top)$. For our examples of $C'$ above, we might choose $C=\{u:u\ge 0\}$ and $C=\{\|u\|_F^2 \le 1\}$, respectively. Here, $\|\cdot\|_F$ stands for the Frobenius norm. 
 }
  \end{example}
 
 
 \edita{The \emph{augmented Lagrangian method} \cite{luenberger1984linear} is a powerful approach to solve Program \eqref{prob:01}, see Section \ref{sec:related work} for a review of the related literature as well as other approaches to solve Program \eqref{prob:01}}. \edita{ Indeed, for positive $\beta$, it is easy to verify that Program \eqref{prob:01} is equivalent to 
 \begin{align}
 \min_{u\in C} \max_y \, \mathcal{L}_\beta(u,y), 
 \label{eq:minmax}
 \end{align}
 where 
 \begin{align}
 \label{eq:Lagrangian}
 \mathcal{L}_\beta(u,y) := h(u) + \langle A(u)-b, y \rangle + \frac{1}{2\beta}\|A(u)-b\|_2^2,
 \end{align}
 is the augmented Lagrangian corresponding to Program \eqref{prob:01}. The equivalent formulation in Program \eqref{eq:minmax} naturally suggests the following algorithm to solve Program \eqref{prob:01}:}
 \begin{equation}\label{e:exac}
 u_{k+1} \in \underset{u\in C}{\argmin} \, \mathcal{L}_{\beta}(u,y_k), 
 \end{equation} 
 \begin{equation}
 y_{k+1} = y_k+\frac{1}{\beta}(A(u_{k+1}) -b).
 \end{equation}
 In fact, when the penalty parameter $\beta$ is sufficiently small, the augmented Lagrangian has
 a local minimum point near the true optimal point. However, we do not know exactly how small $\beta$ is. 
 Hence, the choice of $\beta$ plays a central role in practice. \notea{Is the last claim really true? Programs \eqref{prob:01} and \eqref{eq:minmax} seem to be equivalent. }
 \edita{In our nonlinear framework, updating $u$ in the augmented Lagrangian method requires solving the non-convex Program \eqref{e:exac} to global optimality, which is often intractable. } \notea{We should discuss fixes to this issue, if any, and explain why they are not satisfactory.} \edita{The key contribution of this paper is to provably and efficiently address this challenge.}
 
 \edita{
 \paragraph{\emph{\textbf{Contributions.}} }
 
 In order to solve Program \eqref{prob:01}, this paper proposes to replace the (intractable) Program \eqref{e:exac} with the update 
 \begin{equation}
 u_{k+1} = P_C (u_k - \gamma_k \nabla \mathcal{L}_{\beta_k} (u_k,y_k)), 
 \label{eq:new update}
 \end{equation}
 for carefully selected  sequences $\{\beta_k,\gamma_k\}_k$. Here, $P_C$ is the orthogonal projection onto the convex set $C$ which is often easy to compute in various applications and consequently the update in \eqref{eq:new update} is inexpensive and fast. 
 
 Put differently, instead of fully solving Program \eqref{e:exac}, this paper  proposes to apply one iteration of the projected gradient algorithm for every update. We provide the convergence guarantees for this fast and scalable new algorithm.
 }\notea{We should summarize the guarantees.}
 
 %%%%%%%%%%%%%%%%%%%%%
 \section{ Preliminaries}
 \notea{I think the whole of this section should move down. The actual results are hidden deep in the paper!}
 
 \paragraph{\textbf{\emph{Notation.}}}
 We use the notations $\scal{\cdot}{\cdot}$ and $\|\cdot\|$ for the \edita{standard inner} product and \edita{the} associated norm on $\RR^d$\edita{, respectively}.
 The conjugate of \edita{a} linear operator $A$ is denoted by $A^\top$.
 Let $C\subset \mathbb{R}^d$ be nonempty, closed, \edita{and convex}. The indicator function of $C$ is denoted by $\iota_{C}$, and the projection onto $C$ is denoted by $P_C$. 
 %The distance function is $d_{\mathcal{C}}\colon u\mapsto \inf_{a\in\mathcal{C}}\|u-a\|$. The projection of $x$ onto $C$ is denoted by $P_Cx$. 
 \edita{For $u\in C$, the tangent cone to $C$ at $u$ is 
 \begin{equation}
 T_{C}(u) = \left\{v\in \RR^d : \exists t > 0 \text{ such that } u+t v \in C\right\}.
 \end{equation}
 The corresponding normal cone $N_C(u)$ at $u$ is the polar of the tangent cone, namely,
 \begin{align}
 N_C(u) = \left\{ v':  \langle v, v' \rangle \le 0,\, \forall v\in T_C(u)  \right\}. 
 \end{align}}
 %The regular normal cone $\hat{N}_C(\overline{u})$ is defined as the dual to the tangent cone, $\hat{N}_C(\overline{u}) = T_{C}(\overline{u})^*$.
 %The (Mordukhovich) limiting  normal cone to $C$ at $\overline{u}$ is defined by 
 %\begin{equation}
 %N_C(\overline{u}) = \menge{v\in \RR^d}{\exists u_k\to \overline{u}, v_k\to v \;\text{with}\; (\forall k\in\NN)\;  v_k \in \hat{N}_{C}(u_k)}.
 %\end{equation}
  \edita{
 The sub-differential of a convex function $f$ at $u$ is defined as 
 %
 %Let $f\colon \RR^d\to (-\infty, +\infty$ be a proper, lower semi-continuous, convex function. The sub-differential of $f$ at $p$ is
 \begin{equation}
 \partial f(u)=
 \left\{ g : 
 f(u') - f(u) \ge \langle g, u'-u\rangle,
 \,\, \forall u'
  \right\}. 
 \end{equation} }
 \edita{In particular,} if $f$ is differentiable at $u$, $\partial f(u)$ is a singleton and denoted by $\nabla f(u)$.
 \label{s:nota}
 \paragraph{\textbf{\emph{\edita{Necessary Optimality Conditions.}}} \label{sec:opt cnds}}
 \edita{Necessary optimality conditions} for  \edita{Program} \eqref{prob:01} are well studied in the literature \cite[Corollary 6.15]{rockafellar2009variational}. \edita{Indeed, $u$ is a (first-order) stationary point of Program \eqref{prob:01} if there exists $y$ for which 
 \begin{align}
 \begin{cases}
 -\nabla h(u) - DA(u)^\top y \in N_C(u)\\
 A(u) = b.
 \end{cases}
 \label{e:inclu1}
 \end{align}
 Here, $DA(u)$ is the Jacobian of $A$ at $u$. Recalling \eqref{eq:Lagrangian}, we observe that \eqref{e:inclu1} is equivalent to 
 \begin{align}
 \begin{cases}
 -\nabla_u \mathcal{L}_\beta(u,y) \in N_C(u)\\
 A(u) = b,
 \end{cases}
 \label{e:inclu1-lagrangian}
 \end{align}
 which is in turn the necessary optimality condition for Program \eqref{eq:minmax}. 
 }
 %
 %Let $\overline{u}$ be a locally optimal and suppose that there no vector $y\not=0$ such that 
 %\begin{equation}
 %-\nabla L(\overline{u})^* y \in N_C(\overline{u}). 
 %\end{equation}
 %Then, the first order optimality condition for  $\overline{u}$ is 
 %\begin{equation}\label{e:inclu1}
 %(\exists y \in\RR^m)\;  - \nabla L(\overline{u})^*y - \nabla h(\overline{u}) \in N_{C}(\overline{u}).
 %\end{equation}
 %Since $\partial \iota_{\{0\}} = \RR^m$, the condition \eqref{e:inclu1} is equivalent to 
 %\begin{equation}
 %0 \in  \partial \iota_C(\overline{u}) +\nabla h(\overline{u}) + \nabla L(\overline{u})^* \partial \iota_{\{0\}}(L\overline{u}-b).
 %\end{equation}
 %Observe that the condition \eqref{e:inclu1} is also equivalent to   the following condition
 %\begin{equation}\label{e:inclu2}
 %(\exists y \in\RR^m)\; 0 \in \nabla \mathcal{L}(\overline{u},y),
 %\end{equation}
 %where $\mathcal{L}(u,y)$ is the  Lagrangian function associated to the non-linear constraint $Lu=b$,
 %\begin{equation}
 % \mathcal{L}_{\beta}(u,y) = (h +\iota_C)(u) + \scal{Lu-b}{y}.
 %\end{equation}
 %The corresponding augmented Lagrangian function associated to the non-linear constraint $Lu=b$ is defined by 
 %\begin{equation}
 %(\forall \beta \in \left]0,+\infty\right[)\quad \mathcal{L}_{\beta}(u,y) = (h+\iota_C)(u) + \scal{Lu-b}{y} +\frac{1}{2\beta}\|Lu-b\|^2.
 %\end{equation}
 %For convenience, we define 
 %\begin{equation}
 %(\forall \beta \in \left]0,+\infty\right[)\quad g_{\beta}(u,y) = \scal{Lu-b}{y} +\frac{1}{2\beta}\|Lu-b\|^2.
 %\end{equation}
 %and 
 %\begin{equation}
 %(\forall \beta \in \left]0,+\infty\right[)\quad F_{\beta}(u,y) = h(u)+ g_{\beta}(u,y).
 %\end{equation}
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \paragraph{\textbf{\emph{Gradient Mapping.}}}
 In nonconvex optimization, the relation between the gradient mapping  and stationarity is well-understood \cite{Lan16,Hare2009,bolte2014proximal}, \edita{which we review here for completeness.} 
 \begin{definition} \label{def:grad map}  Given $u$ and $\gamma >0$, define 
  the gradient mapping $G_{\beta,\gamma}(\cdot; y)\colon u\mapsto \gamma^{-1}(u-u^+)$, where $u^+=P_{C}(u-\gamma \nabla \mathcal{L}_ {\beta}(u,y))$.
 \end{definition}
 %\begin{definition} Given $u$ and $\gamma >0$, let $r_{\beta}(\xi,y)$ be a stochastic estimimate of $\nabla F_{\beta}(u,y)$, define
  %stochastic gradient mapping $SG_{\beta,\gamma}(\cdot; y)\colon u\mapsto \gamma^{-1}(u-u_+)$, where $u_+=\prox_{\gamma f}(u-\gamma r_{\beta}(\xi;y))$.
 %\end{definition}
 \edita{In particular, if we remove the constraints of Program \eqref{prob:01}, the gradient mapping reduces to $G_{\beta,\gamma}(u,y)=\nabla h(u) $. The gradient mapping is closely related to $\mathcal{L}_\beta$}. The following result follows immediately from an application of \cite[Lemma 3.2, Remark 3.2(i)]{bolte2014proximal}.
 \begin{lemma}\label{lem:11}
  \edita{For fixed $y\in \RR^m$, suppose that $\mathcal{L}_{\beta}(\cdot, y) \in \mathbb{L}_{d}(\lambda_\beta)$. For $u\in C$ and $\gamma \in (0, 1/\lambda_\beta)$, it holds that }
  \begin{equation}
  \label{e:deslem}
  \| G_{\beta,\gamma}(u;y)\|^{2}\leq \frac{2}{\gamma} (\mathcal{L}_\beta(u;y) - \mathcal{L}_\beta(u^+;y)).
 \end{equation}
 %where $u^+ =u^+(\mu,u) = P_{C}(u-\gamma \nabla F_{\beta}(u;y))$.
 %\end{enumerate}
 \end{lemma}
 %\begin{proof} For a fixed $y$, the function $u\mapsto h(u)+F_{\beta}(u;y)$ is $(\mathbb{L}_h+ \mathbb{L}_{\beta})$-Lipschitz. Hence, the results follows from 
 %\cite[Lemma 3.2, Remark 3.2(i)]{bolte2014proximal}.
 %\end{proof}
 
 In \edita{practice}, the Lipschitz constant $\lambda_{\beta}$ is often hard to evaluate exactly \edita{and we might resort to the classic line search technique, reviewed below and proved in Appendix \ref{sec:proof of eval Lipsc cte}.} 
 \edita{
 \begin{lemma} \label{lem:eval Lipsc cte} Fix $\rho \in (0,1)$ and ${\gamma}_0$. For $\gamma'>0$, let $u^+_{\gamma'} = P_C(u - \gamma' \nabla \mathcal{L}_\beta(u,y))$ and  define 
 \begin{equation*}
 \gamma := 
 \max \left\{
 \gamma' ={\gamma}_0 \rho^i : 
 \mathcal{L}_\beta (u^+_{\gamma'},y) \le \mathcal{L}_\beta (u,y) + 
 \left\langle u^+_{\gamma'} - u |  \nabla \mathcal{L}_{\beta}(u,y) \right\rangle 
 + \frac{1}{2\gamma'} \| u^+_{\gamma'} - u\|^2   
 \right\}.
 \end{equation*}
 Then, \eqref{e:deslem} holds. 
 \end{lemma}
 }
 
 
 %Let $\delta $ and  $\theta$ be in $\left]0,1\right[$ and $\overline{\gamma} > 0$. Define 
 %\begin{alignat}{2}\label{e:non-lips}
 %&\gamma = \max\{\mu > 0| (\exists i \in\NN)(\mu= \overline{\gamma}\theta^i)\notag\\ 
 %&F_{\beta}(u^+(\mu,u),y) \leq F_{\beta}(u,y) + \scal{u^+(\mu,u)-u}{\nabla F_{\beta}(u,y)} +\frac{\delta}{\mu}\|u-u^+(\mu,u)\|^2\}.
 %\end{alignat}
 %Now, let $\gamma$ be define as in \eqref{e:non-lips}. Then, since $u^+ =u^+(\mu,u) = P_{C}(u-\gamma \nabla F_{\beta}(u;y))$, 
 %\begin{equation}
 %G_{\beta,\gamma}(u) - \nabla  F_{\beta}(u;y)) \in N_{C}(u^+).
 %\end{equation}
 %Hence, $\scal{G_{\beta,\gamma}(u) - \nabla  F_{\beta}(u;y)) }{u-u^+} \leq 0$. Using \eqref{e:non-lips}, we have 
 %\begin{alignat}{2}
 %F_{\beta}(u^+,y) &\leq F_{\beta}(u,y) + \scal{u-u^+}{ -\nabla F_{\beta}(u,y)} +\frac{\delta}{\gamma}\|u-u^+\|^2\notag\\
 %&=  F_{\beta}(u,y) + \scal{u-u^+}{ G_{\beta,\gamma}(u) -\nabla F_{\beta}(u,y)} +\frac{\delta-1}{\gamma}\|u-u^+\|^2\notag\\
 %&\leq  F_{\beta}(u,y) - \frac{\delta-1}{\gamma}\|u-u^+\|^2,
 %\end{alignat}
 %which implies that 
 %\begin{equation}
 %(1-\delta)  \| G_{\beta,\gamma}(u;y)\|^{2}\leq \frac{1}{\gamma} (F_{\beta}(u;y) - F_{\beta}(u^+;y)).
 %\end{equation}
 %In particular, by taking $\delta =1/2$, we obtain \eqref{e:deslem}.
 \edita{Optimality conditions in Section \ref{sec:opt cnds}  can also be expressed in terms of the gradient mapping. Indeed, it is straightforward to verify that $u^+$ is a first-order stationary point of Program \eqref{prob:01} if 
 \begin{align}
 \begin{cases}
 G_{\beta,\gamma}(u,y) = 0\\
 A(u^+) = b.
 \end{cases}
 \label{eq:opt grad map}
 \end{align}
 }
 %
 %
 %
 %\begin{lemma}
 %\label{l:solv}
 % Suppose that $\mathcal{L}_{\beta,\gamma}\in \mathbb{L}_d(\lambda_\beta)$. For $\gamma \in (0, 1/\lambda_\beta)$, $u^+=P_C(u-\gamma \nabla \mathcal{L}_\beta(u,y))$ is a first-order stationary point of Program (\ref{prob:01}) if 
 % 
 % $\nabla g_{\beta}(\cdot, y)$ is $\mathbb{L}_{\beta}$-Lipschitz continuous. Let $u\in C$ and  
 %$\gamma \in ]0, 1/(\mathbb{L}_h+ \mathbb{L}_{\beta})[$. Suppose that $Lu^+=b$ and $\| G_{\beta,\gamma}(u;y)\|^{2} =0$. 
 %Then $u^+$ is a stationary point, i.e., the first order optimality condition \eqref{e:inclu1}, for $ \overline{u} =u^+$, is satisfied.
 %\end{lemma}
 %\begin{proof}
 %Since $u^+ = P_{C}(u-\gamma \nabla F_{\beta}(u;y))$. Then 
 % it follows that
 %\begin{equation}
 %\label{e:ver1}
 %G_{\gamma,\beta}(u;y) - \nabla F_{\beta}(u;y)    \in N_C(u^+).
 %\end{equation}
 %Adding  $\nabla F_{\beta}(u^+;y)$ to both sides of \eqref{e:ver1}, we obtain
 %\begin{equation}
 %\label{e:ver2}
 %G_{\gamma,\beta}(u;y) - \nabla F_{\beta}(u;y)  +\nabla F_{\beta}(u^+;y)  \in\partial f(u^+)+ \nabla F_{\beta}(u^+;y).
 %\end{equation}
 %Using the Lipschitzian gradient of $F_{\beta}(\cdot, y)$ and  $\gamma (\mathbb{L}_\beta +\mathbb{L}_{h}) \leq 1$, we see that 
 %\begin{alignat}{2}
 %&\|G_{\gamma,\beta}(u;y) - \nabla F_{\beta}(u;y)   - \nabla F_{\beta}(u^+;y) \| \notag\\
 %&\quad\leq \|G_{\gamma,\beta}(u;y)\|+\|\nabla F_{\beta}(u;y)   - \nabla F_{\beta}(u^+;y) \|\notag\\
 %&\quad\leq \|G_{\gamma,\beta}(u;y)\|+( \mathbb{L}_\beta +\mathbb{L}_{h})\|u-u^+\|\notag\\
 %&\quad =0,
 %\end{alignat}
 %which means that $G_{\gamma,\beta}(u;y) - \nabla F_{\beta}(u;y)   - \nabla F_{\beta}(u^+;y)=0$. Hence, we derive from 
 %\eqref{e:ver2} that 
 %\begin{equation}
 %\label{conkhi}
 %0   \in\partial f(u^+)+ \nabla F_{\beta}(u^+;y).
 %\end{equation}
 %By definition of $F_{\beta}(u^+;y)$ and $Lu^+=b$, we have 
 %\begin{alignat}{2}
 %\nabla F_{\beta}(u^+;y) &= \nabla h(u^+) + \nabla L(u^+)^*y + \frac{1}{\beta}  \nabla L(u^+)^*(Lu^+-b) \notag\\
 %&= \nabla h(u^+) + \nabla L(u^+)^*y,
 %\end{alignat}
 %which together with \eqref{conkhi}, shows that \eqref{e:inclu1} is satisfied.
 %\end{proof}
 \paragraph{\textbf{\emph{\edita{Sufficient Optimality Conditions.}}}}
 Sufficient optimality conditions for Program \eqref{prob:01} are also well understood in the literature \cite{luenberger1984linear,rockafellar2009variational,mordukhovich2006variational,gfrerer2015complete}.
 Indeed, $u$ is a local minimizer of Program \eqref{prob:01} if there exists $y$ for which 
 \begin{align}
 \begin{cases}
 v^\top \left( 
 \nabla_{uu} h(u) + \sum_{i=1}^m y_i \nabla_{uu} A_i(u)
 \right) v \ge 0,
 \qquad 
 \forall v \in T_C(u),\\
 A(u) = b.
 \end{cases}
 \label{eq:suff cnd}
 \end{align}
 \notea{Why does above look different from sufficient cnds for Lagrangian? Suppose to be equivalent problems.}
 
 %For simple, let us assume that $C = \menge{u}{g(u)\leq c}$ for some $\mathcal{C}^2$-function $g\colon\RR^d\to\RR$. In this section we assume that $h$ and $L$ are too $\mathcal{C}^2$-functions.
 %Let $\overline{u}$ be a point such that $g(\overline{u})=c$ and $0\not\in\nabla g(\overline{u})$. Then, \cite[Proposition 10.3]{rockafellar2009variational} implies that 
 %\begin{equation}
 %N_{C}(\overline{u}) =\menge{\mu \nabla g(\overline{u})}{\mu \geq 0}.
 %\end{equation}
 %Now suppose that the first oder optimality condition \eqref{e:inclu1} is satisfied for $\overline{u}$. Then there exists $\mu \geq 0$ and  $y \in\RR^m$ such that 
 %\begin{equation}\label{e:inclu1}
 % \nabla L(\overline{u})^*y + \nabla h(\overline{u}) +\mu \nabla g(\overline{u})=0.
 % \end{equation}
 % Therefore, inview of \cite[Chapter 11]{luenberger1984linear}, $\overline{u}$ is a local minimum provided that 
 % the Hessian matrix 
 % \begin{equation} 
 % H(\overline{u}) = \nabla^2h(\overline{u}) + y^T\nabla^2 L(\overline{u}) + \mu \nabla^2 g(\overline{u})
 % \end{equation}
 % is positive definite on the space 
 %\begin{equation} 
 %E= \menge{y}{\nabla L(\overline{u})y =0, \nabla g(\overline{u})y=0}.
 %\end{equation}
 %In our examples, this condition is checkable where $g(u) = \frac{1}{2}\|u\|^2$, $h$ is quadratic and $Lu = Muu^\top$.
 \section{Algorithm $\&$ Convergence}
 \subsection{Algorithm}
 We propose the following method for solving Program \eqref{prob:01}. The main idea is that 
 we take a projected gradient descent step on $u$ to obtain $u^+$ and update the penalty parameter $\beta^+$ in such a way
  that the feasibility gap $\frac{1}{2\beta^+\gamma}\|Lu^+-b\|^2$ reduces faster than the gradient mapping, up to some noise level $\omega$:
  \begin{equation}
  \frac{1}{2\beta^+\gamma}\|Lu^+-b\|^2\leq \frac{1}{8} \|G_{\beta,\gamma}(u,y) \|^2 + \frac{\omega}{\gamma}
  \end{equation}
   Then update the corresponding multiplier $y$ as in the classical ADMM: 
   \begin{equation}
   y^+ = y+\frac{1}{\sigma}(Lu^+-b). 
   \end{equation} 
   The formal algorithm is presented as follows.
 \begin{algorithm}
 \label{Algo:2} 
 Input: $\beta_0> 0$, $c > 0$, $\alpha \in \left[0,1\right[$,  $u_{0}\in \mathcal{C}$, $y_0 =0$, $\epsilon_1\in \left]0,1\right[$.
 Given $\beta_k$, choose   
 $\gamma_{k} \leq  \frac{1-\epsilon_1}{\mathbb{L}_{h}+\mathbb{L}_{\beta_{k}}}.$
   Iterate  \\
 For k=0,1,\ldots\\
 1. Projected gradient: $ u_{k+1} = P_{\mathcal{C} }(u_{k} - \gamma_{k} \nabla F_{\beta_{k}}(u_{k},y_{k})).$\\
  2.  Line search step\\
  \quad $s=0, d_{k,0}=2$,  $\beta_{k+1,0}= \frac{1}{2} \|Lu_{k+1}-b \|^2 \bigg(  \frac{\gamma_{k}}{8} \|G_{\beta_{k},\gamma_{k}}(u_k,y_{k}) \|^2 + \frac{d_{k,s}}{{(1+k)}^{1+\epsilon_1}}\bigg)^{-1}.$\\
   While $\beta_{k+1,s} \geq c/(k+1)^{\alpha}$ do
   \begin{alignat}{2}\label{e:mc}
 %  \beta_{k+1}\geq \frac{1}{2} \|Lu_{k+1}-b \|^2 \bigg(  \frac{\gamma_{k}}{8} \|G_{\beta_{k},\gamma_{k}}(u_k,y_{k}) \|^2 + \frac{d_k}{(k+1)^\alpha}\bigg)^{-1}.
 d_{k,s+1} &= 2\,d_{k,s} \\
 \beta_{k+1,s+1}&= \frac{1}{2} \|Lu_{k+1}-b \|^2 \bigg(  \frac{\gamma_{k}}{8} \|G_{\beta_{k},\gamma_{k}}(u_k,y_{k}) \|^2 + \frac{d_{k,s+1}}{{(1+k)}^{1+\epsilon_1}}\bigg)^{-1}\\
 s&\leftarrow s+1.
   \end{alignat}
   Endwhile\\
   3. Update  $\beta_{k+1} = \beta_{k+1,s}$.\\  
   4. Choose $\sigma_{k+1} \geq 2\beta_{k}$ and update $y_{k+1} = y_{k} + \frac{1}{\sigma_{k+1}} (Lu_{k+1}-b)$.\\
 \end{algorithm}
 
 \begin{remark} The updating rule of $(\beta_k)_{k\in\NN}$  in \eqref{e:mc} plays a role in our analysis.
  Intuitively, if $u_{k+1}$ is a solution, then $Lu_{k+1}=b$ and \eqref{e:mc} is trivially satisfied for any $\beta_{k+1}\geq 0$.
   Hence,
  $\beta_{k+1}$ forces $u_{k+1}$ to be close to $\menge{u}{Lu=b}$.
  \end{remark}
  \begin{remark}When $\sigma_k\equiv \infty$, we get $y_k\equiv 0$ and hence 
 the step 2 disappears. If we choose $\sigma_k =  c(k+1)^{\alpha_1}\|Lu_k-b\|$, where $c$ and $\alpha_1$ are chosen such that $\sigma_{k} > 2\beta_{k-1}$, then 
 \begin{equation}
 \|y_{k+1}\| \leq \|y_k\| + \frac{\|Lu_{k+1}-b\|}{\sigma_{k+1}} = \|y_k\| + \frac{1}{c(k+2)^{\alpha_1}}.
 \end{equation} 
 Since $\sum_{k\in\NN}  \frac{1}{c(k+2)^{\alpha_1}} <+\infty$ whenever $\alpha_1>1$, $(\|y_k\|)_{k\in\NN}$ converges and is hence bounded. Therefore, 
 \begin{equation}
 b_0 = \inf_{k\in\NN} \mathcal{L}_{\beta_{k}}(u_{k+1},y_{k}) \geq \inf_{k} h(u_{k}),
 \end{equation}
 which implies that $b_0>-\infty$ whenever $(u_k)_{k\in\NN}$ or $\dom(f)$ is bounded.
  \end{remark}
  
  \subsection{Convergence}
  In view of Lemma \ref{l:solv}, we need to estimate the gradient mapping $\|G_{\beta_{k},\gamma_{k}}(u_k,y_{k})\|$ as well as the feasibility $\|Lu_{k+1}-b\|^2$. 
  \begin{theorem} 
  \label{t:1}
  Suppose that $b_0 = \inf_{k\in\NN} \mathcal{L}_{\beta_{k}}(u_{k+1},y_{k}) > -\infty$  and that $z_0=\sum_{k=1}^\infty \frac{d_{k,s_k}}{(1+k)^{1+\epsilon_1}} <+\infty$,
  where $s_k$ is the smallest index such that $\beta_{k,s_k} < c/(k+1)^{\alpha}$.
   Then the following hold.
 \begin{equation}\label{e:mapp1}
 \sum_{k=1}^\infty \gamma_{k} \|G_{\beta_{k},\gamma_{k}}(u_k,y_{k}) \|^2 \leq 4(\mathcal{L}_{\beta_0}(u_{1},y_0) + z_0-b_0+\frac{\gamma_0}{8}\|G_{\beta_0,\gamma_0}(u_0)\|^2),
 \end{equation}
 and 
 \begin{equation}\label{e:feas1}
 \sum_{k=1}^\infty \frac{1}{\beta_{k+1}} \|Lu_{k+1}-b \|^2  \leq (\mathcal{L}_{\beta_0}(u_{1},y_0) + 3z_0-b_0+\frac{\gamma_0}{8}\|G_{\beta_0,\gamma_0}(u_0)\|^2).
 \end{equation}
  \end{theorem}
  \begin{proof} Set $e_{k+1}= \frac{d_{k,s_k}}{(k+1)^{1+\epsilon_1}}$. Then $z_0= \sum_{k\in\NN}e_k <+\infty$.
  It follows from Lemma \ref{lem:11} that 
  \begin{alignat}{2}
 G_k= \frac{\gamma_{k}}{2}\|G_{\beta_k,\gamma_k}(u_k,y_k) \|^2 &\leq  \mathcal{L}_{\beta_k}(u_k,y_k) - \mathcal{L}_{\beta_{k}}(u_{k+1},y_k)\notag\\
  &= h(u_k) - h(u_{k+1}) +  g_{\beta_k}(u_k,y_k) -g_{\beta_{k}}(u_{k+1},y_k)\notag\\
  &=h(u_k) - h(u_{k+1}) +  g_{\beta_{k-1}}(u_k,y_{k-1}) -g_{\beta_{k}}(u_{k+1},y_k)\notag\\
  &\quad  + g_{\beta_k}(u_k,y_k)-  g_{\beta_{k-1}}(u_k,y_{k-1}), \label{e:sa1}
  \end{alignat}
  where we set $g_{\beta}(u,y) =  \scal{Lu-b}{y} +\frac{1}{2\beta}\|Lu-b\|^2$. Let us estimate the last term in \eqref{e:sa1}. We have 
  \begin{alignat}{2}
  \omega_{1,k}= g_{\beta_k}(u_k,y_k)-  g_{\beta_{k-1}}(u_k,y_{k-1}) = \big(\frac{1}{2\beta_k} -\frac{1}{2\beta_{k-1}}\big)\|Lu_k-b\|^2+\scal{Lu_k-b}{y_k-y_{k-1}}.
  \end{alignat}
  Using $y_{k} = y_{k-1} + \frac{1}{\sigma_{k}} (Lu_{k}-b)$ and \eqref{e:mc},  we get 
   \begin{alignat}{2}
  &\quad \omega_{1,k} = \big(\frac{1}{2\beta_k} -\frac{1}{2\beta_{k-1}}\big)\|Lu_k-b\|^2+ \frac{1}{\sigma_k} \|Lu_k-b\|^2.
  %&\leq  \big(\frac{1}{2\beta_k} -\frac{1}{2\beta_{k-1}}\big)\|Lu_k-b\|^2+ \frac{\gamma_{k-1}}{8} \|G_{\beta_{k-1},\gamma_{k-1}}(u_{k-1},y_{k-1}) \|^2 + \frac{d_{k-1}}{k^\alpha}\notag\\
  %&\leq  \big(\frac{1}{2\beta_k} -\frac{1}{2\beta_{k-1}}\big)\|Lu_k-b\|^2+\frac{1}{4} G_k+ \frac{1}{4} G_{k-1} + \frac{d_{k-1}}{k^\alpha}.
  \end{alignat}
   Let us estimate the first term in \eqref{e:sa1}. Set $T_k= h(u_k)  +\scal{Lu_k-b}{y_{k-1}} $. Then
     \begin{alignat}{2}
  \omega_{2,k}&= h(u_k) - h(u_{k+1}) +  g_{\beta_{k-1}}(u_k,y_{k-1}) -g_{\beta_{k}}(u_{k+1},y_k)\notag\\
  & =     T_k -T_{k+1} + \frac{1}{2\beta_{k-1}} \|Lu_k-b\|^2 - \frac{1}{2\beta_k}\|Lu_{k+1}-b\|^2.
   \end{alignat}
  Therefore, we derive from \eqref{e:sa1} that 
  \begin{alignat}{2}
 G_k &\leq \omega_{1,k} +\omega_{2,k}\notag\\
   &=  T_k -T_{k+1} + \frac{1}{2\beta_k} \big(\|Lu_k-b\|^2 -\|Lu_{k+1}-b\|^2\big)+\frac{1}{\sigma_k} \|Lu_k-b\|^2.\label{e:sa2}
  \end{alignat}
 Now using the condition \eqref{e:mc}, we obtain 
 \begin{equation}
  \frac{1}{2\beta_k} \|Lu_k-b\|^2 \leq \frac{1}{4} G_{k-1} + e_k\leq \frac{1}{4}G_k+ \frac{1}{4} G_{k-1} + e_k.
 \end{equation}
 Therefore, it follows from \eqref{e:sa2} that
  \begin{alignat}{2}
  \frac12 G_k &\leq (T_k+\frac{1}{4} G_{k-1}) -(T_{k+1} +\frac{1}{4} G_k )+ \frac{1}{\sigma_k} \|Lu_k-b\|^2 - \frac{1}{2\beta_k}\|Lu_{k+1}-b\|^2 + e_k\notag\\
   &\leq (T_k+\frac{1}{4} G_{k-1}) -(T_{k+1} +\frac{1}{4} G_k )+ \frac{1}{2\beta_{k-1}} \|Lu_k-b\|^2 - \frac{1}{2\beta_k}\|Lu_{k+1}-b\|^2 + e_k. \label{e:sa3}
  \end{alignat}
  For every $N\in\NN$, $N\geq 1$, summing \eqref{e:sa3} from $k=1$ to $k=N$, we obtain,
  \begin{alignat}{2}
  \sum_{k=1}^{N}\frac12 G_k \leq T_1 +\frac{1}{4} G_0 +  \frac{1}{2\beta_{0}} \|Lu_1-b\|^2 - T_{N+1} - \frac{1}{4}G_{N} - \frac{1}{2\beta_N}  \|Lu_{N+1}-b\|^2 + z_0.
  \end{alignat}
  Note that, by the definition of $T_{N+1}$, we have 
  \begin{alignat}{2}
  -T_{N+1} - \frac{1}{2\beta_N}  \|Lu_{N+1}-b\|^2& = - \mathcal{L}_{\beta_N}(u_{N+1},y_N) \leq -b_0.
  \end{alignat}
  Hence,
  \begin{equation}
   \sum_{k=1}^{N}\frac12 G_k \leq \mathcal{L}_{\beta_0}(u_{1},y_0) + z_0-b_0 +\frac{1}{4} G_0 ,
  \end{equation}
  which proves \eqref{e:mapp1}. Moreover,  \eqref{e:feas1} follows directly  from \eqref{e:mc}. 
   \end{proof}
  
  \begin{corollary} Under the same conditions as in Theorem \ref{t:1}, suppose that 
 $\gamma_k = \mathcal{O}(\beta_k)$. Then 
 \begin{equation}\label{e:mapp2}
 \min_{1\leq k\leq N} \|G_{\beta_{k},\gamma_{k}}(u_k,y_{k}) \|^2 = \mathcal{O}(1/N^{1-\alpha}) \to 0,
 \end{equation}
 and 
 \begin{equation}\label{e:feas2}
 \min_{1\leq k\leq N} \|Lu_{k+1}-b \|^2 = \mathcal{O}(1/N) \to 0.
 \end{equation}
  \end{corollary}
  \begin{proof} We see that 
  \eqref{e:mapp2} and \eqref{e:feas2} follow directly 
  from \eqref{e:mapp1} and  \eqref{e:feas1}, respectively.
  \end{proof}
  \begin{corollary} \label{c:2}
  Suppose that the conditions of Theorem \ref{t:1} hold. 
  Then the sequence $(F_{\beta_k}(u_k,y_{k}))_{k\in\NN}$ converges to some $F^\star$. Moreover,
  if $(\|y_{k-1}\| \sqrt{\beta_k})_{k\in\NN}$  and $(\beta_{k+1}/\beta_{k})_{k\in\NN}$  are bounded by $M$, then $(h(u_k))_{k\in\NN}$ converges to $F^\star$.
  \end{corollary}
  \begin{proof} Note that $(F_{\beta_k}(u_{k+1},y_k))_{k\in\NN}$ is bounded below. Moreover, the proof of Theorem \ref{t:1} shows that 
  \begin{equation}
  F_{\beta_k}(u_{k+1},y_k) + \frac{\gamma_k}{8}\|\mathcal{G}_{\beta_k,\gamma_k}(u_k)\|^2
   \leq  F_{\beta_{k-1}}(u_{k},y_{k-1}) + \frac{\gamma_{k-1}}{8}\|\mathcal{G}_{\beta_{k-1},\gamma_{k-1}}(u_{k-1})\|^2 +e_k.
  \end{equation}
  Hence $(F_{\beta_k}(u_{k+1},y_k) + \frac{\gamma_k}{8}\|\mathcal{G}_{\beta_k,\gamma_k}(u_k)\|^2)_{k\in\NN}$ converges to a finite value
  $F^\star$. Since  $\frac{\gamma_k}{8}\|\mathcal{G}_{\beta_k,\gamma_k}(u_k)\|^2\to 0$, we get $F_{\beta_k}(u_{k+1},y_k)\to F^\star$. Since 
 $\frac{1}{2\beta_k}\|Lu_{k}-b\|^2\to 0$ and $(\beta_{k+1}/\beta_{k})_{k\in\NN}$ is bounded by $M$, we obtain 
 \begin{equation}
 \frac{1}{2\beta_k}\|Lu_{k+1}-b\|^2 =\frac{\beta_{k+1}}{2 \beta_{k+1}\beta_{k}}\|Lu_{k+1}-b\|^2 \leq \frac{M}{2\beta_{k+1}} \|L u_{k+1} -b\|^2 \to 0. 
 \end{equation}
 Moreover, since $(\|y_{k-1}\| \sqrt{\beta_k})_{k\in\NN}$ is bounded by $M$, we also have 
 \begin{equation}
 |\scal{Lu_k-b}{y_{k-1}}| = |\frac{1}{\sqrt{\beta_k}}\scal{Lu_k-b}{\sqrt{\beta_k}y_{k-1}}| \leq \frac{M}{\sqrt{\beta_k}}\|Lu_{k}-b\| \to 0.
 \end{equation}
 Therefore, 
 \begin{alignat}{2}
 |h(u_k) - F^\star| &\leq |F_{\beta_k}(u_k,y_{k-1})-F^\star| + |\scal{Lu_k-b}{y_{k-1}}|  + \frac{1}{2\beta_k}\|Lu_k-b\|^2\notag\\
 &\to 0,
 \end{alignat}
 which proves the desired result.
  \end{proof}
  \subsection{Local convergence} 
  Let $\overline{u}$ and $\overline{y}$ satisfy the first order optimality condition 
  \begin{equation}
  \label{e:fr1}
  -\nabla L(\overline{u})^*\overline{y} - \nabla h(\overline{u}) \in N_{C}(\overline{u}).
  \end{equation}
  For simplicity, let us recall $\nabla F_{\beta}\colon (u,y) \mapsto \nabla h(u) + \nabla L(u)^*y + \frac{1}{\beta}\nabla L(u)^*(Lu-b)$.
  \begin{theorem} Suppose that the conditions of Theorem \ref{t:1} hold and that $\gamma_k \geq \underline{\gamma} > 0$.
  Suppose further that each $\overline{u}$ in \eqref{e:fr1} is a local minimum of $h$, and that $C$ and $(y_k)_{k\in\NN}$ are bounded. 
  Then $(h(u_k))_{k\in\NN}$ converges to a local optimal value $h(\overline{u})$.
  \end{theorem}
  \begin{proof} Since $C$ is bounded, $(u_k)_{k\in\NN}$ is bounded. Therefore, there exists a subsequence 
  $(n_k)_{k\in\NN}$ of $\NN$ such that $u_{n_k}\to u^*$ and $y_{n_k}\to y^*$.
  It follows from Theorem \ref{t:1} that $\gamma_k\|(u_{k+1}-u_k)/\gamma_k\|^2 \to 0$. Since $(\gamma_k)_{k\in\NN}$ 
   is bounded below by $\underline{\gamma}$, we obtain $G_{\beta_k,\gamma_k}(u_k)\to 0$.
 Hence, $u_{n_k+1}\to u^*$.
  Now, using the updating of $u_{k}$, we have $G_{\beta_k,\gamma_k}(u_k) -\nabla F_{\beta_k}(u_k,y_k) \in N_{C}(u_{k+1})$.
  Hence, 
   \begin{equation}\label{e:aa2s}
  (\forall u\in C)(\forall k\in\NN)\; \scal{-G_{\beta_k,\gamma_k}(u_k)+\nabla F_{\beta_{k}}(u_k,y_k) }{u-u_{k+1}}\geq 0.
  \end{equation}
  Since $C$ is bounded and $\nabla L$ is continuous, we obtain $\sup_{u\in C}\|\nabla L(u)\| <+\infty$, and hence 
  \begin{equation}
  \|\frac{1}{\beta_{n_k}}\nabla L(u_{n_k})^*(Lu_{n_k}-b)\| \leq \frac{1}{c}(\sup_{u\in C}\|\nabla L(u)\| )\|Lu_{n_k}-b\|\to 0.
  \end{equation}
  We also have 
  \begin{equation}
   \nabla L(u_{n_k}) \to \nabla L(u^*) \; \text{and}\;   \nabla h(u_{n_k}) \to \nabla h(u^*).
  \end{equation}
  Since $y_{n_k}\to y^*$, we get 
  \begin{alignat}{2}
 & \|\nabla L(u_{n_k})^*y_{n_k}- \nabla L(u^*)^*y^* \|  \quad\\
  &\leq \|\nabla L(u_{n_k})^*y_{n_k}- \nabla L(u_{n_k})^*y^* \| +\|\nabla L(u_{n_k})^*y^*- \nabla L(u^*)^*y^* \|\notag\\
  &\leq  \|\nabla L(u_{n_k})\|\|y_{n_k}- y^* \|+\|y^*\| \|\nabla L(u_{n_k})- \nabla L(u^*) \|\notag\\
  &\to 0.
  \end{alignat}
  Consequently, $\nabla F_{\beta_{n_k}}(u_{n_k}, y_{n_k}) \to \nabla h(u^*) +\nabla L(u^*)^*y^*$. Note that 
  $$G_{\beta_{n_k},\gamma_{n_k}}(u_{n_k})\to 0.$$ Now, passing through subsequence in \eqref{e:aa2s}, we obtain 
    \begin{equation}\label{e:aa3s}
  (\forall u\in C)\scal{\nabla h(u^*) +\nabla L(u^*)^*y^*  }{u-u^*}\geq 0,
  \end{equation}
  which is \eqref{e:fr1} for $\overline{u} = u^*$ and  $\overline{y} = y^*$. By assumption, $u^*$ is a local minimum and 
  $h(u_{n_k}) \to h(u^*)$. Therefore, $F^\star = h(u^*)$. Using Corollary \ref{c:2}, we get $h(u_k) \to h(\overline{u})$.
  \end{proof}
   \section{Related Work \label{sec:related work}}
 
  To the best of our knowledge, the proposed method is new and different from existing methods in the literature. 
  
 As mentioned in the Introduction, our method is connected to the augmented Lagrangian method.
 However, our method is significantly different from the augmented Lagrangian method: we perform 
 only one projected gradient step on the primal variable $u$ instead of minimizing the augmented Lagrangian function.
 Furthermore, we update the penalty parameter $\beta$ adaptively to make sure that the feasibility reduces 
 faster than the gradient mapping.
 
 In the case when $h=0$, a modification of 
 Chambolle-Pock's method is investigated in \cite{Valkonen14} and a preconditioned ADMM in \cite{Matin17}, 
 where the convergence of the iterates is proved under strong assumptions that are not fulfilled in our setting. 
 
 %\noindent{\bf Connection to Linearized Alternating Direction Method}.\\
 ADMM is the classic method proposed for solving Problem \ref{prob:01} in the case where $L$ is a linear operator and $h$ is zero \cite{gabay1976dual}. 
 This method is an application of the Douglas-Rachford method to the dual problem \cite{Gabay83}.
 One of the main drawbacks of ADMM is the appearance of the term $Lu$ in the update rule of $u_{k+1}$. To overcome this issue, several strategies have been suggested. 
 The first strategy, proposed in \cite{shefi2014rate} and refined in \cite{banert2016fixing}, is known as the alternating direction proximal method of multipliers.
 The second strategy is to use a linearization technique \cite{lin2011linearized}.
  We show here that the updating rule of our proposed method 
 is closely related to that of the linearized alternating direction method \cite{lin2011linearized}. Assume that $h\equiv 0$ and $L$ is a linear operator.
  Then the proposed method can be rewritten as 
 \begin{equation}
 \begin{cases}
 u_{k+1}= \arg\min_{u\in C}  \frac{1}{2\gamma_k} \| u-u_k + \gamma_kL^*\bigg( \lambda_k + \frac{1}{\beta_k}\big(Lu_k-b\big)\bigg)\|^2\notag\\
   \beta_{k+1}= \frac{1}{2} \|Lu_{k+1}-b \|^2 \bigg( \frac{\gamma_{k}}{8} \|G_{\beta_{k},\gamma_{k}}(u_k,y_{k}) \|^2 + \frac{d_k}{(k+1)^\alpha}\bigg)^{-1}\\
 \text{Choose $\sigma_{k+1}\geq 2\beta_k$ and}\;  \lambda_{k+1} = \lambda_k +\frac{1}{\sigma_{k+1}}(Lu_{k+1}-b),
 \end{cases}
 \end{equation}
 which is a variant of the linearized ADMM \cite{lin2011linearized}. 
 
 %\noindent
 %{\bf Connection to ALBUM3 in \cite{bolte2018nonconvex}}\\
 Very recently, \cite{bolte2018nonconvex} proposed a framework for solving Problem \ref{prob:01} with $C=\RR^d$. 
 In particular, a special case in that work, AlBUM3 (Proximal Linearized Alternating Minimization), 
 is closely related to ours; however, their conditions are checkable only when $L$ is linear. Moreover, 
 the updating of $\beta_{k}$ in \cite{bolte2018nonconvex} depends on the smallest eigenvalue of $L^*L$. For nonlinear $L$, the application of their method remains a challenge.
 
 %\noindent{\bf Connection to the deflected subgradient method}\\
 The deflected subgradient method investigated in \cite{burachik2010deflected} can be used to solve a special case of Problem \ref{prob:01}  for
 a compact subset $\mathcal{C}$ of $\mathcal{X}$. The basic step of the deflected subgradient method is to solve the following problem: given $\beta, v$,
 \begin{equation}
  u^*\in \arg\min_{u\in C} h(u) + \beta \boldsymbol{\sigma}(Lu-b) - \scal{Kv}{Lu-b}
 \end{equation}
 where $\boldsymbol{\sigma}$ is a continuous penalty function such as $\|\cdot\|$, and $K$ is a bounded linear operator. In general, there is no closed-form 
 expression for $u^*$ since it does not split $f$, $h$, $L$ individually. Hence, it is hard to implement the deflected subgradient method. This is also a common drawback of the classic penalty method and its related works 
 \cite{gasimov2002augmented,burachik2010primal}.
 
  \section{Numerical experiments}
  \subsection{Hanging chain}
 %\begin{thebibliography}{}
 %
 % and use \bibitem to create references. Consult the Instructions
 % for authors for reference list style.
 \bibliographystyle{abbrv}
 
 \bibliography{references_alp.bib,bang.bib,ctuy16-small-bib.bib,JS_References.bib,lions.bib,references_optimal_sampling,references_yp,tythc16-small-bib,yhli.bib,bibliograpply,ctuy16-small-bib,bang1.bib,bang.bib}
 
 
 
 \appendix
 
 \section{Proof of Lemma \ref{lem:eval Lipsc cte} \label{sec:proof of eval Lipsc cte}}
 
 By definition of $u^+_{\gamma}$, we have  that 
 \begin{equation}
 u^+_{\gamma} - u +\gamma \nabla \mathcal{L}_\beta(u,y) \in -N_C(u^+_{\gamma}).
 \label{eq:optimality of uplus}
 \end{equation}
 On the other hand, $\gamma$ by definition satisfies 
 \begin{align}
 & \mathcal{L}_{\beta}(u^+_{\gamma},y) \nonumber\\
  & \le \mathcal{L}_\beta(u,y) + \left\langle 
 u^+_{\gamma} - u | \nabla \mathcal{L}_\beta (u,y) 
 \right\rangle + \frac{1}{2\gamma}\|u^+_{\gamma} - u\|^2 \nonumber\\
 & =  \mathcal{L}_\beta(u,y) + \frac{1}{\gamma} \left\langle 
 u^+_{\gamma} - u |u^+_\gamma - u+ \gamma \nabla \mathcal{L}_\beta (u,y) 
 \right\rangle 
 - \frac{1}{2\gamma}\|u^+_{\gamma} - u\|^2 \nonumber\\
 & \le  \mathcal{L}_\beta(u,y) 
 - \frac{1}{2\gamma}\|u^+_{\gamma} - u\|^2
 \qquad \text{(see \eqref{eq:optimality of uplus})} \nonumber\\
 & = \mathcal{L}_\beta(u,y) - \frac{\gamma}{2} \|G_{\beta,\gamma}(u,y)\|^2, 
 \qquad \text{(see Definition \ref{def:grad map})}
 \end{align}
 which completes the proof of Lemma \ref{lem:eval Lipsc cte}. 
 
 \section{Draft of convergence proof}
 
 We use the shorthand
 \begin{align}
 h_k = h(u_k),
 \qquad 
 A_k = A(u_k),
 \qquad 
 G_k = \| G_{\beta_k,\gamma_k}(u_k,y_k) \|.
 \end{align}
 Recall that 
 \begin{align}
 y_{k+1} =y_k + \frac{A_{k+1}}{\sigma_{k+1}}, 
 \end{align}
 \begin{align}
 \frac{\gamma_k G_k^2}{2} & \le \mathcal{L}_{\beta_k}(u_k,y_k) - \mathcal{L}_{\beta_k}(u_{k+1},y_k) \nonumber\\
 & = h_k - h_{k+1}+ (A_k-A_{k+1}) \cdot y_k + \frac{\|A_k\|^2}{2\beta_k} - \frac{\|A_{k+1}\|^2}{2\beta_k}.
 \end{align}
 Note that 
 \begin{align}
 y_k = y_0 + \sum_{i=1}^k \frac{A_i}{\sigma_i},
 \end{align}
 which after substituting above yields 
 \begin{align}
 \frac{\gamma_k G_k^2}{2} & \le h_k - h_{k+1} + (A_k - A_{k+1}) \cdot (y_0 + \sum_{i=1}^k \frac{A_i}{\sigma_i}) + \frac{\|A_k\|^2}{2\beta_k} - \frac{\|A_{k+1}\|^2}{2\beta_k}. 
 \end{align}
 By summing up, we find that 
 \begin{align}
 & \sum_{k=0}^K \frac{\gamma_k G_k^2}{2} \nonumber\\
 & \le h_0 - h_{K+1} + (A_0 - A_{K+1}) \cdot y_0 
 + \sum_{k=0}^K \sum_{i=1}^k (A_k - A_{k+1}) \cdot \frac{A_i}{\sigma_i} + \sum_{k=0}^K \frac{\|A_k\|^2}{2\beta_k} - \sum_{k=0}^K \frac{\|A_{k+1}\|^2}{2\beta_k}  \nonumber\\
 & = h_0 - h_{K+1} + (A_0 - A_{K+1}) \cdot y_0 
 + \sum_{k=0}^K \sum_{i=1}^k (A_k - A_{k+1}) \cdot \frac{A_i}{\sigma_i} + \sum_{k=0}^K \frac{\|A_k\|^2}{2\beta_k} - \sum_{k=1}^{K+1} \frac{\|A_{k}\|^2}{2\beta_{k-1}} \nonumber\\
 & = h_0 - h_{K+1} + (A_0 - A_{K+1}) \cdot y_0 + \frac{\|A_0\|^2}{2\beta_0} - \frac{\|A_{K+1}\|^2}{2\beta_K} + \sum_{i=1}^K \sum_{k=i}^K (A_k - A_{k+1}) \cdot \frac{A_i}{\sigma_i} + \sum_{k=1}^K  \left( \frac{1}{2\beta_k} - \frac{1}{2\beta_{k-1}} \right) \|A_k\|^2  \nonumber\\
 & = h_0 - h_{K+1} + (A_0 - A_{K+1}) \cdot y_0 + \frac{\|A_0\|^2}{2\beta_0} - \frac{\|A_{K+1}\|^2}{2\beta_K} + \sum_{i=1}^K (A_i - A_{K+1}) \cdot \frac{A_i}{\sigma_i} + \sum_{k=1}^K  \left( \frac{1}{2\beta_k} - \frac{1}{2\beta_{k-1}} \right) \|A_k\|^2  \nonumber\\
 & = h_0 - h_{K+1} + (A_0 - A_{K+1}) \cdot y_0 + \frac{\|A_0\|^2}{2\beta_0} - \frac{\|A_{K+1}\|^2}{2\beta_K} - A_{K+1} \cdot \sum_{k=1}^K \frac{A_k}{\sigma_k} + \sum_{k=1}^K  \left( \frac{1}{\sigma_k}+  \frac{1}{2\beta_k} - \frac{1}{2\beta_{k-1}} \right) \|A_k\|^2 \nonumber\\
 & \le h_0 - h_{K+1} + (A_0 - A_{K+1}) \cdot y_0 + \frac{\|A_0\|^2}{2\beta_0} - \frac{\|A_{K+1}\|^2}{2\beta_K} +  \sum_{k=1}^K \frac{\|A_{K+1}\| \|A_k\|}{\sigma_k} + \sum_{k=1}^K  \left( \frac{1}{\sigma_k}+  \frac{1}{2\beta_k} - \frac{1}{2\beta_{k-1}} \right) \|A_k\|^2 \nonumber\\
 & \le h_0 - h_{K+1} + (A_0 - A_{K+1}) \cdot y_0 + \frac{\|A_0\|^2}{2\beta_0} - \frac{\|A_{K+1}\|^2}{2\beta_K} +  \sum_{k=1}^K \frac{\|A_k\|^2}{2\sigma_k} + \sum_{k=1}^K \frac{\|A_{K+1}\|^2}{2\sigma_k}+ \sum_{k=1}^K  \left( \frac{1}{\sigma_k}+  \frac{1}{2\beta_k} - \frac{1}{2\beta_{k-1}} \right) \|A_k\|^2 \nonumber\\
 & = h_0 - h_{K+1} + (A_0 - A_{K+1}) \cdot y_0 + \frac{\|A_0\|^2}{2\beta_0} + \sum_{k=1}^K  \left( \frac{3}{2\sigma_k}+  \frac{1}{2\beta_k} - \frac{1}{2\beta_{k-1}} \right) \|A_k\|^2
 + (\sum_{k=1}^K \frac{1}{2\sigma_k} - \frac{1}{2\beta_K}) \|A_{K+1}\|^2.
 \end{align}
 If we let $\mu := \inf h_k + A_k\cdot y_0 > -\infty$, we conclude that 
 \begin{align}
  \sum_{k=0}^K \frac{\gamma_k G_k^2}{2}  & 
 \le  \sum_{k=1}^{K+1}  \left( \frac{3}{2\sigma_k}+  \frac{1}{2\beta_k} - \frac{1}{2\beta_{k-1}} \right) \|A_k\|^2
  + h_0 +A_0\cdot y_0 + \frac{\|A_0\|^2}{2\beta_0} - \mu \nonumber\\
  & =:\sum_{k=1}^{K+1}  \left( \frac{3}{2\sigma_k}+  \frac{1}{2\beta_k} - \frac{1}{2\beta_{k-1}} \right) \|A_k\|^2 + \mu_0.
  \label{eq:temp}
 \end{align}
 If we assume that 
 \begin{equation}
 \beta_{k+1}  \le  \beta_k,
-\label{eq:beta decays}
 \end{equation}
 we find after some algebra that 
 \begin{equation}
 \sum_{k=0}^K \gamma_k G_k^2 \le \sum_{k=1}^{K+1} \frac{3\|A_k\|^2}{\sigma_k} + 2\mu_0. 
 \label{eq:raw}
 \end{equation}
-%We now try to formalize Volkan's intuition that gradient mapping should derive down the feasibility gap. Let us assume that 
-%\begin{align}
-%\sum_{k=1}^{K+1} \frac{\|A_k\|^2}{\beta_k} \le \sum_{k=0}^K \gamma_k G_k^2.
-%\label{eq:assumption}
-%\end{align}
-%And we take 
-%\begin{equation}
-%\sigma_k = 6 \beta_k,
-%\qquad \forall k. 
-%\end{equation}
-%Then, by combining the two inequalities above, we reach 
-%\begin{align}
-% \sum_{k=1}^{K+1} \frac{\|A_k\|^2}{\beta_k} \le 4\mu_0,
-%\end{align}
-%\begin{align}
-%\sum_{k=0}^K \gamma_k G_k^2   \le 4\mu_0. 
-%\end{align}
-%That is, Volkan's assumption \eqref{eq:assumption} successfully bounds both the gradient mapping and the feasibility gap. One question is the interplay between $\{\beta_k,\gamma_k\}_k$ to ensure the validity of Volkan's assumption, which feels like some sort of \emph{uncertainty principle}. 
+We now try to formalize Volkan's intuition that the gradient mapping should drive down the feasibility gap. Let us assume that 
+\begin{align}
+\sum_{k=1}^{K+1} \frac{\|A_k\|^2}{\beta_k} \le \sum_{k=0}^K \gamma_k G_k^2.
+\label{eq:assumption}
+\end{align}
+And we take 
+\begin{equation}
+\sigma_k = 6 \beta_k,
+\qquad \forall k. 
+\end{equation}
+Then, by combining the two inequalities above, we reach 
+\begin{align}
+ \sum_{k=1}^{K+1} \frac{\|A_k\|^2}{\beta_k} \le 4\mu_0,
+\end{align}
+\begin{align}
+\sum_{k=0}^K \gamma_k G_k^2   \le 4\mu_0. 
+\end{align}
+That is, Volkan's assumption \eqref{eq:assumption} successfully bounds both the gradient mapping and the feasibility gap. One question is the interplay between $\{\beta_k,\gamma_k\}_k$ to ensure the validity of Volkan's assumption, which feels like some sort of \emph{uncertainty principle}. 
 
 \paragraph{\textbf{Summary of Bang's argument.}} Bang's argument is summarized below for comparison:
 \begin{align}
 & \frac{\gamma_k G_k^2}{2} \nonumber\\
 & \le h_k - h_{k+1} + (A_k - A_{k+1}) \cdot y_k + \frac{\|A_k\|^2}{2\beta_k} - \frac{\|A_{k+1}\|^2}{2\beta_k} \nonumber\\
 & = h_k - h_{k+1} + A_k \cdot y_k - A_{k+1} \cdot y_{k+1} + A_{k+1} \cdot (y_{k+1} - y_{k}) + \frac{\|A_k\|^2}{2\beta_k} - \frac{\|A_{k+1}\|^2}{2\beta_k} \nonumber\\
 & = h_k - h_{k+1} + A_k \cdot y_k - A_{k+1} \cdot y_{k+1}  + \left(\frac{1}{\sigma_{k+1}} - \frac{1}{2\beta_k} \right) \|A_{k+1}\|^2 + \frac{\|A_k\|^2}{2\beta_k}
 \quad \text{(definition of ys)}
  \nonumber\\
 & \le (h_k  +A_k \cdot y_k + \frac{G_{k-1}}{4} + \frac{\|A_k\|^2}{2\beta_{k-1}}) - (h_{k+1}+  A_{k+1} \cdot y_{k+1}  + \frac{G_k}{4} + \frac{\|A_{k+1}\|^2}{2\beta_k}) + e_k.  
 \qquad \text{(Bang's assumption)}
 \end{align}
 We form a telescope with the last line above. 
 
-\paragraph{\textbf{Volkan's assumption.}} In \eqref{eq:raw}, we bounded the gradient map with the feasibility gap. We next try to bound the feasibility gap. To that end, let us assume for now that the boundary of $C$ is smooth. Then let $P_{T_C(u)}$ be the orthogonal projection onto the tangent space of $C$ at $u$. 
-Then the update rule for $u_k$ implies that 
-\begin{align}
-\frac{P_{T_C(u_{k+1})}  (u_{k}-u_{k+1}) }{\gamma_k} & = P_{T_C(u_{k+1})} \nabla h(u_k) + P_{T_C(u_{k+1})} DA(u_k)^\top y_k + \frac{1}{\beta_k} P_{T_C(u_{k+1})} DA(u_k)^\top A(u_k) .
-\end{align}
-Rearranging and applying the triangle inequality yields that 
+\paragraph{\textbf{Validity of Volkan's assumption.}} We would like to show that the gradient mapping should bound the feasibility gap when $\beta$ is sufficiently small. Note that 
 \begin{align}
-\| P_{T_C(u_{k+1})} DA(u_k)^\top A(u_k) \| \le 
-\beta_k \left\| \frac{P_{T_C(u_{k+1})}  (u_{k}-u_{k+1}) }{\gamma_k} - P_{T_C(u_{k+1})} \nabla h(u_k) - P_{T_C(u_{k+1})} DA(u_k)^\top y_k  \right\|. 
+G & = \frac{u - u^+}{\gamma}\nonumber\\
+& = \nabla h(u) + DA(u)^\top y + \frac{DA(u)^\top A(u)}{\beta}  - \frac{e}{\gamma},
+\qquad e\in N_C(u).
 \end{align}
-Under the assumption that $P_{T_C(u_{k+1})}DA(u_k)^\top$ is full-column rank with the smallest nonzero singular value of $\eta_k$, we arrive at 
+By triangle inequality, it follows that 
 \begin{align}
-\|A(u_k)\|  \le 
-\frac{\beta_k}{\eta_k} \left\| \frac{P_{T_C(u_{k+1})}  (u_{k}-u_{k+1}) }{\gamma_k} - P_{T_C(u_{k+1})} \nabla h(u_k) - P_{T_C(u_{k+1})} DA(u_k)^\top y_k  \right\|. 
+\|G\| & \ge \frac{\|DA(u)^\top A(u)\|}{\beta} - \|\nabla h(u) + DA(u)^\top y\| - \frac{\|e\|}{\gamma} \nonumber\\
+& \ge  \frac{\|DA(u)^\top A(u)\|}{\beta} - \|\nabla h(u) + DA(u)^\top y\| - \frac{\text{diam}}{\gamma},
 \end{align}
-That is by choosing $\beta_k$ sufficiently small, we can ensure that the feasibility gap decays with $k$. There seem to be no conflict with the other parameters $\{\gamma_k,\sigma_k\}_k$. 
+where $\text{diam}$ is the diameter of a ball that contains all iterates, assuming that one exists. Then, when $\beta$ is sufficiently small, the first term dominates the other ones and we find that 
+\begin{equation}
+\|G\| \gtrsim \frac{\|DA(u)^\top A(u)\|}{\beta} \ge \frac{\lambda}{\beta} \|A(u)\|, 
+\end{equation}
+assuming that $DA(u)^\top$  is full column rank with the smallest singular value of $\lambda>0$.  This qualitative discussion suggests that  Volkan's assumption might be true in general. 
+
 
 
 
 \end{document}
 % end of file template.tex
 
   Set 
  $X = \menge{u}{Lu=b} \cap C$.
  Suppose that there exist a  neighborhood  $B(\overline{u};\epsilon)$ of $\overline{u}$ and 
  a  neighborhood  $B(\overline{y};\epsilon)$ of $\overline{y}$, and positive number $\alpha_1,\alpha_2$ and non-negative $\rho_1,\rho_2$
  such that $ (\forall u \in X \cap B(\overline{u};\epsilon))(\forall y \in B(\overline{y};\epsilon))$,
  \begin{equation}\label{e:maic}
 \scal{\nabla F_{\beta}(u,y)- \nabla F_{\beta}(\overline{u},\overline{y}) }{u-\overline{u}} 
  \geq \rho_1\|u-\overline{u}\|^{\alpha_1} +\rho_2\| y- \overline{y}\|^{\alpha_2}.
  \end{equation} 
  \begin{theorem} Suppose that 
  $\gamma_k \geq \underline{\gamma} > 0$  and 
 all the conditions in Theorem \ref{t:1} is satisfied. Furthermore,
  Suppose that for  some $k_0$, $(u_{k})_{k\geq k_0} \subset B(\overline{u};\epsilon)$ and  $(y_{k})_{k\geq k_0} \subset B(\overline{y};\epsilon)$, and 
  \eqref{e:maic} is satisfied for every $\beta\in (\beta_k)_{k\geq k_0}$.  If $\rho_1 > 0$ or $\rho_2 > 0$, then $u_{k}\to \overline{u}$ or $y_k \to \overline{y}$. 
  \end{theorem}
  \begin{proof} Without loss of generality, assume that $k_0=0$.
   It follows from Theorem \ref{t:1} that $\gamma_k\|(u_{k+1}-u_k)/\gamma_k\|^2 \to 0$. Since $(\gamma_k)_{k\in\NN}$ 
   is bounded below by $\underline{\gamma}$, we obtain $G_{\beta_k,\gamma_k}(u_k)\to 0$.
  Since $\beta_{k+1}^{-1}\|Lu_{k+1}-b\|^2 \to 0$ and $(\beta_{k})_{k\in\NN}$ is bounded above by $\overline{\beta}=c$, 
  we get $\|Lu_{k+1}-b\|^2\to 0$. Since $L\overline{u} =b$, we can rewrite \eqref{e:fr1} as $-\nabla F_{\beta}(\overline{u},\overline{y}) \in N_{C}(\overline{u})$. Hence 
  \begin{equation}\label{e:aa1s}
  (\forall u\in C)(\forall k\in\NN)\; \scal{\nabla F_{\beta_{k}}(\overline{u},\overline{y}) }{u-\overline{u}}\geq 0.
  \end{equation}
  Now, using the updating of $u_{k}$, we have $G_{\beta_k,\gamma_k}(u_k) -\nabla F_{\beta_k}(u_k,y_k) \in N_{C}(u_{k+1})$.
  Hence, 
   \begin{equation}\label{e:aa2s}
  (\forall u\in C)(\forall k\in\NN)\; \scal{-G_{\beta_k,\gamma_k}(u_k)+\nabla F_{\beta_{k}}(u_k,y_k) }{u-u_{k+1}}\geq 0.
  \end{equation}
  Now, we imply from \eqref{e:aa1s} and \eqref{e:aa2s} that 
  \begin{alignat}{2}
  \label{e:las1}
  \scal{-G_{\beta_k,\gamma_k}(u_k)}{\overline{u}-u_{k+1}} + \scal{\nabla F_{\beta_{k}}(u_k,y_k) -\nabla F_{\beta_{k}}(\overline{u},\overline{y})}{\overline{u}-u_{k+1}}\geq 0.
  \end{alignat}
  
  Using the condition \eqref{e:maic}, we derive from \eqref{e:las1} that
  \begin{alignat}{2}
  \rho_1\|u_{k+1}-\overline{u}\|^{\alpha_1} &+\rho_2 \| y_k- \overline{y}\|^{\alpha_2} \leq  \scal{-G_{\beta_k,\gamma_k}(u_k)}{\overline{u}-u_{k+1}}\notag\\
  &\quad +\scal{\nabla F_{\beta_{k}}(u_k,y_k) -\nabla F_{\beta_{k}}(\overline{u},\overline{y})}{u_{k}-u_{k+1}} \notag\\
  & \leq (\epsilon+\gamma_k \|\nabla F_{\beta_{k}}(u_k,y_k) -\nabla F_{\beta_{k}}(\overline{u},\overline{y})\|) \| G_{\beta_k,\gamma_k}(u_k)\|\notag\\
  &\leq (2\epsilon + \|\nabla F_{\beta_{k}}(\overline{u},y_k)-\nabla F_{\beta_{k}}(\overline{u},\overline{y}) \|)\| G_{\beta_k,\gamma_k}(u_k)\|\notag\\
  &\leq (2\epsilon + \|\nabla L(\overline{u})\|\epsilon) \| G_{\beta_k,\gamma_k}(u_k)\|\notag\\
 & \to 0.
  \end{alignat}
  Therefore, $u_{k}\to \overline{u}$ or $y_k\to \overline{y}$ provided that $\rho_1> 0$ or $\rho_2 >0$.
  \end{proof}