diff --git a/tex/Paper/main.tex b/tex/Paper/main.tex index de96060..01fe9c8 100644 --- a/tex/Paper/main.tex +++ b/tex/Paper/main.tex @@ -1,125 +1,128 @@ %%%%%%%%%%%%%%%%%%%%%%% file template.tex %%%%%%%%%%%%%%%%%%%%%%%%% % % This is a general template file for the LaTeX package SVJour3 % for Springer journals. Springer Heidelberg 2010/09/16 % % Copy it to a new file with a new name and use it as the basis % for your article. Delete % signs as needed. % % This template includes a few options for different layouts and % content for various journals. Please consult a previous issue of % your journal as needed. % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % % First comes an example EPS file -- just ignore it and % proceed on the \documentclass line % your LaTeX will extract the file if required \begin{filecontents*}{example.eps} %!PS-Adobe-3.0 EPSF-3.0 %%BoundingBox: 19 19 221 221 %%CreationDate: Mon Sep 29 1997 %%Creator: programmed by hand (JK) %%EndComments gsave newpath 20 20 moveto 20 220 lineto 220 220 lineto 220 20 lineto closepath 2 setlinewidth gsave .4 setgray fill grestore stroke grestore \end{filecontents*} % \RequirePackage{fix-cm} % %\documentclass{svjour3} % onecolumn (standard format) %\documentclass[smallcondensed]{svjour3} % onecolumn (ditto) \documentclass[smallextended]{svjour3} % onecolumn (second format) %\documentclass[twocolumn]{svjour3} % twocolumn % \smartqed % flush right qed marks, e.g. at end of proof % \usepackage{graphicx} % % \usepackage{mathptmx} % use Times fonts if available on your TeX system % % insert here the call for the packages your document requires %\usepackage{latexsym} % etc. % % please place your own definitions here and don't use \def but % \newcommand{}{} % % Insert the name of "your journal" with \journalname{Math Programming Comp.} %c \usepackage{amsmath} \usepackage{amssymb} \usepackage{graphicx} \usepackage{algorithm} \usepackage{color} \usepackage{cite} \usepackage{hyperref} \input{preamble.tex} \begin{document} \title{Linearized augmented Lagrangian framework for solving non-convex problems\thanks{This project has received funding from...} } %\subtitle{Do you have a subtitle?\\ If so, write it here} %\titlerunning{Short form of title} % if too long for running head \author{First author \and 2nd author } \authorrunning{author et. al.} % if too long for running head %\institute{F. Author \at % first address \\ % Tel.: +123-45-678910\\ % Fax: +123-45-678910\\ % \email{fauthor@example.com} % \\ % \emph{Present address:} of F. Author % if needed % \and % S. Author \at % second address % } \date{Received: / Accepted: } % The correct dates will be entered by the editor \maketitle \input{sections/abstract.tex} \input{sections/introduction.tex} \input{sections/preliminaries.tex} \input{sections/algorithm.tex} \input{sections/related_works.tex} - +%\input{sections/nonadaptive_theory.tex} +\input{sections/adaptive_theory.tex} +\appendix +\input{sections/appendix.tex} %\begin{acknowledgements} %If you'd like to thank anyone, place your comments here %and remove the percent signs. 
%\end{acknowledgements} % BibTeX users please use one of %\bibliographystyle{spbasic} % basic style, author-year citations \bibliographystyle{spmpsci} % mathematics and physical sciences %\bibliographystyle{spphys} % APS-like style for physics \bibliography{bibliography.bib} % name your BibTeX data base \end{document} % end of file template.tex diff --git a/tex/Paper/sections/adaptive_theory.tex b/tex/Paper/sections/adaptive_theory.tex new file mode 100644 index 0000000..30ee243 --- /dev/null +++ b/tex/Paper/sections/adaptive_theory.tex @@ -0,0 +1,793 @@ +\section{Proof of Theorem \ref{thm:main}} + + +For convenience, let us recall the updates of the algorithm in iteration $k$, namely, +\begin{align} + u_{k+1} +& = P_C( u_k - \gamma_k \nabla \mathcal{L}_{\beta_k} (u_k,y_k) ) +\nonumber\\ +& = P_C\Big( u_k - \gamma_k \nabla h(u_k) \nonumber\\ +& \qquad \qquad - \gamma_k DA(u_k) ^\top \left( y_k + \beta_k(A(u_k)-b) \right) \Big), +\qquad \text{(see \eqref{eq:Lagrangian})} +\label{eq:update uk recall} +\end{align} +\begin{align} +y_{k+1} =y_k + \sigma_{k+1}(A(u_{k+1})-b), +\label{eq:y update recall} +\end{align} +and recall also that +\begin{equation} +G_k = G_{\beta_k,\gamma_k}(u_k,y_k) = \frac{u_k-u_{k+1}}{\gamma_k}. +\qquad \text{(see \eqref{eq:grad map})} +\label{eq:grad map recall} +\end{equation} +For integers $k_0\le k_1$, consider the interval +\begin{equation} + K = [k_0:k_1]=\{k_0,\cdots,k_1\}. + \end{equation} +Since $\gamma_k$ is determined by the line search subroutine in Lemma \ref{lem:eval Lipsc cte}, we may apply Lemma \ref{lem:11} for every iteration in this interval to find that +\begin{align} + \frac{\gamma_k \|G_k\|^2}{2} +& \le \mathcal{L}_{\beta_k}(u_k,y_k) - \mathcal{L}_{\beta_k}(u_{k+1},y_k) +\qquad \text{(see Lemma \ref{lem:11})} \nonumber\\ +& = h(u_k) - h(u_{k+1})+ \langle A(u_k)-A(u_{k+1}) , y_k \rangle +\nonumber\\ +& \qquad + \frac{\beta_k}{2}\left(\|A(u_k)-b\|^2 - \|A(u_{k+1})-b\|^2\right), +\qquad \text{(see \eqref{eq:Lagrangian})} +\label{eq:smoothness rep} +\end{align} +for every $k\in K$. +On the other hand, +\begin{align} +y_k = y_{k_0} + \sum_{i=k_0+1}^k \sigma_i \left( A(u_i)-b \right), +\qquad \text{(see \eqref{eq:y update recall})} +\label{eq:y update unfolded} +\end{align} +which, after substituting in \eqref{eq:smoothness rep}, yields that +\begin{align} +\frac{\gamma_k \|G_k\|^2}{2} & \le h(u_k) - h(u_{k+1}) + \left\langle A(u_k) - A(u_{k+1}) , y_{k_0} + \sum_{i=k_0+1}^k \sigma_i ( A(u_i)-b) \right\rangle +\nonumber\\ +& \qquad +\frac{\beta_k}{2} \left(\|A(u_k)-b\|^2-\|A(u_{k+1})-b\|^2\right). 
+ \label{eq:key ineq} +\end{align} +By summing up the key inequality in \eqref{eq:key ineq} over $k$, we find that +\begin{align} +& \sum_{k=k_0}^{k_1} \frac{\gamma_k \|G_k\|^2}{2} \nonumber\\ +& \le h(u_{k_0}) - h(u_{k_1+1}) + \langle A(u_{k_0}) - A(u_{k_1+1}) , y_{k_0-1}\rangle \nonumber\\ +& \qquad + \sum_{k=k_0}^{k_1} \sum_{i=k_0+1}^k \sigma_i \left\langle A(u_k) - A(u_{k+1}) , A(u_i)-b \right\rangle + \nonumber\\ +& \qquad + \sum_{k=k_0}^{k_1} \frac{\beta_k}{2} \|A(u_k)-b\|^2 - \sum_{k=k_0}^{k_1} \frac{\beta_k}{2} \|A(u_{k+1})-b\|^2 +\qquad \text{(see \eqref{eq:key ineq})} +\nonumber\\ +& = h(u_{k_0}) - h(u_{{k_1}+1}) + \langle A(u_{k_0}) - A(u_{{k_1}+1}) , y_{k_0-1} \rangle \nonumber\\ +& \qquad + \sum_{k=k_0}^{k_1} \sum_{i=k_0+1}^k \sigma_i \left\langle A(u_k) - A(u_{k+1}) , A(u_i)-b \right\rangle\nonumber\\ +& \qquad + \sum_{k=k_0}^{k_1} \frac{\beta_k}{2}\|A(u_k)-b\|^2- \sum_{k=k_0+1}^{{k_1}+1} \frac{\beta_{k-1}}{2} \|A(u_{k})-b\|^2 \nonumber\\ +& = h(u_{k_0}) - h(u_{{k_1}+1}) + \langle A(u_{k_0}) - A(u_{{k_1}+1}) , y_{k_0-1} \rangle + \frac{\beta_{k_0}}{2}\|A(u_{k_0})-b\|^2 \nonumber\\ +& \qquad + \sum_{i=k_0+1}^{k_1} \sum_{k=i}^{k_1} \sigma_i \left\langle A(u_k) - A(u_{k+1}) , A(u_i)-b \right\rangle +\nonumber\\ +& \qquad ++ \sum_{k=k_0+1}^{k_1} \frac{\beta_k - \beta_{k-1}}{2} \|A(u_k)-b\|^2 - \frac{\beta_{k_1}}{2} \| A(u_{k_1+1})-b\|^2 + \nonumber\\ +& \le \mu + \sum_{i=k_0+1}^{k_1} \sigma_i \left\langle A(u_i) - A(u_{{k_1}+1}), A(u_i)-b\right\rangle \nonumber\\ +& \qquad + \sum_{k=k_0+1}^{k_1} \frac{\beta_k - \beta_{k-1}}{2} \|A(u_k)-b\|^2 - \frac{\beta_{k_1}}{2} \| A(u_{k_1+1})-b\|^2 +\qquad \text{(see \eqref{eq:defn mu})} + \nonumber\\ +& = \mu + \sum_{k=k_0+1}^{k_1} \left( \sigma_k +\frac{\beta_k - \beta_{k-1}}{2} \right) \|A(u_k)-b\|^2 \nonumber\\ +& \qquad - \sum_{k=k_0+1}^{k_1} \sigma_k \left \langle A(u_{{k_1}+1})-b, A(u_k)-b\right\rangle - \frac{\beta_{k_1}}{2} \| A(u_{k_1+1})-b\|^2 + \nonumber\\ + & \le\mu + \sum_{k=k_0}^{k_1} \left(\sigma_k+ \frac{\beta_k-\beta_{k-1}}{2} \right) \|A(u_k)-b\|^2 +\nonumber\\ +& \qquad + + \sum_{k=k_0}^{k_1} \sigma_k \|A(u_{{k_1}+1})-b\| \|A(u_k)-b\| - \frac{\beta_{k_1}}{2} \| A(u_{k_1+1})-b\|^2, + \label{eq:long chain broken} +\end{align} +where we assumed that +\begin{align} +\mu & := \sup_k \Big( h(u_{k_0}) - h(u_k) + \langle A(u_{k_0})-A(u_k) ,y_{k_0-1}\rangle \nonumber\\ +& \qquad \qquad + \frac{\beta_{k_0}}{2} \|A(u_{k_0})-b\|^2 \Big) < \infty. +\label{eq:defn mu} +\end{align} +Recall that, for every $k\in K$, we set the penalty weight as +\begin{align} +\beta_k = +\begin{cases} +\beta_{k-1} & \|A(u_k)-b\| \le \frac{\beta_{k_0}}{\sqrt{k\log^{{2}} (k+1)}},\\ +\\ +\beta_{k-1} +\sqrt{\frac{k\log^{{2}}(k+1)}{(k-1)\log^{{2}} k}} & + \|A(u_k)-b\| > \frac{\beta_{k_0}}{\sqrt{k\log^{{2}}(k+1)}}, +\end{cases} +\label{eq:beta rule} +\end{align} +for a threshold $\beta_{k_0}>0$. 
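+For intuition, if the second regime in \eqref{eq:beta rule} were triggered at every iteration of $[k_0+1:k]$, the updates would telescope to
+\begin{align*}
+\beta_k = \beta_{k_0} \prod_{i=k_0+1}^{k} \sqrt{\frac{i\log^{2}(i+1)}{(i-1)\log^{2} i}}
+= \beta_{k_0} \sqrt{\frac{k\log^{2}(k+1)}{k_0\log^{2}(k_0+1)}},
+\end{align*}
+which is the extreme case of the bounds recorded next.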
+For the record, the above penalty weights $\{\b_k\}_{k\ge 0}$ form a nondecreasing sequence and satisfy
+\begin{align*}
+\beta_{k_0}\le \beta_k \le \beta_{k_0} \sqrt{\frac{k \log^{2}(k+1)}{k_0\log^{2}(k_0+1)}},
+\qquad \forall k\in K,
+\end{align*}
+\begin{align}
+\beta_k- \beta_{k-1}
+& \le \beta_{k-1} \left( \sqrt{\frac{k\log^{2}(k+1)}{(k-1)\log^{2}k}} -1 \right) \nonumber\\
+& \le \beta_{k-1} \cdot \frac{k\log^{2}(k+1) - (k-1) \log^{2} k}{(k-1)\log^{2} k} \nonumber\\
+& = \b_{k-1} \left(
+\frac{k\left(\log^2(k+1)-\log^2 k\right)}{(k-1)\log^2k} + \frac{1}{k-1}
+ \right)
+\nonumber\\
+& \le \frac{2 \beta_{k-1}}{k-1}
+\qquad ( k_0 \gg 1)
+\nonumber\\
+& \le \frac{2\beta_{k_0}}{k-1} \sqrt{\frac{(k-1)\log^{2}k}{k_0 \log^{2}(k_0+1)}} \nonumber\\
+& = \frac{ 2\beta_{k_0} \log k}{\sqrt{(k-1)k_0} \log (k_0+1)},
+\qquad \forall k \in K,
+\label{eq:consequences}
+\end{align}
+when $k_0$ is sufficiently large.
+For every $k\in K$, recall also that we set the dual step sizes as
+\begin{align}
+\sigma_k =
+\begin{cases}
+\sigma_{k-1} & \|A(u_k)-b\| \le \frac{\sigma_{k_0}}{k\log^{2}(k+1)},\\
+\\
+\sigma_{k-1} \sqrt{\frac{(k-1) \log^{2}k}{k \log^{2}(k+1)}}
+&
+\frac{\sigma_{k_0}}{k\log^{2}(k+1)} \le \|A(u_k)-b\| \le \frac{\sigma_{k_0}}{\sqrt{k\log^{2}(k+1)}},
+\\
+\\
+\sigma_{\widetilde{k}-1} \frac{ \b_k-\b_{k-1}}{\beta_{\widetilde{k}}-\beta_{\widetilde{k}-1}} & \|A(u_k)-b\| > \frac{\sigma_{k_0}}{\sqrt{k\log^{2}(k+1)}} \text{ and } k \le k_{\max},\\
+\\
+\min\left( \sigma_{\widetilde{k}-1} \frac{ \b_k-\b_{k-1}}{\beta_{\widetilde{k}}-\beta_{\widetilde{k}-1}} , \frac{\|A(u_k)-b\|}{k\log^2(k+1)} \right)& \|A(u_k)-b\| > \frac{\sigma_{k_0}}{\sqrt{k\log^{2}(k+1)}} \text{ and } k > k_{\max},
+\end{cases}
+\label{eq:sigma rule}
+\end{align}
+for a threshold $\sigma_{k_0}\ge \beta_{k_0}$ and an integer $k_{\max}$. Above, $\widetilde{k}=\widetilde{k}(k)$ denotes the iteration at which the most recent transition into the third update regime occurred. The factor depending on $\widetilde{k}$ is introduced so that the sequence $\{\sigma_k\}_{k\ge 0}$ does not jump at such a transition.
+For the record, a consequence of the above updates is that $\{\s_k\}_{k\ge 0}$ is a nonincreasing sequence and, in particular,
+\begin{align}
+\sigma_k \le \sigma_{k_0},
+\qquad \forall k\in K.
+\qquad \text{(see (\ref{eq:consequences},\ref{eq:sigma rule}))}
+\label{eq:consequences 2}
+\end{align}
+Let us partition $K$ into the disjoint subsets
+\begin{align}
+K = K_{\beta,l} \cup K_{\beta,u}.
+\label{eq:partition beta}
+\end{align}
+More specifically, for every $k\in K_{\beta,l}$, the first update in \eqref{eq:beta rule} is in force whereas, for every $k\in K_{\beta,u}$, the second update in \eqref{eq:beta rule} is in force. Likewise, let us partition $K$ into the disjoint subsets
+\begin{align}
+K= K_{\sigma,l} \cup K_{\sigma,m} \cup K_{\sigma,u}.
+\label{eq:partition sigma}
+\end{align}
+For every $k\in K_{\sigma,l}$, the first update in \eqref{eq:sigma rule} is in force whereas, for every $k\in K_{\sigma,m}$, the second update in \eqref{eq:sigma rule} is in force and, for every $k\in K_{\sigma,u}$, one of the last two updates in \eqref{eq:sigma rule} is active. By their definitions in (\ref{eq:beta rule},\ref{eq:sigma rule}) and the assumption that $\s_{k_0}\ge \b_{k_0}$, we observe that $K_{\beta,l}\cap K_{\s,u}=\emptyset$, namely,
+\begin{equation}
+\sigma_k >0, \qquad \forall k\in K.
+\end{equation} +The partitioning in \eqref{eq:partition sigma} allows us to break the sum in the last line of \eqref{eq:long chain broken} to three sums. +To evaluate the sum over $K_{\sigma,l}$, we write that +\begin{align} +& \sum_{k\in K_{\sigma,l}} \left(\sigma_k+ \frac{\beta_k-\beta_{k-1}}{2} \right) \|A(u_k)-b\|^2 ++ + \|A(u_{{k_1}+1})-b\| \sum_{k\in K_{\sigma,l}} \sigma_k \|A(u_k)-b\| + \nonumber\\ + & \le \sum_{k\in K_{\sigma,l}} \left( \sigma_{k_0}+ \frac{ {2}\beta_{k_0} \log k}{\sqrt{(k-1) k_0} \log(k_0+1)} \right) \|A(u_k)-b\|^2 \nonumber\\ +& \qquad + \| A(u_{k_1+1})-b\| \sum_{k\in K_{\sigma,l}} \sigma_{k_0} \|A(u_k)-b\| +\qquad \text{(see (\ref{eq:consequences},\ref{eq:consequences 2}))} + \nonumber\\ + & \le 2 \s_{k_0}\sum_{k\in K_{\sigma,l}} \|A(u_k)-b\|^2 + \| A(u_{k_1+1})-b\| \sum_{k\in K_{\sigma,l}} \sigma_{k_0} \|A(u_k)-b\| + \nonumber\\ + & \le 2\s_{k_0}\sum_{k\in K_{\sigma,l}} \|A(u_k)-b\|^2 + \| A(u_{k_1+1})-b\| \sum_{k\in K_{\sigma,l}} \frac{\sigma_{k_0}^2}{k\log {^2}(k+1)} + \qquad \text{(see \eqref{eq:sigma rule})} + \nonumber\\ + & = 2\s_{k_0} \sum_{k\in K_{\sigma,l}} \|A(u_k)-b\|^2 + c\s^2_{k_0} \| A(u_{k_1+1})-b\| , + \label{eq:long chain broke 2} +\end{align} +where we assumed that $k_0 \gg 1$ to obtain the second inequality and +\begin{equation} +c\ge \sum_{k=1}^{\infty} \frac{1}{k\log {^2}(k+1)}. +\label{eq:defn of c} +\end{equation} +To evaluate the sum in the last line of \eqref{eq:long chain broken} over $K_{\s,m}$, +we write that +\begin{align} +& \sum_{k\in K_{\sigma,m}} \left(\sigma_k+ \frac{\beta_k-\beta_{k-1}}{2} \right) \|A(u_k)-b\|^2 +\nonumber\\ +& \qquad + + \|A(u_{{k_1}+1})-b\| \sum_{k\in K_{\sigma,m}} \sigma_k \|A(u_k)-b\| + \nonumber\\ + & \le \sum_{k\in K_{\sigma,m}} \left( \sigma_{k_0}+ \frac{2\beta_{k_0} \log k}{\sqrt{(k-1) k_0} \log{^2}(k_0+1)} \right) \|A(u_k)-b\|^2 +\nonumber\\ +& \qquad + \| A(u_{k_1+1})-b\| \sum_{k\in K_{\sigma,m}} \sigma_{k} \|A(u_k)-b\| +\qquad \text{(see (\ref{eq:consequences},\ref{eq:consequences 2}))} + \nonumber\\ + & \le 2\s_{k_0} \sum_{k\in K_{\sigma,m}} \|A(u_k)-b\|^2 + \| A(u_{k_1+1})-b\| \sum_{k\in K_{\sigma,m}} \sigma_{k} \|A(u_k)-b\| + \nonumber\\ + & \le 2\s_{k_0} \sum_{k\in K_{\sigma,m}} \|A(u_k)-b\|^2 + \| A(u_{k_1+1})-b\| \sum_{k\in K_{\sigma,m}} \frac{\sigma_{k_0}\sigma_{k} }{\sqrt{k\log {^2}(k+1)}}, + \label{eq:long chain broke 3} +\end{align} +where the second to last line assumes that $k_0\gg 1$ and the last line uses \eqref{eq:sigma rule}. +To control the last sum above, let us further write that +\begin{align} +K_{\s,m}=\cup_{j=1}^{J_{\s,m}} K_{\s,m,j}, +\label{eq:another partition} +\end{align} +where $\{K_{\s,m,j}\}_{j=1}^{J_{\s,m}}$ are disjoint intervals, with the starting points $\{k_{\s,m,j}\}_{j=1}^{J_{\s,m}}$. With the partitioning in \eqref{eq:another partition}, we write that +\begin{align} +& \sum_{k\in K_{\sigma,m}} \frac{\sigma_{k_0}\sigma_k}{\sqrt{k\log{^2}(k+1)}} +\nonumber\\ +& = +\sum_{j=1}^{J_{\s,m}} \sum_{k\in K_{\sigma,m,j}} \frac{\sigma_{k_0}\sigma_k}{\sqrt{k\log{^2}(k+1)}} \nonumber\\ +& = \sum_{j=1}^{J_{\s,m}} +\sigma_{k_0}\sigma_{k_{\s,m,j}} \sqrt{k_{\s,m,j} \log {^2}(k_{\s,m,j}+1)} + \sum_{k\in K_{\sigma,m,j}} \frac{1}{k \log {^2}(k+1)} + \qquad \text{(see \eqref{eq:sigma rule})} \nonumber\\ + & \le c \s_{k_0} \s'_{k_0}, + \label{eq:long chain broke 2.99} +\end{align} +where we conveniently set +\begin{align} +\s'_{k_0} :=\sum_{j=1}^{J_{\s,m}}\sigma_{k_{\s,m,j}} \sqrt{k_{\s,m,j}} \log (k_{\s,m,j}+1). 
+\label{eq:defn of sigmap} +\end{align} +%\begin{remark} +%Note that our convergence bounds in particular will depend on $J_\s$, the number of transitions into the middle regime in \eqref{eq:sigma rule}. +%\end{remark} +Substituting \eqref{eq:long chain broke 2.99} back into \eqref{eq:long chain broke 3}, we reach +\begin{align} +& \sum_{k\in K_{\sigma,m}} \left(\sigma_k+ \frac{\beta_k-\beta_{k-1}}{2} \right) \|A(u_k)-b\|^2 +\nonumber\\ +& \qquad + + \|A(u_{{k_1}+1})-b\| \sum_{k\in K_{\sigma,m}} \sigma_k \|A(u_k)-b\| + \nonumber\\ + & \le 2\s_{k_0} \sum_{k\in K_{\sigma,m}} \|A(u_k)-b\|^2 + c\s_{k_0}\sigma'_{k_0} \| A(u_{k_1+1})-b\|. +\end{align} +{Consider also the partitioning +\begin{align} +K_{\s,u} = \cup_{j=1}^{J_{\s, u}} K_{\s,u,j}, +\label{eq:upper_sigma_partition} +\end{align} +where $\{K_{\s,u,j}\}_{j=1}^{J_{\s,u}}$ are disjoint intervals, with the starting points $\{\widetilde{k}_j\}_{j=1}^{J_{\s,u}}$; recall \eqref{eq:sigma rule}.} To evaluate the sum in the last line of \eqref{eq:long chain broken} over $K_{\s,u}$, +we write that +\begin{align} +& +\sum_{k\in K_{\sigma,u}} \left( \sigma_k + \frac{\beta_k-\beta_{k-1}}{2} \right) \|A(u_k)-b\|^2 \nonumber\\ +& \qquad + \sum_{k\in K_{\sigma,u}} \sigma_k \|A(u_{k_1+1})-b\| \|A(u_k)-b\| - \frac{\beta_{k_1}}{2} \|A(u_{k_1+1})-b\|^2 \nonumber\\ +& \le {\sum_{j=1}^{J_{\s, u}} \sum_{k\in K_{\sigma,u,j}}} + \left( \sigma_k + \frac{\beta_k-\beta_{k-1}}{2} + +{\frac{\sigma_{\widetilde{k}_j -1}\s_{k_0}}{2(\beta_{\widetilde{k}_j}-\b_{\widetilde{k}_j -1})}}\right) +\| A(u_k)-b\|^2 \nonumber\\ +& \qquad + \frac{1}{2} +\left({ \sum_{j=1}^{J_{\s, u}} \sum_{k\in K_{\sigma,u,j}} + \frac{ (\beta_{\widetilde{k}_j}-\beta_{\widetilde{k}_j-1}) \sigma_k^2 }{\sigma_{\widetilde{k}_j-1} \s_{k_0} } - \beta_{k_1} }\right) + \| A(u_{k_1+1})-b\|^2. +%\qquad ( 2ab \le \theta a^2 + b^2/\theta ) +\label{eq:long chain broke .99} +\end{align} +The last sum above is not positive because +\begin{align} +& { \sum_{j=1}^{J_{\s, u}} \sum_{k\in K_{\sigma,u,j}} + \frac{ (\beta_{\widetilde{k}_j}-\beta_{\widetilde{k}_j-1}) \sigma_k^2 }{\sigma_{\widetilde{k}_j-1} \s_{k_0} } - \beta_{k_1} } \nonumber\\ +& = \sum_{k\in K_{\sigma,u}} (\beta_k-\beta_{k-1}) \frac{\s_k}{\s_{k_0}} - \beta_{k_1} +\qquad \text{(see \eqref{eq:sigma rule})} +\nonumber\\ +& \le \sum_{k\in K_{\sigma,u}} (\beta_k-\beta_{k-1}) - \beta_{k_1} +\qquad \text{(see \eqref{eq:consequences 2})} +\nonumber\\ +& \le \sum_{k=k_0}^{k_1} (\beta_k-\beta_{k-1}) - \beta_{k_1} \nonumber\\ +& \le - \b_{k_0}\le 0. 
+\label{eq:telescope} +\end{align} +We therefore bound the last line of \eqref{eq:long chain broke .99} as +\begin{align} +& +\sum_{k\in K_{\sigma,u}} \left( \sigma_k + \frac{\beta_k-\beta_{k-1}}{2} \right) \|A(u_k)-b\|^2 \nonumber\\ +& \qquad + \sum_{k\in K_{\sigma,u}} \sigma_k \|A(u_{k_1+1})-b\| \|A(u_k)-b\| - \frac{\beta_{k_1}}{2} \|A(u_{k_1+1})-b\|^2 \nonumber\\ +& \le \sum_{j=1}^{J_{\s, u}} \sum_{k\in K_{\sigma,u,j}} + \Bigg( + \left( \frac{\sigma_{\widetilde{k}_j-1 } }{(\beta_{\widetilde{k}_j}-\b_{\widetilde{k}_j -1})} +\frac{1}{2} \right) (\beta_k - \beta_{k-1}) \nonumber\\ +&\qquad \qquad \qquad \qquad \qquad + \frac{\sigma_{\widetilde{k}_j-1 } \s_{k_0}}{2(\beta_{\widetilde{k}_j}-\b_{\widetilde{k}_j -1})} + \Bigg) + \|A(u_k)-b\|^2 +\qquad \text{(see (\ref{eq:long chain broke .99},\ref{eq:telescope}))} +\nonumber\\ +& \le \sum_{j=1}^{J_{\s, u}} \sum_{k\in K_{\sigma,u,j}} + \Bigg( + \left( \frac{\sigma_{\widetilde{k}_{j-1} }}{(\beta_{\widetilde{k}_j}-\b_{\widetilde{k}_j -1})} +\frac{1}{2} \right) \left( \frac{ 2 \beta_{k_0} \log(k+1)}{\sqrt{k k_0} \log(k_0+1)}\right) \nonumber\\ +& \qquad \qquad \qquad \qquad \qquad + \frac{\sigma_{\widetilde{k}_{j-1} }\s_{k_0}}{2(\beta_{\widetilde{k}_j}-\b_{\widetilde{k}_j -1})} + \Bigg) + \|A(u_k)-b\|^2 +\qquad \text{(see \eqref{eq:consequences})} + \nonumber\\ +& \le \sum_{j=1}^{J_{\s, u}} \sum_{k\in K_{\sigma,u,j}} + \frac{\sigma_{\widetilde{k}_{j-1} } \s_{k_0}}{\beta_{\widetilde{k}_j}-\b_{\widetilde{k}_j -1}} + \|A(u_k)-b\|^2 . +\qquad (k_0 \gg 1) +\label{eq:long chain broke 1} +\end{align} +We next show that the coefficient in the last line above is bounded. Indeed, for $j\in J_{\s,u}$, first note that +\begin{align} + \frac{\s_{\widetilde{k}_{j-1}}}{\beta_{\widetilde{k}_j}-\beta_{\widetilde{k}_j-1}} +& = \frac{\s_{\widetilde{k}_{j}-1}}{\left( +\sqrt{\frac{\widetilde{k}_{j} \log^2(\widetilde{k}_{j}+1)}{(\widetilde{k}_{j}-1)\log^2 \widetilde{k}_{j} }} +-1 \right) \beta_{\widetilde{k}_j-1}} +\qquad \text{(see (\ref{eq:beta rule},\ref{eq:sigma rule}))} \nonumber\\ +& \le \frac{2\s_{\widetilde{k}_{j}-1}} +{\left( \frac{\widetilde{k}_j \log^2(\widetilde{k}_j+1 ) }{(\widetilde{k}_j - 1)\log^2 \widetilde{k}_j } -1 \right) \b_{\widetilde{k}_j-1}} +\qquad (k_0 \gg 1) \nonumber\\ +& \le \frac{2\s_{\widetilde{k}_{j}-1}} +{\left( \frac{\widetilde{k}_j }{\widetilde{k}_j - 1 } -1 \right) \b_{\widetilde{k}_j-1}} \nonumber\\ +& = \frac{2(\widetilde{k}_j -1) \s_{\widetilde{k}_{j}-1} }{ \b_{\widetilde{k}_j-1} }. +\label{eq:boundedness} +\end{align} +But the last line above is bounded. If, to the contrary, the last line above is unbounded, then the subsequences $\{\b_k,\s_k\}_{k\ge k_2}$ would never enter the second and third regimes in, respectively, (\ref{eq:beta rule},\ref{eq:sigma rule}) for a sufficiently large $k_2$ and, in that case, we must have that $\widetilde{k}_j \le k_2$ for every $j\in J_{\s,u}$. Therefore, $J_{\s,u}$ is a bounded set, which contradicts the unboundedness of the last line of \eqref{eq:boundedness}. 
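+In particular, after multiplying \eqref{eq:boundedness} by $\s_{k_0}$, the coefficient appearing in the last line of \eqref{eq:long chain broke 1} satisfies
+\begin{align*}
+\frac{\sigma_{\widetilde{k}_{j}-1} \s_{k_0}}{\beta_{\widetilde{k}_j}-\b_{\widetilde{k}_j -1}}
+\le \frac{2(\widetilde{k}_j -1) \s_{\widetilde{k}_{j}-1} \s_{k_0}}{ \b_{\widetilde{k}_j-1} },
+\qquad 1\le j\le J_{\s,u},
+\end{align*}
+and, by the argument above, the right-hand side is bounded over $j$.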
Let us then set
+\begin{align}
+\delta'_{k_0} := \max_{1\le j \le J_{\sigma,u}}
+\frac{\s_{\widetilde{k}_{j}-1} \s_{k_0}} {\b_{\widetilde{k}_{j}}-\b_{\widetilde{k}_{j}-1}},
+\label{eq:defn of delta}
+\end{align}
+and bound the last line of \eqref{eq:long chain broke 1} as
+\begin{align}
+& \sum_{k\in K_{\sigma,u}} \left( \sigma_k + \frac{\beta_k-\beta_{k-1}}{2} \right) \|A(u_k)-b\|^2 \nonumber\\
+& \le \sum_{j=1}^{J_{\s, u}} \sum_{k\in K_{\sigma,u,j}}
+ \frac{\sigma_{\widetilde{k}_{j}-1 } \s_{k_0}}{\beta_{\widetilde{k}_j}-\b_{\widetilde{k}_j -1}}
+ \|A(u_k)-b\|^2
+ \qquad \text{(see \eqref{eq:long chain broke 1})} \nonumber\\
+& \le \delta'_{k_0} \sum_{j=1}^{J_{\s, u}} \sum_{k\in K_{\sigma,u,j}}
+ \|A(u_k)-b\|^2\nonumber\\
+ & = \delta'_{k_0} \sum_{k\in K_{\s,u}} \|A(u_k)-b\|^2.
+ \label{eq:long chain broke 0}
+\end{align}
+By combining (\ref{eq:long chain broke 2},\ref{eq:long chain broke 3},\ref{eq:long chain broke 0}), we can finally simplify \eqref{eq:long chain broken}: when $k_0$ is sufficiently large, there exists a sufficiently large $\delta_{k_0}$, which depends only on $k_0$, such that
+\begin{align}
+& \sum_{k=k_0}^{k_1} \gamma_k \|G_k\|^2 \nonumber\\
+& \le 2\mu + \delta_{k_0} \sum_{k=k_0}^{k_1} \|A(u_k)-b\|^2 + c\s_{k_0}(\sigma_{k_0}+\s_{k_0}') \|A(u_{k_1+1})-b\|
+%\qquad
+%\left( \text{(\ref{eq:long chain broke 1},\ref{eq:long chain broke 2},\ref{eq:long chain broke 3}) and } k_0 \gg 1 \right)
+\nonumber\\
+& \le \mu' + \delta_{k_0} \sum_{k=k_0}^{k_1} \|A(u_k)-b\|^2.
+\label{eq:long chain}
+\end{align}
+In the last line above, we observed that
+\begin{align}
+\mu' := 2\mu + c
+\s_{k_0}(\s_{k_0}+\s_{k_0}') \max_{u\in C} \|A(u)-b\| < \infty,
+\label{eq:defn of mup}
+\end{align}
+thanks to the continuity of $A$ and the compactness of $C$.
+Note that \eqref{eq:long chain} bounds the gradient mapping by the feasibility gap. We next find a converse, thus bounding the feasibility gap by the gradient mapping. To that end, let $T_C(u)$ and $P_{T_C(u)}$ be the tangent cone of $C$ at $u\in C$ and the orthogonal projection onto this subspace, respectively. Likewise, let $N_C(u)$ and $P_{N_{C}(u)}$ be the normal cone of $C$ at $u$ and the corresponding orthogonal projection.
+The update rule for $u_k$ in \eqref{eq:update uk recall} immediately implies that
+\begin{align}
+G_k - \nabla h(u_k) - DA(u_k) ^\top y_k- \beta_k DA(u_k)^\top (A(u_k)-b) \in N_C(u_{k+1}).
+\label{eq:opt cnd of update}
+\end{align}
+By definition in \eqref{eq:grad map recall}, we have that $G_k \in T_C(u_{k+1})$ which, together with \eqref{eq:opt cnd of update}, implies that
+\begin{align}
+G_k &
+= P_{T_C(u_{k+1})} \left(
+- \nabla h(u_k) - DA(u_k) ^\top y_k- \beta_k DA(u_k)^\top (A(u_k)-b)
+ \right)
+\nonumber\\
+ &
+ = P_{T_C(u_{k+1})}(- \nabla h(u_k)) + P_{T_C(u_{k+1})}(- DA(u_k) ^\top y_k)
+\nonumber\\
+& \qquad + \beta_k P_{T_C(u_{k+1})} (
+ - DA(u_k)^\top (A(u_k)-b) ) \nonumber\\
+& = P_{T_C(u_{k+1})}(- \nabla h(u_k)) + P_{T_C(u_{k+1})}(- DA(u_k) ^\top y_{k-1})
+\nonumber\\
+& \qquad + \left(\beta_k+ \sigma_k \right) P_{T_C(u_{k+1})} (
+ - DA(u_k)^\top (A(u_k)-b) ).
+ \label{eq:geometric decopm of Gk}
+\end{align}
+In the second line above, we used the identity $P_T(a+b) = P_T(a)+P_T(b)$ for points $a,b$, which is valid here because, as above, the tangent cone $T=T_C(u_{k+1})$ is treated as a subspace, so that the projection onto it is linear. The last line above uses \eqref{eq:y update recall}, namely, $y_k=y_{k-1}+\sigma_k(A(u_k)-b)$.
+ After rearranging and applying the triangle inequality to the last line above, we reach +\begin{align} +& \beta_k\| P_{T_C(u_{k+1})}(DA(u_k)^\top (A(u_k)-b)) \| \nonumber\\ +& \le \left( \sigma_k+ \beta_k \right) \| P_{T_C(u_{k+1})} (DA(u_k)^\top (A(u_k)-b) ) \| +\nonumber\\ +& \le \|\nabla h(u_k)\| + \|DA(u_k) \| \cdot \|y_{k-1}\|+ \|G_k\| +\qquad \text{(see \eqref{eq:geometric decopm of Gk})} +\nonumber\\ +& \le \lambda '_h + \eta_{\max} \|y_{k-1}\|+ \|G_k\|, +\label{eq:bnd on Ak raw} +\end{align} +where we used the non-expansiveness of $P_{T_C(u_{k+1})}$ and also set +\begin{align} +\lambda'_h := \max_{u\in C} \| \nabla h(u)\|, +\qquad +\eta_{\max}= \max_{u\in C} \|DA(u)\|. +\label{eq:defn lambdap n etaMax} +\end{align} +The following result, proved in Appendix \ref{sec:proof of bnd Ak}, translates \eqref{eq:bnd on Ak raw} into an upper bound on $\|A(u_k)-b\|$. +\begin{lemma}\label{lem:bnd bnd Ak} +%Suppose that $C$ is sufficiently smooth, in the sense that there exists a constant $\tau_C$ such that +%\begin{equation} +%\|P_{T_C(u)} - P_{T_C(u')}\| \le \tau_C \|u-u'\|, +%\qquad \forall u,u'\in C. +%\label{eq:curvature} +%\end{equation} +For an integer $k_0$, consider a subspace $S_K\subseteq \mathbb{R}^d$ such that +\begin{align} +S_{K} \supseteq \bigcup_{k\in K} T_{C}(u_k), +\end{align} +and, with some abuse of notation, let $S_{K}$ also denote an orthonormal basis for this subspace. For $\rho>0$, let +\begin{align} + \eta_{\min } := +\begin{cases} +\min_{u,v} \, \left\| S_{K}^\top P_{T_C(u)} (DA(u)^\top v ) \right\| \\ +\|v\| =1\\ +\|A(u)-b\|\le \rho\\ +u\in C, +\end{cases} +\label{eq:new slater} +\end{align} +and assume that $\eta_{\min}>0$. +Suppose also that +\begin{align} +\sup_{k\in K }\|A(u_k)-b\| \le \rho, +\label{eq:good neighb} +\end{align} +\begin{align} +\operatorname{diam}(C) \le \frac{\eta_{\min}}{2\lambda_A}. +\label{eq:cnd for well cnd} +\end{align} +%\begin{align} +% \sup_{k\ge k_0} \gamma_k \|G_k\| \le \frac{\eta_{\min}}{2\lambda_A}. +%\label{eq:cnd for well cnd} +%\end{align} +%\begin{align} +%\sup_{k\ge k_0 }\frac{2\sigma_k}{\eta_{\min}} \left( \lambda'_h + \eta_{\max} \|y_{k-1}\|+ \|G_k\| \right) \le \rho. +%\label{eq:to be met asympt} +%\end{align} +Then it holds that +\begin{align} + \|A(u_k) -b\| \le \frac{2}{\eta_{\min}\beta_k} \left( \lambda'_h + \eta_{\max} \|y_{k-1}\|+ \|G_k\| \right) , +\qquad \forall k\in K. +\label{eq:bnd on Ak final} +\end{align} +\end{lemma} + + + +Roughly speaking, \eqref{eq:bnd on Ak final} states that the feasibility gap is itself bounded by the gradient map. In order to apply Lemma \ref{lem:bnd bnd Ak}, let us assume for now that (\ref{eq:good neighb}) holds. Recall also the partitioning of $K$ in \eqref{eq:partition beta}. 
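+In squared form, which is how it enters the next estimate, \eqref{eq:bnd on Ak final} reads
+\begin{align*}
+\|A(u_k)-b\|^2 \le \frac{4}{\eta_{\min}^2\b_k^2} \left( \lambda_h' + \eta_{\max} \|y_{k-1}\| + \|G_k\| \right)^2,
+\qquad \forall k\in K.
+\end{align*}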
+We may now substitute \eqref{eq:bnd on Ak final} back into \eqref{eq:long chain} to find that +\begin{align} +& \sum_{k=k_0}^{k_1} \gamma_k \|G_k\|^2 \nonumber\\ +& \le \sum_{k=k_0}^{{k_1}} \delta_{k_0} \|A(u_k)-b\|^2 + \mu' +\qquad \text{(see \eqref{eq:long chain})} +\nonumber\\ +& = \sum_{k\in K_{\beta,l}} \delta_{k_0} \|A(u_k)-b\|^2 + \sum_{k\in K_{\beta,u}} \delta_{k_0}\|A(u_k)-b\|^2 ++ \mu' \nonumber\\ +& \le +\sum_{k\in K_{\beta,l}} \frac{\b_{k_0}\delta_{k_0}}{ k\log^2 (k+1)} ++ \sum_{k\in K_{\beta,u}} \delta_{k_0} \|A(u_k)-b\|^2 ++\mu' +\qquad \text{(see \eqref{eq:beta rule})} +\nonumber\\ +& \le +\sum_{k=k_0}^{k_1} \frac{\b_{k_0}\delta_{k_0}}{ k\log^2 (k+1)} ++ \sum_{k\in K_{\beta,u}}\delta_{k_0} \|A(u_k)-b\|^2 ++\mu' +\nonumber\\ +& \le +c\b_{k_0}\delta_{k_0} ++ \sum_{k\in K_{\beta,u}} +\frac{4\delta_{k_0}}{\eta_{\min}^2\b_k^2} \left( \lambda_h' + \eta_{\max} \|y_{k-1}\| + \|G_k\| \right)^2 ++\mu' , +\label{eq:adaptive 1} +\end{align} +where the last line above uses (\ref{eq:defn of c},\ref{eq:bnd on Ak final}). +Consider the partitioning +\begin{align} +K_{\beta,u} = \cup_{j=1}^{J_\b} K_{\beta,u,j}, +\label{eq:partition on beta} +\end{align} +where $\{K_{\beta,u,j}\}_{j=1}^{J_{\b}}$ are disjoint intervals, with the starting points $\{k_{\beta,u,j}\}_{j=1}^{J_{\b,u}}$. With the partitioning in \eqref{eq:partition on beta} at hand and after recalling \eqref{eq:beta rule}, we simplify the last line of \eqref{eq:adaptive 1} as +\begin{align} +& \sum_{k=k_0}^{k_1} \gamma_k \|G_k\|^2 \nonumber\\ +& \le \sum_{k=k_0}^{k_1} \delta_{k_0}\|A(u_k)-b\|^2 + + \mu'\nonumber\\ +& \le c\b_{k_0}\delta_{k_0} + \sum_{j=1}^{J_\b} +\frac{4\delta_{k_0} k_{\b,u,j} \log^2(k_{\b,u,j}+1)}{\eta_{\min}^2\b^2_{k_{\b,u,j}}} +\nonumber\\ +& \qquad \qquad\qquad \cdot \sum_{k\in K_{\beta,r,j}} \frac{1}{k\log^2(k+1)} + \left( \lambda_h' + \eta_{\max} \|y_{k-1}\| + \|G_k\| \right)^2 ++ \mu', + \label{eq:adaptive 2} +\end{align} +where the last line uses \eqref{eq:consequences}. +The inner sum in the last line above can in turn be bounded as +\begin{align} +& \sum_{k\in K_{\beta,r,j}} \frac{1}{k\log^2(k+1)} + \left( \lambda_h' + \eta_{\max} \|y_{k-1}\| + \|G_k\| \right)^2 \nonumber\\ +& \le \sum_{k\in K_{\beta,r,j}} \frac{3}{k\log^2(k+1)} + \left( \lambda_h'^2 + \eta_{\max}^2 \|y_{k-1}\|^2 + \|G_k\|^2 \right)\nonumber\\ + & \le 3c\lambda_h'^2 + 3 \eta_{\max}^2 B_K + \sum_{k\in K_{\beta,u,j}} \frac{3\|G_k\|^2}{k\log(k+1)}, + \qquad \text{(see \eqref{eq:defn of c})} + \label{eq:adaptive 3} +\end{align} +where the second line above uses the inequality $(a+b+c)^2 \le 3a^2+3b^2+3c^2$ and we conveniently set +\begin{align} +B_K := \sum_{k=k_0}^{k_1} \frac{\|y_{k-1}\|^2}{k \log^2(k+1)}. +\label{eq:defn of B} +\end{align} +Substituting \eqref{eq:adaptive 3} back into \eqref{eq:adaptive 2}, we find that +\begin{align} +& \sum_{k=k_0}^{k_1} \gamma_k \|G_k\|^2 \nonumber\\ +& \le\sum_{k=k_0}^{k_1} \delta_{k_0} \|A(u_k)-b\|^2 + + \mu' +\nonumber\\ +& \le c\b_{k_0}\delta_{k_0}+ +\frac{12 c \b'_{k_0}\delta_{k_0} \lambda'^2_h}{\eta_{\min}^2} ++ +\frac{12 \b'_{k_0}\delta_{k_0} \eta_{\max}^2 B_K}{\eta_{\min}^2} \nonumber\\ +& \qquad + +\sum_{k\in K} \frac{12 \beta_{k_0}' \delta_{k_0} \|G_k\|^2}{ \eta_{\min}^2 k\log^2(k+1)} ++ +\mu', +\qquad \text{(see \eqref{eq:adaptive 2})} +\label{eq:to be used for feas} +\end{align} +where +\begin{align} +\beta'_{k_0} := \sum_{j=1}^{J_\b} \frac{k_{r,j} \log^2(k_{r,j}+1)}{ \beta_{k_{\b,r,j}}^2}. 
+\label{eq:defn of kappa}
+\end{align}
+Indeed, $\b'_{k_0}$ above is finite (and depends only on $k_0$). If, to the contrary, the series on the right-hand side of \eqref{eq:defn of kappa} diverged, then there would exist a large enough $k_2$ such that the subsequence $\{\b_{k} \}_{k\ge k_2}$ remains in the first regime of \eqref{eq:beta rule}. Consequently, $J_\b$ would be finite, and so would be the right-hand side of \eqref{eq:defn of kappa}, contradicting the assumed divergence.
+To simplify the last line of \eqref{eq:to be used for feas}, let us assume that
+\begin{equation}
+\frac{12 \beta_{k_0}'\delta_{k_0}}{\eta_{\min}^2 k \log^2(k+1)} \le \frac{\gamma_k}{2},
+\qquad \forall k\in K.
+\label{eq:to be used later on}
+\end{equation}
+We also let $|K|=k_1-k_0+1$ denote the size of the interval $K$.
+After rearranging \eqref{eq:to be used for feas} and applying \eqref{eq:to be used later on}, we arrive at
+\begin{align}
+& \sum_{k=k_0}^{k_1} \frac{\gamma_k}{2} \|G_k\|^2 \nonumber\\
+& \le
+\sum_{k=k_0}^{k_1} \left(\gamma_k - \frac{12 \beta_{k_0}'\delta_{k_0}}{\eta_{\min}^2 k \log^2(k+1)} \right)\|G_k\|^2
+\qquad \text{(see \eqref{eq:to be used later on})}
+ \nonumber\\
+ & \le c\b_{k_0}\delta_{k_0} +
+\frac{12 c \b'_{k_0}\delta_{k_0} \lambda'^2_h}{\eta_{\min}^2}
++
+\frac{12 \b'_{k_0}\delta_{k_0} \eta_{\max}^2 B_K}{\eta_{\min}^2}
++\mu'.
+\qquad \text{(see \eqref{eq:to be used for feas})}
+\label{eq:final bnd on G}
+\end{align}
+In turn, the bound above on the gradient mapping controls the feasibility gap, namely,
+\begin{align}
+& \sum_{k=k_0}^{k_1} \|A(u_k)-b\|^2 \nonumber\\
+& \le c\b_{k_0} + \frac{12c \b_{k_0}' \lambda'^2_h}{\eta_{\min}^2} + \frac{12\b_{k_0}' \eta_{\max}^2 B_K}{\eta_{\min}^2}
++ \sum_{k\in K} \frac{12\b_{k_0}' \|G_k\|^2}{\eta_{\min}^2 k \log^2(k+1)}
+\qquad \text{(see \eqref{eq:to be used for feas})}
+\nonumber\\
+& \le c\b_{k_0} + \frac{12c \b_{k_0}' \lambda'^2_h}{\eta_{\min}^2} + \frac{12\b_{k_0}' \eta_{\max}^2 B_K}{\eta_{\min}^2}
++ \frac{1}{2\delta_{k_0}} \sum_{k\in K} \gamma_k \|G_k\|^2
+\qquad \text{(see \eqref{eq:to be used later on})}
+\nonumber\\
+& \le 2c\b_{k_0} + \frac{24c \b_{k_0}' \lambda'^2_h}{\eta_{\min}^2} + \frac{24\b_{k_0}' \eta_{\max}^2 B_K}{\eta_{\min}^2}
++ \frac{\mu'}{\delta_{k_0}}.
+\qquad \text{(see \eqref{eq:final bnd on G})}
+\label{eq:feas bnd semi final}
+\end{align}
+%\subsection{Estimating $B_K$}
+ In order to interpret (\ref{eq:final bnd on G},\ref{eq:feas bnd semi final}), we next estimate $B_{K}$, as defined in \eqref{eq:defn of B}. To that end, let us first control the growth of the dual sequence $\{y_k\}_k$. Recalling \eqref{eq:y update recall}, for every $k\in K$ we write that
+\begin{align}
+\|y_k\| & \le \|y_{k_0}\|+ \sum_{i=k_0+1}^k \sigma_i \|A(u_i)-b\|
+\qquad \text{(see \eqref{eq:y update recall})}
+\nonumber\\
+& \le \|y_{k_0}\|+ \sum_{i\in K_{\sigma,l}} \sigma_i \|A(u_i)-b\|
++ \sum_{i\in K_{\sigma,m}} \sigma_i \|A(u_i)-b\|
+\nonumber\\
+& \qquad + \sum_{i\in K_{\sigma,u}} \sigma_i \|A(u_i)-b\|
+\qquad \text{(see \eqref{eq:partition sigma})}
+ \nonumber\\
+ & \le \|y_{k_0}\|+ \sum_{i\in K_{\s,l}} \frac{\s_{k_0}^2 }{i\log^2(i+1)}
+ + \sum_{i\in K_{\sigma,m}} \sigma_i \|A(u_i)-b\| \nonumber\\
+& \qquad + \sum_{i\in K_{\sigma,u}} \sigma_i \|A(u_i)-b\|
+\qquad \text{(see \eqref{eq:sigma rule})}
+\nonumber\\
+& \le \|y_{k_0}\|+c\s_{k_0}^2
++ \sum_{i\in K_{\sigma,m}} \sigma_i \|A(u_i)-b\| \nonumber\\
+& \qquad + \sum_{i\in K_{\sigma,u}} \sigma_i \|A(u_i)-b\|.
+\qquad \text{(see \eqref{eq:defn of c})} +\label{eq:dual growth 1} +\end{align} +To evaluate the sum over $K_{\s,m}$ in the last line above, recall the partitioning $K_{\s,m}=\cup_{j=1}^{J_{\s,m}} K_{\s,m,j}$ in \eqref{eq:another partition}. Recall also that each $K_{\s,m,j}$ is an interval with the starting point $k_{\s,m,j}$. This allows us to write that +\begin{align} +& \sum_{i\in K_{\s,m}} \s_i \|A(u_i)-b\| \nonumber\\ + & = \sum_{j=1}^{J_{\s,m}} \sum_{i\in K_{\s,m,\j}} \s_i \|A(u_i)-b\| +\qquad \text{(see \eqref{eq:another partition})} + \nonumber\\ +& \le + \sum_{j=1}^{J_{\s,m}} \sum_{i\in K_{\s,m,\j}} \frac{\s_{k_0}\s_i }{\sqrt{i\log^2(i+1)} } +\qquad \text{(see \eqref{eq:sigma rule})} + \nonumber\\ +& = \sum_{j=1}^{J_{\s,m}} \s_{k_0}\s_{k_{\s,m,j}} \sqrt{k_{\s,m,j}} \log(k_{\s,m,j}+1) \sum_{i\in K_{\s,m,\j}} \frac{1}{i\log^2(i+1)} +\qquad \text{(see \eqref{eq:sigma rule})} +\nonumber\\ +& \le c \s_{k_0}\s'_{k_0}. +\qquad \text{(see (\ref{eq:defn of c},\ref{eq:defn of sigmap}))} +\label{eq:dual growth 2} +\end{align} +To bound the last sum in \eqref{eq:dual growth 1}, we write that +\begin{align} +& \sum_{i\in K_{\s,u}} \s_i \|A(u_i) - b\|\nonumber\\ +& = \sum_{i\in K_{\s,u},\, i \le k_{\max}} \s_i \|A(u_i) - b\| + \sum_{i\in K_{\s,m},\, i > k_{\max}} \s_i \|A(u_i) - b\| \nonumber\\ +& \le \sum_{i\in K_{\s,u},\, i \le k_{\max}} \s_i \|A(u_i) - b\| ++ \sum_{i\in K_{\s,m},\, i > k_{\max}} \frac{1}{i \log^2(i+1)} \nonumber\\ +& \le \sum_{i\in K_{\s,u},\, i \le k_{\max}} \s_i \|A(u_i) - b\| + c, +\label{eq:dual growth 2.5} +\end{align} +where the last line follows from the updates. To control the remaining sum in the last line above, we write that +\begin{align} +& \sum_{i\in K_{\s,u},\, i\le k_{\max}} \s_i \|A(u_i)-b\| +\nonumber\\ & +\le \frac{\delta'_{k_0}}{\s_{k_0}} \sum_{i\in K_{\s,u},\, i\le k_{\max}} (\b_i- \b_{i-1})\|A(u_i)-b\| +\qquad \text{(see (\ref{eq:sigma rule},\ref{eq:defn of delta}))} + \nonumber\\ +& \le \frac{\delta'_{k_0} \rho }{\s_{k_0}} \sum_{i\in K_{\s,u},\, i\le k_{\max}} (\b_i- \b_{i-1}) +\qquad \text{(see \eqref{eq:good neighb})} \nonumber\\ +&\le \frac{\delta'_{k_0} \rho }{\s_{k_0}} \sum_{i\in K_{\s,u},\, i\le k_{\max}} \frac{2\b_{k_0} \log i}{\sqrt{(i-1)k_0} \log(k_0+1)} +\qquad \text{(see \eqref{eq:consequences})}\nonumber\\ +& \le +\frac{4 \b_{k_0} \delta'_{k_0}\rho \log k_{\max} }{\s_{k_0} } \sqrt{\frac{k_{\max}}{k_0}}. +\qquad \left( \sum_{i=1}^{k} \frac{1}{\sqrt{i}} \le \int_{0}^{k} \frac{dx}{\sqrt{x}} = 2 \sqrt{k} \right) +\label{eq:dual growth 3} +\end{align} +By combining (\ref{eq:dual growth 2.5},\ref{eq:dual growth 3}), we reach +\begin{align} +& \sum_{i\in K_{\s,u}} \s_i \|A(u_i) - b\|\nonumber\\ +& \le \frac{4 \b_{k_0} \delta'_{k_0}\rho \log k_{\max} }{\s_{k_0} } \sqrt{\frac{k_{\max}}{k_0}}+ c. +\end{align} +Finally, by combining (\ref{eq:dual growth 1},\ref{eq:dual growth 2},\ref{eq:dual growth 3}), we find that +\begin{align} +\|y_k\| & \le \|y_{k_0}\|+ c \s_{k_0}(\s_{k_0}+\s'_{k_0}) \nonumber\\ +& \qquad + \frac{4 \b_{k_0} \delta'_{k_0}\rho \log(k_{\max}+1) }{\s_{k_0} } \sqrt{\frac{k_{\max}}{k_0}} +c =: y_{\max}, +\label{eq:dual growth} +\end{align} +for every $k\in K$. +Equipped with \eqref{eq:dual growth}, we can now finally evaluate $B_{K}$ as +\begin{align} +B_{K} & = \sum_{k=k_0-1}^{{k_1-1}} \frac{ \|y_{k}\|^2 }{(k+1)\log^2( k+2)} +\qquad \text{(see \eqref{eq:defn of B})} +\nonumber\\ +& \le \sum_{k=k_0-1}^{k_1-1} \frac{y_{\max}^2}{(k+1)\log^2(k+2)} +\qquad \text{(see \eqref{eq:dual growth})} +\nonumber\\ +& \le c y_{\max}^2. 
+\qquad \text{(see \eqref{eq:defn of c})} +\label{eq:Bk evaluated} +\end{align} +Let us now revisit and simplify the condition imposed in (\ref{eq:good neighb}). +To that end, we first derive a weaker but uniform bound on the feasibility gap. For every $k\in K$, it indeed holds that +\begin{align} +& \|A(u_k)-b\|^2 \nonumber\\ +& \le \sum_{i=k_0}^{k_1} \|A(u_i)-b\|^2 +\nonumber\\ +& \le 2c\b_{k_0} + \frac{24c \b_{k_0}' \lambda'^2_h}{\eta_{\min}^2} + \frac{24\b_{k_0}' \eta_{\max}^2 B_K}{\eta_{\min}^2} ++ \frac{\mu'}{\delta_{k_0}} +\qquad \text{(see \eqref{eq:feas bnd semi final})} \nonumber\\ +& \le 2c\b_{k_0} + \frac{24c \b_{k_0}' \lambda'^2_h}{\eta_{\min}^2} + \frac{24c\b_{k_0}' \eta_{\max}^2 y_{\max}^2}{\eta_{\min}^2} ++ \frac{\mu'}{\delta_{k_0}}. +\qquad \text{(see \eqref{eq:Bk evaluated})} +\label{eq:rate of feas gap} +\end{align} +We may therefore replace \eqref{eq:good neighb} with the assumption that +\begin{equation} +2c\b_{k_0} + \frac{24c \b_{k_0}' \lambda'^2_h}{\eta_{\min}^2} + \frac{24c\b_{k_0}' \eta_{\max}^2 y_{\max}^2}{\eta_{\min}^2} ++ \frac{\mu'}{\delta_{k_0}}\le \rho^2, +\label{eq:suff cnd on rho} +\end{equation} +which ensures that \eqref{eq:good neighb} holds. We emphasize that the left-hand side above is bounded, as proved in (\ref{eq:long chain},\ref{eq:defn of kappa}). + +\begin{remark} +Note that \eqref{eq:suff cnd on rho} controls the initial feasibility $\|A(u_{k_0})-b\|$, which indeed appears in (\ref{eq:defn mu},\ref{eq:defn of mup}). +\end{remark} +By combining (\ref{eq:final bnd on G},\ref{eq:feas bnd semi final},\ref{eq:Bk evaluated}), we reach +\begin{align} +& \sum_{k=k_0}^{k_1} \gamma_k \|G_k\|^2 + \|A(u_k)-b\|^2 \nonumber\\ +&\le 2c\b_{k_0}(\delta_{k_0}+1) ++ +\frac{24\b_{k_0}'\lambda'^2_h}{\eta_{\min}^2}(\delta_{k_0}+1) +\nonumber\\ +& \qquad \qquad + +\frac{24\b_{k_0}'\eta_{\max}^2 B_K}{\eta_{\min}^2}(\delta_{k_0}+1) ++\frac{\mu' }{\delta_{k_0}}\left( 2\delta_{k_0}+1 \right) + \nonumber\\ +& \le 2c\b_{k_0}(\delta_{k_0}+1) ++ +\frac{24\b_{k_0}'\lambda'^2_h}{\eta_{\min}^2}(\delta_{k_0}+1) +\nonumber\\ +& \qquad \qquad + +\frac{24c\b_{k_0}'\eta_{\max}^2 y_{\max}^2}{\eta_{\min}^2}(\delta_{k_0}+1) ++\frac{\mu' }{\delta_{k_0}}\left( 2\delta_{k_0}+1 \right), +\label{eq:raw bnd on sum} +\end{align} +where the last line above uses \eqref{eq:Bk evaluated}. +To better interpret \eqref{eq:raw bnd on sum}, we next find a lower bound on the primal step sizes $\{\gamma_k\}_k$ by invoking \eqref{eq:moreover}. To do so, we first need to gauge how smooth the augmented Lagrangian $\mathcal{L}_{\beta_k}(\cdot,y_k)$ is (as a function of its first argument). 
For every $k\in K$ and after recalling Lemma \ref{lem:11}, we note that +\begin{align} +\lambda_{\beta_k} +& \le \lambda_h+ \sqrt{m}\lambda_A \left( \|y_k\| + \beta_k \|A(u_k)-b\| \right)+ \beta_k \|DA(u_k)\|_F^2 +\qquad \text{(see \eqref{eq:smoothness of Lagrangian})} \nonumber\\ +& \le \lambda_h+ \sqrt{m}\lambda_A \left( y^{\max} + \beta_k \rho \right) + +\beta_k m \eta_{\max}^2 +\qquad \text{(see (\ref{eq:defn lambdap n etaMax},\ref{eq:good neighb},\ref{eq:dual growth}))} \nonumber\\ +& = \lambda_h+\sqrt{m}\lambda_A y_{\max} + +\beta_k \left( \sqrt{m}\lambda_A \rho + m \eta_{\max}^2 \right) +\nonumber\\ +& \le \lambda_h + \sqrt{m}\lambda_A y_{\max} + \b_{k_0} \sqrt{\frac{k \log^2(k+1)}{k_0 \log^2(k_0+1)}} (\sqrt{m}\lambda_A\rho + m \eta_{\max}^2 ) +\quad \text{(see \eqref{eq:consequences})} \nonumber\\ +& \le 2\b_{k_0} \sqrt{\frac{k \log^2(k+1)}{k_0 \log^2(k_0+1)}} (\sqrt{m}\lambda_A\rho + m \eta_{\max}^2 ), +\label{eq:smoothness at k} +\end{align} +where the last line holds if $k_0$ is sufficiently large. We are now in position to invoke \eqref{eq:moreover} and write that +\begin{align} +\gamma_k & \ge \frac{\theta}{\lambda_{\beta_k}} \qquad \text{(see \eqref{eq:moreover})}\nonumber\\ +& \ge \frac{\theta }{2\beta_{k_0}( \sqrt{m}\lambda_A \rho + 2m \eta_{\max}^2) } + \sqrt{\frac{k_0 \log^2(k_0+1)}{k \log^2(k+1)}} +\qquad \text{(see \eqref{eq:smoothness at k})} \nonumber\\ +& =: \gamma' \sqrt{\frac{k_0 \log^2 (k_0+1) }{ k \log^2(k+1)}}, +\label{eq:low bnd on gammas} +\end{align} +for every $k\in K$. +The lower bound on the step sizes in \eqref{eq:low bnd on gammas} has two immediate consequences. First, we find that \eqref{eq:to be used later on} automatically holds if $k_0$ is sufficiently large, because both $\delta_{k_0}$ and $\b'_{k_0}$ are bounded, respectively, as (\ref{eq:defn of delta},\ref{eq:defn of kappa}). Second, \eqref{eq:low bnd on gammas} allows us to better interpret \eqref{eq:raw bnd on sum}, namely, +\begin{align} +& \sum_{k=k_0}^{k_1} \gamma' \sqrt{\frac{k_0 \log^2 (k_0+1) }{ k \log^2(k+1)}} \|G_k\|^2 +\|A(u_k)-b\|^2 \nonumber\\ +& \le \sum_{k=k_0}^{k_1}\gamma_k \|G_k\|^2 +\|A(u_k)-b\|^2 +\qquad \text{(see (\ref{eq:low bnd on gammas}))} +\nonumber\\ +& \le + 2c\b_{k_0}(\delta_{k_0}+1) ++ +\frac{24\b_{k_0}'\lambda'^2_h}{\eta_{\min}^2}(\delta_{k_0}+1) +\nonumber\\ +& \qquad \qquad + +\frac{24c\b_{k_0}'\eta_{\max}^2 y_{\max}^2}{\eta_{\min}^2}(\delta_{k_0}+1) ++\frac{\mu' }{\delta_{k_0}}\left( 2\delta_{k_0}+1 \right), +\qquad \text{(see \eqref{eq:raw bnd on sum})} +\label{eq:detailed final bnd} +\end{align} +which immediately yields that +\begin{align} +\min_{k\in K} \gamma' \sqrt{\frac{k_0 \log^2(k_0+1)}{k_1\log^2(k_1+1)}} \|G_k\|^2+ \|A(u_k)-b\|^2 \lesssim \frac{1}{|K|}, +\end{align} +where $\gamma'$ is independent of $k_1$ and $\lesssim$ suppresses the dependence on all parameters except $k_1$. + + + + + + + +\begin{theorem} +\label{thm:main} +Let $\gamma_k$ be the output of the line search subroutine in our algorithm in iteration $k$. +For integers $k_0\le k_1$, consider the interval $K=[k_0:k_1]$ and the choice of parameters in (\ref{eq:beta rule},\ref{eq:sigma rule}) for $\s_{k_0}\ge \b_{k_0}>0$ and an integer $k_{\max}$. We impose the following geometric requirements on the constraints. Let $P_{T_C(u)}$ and $P_{N_C(u)}$ denote the orthogonal projection onto the tangent and normal cones at $u\in C$, respectively. 
+Consider a subspace $S_{K}\subseteq \mathbb{R}^d$ such that
+\begin{align}
+S_{K} \supseteq \bigcup_{k\in K} T_C(u_k),
+\end{align}
+and, with some abuse of notation, let $S_{K}$ also denote an orthonormal basis for this subspace. For $\rho>0$, we assume that the nonconvex Slater's condition holds. That is, assume that $\eta_{\min}>0$, where
+\begin{align}
+ \eta_{\min} :=
+\begin{cases}
+\min_{u,v} \, \left\| S_{K}^\top P_{T_C(u)} (DA(u)^\top v) \right\| \\
+\|v\|=1\\
+\|A(u)-b\|\le \rho\\
+u\in C.
+\end{cases}
+\end{align}
+Suppose that
+\begin{align}
+\operatorname{diam}(C) \le \frac{\eta_{\min}}{2\lambda_A},
+\end{align}
+and that $\rho$ is large enough so that (\ref{eq:suff cnd on rho}) holds, where the left-hand side is independent of $k_1$. Assuming that $k_0\gg 1$, it then holds that
+\begin{align}
+\min_{k\in K} \gamma' \sqrt{\frac{k_0 \log^2(k_0+1)}{k_1\log^2(k_1+1)}} \|G_k\|^2+ \|A(u_k)-b\|^2 \lesssim \frac{1}{k_1-k_0},
+\label{eq:final thm}
+\end{align}
+where $\gamma'$ is independent of $k_1$ and $\lesssim$ suppresses the dependence on all parameters except $k_1$. The exact expression for the right-hand side of (\ref{eq:final thm}) is given in (\ref{eq:detailed final bnd}). The factor $\gamma'$ is defined in (\ref{eq:low bnd on gammas}).
+\end{theorem}
+
diff --git a/tex/Paper/sections/appendix.tex b/tex/Paper/sections/appendix.tex
new file mode 100644
index 0000000..1c96a4a
--- /dev/null
+++ b/tex/Paper/sections/appendix.tex
@@ -0,0 +1,65 @@
+\section{Proof of Lemma \ref{lem:bnd bnd Ak} \label{sec:proof of bnd Ak}}
+If $A(u_k)=b$, then \eqref{eq:bnd on Ak final} holds trivially. Otherwise,
+%we show that $P_{T_C(u_{k+1})} DA(u_k)^\top$ is a well-conditioned matrix, in order to lower bound \eqref{eq:bnd on Ak raw}. More specifically,
+for an integer $k_0$, consider a subspace
+\begin{align}
+S_{K} \supseteq \bigcup_{k\in K} T_{C}(u_k),
+\label{eq:defn of S}
+\end{align}
+and let $S_{K}$ with orthonormal columns denote a basis for this subspace, with some abuse of notation. We then assume that
+\begin{align}
+0 < \eta_{\min} :=
+\begin{cases}
+\min_{u,v} \, \left\| S_{K}^\top P_{T_C(u)} ( DA(u)^\top v ) \right\| \\
+\|v\|=1\\
+\|A(u)-b\|\le \rho\\
+u\in C.
+\end{cases}
+\label{eq:new slater proof}
+\end{align}
+If $ \max_{k\in K} \|A(u_k)-b\| \le \rho $,
+then \eqref{eq:new slater proof} is in force and, for every $k\in K$, we may write that
+\begin{align}
+& \left\| P_{T_C(u_{k+1})}( DA(u_{k})^\top (A(u_k)-b) ) \right\| \nonumber\\
+& \ge \left\| P_{T_C(u_{k+1})} ( DA(u_{k+1})^\top (A(u_k)-b) ) \right\| - \left\| ( DA(u_{k+1}) - DA(u_k) )^\top (A(u_k)-b) \right\|
+\nonumber\\
+& \ge \eta_{\min} \|A(u_k)-b\| - \left\| DA(u_{k+1}) - DA(u_k) \right\| \|A(u_k)-b\|
+\qquad \text{(see \eqref{eq:new slater proof})}
+\nonumber\\
+& \ge \eta_{\min} \|A(u_k)-b\| - \lambda_A \|u_{k+1}-u_k\| \cdot \|A(u_k)-b\|
+\qquad \text{(see \eqref{eq:smoothness basic})}
+\nonumber\\
+& = \left( \eta_{\min} -\lambda_A \gamma_{k} \|G_{k}\| \right) \|A(u_k)-b\|
+\qquad \text{(see \eqref{eq:grad map recall})} \nonumber\\
+& \ge \frac{\eta_{\min}}{2} \|A(u_k)-b\|,
+\label{eq:well cnd}
+\end{align}
+where the second line above uses the triangle inequality and the non-expansiveness of the projection operator. The last line above uses the observation that
+\begin{align}
+\lambda_A\gamma_k\|G_k\| & = \lambda_A \|u_{k+1}-u_k\| \nonumber\\
+& \le \lambda_A \operatorname{diam}(C) \nonumber\\
+& \le \frac{\eta_{\min}}{2}.
+\qquad \text{(see \eqref{eq:cnd for well cnd})} +\end{align} +We can now lower bound \eqref{eq:bnd on Ak raw} by using \eqref{eq:well cnd}, namely, +\begin{align} + \frac{\eta_{\min}\beta_k}{2} \|A(u_{k})-b\|& + \le \beta_{k}\| P_{T_C(u_{k+1})} ( DA(u_{k})^\top (A(u_{k})-b) )\| +\qquad \text{(see \eqref{eq:well cnd})} + \nonumber\\ + & \le \lambda_h' + \eta_{\max} \|y_{k-1}\|+ \|G_{k}\|. +\qquad \text{(see \eqref{eq:bnd on Ak raw})} +% \nonumber\\ +%& \le \frac{ \eta_{\min}\rho}{2\sigma_{k}}, +% \qquad \text{(see \eqref{eq:to be met asympt})} +% +\end{align} +%The step of the induction is proved similarly by replacing $k_0$ above with $k$. +which completes the proof of Lemma \ref{lem:bnd bnd Ak}. + + + + + + + diff --git a/tex/Paper/sections/nonadaptive_theory.tex b/tex/Paper/sections/nonadaptive_theory.tex new file mode 100644 index 0000000..2781b81 --- /dev/null +++ b/tex/Paper/sections/nonadaptive_theory.tex @@ -0,0 +1,444 @@ +\section{Proof of Nonadaptive Theorem} +For convenience, let us recall the updates of the algorithm in iteration $k$, namely, +\begin{align} +u_{k+1} & = P_C( u_k - \gamma_k \nabla \mathcal{L}_{\beta_k} (u_k,y_k) ) +\nonumber\\ +& = P_C\left( u_k - \gamma_k \nabla h(u_k) - \gamma_k DA(u_k) ^\top \left( y_k + \frac{A(u_k)-b}{\beta_k} \right) \right), +\qquad \text{(see \eqref{eq:Lagrangian})} +\label{eq:update uk recall} +\end{align} +\begin{align} +y_{k+1} =y_k + \frac{A(u_{k+1})-b}{\sigma_{k+1}}, +\label{eq:y update recall} +\end{align} +\begin{equation} +G_k = G_{\beta_k,\gamma_k}(u_k,y_k) = \frac{u_k-u_{k+1}}{\gamma_k}. +\qquad \text{(see \eqref{eq:grad map})} +\label{eq:grad map recall} +\end{equation} +For integers $k_0\le k_1$, consider the interval +\begin{equation} + K = [k_0:k_1]=\{k_0,\cdots,k_1\}. + \end{equation} +Since $\gamma_k$ is determined by the line search subroutine in Lemma \ref{lem:eval Lipsc cte}, we may now apply Lemma \ref{lem:11} for every iteration in this interval to find that +\begin{align} + \frac{\gamma_k \|G_k\|^2}{2} +& \le \mathcal{L}_{\beta_k}(u_k,y_k) - \mathcal{L}_{\beta_k}(u_{k+1},y_k) +\qquad \text{(see Lemma \ref{lem:11})} \nonumber\\ +& = h(u_k) - h(u_{k+1})+ \langle A(u_k)-A(u_{k+1}) , y_k \rangle+ \frac{\|A(u_k)-b\|^2 - \|A(u_{k+1})-b\|^2}{2\beta_k}, +\qquad \text{(see \eqref{eq:Lagrangian})} +\label{eq:smoothness rep} +\end{align} +for every $k\in K$. +On the other hand, +\begin{align} +{y_k = y_{k_0-1} + \sum_{i=k_0}^k \frac{A(u_i)-b}{\sigma_i}}, +\qquad \text{(see \eqref{eq:y update recall})} +\label{eq:y update unfolded} +\end{align} +which, after substituting in \eqref{eq:smoothness rep}, yields that +\begin{align} +\frac{\gamma_k \|G_k\|^2}{2} & \le h(u_k) - h(u_{k+1}) + \left\langle A(u_k) - A(u_{k+1}) , y_{k_0} + \sum_{i=k_0+1}^k \frac{A(u_i)-b}{\sigma_i} \right\rangle + +\frac{\|A(u_k)-b\|^2-\|A(u_{k+1})-b\|^2}{2\beta_k}. + \label{eq:key ineq} +\end{align} +%Additionally, let us assume that +%\begin{align} +%\beta_k = \frac{\beta}{\sqrt{k}}, +%\qquad +%\sigma_k = \beta k, +%\qquad \forall k \in K, +%\label{eq:beta n sigma} +%\end{align} +%with $\beta>0$. For the record, the above assumptions imply that +%\begin{align} +%\frac{1}{\beta_k}- \frac{1}{\beta_{k-1}} \le \frac{1 }{\beta \sqrt{k}}, +%\qquad +%\frac{1}{\sigma_k} \le \frac{1}{\beta \sqrt{k}}, +%\qquad \forall k\in K, +%\label{eq:consequences} +%\end{align} +%for sufficiently large $k_0$. 
+ +{Additionally, let us assume that, for $\beta>0$, +\begin{align} +\frac{1}{\beta_k}- \frac{1}{\beta_{k-1}} \le \frac{1 }{\beta \sqrt{k}}, +\qquad +\frac{1}{\sigma_k} \le \frac{1}{\beta \sqrt{k}}, +\qquad \forall k\in K, +\label{eq:consequences} +\end{align} +for sufficiently large $k_0$. + +For example, following choices satisfy the requirements: + +\begin{align} +\beta_k = \frac{\beta}{{k}^\alpha} \text{ where } \alpha \leq 1/2, +\qquad +%\sigma_k \geq \beta \sqrt{k}, +\sigma_k \geq \max(\beta \sqrt{k}, \beta k \| Au_k - b \|), +\qquad \forall k \in K, +\label{eq:beta n sigma} +\end{align} +} + +By summing up the key inequality in \eqref{eq:key ineq} over $k$ from $k_0$ to $k_1$ and using \eqref{eq:beta n sigma}, we find that +\begin{align} +& \sum_{k=k_0}^{k_1} \frac{\gamma_k \|G_k\|^2}{2} \nonumber\\ +& \le h(u_{k_0}) - h(u_{k_1+1}) + \langle A(u_{k_0}) - A(u_{k_1+1}) , y_{k_0-1}\rangle ++ \sum_{k=k_0}^{k_1} \sum_{i=k_0}^k \left\langle A(u_k) - A(u_{k+1}) , \frac{A(u_i)-b}{\sigma_i} \right\rangle + \nonumber\\ +& \qquad + \sum_{k=k_0}^{k_1} \frac{\|A(u_k)-b\|^2}{2\beta_k} - \sum_{k=k_0}^{k_1} \frac{\|A(u_{k+1})-b\|^2}{2\beta_k} +\qquad \text{(see \eqref{eq:key ineq})} +\nonumber\\ +& = h(u_{k_0}) - h(u_{{k_1}+1}) + \langle A(u_{k_0}) - A(u_{{k_1}+1}) , y_{k_0-1} \rangle ++ \sum_{k=k_0}^{k_1} \sum_{i=k_0}^k \left\langle A(u_k) - A(u_{k+1}) , \frac{A(u_i)-b}{\sigma_i} \right\rangle\nonumber\\ +& \qquad + \sum_{k=k_0}^{k_1} \frac{\|A(u_k)-b\|^2}{2\beta_k} - \sum_{k=k_0+1}^{{k_1}+1} \frac{\|A(u_{k})-b\|^2}{2\beta_{k-1}} \nonumber\\ +& = h(u_{k_0}) - h(u_{{k_1}+1}) + \langle A(u_{k_0}) - A(u_{{k_1}+1}) , y_{k_0-1} \rangle + \frac{\|A(u_{k_0})-b\|^2}{2\beta_{k_0}} + \sum_{i=k_0}^{k_1} \sum_{k=i}^{k_1} \left\langle A(u_k) - A(u_{k+1}) , \frac{A(u_i)-b}{\sigma_i} \right\rangle +\nonumber\\ +& \qquad ++ \sum_{k=k_0+1}^{k_1} \left( \frac{1}{2\beta_k} - \frac{1}{2\beta_{k-1}} \right) \|A(u_k)-b\|^2 - \frac{1}{2\beta_{k_1}} \| A(u_{k_1+1})-b\|^2 + \nonumber\\ +& \le \mu + \sum_{i=k_0}^{k_1} \left\langle A(u_i) - A(u_{{k_1}+1}), \frac{A(u_i)-b}{\sigma_i} \right\rangle + \sum_{k=k_0}^{k_1} \left( \frac{1}{2\beta_k} - \frac{1}{2\beta_{k-1}} \right) \|A(u_k)-b\|^2 - \frac{1}{2\beta_{k_1}} \| A(u_{k_1+1})-b\|^2 +\qquad \text{(see \eqref{eq:defn mu})} + \nonumber\\ +& = \mu + \sum_{k=k_0}^{k_1} \left( \frac{1}{\sigma_k} +\frac{1}{2\beta_k} - \frac{1}{2\beta_{k-1}} \right) \|A(u_k)-b\|^2 +- \sum_{k=k_0}^{k_1} \left \langle A(u_{{k_1}+1})-b, \frac{A(u_k)-b}{\sigma_k} \right\rangle - \frac{1}{2\beta_{k_1}} \| A(u_{k_1+1})-b\|^2 + \nonumber\\ +& \le\mu + \sum_{k=k_0}^{k_1} \left( \frac{1}{\sigma_k}+ \frac{1}{2\beta_k}- \frac{1}{2\beta_{k-1}} \right) \|A(u_k)-b\|^2 ++ + \sum_{k=k_0}^{k_1} \frac{1}{\sigma_k} \|A(u_{{k_1}+1})-b\| \|A(u_k)-b\| - \frac{1}{2\beta_{k_1}} \| A(u_{k_1+1})-b\|^2 \nonumber\\ +& \le \mu + \sum_{k=k_0}^{k_1} \left( \frac{1 }{\beta \sqrt{k}}+ \frac{1}{2\beta_k}-\frac{1}{2\beta_{k-1}} \right) \|A(u_k)-b\|^2 ++ + \sum_{k=k_0}^{k_1} \frac{ 1}{\beta \sqrt{k}} \|A(u_{{k_1}+1})-b\| \|A(u_k)-b\| - \frac{1}{2\beta_{k_1}} \| A(u_{k_1+1})-b\|^2 + \qquad \text{(see (\ref{eq:consequences}))} + \nonumber\\ + & \le + \mu + \sum_{k=k_0}^{k_1} \frac{3 }{2\beta \sqrt{k}} \|A(u_k)-b\|^2 + \sum_{k=k_0}^{k_1} \frac{1}{\beta \sqrt{k}} \|A(u_{{k_1}+1})-b\| \|A(u_k)-b\| - \frac{\sqrt{k}}{2\beta} \| A(u_{k_1+1})-b\|^2 + \qquad \text{(see (\ref{eq:consequences}))} + \nonumber\\ + & \le \mu + \sum_{k=k_0}^{k_1} \left( \frac{3}{2\beta \sqrt{k}} + \frac{1}{\beta \sqrt{k} } \right) \|A(u_k)-b\|^2 + 
\sum_{k=k_0}^{k_1} \frac{1}{4\beta\sqrt{k}} \|A(u_{{k_1}+1})-b\|^2 - \frac{\sqrt{k_1}}{2\beta} \| A(u_{k_1+1})-b\|^2 + \qquad \left( 2ab \le ca^2 + c^{-1}b^2 \right) + \nonumber\\ + & \le \mu + \sum_{k=k_0}^{k_1} \frac{5 }{2\beta \sqrt{k} } \|A(u_k)-b\|^2 + \frac{\sqrt{k_1}}{2\beta} \| A(u_{k_1+1})-b\|^2 - \frac{\sqrt{k_1}}{2\beta} \| A(u_{k_1+1})-b\|^2 \qquad \left( \sum_{k=1}^{k_1} \frac{1}{\sqrt{k}} \le \int_{1}^{k_1} \frac{d\kappa }{\sqrt{\kappa}} =2 \sqrt{k_1} \right) \nonumber\\ +& +\le \mu + \sum_{k=k_0}^{k_1} \frac{5 }{2\beta\sqrt{k}} \|A(u_k)-b\|^2 + \label{eq:long chain} +\end{align} +where we assumed that +\begin{equation} +\mu := \sup_k h(u_{k_0}) - h(u_k) + \langle A(u_{k_0})-A(u_k) ,y_{k_0-1}\rangle + \frac{\|A(u_{k_0})-b\|^2}{2\beta_{k_0}}< \infty. +\label{eq:defn mu} +\end{equation} +% +%Let us also assume that $\{\beta_k\}_k$ is non-increasing, namely, +%\begin{equation} +%\beta_{k+1} \le \beta_k , +%\qquad +%\forall k \ge k_0, +%\label{eq:sigma dec} +%\end{equation} +%which allows us to further simplify the last line in \eqref{eq:long chain} as +%\begin{align} +%\sum_{k=k_0}^K \frac{\gamma_k\|G_k\|^2}{2} +%& \le +%\mu + \sum_{k=k_0+1}^K \frac{\sqrt{k}+3}{2\beta_k} +% \|A(u_k)-b\|^2 + +% \sum_{k=k_0+1}^K \frac{\|A(u_{K+1})-b\|^2}{2\sqrt{k}\beta_k} +%\nonumber\\ +%& \le \mu + \sum_{k=k_0+1}^K \frac{\sqrt{k}+3}{2\beta_k} \|A(u_k)-b\|^2 + +% \sum_{k=k_0+1}^K \frac{\|A(u_{K+1})-b\|^2}{2\sqrt{k}\beta_{K+1}} +%\qquad \text{(see \eqref{eq:sigma dec})} +%\nonumber\\ +%& \le \mu + \sum_{k=k_0+1}^K \frac{\sqrt{k}+3}{2\beta_k} \|A(u_k)-b\|^2 + +% \frac{\sqrt{K+1}}{\beta_{K+1}} \|A(u_{K+1})-b\|^2 +% \qquad \left( \sum_{k=1}^K \frac{1}{\sqrt{k}} \le \int_{0}^K \frac{d\kappa}{\sqrt{\kappa}} = 2\sqrt{K} \right) \nonumber\\ +%& \le \mu + \sum_{k=k_0+1}^{K} \frac{2\sqrt{k}}{\beta_k} \|A(u_k)\|^2 + \frac{\sqrt{K+1}}{\beta_{K+1}} \|A(u_{K+1})-b\|^2 \nonumber\\ +%& \le \mu + \sum_{k=k_0+1}^{K+1} \frac{2\sqrt{k}}{\beta_k} \|A(u_k)-b\|^2. +%\label{eq:raw} +%\end{align} +%We now try to formalize Volkan's intuition that gradient mapping should derive down the feasibility gap. Let us assume that +%\begin{align} +%\sum_{k=1}^{K+1} \frac{\|A_k\|^2}{\beta_k} \le \sum_{k=0}^K \gamma_k G_k^2. +%\label{eq:assumption} +%\end{align} +%And we take +%\begin{equation} +%\sigma_k = 6 \beta_k, +%\qquad \forall k. +%\end{equation} +%Then, by combining the two inequalities above, we reach +%\begin{align} +% \sum_{k=1}^{K+1} \frac{\|A_k\|^2}{\beta_k} \le 4\mu_0, +%\end{align} +%\begin{align} +%\sum_{k=0}^K \gamma_k G_k^2 \le 4\mu_0. +%\end{align} +%That is, Volkan's assumption \eqref{eq:assumption} successfully bounds both the gradient mapping and the feasibility gap. One question is the interplay between $\{\beta_k,\gamma_k\}_k$ to ensure the validity of Volkan's assumption, which feels like some sort of \emph{uncertainty principle}. +Note that \eqref{eq:long chain} bounds the gradient mapping with the feasibility gap. We next find a converse, thus bounding the feasibility gap with the gradient mapping. To that end, let $T_C(u)$ and $P_{T_C(u)}$ be the tangent cone of $C$ at $u\in C$ and orthogonal projection onto this subspace, respectively. Likewise, let $N_C(u)$ and $P_{N_{C}(u)}$ be the normal cone of $C$ at $u$ and the corresponding orthogonal projection. + +Roughly speaking, \eqref{eq:bnd on Ak final} states that the feasibility gap is itself bounded by the gradient map. In order to apply Lemma \ref{lem:bnd bnd Ak}, let us assume that (\ref{eq:good neighb}) holds. 
Lemma \ref{lem:bnd bnd Ak} is then in force and we may now substitute \eqref{eq:bnd on Ak final} back into \eqref{eq:long chain} to find that +\begin{align} +\sum_{k=k_0}^{k_1} \gamma_k \|G_k\|^2 +& \le \frac{5 }{2\beta} \sum_{k=k_0}^{{k_1}} {\frac{1}{\sqrt{k}}} \|A(u_k)-b\|^2 +2 \mu +\qquad \text{(see \eqref{eq:long chain})} +\nonumber\\ +& \le \frac{20 }{\beta \eta_{\min}^2} \sum_{k=k_0}^{{k_1}} + \beta_k^2 \frac{1}{\sqrt{k}} \left( \lambda_h' + \eta_{\max} \|y_{k-1}\| + \|G_k\| \right)^2+ 2\mu +\qquad \text{(see \eqref{eq:bnd on Ak final})} + \nonumber\\ + & \le + \frac{20\beta}{ \eta_{\min}^2} \sum_{k=k_0}^{{k_1}} + \frac{1}{k^{3/2}} \left( \lambda_h' + \eta_{\max} \|y_{k-1}\| + \|G_k\| \right)^2+ 2\mu +\qquad \text{(see (\ref{eq:beta n sigma}))} \nonumber\\ +& \le +\frac{60\beta}{ \eta_{\min}^2} \sum_{k=k_0}^{{k_1}} + \frac{1}{k^{3/2}} \left( \lambda_h'^2 + \eta_{\max}^2 \|y_{k-1}\|^2 + \|G_k\|^2 \right)+ 2\mu, +\qquad \left( (a+b+c)^2 \le 3(a^2+b^2+c^2) \right) +\label{eq:to be used for feas} +\end{align} +where in the third inequality, we used the particular choice of $\beta_k = \frac{\beta}{k^{1/2}}$ in accordance with~\eqref{eq:beta n sigma}. + +%To move forward, let us assume that +%\begin{align} +%\sigma_k = o(\sqrt{k}), +%\qquad +%\frac{\sqrt{k}\sigma_k}{\gamma_k} = o(1), +%\qquad +%k^{\frac{3}{2}} \sigma_k \|y_{k-1}\|^2 =o(1). +%\label{eq:cnd on sigma gamma} +%\end{align} +To simplify the above expression, let us assume that +\begin{equation} +\frac{60\beta}{ \eta_{\min}^2 k} \le \frac{\gamma_k}{2}, +\qquad \forall k\in K. +\label{eq:to be used later on} +\end{equation} +Let $|K|=k_1-k_0+1$ be the size of the interval $K$. + +After rearranging \eqref{eq:to be used for feas} and applying \eqref{eq:to be used later on}, we arrive at +\begin{align} +& { \frac{|K|}{2} \cdot \min_{k \in K} \gamma_k \|{G}_{k}\|^2} \nonumber\\ +& \le \sum_{k=k_0}^{k_1} \frac{\gamma_k}{2} \|G_k\|^2 \nonumber\\ +& \le \sum_{k=k_0}^{k_1} \left(\gamma_k - \frac{60\beta}{\eta_{\min}^2 k^{3/2}} \right)\|G_k\|^2 +\qquad \text{(see \eqref{eq:to be used later on})} + \nonumber\\ + & \le \frac{60 \beta \lambda'^2_h }{\eta_{\min}^2 } \sum_{k=k_0}^{{k_1}} \frac{1}{k^{3/2}} ++ \frac{60\beta \eta_{\max}^2}{\eta_{\min}^2} \sum_{k=k_0}^{{k_1}} \frac{\|y_{k-1}\|^2 }{k^{3/2}} +%+\frac{48\beta \log(k_1+1) \|G_{k_1+1}\|^2}{\eta_{\min}^2 (k_1+1)} ++2\mu +\qquad \text{(see \eqref{eq:to be used for feas})} + \nonumber\\ +% & =: \frac{48 \beta \lambda'^2_h }{\eta_{\min}^2 } \sum_{k=k_0+1}^{{k_1}+1} \frac{\log k}{k} +%+ \frac{48\beta \eta_{\max}^2}{\eta_{\min}^2} \sum_{k=k_0+1}^{{k_1}+1} \frac{\|y_{k-1}\|^2\log k}{k} +%+2\mu' \nonumber\\ +% & \le \frac{60\beta \lambda'^2_h }{\eta_{\min}^2} \sum_{k=k_0}^{{k_1}} \frac{1}{k} +% + +% \frac{60\beta\eta_{\max}^2 }{\eta_{\min}^2} \sum_{k=k_0}^{{k_1}} \frac{\|y_{k-1}\|^2}{k} +2 \mu \nonumber\\ + & \le \frac{120\beta \lambda'^2_h C_1}{\eta_{\min}^2} + + + \frac{60\beta\eta_{\max}^2 }{\eta_{\min}^2} \sum_{k=k_0}^{{k_1}} \frac{\|y_{k-1}\|^2}{k^{3/2}} +2 \mu \nonumber\\ + & \le \frac{120\beta C_1}{\eta_{\min}^2} \left( \lambda'^2_h + \frac{\eta_{\max}^2}{2C_1}\sum_{k=k_0}^{{k_1}} \frac{\|y_{k-1}\|^2}{k^{3/2}} \right) + + +2 \mu, +\label{eq:final bnd on G} +\end{align} +or, equivalently, +\begin{align} +\min_{k \in K} \gamma_k \|{G}_{k}\|^2 & \le + \frac{240\beta C_1}{\eta_{\min}^2|K|} \left( \lambda'^2_h + \frac{\eta_{\max}^2}{2C_1} |K|B_K \right) ++ \frac{4\mu}{|K|}, +\label{eq:bnd on gamma G} +\end{align} +where +%\begin{align} +%\mu' := \frac{24\beta \log(k_1+1) \|G_{k_1+1}\|^2 
}{\eta_{\min}^2 (k_1+1)} + \mu, +%\label{eq:defn of muprime} +%\end{align} +\begin{align} +B_{K} := \frac{1 }{|K|} \sum_{k=k_0}^{{k_1}} \frac{\|y_{k-1}\|^2}{k}, \text{ and } C_1 = \sum_{k=k_0}^{k_1} \frac{1}{k^{3/2}}, +\label{eq:defn of B} +\end{align} +and we will later estimate $B_K$. +In turn, the bound above on the gradient mapping controls the feasibility gap, namely, +\begin{align} +& |K| \min_{k\in K} \|A({u}_k)-b\|^2 \nonumber\\ +& \le \sum_{k=k_0}^{{k_1}} \|A(u_k)-b\|^2 \nonumber\\ +& \le \frac{12\beta^2 \lambda'^2_h}{\eta_{\min}^2} \sum_{k=k_0}^{{k_1}} \frac{1}{k} ++ \frac{12\beta^2 \eta_{\max}^2}{\eta_{\min}^2} \sum_{k=k_0}^{{k_1}} \frac{\|y_{k-1}\|^2 }{k} ++ \sum_{k=k_0+1}^{{k_1}+1} \frac{12 \beta^2 }{\eta_{\min}^2 k} \|G_k\|^2 +\qquad +\text{(see \eqref{eq:to be used for feas})} +\nonumber\\ +& \le \frac{24\beta^2 \lambda'^2_h \log ({k_1}+1) }{\eta_{\min}^2 } ++ \frac{12\beta^2 \eta_{\max}^2}{\eta_{\min}^2} \cdot |K| B_{K} ++ \frac{\beta}{10}\sum_{k=k_0}^{{k_1}} \gamma_k \|G_k\|^2 +%+ \frac{12\beta^2 \log (k_1+1) \|G_{k_1+1}\|^2 }{\eta_{\min}^2 (k_1+1)} +\qquad \text{(see (\ref{eq:to be used later on},\ref{eq:defn of B}))} +\nonumber\\ +& \le + \frac{24\beta^2 \log(k_1+1)}{\eta_{\min}^2} \left( \lambda'^2_h+ \frac{\eta_{\max}^2}{2} |K|B_K \right) ++ +%\frac{24\beta^2 \log^2(k_1+1) }{\eta_{\min}^2} \left( \lambda'^2_h+ \eta_{\max}^2 |K|B_K \right) + \frac{\beta \mu'}{2} + \frac{24\beta^2 C_1}{\eta_{\min}^2} \left( \lambda'^2_h + \frac{\eta_{\max}^2}{2C_1}|K|B_K \right) + \frac{2\beta}{5} \mu, +%+ \frac{12\beta^2 \log (k_1+1) \|G_{k_1+1}\|^2 }{\eta_{\min}^2 (k_1+1)} +\qquad \text{(see \eqref{eq:final bnd on G})} \nonumber\\ +& \le + \frac{24\beta^2 \log(k_1+1)}{\eta_{\min}^2} \left( \lambda'^2_h+ \frac{\eta_{\max}^2}{2} |K|B_K \right) ++ +\frac{24\beta^2 \log (k_1+1) }{\eta_{\min}^2} \left( \lambda'^2_h+ \frac{\eta_{\max}^2}{2} |K|B_K \right) + \frac{2\beta}{5} \mu +\qquad \text{(see \eqref{eq:defn of muprime})} + \nonumber\\ +& = + \frac{48\beta^2 \log(k_1+1)}{\eta_{\min}^2} \left( \lambda'^2_h+ \frac{\eta_{\max}^2}{2} |K|B_K \right) + \frac{2\beta}{5} \mu, +\label{eq:final bnd on feas gap} +\end{align} +which in turn implies that +\begin{align} +\min_{k\in K}{\|A({u}_k)-b\|^2 +\le \frac{48\beta^2 \log(k_1+1)}{\eta_{\min}^2|K|} \left( \lambda'^2_h+ \frac{\eta_{\max}^2}{2} |K|B_K \right) + \frac{2\beta\mu}{5|K|}}, +\label{eq:feas bnd semi final} +\end{align} +if $k_0\ge 2$. +Let us now revisit and simplify the condition imposed in (\ref{eq:good neighb}). +To that end, we first derive a weaker but uniform bound on the feasibility gap. For every $k-1\in K$, it holds that +\begin{align} +\|A(u_k)-b\|^2 & \le \sum_{i=k_0}^{k_1} \|A(u_i)-b\|^2 +\qquad \left( k_0\ge 2 \right) +\nonumber\\ +& \le \frac{48\beta^2 \log(k_1+1)}{\eta_{\min}^2} \left( \lambda'^2_h+ \frac{\eta_{\max}^2}{2} |K|B_K \right) + \frac{2\beta}{5} \mu. +\qquad \text{(see \eqref{eq:feas bnd semi final})} +\label{eq:rate of feas gap} +\end{align} +Therefore, we may replace \eqref{eq:good neighb} with the assumption that +\begin{equation} +\|A(u_{k_0})-b\| \le \rho, +\qquad +\frac{48\beta^2 \log(k_1+1)}{\eta_{\min}^2} \left( \lambda'^2_h+ \frac{\eta_{\max}^2}{2} |K|B_K \right) + \frac{2\beta}{5} \mu \le \rho^2, +\label{eq:suff cnd on rho} +\end{equation} +which ensures that +\begin{align} +\|A(u_k)-b\| \le \rho, +\qquad \forall k\in [k_0:k_1+1]. 
+\label{eq:uniform feas larger interval} +\end{align} +In order to interpret (\ref{eq:final bnd on G},\ref{eq:feas bnd semi final},\ref{eq:suff cnd on rho}), we next estimate $B_{K}$ in \eqref{eq:defn of B}. To that end, let us first control the growth of the dual sequence $\{y_k\}_k$. Recalling \eqref{eq:y update recall} and for every $k\in [k_0:k_1+1]$, we write that +\begin{align} +\|y_k\| & \le \|y_{k_0}\|+ \sum_{i=k_0+1}^k \frac{\|A(u_i)-b\|}{\sigma_k} +\qquad \text{(see \eqref{eq:y update recall})} +\nonumber\\ +& \le \|y_{k_0}\|+ \sum_{i=k_0+1}^k \frac{\rho}{\beta k} +\qquad \text{(see (\ref{eq:beta n sigma},\ref{eq:uniform feas larger interval}))} \nonumber\\ +& \le \|y_{k_0}\|+ \frac{2\rho \log k}{\beta}. +\label{eq:dual growth} +\end{align} +With the growth of the dual sequence uncovered, we evaluate $B_{K}$ as +\begin{align} +B_{K} & = \frac{1}{|K|}\sum_{k=k_0}^{{k_1}} \frac{ \|y_{k}\|^2 }{k+1} +\qquad \text{(see \eqref{eq:defn of B})} +\nonumber\\ +& \le \frac{1}{|K|} \sum_{k=k_0}^{{k_1}} \frac{1}{k+1}\left( \|y_{k_0}\|+ \frac{2\rho \log k}{\beta} \right)^2 +\qquad \text{(see (\ref{eq:dual growth}))} +\nonumber\\ +& \le \frac{1}{|K|} \sum_{k=k_0}^{{k_1}} \frac{2\|y_{k_0}\|^2}{k+1}+ \frac{8\rho^2 \log^2 k}{\beta^2 (k+1)} +\qquad ((a+b)^2 \le 2a^2+2b^2) +\nonumber\\ +& \le \frac{4\|y_{k_0}\|^2 \log ({k_1}+1)}{|K|} + \frac{16 \rho^2 \log^3 ({k_1}+1)}{|K|} \nonumber\\ +& \le \frac{16 \log^3(k_1+1)}{ |K|} \left( \|y_{k_0}\|^2+\rho^2 \right). +\label{eq:Bk evaluated} +\end{align} +In order to interpret (\ref{eq:final bnd on G},\ref{eq:feas bnd semi final},\ref{eq:suff cnd on rho}), it still remains to estimate $\mu'$ in \eqref{eq:defn of muprime}. To that end, we first derive a lower bound on the step sizes $\{\gamma_k\}_k$. To invoke \eqref{eq:moreover}, we in turn need to gauge how smooth the augmented Lagrangian $\mathcal{L}_{\beta_k}(\cdot,y_k)$ is. For every $k\in [k_0:k_1+1]$, note that +\begin{align} +\lambda_{\beta_k} & \le \lambda_h+ \sqrt{m}\lambda_A \left( \|y_k\| + \frac{\|A(u_k)-b\|}{\beta_k} \right)+ \frac{\|DA(u_k)\|_F^2}{\beta_k} +\qquad \text{(see \eqref{eq:smoothness of Lagrangian})} \nonumber\\ +& \le \lambda_h+ \sqrt{m}\lambda_A \left( \|y_{k_0}\|+ \frac{2\rho \log (k_1+1)}{\beta_k} + \frac{\rho}{\beta_k} \right) + \frac{ m \eta_{\max}^2}{\beta_k} +\qquad \text{(see (\ref{eq:defn lambdap n etaMax},\ref{eq:uniform feas larger interval},\ref{eq:dual growth}))} \nonumber\\ +& = \lambda_h+\sqrt{m}\lambda_A \|y_{k_0}\| + \frac{ 1}{\beta_k}(2\sqrt{m}\lambda_A\rho \log(k_1+1) + \sqrt{m}\lambda_A \rho + m\eta_{\max}^2 ) \nonumber\\ +& \le \frac{ 2}{\beta_k}(2\sqrt{m}\lambda_A\rho \log(k_1+1) + \sqrt{m}\lambda_A \rho + m\eta_{\max}^2 ), +\label{eq:smoothness at k} +\end{align} +where the last line holds if $k_0$ is sufficiently large. We are now in position to invoke \eqref{eq:moreover} by writing that +\begin{align} +\gamma_k & \ge \frac{\theta}{\lambda_{\beta_k}} \qquad \text{(see \eqref{eq:moreover})}\nonumber\\ +& \ge \frac{\theta \beta_k}{ 4\sqrt{m}\lambda_A\rho \log(k_1+1) + 2\sqrt{m}\lambda_A \rho + 2m\eta_{\max}^2 } +\qquad \text{(see \eqref{eq:smoothness at k})} \nonumber\\ +& = \frac{\theta \beta}{ (4\sqrt{m}\lambda_A\rho \log(k_1+1) + 2\sqrt{m}\lambda_A \rho + 2m\eta_{\max}^2 ) \sqrt{k} }, +\qquad \text{(see \eqref{eq:beta n sigma})} +\label{eq:low bnd on gammas} +\end{align} +for every $k\in [k_0:k_1+1]$. 
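To get a rough feel for the scaling in \eqref{eq:low bnd on gammas}, consider the purely illustrative normalization $\theta=1/2$ and $\sqrt{m}\lambda_A\rho = m\eta_{\max}^2 = 1$; these particular values are not assumed anywhere in the analysis and only serve to make the constants concrete. With them, \eqref{eq:low bnd on gammas} reads
\begin{align}
\gamma_k \ge \frac{\beta}{\left( 8\log(k_1+1)+8 \right)\sqrt{k}},
\qquad \forall k\in[k_0:k_1+1].
\end{align}
In words, the line search cannot drive the primal step size below the order of $\beta_k = \beta/\sqrt{k}$, except for a $\log(k_1+1)$ factor; consequently, treating $m$, $\lambda_A$, $\rho$, $\eta_{\max}$ as constants, dividing by $\min_{k\in K}\gamma_k$ later on costs at most a factor of order $\sqrt{k_1}\log(k_1+1)$.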
+In particular, the lower bound on $\gamma_{k_1+1}$ above allows us to estimate $\mu'$ by writing that +\begin{align} +\mu' & = \frac{24\beta \log(k_1+1) \|G_{k_1+1}\|^2 }{\eta_{\min}^2 (k_1+1)} + \mu +\qquad \text{(see \eqref{eq:defn of muprime})} \nonumber\\ +& = \frac{24\beta \log(k_1+1) \|u_{k_1+2}-u_{k_1+1}\|^2 }{\eta_{\min}^2 (k_1+1) \gamma_{k_1+1}^2} + \mu +\qquad \text{(see \eqref{eq:grad map recall})} \nonumber\\ +& \le + \frac{24\beta \log(k_1+1) \text{diam}(C)^2 }{\eta_{\min}^2 (k_1+1) \gamma_{k_1+1}^2} + \mu + \qquad \left( \{u_k\} \subset C \right) + \nonumber\\ + & \le + \frac{24\beta \log(k_1+1) \text{diam}(C)^2 (4\sqrt{m}\lambda_A\rho \log(k_1+1) + 2\sqrt{m}\lambda_A \rho + 2m\eta_{\max}^2 )^2 }{\eta_{\min}^2 \theta^2 \beta^2} ++2\mu +\qquad \text{(see \eqref{eq:low bnd on gammas})} +\nonumber\\ +& =: \mu''. +\label{eq:mup to mupp} +\end{align} +Having estimated $B_K$ and $\mu'$, we can rewrite the bound on the feasibility gap as +\begin{align} +\min_{k-1\in[K]}\|A(u_k)-b\|^2 +& \le \frac{48\beta^2 \log^2(k_1+1)}{\eta_{\min}^2|K|} \left( \lambda'^2_h+ \eta_{\max}^2 |K|B_K \right) + \frac{\beta\mu'}{|K|} +\qquad \text{(see \eqref{eq:feas bnd semi final})} \nonumber\\ +& \le \frac{48\beta^2 \log^2(k_1+1)}{\eta_{\min}^2|K|} \left( \lambda'^2_h+ 16 \eta_{\max}^2 (\|y_{k_0}\|^2 +\rho^2) \log^3(k_1+1) \right) + \frac{\beta\mu''}{|K|} +\qquad \text{(see (\ref{eq:Bk evaluated},\ref{eq:mup to mupp}))} +\end{align} +Moreover, we can simplify the assumption in \eqref{eq:suff cnd on rho}. To be specific, thanks to (\ref{eq:Bk evaluated},\ref{eq:mup to mupp}), we can replace \eqref{eq:suff cnd on rho} with the assumption +\begin{align} +\|A(u_{k_0})-b\|\le \rho, +\qquad +\frac{48\beta^2 \log^2(k_1+1)}{\eta_{\min}^2} \left( \lambda'^2_h+ 16 \eta_{\max}^2 (\|y_{k_0}\|^2+\rho^2) \log^3(k_1+1) \right) + \beta\mu'' \le \rho^2. +\label{eq:suff cnd 2nd time} +\end{align} +The lower bound on the step sizes in \eqref{eq:low bnd on gammas} has two other consequences. First, we find that \eqref{eq:to be used later on} automatically holds if $k_0$ is sufficiently large. 
Second, it allows us to improve \eqref{eq:bnd on gamma G} by writing that
\begin{align}
& \min_{ k\in K } \| G_k\|^2 \nonumber\\
& \le \frac{\min_{k \in K} \gamma_k \|G_k\|^2}{\min_{k\in K} \gamma_k } \nonumber\\
& \le \frac{1}{\min_{k\in K} \gamma_k}
\left( \frac{192\beta \log^2({k_1}+1)}{\eta_{\min}^2|K|} \left( \lambda'^2_h + \eta_{\max}^2 |K|B_K \right)
+ \frac{4\mu'}{|K|} \right)
\qquad \text{(see \eqref{eq:bnd on gamma G})}
 \nonumber\\
& \le \frac{1}{\min_{k\in K} \gamma_k}
\left( \frac{192\beta \log^2({k_1}+1)}{\eta_{\min}^2|K|} \left( \lambda'^2_h + 16 \eta_{\max}^2 (\|y_{k_0}\|^2+\rho^2) \log^3 (k_1+1) \right)
+ \frac{4\mu''}{|K|} \right)
\qquad \text{(see (\ref{eq:Bk evaluated},\ref{eq:mup to mupp}))}
 \nonumber\\
 & \le \frac{ (4\sqrt{m}\lambda_A\rho \log(k_1+1) + 2\sqrt{m}\lambda_A \rho + 2m\eta_{\max}^2 ) \sqrt{k_1}}{\theta \beta}
\left( \frac{192\beta \log^2({k_1}+1)}{\eta_{\min}^2|K|} \left( \lambda'^2_h + 16 \eta_{\max}^2 (\|y_{k_0}\|^2+\rho^2) \log^3 (k_1+1) \right)
+ \frac{4\mu''}{|K|} \right)
\quad \text{(see \eqref{eq:low bnd on gammas})} \nonumber\\
& \le \frac{ 4\sqrt{m}\lambda_A\rho \log(k_1+1) + 2\sqrt{m}\lambda_A \rho + 2m\eta_{\max}^2 }{c\theta \beta \sqrt{k_1}}
\left( \frac{192\beta \log^2({k_1}+1)}{\eta_{\min}^2 } \left( \lambda'^2_h + 16 \eta_{\max}^2 (\|y_{k_0}\|^2+\rho^2) \log^3 (k_1+1) \right)
+ 4\mu'' \right),
\end{align}
where the last line holds if there exists $c>0$ for which $k_1-k_0+1 \ge ck_1$.

Note that some of the $\log$ factors in the convergence proof might be shaved off by choosing a more conservative step size for the dual, which automatically bounds the norm of the dual variable. A possible choice is
\begin{align}\label{eq:yk bounded}
\sigma_k \geq \max\left(\beta \sqrt{k},\, \beta k \log^2(k+1) \| A(u_k) - b \|\right),
\end{align}
which immediately yields
\begin{align}
\|y_k\| & \le \|y_{k_0}\|+ \sum_{i=k_0+1}^k \frac{\|A(u_i)-b\|}{\sigma_i}
\qquad \text{(see \eqref{eq:y update recall})}
\nonumber\\
& \le \|y_{k_0}\|+ \sum_{i=k_0+1}^k \frac{\rho}{\beta i\log^2(i+1)}
\qquad \text{(see (\ref{eq:yk bounded}))} \nonumber\\
& \le \|y_{k_0}\|+ \frac{\rho c_0}{\beta} \nonumber\\
& =: y_{\max},
\label{eq:dual bounded}
\end{align}
where $c_0 := \sum_{i\ge 2} \frac{1}{i\log^2(i+1)} < \infty$.
diff --git a/tex/nonlinear.tex b/tex/nonlinear.tex
index acb8250..48e94e5 100644
--- a/tex/nonlinear.tex
+++ b/tex/nonlinear.tex
@@ -1,2256 +1,2256 @@
%%%%%%%%%%%%%%%%%%%%%%% file template.tex %%%%%%%%%%%%%%%%%%%%%%%%%
%
% This is a general template file for the LaTeX package SVJour3
% for Springer journals. Springer Heidelberg 2010/09/16
%
% Copy it to a new file with a new name and use it as the basis
% for your article. Delete % signs as needed.
%
% This template includes a few options for different layouts and
% content for various journals. Please consult a previous issue of
% your journal as needed.
% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % % First comes an example EPS file -- just ignore it and % proceed on the \documentclass line % your LaTeX will extract the file if required \begin{filecontents*}{example.eps} %!PS-Adobe-3.0 EPSF-3.0 %%BoundingBox: 19 19 221 221 %%CreationDate: Mon Sep 29 1997 %%Creator: programmed by hand (JK) %%EndComments gsave newpath 20 20 moveto 20 220 lineto 220 220 lineto 220 20 lineto closepath 2 setlinewidth gsave .4 setgray fill grestore stroke grestore \end{filecontents*} % \RequirePackage{fix-cm} % %\documentclass{svjour3} % onecolumn (standard format) %\documentclass[smallcondensed]{svjour3} % onecolumn (ditto) \documentclass[smallextended]{svjour3} % onecolumn (second format) %\documentclass[twocolumn]{svjour3} % twocolumn % \smartqed % flush right qed marks, e.g. at end of proof % \usepackage{amsmath} \usepackage{amssymb} \usepackage{graphicx} \usepackage{algorithm} \usepackage{color} \usepackage{hyperref} \hypersetup{ colorlinks=true,% citecolor=black,% filecolor=black,% linkcolor=black,% urlcolor=blue } % 1 inch margins when printed from ps \hoffset=0in \voffset=0in \evensidemargin=0in \oddsidemargin=0in \textwidth=6.5in \topmargin=0in \headheight=0.0in \headsep=.5in \textheight=8in % % \usepackage{mathptmx} % use Times fonts if available on your TeX system % insert here the call for the packages your document requires %\usepackage{latexsym} % etc. \PassOptionsToPackage{normalem}{ulem} \newcommand{\minimize}[2]{\ensuremath{\underset{\substack{{#1}}}% {\mathrm{minimize}}\;\;#2 }} \newcommand{\Argmind}[2]{\ensuremath{\underset{\substack{{#1}}}% {\mathrm{Argmin}}\;\;#2 }} \newcommand{\Frac}[2]{\displaystyle{\frac{#1}{#2}}} \newcommand{\menge}[2]{\big\{{#1} \;|\; {#2}\big\}} \newcommand{\Pair}[2]{{\big\langle{{#1},{#2}}\big\rangle}} \newcommand{\pair}[2]{{\langle{{#1},{#2}}\rangle}} \newcommand{\Scal}[2]{{\bigg\langle{{#1}\:\bigg |~{#2}}\bigg\rangle}} \newcommand{\Menge}[2]{\bigg\{{#1}~\bigg|~{#2}\bigg\}} \newcommand{\lev}[1]{\ensuremath{\mathrm{lev}_{\leq #1}\:}} \newcommand{\enve}[1]{\ensuremath{\operatorname{env}#1}} \newcommand{\emp}{\ensuremath{{\varnothing}}} \newcommand{\bdry}{\ensuremath{\operatorname{bdry}}} \newcommand{\yosi}[2]{\ensuremath{\sideset{^{#2}}{}{\operatorname{}}\!\!#1}} \newcommand{\infconv}{\ensuremath{\mbox{\small$\,\square\,$}}} \newcommand{\scal}[2]{\left\langle{#1}\mid {#2} \right\rangle} \newcommand{\pscal}[2]{\langle\langle{#1}\mid{#2}\rangle\rangle} \newcommand{\norm}[1]{\|#1\|} \newcommand{\vuo}{\ensuremath{\mbox{\footnotesize$\square$}}} \newcommand{\exi}{\ensuremath{\exists\,}} \newcommand{\zeroun}{\ensuremath{\left]0,1\right[}} \newcommand{\HH}{\ensuremath{\mathcal H}} \newcommand{\GG}{\ensuremath{\mathcal G}} \newcommand{\YY}{\ensuremath{\mathcal Y}} \newcommand{\XX}{\ensuremath{\mathcal X}} \newcommand{\bary}{\ensuremath{\widetilde{\boldsymbol{y}}}} \newcommand{\barp}{\ensuremath{\widetilde{\boldsymbol{p}}}} \newcommand{\barq}{\ensuremath{\widetilde{\boldsymbol{q}}}} \newcommand{\barx}{\ensuremath{\widetilde{\boldsymbol{x}}}} \newcommand{\BL}{\ensuremath{\EuScript B}\,} \newcommand{\BP}{\ensuremath{\EuScript P}} \newcommand{\HHH}{\ensuremath{\boldsymbol{\mathcal H}}} \newcommand{\WW}{\ensuremath{\boldsymbol{\mathcal W}}} \newcommand{\KKK}{\ensuremath{\boldsymbol{\mathcal K}}} \newcommand{\GGG}{\ensuremath{\boldsymbol{\mathcal G}}} \newcommand{\VV}{\ensuremath{\boldsymbol{V}}} \newcommand{\CCC}{\ensuremath{\boldsymbol{C}}} \newcommand{\MM}{\ensuremath{\boldsymbol{M}}} 
\newcommand{\SG}{\ensuremath{\boldsymbol{S}}} \newcommand{\EE}{\ensuremath{\boldsymbol{E}}} \newcommand{\XP}{\ensuremath{\mathcal X}^*} \newcommand{\sri}{\ensuremath{\operatorname{sri}}} \newcommand{\ri}{\ensuremath{\operatorname{ri}\,}} \newcommand{\RR}{\ensuremath{\mathbb R}} \newcommand{\KK}{\ensuremath{\mathcal K}} \newcommand{\RP}{\ensuremath{\left[0,+\infty\right[}} \newcommand{\RPP}{\ensuremath{\,\left]0,+\infty\right[}} \newcommand{\RX}{\ensuremath{\,\left]-\infty,+\infty\right]}} \newcommand{\NN}{\ensuremath{\mathbb N}} \newcommand{\SL}{\ensuremath{\EuScript S}\,} \newcommand{\dom}{\ensuremath{\operatorname{dom}}} \newcommand{\cont}{\ensuremath{\operatorname{cont}}} %\newcommand{\gr}{\ensuremath{\operatorname{gra}}} \newcommand{\prox}{\ensuremath{\operatorname{prox}}} \newcommand{\Prox}{\ensuremath{\operatorname{Prox}}} \newcommand{\intdom}{\ensuremath{\operatorname{int}\operatorname{dom}}\,} \newcommand{\inte}{\ensuremath{\operatorname{int}}} \newcommand{\cart}{\ensuremath{\mbox{\huge{$\times$}}}} \newcommand{\scart}{\ensuremath{\mbox{\LARGE{$\times$}}}} \newcommand{\WC}{\ensuremath{{\mathfrak W}}} \newcommand{\TT}{\ensuremath{{\mathbf T}}} \newcommand{\SC}{\ensuremath{{\mathfrak S}}} \newcommand{\rh}{\ensuremath{{\mathrm a}}} \newcommand{\og}{\ensuremath{{\mathrm b}}} \newcommand{\rk}{\ensuremath{{\mathsf {\mathbf a}}}} \newcommand{\ck}{\ensuremath{{\mathsf {\mathbf u}}}} \newcommand{\xk}{\ensuremath{{\mathsf{\mathbf x}}}} \newcommand{\yk}{\ensuremath{{\mathsf{\mathbf y}}}} \newcommand{\RPX}{\ensuremath{{[0,\pinf]}}} \newcommand{\card}{\ensuremath{\operatorname{card}}} \newcommand{\bd}{\ensuremath{\operatorname{bdry}}} \newcommand{\Argmin}{\ensuremath{\operatorname{Argmin}}} \newcommand{\argmin}{\ensuremath{\operatorname{argmin}}} \newcommand{\ran}{\ensuremath{\operatorname{ran}}} \newcommand{\zer}{\ensuremath{\operatorname{zer}}} \newcommand{\gra}{\ensuremath{\operatorname{gra}}} \newcommand{\conv}{\ensuremath{\operatorname{conv}}} \newcommand{\vv}{\ensuremath{\boldsymbol{v}}} \newcommand{\sss}{\ensuremath{\boldsymbol{s}}} \newcommand{\xx}{\ensuremath{\boldsymbol{x}}} \newcommand{\xs}{\ensuremath{\textsf{x}}} \newcommand{\xo}{\ensuremath{\overline{\boldsymbol{x}}}} \newcommand{\pp}{\ensuremath{\boldsymbol{p}}} \newcommand{\qq}{\ensuremath{\boldsymbol{q}}} \newcommand{\yy}{\ensuremath{\boldsymbol{y}}} \newcommand{\ff}{\ensuremath{\boldsymbol{f}}} \newcommand{\hh}{\ensuremath{\boldsymbol{h}}} \newcommand{\ttt}{\ensuremath{\boldsymbol{t}}} \newcommand{\ee}{\ensuremath{\boldsymbol{e}}} \newcommand{\rr}{\ensuremath{\boldsymbol{r}}} \newcommand{\gggg}{\ensuremath{\boldsymbol{g}}} \newcommand{\zz}{\ensuremath{\boldsymbol{z}}} \newcommand{\bb}{\ensuremath{\boldsymbol{b}}} \newcommand{\uu}{\ensuremath{\boldsymbol{u}}} \newcommand{\cc}{\ensuremath{\boldsymbol{c}}} \newcommand{\dd}{\ensuremath{\boldsymbol{d}}} \newcommand{\aaa}{\ensuremath{\boldsymbol{a}}} \newcommand{\ww}{\ensuremath{\boldsymbol{w}}} \newcommand{\BB}{\ensuremath{\boldsymbol{B}}} \newcommand{\LL}{\ensuremath{\boldsymbol{L}}} \newcommand{\PPP}{\ensuremath{\boldsymbol{\mathsf{P}}}} \newcommand{\UU}{\ensuremath{\boldsymbol{U}}} \newcommand{\EEE}{\ensuremath{\boldsymbol{E}}} \newcommand{\E}{\ensuremath{\mathbf{E}}} \newcommand{\D}{\ensuremath{\mathbf{D}}} \newcommand{\ep}{\ensuremath{\boldsymbol{\varepsilon}}} \newcommand{\RRR}{\ensuremath{\boldsymbol{R}}} %\newcommand{\RRR}{\ensuremath{\boldsymbol{R}}} \newcommand{\AAA}{\ensuremath{\boldsymbol{A}}} \newcommand{\BBB}{\ensuremath{\boldsymbol{B}}} 
\newcommand{\QQ}{\ensuremath{\boldsymbol{Q}}} \newcommand{\SSS}{\ensuremath{\boldsymbol{S}}} \newcommand{\DD}{\ensuremath{\boldsymbol{D}}} \newcommand{\PP}{\ensuremath{\boldsymbol{P}}} \newcommand{\FF}{\ensuremath{\boldsymbol{\mathcal{F}}}} \newcommand{\cone}{\ensuremath{\operatorname{cone}}} \newcommand{\Fix}{\ensuremath{\operatorname{Fix}}} \newcommand{\Id}{\ensuremath{\operatorname{Id}}} \newcommand{\diam}{\ensuremath{\operatorname{diam}}} \newcommand{\IId}{\ensuremath{\boldsymbol{\operatorname{Id}}}} \newcommand{\weakly}{\ensuremath{\rightharpoonup}} \newcommand{\minf}{\ensuremath{-\infty}} \newcommand{\pinf}{\ensuremath{+\infty}} \newcommand{\LLambda}{\ensuremath{\boldsymbol{\Lambda}}} \newcommand{\vva}{\ensuremath{\boldsymbol{\epsilon}}} \newcommand{\trace}{\operatorname{tr}} % AE's commands \newcommand{\edita}[1]{{\color{blue} #1}} \newcommand{\notea}[1]{{\color{magenta} \textbf{Note:AE: #1}}} \newcommand{\ol}{\overline} \newcommand{\Null}{\operatorname{null}} \newcommand{\relint}{\operatorname{relint}} \newcommand{\row}{\operatorname{row}} \newcommand{\s}{\sigma} \newcommand{\editaa}[1]{{\color{red} #1}} \renewcommand{\b}{\beta} % MFS's commands \newcommand{\editf}[1]{{\color[rgb]{1,0.4,0.4} #1}} \DeclareMathOperator*{\argmax}{argmax} % thin space, limits underneath in displays % please place your own definitions here and don't use \def but % \newcommand{}{} % % Insert the name of "your journal" with % \journalname{myjournal} % \begin{document} \title{A relaxation of the augmented Lagrange method } %\subtitle{Do you have a subtitle?\\ If so, write it here} %\titlerunning{Short form of title} % if too long for running head \author{ %C. Vu\and Alcaoglu Ahmet\and Sahin M. Fatih\and Alp Yurtsever\and Volkan Cevher \\[5mm] } %\authorrunning{Short form of author list} % if too long for running head %\institute{Laboratory for Information and Inference Systems (LIONS), EPFL, Switzerland\\ % \email{bang.vu@epfl.ch\and ahmet.alacaoglu@epfl.ch; mehmet.sahin@epfl.ch;alp.yurtsever@epfl.ch; mehmet.sahin@epfl.ch;volkan.cehver@epfl.ch} } \date{Received: date / Accepted: date} % The correct dates will be entered by the editor \maketitle \begin{abstract} We propose a splitting method for solving .... \keywords{Non-linear constraint \and Non-convex \and Smoothing\and Primal-dual} % \PACS{PACS code1 \and PACS code2 \and more} \subclass{47H05\and 49M29\and 49M27\and 90C25} \end{abstract} \section{Introduction \label{intro}} \edita{Various problems in engineering and computational sciences can be cast as non-linear optimization programs, and the design of efficient numerical algorithms to provably solve such problems is therefore of fundamental importance. cite? %Non-linear programming is a broad discipline in applied mathematics. In this paper, we are particularly interested in solving the optimization program \begin{equation} \label{prob:01} \begin{cases} \min_{u} h(u),\\ A(u) = b,\\ u\in C, \end{cases} \end{equation} where $h:\mathbb{R}^d\rightarrow\mathbb{R}$ and $A:\mathbb{R}^d\rightarrow\mathbb{R}^m$ satisfy \begin{align} \| \nabla h(u) - \nabla h(u')\| \le \lambda_h \|u-u'\|, \qquad \| Dh(u) - Dh(u') \| \le \lambda_A \|u-u'\|, \label{eq:smoothness basic} \end{align} for every $u,u'\in \mathbb{R}^d$. Above, $\nabla h(u)\in \mathbb{R}^d$ is the gradient of $h$ and $DA(u)\in \mathbb{R}^{m\times d}$ is the Jacobian of $A$. 
%where $h\in \mathbb{L}_{d}(\lambda_h)$ is a continuously-differentiable function from $\mathbb{R}^d$ to $\RR$ with $\lambda_h$ Lipschitz-continuous gradient, $A: \mathbb{R}^d\rightarrow\mathbb{R}^m$ is a non-linear function, with each component $A_i\in \mathbb{L}_d(\lambda_A) $ for $i\in \{1,\cdots,m\}$. Moreover, $C\subset \RR^d$ is non-empty, closed, and convex. %\edita{Program \eqref{prob:01} is typically} non-convex and \edita{can be considered as a standard non-linear program, where the set $C$ %is modeled by inequalities constraints, namely, $C = \menge{u}{g(u) \leq 0}$ for some $g\colon\RR^d\to \RR^s$.} Variants of Program~\eqref{prob:01} naturally arise in a broad range of applications in ?? \notea{Please add some representative applications above alongside some references. } For the sake of brevity, we showcase here one instance of Program $\eqref{prob:01}$.} \begin{example}\edita{{{\textbf{(Burer-Monteiro factorization)}}} Let $\mathbb{S}^{d'\times d'}$ be the space of $d'\times d'$ symmetric matrices, equipped with the standard inner product $\langle x|y\rangle = \text{tr}(x^*y)$. In particular, when $x\in \mathbb{S}^{d'\times d'}$ is positive semi-definite, we write that $x\succeq 0$.} % Consider $\mathcal{C}'\subseteq \mathcal{X}$, and let $h_0\colon\mathcal{X}\to \RR$ be a differentiable %convex function, with $\mathbb{L}_{0}$ Lipschitz-continuous gradient.} % \edita{Consider the program \begin{equation} \label{e:fac} \begin{cases} \min_x h'(x) \\ A'(x) = b'\\ x\in C'\\ x \succeq 0 , \end{cases} \end{equation} where $h': \mathbb{S}^{d'\times d'} \to \RR$, $A'\colon\mathbb{S}^{d'\times d'}\to\RR^m$, $b\in\RR^m$, and $C' \subseteq \mathbb{R}^{d'\times d'}$. Variants of Program \eqref{e:fac} are popular in matrix completion and sensing \cite{park2016provable}, with a broad range of applications to problems in collaborative filtering, geophysics, and imaging, among others~\cite{Burer2005,Burer2003,tu2014practical}. Two common choices for $C'$ in Program \eqref{e:fac} are $C' =\{x: x \ge 0\}$ and $C' = \{x: \text{tr}(x) \le 1\}$ \cite{mixon2016clustering}.} \edita{Solving Program \eqref{e:fac} with semi-definite programming is not scalable, becoming increasingly cumbersome as the dimension $d'$ grows. To overcome this {computational bottleneck}, the factorized technique sets $x = uu^\top$ for $u\in \mathbb{R}^{d'\times r}$ and a sufficiently large $r$. The resulting non-convex program is then solved with respect to the much lower-dimensional variable $u$. If we also replace the constraint $uu^\top \in C'$ with $u\in C$ for a properly chosen convex set, the new problem in $u$ matches Program \eqref{prob:01} with $h(u) = h'(uu^\top)$ and $A(u) = A'(uu^\top)$. For our examples of $C'$ above, we might choose $C=\{u:u\ge 0\}$ and $C=\{\|u\|_F^2 \le 1\}$, respectively. Here, $\|\cdot\|_F$ stands for the Frobenius norm. } \end{example} \edita{The \emph{augmented Lagrangian method} \cite{luenberger1984linear} is a powerful approach to solve Program \eqref{prob:01}, see Section \ref{sec:related work} for a review of the related literature as well as other approaches to solve Program \eqref{prob:01}}. 
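\begin{remark}
For concreteness, the following special case of Program \eqref{e:fac}, used here purely as an illustration and nowhere in the analysis, may be kept in mind. Take $h'(x) = \langle w, x\rangle$ for a symmetric matrix $w\in \mathbb{S}^{d'\times d'}$, $A'(x) = \operatorname{diag}(x)$, and $b' = \mathbf{1}\in\RR^{d'}$, reminiscent of the semi-definite relaxation of max-cut. After the factorization $x = uu^\top$ with $u\in\RR^{d'\times r}$, Program \eqref{prob:01} is specified by
\begin{align}
h(u) = \langle w , uu^\top \rangle,
\qquad
A_i(u) = \|u_i\|^2, \quad i=1,\cdots,d',
\end{align}
where $u_i\in\RR^r$ denotes the $i$-th row of $u$. In this case,
\begin{align}
\nabla h(u) = 2wu,
\qquad
DA(u)^\top y = 2\operatorname{diag}(y)\, u,
\end{align}
so both gradients amount to matrix multiplications, and the projection $P_C$ onto, say, $C = \{u : \|u\|_F^2 \le d'\}$ (a set that contains all feasible points here) is a simple rescaling. This is the kind of setting in which the cheap update \eqref{eq:new update} below is particularly attractive.
\end{remark}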
\edita{ Indeed, for positive $\beta$, it is easy to verify that Program \eqref{prob:01} is equivalent to \begin{align} \min_{u\in C} \max_y \, \mathcal{L}_\beta(u,y), \label{eq:minmax} \end{align} where \begin{align} \label{eq:Lagrangian} \mathcal{L}_\beta(u,y) := h(u) + \langle A(u)-b, y \rangle + \frac{\|A(u)-b\|^2}{2\beta}, \end{align} is the augmented Lagrangian corresponding to Program \eqref{prob:01}. The equivalent formulation in Program \eqref{eq:minmax} naturally suggests the following algorithm to solve Program \eqref{prob:01}:} \begin{equation}\label{e:exac} u_{k+1} \in \underset{u\in C}{\argmin} \, \mathcal{L}_{\beta}(u,y_k), \end{equation} \begin{equation} y_{k+1} = y_k+\frac{A(u_{k+1})-b}{\beta}. \end{equation} In fact, when the penalty parameter $\beta$ is sufficiently small, the augmented Lagrangian has a local minimum point near the true optimal point. However, we do not know exactly how small $\beta$ is. Hence, the choice of $\beta$ plays a centreral role in practices. \notea{Is the last claim really true? Programs \eqref{prob:01} and \eqref{eq:minmax} seem to be equivalent. } \edita{In our nonlinear framework, updating $u$ in the augmented Lagrangian method requires solving the non-convex Program \eqref{e:exac} to global optimality, which is often intractable. } \notea{We should discuss fixes to this issue, if any, and explain why they are not satisfactory.} \edita{The key contribution of this paper is to provably and efficiently address this challenge.} \edita{ \paragraph{\emph{\textbf{Contributions.}} } In order to solve Program \eqref{prob:01}, this paper proposes to replace the (intractable) Program \eqref{e:exac} with the update \begin{equation} u_{k+1} = P_C (u_k - \gamma_k \nabla \mathcal{L}_{\beta_k} (u_k,y_k)), \label{eq:new update} \end{equation} for carefully selected sequences $\{\beta_k,\gamma_k\}_k$. Here, $P_C$ is the orthogonal projection onto the convex set $C$ which is often easy to compute in various applications and consequently the update in \eqref{eq:new update} is inexpensive and fast. Put differently, instead of fully solving Program \eqref{e:exac}, this paper proposes to apply one iteration of the projected gradient algorithm for every update. We provide the convergence guarantees for this fast and scalable new algorithm. }\notea{We should summarize the guarantees.} %%%%%%%%%%%%%%%%%%%%% \section{ Preliminaries} \notea{I think the whole of this section should move down. The actual results are hidden deep in the paper!} \paragraph{\textbf{\emph{Notation.}}} We use the notations $\scal{\cdot}{\cdot}$ and $\|\cdot\|$ for the \edita{standard inner} product and \edita{the} associated norm on $\RR^d$\edita{, respectively}. \edita{The adjoint of \edita{a} linear operator is denoted the superscript $\top$.} Let $C\subset \mathbb{R}^d$ be nonempty, closed, \edita{and convex}. The indicator function of $C$ is denoted by $\iota_{\mathcal{C}}$, and the projection onto $C$ is denoted by $P_C$. %The distance function is $d_{\mathcal{C}}\colon u\mapsto \inf_{a\in\mathcal{C}}\|u-a\|$. The projection of $x$ onto $C$ is denoted by $P_Cx$. \edita{For $u\in C$, the tangent cone to $C$ at $u$ is \begin{equation} T_{C}(u) = \left\{v\in \RR^d : \exists t > 0 \text{ such that } u+t v \in C\right\}. \end{equation} The corresponding normal cone $N_C(u)$ at $u$ is the polar of the tangent cone, namely, \begin{align} N_C(u) = \left\{ v': \langle v, v' \rangle \le 0,\, \forall v\in T_C(u) \right\}. 
\end{align}} %The regular normal cone $\hat{N}_C(\overline{u})$ is defined as the dual to the tangent cone, $\hat{N}_C(\overline{u}) = T_{C}(\overline{u})^*$. %The (Mordukhovich) limiting normal cone to $C$ at $\overline{u}$ is defined by %\begin{equation} %N_C(\overline{u}) = \menge{v\in \RR^d}{\exists u_k\to \overline{u}, v_k\to v \;\text{with}\; (\forall k\in\NN)\; v_k \in \hat{N}_{C}(u_k)}. %\end{equation} \edita{ The sub-differential of a convex function $f$ at $u$ is defined as % %Let $f\colon \RR^d\to (-\infty, +\infty$ be a proper, lower semi-continuous, convex function. The sub-differential of $f$ at $p$ is \begin{equation} \partial f(u)= \left\{ g : f(u') - f(u) \ge \langle g, u'-u\rangle, \,\, \forall u' \right\}. \end{equation} } \edita{In particular,} if $f$ is differentiable at $u$, $\partial f(u)$ is a singleton and denoted by $\nabla f(u)$. \label{s:nota} \paragraph{\textbf{\emph{\edita{Necessary Optimality Conditions.}}} \label{sec:opt cnds}} \edita{Necessary optimality conditions} for \edita{Program} \eqref{prob:01} are well studied in the literature \cite[Corollary 6.15]{rockafellar2009variational}. \edita{Indeed, $u$ is a (first-order) stationary point of Program \eqref{prob:01} if there exists $y$ for which \begin{align} \begin{cases} -\nabla h(u) - DA(u)^\top y \in N_C(u)\\ A(u) = b. \end{cases} \label{e:inclu1} \end{align} Here, $DA(u)$ is the Jacobian of $A$ at $u$. Recalling \eqref{eq:Lagrangian}, we observe that \eqref{e:inclu1} is equivalent to \begin{align} \begin{cases} -\nabla_u \mathcal{L}_\beta(u,y) \in N_C(u)\\ A(u) = b, \end{cases} \label{e:inclu1} \end{align} which is in turn the necessary optimality condition for Program \eqref{eq:minmax}. } % %Let $\overline{u}$ be a locally optimal and suppose that there no vector $y\not=0$ such that %\begin{equation} %-\nabla L(\overline{u})^* y \in N_C(\overline{u}). %\end{equation} %Then, the first order optimality condition for $\overline{u}$ is %\begin{equation}\label{e:inclu1} %(\exists y \in\RR^m)\; - \nabla L(\overline{u})^*y - \nabla h(\overline{u}) \in N_{C}(\overline{u}). %\end{equation} %Since $\partial \iota_{\{0\}} = \RR^m$, the condition \eqref{e:inclu1} is equivalent to %\begin{equation} %0 \in \partial \iota_C(\overline{u}) +\nabla h(\overline{u}) + \nabla L(\overline{u})^* \partial \iota_{\{0\}}(L\overline{u}-b). %\end{equation} %Observe that the condition \eqref{e:inclu1} is also equivalent to the following condition %\begin{equation}\label{e:inclu2} %(\exists y \in\RR^m)\; 0 \in \nabla \mathcal{L}(\overline{u},y), %\end{equation} %where $\mathcal{L}(u,y)$ is the Lagrangian function associated to the non-linear constraint $Lu=b$, %\begin{equation} % \mathcal{L}_{\beta}(u,y) = (h +\iota_C)(u) + \scal{Lu-b}{y}. %\end{equation} %The corresponding augmented Lagrangian function associated to the non-linear constraint $Lu=b$ is defined by %\begin{equation} %(\forall \beta \in \left]0,+\infty\right[)\quad \mathcal{L}_{\beta}(u,y) = (h+\iota_C)(u) + \scal{Lu-b}{y} +\frac{1}{2\beta}\|Lu-b\|^2. %\end{equation} %For convenience, we define %\begin{equation} %(\forall \beta \in \left]0,+\infty\right[)\quad g_{\beta}(u,y) = \scal{Lu-b}{y} +\frac{1}{2\beta}\|Lu-b\|^2. %\end{equation} %and %\begin{equation} %(\forall \beta \in \left]0,+\infty\right[)\quad F_{\beta}(u,y) = h(u)+ g_{\beta}(u,y). 
%\end{equation} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \paragraph{\textbf{\emph{Gradient Mapping.}}} In nonconvex optimization, the relation between the gradient mapping and stationarity is well-understood \cite{Lan16,Hare2009,bolte2014proximal}, \edita{which we review here for completeness.} \begin{definition} \label{def:grad map} Given $u$ and $\gamma >0$, define the gradient mapping \begin{equation} G_{\beta,\gamma}(\cdot; y)\colon u\rightarrow \frac{u-u^+}{\gamma}, \label{eq:grad map} \end{equation} where $u^+=P_{C}(u-\gamma \nabla \mathcal{L}_ {\beta}(u,y))$. \end{definition} %\begin{definition} Given $u$ and $\gamma >0$, let $r_{\beta}(\xi,y)$ be a stochastic estimimate of $\nabla F_{\beta}(u,y)$, define %stochastic gradient mapping $SG_{\beta,\gamma}(\cdot; y)\colon u\mapsto \gamma^{-1}(u-u_+)$, where $u_+=\prox_{\gamma f}(u-\gamma r_{\beta}(\xi;y))$. %\end{definition} \edita{In particular, if we remove the constraints of Program \eqref{prob:01}, the gradient mapping reduces to $G_{\beta,\gamma}(u,y)=\nabla h(u) $. The gradient mapping is closely related to $\mathcal{L}_\beta$. The following standard result is proved in Appendix~\ref{sec:proof of smoothness}.} \begin{lemma}\label{lem:11} \edita{For fixed $y\in \RR^m$, suppose that $\nabla_u \mathcal{L}_{\beta}(\cdot, y)$ is $\lambda_\beta$ Lipschitz-continuous. For $u\in C$ and $\gamma \in (0, 1/\lambda_\beta)$, it holds that } \begin{equation} \label{e:deslem} \| G_{\beta,\gamma}(u;y)\|^{2}\leq \frac{2}{\gamma} (\mathcal{L}_\beta(u;y) - \mathcal{L}_\beta(u^+;y)), \end{equation} \edita{where \begin{align} \lambda_\beta & \le \lambda_h + \sqrt{m}\lambda_A \left(\|y\| + \frac{\|A(u)\|}{\beta} \right) + \frac{\|DA(u)\|_F^2}{\beta}, \label{eq:smoothness of Lagrangian} \end{align} where $DA(u)$ is the Jacobian of $A$ at $u$, \editf{$\lambda_h$ and $\lambda_A$ are the bound on the hessian of h and operators $A_i(u)$\footnote{ Notice that $(A(u) )_i = (A_i(u)$), where $A: \mathbb{R}^d \rightarrow \mathbb{R}^m$ and $A_i(u): \mathbb{R}^d \rightarrow \mathbb{R}$. }, respectively.} } %where $u^+ =u^+(\mu,u) = P_{C}(u-\gamma \nabla F_{\beta}(u;y))$. %\end{enumerate} \end{lemma} %\begin{proof} For a fixed $y$, the function $u\mapsto h(u)+F_{\beta}(u;y)$ is $(\mathbb{L}_h+ \mathbb{L}_{\beta})$-Lipschitz. Hence, the results follows from %\cite[Lemma 3.2, Remark 3.2(i)]{bolte2014proximal}. %\end{proof} In \edita{practice}, the Lipschitz constant $\lambda_{\beta}$ is often hard to evaluate exactly \edita{and we might resort to the classic line search technique, reviewed below and proved in Appendix \ref{sec:proof of eval Lipsc cte} for completeness.} \edita{ \begin{lemma} \label{lem:eval Lipsc cte} Fix $\theta \in (0,1)$ and ${\gamma}_0$. For $\gamma'>0$, let $u^+_{\gamma'} = P_C(u - \gamma' \nabla \mathcal{L}_\beta(u,y))$ and define \begin{equation*} \gamma := \max \left\{ \gamma' ={\gamma}_0 \theta^i : \mathcal{L}_\beta (u^+_{\gamma'},y) \le \mathcal{L}_\beta (u,y) + \left\langle u^+_{\gamma'} - u , \nabla \mathcal{L}_{\beta}(u,y) \right\rangle + \frac{1}{2\gamma'} \| u^+_{\gamma'} - u\|^2 \right\}. \label{eq:defn of gamma line search} \end{equation*} Then, (\ref{e:deslem}) holds and moreover we have that \begin{align} \gamma \ge \frac{\theta}{\lambda_\beta}. \label{eq:moreover} \end{align} \end{lemma} } %Let $\delta $ and $\theta$ be in $\left]0,1\right[$ and $\overline{\gamma} > 0$. 
Define %\begin{alignat}{2}\label{e:non-lips} %&\gamma = \max\{\mu > 0| (\exists i \in\NN)(\mu= \overline{\gamma}\theta^i)\notag\\ %&F_{\beta}(u^+(\mu,u),y) \leq F_{\beta}(u,y) + \scal{u^+(\mu,u)-u}{\nabla F_{\beta}(u,y)} +\frac{\delta}{\mu}\|u-u^+(\mu,u)\|^2\}. %\end{alignat} %Now, let $\gamma$ be define as in \eqref{e:non-lips}. Then, since $u^+ =u^+(\mu,u) = P_{C}(u-\gamma \nabla F_{\beta}(u;y))$, %\begin{equation} %G_{\beta,\gamma}(u) - \nabla F_{\beta}(u;y)) \in N_{C}(u^+). %\end{equation} %Hence, $\scal{G_{\beta,\gamma}(u) - \nabla F_{\beta}(u;y)) }{u-u^+} \leq 0$. Using \eqref{e:non-lips}, we have %\begin{alignat}{2} %F_{\beta}(u^+,y) &\leq F_{\beta}(u,y) + \scal{u-u^+}{ -\nabla F_{\beta}(u,y)} +\frac{\delta}{\gamma}\|u-u^+\|^2\notag\\ %&= F_{\beta}(u,y) + \scal{u-u^+}{ G_{\beta,\gamma}(u) -\nabla F_{\beta}(u,y)} +\frac{\delta-1}{\gamma}\|u-u^+\|^2\notag\\ %&\leq F_{\beta}(u,y) - \frac{\delta-1}{\gamma}\|u-u^+\|^2, %\end{alignat} %which implies that %\begin{equation} %(1-\delta) \| G_{\beta,\gamma}(u;y)\|^{2}\leq \frac{1}{\gamma} (F_{\beta}(u;y) - F_{\beta}(u^+;y)). %\end{equation} %In particular, by taking $\delta =1/2$, we obtain \eqref{e:deslem}. \edita{Optimality conditions in Section \ref{sec:opt cnds} can also be expressed in terms of the gradient mapping. Indeed, it is straightforward to verify that $u^+$ is a first-order stationary point of Program \eqref{prob:01} if \begin{align} \begin{cases} G_{\beta,\gamma}(u,y) = 0\\ A(u^+) = b. \end{cases} \label{eq:opt grad map} \end{align} } % % % %\begin{lemma} %\label{l:solv} % Suppose that $\mathcal{L}_{\beta,\gamma}\in \mathbb{L}_d(\lambda_\beta)$. For $\gamma \in (0, 1/\lambda_\beta)$, $u^+=P_C(u-\gamma \nabla \mathcal{L}_\beta(u,y))$ is a first-order stationary point of Program (\ref{prob:01}) if % % $\nabla g_{\beta}(\cdot, y)$ is $\mathbb{L}_{\beta}$-Lipschitz continuous. Let $u\in C$ and %$\gamma \in ]0, 1/(\mathbb{L}_h+ \mathbb{L}_{\beta})[$. Suppose that $Lu^+=b$ and $\| G_{\beta,\gamma}(u;y)\|^{2} =0$. %Then $u^+$ is a stationary point, i.e., the first order optimality condition \eqref{e:inclu1}, for $ \overline{u} =u^+$, is satisfied. %\end{lemma} %\begin{proof} %Since $u^+ = P_{C}(u-\gamma \nabla F_{\beta}(u;y))$. Then % it follows that %\begin{equation} %\label{e:ver1} %G_{\gamma,\beta}(u;y) - \nabla F_{\beta}(u;y) \in N_C(u^+). %\end{equation} %Adding $\nabla F_{\beta}(u^+;y)$ to both sides of \eqref{e:ver1}, we obtain %\begin{equation} %\label{e:ver2} %G_{\gamma,\beta}(u;y) - \nabla F_{\beta}(u;y) +\nabla F_{\beta}(u^+;y) \in\partial f(u^+)+ \nabla F_{\beta}(u^+;y). %\end{equation} %Using the Lipschitzian gradient of $F_{\beta}(\cdot, y)$ and $\gamma (\mathbb{L}_\beta +\mathbb{L}_{h}) \leq 1$, we see that %\begin{alignat}{2} %&\|G_{\gamma,\beta}(u;y) - \nabla F_{\beta}(u;y) - \nabla F_{\beta}(u^+;y) \| \notag\\ %&\quad\leq \|G_{\gamma,\beta}(u;y)\|+\|\nabla F_{\beta}(u;y) - \nabla F_{\beta}(u^+;y) \|\notag\\ %&\quad\leq \|G_{\gamma,\beta}(u;y)\|+( \mathbb{L}_\beta +\mathbb{L}_{h})\|u-u^+\|\notag\\ %&\quad =0, %\end{alignat} %which means that $G_{\gamma,\beta}(u;y) - \nabla F_{\beta}(u;y) - \nabla F_{\beta}(u^+;y)=0$. Hence, we derive from %\eqref{e:ver2} that %\begin{equation} %\label{conkhi} %0 \in\partial f(u^+)+ \nabla F_{\beta}(u^+;y). 
%\end{equation} %By definition of $F_{\beta}(u^+;y)$ and $Lu^+=b$, we have %\begin{alignat}{2} %\nabla F_{\beta}(u^+;y) &= \nabla h(u^+) + \nabla L(u^+)^*y + \frac{1}{\beta} \nabla L(u^+)^*(Lu^+-b) \notag\\ %&= \nabla h(u^+) + \nabla L(u^+)^*y, %\end{alignat} %which together with \eqref{conkhi}, shows that \eqref{e:inclu1} is satisfied. %\end{proof} \paragraph{\textbf{\emph{\edita{Sufficient Optimality Conditions.}}}} Sufficient optimality conditions for Program \eqref{prob:01} are also well understood in the litterature \cite{luenberger1984linear,rockafellar2009variational,mordukhovich2006variational,gfrerer2015complete}. Indeed, $u$ is a local minimizer of Program \eqref{prob:01} if there exists $y$ for which \begin{align} \begin{cases} v^\top \left( \nabla_{uu} h(u) + \sum_{i=1}^m \nabla_{uu} A_i(u) \right) v \ge 0, \qquad \forall v \in T_C(u),\\ A(u) = b. \end{cases} \label{eq:suff cnd} \end{align} \notea{Why does above look different from sufficient cnds for Lagrangian? Suppose to be equivalent problems.} %For simple, let us assume that $C = \menge{u}{g(u)\leq c}$ for some $\mathcal{C}^2$-function $g\colon\RR^d\to\RR$. In this section we assume that $h$ and $L$ are too $\mathcal{C}^2$-functions. %Let $\overline{u}$ be a point such that $g(\overline{u})=c$ and $0\not\in\nabla g(\overline{u})$. Then, \cite[Proposition 10.3]{rockafellar2009variational} implies that %\begin{equation} %N_{C}(\overline{u}) =\menge{\mu \nabla g(\overline{u})}{\mu \geq 0}. %\end{equation} %Now suppose that the first oder optimality condition \eqref{e:inclu1} is satisfied for $\overline{u}$. Then there exists $\mu \geq 0$ and $y \in\RR^m$ such that %\begin{equation}\label{e:inclu1} % \nabla L(\overline{u})^*y + \nabla h(\overline{u}) +\mu \nabla g(\overline{u})=0. % \end{equation} % Therefore, inview of \cite[Chapter 11]{luenberger1984linear}, $\overline{u}$ is a local minimum provided that % the Hessian matrix % \begin{equation} % H(\overline{u}) = \nabla^2h(\overline{u}) + y^T\nabla^2 L(\overline{u}) + \mu \nabla^2 g(\overline{u}) % \end{equation} % is positive definite on the space %\begin{equation} %E= \menge{y}{\nabla L(\overline{u})y =0, \nabla g(\overline{u})y=0}. %\end{equation} %In our examples, this condition is checkable where $g(u) = \frac{1}{2}\|u\|^2$, $h$ is quadratic and $Lu = Muu^\top$. \section{Algorithm $\&$ Convergence} \subsection{Algorithm} We propose the following method for solving the problem \eqref{prob:01} where, the main idea is that we do a projected gradient descent step on $u$ to obtain $u^+$ and update the penalty parameter $\beta^+$ in such a way that the feasiblity $\frac{1}{2\beta^+\gamma}\|Lu^+-b\|^2$ reduce faster than the gradient mapping up to some noise level $\omega$: \begin{equation} \frac{1}{2\beta^+\gamma}\|Lu^+-b\|^2\leq \frac{1}{8} \|G_{\beta,\gamma}(u,y) \|^2 + \frac{\omega}{\gamma} \end{equation} Then update the corresponding the multiplier $y$ as in the classical ADMM: \begin{equation} y^+ = y+\frac{1}{\sigma}(LU^+-b). \end{equation} The formal algorithm is presented as follows. \begin{algorithm} \label{Algo:2} Input: $\beta_0> 0$, $c > 0$, $\alpha \in \left[0,1\right[$, $u_{0}\in \mathcal{C}$, $y_0 =0$, $\epsilon_1\in \left]0,1\right[$. Given $\beta_k$, choose $\gamma_{k} \leq \frac{1-\epsilon_1}{\mathbb{L}_{h}+\mathbb{L}_{\beta_{k}}}.$ Iterate \\ For k=0,1,\ldots\\ 1. Projected gradient: $ u_{k+1} = P_{\mathcal{C} }(u_{k} - \gamma_{k} \nabla F_{\beta_{k}}(u_{k},y_{k})).$\\ 2. 
Line search step\\ \quad $s=0, d_{k,0}=2$, $\beta_{k+1,0}= \frac{1}{2} \|Lu_{k+1}-b \|^2 \bigg( \frac{\gamma_{k}}{8} \|G_{\beta_{k},\gamma_{k}}(u_k,y_{k}) \|^2 + \frac{d_{k,s}}{{(1+k)}^{1+\epsilon_1}}\bigg)^{-1}.$\\ While $\beta_{k+1,s} \geq c/(k+1)^{\alpha}$ do \begin{alignat}{2}\label{e:mc} % \beta_{k+1}\geq \frac{1}{2} \|Lu_{k+1}-b \|^2 \bigg( \frac{\gamma_{k}}{8} \|G_{\beta_{k},\gamma_{k}}(u_k,y_{k}) \|^2 + \frac{d_k}{(k+1)^\alpha}\bigg)^{-1}. d_{k,s+1} &= 2*d_{k,s} \\ \beta_{k+1,s+1}&= \frac{1}{2} \|Lu_{k+1}-b \|^2 \bigg( \frac{\gamma_{k}}{8} \|G_{\beta_{k},\gamma_{k}}(u_k,y_{k}) \|^2 + \frac{d_{k,s+1}}{{(1+k)}^{1+\epsilon_1}}\bigg)^{-1}\\ s&\leftarrow s+1. \end{alignat} Endwhile\\ 3. Update $\beta_{k+1} = \beta_{k+1,s}$.\\ 4. Chose $\sigma_{k+1} \geq 2\beta_{k}$ and update $y_{k+1} = y_{k} + \frac{1}{\sigma_{k+1}} (Lu_{k+1}-b)$.\\ \end{algorithm} \begin{remark} The updating rule of $(\beta_k)_{k\in\NN}$ in \eqref{e:mc} plays a role in our analysis. Intuitively, if $u_{k+1}$ is solution then $Lu_{k+1}=b$ and \eqref{e:mc} is trivially satisfied for any $\beta_{k+1}\geq 0$. Hence $\beta_{k+1}$ enforces $u_{k+1}$ close to $\menge{u}{Lu=b}$ \end{remark} \begin{remark}When $\sigma_k\equiv \infty$, we get $y_k\equiv 0$ and hence the step 2 disappears. If we chose $\sigma_k = c(k+1)^{\alpha_1}\|Lu_k-b\|$ where $c,\alpha_1$ is chosen such that $\sigma_{k} > 2\beta_{k-1}$, then \begin{equation} \|y_{k+1}\| \leq \|y_k\| + \frac{\|Lu_{k+1}-b\|}{\sigma_{k+1}} = \|y_k\| + \frac{1}{c(k+2)^\alpha}. \end{equation} Since $\sum_{k\in\NN} \frac{1}{c(k+2)^\alpha} <+\infty$, $(\|y_k\|)_{k\in\NN}$ converges and hence bounded. Therefore, \begin{equation} b_0 = \inf_{k\in\NN} \mathcal{L}_{\beta_{k}}(u_{k+1},y_{k}) \geq \inf_{k} h(u_{k}), \end{equation} which implies that $b_0>-\infty$ whenever $(u_k)_{k\in\NN}$ or $\dom(f)$ is bounded. \end{remark} \subsection{Convergence} In view of Lemma \ref{l:solv}, we need to estimate gradient mapping $\|G_{\beta_{k},\gamma_{k}}(u_k,y_{k})\|$ as well as feasibility $\|Lu_{n+1}-b\|^2$. \begin{theorem} \label{t:1} Suppose that $b_0 = \inf_{k\in\NN} \mathcal{L}_{\beta_{k}}(u_{k+1},y_{k}) > -\infty$ and that $z_0=\sum_{k=1}^\infty \frac{d_{k,s_k}}{(1+k)^{1+\epsilon_1}} <+\infty$, where $s_k$ be the smallest index such that $\beta_{k,s_k} < c/(k+1)^{\alpha}$. Then the following hold. \begin{equation}\label{e:mapp1} \sum_{k=1}^\infty \gamma_{k} \|G_{\beta_{k},\gamma_{k}}(u_k,y_{k}) \|^2 \leq 4(\mathcal{L}_{\beta_0}(u_{1},y_0) + z_0-b_0+\frac{\gamma_0}{8}\|G_{\beta_0,\gamma_0}(u_0)\|^2), \end{equation} and \begin{equation}\label{e:feas1} \sum_{k=1}^\infty \frac{1}{\beta_{k+1}} \|Lu_{k+1}-b \|^2 \leq (\mathcal{L}_{\beta_0}(u_{1},y_0) + 3z_0-b_0+\frac{\gamma_0}{8}\|G_{\beta_0,\gamma_0}(u_0)\|^2). \end{equation} \end{theorem} % \begin{proof} Set $e_{k+1}= \frac{d_{k,s_k}}{(k+1)^\alpha}$. Then $z_0= \sum_{k\in\NN}e_k <+\infty$. % It follows from Lemma \ref{lem:11} that % \begin{alignat}{2} %G_k= \frac{\gamma_{k}}{2}\|G_{\beta_k,\gamma_k}(u_k,y_k) \|^2 &\leq \mathcal{L}_{\beta_k}(u_k,y_k) - \mathcal{L}_{\beta_{k}}(u_{k+1},y_k)\notag\\ % &= h(u_k) - h(u_{k+1}) + g_{\beta_k}(u_k,y_k) -g_{\beta_{k}}(u_{k+1},y_k)\notag\\ % &=h(u_k) - h(u_{k+1}) + g_{\beta_{k-1}}(u_k,y_{k-1}) -g_{\beta_{k}}(u_{k+1},y_k)\notag\\ % &\quad + g_{\beta_k}(u_k,y_k)- g_{\beta_{k-1}}(u_k,y_{k-1}), \label{e:sa1} % \end{alignat} % where we set $g_{\beta}(u,y) = \scal{Lu-b}{y} +\frac{1}{2\beta}\|Lu-b\|^2$. Let us estimate the last term in \eqref{e:sa1}. 
We have % \begin{alignat}{2} % \omega_{1,k}= g_{\beta_k}(u_k,y_k)- g_{\beta_{k-1}}(u_k,y_{k-1}) = \big(\frac{1}{2\beta_k} -\frac{1}{2\beta_{k-1}}\big)\|Lu_k-b\|^2+\scal{Lu_k-b}{y_k-y_{k-1}}. % \end{alignat} % Since $y_{k} = y_{k-1} + \frac{1}{\sigma_{k}} (Lu_{k}-b)$ and use \eqref{e:mc}, we get % \begin{alignat}{2} % &\quad \omega_{1,k} = \big(\frac{1}{2\beta_k} -\frac{1}{2\beta_{k-1}}\big)\|Lu_k-b\|^2+ \frac{1}{\sigma_k} \|Lu_k-b\|^2. % %&\leq \big(\frac{1}{2\beta_k} -\frac{1}{2\beta_{k-1}}\big)\|Lu_k-b\|^2+ \frac{\gamma_{k-1}}{8} \|G_{\beta_{k-1},\gamma_{k-1}}(u_{k-1},y_{k-1}) \|^2 + \frac{d_{k-1}}{k^\alpha}\notag\\ % %&\leq \big(\frac{1}{2\beta_k} -\frac{1}{2\beta_{k-1}}\big)\|Lu_k-b\|^2+\frac{1}{4} G_k+ \frac{1}{4} G_{k-1} + \frac{d_{k-1}}{k^\alpha}. % \end{alignat} % Let us estimate the first term in \eqref{e:sa1}. Set $T_k= h(u_k) +\scal{Lu_k-b}{y_{k-1}} $. Then % \begin{alignat}{2} % \omega_{2,k}&= h(u_k) - h(u_{k+1}) + g_{\beta_{k-1}}(u_k,y_{k-1}) -g_{\beta_{k}}(u_{k+1},y_k)\notag\\ % & = T_k -T_{k+1} + \frac{1}{2\beta_{k-1}} \|Lu_k-b\|^2 - \frac{1}{2\beta_k}\|Lu_{k+1}-b\|^2. % \end{alignat} % Therefore, we derive from \eqref{e:sa1} that % \begin{alignat}{2} %G_k &\leq \omega_{1,k} +\omega_{2,k}\notag\\ % &= T_k -T_{k+1} + \frac{1}{2\beta_k} \big(\|Lu_k-b\|^2 -\|Lu_{k+1}-b\|^2\big)+\frac{1}{\sigma_k} \|Lu_k-b\|^2.\label{e:sa2} % \end{alignat} %Now using the condition \eqref{e:mc}, we obtain %\begin{equation} % \frac{1}{2\beta_k} \|Lu_k-b\|^2 \leq \frac{1}{4} G_{k-1} + e_k\leq \frac{1}{4}G_k+ \frac{1}{4} G_{k-1} + e_k, %\end{equation} %Therefore, it follows from \eqref{e:sa2} that % \begin{alignat}{2} % \frac12 G_k &\leq (T_k+\frac{1}{4} G_{k-1}) -(T_{k+1} +\frac{1}{4} G_k )+ \frac{1}{\sigma_k} \|Lu_k-b\|^2 - \frac{1}{2\beta_k}\|Lu_{k+1}-b\|^2 + e_k\notag\\ % &\leq (T_k+\frac{1}{4} G_{k-1}) -(T_{k+1} +\frac{1}{4} G_k )+ \frac{1}{2\beta_{k-1}} \|Lu_k-b\|^2 - \frac{1}{2\beta_k}\|Lu_{k+1}-b\|^2 + e_k. \label{e:sa3} % \end{alignat} % For every $N\in\NN$, $N\geq 1$, summing \eqref{e:sa3} from $k=1$ to $k=N$, we obtain, % \begin{alignat}{2} % \sum_{k=1}^{N}\frac12 G_k \leq T_1 +\frac{1}{4} G_0 + \frac{1}{\beta_{0}} \|Lu_1-b\|^2 - T_{N+1} - \frac{1}{4}G_{N} - \frac{1}{2\beta_N} \|Lu_{N+1}-b\|^2 + z_0. % \end{alignat} % Note that, by the definiton of $T_{N+1}$, we have % \begin{alignat}{2} % -T_{N+1} - \frac{1}{2\beta_N} \|Lu_{N+1}-b\|^2& = - \mathcal{L}_{\beta_N}(u_{N+1},y_N) \leq -b_0. % \end{alignat} % Hence, % \begin{equation} % \sum_{k=1}^{N}\frac12 G_k \leq \mathcal{L}_{\beta_0}(u_{1},y_0) + z_0-b_0 +\frac{1}{4} G_0 , % \end{equation} % which proves \eqref{e:mapp1}. Moreover, \eqref{e:feas1} follows directly from \eqref{e:mc}. % \end{proof} % % \begin{corollary} Under the same condition as in Theorem \ref{t:1}. Suppose that %$\gamma_k = \mathcal{O}(\beta_k)$. Then %\begin{equation}\label{e:mapp2} %\min_{1\leq k\leq N} \|G_{\beta_{k},\gamma_{k}}(u_k,y_{k}) \|^2 = \mathcal{O}(1/N^{1-\alpha}) \to 0, %\end{equation} %and %\begin{equation}\label{e:feas2} %\min_{1\leq k\leq N} \|Lu_{k+1}-b \|^2 = \mathcal{O}(1/N) \to 0. %\end{equation} % \end{corollary} % \begin{proof} We see that % \eqref{e:mapp2} and \eqref{e:feas2} follow directly % from \eqref{e:mapp1} and \eqref{e:feas1}, respectively. % \end{proof} % \begin{corollary} \label{c:2} % Under the same condition as in Theorem \ref{t:1}. % The sequence $(F_{\beta_k}(u_k,y_{k}))_{k\in\NN}$ converges to a $F^\star$. 
Moreover, % if $(\|y_{k-1}\| \sqrt{\beta_k})_{k\in\NN}$ and $(\beta_{k+1}/\beta_{k})_{k\in\NN}$ are bounded by $M$, then $(h(u_k))_{k\in\NN}$ converges to $F^\star$. % \end{corollary} % \begin{proof} Note that $(F_{\beta_k}(u_{k+1},y_k))_{n\in\NN}$ is bounded below. Moreover, the proof of Theorem \ref{t:1} show that % \begin{equation} % F_{\beta_k}(u_{k+1},y_k) + \frac{\gamma_k}{8}\|\mathcal{G}_{\beta_k,\gamma_k}(u_k)\|^2 % \leq F_{\beta_{k-1}}(u_{k},y_{k-1}) + \frac{\gamma_{k-1}}{8}\|\mathcal{G}_{\beta_{k-1},\gamma_{k-1}}(u_{k-1})\|^2 +e_k. % \end{equation} % Hence $(F_{\beta_k}(u_{k+1},y_k) + \frac{\gamma_k}{8}\|\mathcal{G}_{\beta_k,\gamma_k}(u_k)\|^2)_{n\in\NN}$ converges to a finite value % $F^\star$. Since $\frac{\gamma_k}{8}\|\mathcal{G}_{\beta_k,\gamma_k}(u_k)\|^2\to 0$, we get $F_{\beta_k}(u_{k+1},y_k)\to F^\star$. Since %$\frac{1}{2\beta_k}\|Lu_{k}-b\|^2\to 0$ and $(\beta_{k+1}/\beta_{k})_{k\in\NN}$ is bounded by $M$, we obtain %\begin{equation} %\frac{1}{2\beta_k}\|Lu_{k+1}-b\|^2 =\frac{\beta_{k+1}}{2 \beta_{k+1}\beta_{k}}\|Lu_{k+1}-b\|^2 \leq \frac{M}{2\beta_{k+1}} \|L u_{k+1} -b\|^2 \to 0. %\end{equation} %Moreover, since $(\|y_{k-1}\| \sqrt{\beta_k})_{k\in\NN}$ is bounded by $M$, we also have %\begin{equation} %|\scal{Lu_k-b}{y_{k-1}}| = |\frac{1}{\sqrt{\beta_k}}\scal{Lu_k-b}{\sqrt{\beta_k}y_{k-1}}| \leq \frac{M}{\sqrt{\beta_k}}\|Lu_{k}-b\| \to 0. %\end{equation} %Therefore, %\begin{alignat}{2} %|h(u_n) - F^\star| &\leq |F_{\beta_k}(u_k,y_{k-1})-F^\star| + |\scal{Lu_k-b}{y_{k-1}}| + \frac{1}{2\beta_k}\|Lu_k-b\|^2\notag\\ %&\to 0, %\end{alignat} %which proves the desired result. % \end{proof} % \subsection{Local convergence} % Let $\overline{u}$ and $\overline{y}$ satisfy the first order optimality condition % \begin{equation} % \label{e:fr1} % -\nabla L(\overline{u})^*\overline{y} - \nabla h(\overline{u}) \in N_{C}(\overline{u}). % \end{equation} % For simple, let us recall $\nabla F_{\beta}\colon (u,y) \mapsto \nabla h(u) + \nabla L(u)^*y + \frac{1}{\beta}\nabla L(u)^*(Lu-b)$. % \begin{theorem} Under the conditions of Theorem \ref{t:1} and $\gamma_k \geq \underline{\gamma} > 0$ . % Suppose that each $\overline{u}$ in \eqref{e:fr1} is a local minima of $h$, and $C$ and $(y_k)_{k\in\NN}$ are bounded. % Then $(h(u_k))_{k\in\NN}$ converges to local optimum $h(\overline{u})$. % \end{theorem} % \begin{proof} Since $C$ is bounded, $(u_k)_{k\in\NN}$ is bounded. Therefore, there exists a subsequence % $(n_k)_{k\in\NN}$ of $\NN$ such that $u_{n_k}\to u^*$ and $y_{n_k}\to y^*$. % It follows from Theorem \ref{t:1} that $\gamma_k\|(u_{k+1}-u_k)/\gamma_k\|^2 \to 0$. Since $(\gamma_k)_{k\in\NN}$ % is bounded below by $\underline{\gamma}$, we obtain $G_{\beta_k,\gamma_k}(u_k)\to 0$. %Hence, $u_{n_k+1}\to u^*$. % Now, using the updating of $u_{k}$, we have $G_{\beta_k,\gamma_k}(u_k) -\nabla F_{\beta_k}(u_k,y_k) \in N_{C}(u_{k+1})$. % Hence, % \begin{equation}\label{e:aa2s} % (\forall u\in C)(\forall k\in\NN)\; \scal{-G_{\beta_k,\gamma_k}(u_k)+\nabla F_{\beta_{k}}(u_k,y_k) }{u-u_{k+1}}\geq 0. % \end{equation} % Since $C$ is bounded and $\nabla L$ is continuous, we obtain $\sup_{u\in C}\|\nabla L(u)\| <+\infty$, and hence % \begin{equation} % \|\frac{1}{\beta_{n_k}}\nabla L(u_{n_k})^*(Lu_{n_k}-b)\| \leq \frac{1}{c}(\sup_{u\in C}\|\nabla L(u)\| )\|Lu_{n_k}-b\|\to 0. % \end{equation} % We also have % \begin{equation} % \nabla L(u_{n_k}) \to \nabla L(u^*) \; \text{and}\; \nabla h(u_{n_k}) \to \nabla h(u^*). 
% \end{equation} % Since $y_{n_k}\to y^*$, we get % \begin{alignat}{2} %& \|\nabla L(u_{n_k})^*y_{n_k}- \nabla L(u^*)y^* \| \quad\\ % &\leq \|\nabla L(u_{n_k})^*y_{n_k}- \nabla L(u_{n_k})y^* \| +\|\nabla L(u_{n_k})^*y^*- \nabla L(u^*)y^* \|\notag\\ % &\leq \|\nabla L(u_{n_k})\|\|y_{n_k}- y^* \|+\|y^*\| \|\nabla L(u_{n_k})- \nabla L(u^*) \|\notag\\ % &\to 0. % \end{alignat} % Consequently, $\nabla F_{\beta_{n_k}}(u_{n_k}, y_{n_k}) \to \nabla h(u^*) +\nabla L(u^*)y^*$. Note that % $$G_{\beta_{n_k},\gamma_{n_k}}(u_{n_k})\to 0.$$ Now, passing through subsequence in \eqref{e:aa2s}, we obtain % \begin{equation}\label{e:aa2s} % (\forall u\in C)\scal{\nabla h(u^*) +\nabla L(u^*)y^* }{u-u^*}\geq 0, % \end{equation} % which is \eqref{e:fr1} for $\overline{u} = u^*$ and $\overline{y} = y^*$. By assumption, $u^*$ is local minimum and % $h(u_{n_k}) \to h(u^*)$. Therefore, $F^\star = h(u^*)$. Using Corollary \ref{c:2}, we get $h(u_k) \to h(\overline{u})$. % \end{proof} \section{Related Work \label{sec:related work}} To the best of our knowledge, the proposed method is new and different from existing methods in the literature. As mentioned in Introduction, the connection to augmented Lagrange method is already mentioned. Our method is significantly different from the augmented Lagrange method, we perform only step of the projected gradient step on primal variable $u$ instead of minimizing the augmented Lagrange fucntion. Furthermore, we update the penalty parameter $\beta$ adaptively to make sure that the feasibility reduces faster than the gradient mapping. In the case when $h=0$, a modification of Chambolle-Pock's method is investigated in \cite{Valkonen14} and preconditioned ADMM \cite{Matin17} where the convergence of iterate is proved under strong assumptions not full-filling in our setting here. %\noindent{\bf Connection to Linearized Alternating Direction Method}.\\ ADMM is the classic method proposed for solving the problem \ref{prob:01} for the case where $L$ is a linear operator and $h$ is zero \cite{gabay1976dual}. This method is an application of the Douglas-Rachford method to the dual problem \cite{Gabay83}. One of the main drawback of the ADMM is the appearance of the term $Lu$ in the update rule of $u_{k+1}$. To overcome this issue, some strategies were suggested. The first strategies is proposed in \cite{shefi2014rate}, refined in \cite{banert2016fixing}, known as alternating direction proximal method of multipliers. The second strategies is to use linearized technique \cite{lin2011linearized}. We show here that our proposed method is closed related to updating rule as the linearized alternating direction method \cite{lin2011linearized}. Assume that $h\equiv 0$ and $L$ is a linear operator. Then the proposed method can be rewritten as \begin{equation} \begin{cases} u_{k+1}= \arg\min_{u\in C} \frac{1}{2\gamma_k} \| u-u_k + \gamma_kL^*\bigg( \lambda_k + \frac{1}{\beta_k}\big(Lu_k-b\big)\bigg)\|^2\notag\\ \beta_{k+1}= \frac{1}{2} \|Lu_{k+1}-b \|^2 \bigg( \frac{\gamma_{k}}{8} \|G_{\beta_{k},\gamma_{k}}(u_k,y_{k}) \|^2 + \frac{d_k}{(k+1)^\alpha}\bigg)^{-1}\\ \text{Chose $\sigma_{k+1}\geq 2\beta_k$ and}\; \lambda_{k+1} = \lambda_k +\frac{1}{\sigma_{k+1}}(Lu_{k+1}-b), \end{cases} \end{equation} which is a variant version of Linearized ADMM \cite{lin2011linearized}. %\noindent %{\bf Connection to ALBUM3 in \cite{bolte2018nonconvex}}\\ Very recently, \cite{bolte2018nonconvex} proposes a framework with for solving the problem \ref{prob:01} with $C=\RR^d$. 
%\noindent
%{\bf Connection to ALBUM3 in \cite{bolte2018nonconvex}}\\
Very recently, \cite{bolte2018nonconvex} proposed a framework for solving Problem \ref{prob:01} with $C=\RR^d$. In particular, a special case of that framework, ALBUM3 (proximal linearized alternating minimization), is closely related to our method, although its conditions are verifiable only when $L$ is linear. Moreover, in contrast to our adaptive update of $\beta_{k}$, the choice of $\beta_k$ in \cite{bolte2018nonconvex} depends on the smallest eigenvalue of $L^*L$. For nonlinear $L$, the application of their method remains a challenge.
%\noindent{\bf Connection to the deflected subgradient method}\\
The deflected subgradient method, investigated in \cite{burachik2010deflected}, can be used to solve a special case of Problem \ref{prob:01} over a compact subset $\mathcal{C}$ of $\mathcal{X}$. The basic step of the deflected subgradient method is to solve: given $\beta$ and $v$,
\begin{equation}
u^*\in \arg\min_{u\in C} h(u) + \beta \boldsymbol{\sigma}(Lu-b) - \scal{Kv}{Lu-b},
\end{equation}
where $\boldsymbol{\sigma}$ is a continuous penalty function, such as $\|\cdot\|$, and $K$ is a bounded linear operator. In general, there is no closed-form expression for $u^*$, since this subproblem does not split $h$ and $L$ individually. Hence, the deflected subgradient method is hard to implement. This is also a common drawback of the classic penalty method and its related works \cite{gasimov2002augmented,burachik2010primal}.
\section{Numerical experiments}
\subsection{Hanging chain}
%\begin{thebibliography}{}
%
% and use \bibitem to create references. Consult the Instructions
% for authors for reference list style.
\bibliographystyle{abbrv}
\bibliography{references_alp.bib,bang.bib,ctuy16-small-bib.bib,JS_References.bib,lions.bib,references_optimal_sampling,references_yp,tythc16-small-bib,yhli.bib,bibliograpply,ctuy16-small-bib,bang1.bib,bang.bib}
\appendix
\edita{
\section{Proof of Lemma \ref{lem:11} \label{sec:proof of smoothness}}
Note that \eqref{e:deslem} follows immediately from an application of \cite[Lemma 3.2, Remark 3.2(i)]{bolte2014proximal}. It only remains to compute the smoothness parameter of $\mathcal{L}_\beta(\cdot,y)$, namely, $\lambda_\beta$. To that end, note that
\begin{align}
\mathcal{L}_{\beta}(u,y) = h(u) + \sum_{i=1}^m y_i (A_i (u)-b_i) + \frac{1}{2\beta} \sum_{i=1}^m (A_i(u)-b_i)^2,
\end{align}
which implies that
\begin{align}
\nabla_u \mathcal{L}_\beta(u,y) & = \nabla h(u) + \sum_{i=1}^m y_i \nabla A_i(u) + \frac{1}{\beta} \sum_{i=1}^m (A_i(u)-b_i) \nabla A_i(u) \nonumber\\
& = \nabla h(u) + DA(u)^\top y + \frac{DA(u)^\top (A(u)-b)}{\beta},
\end{align}
where $DA(u)$ is the Jacobian of $A$ at $u$. Likewise,
\begin{align}
\nabla^2_u \mathcal{L}_\beta(u,y) & = \nabla^2 h(u) + \sum_{i=1}^m \left( y_i + \frac{A_i(u)-b_i}{\beta} \right) \nabla^2 A_i(u) + \frac{1}{\beta} \sum_{i=1}^m \nabla A_i(u) \nabla A_i(u)^\top.
\end{align}
It follows that
\begin{align}
\|\nabla_u^2 \mathcal{L}_\beta(u,y)\| & \le \| \nabla^2 h(u) \|+ \max_i \| \nabla^2 A_i(u)\| \left (\|y\|_1+\frac{\|A(u)-b\|_1}{\beta} \right) + \frac{1}{\beta} \sum_{i=1}^m \|\nabla A_i(u)\|^2 \nonumber\\
& \le \lambda_h+ \sqrt{m} \lambda_A \left (\|y\|+\frac{\|A(u)-b\|}{\beta} \right) + \frac{\|DA(u)\|^2_F}{\beta},
\qquad \left(h \in \mathbb{L}(\lambda_h),\,\, A_i \in \mathbb{L}(\lambda_A) \right)
\end{align}
and, consequently,
\begin{align}
\lambda_\beta & = \sup_u \|\nabla_u^2\mathcal{L}_\beta(u,y)\| \nonumber\\
& \le \lambda_h + \sqrt{m}\lambda_A \left(\|y\| + \frac{\|A(u)-b\|}{\beta} \right) + \frac{\|DA(u)\|_F^2}{\beta},
\end{align}
which completes the proof of Lemma \ref{lem:11}.
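For instance (an illustrative instantiation of the bound above, not needed elsewhere in the argument), suppose that $m=1$ and $A(u)=u^\top Q u$ for a symmetric matrix $Q$, so that $\nabla A(u) = 2Qu$, $\nabla^2 A(u) = 2Q$, and hence $\lambda_A = 2\|Q\|$ and $\|DA(u)\|_F = 2\|Qu\|$. The bound then reads
\begin{align}
\lambda_\beta \le \lambda_h + 2\|Q\| \left( |y| + \frac{|u^\top Q u - b|}{\beta} \right) + \frac{4\|Qu\|^2}{\beta}.
\end{align}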
} \edita{ \section{Proof of Lemma \ref{lem:eval Lipsc cte} \label{sec:proof of eval Lipsc cte}} Since $u,u^+_\gamma \in C$, it holds that \begin{align} u^+_\gamma - u \in - T_C(u^+_\gamma). \label{eq:both C feas} \end{align} Also, recalling $u^+_{\gamma}$ in Definition \ref{def:grad map}, we have that \begin{equation} u^+_{\gamma} - u +\gamma \nabla \mathcal{L}_\beta(u,y) \in -N_C(u^+_{\gamma}). \label{eq:optimality of uplus} \end{equation} Lastly, $\gamma$ by definition in \eqref{eq:defn of gamma line search} satisfies \begin{align} & \mathcal{L}_{\beta}(u^+_{\gamma},y) \nonumber\\ & \le \mathcal{L}_\beta(u,y) + \left\langle u^+_{\gamma} - u , \nabla \mathcal{L}_\beta (u,y) \right\rangle + \frac{1}{2\gamma}\|u^+_{\gamma} - u\|^2 \nonumber\\ & = \mathcal{L}_\beta(u,y) + \frac{1}{\gamma} \left\langle u^+_{\gamma} - u ,u^+_\gamma - u+ \gamma \nabla \mathcal{L}_\beta (u,y) \right\rangle - \frac{1}{2\gamma}\|u^+_{\gamma} - u\|^2 \nonumber\\ & \le \mathcal{L}_\beta(u,y) - \frac{1}{2\gamma}\|u^+_{\gamma} - u\|^2 \qquad \text{(see (\ref{eq:both C feas},\ref{eq:optimality of uplus}))} \nonumber\\ & = \mathcal{L}_\beta(u,y) - \frac{\gamma}{2} \|G_{\beta,\gamma}(u,y)\|^2, \qquad \text{(see Definition \ref{def:grad map})} \end{align} which completes the proof of Lemma \ref{lem:eval Lipsc cte}. } \section{Draft of nonadaptive convergence proof} Recall that, we set the penalty weight as \begin{align} \b_k = \b_{k_0} \sqrt{\frac{k \log^2(k+1)}{k_0 \log^2(k_0+1)}}, \qquad \forall k\ge K, \label{eq:beta_update_nonadpt} \end{align} for a threshold $\b_{k_0}>0$. For the record, note that \begin{align} \b_k - \b_{k-1} &= \frac{\b_{k_0}}{\sqrt{k_0}} (\sqrt{k}-\sqrt{k-1}) \nonumber\\ & = \frac{\b_{k_0}}{\sqrt{k_0}} \frac{1}{\sqrt{k}+\sqrt{k-1}} \nonumber\\ & \le \frac{\b_{k_0}}{\sqrt{k_0}} \cdot \frac{1}{\sqrt{k}+\sqrt{k-1}} \nonumber\\ & \le \frac{2\b_{k_0}}{\sqrt{kk_0}}, \end{align} where the last line holds if $k_0$ is sufficiently large. Recall also that, the dual step size is set to \begin{align} \s_k = \s_{k_0} \sqrt{\frac{k_0}{k}}. \qquad \forall k\in K. \label{eq:sigma update nonadapt} \end{align} Recall the bound on gradient mapping from \eqref{eq:long chain broken}, namely, \begin{align} & \sum_{k=k_0}^{k_1} \frac{\gamma_k \|G_k\|^2}{2} \nonumber\\ & \le \mu + \sum_{k=k_0}^{k_1} \left(\sigma_k+ \frac{\beta_k-\beta_{k-1}}{2} \right) \|A(u_k)-b\|^2 + \sum_{k=k_0}^{k_1} \sigma_k \|A(u_{{k_1}+1})-b\| \|A(u_k)-b\| - \frac{\beta_{k_1}}{2} \| A(u_{k_1+1})-b\|^2. \label{eq:conseq of beta update} \end{align} Recall also the inequality $2ab\le a^2 + b^2$ for scalars $a,b$. 
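As a quick illustration of these parameter choices (a sketch only; the values of $\beta_{k_0}$, $\sigma_{k_0}$, and $k_0$ below are hypothetical and play no role in the analysis), the schedules in (\ref{eq:beta_update_nonadpt},\ref{eq:sigma update nonadapt}) can be tabulated as follows:
\begin{verbatim}
import numpy as np

# Sketch of the nonadaptive schedules; beta0, sigma0 and k0 are hypothetical values.
beta0, sigma0, k0 = 1.0, 1.0, 10

def beta(k):    # penalty weight, growing roughly like sqrt(k) * log(k + 1)
    return beta0 * np.sqrt(k * np.log(k + 1) ** 2 / (k0 * np.log(k0 + 1) ** 2))

def sigma(k):   # dual step size, decaying like 1 / sqrt(k)
    return sigma0 * np.sqrt(k0 / k)

for k in [10, 100, 1000, 10000]:
    print(k, beta(k), sigma(k), beta(k) - beta(k - 1))
\end{verbatim}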
After substituting the above choices for $\{\b_k,\s_k\}_k$ above and using this inequality, we find that \begin{align} & \sum_{k=k_0}^{k_1} \frac{\gamma_k \|G_k\|^2}{2} \nonumber\\ & = \mu + \sum_{k=k_0}^{k_1} \left(\sigma_k+ \frac{\beta_k-\beta_{k-1}}{2} \right) \|A(u_k)-b\|^2 + \frac{1}{2} \sum_{k=k_0}^{k_1} (\s_k^2 \|A(u_{{k_1}+1})-b\|^2 +\|A(u_k)-b\|^2) - \frac{\beta_{k_1}}{2} \| A(u_{k_1+1})-b\|^2 \nonumber\\ & = \mu + \sum_{k=k_0}^{k_1} \frac{1}{2} \left( 1+2\sigma_k + \beta_k-\beta_{k-1} \right) \|A(u_k)-b\|^2 + \frac{1}{2}\left( \sum_{k=k_0}^{k_1} \sigma_k^2 - \beta_{k_1} \right) \| A(u_{k_1+1})-b\|^2 \nonumber\\ & \le \mu+ \sum_{k=k_0}^{k_1} \frac{1 }{2} \left( 1+ 2\s_{k_0}\sqrt{\frac{k_0}{k}} +\frac{2\b_{k_0}}{\sqrt{k_0 k}}\right) \|A(u_k)-b\|^2 + \frac{1}{2}\left( \sum_{k=k_0}^{k_1} \frac{\s_{k_0}^2 k_0}{k} - \b_{k_0}\sqrt{\frac{k_1}{k_0}} \right) \|A(u_{k_1+1})-b\|^2 \qquad \text{(see (\ref{eq:sigma update nonadapt},\ref{eq:conseq of beta update}))} \nonumber\\ & =: \mu+ \sum_{k=k_0}^{k_1} \frac{\delta_{k_0}}{\sqrt{k}} \|A(u_k) - b\|^2 + \frac{1}{2}\left( \sum_{k=k_0}^{k_1} \frac{\s^2_{k_0}k_0}{k} - \b_{k_0}\sqrt{\frac{k}{k_0}} \right) \|A(u_{k_1+1})-b\|^2. \label{eq:long chain 10} \end{align} We now show that the last term above is not positive if $\s_{k_0}\ll \b_{k_0}$. Indeed, \begin{align} & \sum_{k=k_0}^{k_1} \frac{\s_{k_0}^2 k_0}{k} \nonumber\\ & \le 2 \s_{k_0} \sqrt{k_0 k_1} \qquad \left( \sum_{k=1}^{k_1} \frac{1}{\sqrt{k}} \le \int_0^{k_1} \frac{dx}{\sqrt{x}}= 2\sqrt{k_1} \right) \nonumber\\ & \le \b_{k_0} \sqrt{\frac{k_1}{k_0}}, \end{align} provided that \begin{align} \s_{k_0} \le \frac{\b_{k_0}}{2k_0} . \end{align} The above inequality simplifies \eqref{eq:long chain 10} to read \begin{align} \sum_{k=k_0}^{k_1} \frac{\gamma_k \|G_k\|^2}{2} & \le \mu+ \sum_{k=k_0}^{k_1} \frac{\delta_{k_0}}{\sqrt{k}} \|A(u_k) - b\|^2 . \label{eq:grad to feas} \end{align} We now find a converse to \eqref{eq:grad to feas}, thus bounding the feasibility gap with the gradient mapping. 
By invoking Lemma \ref{lem:bnd bnd Ak}, we can write that \begin{align} \sum_{k=k_0}^{k_1} \frac{\gamma_k \|G_k\|^2}{2} & \le \mu+ \sum_{k=k_0}^{k_1} \frac{\delta_{k_0}}{\sqrt{k}} \|A(u_k) - b\|^2 \qquad \text{(see \eqref{eq:grad to feas})} \nonumber\\ & \le \mu + \sum_{k=k_0}^{k_1} \frac{4 \delta_{k_0}}{ \eta_{\min}^2 \b_k^2 \sqrt{k}} \left( \lambda'_h+ \eta_{\max} \|y_{k-1}\| + \|G_k\| \right)^2 \qquad \text{(see \eqref{eq:bnd on Ak final})} \nonumber\\ & \le \mu+ \sum_{k=k_0}^{k_1} \frac{12\delta_{k_0}}{\eta_{\min}^2 \b_k^2 \sqrt{k}} ( \lambda'^2_h+ \eta_{\max}^2 \|y_{k-1}\|^2 + \|G_k\|^2 ) \qquad \left((a+b+c)^2 \le 3a^2+3b^2+3c^2 \right)\nonumber\\ & = \mu+ \sum_{k=k_0}^{k_1} \frac{12 \delta_{k_0} k_0 }{\eta_{\min}^2 k^{\frac{3}{2}}} ( \lambda'^2_h+ \eta_{\max}^2 \|y_{k-1}\|^2 + \|G_k\|^2 ) \qquad \text{(see \eqref{eq:beta_update_nonadpt})} \nonumber\\ & =: \mu + \frac{12 \delta_{k_0} k_0 }{\eta_{\min}^2 } \sum_{k=k_0}^{k_1} k^{-\frac{3}{2}} + \frac{12 \delta_{k_0} k_0 }{\eta_{\min}^2 } B_K + \sum_{k=k_0}^{k_1} \frac{12 \delta_{k_0} k_0 \|G_k\|^2 }{\eta_{\min}^2 k^{\frac{3}{2}}} \nonumber\\ & \le \mu + \frac{96 \delta_{k_0}\sqrt{k_0}}{\eta_{\min}^2} + \frac{12 \delta_{k_0} k_0 }{\eta_{\min}^2 } B_K + \sum_{k=k_0}^{k_1} \frac{12 \delta_{k_0} k_0 \|G_k\|^2 }{\eta_{\min}^2 k^{\frac{3}{2}}}, \qquad \left( \sum_{k=k_0}^{k_1} k^{-\frac{3}{2}} \le 4 \int_{k_0}^{k_1} x^{-\frac{3}{2}} \le \frac{8}{\sqrt{ k_0}} \right) \label{eq:converse 1} \end{align} where we set \begin{align} B_K := \sum_{k=k_0}^{k_1} \frac{\|y_{k-1}\|^2}{k^{\frac{3}{2}}}. \end{align} To simplify the above bound, let us assume that \begin{align} \frac{12 \delta_{k_0} k_0 }{\eta_{\min}^2 k^{\frac{3}{2}}} \le \frac{\gamma_k}{4}, \qquad \forall k\in K, \label{eq:assump on gk} \end{align} which allows us to simplify \eqref{eq:assump on gk} as \begin{align} \sum_{k=k_0}^{k_1} \frac{\gamma_k \|G_k\|^2}{4} & \le \sum_{k=k_0}^{k_1} \left( \frac{\gamma_k}{2} - \frac{12\delta_{k_0}k_0}{\eta_{\min}^2 k^{\frac{3}{2}}} \right) \|G_k\|^2 \qquad \text{(see \eqref{eq:assump on gk})} \nonumber\\ & \le \mu+ \frac{96\delta_{k_0}\sqrt{k_0}}{\eta_{\min}^2} + \frac{12\delta_{k_0}k_0 B_K}{\eta_{\min}^2 }.\qquad \text{(see \eqref{eq:converse 1})} \end{align} In turn, the above bound on the gradient mapping controls the feasibility as \begin{align} u \end{align} \section{Draft of adaptive convergence proof} For convenience, let us recall the updates of the algorithm in iteration $k$, namely, \begin{align} u_{k+1} & = P_C( u_k - \gamma_k \nabla \mathcal{L}_{\beta_k} (u_k,y_k) ) \nonumber\\ & = P_C\left( u_k - \gamma_k \nabla h(u_k) - \gamma_k DA(u_k) ^\top \left( y_k + \beta_k(A(u_k)-b) \right) \right), \qquad \text{(see \eqref{eq:Lagrangian})} \label{eq:update uk recall} \end{align} \begin{align} y_{k+1} =y_k + \sigma_{k+1}(A(u_{k+1})-b), \label{eq:y update recall} \end{align} and recall also that \begin{equation} G_k = G_{\beta_k,\gamma_k}(u_k,y_k) = \frac{u_k-u_{k+1}}{\gamma_k}. \qquad \text{(see \eqref{eq:grad map})} \label{eq:grad map recall} \end{equation} For integers $k_0\le k_1$, consider the interval \begin{equation} K = [k_0:k_1]=\{k_0,\cdots,k_1\}. 
\end{equation} Since $\gamma_k$ is determined by the line search subroutine in Lemma \ref{lem:eval Lipsc cte}, we may apply Lemma \ref{lem:11} for every iteration in this interval to find that \begin{align} \frac{\gamma_k \|G_k\|^2}{2} & \le \mathcal{L}_{\beta_k}(u_k,y_k) - \mathcal{L}_{\beta_k}(u_{k+1},y_k) \qquad \text{(see Lemma \ref{lem:11})} \nonumber\\ & = h(u_k) - h(u_{k+1})+ \langle A(u_k)-A(u_{k+1}) , y_k \rangle+ \frac{\beta_k}{2}\left(\|A(u_k)-b\|^2 - \|A(u_{k+1})-b\|^2\right), \qquad \text{(see \eqref{eq:Lagrangian})} \label{eq:smoothness rep} \end{align} for every $k\in K$. On the other hand, \begin{align} y_k = y_{k_0} + \sum_{i=k_0+1}^k \sigma_i \left( A(u_i)-b \right), \qquad \text{(see \eqref{eq:y update recall})} \label{eq:y update unfolded} \end{align} which, after substituting in \eqref{eq:smoothness rep}, yields that \begin{align} \frac{\gamma_k \|G_k\|^2}{2} & \le h(u_k) - h(u_{k+1}) + \left\langle A(u_k) - A(u_{k+1}) , y_{k_0} + \sum_{i=k_0+1}^k \sigma_i ( A(u_i)-b) \right\rangle +\frac{\beta_k}{2} \left(\|A(u_k)-b\|^2-\|A(u_{k+1})-b\|^2\right). \label{eq:key ineq} \end{align} %\editaa{Additionally, let us assume that, for $\beta>0$, %\begin{align} %\frac{1}{\beta_k}- \frac{1}{\beta_{k-1}} \le \frac{1 }{\beta \sqrt{k}}, %\qquad %\frac{1}{\sigma_k} \le \frac{1}{\beta \sqrt{k}}, %\qquad \forall k\in K, %\label{eq:consequences} %\end{align} %for sufficiently large $k_0$. %For example, following choices satisfy the requirements: % %\begin{align} %\beta_k = \frac{\beta}{{k}^\alpha} \text{ where } \alpha \leq 1/2, %\qquad %%\sigma_k \geq \beta \sqrt{k}, %\sigma_k \geq \max(\beta \sqrt{k}, \beta k \| Au_k - b \|), %\qquad \forall k \in K, %\label{eq:beta n sigma} %\end{align} %} %\editaa{Ahmet: Below, I changed the indices for double sum in the first line to be consistent (see \eqref{eq:y update unfolded}), and also kept the negative term from the second equality onwards.} %\notea{Consider the case $k_0=0$. 
%Please use a similar command to add notes as opposed to highlight edits.}
By summing up the key inequality in \eqref{eq:key ineq} over $k$, we find that
\begin{align}
& \sum_{k=k_0}^{k_1} \frac{\gamma_k \|G_k\|^2}{2} \nonumber\\
& \le h(u_{k_0}) - h(u_{k_1+1}) + \langle A(u_{k_0}) - A(u_{k_1+1}) , y_{k_0-1}\rangle + \sum_{k=k_0}^{k_1} \sum_{i=k_0+1}^k \sigma_i \left\langle A(u_k) - A(u_{k+1}) , A(u_i)-b \right\rangle \nonumber\\
& \qquad + \sum_{k=k_0}^{k_1} \frac{\beta_k}{2} \|A(u_k)-b\|^2 - \sum_{k=k_0}^{k_1} \frac{\beta_k}{2} \|A(u_{k+1})-b\|^2 \qquad \text{(see \eqref{eq:key ineq})} \nonumber\\
& = h(u_{k_0}) - h(u_{{k_1}+1}) + \langle A(u_{k_0}) - A(u_{{k_1}+1}) , y_{k_0-1} \rangle + \sum_{k=k_0}^{k_1} \sum_{i=k_0+1}^k \sigma_i \left\langle A(u_k) - A(u_{k+1}) , A(u_i)-b \right\rangle\nonumber\\
& \qquad + \sum_{k=k_0}^{k_1} \frac{\beta_k}{2}\|A(u_k)-b\|^2- \sum_{k=k_0+1}^{{k_1}+1} \frac{\beta_{k-1}}{2} \|A(u_{k})-b\|^2 \nonumber\\
& = h(u_{k_0}) - h(u_{{k_1}+1}) + \langle A(u_{k_0}) - A(u_{{k_1}+1}) , y_{k_0-1} \rangle + \frac{\beta_{k_0}}{2}\|A(u_{k_0})-b\|^2 + \sum_{i=k_0+1}^{k_1} \sum_{k=i}^{k_1} \sigma_i \left\langle A(u_k) - A(u_{k+1}) , A(u_i)-b \right\rangle \nonumber\\
& \qquad + \sum_{k=k_0+1}^{k_1} \frac{\beta_k - \beta_{k-1}}{2} \|A(u_k)-b\|^2 - \frac{\beta_{k_1}}{2} \| A(u_{k_1+1})-b\|^2 \nonumber\\
& \le \mu + \sum_{i=k_0+1}^{k_1} \sigma_i \left\langle A(u_i) - A(u_{{k_1}+1}), A(u_i)-b\right\rangle + \sum_{k=k_0+1}^{k_1} \frac{\beta_k - \beta_{k-1}}{2} \|A(u_k)-b\|^2 - \frac{\beta_{k_1}}{2} \| A(u_{k_1+1})-b\|^2 \qquad \text{(see \eqref{eq:defn mu})} \nonumber\\
& = \mu + \sum_{k=k_0+1}^{k_1} \left( \sigma_k +\frac{\beta_k - \beta_{k-1}}{2} \right) \|A(u_k)-b\|^2 - \sum_{k=k_0+1}^{k_1} \sigma_k \left \langle A(u_{{k_1}+1})-b, A(u_k)-b\right\rangle - \frac{\beta_{k_1}}{2} \| A(u_{k_1+1})-b\|^2 \nonumber\\
& \le \mu + \sum_{k=k_0}^{k_1} \left(\sigma_k+ \frac{\beta_k-\beta_{k-1}}{2} \right) \|A(u_k)-b\|^2 + \sum_{k=k_0}^{k_1} \sigma_k \|A(u_{{k_1}+1})-b\| \|A(u_k)-b\| - \frac{\beta_{k_1}}{2} \| A(u_{k_1+1})-b\|^2,
\label{eq:long chain broken}
\end{align}
where we assumed that
\begin{equation}
\mu := \sup_k\, h(u_{k_0}) - h(u_k) + \langle A(u_{k_0})-A(u_k) ,y_{k_0-1}\rangle + \frac{\beta_{k_0}}{2} \|A(u_{k_0})-b\|^2 < \infty.
\label{eq:defn mu}
\end{equation}
Recall that, for every $k\in K$, we set the penalty weight as
\begin{align}
\beta_k =
\begin{cases}
\beta_{k-1} & \|A(u_k)-b\| \le \frac{\beta_{k_0}}{\sqrt{k\log^{\editf{2}} (k+1)}},\\
\\
\beta_{k-1} \sqrt{\frac{k\log^{\editf{2}}(k+1)}{(k-1)\log^{\editf{2}} k}} & \|A(u_k)-b\| > \frac{\beta_{k_0}}{\sqrt{k\log^{\editf{2}}(k+1)}},
\end{cases}
\label{eq:beta rule}
\end{align}
for a threshold $\beta_{k_0}>0$.
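For concreteness, the following is a small sketch of this penalty update (illustrative only; the threshold $\beta_{k_0}$, the initial index $k_0$, and the residuals fed to the rule below are hypothetical stand-ins for $\|A(u_k)-b\|$):
\begin{verbatim}
import numpy as np

# Sketch of the adaptive penalty rule; beta0 (the threshold) and k0 are hypothetical.
beta0, k0 = 1.0, 10
beta = beta0

def update_beta(beta, k, feas):    # feas stands for ||A(u_k) - b||
    threshold = beta0 / np.sqrt(k * np.log(k + 1) ** 2)
    if feas <= threshold:
        return beta                # first regime: keep the penalty weight
    # second regime: grow beta by the factor sqrt(k log^2(k+1) / ((k-1) log^2 k))
    return beta * np.sqrt(k * np.log(k + 1) ** 2 / ((k - 1) * np.log(k) ** 2))

rng = np.random.default_rng(0)
for k in range(k0 + 1, k0 + 50):
    feas = rng.uniform(0.0, 0.5)   # stand-in for the feasibility gap at iteration k
    beta = update_beta(beta, k, feas)
\end{verbatim}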
For the record, the above penalty weights satisfy
\begin{align*}
\beta_{k_0}\le \beta_k \le \beta_{k_0} \sqrt{\frac{k \log^{\editf{2}}(k+1)}{k_0\log^{\editf{2}}(k_0+1)}},
\qquad \forall k\in K,
\end{align*}
\begin{align}
\beta_k- \beta_{k-1} & \le \beta_{k-1} \left( \sqrt{\frac{k\log^{\editf{2}}(k+1)}{(k-1)\log^{\editf{2}}k}} -1 \right) \nonumber\\
& \le \beta_{k-1} \cdot \frac{k\log^{\editf{2}}(k+1) - (k-1) \log^{\editf{2}} k}{(k-1)\log^{\editf{2}} k} \nonumber\\
& \le \frac{\editf{5}\beta_{k-1}}{k-1} \nonumber\\
& \le \frac{\editf{5}\beta_{k_0}}{k-1} \sqrt{\frac{(k-1)\log^{\editf{2}}k}{k_0 \log^{\editf{2}}(k_0+1)}} \nonumber\\
& = \editf{5}\beta_{k_0} \sqrt{\frac{\log^{\editf{2}}k}{(k-1)k_0 \log^{\editf{2}}(k_0+1)}},
\qquad \forall k \in K,
\label{eq:consequences}
\end{align}
when $k_0$ is sufficiently large. For every $k\in K$, recall that we set the dual step sizes as
\begin{align}
\sigma_k =
\begin{cases}
\sigma_{k-1} & \|A(u_k)-b\| \le \frac{\sigma_{k_0}}{k\log^{\editf{2}}(k+1)},\\
\\
\sigma_{k-1} \sqrt{\frac{(k-1) \log^{\editf{2}}k}{k \log^{\editf{2}}(k+1)}} & \frac{\sigma_{k_0}}{k\log^{\editf{2}}(k+1)} \le \|A(u_k)-b\| \le \frac{\sigma_{k_0}}{\sqrt{k\log^{\editf{2}}(k+1)}}, \\
\\
\editf{\frac{\sigma_{\tilde{k}}}{\beta_{\tilde{k}}-\beta_{\tilde{k}-1}}} (\beta_k - \beta_{k-1} )& \|A(u_k)-b\| > \frac{\sigma_{k_0}}{\sqrt{k\log^{\editf{2}}(k+1)}},
\end{cases}
\label{eq:sigma rule}
\end{align}
for a threshold $\sigma_{k_0}\ge \beta_{k_0}$. \editf{Here, $\tilde{k}$ denotes the iteration at which a transition occurs from the first or second update regime into the third one. The constant in the third regime is chosen for the sake of continuity.}
% \frac{1}{\|A(u_k)-b\| k\log (k+1)}
For the record, a consequence of the above updates is that
\begin{align}
\sigma_k \le \sigma_{k_0},
\qquad \forall k\in K,
\qquad \text{(see (\ref{eq:consequences},\ref{eq:sigma rule}))}
\label{eq:consequences 2}
\end{align}
for sufficiently large $k_0$. Let us partition $K$ into the disjoint subsets
\begin{align}
K = K_{\beta,l} \cup K_{\beta,u}.
\label{eq:partition beta}
\end{align}
More specifically, for every $k\in K_{\beta,l}$, the first update in \eqref{eq:beta rule} is in force whereas, for every $k\in K_{\beta,u}$, the second update in \eqref{eq:beta rule} is in force. Likewise, let us partition $K$ into the disjoint subsets
\begin{align}
K= K_{\sigma,l} \cup K_{\sigma,m} \cup K_{\sigma,u}.
\label{eq:partition sigma}
\end{align}
For every $k\in K_{\sigma,l}$, the first update in \eqref{eq:sigma rule} is in force whereas, for every $k\in K_{\sigma,m}$, the second update in \eqref{eq:sigma rule} is in force and, for every $k\in K_{\sigma,u}$, the third update there is active. By their definitions in (\ref{eq:beta rule},\ref{eq:sigma rule}) and the assumption that $\s_{k_0}\ge \b_{k_0}$, we observe that $K_{\beta,l}\cap K_{\s,u}=\emptyset$; in particular,
\begin{equation}
\sigma_k >0, \qquad k\in K.
\end{equation}
The partitioning in \eqref{eq:partition sigma} allows us to break the sum in the last line of \eqref{eq:long chain broken} into three sums.
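As a small illustration of this regime bookkeeping (a sketch only; $\sigma_{k_0}$, $\beta_{k_0}$, $k_0$, and the residual and penalty-increment sequences below are hypothetical stand-ins), the dual step-size rule \eqref{eq:sigma rule} and the resulting partition of $K$ can be traced as follows:
\begin{verbatim}
import numpy as np

# Sketch of the dual step-size rule and the partition K = K_{s,l} U K_{s,m} U K_{s,u}.
# sigma0 >= beta0 > 0 and k0 are hypothetical; beta_gap stands for beta_k - beta_{k-1}.
sigma0, beta0, k0 = 2.0, 1.0, 10
sigma, scale = sigma0, None            # 'scale' freezes sigma/(beta gap) at a transition
regimes = {"l": [], "m": [], "u": []}

def sigma_update(sigma, scale, k, feas, beta_gap):
    low = sigma0 / (k * np.log(k + 1) ** 2)
    high = sigma0 / np.sqrt(k * np.log(k + 1) ** 2)
    if feas <= low:                    # first regime: keep sigma
        return sigma, None, "l"
    if feas <= high:                   # second regime: shrink sigma slowly
        factor = np.sqrt((k - 1) * np.log(k) ** 2 / (k * np.log(k + 1) ** 2))
        return sigma * factor, None, "m"
    if scale is None:                  # entering the third regime: fix the constant
        scale = sigma / max(beta_gap, 1e-12)
    return scale * beta_gap, scale, "u"

rng = np.random.default_rng(0)
for k in range(k0 + 1, k0 + 60):
    feas = rng.uniform(0.0, 0.3)       # stand-in for ||A(u_k) - b||
    beta_gap = 0.1 / k                 # stand-in for beta_k - beta_{k-1}
    sigma, scale, regime = sigma_update(sigma, scale, k, feas, beta_gap)
    regimes[regime].append(k)
\end{verbatim}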
To evaluate the sum over $K_{\sigma,l}$, we write that \begin{align} & \sum_{k\in K_{\sigma,l}} \left(\sigma_k+ \frac{\beta_k-\beta_{k-1}}{2} \right) \|A(u_k)-b\|^2 + \|A(u_{{k_1}+1})-b\| \sum_{k\in K_{\sigma,l}} \sigma_k \|A(u_k)-b\| \nonumber\\ & \le \sum_{k\in K_{\sigma,l}} \left( \sigma_{k_0}+ \editf{5}\beta_{k_0} \sqrt{\frac{\log \editf{^2} k}{(k-1) k_0 \log\editf{^2}(k_0+1)}} \right) \|A(u_k)-b\|^2 + \| A(u_{k_1+1})-b\| \sum_{k\in K_{\sigma,l}} \sigma_{k_0} \|A(u_k)-b\| \qquad \text{(see (\ref{eq:consequences},\ref{eq:consequences 2}))} \nonumber\\ & \le 2 \s_{k_0}\sum_{k\in K_{\sigma,l}} \|A(u_k)-b\|^2 + \| A(u_{k_1+1})-b\| \sum_{k\in K_{\sigma,l}} \sigma_{k_0} \|A(u_k)-b\| \qquad \left( k_0\gg 1 \right) \nonumber\\ & \le 2\s_{k_0}\sum_{k\in K_{\sigma,l}} \|A(u_k)-b\|^2 + \| A(u_{k_1+1})-b\| \sum_{k\in K_{\sigma,l}} \frac{\sigma_{k_0}^2}{k\log \editf{^2}(k+1)} \qquad \text{(see \eqref{eq:sigma rule})} \nonumber\\ & = 2\s_{k_0} \sum_{k\in K_{\sigma,l}} \|A(u_k)-b\|^2 + c\s^2_{k_0} \| A(u_{k_1+1})-b\| , \label{eq:long chain broke 2} \end{align} where \begin{equation} c\ge \sum_{k=1}^{\infty} \frac{1}{k\log \editf{^2}(k+1)}. \label{eq:defn of c} \end{equation} To evaluate the sum in the last line of \eqref{eq:long chain broken} over $K_{\s,m}$, we write that \begin{align} & \sum_{k\in K_{\sigma,m}} \left(\sigma_k+ \frac{\beta_k-\beta_{k-1}}{2} \right) \|A(u_k)-b\|^2 + \|A(u_{{k_1}+1})-b\| \sum_{k\in K_{\sigma,m}} \sigma_k \|A(u_k)-b\| \nonumber\\ & \le \sum_{k\in K_{\sigma,m}} \left( \sigma_{k_0}+ \editf{5}\beta_{k_0} \sqrt{\frac{\log \editf{^2} k}{(k-1) k_0 \log\editf{^2}(k_0+1)}} \right) \|A(u_k)-b\|^2 + \| A(u_{k_1+1})-b\| \sum_{k\in K_{\sigma,m}} \sigma_{k} \|A(u_k)-b\| \qquad \text{(see (\ref{eq:consequences},\ref{eq:consequences 2}))} \nonumber\\ & \le 2\s_{k_0} \sum_{k\in K_{\sigma,m}} \|A(u_k)-b\|^2 + \| A(u_{k_1+1})-b\| \sum_{k\in K_{\sigma,m}} \sigma_{k} \|A(u_k)-b\| \qquad \left( k_0\gg 1 \right) \nonumber\\ & \le 2\s_{k_0} \sum_{k\in K_{\sigma,m}} \|A(u_k)-b\|^2 + \| A(u_{k_1+1})-b\| \sum_{k\in K_{\sigma,m}} \frac{\sigma_{k_0}\sigma_{k} }{\sqrt{k\log \editf{^2}(k+1)}}. \qquad \text{(see \eqref{eq:sigma rule})} \label{eq:long chain broke 3} \end{align} To control the last sum above, let us further write that $K_{\s,m}=\cup_{j=1}^{J_\s} K_{\s,m,j}$, where each $K_{\s,m,j}$ is an interval, with the starting point $k_{\s,m,j}$. With this decomposition, we write that \begin{align} \sum_{k\in K_{\sigma,m}} \frac{\sigma_{k_0}\sigma_k}{\sqrt{k\log\editf{^2}(k+1)}} & = \sum_{j=1}^{J_\s} \sum_{k\in K_{\sigma,m,j}} \frac{\sigma_{k_0}\sigma_k}{\sqrt{k\log\editf{^2}(k+1)}} \nonumber\\ & = \sum_{j=1}^{J_\s} \sigma_{k_0}\sigma_{k_{\s,m,j}} \sqrt{k_{\s,m,j} \log \editf{^2}(k_{\s,m,j}+1)} \sum_{k\in K_{\sigma,m,j}} \frac{1}{k \log \editf{^2}(k+1)} \qquad \text{(see \eqref{eq:sigma rule})} \nonumber\\ & \le c \s_{k_0} \s'_{k_0}, \end{align} where we set \begin{align} \s'_{k_0} :=\sum_{j=1}^{J_\s}\sigma_{k_{\s,m,j}} \sqrt{k_{\s,m,j} \log \editf{^2}(k_{\s,m,j}+1)}. \label{eq:defn of sigmap} \end{align} \begin{remark} Note that our convergence bounds in particular will depend on $J_\s$, the number of transitions into the middle regime in \eqref{eq:sigma rule}. 
\end{remark} Substituting the above bound back into \eqref{eq:long chain broke 3}, we reach \begin{align} & \sum_{k\in K_{\sigma,m}} \left(\sigma_k+ \frac{\beta_k-\beta_{k-1}}{2} \right) \|A(u_k)-b\|^2 + \|A(u_{{k_1}+1})-b\| \sum_{k\in K_{\sigma,m}} \sigma_k \|A(u_k)-b\| \nonumber\\ & \le 2\s_{k_0} \sum_{k\in K_{\sigma,m}} \|A(u_k)-b\|^2 + c\s_{k_0}\sigma'_{k_0} \| A(u_{k_1+1})-b\|. \end{align} \editf{Suppose that $K_{\s,u} = \cup_{j=1}^{J_{\s, u}} K_{\s,u,j}$, where each $K_{\s,u,j}$ is an interval, with the starting point $\tilde{k}_j = k_{\s,u,j}\in K$. }Finally, to evaluate the sum in the last line of \eqref{eq:long chain broken} over $K_{\s,u}$, we write that \begin{align} & \sum_{k\in K_{\sigma,u}} \left( \sigma_k + \frac{\beta_k-\beta_{k-1}}{2} \right) \|A(u_k)-b\|^2 + \sum_{k\in K_{\sigma,u}} \sigma_k \|A(u_{k_1+1})-b\| \|A(u_k)-b\| - \frac{\beta_{k_1}}{2} \|A(u_{k_1+1})-b\|^2 \nonumber\\ & \le \editf{\sum_{j=1}^{J_{\s, u}} \sum_{k\in K_{\sigma,u,j}}}%\sum_{k\in K_{\sigma,u}} \left( \sigma_k + \frac{\beta_k-\beta_{k-1}}{2} + \editf{\frac{\sigma_{\tilde{k}_j }}{2(\beta_{\tilde{k}_j}-\b_{\tilde{k}_j -1})}}\right) \| A(u_k)-b\|^2 \nonumber\\ & \qquad + \frac{1}{2} \left(\editf{ \sum_{j=1}^{J_{\s, u}} \sum_{k\in K_{\sigma,u,j}}% \sum_{k\in K_{\sigma,u}} \frac{ (\beta_{\tilde{k}_j}-\beta_{\tilde{k}_j-1}) \sigma_k }{\sigma_{\tilde{k}_j}} - \beta_{k_1} }\right) \| A(u_{k_1+1})-b\|^2 \qquad ( 2ab \le \theta a^2 + b^2/\theta ) \nonumber\\ & \le \editf{\sum_{j=1}^{J_{\s, u}} \sum_{k\in K_{\sigma,u,j}}% \sum_{k\in K_{\sigma,u}} \left( \left( \frac{\sigma_{\tilde{k}_j }}{(\beta_{\tilde{k}_j}-\b_{\tilde{k}_j -1})} +\frac{1}{2} \right) (\beta_k - \beta_{k-1}) + \frac{\sigma_{\tilde{k}_j }}{2(\beta_{\tilde{k}_j}-\b_{\tilde{k}_j -1})} \right)} \|A(u_k)-b\|^2 \nonumber\\ & \qquad + \frac{1}{2}\left( \sum_{k\in K_{\sigma,u}} (\beta_k-\beta_{k-1}) - \beta_{k_1} \right) \|A(u_{k_1+1})-b\|^2 \qquad \text{(see \eqref{eq:sigma rule})} \nonumber\\ & \le \editf{\sum_{j=1}^{J_{\s, u}} \sum_{k\in K_{\sigma,u,j}}% \sum_{k\in K_{\sigma,u}} \left( \left( \frac{\sigma_{\tilde{k}_j }}{(\beta_{\tilde{k}_j}-\b_{\tilde{k}_j -1})} +\frac{1}{2} \right) (\beta_k - \beta_{k-1}) + \frac{\sigma_{\tilde{k}_j }}{2(\beta_{\tilde{k}_j}-\b_{\tilde{k}_j -1})} \right)} \|A(u_k)-b\|^2 \nonumber\\ & \qquad + \frac{1}{2}\left( \sum_{k=k_0}^{k_1} (\beta_k-\beta_{k-1}) - \beta_{k_1} \right) \|A(u_{k_1+1})-b\|^2 \qquad \text{(see \eqref{eq:beta rule})} \nonumber\\ & \le \editf{\sum_{j=1}^{J_{\s, u}} \sum_{k\in K_{\sigma,u,j}}% \sum_{k\in K_{\sigma,u}} \left( \left( \frac{\sigma_{\tilde{k}_j }}{(\beta_{\tilde{k}_j}-\b_{\tilde{k}_j -1})} +\frac{1}{2} \right) (\beta_k - \beta_{k-1}) + \frac{\sigma_{\tilde{k}_j }}{2(\beta_{\tilde{k}_j}-\b_{\tilde{k}_j -1})} \right)} \|A(u_k)-b\|^2 \qquad \left( \beta_{k_0}\ge 0 \right) \nonumber\\ & \le \editf{\sum_{j=1}^{J_{\s, u}} \sum_{k\in K_{\sigma,u,j}}% \sum_{k\in K_{\sigma,u}} \left( \left( \frac{\sigma_{\tilde{k}_j }}{(\beta_{\tilde{k}_j}-\b_{\tilde{k}_j -1})} +\frac{1}{2} \right) \left(4 \beta_{k_0}\sqrt{\frac{\log^2(k+1)}{k k_0 \log^2(k_0+1)}}\right) + \frac{\sigma_{\tilde{k}_j }}{2(\beta_{\tilde{k}_j}-\b_{\tilde{k}_j -1})} \right)} %%% % \sum_{k\in K_{\sigma,u}} % \left( % \left( \frac{\sigma_{k_0}}{\beta_{k_0}-\b_{k_0-1}} +\frac{1}{2} \right)\cdot 4 \beta_{k_0}\sqrt{\frac{\log(k+1)}{kk_0 \log(k_0+1)}} % + \frac{\sigma_{k_0}}{2(\beta_{k_0}-\b_{k_0-1})} % \right) \|A(u_k)-b\|^2 \qquad \text{(see \eqref{eq:consequences})} \nonumber\\ & \le \editf{ 
\frac{\sigma_{\tilde{k}_{\tilde{j}}}}{\beta_{\tilde{k}_{\tilde{j}}}-\b_{\tilde{k}_{\tilde{j}}-1}} } \sum_{k\in K_{\sigma,u}} \|A(u_k)-b\|^2 \qquad \left( k_0 \gg 1\right)\nonumber\\% (\s_k is monotnically decreasing) & =: \delta_{k_0} \sum_{k\in K_{\sigma,u}} \|A(u_k)-b\|^2, \label{eq:long chain broke 1} \end{align} \editf{,where $$ \tilde{j} = \argmax_{j \in [J_{\s, u}] } \frac{\sigma_{\tilde{k}_j }}{(\beta_{\tilde{k}_j}-\b_{\tilde{k}_j -1})} $$ when $k_0$ is sufficiently large, and we set \begin{align} \delta_{k_0} = \frac{\s_{k_{\tilde{j}}}}{\b_{k_{\tilde{j}}}-\b_{k_{\tilde{j}}-1}}. \label{eq:defn of delta} \end{align} } By combining (\ref{eq:long chain broke 2},\ref{eq:long chain broke 3},\ref{eq:long chain broke 1}), we can simplify \eqref{eq:long chain broken} as \begin{align} & \sum_{k=k_0}^{k_1} \gamma_k \|G_k\|^2 \nonumber\\ & \le 2\mu + \delta_{k_0} \sum_{k=k_0}^{k_1} \|A(u_k)-b\|^2 + c\s_{k_0}(\sigma_{k_0}+\s_{k_0}') \|A(u_{k_1+1})-b\| \qquad \left( \text{(\ref{eq:long chain broke 1},\ref{eq:long chain broke 2},\ref{eq:long chain broke 3}) and } k_0 \gg 1 \right) \nonumber\\ & =: \mu' + \delta_{k_0} \sum_{k=k_0}^{k_1} \|A(u_k)-b\|^2 , \label{eq:long chain} \end{align} when $k_0$ is sufficiently large, and we observed that \begin{align} \mu' := 2\mu + c \s_{k_0}(\s_{k_0}+\s_{k_0}') \max_{u\in C} \|A(u)-b\| < \infty, \label{eq:defn of mup} \end{align} thanks to the continuity of $A$ and compactness of $C$. Note that \eqref{eq:long chain} bounds the gradient mapping with the feasibility gap. We next find a converse, thus bounding the feasibility gap with the gradient mapping. To that end, let $T_C(u)$ and $P_{T_C(u)}$ be the tangent cone of $C$ at $u\in C$ and orthogonal projection onto this subspace, respectively. Likewise, let $N_C(u)$ and $P_{N_{C}(u)}$ be the normal cone of $C$ at $u$ and the corresponding orthogonal projection. The update rule for $u_k$ in \eqref{eq:update uk recall} immediately implies that \begin{align} G_k - \nabla h(u_k) - DA(u_k) ^\top y_k- \beta_k DA(u_k)^\top (A(u_k)-b) \in N_C(u_{k+1}). \label{eq:opt cnd of update} \end{align} By definition in \eqref{eq:grad map recall}, we have that $G_k \in T_C(u_{k+1})$ which, together with \eqref{eq:opt cnd of update}, imply that \begin{align} G_k & = P_{T_C(u_{k+1})} \left( - \nabla h(u_k) - DA(u_k) ^\top y_k- \beta_k DA(u_k)^\top (A(u_k)-b) \right) \nonumber\\ & = P_{T_C(u_{k+1})}(- \nabla h(u_k)) + P_{T_C(u_{k+1})}(- DA(u_k) ^\top y_k) + \beta_k P_{T_C(u_{k+1})} ( - DA(u_k)^\top (A(u_k)-b) ) \nonumber\\ & = P_{T_C(u_{k+1})}(- \nabla h(u_k)) + P_{T_C(u_{k+1})}(- DA(u_k) ^\top y_{k-1}) + \left(\beta_k+ \sigma_k \right) P_{T_C(u_{k+1})} ( - DA(u_k)^\top (A(u_k)-b) ). \end{align} In the second line above, we used the identity $P_T(a+b) = P_T(a)+P_T(b)$, for points $a,b$ and cone $T$. The last line above uses \eqref{eq:y update recall}. After rearranging and applying the triangle inequality to the last line above, we reach \begin{align} \beta_k\| P_{T_C(u_{k+1})}(DA(u_k)^\top (A(u_k)-b)) \| & \le \left( \sigma_k+ \beta_k \right) \| P_{T_C(u_{k+1})} (DA(u_k)^\top (A(u_k)-b) ) \| \nonumber\\ & \le \|\nabla h(u_k)\| + \|DA(u_k) \| \cdot \|y_{k-1}\|+ \|G_k\| \nonumber\\ & \le \lambda '_h + \eta_{\max} \|y_{k-1}\|+ \|G_k\|, \label{eq:bnd on Ak raw} \end{align} where we set \begin{align} \lambda'_h := \max_{u\in C} \| \nabla h(u)\|, \qquad \eta_{\max}= \max_{u\in C} \|DA(u)\|. 
\label{eq:defn lambdap n etaMax} \end{align} The following result, proved in Appendix \ref{sec:proof of bnd Ak}, translates \eqref{eq:bnd on Ak raw} into an upper bound on $\|A(u_k)-b\|$. \begin{lemma}\label{lem:bnd bnd Ak} %Suppose that $C$ is sufficiently smooth, in the sense that there exists a constant $\tau_C$ such that %\begin{equation} %\|P_{T_C(u)} - P_{T_C(u')}\| \le \tau_C \|u-u'\|, %\qquad \forall u,u'\in C. %\label{eq:curvature} %\end{equation} For an integer $k_0$, consider a subspace $S_K\subseteq \mathbb{R}^d$ such that \begin{align} S_{K} \supseteq \bigcup_{k\in K} T_{C}(u_k), \end{align} and, with some abuse of notation, let $S_{K}$ also denote an orthonormal basis for this subspace. For $\rho>0$, let \begin{align} \eta_{\min } := \begin{cases} \min_u \, \left\| S_{K}^\top P_{T_C(u)} (DA(u)^\top v ) \right\| \\ \|v\| =1\\ \|A(u)-b\|\le \rho\\ u\in C, \end{cases} \label{eq:new slater} \end{align} and assume that $\eta_{\min}>0$. Suppose also that \begin{align} \sup_{k\in K }\|A(u_k)-b\| \le \rho, \label{eq:good neighb} \end{align} \begin{align} \operatorname{diam}(C) \le \frac{\eta_{\min}}{2\lambda_A}. \label{eq:cnd for well cnd} \end{align} %\begin{align} % \sup_{k\ge k_0} \gamma_k \|G_k\| \le \frac{\eta_{\min}}{2\lambda_A}. %\label{eq:cnd for well cnd} %\end{align} %\begin{align} %\sup_{k\ge k_0 }\frac{2\sigma_k}{\eta_{\min}} \left( \lambda'_h + \eta_{\max} \|y_{k-1}\|+ \|G_k\| \right) \le \rho. %\label{eq:to be met asympt} %\end{align} Then it holds that \begin{align} \|A(u_k) -b\| \le \frac{2}{\eta_{\min}\beta_k} \left( \lambda'_h + \eta_{\max} \|y_{k-1}\|+ \|G_k\| \right) , \qquad \forall k\in K. \label{eq:bnd on Ak final} \end{align} \end{lemma} Roughly speaking, \eqref{eq:bnd on Ak final} states that the feasibility gap is itself bounded by the gradient map. In order to apply Lemma \ref{lem:bnd bnd Ak}, let us assume that (\ref{eq:good neighb}) holds. Recall also the partitioning of $K$ in \eqref{eq:partition beta}. We may now substitute \eqref{eq:bnd on Ak final} back into \eqref{eq:long chain} to find that \begin{align} \sum_{k=k_0}^{k_1} \gamma_k \|G_k\|^2 & \le \sum_{k=k_0}^{{k_1}} \delta_{k_0} \|A(u_k)-b\|^2 + \mu' \qquad \text{(see \eqref{eq:long chain})} \nonumber\\ & = \sum_{k\in K_{\beta,l}} \delta_{k_0} \|A(u_k)-b\|^2 + \sum_{k\in K_{\beta,u}} \delta_{k_0}\|A(u_k)-b\|^2 + \mu' \nonumber\\ & \le \sum_{k\in K_{\beta,l}} \frac{\b_{k_0}\delta_{k_0}}{ k\log (k+1)} + \sum_{k\in K_{\beta,u}} \delta_{k_0} \|A(u_k)-b\|^2 +\mu' \qquad \text{(see \eqref{eq:beta rule})} \nonumber\\ & \le \sum_{k=k_0}^{k_1} \frac{\b_{k_0}\delta_{k_0}}{ k\log (k+1)} + \sum_{k\in K_{\beta,u}}\delta_{k_0} \|A(u_k)-b\|^2 +\mu' \nonumber\\ & \le c\b_{k_0}\delta_{k_0} + \sum_{k\in K_{\beta,u}} \frac{4\delta_{k_0}}{\eta_{\min}^2\b_k^2} \left( \lambda_h' + \eta_{\max} \|y_{k-1}\| + \|G_k\| \right)^2 +\mu' . \qquad \text{(see (\ref{eq:defn of c},\ref{eq:bnd on Ak final}))} \label{eq:adaptive 1} \end{align} Suppose that $K_{\beta,u} = \cup_{j=1}^{J_\b} K_{\beta,u,j}$, where each $K_{\beta,u,j}$ is an interval, with the starting point $k_{\beta,u,j}\in K$. 
With this decomposition and after recalling \eqref{eq:beta rule}, we simplify the last line of \eqref{eq:adaptive 1} as \begin{align} & \sum_{k=k_0}^{k_1} \gamma_k \|G_k\|^2 \nonumber\\ & \le \sum_{k=k_0}^{k_1} \delta_{k_0}\|A(u_k)-b\|^2 + \mu'\nonumber\\ & \le c\b_{k_0}\delta_{k_0} + \sum_{j=1}^{J_\b} \frac{4\delta_{k_0} k_{\b,u,j} \log(k_{\b,u,j}+1)}{\eta_{\min}^2\b^2_{k_{\b,u,j}}} \sum_{k\in K_{\beta,r,j}} \frac{1}{k\log(k+1)} \left( \lambda_h' + \eta_{\max} \|y_{k-1}\| + \|G_k\| \right)^2 + \mu'. \qquad \text{(see \eqref{eq:consequences})} \label{eq:adaptive 2} \end{align} The inner sum in the last line above can in turn be bounded as \begin{align} & \sum_{k\in K_{\beta,r,j}} \frac{1}{k\log(k+1)} \left( \lambda_h' + \eta_{\max} \|y_{k-1}\| + \|G_k\| \right)^2 \nonumber\\ & \le \sum_{k\in K_{\beta,r,j}} \frac{3}{k\log(k+1)} \left( \lambda_h'^2 + \eta_{\max}^2 \|y_{k-1}\|^2 + \|G_k\|^2 \right) \qquad \left( (a+b+c)^2 \le 3a^2+3b^2+3c^2 \right) \nonumber\\ & \le 3c\lambda_h'^2 + 3 \eta_{\max}^2 B_K + \sum_{k\in K_{\beta,u,j}} \frac{3\|G_k\|^2}{k\log(k+1)}, \qquad \text{(see \eqref{eq:defn of c})} \label{eq:adaptive 3} \end{align} where \begin{align} B_K := \sum_{k=k_0}^{k_1} \frac{\|y_{k-1}\|^2}{k \log(k+1)}. \label{eq:defn of B} \end{align} Substituting \eqref{eq:adaptive 3} back into \eqref{eq:adaptive 2}, we find that \begin{align} & \sum_{k=k_0}^{k_1} \gamma_k \|G_k\|^2 \nonumber\\ & \le\sum_{k=k_0}^{k_1} \delta_{k_0} \|A(u_k)-b\|^2 + \mu' \nonumber\\ & \le c\b_{k_0}\delta_{k_0}+ \frac{12 c \b'_{k_0}\delta_{k_0} \lambda'^2_h}{\eta_{\min}^2} + \frac{12 \b'_{k_0}\delta_{k_0} \eta_{\max}^2 B_K}{\eta_{\min}^2} + \sum_{k\in K} \frac{12 \beta_{k_0}' \delta_{k_0} \|G_k\|^2}{ \eta_{\min}^2 k\log(k+1)} + \mu', \qquad \text{(see \eqref{eq:adaptive 2})} \label{eq:to be used for feas} \end{align} where \begin{align} \beta'_{k_0} := \sum_{j=1}^{J_\b} \frac{k_{r,j} \log(k_{r,j}+1)}{ \beta_{k_{\b,r,j}}^2}. \label{eq:defn of kappa} \end{align} To simplify the above expression, let us assume that \begin{equation} \frac{12 \beta_{k_0}'\delta_{k_0}}{\eta_{\min}^2 k \log(k+1)} \le \frac{\gamma_k}{2}, \qquad \forall k\in K. \label{eq:to be used later on} \end{equation} We also let $|K|=k_1-k_0+1$ denote the size of the interval $K$. After rearranging \eqref{eq:to be used for feas} and applying \eqref{eq:to be used later on}, we arrive at \begin{align} \sum_{k=k_0}^{k_1} \frac{\gamma_k}{2} \|G_k\|^2 & \le \sum_{k=k_0}^{k_1} \left(\gamma_k - \frac{12 \beta_{k_0}'\delta_{k_0}}{\eta_{\min}^2 k \log(k+1)} \right)\|G_k\|^2 \qquad \text{(see \eqref{eq:to be used later on})} \nonumber\\ & \le c\b_{k_0}\delta_{k_0} + \frac{12 c \b'_{k_0}\delta_{k_0} \lambda'^2_h}{\eta_{\min}^2} + \frac{12 \b'_{k_0}\delta_{k_0} \eta_{\max}^2 B_K}{\eta_{\min}^2} +\mu'. 
\qquad \text{(see \eqref{eq:to be used for feas})} \label{eq:final bnd on G} \end{align} In turn, the bound above on the gradient mapping controls the feasibility gap, namely, \begin{align} & \sum_{k=k_0}^{k_1} \|A(u_k)-b\|^2 \nonumber\\ & \le c\b_{k_0} + \frac{12c \b_{k_0}' \lambda'^2_h}{\eta_{\min}^2} + \frac{12\b_{k_0}' \eta_{\max}^2 B_K}{\eta_{\min}^2} + \sum_{k\in K} \frac{12\b_{k_0}' \|G_k\|^2}{\eta_{\min}^2 k \log(k+1)} \qquad \text{(see \eqref{eq:to be used for feas})} \nonumber\\ & \le c\b_{k_0} + \frac{12c \b_{k_0}' \lambda'^2_h}{\eta_{\min}^2} + \frac{12\b_{k_0}' \eta_{\max}^2 B_K}{\eta_{\min}^2} + \frac{1}{2\delta_{k_0}} \sum_{k\in K} \gamma_k \|G_k\|^2 \qquad \text{(see \eqref{eq:to be used later on})} \nonumber\\ & \le 2c\b_{k_0} + \frac{24c \b_{k_0}' \lambda'^2_h}{\eta_{\min}^2} + \frac{24\b_{k_0}' \eta_{\max}^2 B_K}{\eta_{\min}^2} + \frac{\mu'}{\delta_{k_0}}. \qquad \text{(see \eqref{eq:final bnd on G})} \label{eq:feas bnd semi final} \end{align} In order to interpret (\ref{eq:final bnd on G},\ref{eq:feas bnd semi final}), we next estimate $B_{K}$, as defined in \eqref{eq:defn of B}. To that end, let us first control the growth of the dual sequence, namely, $\{y_k\}_k$. Recalling \eqref{eq:y update recall} and for every $k\in K$, we write that \begin{align} \|y_k\| & \le \|y_{k_0}\|+ \sum_{i=k_0+1}^k \sigma_i \|A(u_i)-b\| \qquad \text{(see \eqref{eq:y update recall})} \nonumber\\ & = \|y_{k_0}\|+ \sum_{i\in K_{\sigma,l}} \sigma_i \|A(u_i)-b\| + \sum_{i\in K_{\sigma,m}} \sigma_i \|A(u_i)-b\| + \sum_{i\in K_{\sigma,u}} \sigma_i \|A(u_i)-b\| \qquad \text{(see \eqref{eq:partition sigma})} \nonumber\\ & \le \|y_{k_0}\|+ \sum_{k\in K_{\s,l}} \frac{\s_{k_0}^2 }{k\log(k+1)} + \sum_{i\in K_{\sigma,m}} \sigma_i \|A(u_i)-b\| + \sum_{i\in K_{\sigma,u}} \sigma_i \|A(u_i)-b\| \nonumber\\ & \le \|y_{k_0}\|+c\s_{k_0}^2 + \sum_{i\in K_{\sigma,m}} \sigma_i \|A(u_i)-b\| + \sum_{i\in K_{\sigma,u}} \sigma_i \|A(u_i)-b\|. \qquad \text{(see \eqref{eq:defn of c})} \label{eq:dual growth 1} \end{align} To evaluate the sum over $K_{\s,m}$ in the last line above, recall the partitioning $K_{\s,m}=\cup_{j=1}^{J_\s} K_{\s,m,j}$. Each $K_{\s,m,j}$ is an interval with the starting point $k_{\s,m,j}$. This allows us to write that \begin{align} & \sum_{i\in K_{\s,m}} \s_i \|A(u_i)-b\| \nonumber\\ & = \sum_{j=1}^{J_\s} \sum_{i\in K_{\s,m,\j}} \s_i \|A(u_i)-b\| \nonumber\\ & \le \sum_{j=1}^{J_\s} \sum_{i\in K_{\s,m,\j}} \frac{\s_{k_0}\s_i }{\sqrt{i\log(i+1)} } \qquad \text{(see \eqref{eq:sigma rule})} \nonumber\\ & = \sum_{j=1}^{J_\s} \s_{k_0}\s_{k_{\s,m,j}} \sqrt{k_{\s,m,j} \log(k_{\s,m,j}+1)} \sum_{i\in K_{\s,m,\j}} \frac{1}{i\log(i+1)} \qquad \text{(see \eqref{eq:sigma rule})} \nonumber\\ & \le c \s_{k_0}\s'_{k_0}. 
\qquad \text{(see (\ref{eq:defn of c},\ref{eq:defn of sigmap}))} \label{eq:dual growth 2} \end{align} To bound the last sum in \eqref{eq:dual growth 1}, we write that \begin{align} & \sum_{i\in K_{\s,u}} \s_i \|A(u_i)-b\| \nonumber\\ & = \sum_{i\in K_{\s,u}} \delta_{k_0} (\b_i- \b_{i-1})\|A(u_i)-b\| \qquad \text{(see \eqref{eq:sigma rule})} \nonumber\\ & \le \sum_{i\in K_{\s,u}} \rho \delta_{k_0} (\b_i- \b_{i-1}) \qquad \text{(see \eqref{eq:good neighb})} \nonumber\\ &\le \sum_{i\in K_{\s,u}} \rho \delta_{k_0} \cdot 2\b_{k_0} \sqrt{\frac{\log i}{(i-1)k_0 \log(k_0+1)}} \qquad \text{(see \eqref{eq:consequences})}\nonumber\\ & \le 4\rho \beta_{k_0}\delta_{k_0} \sqrt{\frac{k_{\s,u,\max} \log k_{\s,u,\max} }{k_0 \log(k_0+1)}} \qquad \left( \sum_{i=1}^{k} \frac{1}{\sqrt{i}} \le \int_{0}^{k} \frac{dx}{\sqrt{x}} = 2 \sqrt{x} \right), \label{eq:dual growth 3} \end{align} where \begin{align} k_{\s,u,\max} := \max_{i\in K_{\s,u}} i, \label{eq:largest index} \end{align} is the largest index in the subset $K_{\s,u}$. By combining (\ref{eq:dual growth 1},\ref{eq:dual growth 2},\ref{eq:dual growth 3}), we find that \begin{align} \|y_k\| & \le \|y_{k_0}\|+ c \s_{k_0}(\s_{k_0}+\s'_{k_0}) + 4\rho \beta_{k_0}\delta_{k_0} \sqrt{\frac{k_{\s,u,\max} \log k_{\s,u,\max} }{k_0 \log(k_0+1)}} =: y^{\max}. \label{eq:dual growth} \end{align} \begin{remark} The bound on dual sequence in \eqref{eq:dual growth} is meaningful if the algorithm, after finitely many iterations, leaves and never returns to the third regime in \eqref{eq:sigma rule}. For sufficiently large $\s_{k_0}$, this is a reasonable assumption because the feasibility decays like $1/\sqrt{k}$ in general. One can also remove the dependence on $k_{\s,u,\max}$ in \eqref{eq:dual growth} by replacing the third regime of \eqref{eq:sigma rule} with the (slightly slower) dual step size of \begin{equation} \s_{k_0} \min\left( \frac{\b_k-\b_{k-1}}{\b_{k_0}-\b_{k_0-1}} , \frac{1}{k\|A(u_k)-b\|} \right). \end{equation} \end{remark} We can now finally evaluate $B_{K}$ as \begin{align} B_{K} & = \sum_{k=k_0-1}^{{k_1-1}} \frac{ \|y_{k}\|^2 }{(k+1)\log( k+2)} \qquad \text{(see \eqref{eq:defn of B})} \nonumber\\ & \le \sum_{k=k_0-1}^{k_1-1} \frac{(y^{\max})^2}{(k+1)\log(k+2)} \qquad \text{(see \eqref{eq:dual growth})} \nonumber\\ & \le c (y^{\max})^2. \qquad \text{(see \eqref{eq:defn of c})} \label{eq:Bk evaluated} \end{align} Let us now revisit and simplify the condition imposed in (\ref{eq:good neighb}). To that end, we first derive a weaker but uniform bound on the feasibility gap. For every $k\in K$, it holds that \begin{align} \|A(u_k)-b\|^2 & \le \sum_{i=k_0}^{k_1} \|A(u_i)-b\|^2 \nonumber\\ & \le 2c\b_{k_0} + \frac{24c \b_{k_0}' \lambda'^2_h}{\eta_{\min}^2} + \frac{24\b_{k_0}' \eta_{\max}^2 B_K}{\eta_{\min}^2} + \frac{\mu'}{\delta_{k_0}} \qquad \text{(see \eqref{eq:feas bnd semi final})} \nonumber\\ & \le 2c\b_{k_0} + \frac{24c \b_{k_0}' \lambda'^2_h}{\eta_{\min}^2} + \frac{24c\b_{k_0}' \eta_{\max}^2 (y^{\max})^2}{\eta_{\min}^2} + \frac{\mu'}{\delta_{k_0}}. \qquad \text{(see \eqref{eq:Bk evaluated})} \label{eq:rate of feas gap} \end{align} We may therefore replace \eqref{eq:good neighb} with the assumption that \begin{equation} 2c\b_{k_0} + \frac{24c \b_{k_0}' \lambda'^2_h}{\eta_{\min}^2} + \frac{24c\b_{k_0}' \eta_{\max}^2 (y^{\max})^2}{\eta_{\min}^2} + \frac{\mu'}{\delta_{k_0}}\le \rho^2, \label{eq:suff cnd on rho} \end{equation} which ensures that \eqref{eq:good neighb} holds. 
\begin{remark} Note that \eqref{eq:suff cnd on rho} controls the initial feasibility $\|A(u_{k_0})-b\|$, as it appears in (\ref{eq:defn mu},\ref{eq:defn of mup}). \end{remark} By combining (\ref{eq:final bnd on G},\ref{eq:feas bnd semi final},\ref{eq:Bk evaluated}), we reach \begin{align} & \sum_{k=k_0}^{k_1} \gamma_k \|G_k\|^2 + \|A(u_k)-b\|^2 \nonumber\\ &\le 2c\b_{k_0}(\delta_{k_0}+1) + \frac{24\b_{k_0}'\lambda'^2_h}{\eta_{\min}^2}(\delta_{k_0}+1) + \frac{24\b_{k_0}'\eta_{\max}^2 B_K}{\eta_{\min}^2}(\delta_{k_0}+1) +\frac{\mu' }{\delta_{k_0}}\left( 2\delta_{k_0}+1 \right) \nonumber\\ & \le 4c\b_{k_0}\delta_{k_0} + \frac{48\b_{k_0}'\lambda'^2_h \delta_{k_0}}{\eta_{\min}^2} + \frac{48\b_{k_0}'\eta_{\max}^2 B_K}{\eta_{\min}^2} + 3\mu' \qquad \left( \sigma_{k_0} \gg \beta_{k_0},\beta_{k_0-1} \Longrightarrow \delta_{k_0} \gg 1 \right) \nonumber\\ & \le 4c\b_{k_0}\delta_{k_0} + \frac{48\b_{k_0}'\lambda'^2_h \delta_{k_0}}{\eta_{\min}^2} + \frac{48c\b_{k_0}'\eta_{\max}^2 (y^{\max})^2}{\eta_{\min}^2} + 3\mu' . \qquad \text{(see \eqref{eq:Bk evaluated})} \label{eq:raw bnd on sum} \end{align} We next find a lower bound on the step sizes $\{\gamma_k\}_k$ by invoking \eqref{eq:moreover}. To do so, we first need to gauge how smooth the augmented Lagrangian $\mathcal{L}_{\beta_k}(\cdot,y_k)$ is (as a function of its first argument). For every $k\in K$ and after recalling Lemma \ref{lem:11}, we note that \begin{align} \lambda_{\beta_k} & \le \lambda_h+ \sqrt{m}\lambda_A \left( \|y_k\| + \beta_k \|A(u_k)-b\| \right)+ \beta_k \|DA(u_k)\|_F^2 \qquad \text{(see \eqref{eq:smoothness of Lagrangian})} \nonumber\\ & \le \lambda_h+ \sqrt{m}\lambda_A \left( y^{\max} + \rho \beta_k \right) + \beta_k m \eta_{\max}^2 \qquad \text{(see (\ref{eq:defn lambdap n etaMax},\ref{eq:good neighb},\ref{eq:dual growth}))} \nonumber\\ & = \lambda_h+\sqrt{m}\lambda_A y^{\max} + \beta_k \left( \sqrt{m}\lambda_A \rho + m \eta_{\max}^2 \right) \nonumber\\ & \le 2\beta_k \left( \sqrt{m}\lambda_A \rho + m \eta_{\max}^2 \right), \label{eq:smoothness at k} \end{align} where the last line holds if $k_0$ is sufficiently large; see \eqref{eq:beta rule}. We are now in position to invoke \eqref{eq:moreover} and write that \begin{align} \gamma_k & \ge \frac{\theta}{\lambda_{\beta_k}} \qquad \text{(see \eqref{eq:moreover})}\nonumber\\ & \ge \frac{\theta }{2\beta_k( \sqrt{m}\lambda_A \rho + 2m \eta_{\max}^2) } \qquad \text{(see \eqref{eq:smoothness at k})} \nonumber\\ & \ge \frac{\theta}{2 \beta_{k_0} (\sqrt{m}\lambda_A\rho+2m\eta^2_{\max})} \sqrt{\frac{k_0 \log (k_0+1) }{ k \log(k+1)}} \qquad \text{(see \eqref{eq:consequences})} \nonumber\\ & =: \gamma' \sqrt{\frac{k_0 \log (k_0+1) }{ k \log(k+1)}}, \label{eq:low bnd on gammas} \end{align} for every $k\in K$. The lower bound on the step sizes in \eqref{eq:low bnd on gammas} has two immediate consequences. First, we find that \eqref{eq:to be used later on} automatically holds if $k_0$ is sufficiently large and not too many transitions happen. 
Second, it allows us to improve \eqref{eq:raw bnd on sum} by writing that
\begin{align}
& \sum_{k=k_0}^{k_1} \gamma' \sqrt{\frac{k_0 \log (k_0+1) }{ k \log(k+1)}} \|G_k\|^2 +\|A(u_k)-b\|^2 \nonumber\\
& \le \sum_{k=k_0}^{k_1}\gamma_k \|G_k\|^2 +\|A(u_k)-b\|^2 \nonumber\\
& \le 4c\b_{k_0}\delta_{k_0} + \frac{48\b_{k_0}'\lambda'^2_h \delta_{k_0}}{\eta_{\min}^2} + \frac{48c\b_{k_0}'\eta_{\max}^2 (y^{\max})^2}{\eta_{\min}^2} + 3\mu',
\qquad \text{(see \eqref{eq:raw bnd on sum})}
\end{align}
which immediately yields that
\begin{align}
\min_{k\in K} \gamma' \sqrt{\frac{k_0 \log(k_0+1)}{k_1\log(k_1+1)}} \|G_k\|^2+ \|A(u_k)-b\|^2
= \frac{O(1)}{|K|}.
\end{align}
\begin{theorem}
\label{thm:main}
Let $\gamma_k$ be the output of the line search subroutine in our algorithm in iteration $k$. For integers $k_0\le k_1$, consider the interval $K=[k_0:k_1]$ and the choice of parameters
\begin{align}
\beta_k = \frac{\beta}{\sqrt{k}},
\qquad
\sigma_k = \sigma \sqrt{k},
\qquad \forall k\in K,
\label{eq:updates thm nonadapt}
\end{align}
with $\beta,\sigma>0$. Alternatively, consider the (adaptive) choice of parameters
\begin{align}
\beta_k =
\begin{cases}
\beta_{k-1} & \|A(u_k)-b\| \le \frac{\beta_{k_0}}{\sqrt{k\log (k+1)}},\\
\\
\beta_{k-1} \sqrt{\frac{k\log(k+1)}{(k-1)\log k}} & \|A(u_k)-b\| > \frac{\beta_{k_0}}{\sqrt{k\log (k+1)}},
\end{cases}
\end{align}
\begin{align}
\sigma_k =
\begin{cases}
\sigma_{k-1} & \|A(u_k)-b\| \le \frac{\sigma_{k_0}}{k\log (k+1)},\\
\\
\sigma_{k-1} \sqrt{\frac{(k-1) \log k}{k \log(k+1)}} & \frac{\sigma_{k_0}}{k\log (k+1)} \le \|A(u_k)-b\| \le \frac{\sigma_{k_0}}{\sqrt{k\log (k+1)}}, \\
\\
\sigma_{k_0}\frac{\beta_k - \beta_{k-1}}{\beta_{k_0}-\beta_{k_0-1}} & \|A(u_k)-b\| > \frac{\sigma_{k_0}}{\sqrt{k\log (k+1)}},
\end{cases}
\label{eq:updates thm}
\end{align}
for $\s_{k_0}\ge \b_{k_0}>0$. We impose the following geometric requirements on the constraints. Let $P_{T_C(u)}$ and $P_{N_C(u)}$ denote the orthogonal projections onto the tangent and normal cones at $u\in C$, respectively. Consider a subspace $S_{K}\subseteq \mathbb{R}^d$ such that
\begin{align}
S_{K} \supseteq \bigcup_{k\in K} T_C(u_k),
\end{align}
and, with some abuse of notation, let $S_{K}$ also denote an orthonormal basis for this subspace. For $\rho>0$, we assume that the nonconvex Slater's condition holds, that is, assume that $\eta_{\min}>0$, where
\begin{align}
\eta_{\min} :=
\begin{cases}
\min_u \, \left\| S_{K}^\top P_{T_C(u)} (DA(u)^\top v) \right\| \\
\|v\|=1\\
\|A(u)-b\|\le \rho\\
u\in C.
\end{cases}
\label{eq:new slater}
\end{align}
Suppose that
\begin{align}
\operatorname{diam}(C) \le \frac{\eta_{\min}}{2\lambda_A},
\end{align}
and that $\rho$ is large enough so that (\ref{eq:suff cnd on rho}) holds. Assume also that $k_0\gg 1$, that $\s_{k_0}\gg \b_{k_0}$, and that the number of regime transitions in (\ref{eq:updates thm}) is $O(1)$. Then, for both updates in (\ref{eq:updates thm nonadapt},\ref{eq:updates thm}), it holds that
\begin{align}
\min_{k\in K} \gamma' \sqrt{\frac{k_0 \log(k_0+1)}{k_1\log(k_1+1)}} \|G_k\|^2+ \|A(u_k)-b\|^2
= \frac{O(1)}{k_1-k_0},
\end{align}
for the factor $\gamma'$ defined in (\ref{eq:low bnd on gammas}).
\end{theorem} %\paragraph{\textbf{Summary of Bang's argument.}} Bang's argument is summarized below for comparison: %\begin{align} %& \frac{\gamma_k G_k^2}{2} \nonumber\\ %& \le h_k - h_{k+1} + (A_k - A_{k+1}) \cdot y_k + \frac{\|A_k\|^2}{2\beta_k} - \frac{\|A_{k+1}\|^2}{2\beta_k} \nonumber\\ %& = h_k - h_{k+1} + A_k \cdot y_k - A_{k+1} \cdot y_{k+1} + A_{k+1} \cdot (y_{k+1} - y_{k}) + \frac{\|A_k\|^2}{2\beta_k} - \frac{\|A_{k+1}\|^2}{2\beta_k} \nonumber\\ %& = h_k - h_{k+1} + A_k \cdot y_k - A_{k+1} \cdot y_{k+1} + \left(\frac{1}{\sigma_{k+1}} - \frac{1}{2\beta_k} \right) \|A_{k+1}\|^2 + \frac{\|A_k\|^2}{2\beta_k} %\quad \text{(definition of ys)} % \nonumber\\ %& \le (h_k +A_k \cdot y_k + \frac{G_{k-1}}{4} + \frac{\|A_k\|^2}{2\beta_{k-1}}) - (h_{k+1}+ A_{k+1} \cdot y_{k+1} + \frac{G_k}{4} + \frac{\|A_{k+1}\|^2}{2\beta_k}) + e_k. %\qquad \text{(Bang's assumption)} %\end{align} %\textbf{We form a telescope with the last line above, which throws away any information in $\{y_k\}$. %} \paragraph{\textbf{Example: Max-cut}} Consider the factorized max-cut program, namely, \begin{align} \begin{cases} \min \langle UU^\top , H\rangle\\ \text{diag}(UU^\top) = 1, \end{cases} \end{align} where $U\in \mathbb{R}^{d'\times r}$. For every $i$, let $u_i\in\mathbb{R}^r$ denote the $i$th row of $U$. Let us form $u\in\mathbb{R}^d$ with $d=d'r$ by vectorizing $U$, namely, \begin{equation} u = [u_1^\top \cdots u_{d'}^\top]^\top. \end{equation} We can therefore cast the above program as Program \eqref{prob:01} with \begin{align} h(u) = \sum_{i,j} H_{i,j} \langle u_i, u_j \rangle, \end{align} \begin{align} A: u \rightarrow [\|u_1\|^2 \cdots \|u_{d'}\|^2]^\top. \end{align} It is easy to verify that \begin{align} DA(u) = \left[ \begin{array}{ccc} u_1^\top & \cdots & 0\\ \vdots\\ 0 & \cdots & u_{d'}^\top \end{array} \right]\in \mathbb{R}^{d'\times d}. \end{align} In particular, if we take $S_{k_0}=\mathbb{R}^d$ and $\rho <1$, we have $P_{T_C(u)}=I_d$ and thus \begin{align} \eta_{\min} & = \begin{cases} \min_u \eta_{\min}\left( DA(u) \right)\\ \|A(u) - 1\| \le \rho \end{cases} \nonumber\\ & = \begin{cases} \min_u \editf{ \min_i \|u_i\| } \\ | \|u_i\|^2 - 1 | \le \rho \qquad \forall i \end{cases} \nonumber\\ & \ge \editf{\sqrt{1- \rho} }>0. \end{align} Above, $\eta_{\min}(DA(u))$ returns the smallest singular value of $DA(u)$. Consequently, the nonconvex Slater's condition holds for the max-cut problem. \paragraph{\textbf{Example: Clustering}} Consider the factorized clustering problem, namely, \begin{align} \begin{cases} \min \langle UU^\top , H \rangle \\ UU^\top 1 = 1 \\ \|U\|_F \le \sqrt{k}\\ U \ge 0, \end{cases} \end{align} where $U\in \mathbb{R}^{d'\times r}$ and $k$ is the number of clusters. We form $u\in \mathbb{R}^d$ as before. Note that the above program can be cast as Program \eqref{prob:01} with the same $h$ as before and \begin{align} A: u\rightarrow \left[ \begin{array}{ccc} u_1^\top \sum_j u_j & \cdots u_{d'}^\top \sum_j u_j \end{array} \right]^\top \in \mathbb{R}^{d'}, \end{align} and also $C=\sqrt{k}B_{2+}$, where $B_{2+}\subset \mathbb{R}^{d'}$ is the intersection of the unit $\ell_2$-ball with the positive orthant. Note that \begin{align} DA(u) = \left[ \begin{array}{ccc} w_{1,1} u_1^\top & \cdots & w_{1,d'} u_{1}^\top\\ \vdots\\ w_{d',1} u_{d'}^\top & \cdots & w_{d',d'} u_{d'}^\top \end{array} \right], \label{eq:Jacobian clustering} \end{align} where $w_{i.i}=2$ and $w_{i,j}=1$ for $i\ne j$. 
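Closed-form Jacobians such as \eqref{eq:Jacobian clustering} are easy to get wrong, so the following small finite-difference sketch (hypothetical dimensions; independent of the analysis above) can be used to sanity-check such formulas entrywise against the constraint map $A_i(u) = u_i^\top \sum_j u_j$:
\begin{verbatim}
import numpy as np

# Finite-difference check for the Jacobian of the clustering constraint map
# A_i(u) = u_i^T sum_j u_j.  The dimensions d', r below are hypothetical.
dprime, r = 6, 3
rng = np.random.default_rng(0)
U = rng.random((dprime, r))

def A(U):
    s = U.sum(axis=0)             # sum_j u_j
    return U @ s                  # entry i equals u_i^T sum_j u_j

def jacobian_fd(U, eps=1e-6):     # numerical Jacobian of A with respect to vec(U)
    u = U.ravel()
    J = np.zeros((dprime, u.size))
    for idx in range(u.size):
        e = np.zeros(u.size)
        e[idx] = eps
        J[:, idx] = (A((u + e).reshape(dprime, r))
                     - A((u - e).reshape(dprime, r))) / (2 * eps)
    return J

J = jacobian_fd(U)                # compare entrywise against a closed-form candidate
\end{verbatim}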
Let us start with the case where $u\in \partial C$ belongs to the boundary of $C$, namely, $\|u\|=\sqrt{k}$. We also assume that $u>0$. Under these assumptions, note that \begin{equation} T_C(u)=\{z \in \mathbb{R}^{d'}: \langle u, z\rangle = 0 \}, \end{equation} and, consequently, \begin{equation} P_{T_C(u)} = I_d - \frac{uu^\top}{\|u\|^2} = I_d - \frac{uu^\top}{k}. \end{equation} For simplicity, let us assume that $\rho=0$, namely, $u$ is a feasible point of Program \eqref{prob:01}. Then we find that \begin{align} \eta_{\min} ( P_{T_C(u)} DA(u)^\top) & = \eta_{\min} ( (I-\frac{uu^\top}{k} ) DA(u)^\top) \nonumber\\ & \ge \eta_{\min} ( DA(u) ) - \frac{1}{k} \| uu^\top DA(u)^\top\| \qquad \text{(Weyl's inequality)} \nonumber\\ & = \eta_{\min} ( DA(u) ) - \frac{1}{\sqrt{k}}\| DA(u) u\| \qquad \left( \|u\|=\frac{1}{\sqrt{k}} \right). \label{eq:lower bnd} \end{align} We evaluate each term in the last line above separately. By its definition in \eqref{eq:Jacobian clustering}, first note that \begin{align} \eta_{\min}(DA(u)) & \ge \eta_{\min}\left( \left[ \begin{array}{ccc} u_1 & \cdots & u_{d'}\\ \vdots\\ u_1 & \cdots & u_{d'} \end{array} \right]\right) - \max_i \|u_i\| \qquad \text{(Weyl's inequality)} \nonumber\\ & = \sqrt{d}' \eta_{\min}(U) -\max_i \|u_i\| \nonumber\\ & \ge \sqrt{d'} \eta_{\min }(U) - 1, \label{eq:lower bnd on DA} \end{align} where in the last line follows from the assumptions that $u_i^\top \sum_j u_j =1$ for all $i$ and that $u>0$ to see that $\max_i \|u_i\|\le 1$. Note also that \begin{align} \|DA(u) u \| & \le \left\| \left[ \begin{array}{ccc} u_1^\top & \cdots & u_1^\top\\ \vdots\\ u_{d'}^\top & \cdots & u_{d'}^\top \end{array} \right] u \right\| + \sqrt{ \sum_{i=1}^{d'} \|u_i\|^4} \nonumber\\ & = \|1_{d'}\|+\max_i \|u_i\| \cdot \|u\| \nonumber\\ & \le \sqrt{d'}+ \sqrt{k}, \end{align} where the last line follows because $\max_i\|u_i\|\le 1$ and $\|u\|=\sqrt{k}$ by assumption. Consequently, we reach \begin{align} \eta_{\min} ( P_{T_C(u)} DA(u)^\top) \ge & \sqrt{d'} \left(\eta_{\min}(U) - \frac{1}{\sqrt{k}} \right) - 2 . \qquad \text{(see \eqref{eq:lower bnd})} \end{align} By continuity, we extend the results to the case where $u $ might have zero entries. Lastly, in the case where $u\in \text{int}(C)$ (namely, $\|u\|< \sqrt{k}$ and $u>0$), we have that $T_C(u)=\mathbb{R}^d$ and it directly follows from \eqref{eq:lower bnd on DA} that \begin{align} \eta_{\min} (P_{T_C(u)} DA(u)) & = \eta_{\min} (DA(u))\nonumber\\ & \ge \sqrt{d'} \eta_{\min}(U) -1. \qquad \text{(see \eqref{eq:lower bnd on DA})} \end{align} Having studied all cases for $u$ for $\rho=0$, we conclude that \begin{align} \eta_{\min} \ge \sqrt{d'} \left(\eta_{\min}(U) - \frac{1}{\sqrt{k}} \right) - 2. \qquad \text{(see \eqref{eq:new slater})} \end{align} Roughly speaking, as long as $\eta_{\min}(U) \gtrsim 1/\sqrt{k}$, the right-hand side is greater than zero and the nonconvex Slater's condition holds. We assumed for simplicity that $\rho=0$ but this is expected to hold for $\rho$ sufficiently small as well by continuity. %% mfsahin generalized eigenvalue problem \editf{ \paragraph{\textbf{Example: Generalized eigenvalue problem}} Consider the following generalized eigenvalue problem for a pair of symmetric matrices. \begin{align} Hu_i = \lambda_i Lu_i ~~~~ i \in [n] \end{align} ,where $\lambda_i$ is the eigenvalue corresponding to eigenvector $u_i$. 
Top eigenvalues of pairs $H$ and $L$ can be obtained by solving the nonconvex optimization problem, namely, \begin{align} \begin{cases} \min -u^\top H u \\ u^\top Lu = 1 \\ \end{cases} \end{align} where $u\in \mathbb{R}^{d'\times 1}$. Notice that $r' = 1$ and hence $d = d'$ for this problem. Hence the above program can be cast as the following \begin{align} h(u) = \sum_{i, j} H_{i, j} (u_i u_j) \end{align} \begin{align} A: u\rightarrow \left[ \sum_{i,j} L_{i,j} (u_i u_j) \right] \in \mathbb{R}, \end{align} Note also that \begin{align} DA(u) = %u^\top \text{I}_d L u = \left[ \begin{array}{ccc} u_1^2 L_{1,1}, & u_2^2 L_{2,2}, & \cdots, u_d^2 L_{d,d} \end{array} \right], \label{eq:Jacobian geneigen} \end{align} Now let $\rho < 1$ and take $S_k0$ to be the whole space, namely, $S_{k0} = \mathbb{R}^d$. Under these assumptions we have \begin{align} \eta_{\min} & = \begin{cases} \min_u \eta_{\min}\left( DA(u) \right)\\ \|A(u) - 1\| \le \rho \end{cases} \nonumber\\ & = \begin{cases} \min_u \sqrt{\sum_{ i} L_{i,i}^2 u_i^4} \\ | \sum_{i, j} L_{i,j}u_i u_j - 1 | \le \rho \end{cases} \nonumber\\ & >0. \nonumber \end{align} The above directly holds true if $u \in K$ does not contain the zero vector and at least one diagonal element of H is nonzero. Former is a necessary condition for the proposed algorithm to proceed for quadratically constraint quadratic problems. %We need to find a lower bound on the objective for the above optimization program using the feasibilyty condition, %\begin{align} %\rho \geq \sum_{i, j} L_{i,j}u_i u_j - 1 &\geq -\rho \nonumber \\ %\sum_{i} L_{i,i}u_i^2 + \sum_{i\neq j} L_{i,j}u_i u_j- 1&\geq -\rho \nonumber %\end{align} } \paragraph{\textbf{New Slater's condition}} Here we describe a variant of the Slater's condition for Program \eqref{prob:01}. \begin{definition} \textbf{(Nonconvex Slater's condition)} Let $\theta_{\min}$ be the smallest angle between to subspace and define $\psi$ to be \begin{align} \psi_{A,C} & := \begin{cases} \inf_u \, \sin \left( \theta_{\min} (\Null(A),T_C(u)) \right)\\ A u \ne 0\\ u\in \partial C \end{cases} \nonumber\\ & = \begin{cases} \inf_u \, \eta_{\min}\left(P_{T_C(u)}A^\top\right)\\ A u \ne 0\\ u\in \partial C \end{cases} \end{align} where $\partial C$ is the boundary of $C$, and $\eta_{\min}$ returns the smallest singular value. We say that Program \eqref{prob:01} satisfies the Slater's condition if $\psi_{A,C}>0$. \end{definition} As a sanity check, we have the following result. \begin{proposition}\label{prop:convex} The nonconvex Slater's condition for Program (\ref{prob:01}) implies the standard Slater's condition when $A$ is a linear operator and Program (\ref{prob:01}) is feasible. \end{proposition} \begin{proof} Suppose that the standard Slater's condition does not hold, namely, that \begin{equation} \relint(\Null(A) \cap C) = \Null(A)\cap \relint(C) = \emptyset. \label{eq:no feas} \end{equation} Since Program \eqref{prob:01} is feasible, there exists a feasible $u$, namely, $Au=0$ and $u\in C$. By \eqref{eq:no feas}, it must be that $u\in \partial C$ and that $\Null(A)$ supports $C$ at $u$ \textbf{Why?}. In particular, it follows that $\Null(A) \cap T_C(u) \ne \{0\}$ or, equivalently, $\row(A)\cap N_C(u) \ne \{0\}$. That is, there exists a unit-norm vector $v$ such that \begin{align} P_{T_C(u)}A^\top v=0, \label{eq:existence} \end{align} and consequently \begin{align} \eta_{\min}(P_{T_C(u)}A^\top) = 0. 
\end{align}
Because $\eta_{\min}(P_{T_C(u)}A^\top)$ is a continuous function of $u$ \textbf{(Why?)}, we conclude that $\psi_{A,C}=0$, namely, the nonconvex Slater's condition also does not hold, thereby completing the proof of Proposition \ref{prop:convex}.
%
%Consider now the curve $g: [0,1]\rightarrow \partial C$ passing through $u$ with the direction of $A^\dagger v$, namely,
%\begin{equation}
%g(0) = u,
%\qquad
%\frac{dg}{dt}(0) = \lim_{t\rightarrow0^+}\frac{g(t)-g(0)}{t} = (A^\dagger)^\top v.
%\end{equation}
%\textbf{Does the above curve exist even when $C$ is not smooth?} Then note that
%\begin{align}
%& \eta_{\min}(P_{T_C(u)} A^\top) \nonumber\\
%& \le \lim_{t\rightarrow 0^+} \frac{\| P_{T_C(u)}A^\top Ag(t)\|}{\|Ag(t)\|} \nonumber\\
%& = \lim_{t\rightarrow 0^+} \frac{\| P_{T_C(u)}A^\top (Au + AA^\dagger v)\|}{\|Au t + A A^\dagger v + o(t)\|} \nonumber\\
%& = \lim_{t\rightarrow 0^+} \frac{\| P_{T_C(u)}A^\top ( t v +o(t))\|}{\|t v+o(t)\|}
%\qquad \left( Au = 0,\,\, AA^\dagger = I \right) \nonumber\\
%& = \lim_{t\rightarrow 0^+} \frac{o(t)}{t}
%\qquad \text{(see \eqref{eq:existence})} \nonumber\\
%& = 0.
%\end{align}
%
%
%
\end{proof}
%\paragraph{\textbf{Intuition in the convex case}}
%We give some qualitative discussion about the last condition in \eqref{eq:conditions} here. Since this condition only pertains the feasibility, we consider the convex feasibility program
%\begin{equation}
%\begin{cases}
%\min 0\\
%A u = 0\\
%u\in C.
%\end{cases}
%\end{equation}
%Then consider the algorithm
%\begin{equation}
%u_{k+1} = P_C (u_k - \gamma A^T Au_ k ).
%\end{equation}
%To build intuition, assume that $A$ has orthonormal columns. Then, if $\gamma=1$, the algorithm above is an instance of the alternating projection algorithm. To further simplify the setup, let us assume that $C$ is the null space of a matrix $B$, namely,
%\begin{equation}
%\begin{cases}
%\min 0\\
%A u = 0\\
%B u= 0.
%\end{cases}
%\end{equation}
%Our algorithm simplifies to
%\begin{equation}
%u_{k+1} = (I - B^T B) (I - A^T A ) u_k.
%\end{equation}
%In this exposition, we assume that $\text{dim}(\text{null}(A))+\text{dim}(\text{null}(B))= d$ for simplicity. All these assumptions are for the sake of simplicity of our argument and can be relaxed.
%The convergence is dictated by the corresponding spectral norm:
%\begin{equation}
%\rho = \| (I- B^TB) (I - A^TA) \| = \cos^2(\theta_1),
%\end{equation}
%where $\theta_1$ is the smallest angle between the subspaces $\text{null}(A)$ and $\text{null}(B)$. Then note that
%\begin{equation}
%\eta_{\min} ((I- B^TB) A^T) = \eta_{\min} ((I- B^TB) A^TA ) = \sin^2(\theta_1) = 1- \cos^2(\theta_1) = 1- \rho ,
%\end{equation}
%where $\eta_{\min}$ returns the smallest singular value. The term on the far left is exactly $ \eta_{\min}$ in the condition \eqref{eq:conditions}. As $ \eta_{\min} $ reduces, the convergence rate slows down. This is also easy to visualize with two lines in the plane.
\textbf{To summarize, ignoring the objective function $h$, we can think of our algorithm as alternating projections in the convex setting. There, the convergence rate is exactly determined by the $\eta_{\min}$ in \eqref{eq:conditions}.}
\section{Proof of Lemma \ref{lem:bnd bnd Ak}}
\label{sec:proof of bnd Ak}
If $A(u_k)=b$, then \eqref{eq:bnd on Ak final} holds trivially. Otherwise, we show that $P_{T_C(u_{k+1})} DA(u_k)^\top$ is a well-conditioned matrix in order to lower bound \eqref{eq:bnd on Ak raw}.
More specifically, for the interval $K=[k_0:k_1]$, consider a subspace
\begin{align}
S_{K} \supseteq \bigcup_{k\in K} T_{C}(u_k),
\label{eq:defn of S}
\end{align}
and, with a slight abuse of notation, let $S_{K}$ also denote a matrix whose orthonormal columns form a basis for this subspace. We then assume that
\begin{align}
0 < \eta_{\min} := 
\begin{cases}
\min_{u,v} \, \left\| S_{K}^\top P_{T_C(u)} ( DA(u)^\top v ) \right\| \\
\|v\|=1\\
\|A(u)-b\|\le \rho\\
u\in C.
\end{cases}
\label{eq:new slater proof}
\end{align}
If $ \max_{k\in K} \|A(u_k)-b\| \le \rho $, then \eqref{eq:new slater proof} is in force and, for every $k\in K$, we may write that
\begin{align}
& \left\| P_{T_C(u_{k+1})}( DA(u_{k})^\top (A(u_k)-b) ) \right\| \nonumber\\
& \ge \left\| P_{T_C(u_{k+1})} ( DA(u_{k+1})^\top (A(u_k)-b) ) \right\| 
- \left\| ( DA(u_{k+1}) - DA(u_k) )^\top (A(u_k)-b) \right\|
\qquad \text{(non-expansiveness of the projection)}
\nonumber\\
& \ge \eta_{\min} \|A(u_k)-b\| - \left\| DA(u_{k+1}) - DA(u_k) \right\| \|A(u_k)-b\|
\qquad \text{(see \eqref{eq:new slater proof})}
\nonumber\\
& \ge \eta_{\min} \|A(u_k)-b\| - \lambda_A \|u_{k+1}-u_k\| \cdot \|A(u_k)-b\|
\qquad \text{(see \eqref{eq:smoothness basic})}
\nonumber\\
& = \left( \eta_{\min} -\lambda_A \gamma_{k} \|G_{k}\| \right) \|A(u_k)-b\|
\qquad \text{(see \eqref{eq:grad map recall})}
\nonumber\\
& \ge \frac{\eta_{\min}}{2} \|A(u_k)-b\|,
\label{eq:well cnd}
\end{align}
where the last line above uses the observation that
\begin{align}
\lambda_A\gamma_k\|G_k\| & = \lambda_A \|u_{k+1}-u_k\| \nonumber\\
& \le \lambda_A \text{diam}(C) \nonumber\\
& \le \frac{\eta_{\min}}{2}.
\qquad \text{(see \eqref{eq:cnd for well cnd})}
\end{align}
We can now lower bound \eqref{eq:bnd on Ak raw} by using \eqref{eq:well cnd}, namely,
\begin{align}
\frac{\eta_{\min}\beta_k}{2} \|A(u_{k})-b\|& \le \beta_{k}\| P_{T_C(u_{k+1})} ( DA(u_{k})^\top (A(u_{k})-b) )\|
\qquad \text{(see \eqref{eq:well cnd})} \nonumber\\
& \le \lambda_h' + \eta_{\max} \|y_{k-1}\|+ \|G_{k}\|,
\qquad \text{(see \eqref{eq:bnd on Ak raw})}
% \nonumber\\
%& \le \frac{ \eta_{\min}\rho}{2\sigma_{k}},
% \qquad \text{(see \eqref{eq:to be met asympt})}
%
\end{align}
%The step of the induction is proved similarly by replacing $k_0$ above with $k$.
which completes the proof of Lemma \ref{lem:bnd bnd Ak}.
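The first two inequalities in \eqref{eq:well cnd} use only generic properties of orthogonal projections. The following minimal numerical sketch (not part of the proof) checks them with random data, where $J_0,J_1$ stand in for $DA(u_k),DA(u_{k+1})$, $w$ stands in for $A(u_k)-b$, and all dimensions are hypothetical.
\begin{verbatim}
# Sanity check of the first two steps of (eq:well cnd):
#   ||P J0^T w|| >= ||P J1^T w|| - ||J1 - J0|| * ||w||
# for an orthogonal projection P, matrices J0, J1 and a vector w;
# it combines the reverse triangle inequality with non-expansiveness of P.
import numpy as np

rng = np.random.default_rng(1)
m, d, r = 4, 7, 3                                # hypothetical dimensions
for _ in range(1000):
    B = np.linalg.qr(rng.standard_normal((d, r)))[0]
    P = B @ B.T                                  # projection onto a random r-dim subspace
    J0 = rng.standard_normal((m, d))
    J1 = J0 + 0.1 * rng.standard_normal((m, d))  # a nearby Jacobian
    w = rng.standard_normal(m)
    lhs = np.linalg.norm(P @ J0.T @ w)
    gap = np.linalg.norm(J1 - J0, 2) * np.linalg.norm(w)
    rhs = np.linalg.norm(P @ J1.T @ w) - gap
    assert lhs >= rhs - 1e-10
\end{verbatim}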