
%% This is file `elsarticle-template-1-num.tex',
%%
%% Copyright 2009 Elsevier Ltd
%%
%% This file is part of the 'Elsarticle Bundle'.
%% ---------------------------------------------
%%
%% It may be distributed under the conditions of the LaTeX Project Public
%% License, either version 1.2 of this license or (at your option) any
%% later version. The latest version of this license is in
%% http://www.latex-project.org/lppl.txt
%% and version 1.2 or later is part of all distributions of LaTeX
%% version 1999/12/01 or later.
%%
%% Template article for Elsevier's document class `elsarticle'
%% with numbered style bibliographic references
%%
%% $Id: elsarticle-template-1-num.tex 149 2009-10-08 05:01:15Z rishi $
%% $URL: http://lenova.river-valley.com/svn/elsbst/trunk/elsarticle-template-1-num.tex $
%%
\documentclass[preprint,10pt]{elsarticle}
\usepackage[margin=1.0in]{geometry}
%% Use the option review to obtain double line spacing
%% \documentclass[preprint,review,12pt]{elsarticle}
%% Use the options 1p,twocolumn; 3p; 3p,twocolumn; 5p; or 5p,twocolumn
%% for a journal layout:
%% \documentclass[final,1p,times]{elsarticle}
%% \documentclass[final,1p,times,twocolumn]{elsarticle}
%% \documentclass[final,3p,times]{elsarticle}
%% \documentclass[final,3p,times,twocolumn]{elsarticle}
%% \documentclass[final,5p,times]{elsarticle}
%% \documentclass[final,5p,times,twocolumn]{elsarticle}
%% The graphicx package provides the includegraphics command.
\usepackage{graphicx}
%% The amssymb package provides various useful mathematical symbols
\usepackage{amssymb}
%% The amsthm package provides extended theorem environments
%% \usepackage{amsthm}
%% The lineno packages adds line numbers. Start line numbering with
%% \begin{linenumbers}, end it with \end{linenumbers}. Or switch it on
%% for the whole article with \linenumbers after \end{frontmatter}.
\usepackage{lineno}
\usepackage{amsmath,amsthm}
\usepackage{algorithm}
\usepackage{color}
%\usepackage{cite}
\usepackage{hyperref}
% \usepackage{hyperref}
\input{preamble.tex}
%% natbib.sty is loaded by default. However, natbib options can be
%% provided with \biboptions{...} command. Following options are
%% valid:
%% round - round parentheses are used (default)
%% square - square brackets are used [option]
%% curly - curly braces are used {option}
%% angle - angle brackets are used <option>
%% semicolon - multiple citations separated by semi-colon
%% colon - same as semicolon, an earlier confusion
%% comma - separated by comma
%% numbers- selects numerical citations
%% super - numerical citations as superscripts
%% sort - sorts multiple citations according to order in ref. list
%% sort&compress - like sort, but also compresses numerical citations
%% compress - compresses without sorting
%%
%% \biboptions{comma,round}
% \biboptions{}
\journal{Journal Name}
\begin{document}
\begin{frontmatter}
%% Title, authors and addresses
\title{Non-Adaptive Linearized Augmented Lagrangian}
%% use the tnoteref command within \title for footnotes;
%% use the tnotetext command for the associated footnote;
%% use the fnref command within \author or \address for footnotes;
%% use the fntext command for the associated footnote;
%% use the corref command within \author for corresponding author footnotes;
%% use the cortext command for the associated footnote;
%% use the ead command for the email address,
%% and the form \ead[url] for the home page:
%%
%% \title{Title\tnoteref{label1}}
%% \tnotetext[label1]{}
%% \author{Name\corref{cor1}\fnref{label2}}
%% \ead{email address}
%% \ead[url]{home page}
%% \fntext[label2]{}
%% \cortext[cor1]{}
%% \address{Address\fnref{label3}}
%% \fntext[label3]{}
%% use optional labels to link authors explicitly to addresses:
%% \author[label1,label2]{<author name>}
%% \address[label1]{<address>}
%% \address[label2]{<address>}
\author{Authors}
\address{LIONS, EPFL}
% \begin{abstract}
% %% Text of abstract
% Suspendisse potenti. Suspendisse quis sem elit, et mattis nisl. Phasellus consequat erat eu velit rhoncus non pharetra neque auctor. Phasellus eu lacus quam. Ut ipsum dolor, euismod aliquam congue sed, lobortis et orci. Mauris eget velit id arcu ultricies auctor in eget dolor. Pellentesque suscipit adipiscing sem, imperdiet laoreet dolor elementum ut. Mauris condimentum est sed velit lacinia placerat. Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia Curae; Nullam diam metus, pharetra vitae euismod sed, placerat ultrices eros. Aliquam tincidunt dapibus venenatis. In interdum tellus nec justo accumsan aliquam. Nulla sit amet massa augue.
% \end{abstract}
% \begin{keyword}
% Science \sep Publication \sep Complicated
% %% keywords here, in the form: keyword \sep keyword
% %% MSC codes here, in the form: \MSC code \sep code
% %% or \MSC[2008] code \sep code (2000 is the default)
% \end{keyword}
\end{frontmatter}
%%
%% Start line numbering here if you want
%%
% \linenumbers
%% main text
\section{Problem Formulation}
\label{S:1}
We study the following non-convex optimization program:
\begin{equation}
\label{prob:01}
\min_{u} \{h(u):~A(u) = b,~u\in \mathcal{C} \}
\end{equation}
where $h:\mathbb{R}^d\rightarrow\mathbb{R}$ and
$A:\mathbb{R}^d\rightarrow\mathbb{R}^m$ satisfy
\begin{align}
\| \nabla h(u) - \nabla h(u')\| \le \lambda_h \|u-u'\|,
\qquad
\| DA(u) - DA(u') \| \le \lambda_A \|u-u'\|,
\label{eq:smoothness basic}
\end{align}
for every $u,u'\in \mathbb{R}^d$. Above, $\nabla h(u)\in \mathbb{R}^d$ is the gradient of $h$ and $DA(u)\in \mathbb{R}^{m\times d}$ is the Jacobian of $A$.
We can equivalently write \eqref{prob:01} as the minimax program
\begin{align}
\min_{u\in C} \max_y \, \mathcal{L}_\beta(u,y),
\label{eq:minmax}
\end{align}
where the augmented Lagrangian is
\begin{align}
\label{eq:Lagrangian}
\mathcal{L}_\beta(u,y) := h(u) + \langle A(u)-b, y \rangle + \frac{1}{2\beta}\|A(u)-b\|^2.
\end{align}
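Indeed, for any $u$ with $A(u)\ne b$, the inner maximization over $y$ in \eqref{eq:minmax} evaluates to $+\infty$, whereas for $A(u)=b$ it evaluates to $h(u)$; hence \eqref{eq:minmax} and \eqref{prob:01} share the same solutions.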
\section{Preliminaries}
\begin{definition} \label{def:grad map} Given $u$ and $\gamma >0$, define
the gradient mapping
\begin{equation}
G_{\beta,\gamma}(\cdot; y)\colon u\rightarrow \frac{u-u^+}{\gamma},
\label{eq:grad map}
\end{equation}
where $u^+=P_{C}(u-\gamma \nabla \mathcal{L}_ {\beta}(u,y))$.
\end{definition}
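For example, when $C=\mathbb{R}^d$ the projection is the identity, so $u^+ = u - \gamma \nabla \mathcal{L}_\beta(u,y)$ and $G_{\beta,\gamma}(u;y) = \nabla \mathcal{L}_\beta(u,y)$. More generally, for convex $C$, $G_{\beta,\gamma}(u;y)=0$ precisely when $u$ is a first-order stationary point of $\mathcal{L}_\beta(\cdot,y)$ over $C$.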
\begin{lemma}[Descent Lemma]\label{lem:11}
{For fixed $y\in \RR^m$, suppose that $\nabla_u \mathcal{L}_{\beta}(\cdot, y)$ is $\lambda_\beta$-Lipschitz continuous. For $u\in C$ and $\gamma \in (0, 1/\lambda_\beta)$, it holds that }
\begin{equation}
\label{e:deslem}
\| G_{\beta,\gamma}(u;y)\|^{2}\leq \frac{2}{\gamma} (\mathcal{L}_\beta(u;y) - \mathcal{L}_\beta(u^+;y)),
\end{equation}
{where
\begin{align}
\lambda_\beta
& \le \lambda_h + \sqrt{m}\lambda_A \left(\|y\| + \frac{\|A(u)-b\|}{\beta} \right) + \frac{\|DA(u)\|_F^2}{\beta},
\label{eq:smoothness of Lagrangian}
\end{align}
where $DA(u)$ is the Jacobian of $A$ at $u$.
}
%where $u^+ =u^+(\mu,u) = P_{C}(u-\gamma \nabla F_{\beta}(u;y))$.
%\end{enumerate}
\end{lemma}
\begin{proof}
{
Note that \eqref{e:deslem} follows immediately from an application of \cite[Lemma 3.2, Remark 3.2(i)]{bolte2014proximal}. It only remains to compute the smoothness parameter of $\mathcal{L}_\beta(\cdot,y)$, namely, $\lambda_\beta$. To that end, note that
\begin{align}
\mathcal{L}_{\beta}(u,y) = h(u) + \sum_{i=1}^m y_i (A_i (u)-b_i) + \frac{1}{2\beta} \sum_{i=1}^m (A_i(u)-b_i)^2,
\end{align}
which implies that
\begin{align}
\nabla_u \mathcal{L}_\beta(u,y) & = \nabla h(u) + \sum_{i=1}^m y_i \nabla A_i(u) + \frac{1}{\beta} \sum_{i=1}^m (A_i(u)-b_i) \nabla A_i(u) \nonumber\\
& = \nabla h(u) + DA(u)^\top y + \frac{DA(u)^\top (A(u)-b)}{\beta},
\end{align}
where $DA(u)$ is the Jacobian of $A$ at $u$. Likewise,
\begin{align}
\nabla^2_u \mathcal{L}_\beta(u,y) & = \nabla^2 h(u) + \sum_{i=1}^m \left( y_i + \frac{A_i(u)-b_i}{\beta} \right) \nabla^2 A_i(u) + \frac{1}{\beta} \sum_{i=1}^m \nabla A_i(u) \nabla A_i(u)^\top.
\end{align}
It follows that
\begin{align}
\|\nabla_u^2 \mathcal{L}_\beta(u,y)\| & \le \| \nabla^2 h(u) \|+ \max_i \| \nabla^2 A_i(u)\| \left (\|y\|_1+\frac{\|A(u)-b\|_1}{\beta} \right) + \frac{1}{\beta} \sum_{i=1}^m \|\nabla A_i(u)\|^2 \nonumber\\
& \le \lambda_h+ \sqrt{m} \lambda_A \left (\|y\|+\frac{\|A(u)-b\|}{\beta} \right) + \frac{\|DA(u)\|^2_F}{\beta},
\qquad \left(h \in \mathbb{L}(\lambda_h),\,\, A_i \in \mathbb{L}(\lambda_A) \right)
\end{align}
and, consequently,
\begin{align}
\lambda_\beta & = \sup_u \|\nabla_u^2\mathcal{L}_\beta(u,y)\| \nonumber\\
& \le \lambda_h + \sqrt{m}\lambda_A \left(\|y\| + \frac{\|A(u)-b\|}{\beta} \right) + \frac{\|DA(u)\|_F^2}{\beta},
\end{align}
which completes the proof of Lemma \ref{lem:11}.}
\end{proof}
In practice, the Lipschitz constant $\lambda_{\beta}$ is often hard to evaluate exactly, and we might resort to the classic line search technique, reviewed below for completeness.
{
\begin{lemma}[Line search] \label{lem:eval Lipsc cte} Fix $\theta \in (0,1)$ and ${\gamma}_0$. For $\gamma'>0$, let $u^+_{\gamma'} = P_C(u - \gamma' \nabla \mathcal{L}_\beta(u,y))$ and define
\begin{equation}
\gamma :=
\max \left\{
\gamma' ={\gamma}_0 \theta^i,\ i\in\{0,1,2,\ldots\} :
\mathcal{L}_\beta (u^+_{\gamma'},y) \le \mathcal{L}_\beta (u,y) +
\left\langle u^+_{\gamma'} - u , \nabla \mathcal{L}_{\beta}(u,y) \right\rangle
+ \frac{1}{2\gamma'} \| u^+_{\gamma'} - u\|^2
\right\}.
\label{eq:defn of gamma line search}
\end{equation}
Then, (\ref{e:deslem}) holds and moreover we have that
\begin{align}
\gamma \ge \frac{\theta}{\lambda_\beta}.
\label{eq:moreover}
\end{align}
\end{lemma}
}
\begin{proof}
Since $u,u^+_\gamma \in C$, it holds that
\begin{align}
u^+_\gamma - u \in - T_C(u^+_\gamma).
\label{eq:both C feas}
\end{align}
Also, recalling $u^+_{\gamma}$ in Definition \ref{def:grad map}, we have that
\begin{equation}
u^+_{\gamma} - u +\gamma \nabla \mathcal{L}_\beta(u,y) \in -N_C(u^+_{\gamma}).
\label{eq:optimality of uplus}
\end{equation}
Lastly, $\gamma$ by definition in \eqref{eq:defn of gamma line search} satisfies
\begin{align}
& \mathcal{L}_{\beta}(u^+_{\gamma},y) \nonumber\\
& \le \mathcal{L}_\beta(u,y) + \left\langle
u^+_{\gamma} - u , \nabla \mathcal{L}_\beta (u,y)
\right\rangle + \frac{1}{2\gamma}\|u^+_{\gamma} - u\|^2 \nonumber\\
& = \mathcal{L}_\beta(u,y) + \frac{1}{\gamma} \left\langle
u^+_{\gamma} - u ,u^+_\gamma - u+ \gamma \nabla \mathcal{L}_\beta (u,y)
\right\rangle
- \frac{1}{2\gamma}\|u^+_{\gamma} - u\|^2 \nonumber\\
& \le \mathcal{L}_\beta(u,y)
- \frac{1}{2\gamma}\|u^+_{\gamma} - u\|^2
\qquad \text{(see (\ref{eq:both C feas},\ref{eq:optimality of uplus}))} \nonumber\\
& = \mathcal{L}_\beta(u,y) - \frac{\gamma}{2} \|G_{\beta,\gamma}(u,y)\|^2,
\qquad \text{(see Definition \ref{def:grad map})}
\end{align}
which proves \eqref{e:deslem}. For \eqref{eq:moreover}, note that the sufficient-decrease condition in \eqref{eq:defn of gamma line search} holds whenever $\gamma' \le 1/\lambda_\beta$, by the standard descent lemma; hence, if the returned $\gamma$ is smaller than $\gamma_0$, the rejected candidate $\gamma/\theta$ must exceed $1/\lambda_\beta$, so that $\gamma \ge \theta/\lambda_\beta$ (assuming $\gamma_0 \ge \theta/\lambda_\beta$). This completes the proof of Lemma \ref{lem:eval Lipsc cte}.
\end{proof}
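For concreteness, the following is a minimal sketch of this backtracking rule in Python; the callables \texttt{L}, \texttt{grad\_L}, and \texttt{proj\_C} (standing for $\mathcal{L}_\beta(\cdot,y)$, $\nabla_u \mathcal{L}_\beta(\cdot,y)$, and $P_C$) are placeholders for a concrete instance, and the default values shown are illustrative.
\begin{verbatim}
import numpy as np

def line_search_step(u, L, grad_L, proj_C,
                     gamma0=1.0, theta=0.5, max_backtracks=50):
    # Backtracking rule above (sketch): shrink gamma by theta until the
    # sufficient-decrease condition defining gamma is satisfied.
    g = grad_L(u)
    gamma = gamma0
    for _ in range(max_backtracks):
        u_plus = proj_C(u - gamma * g)
        d = u_plus - u
        if L(u_plus) <= L(u) + np.dot(d, g) + np.dot(d, d) / (2.0 * gamma):
            return gamma, u_plus   # accepted step size and new iterate
        gamma *= theta             # shrink and retry
    return gamma, proj_C(u - gamma * g)
\end{verbatim}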
\begin{lemma}[Non-convex Slater's condition]\label{lem:bnd bnd Ak}
For integers $k_0 \le k_1$ and $K=[k_0:k_1]$, let $S_K$ be a subspace with
\begin{align}
S_{K} \supseteq \bigcup_{k\in K} T_{C}(u_k),
\end{align}
and, with some abuse of notation, let $S_{K}$ also denote an orthonormal basis for this subspace. For $\rho>0$, suppose that there exists $\eta_{\min}$ such that
\begin{align}
0 < \eta_{\min } :=
\min \left\{ \left\| S_{K}^\top P_{T_C(u)} (DA(u)^\top v ) \right\| :\;
\|v\| =1,\;
\|A(u)-b\|\le \rho,\;
u\in C
\right\}.
\label{eq:new slater}
\end{align}
Suppose also that
\begin{align}
\sup_{k\in K }\|A(u_k)-b\| \le \rho,
\label{eq:good neighb}
\end{align}
and that
\begin{align}
\operatorname{diam}(C) \le \frac{\eta_{\min}}{2\lambda_A}.
\label{eq:cnd for well cnd}
\end{align}
Then it holds that
\begin{align}
\|A(u_k) -b\| \le \frac{2\beta_k}{\eta_{\min}} \left( \lambda'_h + \eta_{\max} \|y_{k-1}\|+ \|G_k\| \right) ,
\qquad \forall k\in K,
\label{eq:bnd on Ak final}
\end{align}
where $\lambda'_h := \max_{u\in C} \| \nabla h(u)\|$ and $\eta_{\max} := \max_{u\in C} \|DA(u)\|$ (see also \eqref{eq:defn lambdap n etaMax}).
\end{lemma}
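For instance, when $m\le d$ and $C=\mathbb{R}^d$ (so that $T_C(u)=\mathbb{R}^d$ and $S_K$ may be taken to be the identity), $\eta_{\min}$ in \eqref{eq:new slater} is the smallest singular value of $DA(u)$, minimized over the near-feasible set $\{u:\|A(u)-b\|\le \rho\}$; condition \eqref{eq:new slater} can therefore be read as a quantitative, non-convex analogue of Slater's constraint qualification.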
\begin{proof}
If $A(u_k)=b$, then \eqref{eq:bnd on Ak final} holds trivially. Otherwise, recall the subspace $S_{K}$, its orthonormal basis (also denoted by $S_K$), and the quantity $\eta_{\min}$ defined in \eqref{eq:new slater}.
Since \eqref{eq:good neighb} guarantees that $\max_{k\in K}\|A(u_k)-b\|\le \rho$, the condition in \eqref{eq:new slater} is in force and, for every $k\in K$, we may write that
\begin{align}
& \left\| P_{T_C(u_{k+1})}( DA(u_{k})^\top (A(u_k)-b) ) \right\| \nonumber\\
& \ge \left\| P_{T_C(u_{k+1})} ( DA(u_{k+1})^\top (A(u_k)-b) ) \right\| - \left\| ( DA(u_{k+1}) - DA(u_k) )^\top (A(u_k)-b) \right\|
\qquad \text{(non-expansiveness of projection)}
\nonumber\\
& \ge \eta_{\min} \|A(u_k)-b\| - \left\| DA(u_{k+1}) - DA(u_k) \right\| \|A(u_k)-b\|
\qquad \text{(see \eqref{eq:new slater})}
\nonumber\\
& \ge \eta_{\min} \|A(u_k)-b\| - \lambda_A \|u_{k+1}-u_k\| \cdot \|A(u_k)-b\|
\qquad \text{(see \eqref{eq:smoothness basic})}
\nonumber\\
& = \left( \eta_{\min} -\lambda_A \gamma_{k} \|G_{k}\| \right) \|A(u_k)-b\|
\qquad %\text{(see \eqref{eq:grad map recall})}
\nonumber\\
& \ge \frac{\eta_{\min}}{2} \|A(u_k)-b\|,
\label{eq:well cnd}
\end{align}
where the last line above uses the observation that
\begin{align}
\lambda_A\gamma_k\|G_k\| & = \lambda_A \|u_{k+1}-u_k\| \nonumber\\
& \le \lambda_A \text{diam}(C) \nonumber\\
& \le \frac{\eta_{\min}}{2}.
\qquad \text{(see \eqref{eq:cnd for well cnd})}
\end{align}
The update rule for $u_k$ immediately implies that
\begin{align}
G_k - \nabla h(u_k) - DA(u_k) ^\top y_k- \frac{1}{\beta_k} DA(u_k)^\top (A(u_k)-b) \in N_C(u_{k+1}).
\label{eq:opt cnd of update}
\end{align}
By definition in \eqref{eq:grad map}, we have that $G_k \in T_C(u_{k+1})$ which, together with \eqref{eq:opt cnd of update}, imply that
\begin{align}
G_k &
= P_{T_C(u_{k+1})} \left(
- \nabla h(u_k) - DA(u_k) ^\top y_k- \frac{1}{\beta_k} DA(u_k)^\top (A(u_k)-b)
\right)
\nonumber\\
&
= P_{T_C(u_{k+1})}(- \nabla h(u_k)) + P_{T_C(u_{k+1})}(- DA(u_k) ^\top y_k)
+ \frac{1}{\beta_k} P_{T_C(u_{k+1})} (
- DA(u_k)^\top (A(u_k)-b) ) \nonumber\\
& = P_{T_C(u_{k+1})}(- \nabla h(u_k)) + P_{T_C(u_{k+1})}(- DA(u_k) ^\top y_{k-1})
+ \left(\frac{1}{\beta_k}+ \frac{1}{\sigma_k} \right) P_{T_C(u_{k+1})} (
- DA(u_k)^\top (A(u_k)-b) ),
\end{align}
where the last line above uses
$$
y_{k} = y_{k-1} + \frac{1}{\sigma_{k}} (A(u_{k})-b).
$$
After rearranging and applying the triangle inequality above, we reach
\begin{align}
\frac{1}{\beta_k}\| P_{T_C(u_{k+1})}(DA(u_k)^\top (A(u_k)-b)) \|
& \le \left( \frac{1}{\sigma_k}+ \frac{1}{\beta_k} \right) \| P_{T_C(u_{k+1})} (DA(u_k)^\top (A(u_k)-b) ) \| \nonumber\\
& \le \|\nabla h(u_k)\| + \|DA(u_k) \| \cdot \|y_{k-1}\|+ \|G_k\| \nonumber\\
& \le \lambda '_h + \eta_{\max} \|y_{k-1}\|+ \|G_k\|,
\label{eq:bnd on Ak raw}
\end{align}
where we set
\begin{align}
\lambda'_h := \max_{u\in C} \| \nabla h(u)\|,
\qquad
\eta_{\max}= \max_{u\in C} \|DA(u)\|.
\label{eq:defn lambdap n etaMax}
\end{align}
We can now lower bound \eqref{eq:bnd on Ak raw} by using \eqref{eq:well cnd}, namely,
\begin{align}
\frac{\eta_{\min}}{2\beta_{k}} \|A(u_{k})-b\|&
\le \frac{1}{\beta_{k}}\| P_{T_C(u_{k+1})} ( DA(u_{k})^\top (A(u_{k})-b) )\|
\qquad \text{(see \eqref{eq:well cnd})}
\nonumber\\
& \le \lambda_h' + \eta_{\max} \|y_{k-1}\|+ \|G_{k}\|,
\qquad %\text{(see \eqref{eq:bnd on Ak raw})}
\end{align}
which completes the proof of Lemma \ref{lem:bnd bnd Ak}.
\end{proof}
\section{Algorithm and Convergence}
\subsection{Algorithm}
We propose the following method for solving problem \eqref{prob:01}. The main idea is to take a single projected gradient step in $u$ at every iteration, thereby avoiding the exact solution of the subproblems generated by \eqref{eq:minmax}, which is in general intractable. The formal algorithm is presented below.
\begin{algorithm}
\label{Algo:2}
Input: $\beta_0> 0$, $u_{0}\in \mathcal{C}$, $y_0 =0$.
Given $\beta_k$, choose
$\gamma_{k} \leq \frac{1-\epsilon_1}{\lambda_{h}+\lambda_{\beta_{k}}}$.\\
For $k=0,1,\ldots, n$:\\
1. $ u_{k+1} = P_{\mathcal{C} }(u_{k} - \gamma_{k} \nabla \mathcal{L}_{\beta_{k}}(u_{k},y_{k}))$.\\
2. Update the parameters
\begin{align*}
\beta_k = \frac{\beta}{{k}^\alpha} \text{ with } \alpha \leq 1/2,
\qquad
\sigma_k \geq \max(\beta \sqrt{k}, \beta k \| A(u_k) - b \|).
\end{align*}
3. $y_{k+1} = y_{k} + \frac{1}{\sigma_{k+1}} (A(u_{k+1})-b)$.\\
\caption{LinAL}
\end{algorithm}
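For illustration only, the following is a minimal Python sketch of these updates; the callables \texttt{h\_grad}, \texttt{A}, \texttt{A\_jac}, \texttt{proj\_C}, and the step-size rule \texttt{gamma} are placeholders to be supplied for a concrete instance (for example, \texttt{gamma} may implement the line search of Lemma~\ref{lem:eval Lipsc cte}).
\begin{verbatim}
import numpy as np

def linal(u0, b, h_grad, A, A_jac, proj_C, gamma,
          beta=1.0, alpha=0.5, n_iters=1000):
    # One projected gradient step on the augmented Lagrangian per iteration,
    # followed by a dual ascent step, as in the algorithm above.
    u = np.array(u0, dtype=float)
    y = np.zeros_like(np.asarray(b, dtype=float))
    for k in range(1, n_iters + 1):
        beta_k = beta / k ** alpha                 # beta_k = beta / k^alpha
        r = A(u) - b                               # residual A(u_k) - b
        J = A_jac(u)                               # Jacobian DA(u_k), shape (m, d)
        grad = h_grad(u) + J.T @ (y + r / beta_k)  # grad of L_{beta_k}(., y_k)
        u = proj_C(u - gamma(k, beta_k) * grad)    # primal projected gradient step
        r_next = A(u) - b
        sigma = max(beta * np.sqrt(k),
                    beta * k * np.linalg.norm(r_next))  # dual step size rule
        y = y + r_next / sigma                     # dual update
    return u, y
\end{verbatim}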
\subsection{Convergence}
\begin{theorem}
\label{thm:main}
Let $\gamma_k$ be the output of the line search subroutine in our algorithm in iteration $k$.
For integers $k_0\le k_1$, consider the interval $K=[k_0:k_1]$ and the following parameter updates
\begin{align}
\beta_k = \frac{\beta}{\sqrt{k}},
\qquad
%\sigma_k = \beta k,
\sigma_k = \max(\beta \sqrt{k}, \beta k \| A(u_k) - b \|),
\qquad \forall k\in K.
\end{align}
We assume that the nonconvex Slater's condition (Lemma \ref{lem:bnd bnd Ak}) holds with
\begin{align}
\rho \ge
\rho_{\operatorname{low}} (C,A,\beta) \log^{\frac{5}{2}} k_1 ,
\end{align}
where $\rho_{\operatorname{low}} (C,A,\beta)$ depends only on $C,A,\beta$ and is specified in the proof, see (\ref{eq:suff cnd 2nd time}).
Then it holds that
\begin{align}
\min_{k\in K} \|G_{\beta_k,\gamma_k}(u_k,y_k)\|^2 & = \frac{O(\log^5 k_1)}{\sqrt{k_1}},
\end{align}
\begin{align}
\min_{k\in K} \| A(u_k)-b\| & = \frac{O(\log^5 k_1)}{\sqrt{k_1}},
\end{align}
provided that $k_0 = \Omega(k_1)$ is sufficiently large and
\begin{equation}
\inf_k h(u_k) + \langle A(u_k)-b ,y_{k_0-1}\rangle > -\infty.
\end{equation}
\end{theorem}
\begin{proof}
For convenience, let us recall the updates of the algorithm in iteration $k$, namely,
\begin{align}
u_{k+1} & = P_C( u_k - \gamma_k \nabla \mathcal{L}_{\beta_k} (u_k,y_k) )
\nonumber\\
& = P_C\left( u_k - \gamma_k \nabla h(u_k) - \gamma_k DA(u_k) ^\top \left( y_k + \frac{A(u_k)-b}{\beta_k} \right) \right),
\qquad \text{(see \eqref{eq:Lagrangian})}
\label{eq:update uk recall}
\end{align}
\begin{align}
y_{k+1} =y_k + \frac{A(u_{k+1})-b}{\sigma_{k+1}},
\label{eq:y update recall}
\end{align}
\begin{equation}
G_k = G_{\beta_k,\gamma_k}(u_k,y_k) = \frac{u_k-u_{k+1}}{\gamma_k}.
\qquad \text{(see \eqref{eq:grad map})}
\label{eq:grad map recall}
\end{equation}
For integers $k_0\le k_1$, consider the interval
\begin{equation}
K = [k_0:k_1]=\{k_0,\cdots,k_1\}.
\end{equation}
Since $\gamma_k$ is determined by the line search subroutine in Lemma \ref{lem:eval Lipsc cte}, we may now apply Lemma \ref{lem:11} for every iteration in this interval to find that
\begin{align}
\frac{\gamma_k \|G_k\|^2}{2}
& \le \mathcal{L}_{\beta_k}(u_k,y_k) - \mathcal{L}_{\beta_k}(u_{k+1},y_k)
\qquad \text{(see Lemma \ref{lem:11})} \nonumber\\
& = h(u_k) - h(u_{k+1})+ \langle A(u_k)-A(u_{k+1}) , y_k \rangle+ \frac{\|A(u_k)-b\|^2 - \|A(u_{k+1})-b\|^2}{2\beta_k},
\qquad \text{(see \eqref{eq:Lagrangian})}
\label{eq:smoothness rep}
\end{align}
for every $k\in K$.
On the other hand,
\begin{align}
{y_k = y_{k_0-1} + \sum_{i=k_0}^k \frac{A(u_i)-b}{\sigma_i}},
\qquad \text{(see \eqref{eq:y update recall})}
\label{eq:y update unfolded}
\end{align}
which, after substituting in \eqref{eq:smoothness rep}, yields that
\begin{align}
\frac{\gamma_k \|G_k\|^2}{2} & \le h(u_k) - h(u_{k+1}) + \left\langle A(u_k) - A(u_{k+1}) , y_{k_0-1} + \sum_{i=k_0}^k \frac{A(u_i)-b}{\sigma_i} \right\rangle
+\frac{\|A(u_k)-b\|^2-\|A(u_{k+1})-b\|^2}{2\beta_k}.
\label{eq:key ineq}
\end{align}
%Additionally, let us assume that
%\begin{align}
%\beta_k = \frac{\beta}{\sqrt{k}},
%\qquad
%\sigma_k = \beta k,
%\qquad \forall k \in K,
%\label{eq:beta n sigma}
%\end{align}
%with $\beta>0$. For the record, the above assumptions imply that
%\begin{align}
%\frac{1}{\beta_k}- \frac{1}{\beta_{k-1}} \le \frac{1 }{\beta \sqrt{k}},
%\qquad
%\frac{1}{\sigma_k} \le \frac{1}{\beta \sqrt{k}},
%\qquad \forall k\in K,
%\label{eq:consequences}
%\end{align}
%for sufficiently large $k_0$.
{Additionally, let us assume that, for $\beta>0$,
\begin{align}
\frac{1}{\beta_k}- \frac{1}{\beta_{k-1}} \le \frac{1 }{\beta \sqrt{k}},
\qquad
\frac{1}{\sigma_k} \le \frac{1}{\beta \sqrt{k}},
\qquad \forall k\in K,
\label{eq:consequences}
\end{align}
for sufficiently large $k_0$.
For example, the following choices satisfy these requirements:
\begin{align}
\beta_k = \frac{\beta}{{k}^\alpha} \text{ with } \alpha \leq 1/2,
\qquad
\sigma_k \geq \max(\beta \sqrt{k}, \beta k \| A(u_k) - b \|),
\qquad \forall k \in K.
\label{eq:beta n sigma}
\end{align}
}
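Indeed, with the choice $\alpha = 1/2$ used below we have, for every $k\ge k_0$,
\begin{align*}
\frac{1}{\beta_k}-\frac{1}{\beta_{k-1}} = \frac{\sqrt{k}-\sqrt{k-1}}{\beta} = \frac{1}{\beta(\sqrt{k}+\sqrt{k-1})} \le \frac{1}{\beta \sqrt{k}},
\end{align*}
while $\sigma_k \ge \beta\sqrt{k}$ immediately gives $1/\sigma_k \le 1/(\beta\sqrt{k})$.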
By summing up the key inequality in \eqref{eq:key ineq} over $k$ from $k_0$ to $k_1$ and using \eqref{eq:beta n sigma}, we find that
\begin{align}
& \sum_{k=k_0}^{k_1} \frac{\gamma_k \|G_k\|^2}{2} \nonumber\\
& \le h(u_{k_0}) - h(u_{k_1+1}) + \langle A(u_{k_0}) - A(u_{k_1+1}) , y_{k_0-1}\rangle
+ \sum_{k=k_0}^{k_1} \sum_{i=k_0}^k \left\langle A(u_k) - A(u_{k+1}) , \frac{A(u_i)-b}{\sigma_i} \right\rangle
\nonumber\\
& \qquad + \sum_{k=k_0}^{k_1} \frac{\|A(u_k)-b\|^2}{2\beta_k} - \sum_{k=k_0}^{k_1} \frac{\|A(u_{k+1})-b\|^2}{2\beta_k}
\qquad \text{(see \eqref{eq:key ineq})}
\nonumber\\
& = h(u_{k_0}) - h(u_{{k_1}+1}) + \langle A(u_{k_0}) - A(u_{{k_1}+1}) , y_{k_0-1} \rangle
+ \sum_{k=k_0}^{k_1} \sum_{i=k_0}^k \left\langle A(u_k) - A(u_{k+1}) , \frac{A(u_i)-b}{\sigma_i} \right\rangle\nonumber\\
& \qquad + \sum_{k=k_0}^{k_1} \frac{\|A(u_k)-b\|^2}{2\beta_k} - \sum_{k=k_0+1}^{{k_1}+1} \frac{\|A(u_{k})-b\|^2}{2\beta_{k-1}} \nonumber\\
& = h(u_{k_0}) - h(u_{{k_1}+1}) + \langle A(u_{k_0}) - A(u_{{k_1}+1}) , y_{k_0-1} \rangle + \frac{\|A(u_{k_0})-b\|^2}{2\beta_{k_0}} + \sum_{i=k_0}^{k_1} \sum_{k=i}^{k_1} \left\langle A(u_k) - A(u_{k+1}) , \frac{A(u_i)-b}{\sigma_i} \right\rangle
\nonumber\\
& \qquad
+ \sum_{k=k_0+1}^{k_1} \left( \frac{1}{2\beta_k} - \frac{1}{2\beta_{k-1}} \right) \|A(u_k)-b\|^2 - \frac{1}{2\beta_{k_1}} \| A(u_{k_1+1})-b\|^2
\nonumber\\
& \le \mu + \sum_{i=k_0}^{k_1} \left\langle A(u_i) - A(u_{{k_1}+1}), \frac{A(u_i)-b}{\sigma_i} \right\rangle + \sum_{k=k_0}^{k_1} \left( \frac{1}{2\beta_k} - \frac{1}{2\beta_{k-1}} \right) \|A(u_k)-b\|^2 - \frac{1}{2\beta_{k_1}} \| A(u_{k_1+1})-b\|^2
\qquad \text{(see \eqref{eq:defn mu})}
\nonumber\\
& = \mu + \sum_{k=k_0}^{k_1} \left( \frac{1}{\sigma_k} +\frac{1}{2\beta_k} - \frac{1}{2\beta_{k-1}} \right) \|A(u_k)-b\|^2
- \sum_{k=k_0}^{k_1} \left \langle A(u_{{k_1}+1})-b, \frac{A(u_k)-b}{\sigma_k} \right\rangle - \frac{1}{2\beta_{k_1}} \| A(u_{k_1+1})-b\|^2
\nonumber\\
& \le\mu + \sum_{k=k_0}^{k_1} \left( \frac{1}{\sigma_k}+ \frac{1}{2\beta_k}- \frac{1}{2\beta_{k-1}} \right) \|A(u_k)-b\|^2
+
\sum_{k=k_0}^{k_1} \frac{1}{\sigma_k} \|A(u_{{k_1}+1})-b\| \|A(u_k)-b\| - \frac{1}{2\beta_{k_1}} \| A(u_{k_1+1})-b\|^2 \nonumber\\
& \le \mu + \sum_{k=k_0}^{k_1} \left( \frac{1 }{\beta \sqrt{k}}+ \frac{1}{2\beta_k}-\frac{1}{2\beta_{k-1}} \right) \|A(u_k)-b\|^2
+
\sum_{k=k_0}^{k_1} \frac{ 1}{\beta \sqrt{k}} \|A(u_{{k_1}+1})-b\| \|A(u_k)-b\| - \frac{1}{2\beta_{k_1}} \| A(u_{k_1+1})-b\|^2
\qquad \text{(see (\ref{eq:consequences}))}
\nonumber\\
& \le
\mu + \sum_{k=k_0}^{k_1} \frac{3 }{2\beta \sqrt{k}} \|A(u_k)-b\|^2 + \sum_{k=k_0}^{k_1} \frac{1}{\beta \sqrt{k}} \|A(u_{{k_1}+1})-b\| \|A(u_k)-b\| - \frac{\sqrt{k_1}}{2\beta} \| A(u_{k_1+1})-b\|^2
\qquad \text{(see (\ref{eq:consequences}))}
\nonumber\\
& \le \mu + \sum_{k=k_0}^{k_1} \left( \frac{3}{2\beta \sqrt{k}} + \frac{1}{\beta \sqrt{k} } \right) \|A(u_k)-b\|^2 + \sum_{k=k_0}^{k_1} \frac{1}{4\beta\sqrt{k}} \|A(u_{{k_1}+1})-b\|^2 - \frac{\sqrt{k_1}}{2\beta} \| A(u_{k_1+1})-b\|^2
\qquad \left( 2ab \le ca^2 + c^{-1}b^2 \right)
\nonumber\\
& \le \mu + \sum_{k=k_0}^{k_1} \frac{5 }{2\beta \sqrt{k}} \|A(u_k)-b\|^2 + \frac{\sqrt{k_1}}{2\beta} \| A(u_{k_1+1})-b\|^2 - \frac{\sqrt{k_1}}{2\beta} \| A(u_{k_1+1})-b\|^2 \qquad \left( \sum_{k=1}^{k_1} \frac{1}{\sqrt{k}} \le \int_{0}^{k_1} \frac{d\kappa }{\sqrt{\kappa}} =2 \sqrt{k_1} \right) \nonumber\\
&
\le \mu + \sum_{k=k_0}^{k_1} \frac{5 }{2\beta\sqrt{k}} \|A(u_k)-b\|^2
\label{eq:long chain}
\end{align}
where we assumed that
\begin{equation}
\mu := \sup_k h(u_{k_0}) - h(u_k) + \langle A(u_{k_0})-A(u_k) ,y_{k_0-1}\rangle + \frac{\|A(u_{k_0})-b\|^2}{2\beta_{k_0}}< \infty.
\label{eq:defn mu}
\end{equation}
%
%Let us also assume that $\{\beta_k\}_k$ is non-increasing, namely,
%\begin{equation}
%\beta_{k+1} \le \beta_k ,
%\qquad
%\forall k \ge k_0,
%\label{eq:sigma dec}
%\end{equation}
%which allows us to further simplify the last line in \eqref{eq:long chain} as
%\begin{align}
%\sum_{k=k_0}^K \frac{\gamma_k\|G_k\|^2}{2}
%& \le
%\mu + \sum_{k=k_0+1}^K \frac{\sqrt{k}+3}{2\beta_k}
% \|A(u_k)-b\|^2 +
% \sum_{k=k_0+1}^K \frac{\|A(u_{K+1})-b\|^2}{2\sqrt{k}\beta_k}
%\nonumber\\
%& \le \mu + \sum_{k=k_0+1}^K \frac{\sqrt{k}+3}{2\beta_k} \|A(u_k)-b\|^2 +
% \sum_{k=k_0+1}^K \frac{\|A(u_{K+1})-b\|^2}{2\sqrt{k}\beta_{K+1}}
%\qquad \text{(see \eqref{eq:sigma dec})}
%\nonumber\\
%& \le \mu + \sum_{k=k_0+1}^K \frac{\sqrt{k}+3}{2\beta_k} \|A(u_k)-b\|^2 +
% \frac{\sqrt{K+1}}{\beta_{K+1}} \|A(u_{K+1})-b\|^2
% \qquad \left( \sum_{k=1}^K \frac{1}{\sqrt{k}} \le \int_{0}^K \frac{d\kappa}{\sqrt{\kappa}} = 2\sqrt{K} \right) \nonumber\\
%& \le \mu + \sum_{k=k_0+1}^{K} \frac{2\sqrt{k}}{\beta_k} \|A(u_k)\|^2 + \frac{\sqrt{K+1}}{\beta_{K+1}} \|A(u_{K+1})-b\|^2 \nonumber\\
%& \le \mu + \sum_{k=k_0+1}^{K+1} \frac{2\sqrt{k}}{\beta_k} \|A(u_k)-b\|^2.
%\label{eq:raw}
%\end{align}
%We now try to formalize Volkan's intuition that gradient mapping should derive down the feasibility gap. Let us assume that
%\begin{align}
%\sum_{k=1}^{K+1} \frac{\|A_k\|^2}{\beta_k} \le \sum_{k=0}^K \gamma_k G_k^2.
%\label{eq:assumption}
%\end{align}
%And we take
%\begin{equation}
%\sigma_k = 6 \beta_k,
%\qquad \forall k.
%\end{equation}
%Then, by combining the two inequalities above, we reach
%\begin{align}
% \sum_{k=1}^{K+1} \frac{\|A_k\|^2}{\beta_k} \le 4\mu_0,
%\end{align}
%\begin{align}
%\sum_{k=0}^K \gamma_k G_k^2 \le 4\mu_0.
%\end{align}
%That is, Volkan's assumption \eqref{eq:assumption} successfully bounds both the gradient mapping and the feasibility gap. One question is the interplay between $\{\beta_k,\gamma_k\}_k$ to ensure the validity of Volkan's assumption, which feels like some sort of \emph{uncertainty principle}.
Note that \eqref{eq:long chain} bounds the gradient mapping with the feasibility gap. We next find a converse, thus bounding the feasibility gap with the gradient mapping. To that end, let $T_C(u)$ and $P_{T_C(u)}$ be the tangent cone of $C$ at $u\in C$ and the orthogonal projection onto this cone, respectively. Likewise, let $N_C(u)$ and $P_{N_{C}(u)}$ be the normal cone of $C$ at $u$ and the corresponding orthogonal projection.
Roughly speaking, \eqref{eq:bnd on Ak final} states that the feasibility gap is itself bounded by the gradient map. In order to apply Lemma \ref{lem:bnd bnd Ak}, let us assume that (\ref{eq:good neighb}) holds. Lemma \ref{lem:bnd bnd Ak} is then in force and we may now substitute \eqref{eq:bnd on Ak final} back into \eqref{eq:long chain} to find that
\begin{align}
\sum_{k=k_0}^{k_1} \gamma_k \|G_k\|^2
& \le \frac{5 }{\beta} \sum_{k=k_0}^{{k_1}} {\frac{1}{\sqrt{k}}} \|A(u_k)-b\|^2 +2 \mu
\qquad \text{(see \eqref{eq:long chain})}
\nonumber\\
& \le \frac{20 }{\beta \eta_{\min}^2} \sum_{k=k_0}^{{k_1}}
\beta_k^2 \frac{1}{\sqrt{k}} \left( \lambda_h' + \eta_{\max} \|y_{k-1}\| + \|G_k\| \right)^2+ 2\mu
\qquad \text{(see \eqref{eq:bnd on Ak final})}
\nonumber\\
& \le
\frac{20\beta}{ \eta_{\min}^2} \sum_{k=k_0}^{{k_1}}
\frac{1}{k^{3/2}} \left( \lambda_h' + \eta_{\max} \|y_{k-1}\| + \|G_k\| \right)^2+ 2\mu
\qquad \text{(see (\ref{eq:beta n sigma}))} \nonumber\\
& \le
\frac{60\beta}{ \eta_{\min}^2} \sum_{k=k_0}^{{k_1}}
\frac{1}{k^{3/2}} \left( \lambda_h'^2 + \eta_{\max}^2 \|y_{k-1}\|^2 + \|G_k\|^2 \right)+ 2\mu,
\qquad \left( (a+b+c)^2 \le 3(a^2+b^2+c^2) \right)
\label{eq:to be used for feas}
\end{align}
where in the third inequality, we used the particular choice of $\beta_k = \frac{\beta}{k^{1/2}}$ in accordance with~\eqref{eq:beta n sigma}.
%To move forward, let us assume that
%\begin{align}
%\sigma_k = o(\sqrt{k}),
%\qquad
%\frac{\sqrt{k}\sigma_k}{\gamma_k} = o(1),
%\qquad
%k^{\frac{3}{2}} \sigma_k \|y_{k-1}\|^2 =o(1).
%\label{eq:cnd on sigma gamma}
%\end{align}
To simplify the above expression, let us assume that
\begin{equation}
\frac{60\beta}{ \eta_{\min}^2 k} \le \frac{\gamma_k}{2},
\qquad \forall k\in K.
\label{eq:to be used later on}
\end{equation}
Let $|K|=k_1-k_0+1$ be the size of the interval $K$.
After rearranging \eqref{eq:to be used for feas} and applying \eqref{eq:to be used later on}, we arrive at
\begin{align}
& { \frac{|K|}{2} \cdot \min_{k \in K} \gamma_k \|{G}_{k}\|^2} \nonumber\\
& \le \sum_{k=k_0}^{k_1} \frac{\gamma_k}{2} \|G_k\|^2 \nonumber\\
& \le \sum_{k=k_0}^{k_1} \left(\gamma_k - \frac{60\beta}{\eta_{\min}^2 k^{3/2}} \right)\|G_k\|^2
\qquad \text{(see \eqref{eq:to be used later on})}
\nonumber\\
& \le \frac{60 \beta \lambda'^2_h }{\eta_{\min}^2 } \sum_{k=k_0}^{{k_1}} \frac{1}{k^{3/2}}
+ \frac{60\beta \eta_{\max}^2}{\eta_{\min}^2} \sum_{k=k_0}^{{k_1}} \frac{\|y_{k-1}\|^2 }{k^{3/2}}
%+\frac{48\beta \log(k_1+1) \|G_{k_1+1}\|^2}{\eta_{\min}^2 (k_1+1)}
+2\mu
\qquad \text{(see \eqref{eq:to be used for feas})}
\nonumber\\
% & =: \frac{48 \beta \lambda'^2_h }{\eta_{\min}^2 } \sum_{k=k_0+1}^{{k_1}+1} \frac{\log k}{k}
%+ \frac{48\beta \eta_{\max}^2}{\eta_{\min}^2} \sum_{k=k_0+1}^{{k_1}+1} \frac{\|y_{k-1}\|^2\log k}{k}
%+2\mu' \nonumber\\
% & \le \frac{60\beta \lambda'^2_h }{\eta_{\min}^2} \sum_{k=k_0}^{{k_1}} \frac{1}{k}
% +
% \frac{60\beta\eta_{\max}^2 }{\eta_{\min}^2} \sum_{k=k_0}^{{k_1}} \frac{\|y_{k-1}\|^2}{k} +2 \mu \nonumber\\
& \le \frac{120\beta \lambda'^2_h C_1}{\eta_{\min}^2}
+
\frac{60\beta\eta_{\max}^2 }{\eta_{\min}^2} \sum_{k=k_0}^{{k_1}} \frac{\|y_{k-1}\|^2}{k^{3/2}} +2 \mu \nonumber\\
& \le \frac{120\beta C_1}{\eta_{\min}^2} \left( \lambda'^2_h + \frac{\eta_{\max}^2}{2C_1}\sum_{k=k_0}^{{k_1}} \frac{\|y_{k-1}\|^2}{k^{3/2}} \right)
+
2 \mu,
\label{eq:final bnd on G}
\end{align}
which implies that
\begin{align}
\min_{k \in K} \gamma_k \|{G}_{k}\|^2 & \le
\frac{240\beta C_1}{\eta_{\min}^2|K|} \left( \lambda'^2_h + \frac{\eta_{\max}^2}{2C_1} |K|B_K \right)
+ \frac{4\mu}{|K|},
\label{eq:bnd on gamma G}
\end{align}
where
\begin{align}
B_{K} := \frac{1 }{|K|} \sum_{k=k_0}^{{k_1}} \frac{\|y_{k-1}\|^2}{k},
\qquad
C_1 := \sum_{k=k_0}^{k_1} \frac{1}{k^{3/2}},
\label{eq:defn of B}
\end{align}
and, for later reference, we also set
\begin{align}
\mu' := \frac{24\beta \log(k_1+1) \|G_{k_1+1}\|^2 }{\eta_{\min}^2 (k_1+1)} + \mu \ge \mu;
\label{eq:defn of muprime}
\end{align}
we will estimate $B_K$ and $\mu'$ later.
In turn, the bound above on the gradient mapping controls the feasibility gap, namely,
\begin{align}
& |K| \min_{k\in K} \|A({u}_k)-b\|^2 \nonumber\\
& \le \sum_{k=k_0}^{{k_1}} \|A(u_k)-b\|^2 \nonumber\\
& \le \frac{12\beta^2 \lambda'^2_h}{\eta_{\min}^2} \sum_{k=k_0}^{{k_1}} \frac{1}{k}
+ \frac{12\beta^2 \eta_{\max}^2}{\eta_{\min}^2} \sum_{k=k_0}^{{k_1}} \frac{\|y_{k-1}\|^2 }{k}
+ \sum_{k=k_0}^{{k_1}} \frac{12 \beta^2 }{\eta_{\min}^2 k} \|G_k\|^2
\qquad
\text{(see (\ref{eq:bnd on Ak final},\ref{eq:beta n sigma}))}
\nonumber\\
& \le \frac{24\beta^2 \lambda'^2_h \log ({k_1}+1) }{\eta_{\min}^2 }
+ \frac{12\beta^2 \eta_{\max}^2}{\eta_{\min}^2} \cdot |K| B_{K}
+ \frac{\beta}{10}\sum_{k=k_0}^{{k_1}} \gamma_k \|G_k\|^2
%+ \frac{12\beta^2 \log (k_1+1) \|G_{k_1+1}\|^2 }{\eta_{\min}^2 (k_1+1)}
\qquad \text{(see (\ref{eq:to be used later on},\ref{eq:defn of B}))}
\nonumber\\
& \le
\frac{24\beta^2 \log(k_1+1)}{\eta_{\min}^2} \left( \lambda'^2_h+ \frac{\eta_{\max}^2}{2} |K|B_K \right)
+
%\frac{24\beta^2 \log^2(k_1+1) }{\eta_{\min}^2} \left( \lambda'^2_h+ \eta_{\max}^2 |K|B_K \right) + \frac{\beta \mu'}{2}
\frac{24\beta^2 C_1}{\eta_{\min}^2} \left( \lambda'^2_h + \frac{\eta_{\max}^2}{2C_1}|K|B_K \right) + \frac{2\beta}{5} \mu,
%+ \frac{12\beta^2 \log (k_1+1) \|G_{k_1+1}\|^2 }{\eta_{\min}^2 (k_1+1)}
\qquad \text{(see \eqref{eq:final bnd on G})} \nonumber\\
& \le
\frac{24\beta^2 \log(k_1+1)}{\eta_{\min}^2} \left( \lambda'^2_h+ \frac{\eta_{\max}^2}{2} |K|B_K \right)
+
\frac{24\beta^2 \log (k_1+1) }{\eta_{\min}^2} \left( \lambda'^2_h+ \frac{\eta_{\max}^2}{2} |K|B_K \right) + \frac{2\beta}{5} \mu
\qquad \left( C_1 \le \log (k_1+1) \right)
\nonumber\\
& =
\frac{48\beta^2 \log(k_1+1)}{\eta_{\min}^2} \left( \lambda'^2_h+ \frac{\eta_{\max}^2}{2} |K|B_K \right) + \frac{2\beta}{5} \mu,
\label{eq:final bnd on feas gap}
\end{align}
which in turn implies that
\begin{align}
\min_{k\in K}{\|A({u}_k)-b\|^2
\le \frac{48\beta^2 \log(k_1+1)}{\eta_{\min}^2|K|} \left( \lambda'^2_h+ \frac{\eta_{\max}^2}{2} |K|B_K \right) + \frac{2\beta\mu}{5|K|}},
\label{eq:feas bnd semi final}
\end{align}
if $k_0\ge 2$.
Let us now revisit and simplify the condition imposed in (\ref{eq:good neighb}).
To that end, we first derive a weaker but uniform bound on the feasibility gap. For every $k-1\in K$, it holds that
\begin{align}
\|A(u_k)-b\|^2 & \le \sum_{i=k_0}^{k_1} \|A(u_i)-b\|^2
\qquad \left( k_0\ge 2 \right)
\nonumber\\
& \le \frac{48\beta^2 \log(k_1+1)}{\eta_{\min}^2} \left( \lambda'^2_h+ \frac{\eta_{\max}^2}{2} |K|B_K \right) + \frac{2\beta}{5} \mu.
\qquad \text{(see \eqref{eq:feas bnd semi final})}
\label{eq:rate of feas gap}
\end{align}
Therefore, we may replace \eqref{eq:good neighb} with the assumption that
\begin{equation}
\|A(u_{k_0})-b\| \le \rho,
\qquad
\frac{48\beta^2 \log(k_1+1)}{\eta_{\min}^2} \left( \lambda'^2_h+ \frac{\eta_{\max}^2}{2} |K|B_K \right) + \frac{2\beta}{5} \mu \le \rho^2,
\label{eq:suff cnd on rho}
\end{equation}
which ensures that
\begin{align}
\|A(u_k)-b\| \le \rho,
\qquad \forall k\in [k_0:k_1+1].
\label{eq:uniform feas larger interval}
\end{align}
In order to interpret (\ref{eq:final bnd on G},\ref{eq:feas bnd semi final},\ref{eq:suff cnd on rho}), we next estimate $B_{K}$ in \eqref{eq:defn of B}. To that end, let us first control the growth of the dual sequence $\{y_k\}_k$. Recalling \eqref{eq:y update recall} and for every $k\in [k_0:k_1+1]$, we write that
\begin{align}
\|y_k\| & \le \|y_{k_0}\|+ \sum_{i=k_0+1}^k \frac{\|A(u_i)-b\|}{\sigma_i}
\qquad \text{(see \eqref{eq:y update recall})}
\nonumber\\
& \le \|y_{k_0}\|+ \sum_{i=k_0+1}^k \frac{\rho}{\beta i}
\qquad \text{(see (\ref{eq:beta n sigma},\ref{eq:uniform feas larger interval}))} \nonumber\\
& \le \|y_{k_0}\|+ \frac{2\rho \log k}{\beta}.
\label{eq:dual growth}
\end{align}
With the growth of the dual sequence uncovered, we evaluate $B_{K}$ as
\begin{align}
B_{K} & = \frac{1}{|K|}\sum_{k=k_0}^{{k_1}} \frac{ \|y_{k}\|^2 }{k+1}
\qquad \text{(see \eqref{eq:defn of B})}
\nonumber\\
& \le \frac{1}{|K|} \sum_{k=k_0}^{{k_1}} \frac{1}{k+1}\left( \|y_{k_0}\|+ \frac{2\rho \log k}{\beta} \right)^2
\qquad \text{(see (\ref{eq:dual growth}))}
\nonumber\\
& \le \frac{1}{|K|} \sum_{k=k_0}^{{k_1}} \frac{2\|y_{k_0}\|^2}{k+1}+ \frac{8\rho^2 \log^2 k}{\beta^2 (k+1)}
\qquad ((a+b)^2 \le 2a^2+2b^2)
\nonumber\\
& \le \frac{4\|y_{k_0}\|^2 \log ({k_1}+1)}{|K|} + \frac{16 \rho^2 \log^3 ({k_1}+1)}{|K|} \nonumber\\
& \le \frac{16 \log^3(k_1+1)}{ |K|} \left( \|y_{k_0}\|^2+\rho^2 \right).
\label{eq:Bk evaluated}
\end{align}
In order to interpret (\ref{eq:final bnd on G},\ref{eq:feas bnd semi final},\ref{eq:suff cnd on rho}), it still remains to estimate $\mu'$ in \eqref{eq:defn of muprime}. To that end, we first derive a lower bound on the step sizes $\{\gamma_k\}_k$. To invoke \eqref{eq:moreover}, we in turn need to gauge how smooth the augmented Lagrangian $\mathcal{L}_{\beta_k}(\cdot,y_k)$ is. For every $k\in [k_0:k_1+1]$, note that
\begin{align}
\lambda_{\beta_k} & \le \lambda_h+ \sqrt{m}\lambda_A \left( \|y_k\| + \frac{\|A(u_k)-b\|}{\beta_k} \right)+ \frac{\|DA(u_k)\|_F^2}{\beta_k}
\qquad \text{(see \eqref{eq:smoothness of Lagrangian})} \nonumber\\
& \le \lambda_h+ \sqrt{m}\lambda_A \left( \|y_{k_0}\|+ \frac{2\rho \log (k_1+1)}{\beta_k} + \frac{\rho}{\beta_k} \right) + \frac{ m \eta_{\max}^2}{\beta_k}
\qquad \text{(see (\ref{eq:defn lambdap n etaMax},\ref{eq:uniform feas larger interval},\ref{eq:dual growth}))} \nonumber\\
& = \lambda_h+\sqrt{m}\lambda_A \|y_{k_0}\| + \frac{ 1}{\beta_k}(2\sqrt{m}\lambda_A\rho \log(k_1+1) + \sqrt{m}\lambda_A \rho + m\eta_{\max}^2 ) \nonumber\\
& \le \frac{ 2}{\beta_k}(2\sqrt{m}\lambda_A\rho \log(k_1+1) + \sqrt{m}\lambda_A \rho + m\eta_{\max}^2 ),
\label{eq:smoothness at k}
\end{align}
where the last line holds if $k_0$ is sufficiently large. We are now in position to invoke \eqref{eq:moreover} by writing that
\begin{align}
\gamma_k & \ge \frac{\theta}{\lambda_{\beta_k}} \qquad \text{(see \eqref{eq:moreover})}\nonumber\\
& \ge \frac{\theta \beta_k}{ 4\sqrt{m}\lambda_A\rho \log(k_1+1) + 2\sqrt{m}\lambda_A \rho + 2m\eta_{\max}^2 }
\qquad \text{(see \eqref{eq:smoothness at k})} \nonumber\\
& = \frac{\theta \beta}{ (4\sqrt{m}\lambda_A\rho \log(k_1+1) + 2\sqrt{m}\lambda_A \rho + 2m\eta_{\max}^2 ) \sqrt{k} },
\qquad \text{(see \eqref{eq:beta n sigma})}
\label{eq:low bnd on gammas}
\end{align}
for every $k\in [k_0:k_1+1]$.
In particular, the lower bound on $\gamma_{k_1+1}$ above allows us to estimate $\mu'$ by writing that
\begin{align}
\mu' & = \frac{24\beta \log(k_1+1) \|G_{k_1+1}\|^2 }{\eta_{\min}^2 (k_1+1)} + \mu
\qquad \text{(see \eqref{eq:defn of muprime})} \nonumber\\
& = \frac{24\beta \log(k_1+1) \|u_{k_1+2}-u_{k_1+1}\|^2 }{\eta_{\min}^2 (k_1+1) \gamma_{k_1+1}^2} + \mu
\qquad \text{(see \eqref{eq:grad map recall})} \nonumber\\
& \le
\frac{24\beta \log(k_1+1) \text{diam}(C)^2 }{\eta_{\min}^2 (k_1+1) \gamma_{k_1+1}^2} + \mu
\qquad \left( \{u_k\} \subset C \right)
\nonumber\\
& \le
\frac{24\beta \log(k_1+1) \text{diam}(C)^2 (4\sqrt{m}\lambda_A\rho \log(k_1+1) + 2\sqrt{m}\lambda_A \rho + 2m\eta_{\max}^2 )^2 }{\eta_{\min}^2 \theta^2 \beta^2}
+2\mu
\qquad \text{(see \eqref{eq:low bnd on gammas})}
\nonumber\\
& =: \mu''.
\label{eq:mup to mupp}
\end{align}
Having estimated $B_K$ and $\mu'$, we can rewrite the bound on the feasibility gap as
\begin{align}
\min_{k-1\in[K]}\|A(u_k)-b\|^2
& \le \frac{48\beta^2 \log^2(k_1+1)}{\eta_{\min}^2|K|} \left( \lambda'^2_h+ \eta_{\max}^2 |K|B_K \right) + \frac{\beta\mu'}{|K|}
\qquad \text{(see \eqref{eq:feas bnd semi final})} \nonumber\\
& \le \frac{48\beta^2 \log^2(k_1+1)}{\eta_{\min}^2|K|} \left( \lambda'^2_h+ 16 \eta_{\max}^2 (\|y_{k_0}\|^2 +\rho^2) \log^3(k_1+1) \right) + \frac{\beta\mu''}{|K|}
\qquad \text{(see (\ref{eq:Bk evaluated},\ref{eq:mup to mupp}))}
\end{align}
Moreover, we can simplify the assumption in \eqref{eq:suff cnd on rho}. To be specific, thanks to (\ref{eq:Bk evaluated},\ref{eq:mup to mupp}), we can replace \eqref{eq:suff cnd on rho} with the assumption
\begin{align}
\|A(u_{k_0})-b\|\le \rho,
\qquad
\frac{48\beta^2 \log^2(k_1+1)}{\eta_{\min}^2} \left( \lambda'^2_h+ 16 \eta_{\max}^2 (\|y_{k_0}\|^2+\rho^2) \log^3(k_1+1) \right) + \beta\mu'' \le \rho^2.
\label{eq:suff cnd 2nd time}
\end{align}
The lower bound on the step sizes in \eqref{eq:low bnd on gammas} has two other consequences. First, we find that \eqref{eq:to be used later on} automatically holds if $k_0$ is sufficiently large. Second, it allows us to improve \eqref{eq:bnd on gamma G} by writing that
\begin{align}
& \min_{ k\in K } \| G_k\|^2 \nonumber\\
& \le \frac{\min_{k \in K} \gamma_k \|G_k\|^2}{\min_{k\in K} \gamma_k } \nonumber\\
& \le \frac{1}{\min_{k\in K} \gamma_k}
\left( \frac{192\beta \log^2({k_1}+1)}{\eta_{\min}^2|K|} \left( \lambda'^2_h + \eta_{\max}^2 |K|B_K \right)
+ \frac{4\mu'}{|K|} \right)
\qquad \text{(see \eqref{eq:bnd on gamma G})}
\nonumber\\
& \le \frac{1}{\min_{k\in K} \gamma_k}
\left( \frac{192\beta \log^2({k_1}+1)}{\eta_{\min}^2|K|} \left( \lambda'^2_h + 16 \eta_{\max}^2 (\|y_{k_0}\|^2+\rho^2) \log^3 (k_1+1) \right)
+ \frac{4\mu''}{|K|} \right)
\qquad \text{(see (\ref{eq:Bk evaluated},\ref{eq:mup to mupp}))}
\nonumber\\
& \le \frac{ (4\sqrt{m}\lambda_A\rho \log(k_1+1) + 2\sqrt{m}\lambda_A \rho + 2m\eta_{\max}^2 ) \sqrt{k_1}}{\theta \beta}
\left( \frac{192\beta \log^2({k_1}+1)}{\eta_{\min}^2|K|} \left( \lambda'^2_h + 16 \eta_{\max}^2 (\|y_{k_0}\|^2+\rho^2) \log^3 (k_1+1) \right)
+ \frac{4\mu''}{|K|} \right)
\quad \text{(see \eqref{eq:low bnd on gammas})} \nonumber\\
& \le \frac{ 4\sqrt{m}\lambda_A\rho \log(k_1+1) + 2\sqrt{m}\lambda_A \rho + 2m\eta_{\max}^2 }{c\theta \beta \sqrt{k_1}}
\left( \frac{192\beta \log^2({k_1}+1)}{\eta_{\min}^2 } \left( \lambda'^2_h + 16 \eta_{\max}^2 (\|y_{k_0}\|^2+\rho^2) \log^3 (k_1+1) \right)
+ 4\mu'' \right),
\end{align}
where the last line holds if there exists $c>0$ for which $k_1-k_0+1 \ge ck_1 $.
\end{proof}
Note that some of the $\log$ factors in the convergence proof might be shaved off by choosing a more conservative step size for the dual variable, which automatically bounds the norm of the dual sequence. A possible choice is
\begin{align}\label{eq:yk bounded}
\sigma_k \geq \max(\beta \sqrt{k}, \beta k \log^2(k+1) \| A(u_k) - b \|),
\end{align}
which immediately yields
\begin{align}
\|y_k\| & \le \|y_{k_0}\|+ \sum_{i=k_0+1}^k \frac{\|A(u_i)-b\|}{\sigma_i}
\qquad
\nonumber\\
& \le \|y_{k_0}\|+ \sum_{i=k_0+1}^k \frac{\rho}{\beta i \log^2(i+1)}
\qquad \text{(see (\ref{eq:yk bounded}))} \nonumber\\
& \le \|y_{k_0}\|+ \frac{\rho c}{\beta} \nonumber\\
& =: y_{\max},
\label{eq:dual bounded}
\end{align}
where $c := \sum_{i\ge 2} \frac{1}{i\log^2(i+1)} < \infty$.
%% The Appendices part is started with the command \appendix;
%% appendix sections are then done as normal sections
%% \appendix
%% \section{}
%% \label{}
%% References
%%
%% Following citation commands can be used in the body text:
%% Usage of \cite is as follows:
%% \cite{key} ==>> [#]
%% \cite[chap. 2]{key} ==>> [#, chap. 2]
%% \citet{key} ==>> Author [#]
%% References with bibTeX database:
\bibliographystyle{model1-num-names}
\bibliography{sample}
%% Authors are advised to submit their bibtex database files. They are
%% requested to list a bibtex style file in the manuscript if they do
%% not want to use model1-num-names.bst.
%% References without bibTeX database:
% \begin{thebibliography}{00}
%% \bibitem must have the following form:
%% \bibitem{key}...
%%
% \bibitem{}
% \end{thebibliography}
\end{document}
%%
%% End of file `elsarticle-template-1-num.tex'.
