% The following lines are web-page artifacts from the hosting service (not part of
% the LaTeX source); they are commented out so that the file compiles.
% Page MenuHomec4science
%
% adaptive_theory.tex
% No OneTemporary
%
% File Metadata
%
% Created
% Fri, Nov 8, 05:01
%
% adaptive_theory.tex

\section{Proof of Theorem~\ref{thm:main}}
For convenience, let us recall the updates of the algorithm in iteration $k$, namely,
\begin{align}
u_{k+1}
& = P_C( u_k - \gamma_k \nabla \mathcal{L}_{\beta_k} (u_k,y_k) )
\nonumber\\
& = P_C\Big( u_k - \gamma_k \nabla h(u_k) \nonumber\\
& \qquad \qquad - \gamma_k DA(u_k) ^\top \left( y_k + \beta_k(A(u_k)-b) \right) \Big),
\qquad \text{(see \eqref{eq:Lagrangian})}
\label{eq:update uk recall}
\end{align}
\begin{align}
y_{k+1} =y_k + \sigma_{k+1}(A(u_{k+1})-b),
\label{eq:y update recall}
\end{align}
and recall also that
\begin{equation}
G_k = G_{\beta_k,\gamma_k}(u_k,y_k) = \frac{u_k-u_{k+1}}{\gamma_k}.
\qquad \text{(see \eqref{eq:grad map})}
\label{eq:grad map recall}
\end{equation}
For integers $k_0\le k_1$, consider the interval
\begin{equation}
K = [k_0:k_1]=\{k_0,\ldots,k_1\}.
\end{equation}
Since $\gamma_k$ is determined by the line search subroutine in Lemma \ref{lem:eval Lipsc cte}, we may apply Lemma \ref{lem:11} for every iteration in this interval to find that
\begin{align}
\frac{\gamma_k \|G_k\|^2}{2}
& \le \mathcal{L}_{\beta_k}(u_k,y_k) - \mathcal{L}_{\beta_k}(u_{k+1},y_k)
\qquad \text{(see Lemma \ref{lem:11})} \nonumber\\
& = h(u_k) - h(u_{k+1})+ \langle A(u_k)-A(u_{k+1}) , y_k \rangle
\nonumber\\
& \qquad + \frac{\beta_k}{2}\left(\|A(u_k)-b\|^2 - \|A(u_{k+1})-b\|^2\right),
\qquad \text{(see \eqref{eq:Lagrangian})}
\label{eq:smoothness rep}
\end{align}
for every $k\in K$.
On the other hand,
\begin{align}
y_k = y_{k_0} + \sum_{i=k_0+1}^k \sigma_i \left( A(u_i)-b \right),
\qquad \text{(see \eqref{eq:y update recall})}
\label{eq:y update unfolded}
\end{align}
which, after substituting in \eqref{eq:smoothness rep}, yields that
\begin{align}
\frac{\gamma_k \|G_k\|^2}{2} & \le h(u_k) - h(u_{k+1}) + \left\langle A(u_k) - A(u_{k+1}) , y_{k_0} + \sum_{i=k_0+1}^k \sigma_i ( A(u_i)-b) \right\rangle
\nonumber\\
& \qquad +\frac{\beta_k}{2} \left(\|A(u_k)-b\|^2-\|A(u_{k+1})-b\|^2\right).
\label{eq:key ineq}
\end{align}
By summing up the key inequality in \eqref{eq:key ineq} over $k$, we find that
\begin{align}
& \sum_{k=k_0}^{k_1} \frac{\gamma_k \|G_k\|^2}{2} \nonumber\\
& \le h(u_{k_0}) - h(u_{k_1+1}) + \langle A(u_{k_0}) - A(u_{k_1+1}) , y_{k_0-1}\rangle \nonumber\\
& \qquad + \sum_{k=k_0}^{k_1} \sum_{i=k_0+1}^k \sigma_i \left\langle A(u_k) - A(u_{k+1}) , A(u_i)-b \right\rangle
\nonumber\\
& \qquad + \sum_{k=k_0}^{k_1} \frac{\beta_k}{2} \|A(u_k)-b\|^2 - \sum_{k=k_0}^{k_1} \frac{\beta_k}{2} \|A(u_{k+1})-b\|^2
\qquad \text{(see \eqref{eq:key ineq})}
\nonumber\\
& = h(u_{k_0}) - h(u_{{k_1}+1}) + \langle A(u_{k_0}) - A(u_{{k_1}+1}) , y_{k_0-1} \rangle \nonumber\\
& \qquad + \sum_{k=k_0}^{k_1} \sum_{i=k_0+1}^k \sigma_i \left\langle A(u_k) - A(u_{k+1}) , A(u_i)-b \right\rangle\nonumber\\
& \qquad + \sum_{k=k_0}^{k_1} \frac{\beta_k}{2}\|A(u_k)-b\|^2- \sum_{k=k_0+1}^{{k_1}+1} \frac{\beta_{k-1}}{2} \|A(u_{k})-b\|^2 \nonumber\\
& = h(u_{k_0}) - h(u_{{k_1}+1}) + \langle A(u_{k_0}) - A(u_{{k_1}+1}) , y_{k_0-1} \rangle + \frac{\beta_{k_0}}{2}\|A(u_{k_0})-b\|^2 \nonumber\\
& \qquad + \sum_{i=k_0+1}^{k_1} \sum_{k=i}^{k_1} \sigma_i \left\langle A(u_k) - A(u_{k+1}) , A(u_i)-b \right\rangle
\nonumber\\
& \qquad
+ \sum_{k=k_0+1}^{k_1} \frac{\beta_k - \beta_{k-1}}{2} \|A(u_k)-b\|^2 - \frac{\beta_{k_1}}{2} \| A(u_{k_1+1})-b\|^2
\nonumber\\
& \le \mu + \sum_{i=k_0+1}^{k_1} \sigma_i \left\langle A(u_i) - A(u_{{k_1}+1}), A(u_i)-b\right\rangle \nonumber\\
& \qquad + \sum_{k=k_0+1}^{k_1} \frac{\beta_k - \beta_{k-1}}{2} \|A(u_k)-b\|^2 - \frac{\beta_{k_1}}{2} \| A(u_{k_1+1})-b\|^2
\qquad \text{(see \eqref{eq:defn mu})}
\nonumber\\
& = \mu + \sum_{k=k_0+1}^{k_1} \left( \sigma_k +\frac{\beta_k - \beta_{k-1}}{2} \right) \|A(u_k)-b\|^2 \nonumber\\
& \qquad - \sum_{k=k_0+1}^{k_1} \sigma_k \left \langle A(u_{{k_1}+1})-b, A(u_k)-b\right\rangle - \frac{\beta_{k_1}}{2} \| A(u_{k_1+1})-b\|^2
\nonumber\\
& \le\mu + \sum_{k=k_0}^{k_1} \left(\sigma_k+ \frac{\beta_k-\beta_{k-1}}{2} \right) \|A(u_k)-b\|^2
\nonumber\\
& \qquad +
\sum_{k=k_0}^{k_1} \sigma_k \|A(u_{{k_1}+1})-b\| \|A(u_k)-b\| - \frac{\beta_{k_1}}{2} \| A(u_{k_1+1})-b\|^2,
\label{eq:long chain broken}
\end{align}
where we assumed that
\begin{align}
\mu & := \sup_k \Big( h(u_{k_0}) - h(u_k) + \langle A(u_{k_0})-A(u_k) ,y_{k_0-1}\rangle \nonumber\\
& \qquad \qquad + \frac{\beta_{k_0}}{2} \|A(u_{k_0})-b\|^2 \Big) < \infty.
\label{eq:defn mu}
\end{align}
Recall that, for every $k\in K$, we set the penalty weight as
\begin{align}
\beta_k =
\begin{cases}
\beta_{k-1} & \|A(u_k)-b\| \le \frac{\beta_{k_0}}{\sqrt{k\log^{{2}} (k+1)}},\\
\\
\beta_{k-1}
\sqrt{\frac{k\log^{{2}}(k+1)}{(k-1)\log^{{2}} k}} &
\|A(u_k)-b\| > \frac{\beta_{k_0}}{\sqrt{k\log^{{2}}(k+1)}},
\end{cases}
\label{eq:beta rule}
\end{align}
for a threshold $\beta_{k_0}>0$.
For the record, the above penalty weights $\{\b_k\}_{k\ge 0}$ form a nondecreasing sequence and satisfy
\begin{align*}
\beta_{k_0}\le \beta_k \le \beta_{k_0} \sqrt{\frac{k \log^{{2}}(k+1)}{k_0\log^{{2}}(k_0+1)}},
\qquad \forall k\in K,
\end{align*}
\begin{align}
\beta_k- \beta_{k-1}
& \le \beta_{k-1} \left( \sqrt{\frac{k\log^{{2}}(k+1)}{(k-1)\log^{{2}}k}} -1 \right) \nonumber\\
& \le \beta_{k-1} \cdot \frac{k\log^{{2}}(k+1) - (k-1) \log^{{2}} k}{(k-1)\log^{{2}} k} \nonumber\\
& \le \b_{k-1} \left(
\frac{k\log^2(1+ \frac{1}{k})}{(k-1)\log^2k} + \frac{1}{k-1}
\right)
\nonumber\\
& \le \frac{2 \beta_{k-1}}{k-1}
\qquad ( k_0 \gg 1)
\nonumber\\
& \le \frac{{2}\beta_{k_0}}{k-1} \sqrt{\frac{(k-1)\log^{{2}}k}{k_0 \log^{{2}}(k_0+1)}} \nonumber\\
& = \frac{ 2\beta_{k_0} \log k}{\sqrt{(k-1)k_0} \log (k_0+1)},
\qquad \forall k \in K,
\label{eq:consequences}
\end{align}
when $k_0$ is sufficiently large.
For every $k\in K$, recall also that we set the dual step sizes as
\begin{align}
\sigma_k =
\begin{cases}
\sigma_{k-1} & \|A(u_k)-b\| \le \frac{\sigma_{k_0}}{k\log^{{2}}(k+1)},\\
\\
\sigma_{k-1} \sqrt{\frac{(k-1) \log^{{2}}k}{k \log^{{2}}(k+1)}}
&
\frac{\sigma_{k_0}}{k\log^{{2}}(k+1)} \le \|A(u_k)-b\| \le \frac{\sigma_{k_0}}{\sqrt{k\log^{{2}}(k+1)}},
\\
\\
\sigma_{\widetilde{k}-1} \frac{ \b_k-\b_{k-1}}{\beta_{\widetilde{k}}-\beta_{\widetilde{k}-1}} & \|A(u_k)-b\| > \frac{\sigma_{k_0}}{\sqrt{k\log^2(k+1)}} \text{ and } k \le k_{\max},\\
\\
\min\left( \sigma_{\widetilde{k}-1} \frac{ \b_k-\b_{k-1}}{\beta_{\widetilde{k}}-\beta_{\widetilde{k}-1}} , \frac{\|A(u_k)-b\|}{k\log^2(k+1)} \right)& \|A(u_k)-b\| > \frac{\sigma_{k_0}}{\sqrt{k\log^2(k+1)}} \text{ and } k > k_{\max},
\end{cases}
\label{eq:sigma rule}
\end{align}
for a threshold $\sigma_{k_0}\ge \beta_{k_0}$ and integer $k_{\max}$. {Above, $\widetilde{k}=\widetilde{k}(k)$ is the iteration at which the most recent transition occurs into the third update regime. Indeed, the factor depending on $\widetilde{k}$ is introduced above to ensure the continuity of the sequence $\{\sigma_k\}_{k\ge 0}$.}
%For a threshold $k_{\max}$, the third regime of \eqref{eq:sigma rule} is replaced with $\|A(u_k)-b\|/(k \log^2(k+1))$ when $k \ge k_{\max}$, namely,
For the record, a consequence of the above updates is that $\{\s_k\}_{k\ge 0}$ is a nonincreasing sequence and, in particular,
\begin{align}
\sigma_k \le \sigma_{k_0},
\qquad \forall k\in K.
\qquad \text{(see (\ref{eq:consequences},\ref{eq:sigma rule}))}
\label{eq:consequences 2}
\end{align}
Let us partition $K$ to disjoint subsets
\begin{align}
K = K_{\beta,l} \cup K_{\beta,u}.
\label{eq:partition beta}
\end{align}
More specifically, for every $k\in K_{\beta,l}$, the first update in \eqref{eq:beta rule} is in force whereas, for every $k\in K_{\beta,u}$, the second update in \eqref{eq:beta rule} is in force. Likewise, let us partition $K$ to disjoint subsets
\begin{align}
K= K_{\sigma,l} \cup K_{\sigma,m} \cup K_{\sigma,u}.
\label{eq:partition sigma}
\end{align}
For every $k\in K_{\sigma,l}$, the first update in \eqref{eq:sigma rule} is in force whereas, for every $k\in K_{\sigma,m}$, the second update in \eqref{eq:sigma rule} is in force and, for every $k\in K_{\sigma,u}$, the third or fourth update there is in force (the two differ only in whether $k\le k_{\max}$). By their definitions in (\ref{eq:beta rule},\ref{eq:sigma rule}) and the assumption that $\s_{k_0}\ge \b_{k_0}$, we observe that $K_{\beta,l}\cap K_{\s,u}=\emptyset$, namely,
\begin{equation}
\sigma_k >0, \qquad \forall k\in K.
\end{equation}
The partitioning in \eqref{eq:partition sigma} allows us to break the sum in the last line of \eqref{eq:long chain broken} to three sums.
To evaluate the sum over $K_{\sigma,l}$, we write that
\begin{align}
& \sum_{k\in K_{\sigma,l}} \left(\sigma_k+ \frac{\beta_k-\beta_{k-1}}{2} \right) \|A(u_k)-b\|^2
+
\|A(u_{{k_1}+1})-b\| \sum_{k\in K_{\sigma,l}} \sigma_k \|A(u_k)-b\|
\nonumber\\
& \le \sum_{k\in K_{\sigma,l}} \left( \sigma_{k_0}+ \frac{ {2}\beta_{k_0} \log k}{\sqrt{(k-1) k_0} \log(k_0+1)} \right) \|A(u_k)-b\|^2 \nonumber\\
& \qquad + \| A(u_{k_1+1})-b\| \sum_{k\in K_{\sigma,l}} \sigma_{k_0} \|A(u_k)-b\|
\qquad \text{(see (\ref{eq:consequences},\ref{eq:consequences 2}))}
\nonumber\\
& \le 2 \s_{k_0}\sum_{k\in K_{\sigma,l}} \|A(u_k)-b\|^2 + \| A(u_{k_1+1})-b\| \sum_{k\in K_{\sigma,l}} \sigma_{k_0} \|A(u_k)-b\|
\nonumber\\
& \le 2\s_{k_0}\sum_{k\in K_{\sigma,l}} \|A(u_k)-b\|^2 + \| A(u_{k_1+1})-b\| \sum_{k\in K_{\sigma,l}} \frac{\sigma_{k_0}^2}{k\log^2(k+1)}
\qquad \text{(see \eqref{eq:sigma rule})}
\nonumber\\
& = 2\s_{k_0} \sum_{k\in K_{\sigma,l}} \|A(u_k)-b\|^2 + c\s^2_{k_0} \| A(u_{k_1+1})-b\| ,
\label{eq:long chain broke 2}
\end{align}
where we assumed that $k_0 \gg 1$ to obtain the second inequality and
\begin{equation}
c\ge \sum_{k=1}^{\infty} \frac{1}{k\log^2(k+1)}.
\label{eq:defn of c}
\end{equation}
To evaluate the sum in the last line of \eqref{eq:long chain broken} over $K_{\s,m}$,
we write that
\begin{align}
& \sum_{k\in K_{\sigma,m}} \left(\sigma_k+ \frac{\beta_k-\beta_{k-1}}{2} \right) \|A(u_k)-b\|^2
\nonumber\\
& \qquad +
\|A(u_{{k_1}+1})-b\| \sum_{k\in K_{\sigma,m}} \sigma_k \|A(u_k)-b\|
\nonumber\\
& \le \sum_{k\in K_{\sigma,m}} \left( \sigma_{k_0}+ \frac{2\beta_{k_0} \log k}{\sqrt{(k-1) k_0} \log(k_0+1)} \right) \|A(u_k)-b\|^2
\nonumber\\
& \qquad + \| A(u_{k_1+1})-b\| \sum_{k\in K_{\sigma,m}} \sigma_{k} \|A(u_k)-b\|
\qquad \text{(see (\ref{eq:consequences},\ref{eq:consequences 2}))}
\nonumber\\
& \le 2\s_{k_0} \sum_{k\in K_{\sigma,m}} \|A(u_k)-b\|^2 + \| A(u_{k_1+1})-b\| \sum_{k\in K_{\sigma,m}} \sigma_{k} \|A(u_k)-b\|
\nonumber\\
& \le 2\s_{k_0} \sum_{k\in K_{\sigma,m}} \|A(u_k)-b\|^2 + \| A(u_{k_1+1})-b\| \sum_{k\in K_{\sigma,m}} \frac{\sigma_{k_0}\sigma_{k} }{\sqrt{k\log^2(k+1)}},
\label{eq:long chain broke 3}
\end{align}
where the second to last line assumes that $k_0\gg 1$ and the last line uses \eqref{eq:sigma rule}.
To control the last sum above, let us further write that
\begin{align}
K_{\s,m}=\cup_{j=1}^{J_{\s,m}} K_{\s,m,j},
\label{eq:another partition}
\end{align}
where $\{K_{\s,m,j}\}_{j=1}^{J_{\s,m}}$ are disjoint intervals, with the starting points $\{k_{\s,m,j}\}_{j=1}^{J_{\s,m}}$. With the partitioning in \eqref{eq:another partition}, we write that
\begin{align}
& \sum_{k\in K_{\sigma,m}} \frac{\sigma_{k_0}\sigma_k}{\sqrt{k\log^2(k+1)}}
\nonumber\\
& =
\sum_{j=1}^{J_{\s,m}} \sum_{k\in K_{\sigma,m,j}} \frac{\sigma_{k_0}\sigma_k}{\sqrt{k\log^2(k+1)}} \nonumber\\
& = \sum_{j=1}^{J_{\s,m}}
\sigma_{k_0}\sigma_{k_{\s,m,j}} \sqrt{k_{\s,m,j} \log^2(k_{\s,m,j}+1)}
\sum_{k\in K_{\sigma,m,j}} \frac{1}{k \log^2(k+1)}
\qquad \text{(see \eqref{eq:sigma rule})} \nonumber\\
& \le c \s_{k_0} \s'_{k_0},
\label{eq:long chain broke 2.99}
\end{align}
where we conveniently set
\begin{align}
\s'_{k_0} :=\sum_{j=1}^{J_{\s,m}}\sigma_{k_{\s,m,j}} \sqrt{k_{\s,m,j}} \log (k_{\s,m,j}+1).
\label{eq:defn of sigmap}
\end{align}
%\begin{remark}
%Note that our convergence bounds in particular will depend on $J_\s$, the number of transitions into the middle regime in \eqref{eq:sigma rule}.
%\end{remark}
Substituting \eqref{eq:long chain broke 2.99} back into \eqref{eq:long chain broke 3}, we reach
\begin{align}
& \sum_{k\in K_{\sigma,m}} \left(\sigma_k+ \frac{\beta_k-\beta_{k-1}}{2} \right) \|A(u_k)-b\|^2
\nonumber\\
& \qquad +
\|A(u_{{k_1}+1})-b\| \sum_{k\in K_{\sigma,m}} \sigma_k \|A(u_k)-b\|
\nonumber\\
& \le 2\s_{k_0} \sum_{k\in K_{\sigma,m}} \|A(u_k)-b\|^2 + c\s_{k_0}\sigma'_{k_0} \| A(u_{k_1+1})-b\|.
\end{align}
{Consider also the partitioning
\begin{align}
K_{\s,u} = \cup_{j=1}^{J_{\s, u}} K_{\s,u,j},
\label{eq:upper_sigma_partition}
\end{align}
where $\{K_{\s,u,j}\}_{j=1}^{J_{\s,u}}$ are disjoint intervals, with the starting points $\{\widetilde{k}_j\}_{j=1}^{J_{\s,u}}$; recall \eqref{eq:sigma rule}.} To evaluate the sum in the last line of \eqref{eq:long chain broken} over $K_{\s,u}$,
we write that
\begin{align}
&
\sum_{k\in K_{\sigma,u}} \left( \sigma_k + \frac{\beta_k-\beta_{k-1}}{2} \right) \|A(u_k)-b\|^2 \nonumber\\
& \qquad + \sum_{k\in K_{\sigma,u}} \sigma_k \|A(u_{k_1+1})-b\| \|A(u_k)-b\| - \frac{\beta_{k_1}}{2} \|A(u_{k_1+1})-b\|^2 \nonumber\\
& \le {\sum_{j=1}^{J_{\s, u}} \sum_{k\in K_{\sigma,u,j}}}
\left( \sigma_k + \frac{\beta_k-\beta_{k-1}}{2} +
{\frac{\sigma_{\widetilde{k}_j -1}\s_{k_0}}{2(\beta_{\widetilde{k}_j}-\b_{\widetilde{k}_j -1})}}\right)
\| A(u_k)-b\|^2 \nonumber\\
& \qquad + \frac{1}{2}
\left({ \sum_{j=1}^{J_{\s, u}} \sum_{k\in K_{\sigma,u,j}}
\frac{ (\beta_{\widetilde{k}_j}-\beta_{\widetilde{k}_j-1}) \sigma_k^2 }{\sigma_{\widetilde{k}_j-1} \s_{k_0} } - \beta_{k_1} }\right)
\| A(u_{k_1+1})-b\|^2.
%\qquad ( 2ab \le \theta a^2 + b^2/\theta )
\label{eq:long chain broke .99}
\end{align}
The last sum above is not positive because
\begin{align}
& { \sum_{j=1}^{J_{\s, u}} \sum_{k\in K_{\sigma,u,j}}
\frac{ (\beta_{\widetilde{k}_j}-\beta_{\widetilde{k}_j-1}) \sigma_k^2 }{\sigma_{\widetilde{k}_j-1} \s_{k_0} } - \beta_{k_1} } \nonumber\\
& = \sum_{k\in K_{\sigma,u}} (\beta_k-\beta_{k-1}) \frac{\s_k}{\s_{k_0}} - \beta_{k_1}
\qquad \text{(see \eqref{eq:sigma rule})}
\nonumber\\
& \le \sum_{k\in K_{\sigma,u}} (\beta_k-\beta_{k-1}) - \beta_{k_1}
\qquad \text{(see \eqref{eq:consequences 2})}
\nonumber\\
& \le \sum_{k=k_0}^{k_1} (\beta_k-\beta_{k-1}) - \beta_{k_1} \nonumber\\
& \le - \b_{k_0}\le 0.
\label{eq:telescope}
\end{align}
We therefore bound the last line of \eqref{eq:long chain broke .99} as
\begin{align}
&
\sum_{k\in K_{\sigma,u}} \left( \sigma_k + \frac{\beta_k-\beta_{k-1}}{2} \right) \|A(u_k)-b\|^2 \nonumber\\
& \qquad + \sum_{k\in K_{\sigma,u}} \sigma_k \|A(u_{k_1+1})-b\| \|A(u_k)-b\| - \frac{\beta_{k_1}}{2} \|A(u_{k_1+1})-b\|^2 \nonumber\\
& \le \sum_{j=1}^{J_{\s, u}} \sum_{k\in K_{\sigma,u,j}}
\Bigg(
\left( \frac{\sigma_{\widetilde{k}_j-1 } }{(\beta_{\widetilde{k}_j}-\b_{\widetilde{k}_j -1})} +\frac{1}{2} \right) (\beta_k - \beta_{k-1}) \nonumber\\
&\qquad \qquad \qquad \qquad \qquad + \frac{\sigma_{\widetilde{k}_j-1 } \s_{k_0}}{2(\beta_{\widetilde{k}_j}-\b_{\widetilde{k}_j -1})}
\Bigg)
\|A(u_k)-b\|^2
\qquad \text{(see (\ref{eq:long chain broke .99},\ref{eq:telescope}))}
\nonumber\\
& \le \sum_{j=1}^{J_{\s, u}} \sum_{k\in K_{\sigma,u,j}}
\Bigg(
\left( \frac{\sigma_{\widetilde{k}_{j}-1 }}{(\beta_{\widetilde{k}_j}-\b_{\widetilde{k}_j -1})} +\frac{1}{2} \right) \left( \frac{ 2 \beta_{k_0} \log(k+1)}{\sqrt{k k_0} \log(k_0+1)}\right) \nonumber\\
& \qquad \qquad \qquad \qquad \qquad + \frac{\sigma_{\widetilde{k}_{j}-1 }\s_{k_0}}{2(\beta_{\widetilde{k}_j}-\b_{\widetilde{k}_j -1})}
\Bigg)
\|A(u_k)-b\|^2
\qquad \text{(see \eqref{eq:consequences})}
\nonumber\\
& \le \sum_{j=1}^{J_{\s, u}} \sum_{k\in K_{\sigma,u,j}}
\frac{\sigma_{\widetilde{k}_{j}-1 } \s_{k_0}}{\beta_{\widetilde{k}_j}-\b_{\widetilde{k}_j -1}}
\|A(u_k)-b\|^2 .
\qquad (k_0 \gg 1)
\label{eq:long chain broke 1}
\end{align}
We next show that the coefficient in the last line above is bounded. Indeed, for $1\le j\le J_{\s,u}$, first note that
\begin{align}
\frac{\s_{\widetilde{k}_{j}-1}}{\beta_{\widetilde{k}_j}-\beta_{\widetilde{k}_j-1}}
& = \frac{\s_{\widetilde{k}_{j}-1}}{\left(
\sqrt{\frac{\widetilde{k}_{j} \log^2(\widetilde{k}_{j}+1)}{(\widetilde{k}_{j}-1)\log^2 \widetilde{k}_{j} }}
-1 \right) \beta_{\widetilde{k}_j-1}}
\qquad \text{(see (\ref{eq:beta rule},\ref{eq:sigma rule}))} \nonumber\\
& \le \frac{2\s_{\widetilde{k}_{j}-1}}
{\left( \frac{\widetilde{k}_j \log^2(\widetilde{k}_j+1 ) }{(\widetilde{k}_j - 1)\log^2 \widetilde{k}_j } -1 \right) \b_{\widetilde{k}_j-1}}
\qquad (k_0 \gg 1) \nonumber\\
& \le \frac{2\s_{\widetilde{k}_{j}-1}}
{\left( \frac{\widetilde{k}_j }{\widetilde{k}_j - 1 } -1 \right) \b_{\widetilde{k}_j-1}} \nonumber\\
& = \frac{2(\widetilde{k}_j -1) \s_{\widetilde{k}_{j}-1} }{ \b_{\widetilde{k}_j-1} }.
\label{eq:boundedness}
\end{align}
But the last line above is bounded. If, to the contrary, the last line above is unbounded, then the subsequences $\{\b_k,\s_k\}_{k\ge k_2}$ would never enter the second and third regimes in, respectively, (\ref{eq:beta rule},\ref{eq:sigma rule}) for a sufficiently large $k_2$ and, in that case, we must have that $\widetilde{k}_j \le k_2$ for every $j\le J_{\s,u}$. Therefore, $J_{\s,u}$ is finite, which contradicts the unboundedness of the last line of \eqref{eq:boundedness}. Let us then set
\begin{align}
\delta'_{k_0} := \max_{1\le j \le J_{\sigma,u}}
\frac{\s_{\widetilde{k}_{j}-1} \s_{k_0}} {\b_{\widetilde{k}_{{j}}}-\b_{\widetilde{k}_{{j}}-1}},
\label{eq:defn of delta}
\end{align}
and bound the last line of \eqref{eq:long chain broke 1} as
\begin{align}
& \sum_{k\in K_{\sigma,u}} \left( \sigma_k + \frac{\beta_k-\beta_{k-1}}{2} \right) \|A(u_k)-b\|^2 \nonumber\\
& \le \sum_{j=1}^{J_{\s, u}} \sum_{k\in K_{\sigma,u,j}}
\frac{\sigma_{\widetilde{k}_{j}-1 } \s_{k_0}}{\beta_{\widetilde{k}_j}-\b_{\widetilde{k}_j -1}}
\|A(u_k)-b\|^2
\qquad \text{(see \eqref{eq:long chain broke 1})} \nonumber\\
& \le \delta'_{k_0} \sum_{j=1}^{J_{\s, u}} \sum_{k\in K_{\sigma,u,j}}
\|A(u_k)-b\|^2\nonumber\\
& = \delta'_{k_0} \sum_{k\in K_{\s,u}} \|A(u_k)-b\|^2.
\label{eq:long chain broke 0}
\end{align}
By combining (\ref{eq:long chain broke 2},\ref{eq:long chain broke 3},\ref{eq:long chain broke 0}), we can finally simplify \eqref{eq:long chain broken}: When $k_0$ is sufficiently large, there exists a sufficiently large $\delta_{k_0}$, which depends only on $k_0$, such that
\begin{align}
& \sum_{k=k_0}^{k_1} \gamma_k \|G_k\|^2 \nonumber\\
& \le 2\mu + \delta_{k_0} \sum_{k=k_0}^{k_1} \|A(u_k)-b\|^2 + c\s_{k_0}(\sigma_{k_0}+\s_{k_0}') \|A(u_{k_1+1})-b\|
%\qquad
%\left( \text{(\ref{eq:long chain broke 1},\ref{eq:long chain broke 2},\ref{eq:long chain broke 3}) and } k_0 \gg 1 \right)
\nonumber\\
& =: \mu' + \delta_{k_0} \sum_{k=k_0}^{k_1} \|A(u_k)-b\|^2.
\label{eq:long chain}
\end{align}
In the last line above, we observed that
\begin{align}
\mu' := 2\mu + c
\s_{k_0}(\s_{k_0}+\s_{k_0}') \max_{u\in C} \|A(u)-b\| < \infty,
\label{eq:defn of mup}
\end{align}
thanks to the continuity of $A$ and compactness of $C$.
Note that \eqref{eq:long chain} bounds the gradient mapping with the feasibility gap. We next find a converse, thus bounding the feasibility gap with the gradient mapping. To that end, let $T_C(u)$ and $P_{T_C(u)}$ be the tangent cone of $C$ at $u\in C$ and orthogonal projection onto this subspace, respectively. Likewise, let $N_C(u)$ and $P_{N_{C}(u)}$ be the normal cone of $C$ at $u$ and the corresponding orthogonal projection.
The update rule for $u_k$ in \eqref{eq:update uk recall} immediately implies that
\begin{align}
G_k - \nabla h(u_k) - DA(u_k) ^\top y_k- \beta_k DA(u_k)^\top (A(u_k)-b) \in N_C(u_{k+1}).
\label{eq:opt cnd of update}
\end{align}
By definition in \eqref{eq:grad map recall}, we have that $G_k \in T_C(u_{k+1})$ which, together with \eqref{eq:opt cnd of update}, imply that
\begin{align}
G_k &
= P_{T_C(u_{k+1})} \left(
- \nabla h(u_k) - DA(u_k) ^\top y_k- \beta_k DA(u_k)^\top (A(u_k)-b)
\right)
\nonumber\\
&
= P_{T_C(u_{k+1})}(- \nabla h(u_k)) + P_{T_C(u_{k+1})}(- DA(u_k) ^\top y_k)
\nonumber\\
& \qquad + \beta_k P_{T_C(u_{k+1})} (
- DA(u_k)^\top (A(u_k)-b) ) \nonumber\\
& = P_{T_C(u_{k+1})}(- \nabla h(u_k)) + P_{T_C(u_{k+1})}(- DA(u_k) ^\top y_{k-1})
\nonumber\\
& \qquad + \left(\beta_k+ \sigma_k \right) P_{T_C(u_{k+1})} (
- DA(u_k)^\top (A(u_k)-b) ).
\label{eq:geometric decopm of Gk}
\end{align}
In the second line above, we used the identity $P_T(a+b) = P_T(a)+P_T(b)$, for points $a,b$ and convex cone $T$. The last line above uses \eqref{eq:y update recall}.
After rearranging and applying the triangle inequality to the last line above, we reach
\begin{align}
& \beta_k\| P_{T_C(u_{k+1})}(DA(u_k)^\top (A(u_k)-b)) \| \nonumber\\
& \le \left( \sigma_k+ \beta_k \right) \| P_{T_C(u_{k+1})} (DA(u_k)^\top (A(u_k)-b) ) \|
\nonumber\\
& \le \|\nabla h(u_k)\| + \|DA(u_k) \| \cdot \|y_{k-1}\|+ \|G_k\|
\qquad \text{(see \eqref{eq:geometric decopm of Gk})}
\nonumber\\
& \le \lambda '_h + \eta_{\max} \|y_{k-1}\|+ \|G_k\|,
\label{eq:bnd on Ak raw}
\end{align}
where we used the non-expansiveness of $P_{T_C(u_{k+1})}$ and also set
\begin{align}
\lambda'_h := \max_{u\in C} \| \nabla h(u)\|,
\qquad
\eta_{\max} := \max_{u\in C} \|DA(u)\|.
\label{eq:defn lambdap n etaMax}
\end{align}
The following result, proved in Appendix \ref{sec:proof of bnd Ak}, translates \eqref{eq:bnd on Ak raw} into an upper bound on $\|A(u_k)-b\|$.
\begin{lemma}\label{lem:bnd bnd Ak}
%Suppose that $C$ is sufficiently smooth, in the sense that there exists a constant $\tau_C$ such that
%\begin{equation}
%\|P_{T_C(u)} - P_{T_C(u')}\| \le \tau_C \|u-u'\|,
%\qquad \forall u,u'\in C.
%\label{eq:curvature}
%\end{equation}
For an integer $k_0$, consider a subspace $S_K\subseteq \mathbb{R}^d$ such that
\begin{align}
S_{K} \supseteq \bigcup_{k\in K} T_{C}(u_k),
\end{align}
and, with some abuse of notation, let $S_{K}$ also denote an orthonormal basis for this subspace. For $\rho>0$, let
\begin{align}
\eta_{\min } :=
\begin{cases}
\min_{u,v} \, \left\| S_{K}^\top P_{T_C(u)} (DA(u)^\top v ) \right\| \\
\|v\| =1\\
\|A(u)-b\|\le \rho\\
u\in C,
\end{cases}
\label{eq:new slater}
\end{align}
and assume that $\eta_{\min}>0$.
Suppose also that
\begin{align}
\sup_{k\in K }\|A(u_k)-b\| \le \rho,
\label{eq:good neighb}
\end{align}
\begin{align}
\operatorname{diam}(C) \le \frac{\eta_{\min}}{2\lambda_A}.
\label{eq:cnd for well cnd}
\end{align}
Then it holds that
\begin{align}
\|A(u_k) -b\| \le \frac{2}{\eta_{\min}\beta_k} \left( \lambda'_h + \eta_{\max} \|y_{k-1}\|+ \|G_k\| \right) ,
\qquad \forall k\in K.
\label{eq:bnd on Ak final}
\end{align}
\end{lemma}
Roughly speaking, \eqref{eq:bnd on Ak final} states that the feasibility gap is itself bounded by the gradient map. In order to apply Lemma \ref{lem:bnd bnd Ak}, let us assume for now that (\ref{eq:good neighb}) holds. Recall also the partitioning of $K$ in \eqref{eq:partition beta}.
We may now substitute \eqref{eq:bnd on Ak final} back into \eqref{eq:long chain} to find that
\begin{align}
& \sum_{k=k_0}^{k_1} \gamma_k \|G_k\|^2 \nonumber\\
& \le \sum_{k=k_0}^{{k_1}} \delta_{k_0} \|A(u_k)-b\|^2 + \mu'
\qquad \text{(see \eqref{eq:long chain})}
\nonumber\\
& = \sum_{k\in K_{\beta,l}} \delta_{k_0} \|A(u_k)-b\|^2 + \sum_{k\in K_{\beta,u}} \delta_{k_0}\|A(u_k)-b\|^2
+ \mu' \nonumber\\
& \le
\sum_{k\in K_{\beta,l}} \frac{\b_{k_0}\delta_{k_0}}{ k\log^2 (k+1)}
+ \sum_{k\in K_{\beta,u}} \delta_{k_0} \|A(u_k)-b\|^2
+\mu'
\qquad \text{(see \eqref{eq:beta rule})}
\nonumber\\
& \le
\sum_{k=k_0}^{k_1} \frac{\b_{k_0}\delta_{k_0}}{ k\log^2 (k+1)}
+ \sum_{k\in K_{\beta,u}}\delta_{k_0} \|A(u_k)-b\|^2
+\mu'
\nonumber\\
& \le
c\b_{k_0}\delta_{k_0}
+ \sum_{k\in K_{\beta,u}}
\frac{4\delta_{k_0}}{\eta_{\min}^2\b_k^2} \left( \lambda_h' + \eta_{\max} \|y_{k-1}\| + \|G_k\| \right)^2
+\mu' ,
\label{eq:adaptive 1}
\end{align}
where the last line above uses (\ref{eq:defn of c},\ref{eq:bnd on Ak final}).
Consider the partitioning
\begin{align}
K_{\beta,u} = \cup_{j=1}^{J_\b} K_{\beta,u,j},
\label{eq:partition on beta}
\end{align}
where $\{K_{\beta,u,j}\}_{j=1}^{J_{\b}}$ are disjoint intervals, with the starting points $\{k_{\beta,u,j}\}_{j=1}^{J_{\b}}$. With the partitioning in \eqref{eq:partition on beta} at hand and after recalling \eqref{eq:beta rule}, we simplify the last line of \eqref{eq:adaptive 1} as
\begin{align}
& \sum_{k=k_0}^{k_1} \gamma_k \|G_k\|^2 \nonumber\\
& \le \sum_{k=k_0}^{k_1} \delta_{k_0}\|A(u_k)-b\|^2
+ \mu'\nonumber\\
& \le c\b_{k_0}\delta_{k_0} + \sum_{j=1}^{J_\b}
\frac{4\delta_{k_0} k_{\b,u,j} \log^2(k_{\b,u,j}+1)}{\eta_{\min}^2\b^2_{k_{\b,u,j}}}
\nonumber\\
& \qquad \qquad\qquad \cdot \sum_{k\in K_{\beta,u,j}} \frac{1}{k\log^2(k+1)}
\left( \lambda_h' + \eta_{\max} \|y_{k-1}\| + \|G_k\| \right)^2
+ \mu',
\label{eq:adaptive 2}
\end{align}
where the last line uses \eqref{eq:consequences}.
The inner sum in the last line above can in turn be bounded as
\begin{align}
& \sum_{k\in K_{\beta,u,j}} \frac{1}{k\log^2(k+1)}
\left( \lambda_h' + \eta_{\max} \|y_{k-1}\| + \|G_k\| \right)^2 \nonumber\\
& \le \sum_{k\in K_{\beta,u,j}} \frac{3}{k\log^2(k+1)}
\left( \lambda_h'^2 + \eta_{\max}^2 \|y_{k-1}\|^2 + \|G_k\|^2 \right)\nonumber\\
& \le 3c\lambda_h'^2 + 3 \eta_{\max}^2 B_K + \sum_{k\in K_{\beta,u,j}} \frac{3\|G_k\|^2}{k\log^2(k+1)},
\qquad \text{(see \eqref{eq:defn of c})}
\label{eq:adaptive 3}
\end{align}
where the second line above uses the inequality $(a+b+c)^2 \le 3a^2+3b^2+3c^2$ and we conveniently set
\begin{align}
B_K := \sum_{k=k_0}^{k_1} \frac{\|y_{k-1}\|^2}{k \log^2(k+1)}.
\label{eq:defn of B}
\end{align}
Substituting \eqref{eq:adaptive 3} back into \eqref{eq:adaptive 2}, we find that
\begin{align}
& \sum_{k=k_0}^{k_1} \gamma_k \|G_k\|^2 \nonumber\\
& \le\sum_{k=k_0}^{k_1} \delta_{k_0} \|A(u_k)-b\|^2
+ \mu'
\nonumber\\
& \le c\b_{k_0}\delta_{k_0}+
\frac{12 c \b'_{k_0}\delta_{k_0} \lambda'^2_h}{\eta_{\min}^2}
+
\frac{12 \b'_{k_0}\delta_{k_0} \eta_{\max}^2 B_K}{\eta_{\min}^2} \nonumber\\
& \qquad +
\sum_{k\in K} \frac{12 \beta_{k_0}' \delta_{k_0} \|G_k\|^2}{ \eta_{\min}^2 k\log^2(k+1)}
+
\mu',
\qquad \text{(see \eqref{eq:adaptive 2})}
\label{eq:to be used for feas}
\end{align}
where
\begin{align}
\beta'_{k_0} := \sum_{j=1}^{J_\b} \frac{k_{\b,u,j} \log^2(k_{\b,u,j}+1)}{ \beta_{k_{\b,u,j}}^2}.
\label{eq:defn of kappa}
\end{align}
Indeed, $\b'_{k_0}$ above is finite (and depends only on $k_0$). If, to the contrary, the series on the right-hand side of \eqref{eq:defn of kappa} diverges, then there must exist a large enough $k_2$ such that the subsequence $\{\b_{k} \}_{k\ge k_2}$ remains in the first regime of \eqref{eq:beta rule}. Therefore, $J_\b$ must be finite and so is the right-hand side of \eqref{eq:defn of kappa}, thus contradicting the unboundedness of $\b'_{k_0}$.
To simplify the last line of \eqref{eq:to be used for feas}, let us assume that
\begin{equation}
\frac{12 \beta_{k_0}'\delta_{k_0}}{\eta_{\min}^2 k \log^2(k+1)} \le \frac{\gamma_k}{2},
\qquad \forall k\in K.
\label{eq:to be used later on}
\end{equation}
We also let $|K|=k_1-k_0+1$ denote the size of the interval $K$.
After rearranging \eqref{eq:to be used for feas} and applying \eqref{eq:to be used later on}, we arrive at
\begin{align}
& \sum_{k=k_0}^{k_1} \frac{\gamma_k}{2} \|G_k\|^2 \nonumber\\
& \le
\sum_{k=k_0}^{k_1} \left(\gamma_k - \frac{12 \beta_{k_0}'\delta_{k_0}}{\eta_{\min}^2 k \log^2(k+1)} \right)\|G_k\|^2
\qquad \text{(see \eqref{eq:to be used later on})}
\nonumber\\
& \le c\b_{k_0}\delta_{k_0} +
\frac{12 c \b'_{k_0}\delta_{k_0} \lambda'^2_h}{\eta_{\min}^2}
+
\frac{12 \b'_{k_0}\delta_{k_0} \eta_{\max}^2 B_K}{\eta_{\min}^2}
+\mu'.
\qquad \text{(see \eqref{eq:to be used for feas})}
\label{eq:final bnd on G}
\end{align}
In turn, the bound above on the gradient mapping controls the feasibility gap, namely,
\begin{align}
& \sum_{k=k_0}^{k_1} \|A(u_k)-b\|^2 \nonumber\\
& \le c\b_{k_0} + \frac{12c \b_{k_0}' \lambda'^2_h}{\eta_{\min}^2} + \frac{12\b_{k_0}' \eta_{\max}^2 B_K}{\eta_{\min}^2}
+ \sum_{k\in K} \frac{12\b_{k_0}' \|G_k\|^2}{\eta_{\min}^2 k \log^2(k+1)}
\qquad \text{(see \eqref{eq:to be used for feas})}
\nonumber\\
& \le c\b_{k_0} + \frac{12c \b_{k_0}' \lambda'^2_h}{\eta_{\min}^2} + \frac{12\b_{k_0}' \eta_{\max}^2 B_K}{\eta_{\min}^2}
+ \frac{1}{2\delta_{k_0}} \sum_{k\in K} \gamma_k \|G_k\|^2
\qquad \text{(see \eqref{eq:to be used later on})}
\nonumber\\
& \le 2c\b_{k_0} + \frac{24c \b_{k_0}' \lambda'^2_h}{\eta_{\min}^2} + \frac{24\b_{k_0}' \eta_{\max}^2 B_K}{\eta_{\min}^2}
+ \frac{\mu'}{\delta_{k_0}}.
\qquad \text{(see \eqref{eq:final bnd on G})}
\label{eq:feas bnd semi final}
\end{align}
%\subsection{Estimating $B_K$}
In order to interpret (\ref{eq:final bnd on G},\ref{eq:feas bnd semi final}), we next estimate $B_{K}$, as defined in \eqref{eq:defn of B}. To that end, let us first control the growth of the dual sequence, namely, $\{y_k\}_k$. Recalling \eqref{eq:y update recall} and for every $k\in K$, we write that
\begin{align}
\|y_k\| & \le \|y_{k_0}\|+ \sum_{i=k_0+1}^k \sigma_i \|A(u_i)-b\|
\qquad \text{(see \eqref{eq:y update recall})}
\nonumber\\
& = \|y_{k_0}\|+ \sum_{i\in K_{\sigma,l}} \sigma_i \|A(u_i)-b\|
+ \sum_{i\in K_{\sigma,m}} \sigma_i \|A(u_i)-b\|
\nonumber\\
& \qquad + \sum_{i\in K_{\sigma,u}} \sigma_i \|A(u_i)-b\|
\qquad \text{(see \eqref{eq:partition sigma})}
\nonumber\\
& \le \|y_{k_0}\|+ \sum_{k\in K_{\s,l}} \frac{\s_{k_0}^2 }{k\log^2(k+1)}
+ \sum_{i\in K_{\sigma,m}} \sigma_i \|A(u_i)-b\| \nonumber\\
& \qquad + \sum_{i\in K_{\sigma,u}} \sigma_i \|A(u_i)-b\|
\qquad \text{(see \eqref{eq:sigma rule})}
\nonumber\\
& \le \|y_{k_0}\|+c\s_{k_0}^2
+ \sum_{i\in K_{\sigma,m}} \sigma_i \|A(u_i)-b\| \nonumber\\
& \qquad + \sum_{i\in K_{\sigma,u}} \sigma_i \|A(u_i)-b\|.
\qquad \text{(see \eqref{eq:defn of c})}
\label{eq:dual growth 1}
\end{align}
To evaluate the sum over $K_{\s,m}$ in the last line above, recall the partitioning $K_{\s,m}=\cup_{j=1}^{J_{\s,m}} K_{\s,m,j}$ in \eqref{eq:another partition}. Recall also that each $K_{\s,m,j}$ is an interval with the starting point $k_{\s,m,j}$. This allows us to write that
\begin{align}
& \sum_{i\in K_{\s,m}} \s_i \|A(u_i)-b\| \nonumber\\
& = \sum_{j=1}^{J_{\s,m}} \sum_{i\in K_{\s,m,j}} \s_i \|A(u_i)-b\|
\qquad \text{(see \eqref{eq:another partition})}
\nonumber\\
& \le
\sum_{j=1}^{J_{\s,m}} \sum_{i\in K_{\s,m,j}} \frac{\s_{k_0}\s_i }{\sqrt{i\log^2(i+1)} }
\qquad \text{(see \eqref{eq:sigma rule})}
\nonumber\\
& = \sum_{j=1}^{J_{\s,m}} \s_{k_0}\s_{k_{\s,m,j}} \sqrt{k_{\s,m,j}} \log(k_{\s,m,j}+1) \sum_{i\in K_{\s,m,j}} \frac{1}{i\log^2(i+1)}
\qquad \text{(see \eqref{eq:sigma rule})}
\qquad \text{(see \eqref{eq:sigma rule})}
\nonumber\\
& \le c \s_{k_0}\s'_{k_0}.
\qquad \text{(see (\ref{eq:defn of c},\ref{eq:defn of sigmap}))}
\label{eq:dual growth 2}
\end{align}
To bound the last sum in \eqref{eq:dual growth 1}, we write that
\begin{align}
& \sum_{i\in K_{\s,u}} \s_i \|A(u_i) - b\|\nonumber\\
& = \sum_{i\in K_{\s,u},\, i \le k_{\max}} \s_i \|A(u_i) - b\| + \sum_{i\in K_{\s,u},\, i > k_{\max}} \s_i \|A(u_i) - b\| \nonumber\\
& \le \sum_{i\in K_{\s,u},\, i \le k_{\max}} \s_i \|A(u_i) - b\|
+ \sum_{i\in K_{\s,u},\, i > k_{\max}} \frac{1}{i \log^2(i+1)} \nonumber\\
& \le \sum_{i\in K_{\s,u},\, i \le k_{\max}} \s_i \|A(u_i) - b\| + c,
\label{eq:dual growth 2.5}
\end{align}
where the last line follows from the updates. To control the remaining sum in the last line above, we write that
\begin{align}
& \sum_{i\in K_{\s,u},\, i\le k_{\max}} \s_i \|A(u_i)-b\|
\nonumber\\ &
\le \frac{\delta'_{k_0}}{\s_{k_0}} \sum_{i\in K_{\s,u},\, i\le k_{\max}} (\b_i- \b_{i-1})\|A(u_i)-b\|
\qquad \text{(see (\ref{eq:sigma rule},\ref{eq:defn of delta}))}
\nonumber\\
& \le \frac{\delta'_{k_0} \rho }{\s_{k_0}} \sum_{i\in K_{\s,u},\, i\le k_{\max}} (\b_i- \b_{i-1})
\qquad \text{(see \eqref{eq:good neighb})} \nonumber\\
&\le \frac{\delta'_{k_0} \rho }{\s_{k_0}} \sum_{i\in K_{\s,u},\, i\le k_{\max}} \frac{2\b_{k_0} \log i}{\sqrt{(i-1)k_0} \log(k_0+1)}
\qquad \text{(see \eqref{eq:consequences})}\nonumber\\
& \le
\frac{4 \b_{k_0} \delta'_{k_0}\rho \log k_{\max} }{\s_{k_0} } \sqrt{\frac{k_{\max}}{k_0}}.
\qquad \left( \sum_{i=1}^{k} \frac{1}{\sqrt{i}} \le \int_{0}^{k} \frac{dx}{\sqrt{x}} = 2 \sqrt{k} \right)
\label{eq:dual growth 3}
\end{align}
By combining (\ref{eq:dual growth 2.5},\ref{eq:dual growth 3}), we reach
\begin{align}
& \sum_{i\in K_{\s,u}} \s_i \|A(u_i) - b\|\nonumber\\
& \le \frac{4 \b_{k_0} \delta'_{k_0}\rho \log k_{\max} }{\s_{k_0} } \sqrt{\frac{k_{\max}}{k_0}}+ c.
\end{align}
Finally, by combining (\ref{eq:dual growth 1},\ref{eq:dual growth 2},\ref{eq:dual growth 2.5},\ref{eq:dual growth 3}), we find that
\begin{align}
\|y_k\| & \le \|y_{k_0}\|+ c \s_{k_0}(\s_{k_0}+\s'_{k_0}) \nonumber\\
& \qquad + \frac{4 \b_{k_0} \delta'_{k_0}\rho \log k_{\max} }{\s_{k_0} } \sqrt{\frac{k_{\max}}{k_0}} +c =: y_{\max},
\label{eq:dual growth}
\end{align}
for every $k\in K$.
Equipped with \eqref{eq:dual growth}, we can now finally evaluate $B_{K}$ as
\begin{align}
B_{K} & = \sum_{k=k_0-1}^{{k_1-1}} \frac{ \|y_{k}\|^2 }{(k+1)\log^2( k+2)}
\qquad \text{(see \eqref{eq:defn of B})}
\nonumber\\
& \le \sum_{k=k_0-1}^{k_1-1} \frac{y_{\max}^2}{(k+1)\log^2(k+2)}
\qquad \text{(see \eqref{eq:dual growth})}
\nonumber\\
& \le c y_{\max}^2.
\qquad \text{(see \eqref{eq:defn of c})}
\label{eq:Bk evaluated}
\end{align}
Let us now revisit and simplify the condition imposed in (\ref{eq:good neighb}).
To that end, we first derive a weaker but uniform bound on the feasibility gap. For every $k\in K$, it indeed holds that
\begin{align}
& \|A(u_k)-b\|^2 \nonumber\\
& \le \sum_{i=k_0}^{k_1} \|A(u_i)-b\|^2
\nonumber\\
& \le 2c\b_{k_0} + \frac{24c \b_{k_0}' \lambda'^2_h}{\eta_{\min}^2} + \frac{24\b_{k_0}' \eta_{\max}^2 B_K}{\eta_{\min}^2}
+ \frac{\mu'}{\delta_{k_0}}
\qquad \text{(see \eqref{eq:feas bnd semi final})} \nonumber\\
& \le 2c\b_{k_0} + \frac{24c \b_{k_0}' \lambda'^2_h}{\eta_{\min}^2} + \frac{24c\b_{k_0}' \eta_{\max}^2 y_{\max}^2}{\eta_{\min}^2}
+ \frac{\mu'}{\delta_{k_0}}.
\qquad \text{(see \eqref{eq:Bk evaluated})}
\label{eq:rate of feas gap}
\end{align}
We may therefore replace \eqref{eq:good neighb} with the assumption that
\begin{equation}
2c\b_{k_0} + \frac{24c \b_{k_0}' \lambda'^2_h}{\eta_{\min}^2} + \frac{24c\b_{k_0}' \eta_{\max}^2 y_{\max}^2}{\eta_{\min}^2}
+ \frac{\mu'}{\delta_{k_0}}\le \rho^2,
\label{eq:suff cnd on rho}
\end{equation}
which ensures that \eqref{eq:good neighb} holds. We emphasize that the left-hand side above is bounded, as proved in (\ref{eq:long chain},\ref{eq:defn of kappa}).
\begin{remark}
Note that \eqref{eq:suff cnd on rho} controls the initial feasibility $\|A(u_{k_0})-b\|$, which indeed appears in (\ref{eq:defn mu},\ref{eq:defn of mup}).
\end{remark}
By combining (\ref{eq:final bnd on G},\ref{eq:feas bnd semi final},\ref{eq:Bk evaluated}), we reach
\begin{align}
& \sum_{k=k_0}^{k_1} \gamma_k \|G_k\|^2 + \|A(u_k)-b\|^2 \nonumber\\
&\le 2c\b_{k_0}(\delta_{k_0}+1)
+
\frac{24\b_{k_0}'\lambda'^2_h}{\eta_{\min}^2}(\delta_{k_0}+1)
\nonumber\\
& \qquad \qquad +
\frac{24\b_{k_0}'\eta_{\max}^2 B_K}{\eta_{\min}^2}(\delta_{k_0}+1)
+\frac{\mu' }{\delta_{k_0}}\left( 2\delta_{k_0}+1 \right)
\nonumber\\
& \le 2c\b_{k_0}(\delta_{k_0}+1)
+
\frac{24\b_{k_0}'\lambda'^2_h}{\eta_{\min}^2}(\delta_{k_0}+1)
\nonumber\\
& \qquad \qquad +
\frac{24c\b_{k_0}'\eta_{\max}^2 y_{\max}^2}{\eta_{\min}^2}(\delta_{k_0}+1)
+\frac{\mu' }{\delta_{k_0}}\left( 2\delta_{k_0}+1 \right),
\label{eq:raw bnd on sum}
\end{align}
where the last line above uses \eqref{eq:Bk evaluated}.
To better interpret \eqref{eq:raw bnd on sum}, we next find a lower bound on the primal step sizes $\{\gamma_k\}_k$ by invoking \eqref{eq:moreover}. To do so, we first need to gauge how smooth the augmented Lagrangian $\mathcal{L}_{\beta_k}(\cdot,y_k)$ is (as a function of its first argument). For every $k\in K$ and after recalling Lemma \ref{lem:11}, we note that
\begin{align}
\lambda_{\beta_k}
& \le \lambda_h+ \sqrt{m}\lambda_A \left( \|y_k\| + \beta_k \|A(u_k)-b\| \right)+ \beta_k \|DA(u_k)\|_F^2
\qquad \text{(see \eqref{eq:smoothness of Lagrangian})} \nonumber\\
& \le \lambda_h+ \sqrt{m}\lambda_A \left( y_{\max} + \beta_k \rho \right) +
\beta_k m \eta_{\max}^2
\qquad \text{(see (\ref{eq:defn lambdap n etaMax},\ref{eq:good neighb},\ref{eq:dual growth}))} \nonumber\\
& = \lambda_h+\sqrt{m}\lambda_A y_{\max} +
\beta_k \left( \sqrt{m}\lambda_A \rho + m \eta_{\max}^2 \right)
\nonumber\\
& \le \lambda_h + \sqrt{m}\lambda_A y_{\max} + \b_{k_0} \sqrt{\frac{k \log^2(k+1)}{k_0 \log^2(k_0+1)}} (\sqrt{m}\lambda_A\rho + m \eta_{\max}^2 )
\quad \text{(see \eqref{eq:consequences})} \nonumber\\
& \le 2\b_{k_0} \sqrt{\frac{k \log^2(k+1)}{k_0 \log^2(k_0+1)}} (\sqrt{m}\lambda_A\rho + m \eta_{\max}^2 ),
\label{eq:smoothness at k}
\end{align}
where the last line holds if $k_0$ is sufficiently large. We are now in position to invoke \eqref{eq:moreover} and write that
\begin{align}
\gamma_k & \ge \frac{\theta}{\lambda_{\beta_k}} \qquad \text{(see \eqref{eq:moreover})}\nonumber\\
& \ge \frac{\theta }{2\beta_{k_0}( \sqrt{m}\lambda_A \rho + m \eta_{\max}^2) }
\sqrt{\frac{k_0 \log^2(k_0+1)}{k \log^2(k+1)}}
\qquad \text{(see \eqref{eq:smoothness at k})} \nonumber\\
& =: \gamma' \sqrt{\frac{k_0 \log^2 (k_0+1) }{ k \log^2(k+1)}},
\label{eq:low bnd on gammas}
\end{align}
for every $k\in K$.
The lower bound on the step sizes in \eqref{eq:low bnd on gammas} has two immediate consequences. First, we find that \eqref{eq:to be used later on} automatically holds if $k_0$ is sufficiently large, because both $\delta_{k_0}$ and $\b'_{k_0}$ are bounded, respectively, as (\ref{eq:defn of delta},\ref{eq:defn of kappa}). Second, \eqref{eq:low bnd on gammas} allows us to better interpret \eqref{eq:raw bnd on sum}, namely,
\begin{align}
& \sum_{k=k_0}^{k_1} \gamma' \sqrt{\frac{k_0 \log^2 (k_0+1) }{ k \log^2(k+1)}} \|G_k\|^2 +\|A(u_k)-b\|^2 \nonumber\\
& \le \sum_{k=k_0}^{k_1}\gamma_k \|G_k\|^2 +\|A(u_k)-b\|^2
\qquad \text{(see (\ref{eq:low bnd on gammas}))}
\nonumber\\
& \le
2c\b_{k_0}(\delta_{k_0}+1)
+
\frac{24\b_{k_0}'\lambda'^2_h}{\eta_{\min}^2}(\delta_{k_0}+1)
\nonumber\\
& \qquad \qquad +
\frac{24c\b_{k_0}'\eta_{\max}^2 y_{\max}^2}{\eta_{\min}^2}(\delta_{k_0}+1)
+\frac{\mu' }{\delta_{k_0}}\left( 2\delta_{k_0}+1 \right),
\qquad \text{(see \eqref{eq:raw bnd on sum})}
\label{eq:detailed final bnd}
\end{align}
which immediately yields that
\begin{align}
\min_{k\in K} \gamma' \sqrt{\frac{k_0 \log^2(k_0+1)}{k_1\log^2(k_1+1)}} \|G_k\|^2+ \|A(u_k)-b\|^2 \lesssim \frac{1}{|K|},
\end{align}
where $\gamma'$ is independent of $k_1$ and $\lesssim$ suppresses the dependence on all parameters except $k_1$.
\begin{theorem}
\label{thm:main}
Let $\gamma_k$ be the output of the line search subroutine in our algorithm in iteration $k$.
For integers $k_0\le k_1$, consider the interval $K=[k_0:k_1]$ and the choice of parameters in (\ref{eq:beta rule},\ref{eq:sigma rule}) for $\s_{k_0}\ge \b_{k_0}>0$ and an integer $k_{\max}$. We impose the following geometric requirements on the constraints. Let $P_{T_C(u)}$ and $P_{N_C(u)}$ denote the orthogonal projection onto the tangent and normal cones at $u\in C$, respectively.
Consider a subspace $S_{K}\subseteq \mathbb{R}^d$ such that
\begin{align}
S_{K} \supseteq \bigcup_{k\in K} T_C(u_k),
\end{align}
and, with some abuse of notation, let $S_{K}$ also denote an orthonormal basis for this subspace. For $\rho>0$, we assume that the nonconvex Slater's condition holds. That is, assume that $\eta_{\min}>0$, where
\begin{align}
\eta_{\min} :=
\begin{cases}
\min_{u,v} \, \left\| S_{K}^\top P_{T_C(u)} (DA(u)^\top v) \right\| \\
\|v\|=1\\
\|A(u)-b\|\le \rho\\
u\in C.
\end{cases}
\label{eq:new slater}
\end{align}
Suppose that
\begin{align}
\operatorname{diam}(C) \le \frac{2\eta_{\min}}{\lambda_A},
\end{align}
and that $\rho$ is large enough so that (\ref{eq:suff cnd on rho}) holds, where the left-hand side is independent of $k_1$. Assuming that $k_0\gg 1$, it then holds that
\begin{align}
\min_{k\in K} \gamma' \sqrt{\frac{k_0 \log^2(k_0+1)}{k_1\log^2(k_1+1)}} \|G_k\|^2+ \|A(u_k)-b\|^2 \lesssim \frac{1}{k_1-k_0},
\label{eq:final thm}
\end{align}
where $\gamma'$ is independent of $k_1$ and $\lesssim$ suppresses the dependence on all parameters except $k_1$. The exact expression for the right-hand side of (\ref{eq:final thm}) is given in (\ref{eq:detailed final bnd}). The factor $\gamma'$ is defined in (\ref{eq:low bnd on gammas}).
\end{theorem}

Event Timeline