\section{Proof of Nonadaptive Theorem}
For convenience, let us recall the updates of the algorithm in iteration $k$, namely,
\begin{align}
u_{k+1} & = P_C( u_k - \gamma_k \nabla \mathcal{L}_{\beta_k} (u_k,y_k) )
\nonumber\\
& = P_C\left( u_k - \gamma_k \nabla h(u_k) - \gamma_k DA(u_k) ^\top \left( y_k + \frac{A(u_k)-b}{\beta_k} \right) \right),
\qquad \text{(see \eqref{eq:Lagrangian})}
\label{eq:update uk recall}
\end{align}
\begin{align}
y_{k+1} =y_k + \frac{A(u_{k+1})-b}{\sigma_{k+1}},
\label{eq:y update recall}
\end{align}
\begin{equation}
G_k = G_{\beta_k,\gamma_k}(u_k,y_k) = \frac{u_k-u_{k+1}}{\gamma_k}.
\qquad \text{(see \eqref{eq:grad map})}
\label{eq:grad map recall}
\end{equation}
For integers $k_0\le k_1$, consider the interval
\begin{equation}
K = [k_0:k_1]=\{k_0,\ldots,k_1\}.
\end{equation}
Since $\gamma_k$ is determined by the line search subroutine in Lemma \ref{lem:eval Lipsc cte}, we may now apply Lemma \ref{lem:11} for every iteration in this interval to find that
\begin{align}
\frac{\gamma_k \|G_k\|^2}{2}
& \le \mathcal{L}_{\beta_k}(u_k,y_k) - \mathcal{L}_{\beta_k}(u_{k+1},y_k)
\qquad \text{(see Lemma \ref{lem:11})} \nonumber\\
& = h(u_k) - h(u_{k+1})+ \langle A(u_k)-A(u_{k+1}) , y_k \rangle+ \frac{\|A(u_k)-b\|^2 - \|A(u_{k+1})-b\|^2}{2\beta_k},
\qquad \text{(see \eqref{eq:Lagrangian})}
\label{eq:smoothness rep}
\end{align}
for every $k\in K$.
On the other hand,
\begin{align}
y_k = y_{k_0-1} + \sum_{i=k_0}^k \frac{A(u_i)-b}{\sigma_i},
\qquad \text{(see \eqref{eq:y update recall})}
\label{eq:y update unfolded}
\end{align}
which, after substituting in \eqref{eq:smoothness rep}, yields that
\begin{align}
\frac{\gamma_k \|G_k\|^2}{2} & \le h(u_k) - h(u_{k+1}) + \left\langle A(u_k) - A(u_{k+1}) , y_{k_0-1} + \sum_{i=k_0}^k \frac{A(u_i)-b}{\sigma_i} \right\rangle
+\frac{\|A(u_k)-b\|^2-\|A(u_{k+1})-b\|^2}{2\beta_k}.
\label{eq:key ineq}
\end{align}
Additionally, let us assume that, for $\beta>0$,
\begin{align}
\frac{1}{\beta_k}- \frac{1}{\beta_{k-1}} \le \frac{1 }{\beta \sqrt{k}},
\qquad
\frac{1}{\sigma_k} \le \frac{1}{\beta \sqrt{k}},
\qquad \forall k\in K,
\label{eq:consequences}
\end{align}
for sufficiently large $k_0$.
For example, the following choices satisfy these requirements:
\begin{align}
\beta_k = \frac{\beta}{k^\alpha} \text{ with } \alpha \leq 1/2,
\qquad
\sigma_k \geq \max\left(\beta \sqrt{k}, \beta k \| A(u_k) - b \|\right),
\qquad \forall k \in K.
\label{eq:beta n sigma}
\end{align}
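For the record, the first bound in \eqref{eq:consequences} can be verified for this choice by the concavity of $t\mapsto t^\alpha$: for every $k\ge 2$,
\begin{align*}
\frac{1}{\beta_k}- \frac{1}{\beta_{k-1}} = \frac{k^\alpha - (k-1)^\alpha}{\beta} \le \frac{\alpha (k-1)^{\alpha-1}}{\beta} \le \frac{\sqrt{2}\,\alpha}{\beta \sqrt{k}} \le \frac{1}{\beta\sqrt{k}},
\end{align*}
since $\alpha\le 1/2$ and $(k-1)^{-1/2}\le \sqrt{2}/\sqrt{k}$ when $k\ge 2$. The second bound in \eqref{eq:consequences} follows directly from $\sigma_k \ge \beta\sqrt{k}$.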
By summing up the key inequality in \eqref{eq:key ineq} over $k$ from $k_0$ to $k_1$ and using \eqref{eq:beta n sigma}, we find that
\begin{align}
& \sum_{k=k_0}^{k_1} \frac{\gamma_k \|G_k\|^2}{2} \nonumber\\
& \le h(u_{k_0}) - h(u_{k_1+1}) + \langle A(u_{k_0}) - A(u_{k_1+1}) , y_{k_0-1}\rangle
+ \sum_{k=k_0}^{k_1} \sum_{i=k_0}^k \left\langle A(u_k) - A(u_{k+1}) , \frac{A(u_i)-b}{\sigma_i} \right\rangle
\nonumber\\
& \qquad + \sum_{k=k_0}^{k_1} \frac{\|A(u_k)-b\|^2}{2\beta_k} - \sum_{k=k_0}^{k_1} \frac{\|A(u_{k+1})-b\|^2}{2\beta_k}
\qquad \text{(see \eqref{eq:key ineq})}
\nonumber\\
& = h(u_{k_0}) - h(u_{{k_1}+1}) + \langle A(u_{k_0}) - A(u_{{k_1}+1}) , y_{k_0-1} \rangle
+ \sum_{k=k_0}^{k_1} \sum_{i=k_0}^k \left\langle A(u_k) - A(u_{k+1}) , \frac{A(u_i)-b}{\sigma_i} \right\rangle\nonumber\\
& \qquad + \sum_{k=k_0}^{k_1} \frac{\|A(u_k)-b\|^2}{2\beta_k} - \sum_{k=k_0+1}^{{k_1}+1} \frac{\|A(u_{k})-b\|^2}{2\beta_{k-1}} \nonumber\\
& = h(u_{k_0}) - h(u_{{k_1}+1}) + \langle A(u_{k_0}) - A(u_{{k_1}+1}) , y_{k_0-1} \rangle + \frac{\|A(u_{k_0})-b\|^2}{2\beta_{k_0}} + \sum_{i=k_0}^{k_1} \sum_{k=i}^{k_1} \left\langle A(u_k) - A(u_{k+1}) , \frac{A(u_i)-b}{\sigma_i} \right\rangle
\nonumber\\
& \qquad
+ \sum_{k=k_0+1}^{k_1} \left( \frac{1}{2\beta_k} - \frac{1}{2\beta_{k-1}} \right) \|A(u_k)-b\|^2 - \frac{1}{2\beta_{k_1}} \| A(u_{k_1+1})-b\|^2
\nonumber\\
& \le \mu + \sum_{i=k_0}^{k_1} \left\langle A(u_i) - A(u_{{k_1}+1}), \frac{A(u_i)-b}{\sigma_i} \right\rangle + \sum_{k=k_0}^{k_1} \left( \frac{1}{2\beta_k} - \frac{1}{2\beta_{k-1}} \right) \|A(u_k)-b\|^2 - \frac{1}{2\beta_{k_1}} \| A(u_{k_1+1})-b\|^2
\qquad \text{(see \eqref{eq:defn mu})}
\nonumber\\
& = \mu + \sum_{k=k_0}^{k_1} \left( \frac{1}{\sigma_k} +\frac{1}{2\beta_k} - \frac{1}{2\beta_{k-1}} \right) \|A(u_k)-b\|^2
- \sum_{k=k_0}^{k_1} \left \langle A(u_{{k_1}+1})-b, \frac{A(u_k)-b}{\sigma_k} \right\rangle - \frac{1}{2\beta_{k_1}} \| A(u_{k_1+1})-b\|^2
\nonumber\\
& \le\mu + \sum_{k=k_0}^{k_1} \left( \frac{1}{\sigma_k}+ \frac{1}{2\beta_k}- \frac{1}{2\beta_{k-1}} \right) \|A(u_k)-b\|^2
+
\sum_{k=k_0}^{k_1} \frac{1}{\sigma_k} \|A(u_{{k_1}+1})-b\| \|A(u_k)-b\| - \frac{1}{2\beta_{k_1}} \| A(u_{k_1+1})-b\|^2 \nonumber\\
& \le \mu + \sum_{k=k_0}^{k_1} \left( \frac{1 }{\beta \sqrt{k}}+ \frac{1}{2\beta_k}-\frac{1}{2\beta_{k-1}} \right) \|A(u_k)-b\|^2
+
\sum_{k=k_0}^{k_1} \frac{ 1}{\beta \sqrt{k}} \|A(u_{{k_1}+1})-b\| \|A(u_k)-b\| - \frac{1}{2\beta_{k_1}} \| A(u_{k_1+1})-b\|^2
\qquad \text{(see (\ref{eq:consequences}))}
\nonumber\\
& \le
\mu + \sum_{k=k_0}^{k_1} \frac{3 }{2\beta \sqrt{k}} \|A(u_k)-b\|^2 + \sum_{k=k_0}^{k_1} \frac{1}{\beta \sqrt{k}} \|A(u_{{k_1}+1})-b\| \|A(u_k)-b\| - \frac{\sqrt{k_1}}{2\beta} \| A(u_{k_1+1})-b\|^2
\qquad \text{(see (\ref{eq:consequences},\ref{eq:beta n sigma}))}
\nonumber\\
& \le \mu + \sum_{k=k_0}^{k_1} \left( \frac{3}{2\beta \sqrt{k}} + \frac{1}{\beta \sqrt{k} } \right) \|A(u_k)-b\|^2 + \sum_{k=k_0}^{k_1} \frac{1}{4\beta\sqrt{k}} \|A(u_{{k_1}+1})-b\|^2 - \frac{\sqrt{k_1}}{2\beta} \| A(u_{k_1+1})-b\|^2
\qquad \left( 2ab \le ca^2 + c^{-1}b^2 \right)
\nonumber\\
& \le \mu + \sum_{k=k_0}^{k_1} \frac{5 }{2\beta \sqrt{k} } \|A(u_k)-b\|^2 + \frac{\sqrt{k_1}}{2\beta} \| A(u_{k_1+1})-b\|^2 - \frac{\sqrt{k_1}}{2\beta} \| A(u_{k_1+1})-b\|^2 \qquad \left( \sum_{k=1}^{k_1} \frac{1}{\sqrt{k}} \le \int_{0}^{k_1} \frac{d\kappa }{\sqrt{\kappa}} =2 \sqrt{k_1} \right) \nonumber\\
&
\le \mu + \sum_{k=k_0}^{k_1} \frac{5 }{2\beta\sqrt{k}} \|A(u_k)-b\|^2
\label{eq:long chain}
\end{align}
where we assumed that
\begin{equation}
\mu := \sup_k \left\{ h(u_{k_0}) - h(u_k) + \langle A(u_{k_0})-A(u_k) ,y_{k_0-1}\rangle \right\} + \frac{\|A(u_{k_0})-b\|^2}{2\beta_{k_0}}< \infty.
\label{eq:defn mu}
\end{equation}
This supremum is finite when, for example, $C$ is compact and $h$ and $A$ are continuous, since $\{u_k\}_k\subset C$.
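For clarity, the exchange of the order of summation used in \eqref{eq:long chain} rests on the elementary identity
\begin{align*}
\sum_{k=k_0}^{k_1} \sum_{i=k_0}^k \left\langle A(u_k) - A(u_{k+1}) , \frac{A(u_i)-b}{\sigma_i} \right\rangle
& = \sum_{i=k_0}^{k_1} \left\langle \sum_{k=i}^{k_1} \left( A(u_k) - A(u_{k+1}) \right) , \frac{A(u_i)-b}{\sigma_i} \right\rangle \\
& = \sum_{i=k_0}^{k_1} \left\langle A(u_i) - A(u_{k_1+1}) , \frac{A(u_i)-b}{\sigma_i} \right\rangle,
\end{align*}
where the second line telescopes the inner sum.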
%
%Let us also assume that $\{\beta_k\}_k$ is non-increasing, namely,
%\begin{equation}
%\beta_{k+1} \le \beta_k ,
%\qquad
%\forall k \ge k_0,
%\label{eq:sigma dec}
%\end{equation}
%which allows us to further simplify the last line in \eqref{eq:long chain} as
%\begin{align}
%\sum_{k=k_0}^K \frac{\gamma_k\|G_k\|^2}{2}
%& \le
%\mu + \sum_{k=k_0+1}^K \frac{\sqrt{k}+3}{2\beta_k}
% \|A(u_k)-b\|^2 +
% \sum_{k=k_0+1}^K \frac{\|A(u_{K+1})-b\|^2}{2\sqrt{k}\beta_k}
%\nonumber\\
%& \le \mu + \sum_{k=k_0+1}^K \frac{\sqrt{k}+3}{2\beta_k} \|A(u_k)-b\|^2 +
% \sum_{k=k_0+1}^K \frac{\|A(u_{K+1})-b\|^2}{2\sqrt{k}\beta_{K+1}}
%\qquad \text{(see \eqref{eq:sigma dec})}
%\nonumber\\
%& \le \mu + \sum_{k=k_0+1}^K \frac{\sqrt{k}+3}{2\beta_k} \|A(u_k)-b\|^2 +
% \frac{\sqrt{K+1}}{\beta_{K+1}} \|A(u_{K+1})-b\|^2
% \qquad \left( \sum_{k=1}^K \frac{1}{\sqrt{k}} \le \int_{0}^K \frac{d\kappa}{\sqrt{\kappa}} = 2\sqrt{K} \right) \nonumber\\
%& \le \mu + \sum_{k=k_0+1}^{K} \frac{2\sqrt{k}}{\beta_k} \|A(u_k)\|^2 + \frac{\sqrt{K+1}}{\beta_{K+1}} \|A(u_{K+1})-b\|^2 \nonumber\\
%& \le \mu + \sum_{k=k_0+1}^{K+1} \frac{2\sqrt{k}}{\beta_k} \|A(u_k)-b\|^2.
%\label{eq:raw}
%\end{align}
%We now try to formalize Volkan's intuition that gradient mapping should derive down the feasibility gap. Let us assume that
%\begin{align}
%\sum_{k=1}^{K+1} \frac{\|A_k\|^2}{\beta_k} \le \sum_{k=0}^K \gamma_k G_k^2.
%\label{eq:assumption}
%\end{align}
%And we take
%\begin{equation}
%\sigma_k = 6 \beta_k,
%\qquad \forall k.
%\end{equation}
%Then, by combining the two inequalities above, we reach
%\begin{align}
% \sum_{k=1}^{K+1} \frac{\|A_k\|^2}{\beta_k} \le 4\mu_0,
%\end{align}
%\begin{align}
%\sum_{k=0}^K \gamma_k G_k^2 \le 4\mu_0.
%\end{align}
%That is, Volkan's assumption \eqref{eq:assumption} successfully bounds both the gradient mapping and the feasibility gap. One question is the interplay between $\{\beta_k,\gamma_k\}_k$ to ensure the validity of Volkan's assumption, which feels like some sort of \emph{uncertainty principle}.
Note that \eqref{eq:long chain} bounds the gradient mapping by the feasibility gap. We next find a converse, thus bounding the feasibility gap by the gradient mapping. To that end, let $T_C(u)$ and $P_{T_C(u)}$ denote the tangent cone of $C$ at $u\in C$ and the orthogonal projection onto this cone, respectively. Likewise, let $N_C(u)$ and $P_{N_{C}(u)}$ denote the normal cone of $C$ at $u$ and the corresponding orthogonal projection.
In order to apply Lemma \ref{lem:bnd bnd Ak}, let us assume that \eqref{eq:good neighb} holds. Lemma \ref{lem:bnd bnd Ak} is then in force; roughly speaking, its conclusion \eqref{eq:bnd on Ak final} states that the feasibility gap is itself bounded by the gradient mapping. We may therefore substitute \eqref{eq:bnd on Ak final} back into \eqref{eq:long chain} to find that
\begin{align}
\sum_{k=k_0}^{k_1} \gamma_k \|G_k\|^2
& \le \frac{5 }{\beta} \sum_{k=k_0}^{{k_1}} {\frac{1}{\sqrt{k}}} \|A(u_k)-b\|^2 +2 \mu
\qquad \text{(see \eqref{eq:long chain})}
\nonumber\\
& \le \frac{20 }{\beta \eta_{\min}^2} \sum_{k=k_0}^{{k_1}}
\beta_k^2 \frac{1}{\sqrt{k}} \left( \lambda_h' + \eta_{\max} \|y_{k-1}\| + \|G_k\| \right)^2+ 2\mu
\qquad \text{(see \eqref{eq:bnd on Ak final})}
\nonumber\\
& \le
\frac{20\beta}{ \eta_{\min}^2} \sum_{k=k_0}^{{k_1}}
\frac{1}{k^{3/2}} \left( \lambda_h' + \eta_{\max} \|y_{k-1}\| + \|G_k\| \right)^2+ 2\mu
\qquad \text{(see (\ref{eq:beta n sigma}))} \nonumber\\
& \le
\frac{60\beta}{ \eta_{\min}^2} \sum_{k=k_0}^{{k_1}}
\frac{1}{k^{3/2}} \left( \lambda_h'^2 + \eta_{\max}^2 \|y_{k-1}\|^2 + \|G_k\|^2 \right)+ 2\mu,
\qquad \left( (a+b+c)^2 \le 3(a^2+b^2+c^2) \right)
\label{eq:to be used for feas}
\end{align}
where the third inequality uses the particular choice $\beta_k = \frac{\beta}{k^{1/2}}$, in accordance with~\eqref{eq:beta n sigma}.
To simplify the above expression, let us assume that
\begin{equation}
\frac{60\beta}{ \eta_{\min}^2 k} \le \frac{\gamma_k}{2},
\qquad \forall k\in K.
\label{eq:to be used later on}
\end{equation}
Let $|K|=k_1-k_0+1$ be the size of the interval $K$.
After rearranging \eqref{eq:to be used for feas} and applying \eqref{eq:to be used later on}, we arrive at
\begin{align}
& { \frac{|K|}{2} \cdot \min_{k \in K} \gamma_k \|{G}_{k}\|^2} \nonumber\\
& \le \sum_{k=k_0}^{k_1} \frac{\gamma_k}{2} \|G_k\|^2 \nonumber\\
& \le \sum_{k=k_0}^{k_1} \left(\gamma_k - \frac{60\beta}{\eta_{\min}^2 k^{3/2}} \right)\|G_k\|^2
\qquad \text{(see \eqref{eq:to be used later on})}
\nonumber\\
& \le \frac{60 \beta \lambda'^2_h }{\eta_{\min}^2 } \sum_{k=k_0}^{{k_1}} \frac{1}{k^{3/2}}
+ \frac{60\beta \eta_{\max}^2}{\eta_{\min}^2} \sum_{k=k_0}^{{k_1}} \frac{\|y_{k-1}\|^2 }{k^{3/2}}
+2\mu
\qquad \text{(see \eqref{eq:to be used for feas})}
\nonumber\\
& \le \frac{120\beta \lambda'^2_h C_1}{\eta_{\min}^2}
+
\frac{60\beta\eta_{\max}^2 }{\eta_{\min}^2} \sum_{k=k_0}^{{k_1}} \frac{\|y_{k-1}\|^2}{k^{3/2}} +2 \mu \nonumber\\
& \le \frac{120\beta C_1}{\eta_{\min}^2} \left( \lambda'^2_h + \frac{\eta_{\max}^2}{2C_1}\sum_{k=k_0}^{{k_1}} \frac{\|y_{k-1}\|^2}{k^{3/2}} \right)
+
2 \mu,
\label{eq:final bnd on G}
\end{align}
or, equivalently,
\begin{align}
\min_{k \in K} \gamma_k \|{G}_{k}\|^2 & \le
\frac{240\beta C_1}{\eta_{\min}^2|K|} \left( \lambda'^2_h + \frac{\eta_{\max}^2}{2C_1} |K|B_K \right)
+ \frac{4\mu}{|K|},
\label{eq:bnd on gamma G}
\end{align}
where
\begin{align}
B_{K} := \frac{1 }{|K|} \sum_{k=k_0}^{{k_1}} \frac{\|y_{k-1}\|^2}{k},
\qquad
C_1 := \sum_{k=k_0}^{k_1} \frac{1}{k^{3/2}},
\label{eq:defn of B}
\end{align}
and, anticipating the estimates below, we also set
\begin{align}
\mu' := \frac{24\beta \log(k_1+1) \|G_{k_1+1}\|^2 }{\eta_{\min}^2 (k_1+1)} + \mu.
\label{eq:defn of muprime}
\end{align}
We will estimate $B_K$ and $\mu'$ later on.
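For the record, $C_1$ is bounded by an absolute constant: when $k_0\ge 2$,
\begin{align*}
C_1 = \sum_{k=k_0}^{k_1} \frac{1}{k^{3/2}} \le \int_{k_0-1}^{\infty} \frac{d\kappa}{\kappa^{3/2}} = \frac{2}{\sqrt{k_0-1}} \le 2 \le \log(k_1+1),
\end{align*}
where the last inequality holds once $k_1\ge 7$. This observation allows us to replace $C_1$ with $\log(k_1+1)$ below.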
In turn, the bound above on the gradient mapping controls the feasibility gap, namely,
\begin{align}
& |K| \min_{k\in K} \|A({u}_k)-b\|^2 \nonumber\\
& \le \sum_{k=k_0}^{{k_1}} \|A(u_k)-b\|^2 \nonumber\\
& \le \frac{12\beta^2 \lambda'^2_h}{\eta_{\min}^2} \sum_{k=k_0}^{{k_1}} \frac{1}{k}
+ \frac{12\beta^2 \eta_{\max}^2}{\eta_{\min}^2} \sum_{k=k_0}^{{k_1}} \frac{\|y_{k-1}\|^2 }{k}
+ \sum_{k=k_0}^{{k_1}} \frac{12 \beta^2 }{\eta_{\min}^2 k} \|G_k\|^2
\qquad
\text{(see \eqref{eq:to be used for feas})}
\nonumber\\
& \le \frac{24\beta^2 \lambda'^2_h \log ({k_1}+1) }{\eta_{\min}^2 }
+ \frac{12\beta^2 \eta_{\max}^2}{\eta_{\min}^2} \cdot |K| B_{K}
+ \frac{\beta}{10}\sum_{k=k_0}^{{k_1}} \gamma_k \|G_k\|^2
\qquad \text{(see (\ref{eq:to be used later on},\ref{eq:defn of B}))}
\nonumber\\
& \le
\frac{24\beta^2 \log(k_1+1)}{\eta_{\min}^2} \left( \lambda'^2_h+ \frac{\eta_{\max}^2}{2} |K|B_K \right)
+
\frac{24\beta^2 C_1}{\eta_{\min}^2} \left( \lambda'^2_h + \frac{\eta_{\max}^2}{2C_1}|K|B_K \right) + \frac{2\beta}{5} \mu,
\qquad \text{(see \eqref{eq:final bnd on G})} \nonumber\\
& \le
\frac{24\beta^2 \log(k_1+1)}{\eta_{\min}^2} \left( \lambda'^2_h+ \frac{\eta_{\max}^2}{2} |K|B_K \right)
+
\frac{24\beta^2 \log (k_1+1) }{\eta_{\min}^2} \left( \lambda'^2_h+ \frac{\eta_{\max}^2}{2} |K|B_K \right) + \frac{2\beta}{5} \mu
\qquad \left( C_1 \le \log(k_1+1) \text{ for } k_0\ge 2 \right)
\nonumber\\
& =
\frac{48\beta^2 \log(k_1+1)}{\eta_{\min}^2} \left( \lambda'^2_h+ \frac{\eta_{\max}^2}{2} |K|B_K \right) + \frac{2\beta}{5} \mu,
\label{eq:final bnd on feas gap}
\end{align}
which in turn implies that
\begin{align}
\min_{k\in K}{\|A({u}_k)-b\|^2
\le \frac{48\beta^2 \log(k_1+1)}{\eta_{\min}^2|K|} \left( \lambda'^2_h+ \frac{\eta_{\max}^2}{2} |K|B_K \right) + \frac{2\beta\mu}{5|K|}},
\label{eq:feas bnd semi final}
\end{align}
if $k_0\ge 2$.
Let us now revisit and simplify the condition imposed in (\ref{eq:good neighb}).
To that end, we first derive a weaker but uniform bound on the feasibility gap. For every $k$ with $k-1\in K$, it holds that
\begin{align}
\|A(u_k)-b\|^2 & \le \sum_{i=k_0}^{k_1} \|A(u_i)-b\|^2
\qquad \left( k_0\ge 2 \right)
\nonumber\\
& \le \frac{48\beta^2 \log(k_1+1)}{\eta_{\min}^2} \left( \lambda'^2_h+ \frac{\eta_{\max}^2}{2} |K|B_K \right) + \frac{2\beta}{5} \mu.
\qquad \text{(see \eqref{eq:final bnd on feas gap})}
\label{eq:rate of feas gap}
\end{align}
Therefore, we may replace \eqref{eq:good neighb} with the assumption that
\begin{equation}
\|A(u_{k_0})-b\| \le \rho,
\qquad
\frac{48\beta^2 \log(k_1+1)}{\eta_{\min}^2} \left( \lambda'^2_h+ \frac{\eta_{\max}^2}{2} |K|B_K \right) + \frac{2\beta}{5} \mu \le \rho^2,
\label{eq:suff cnd on rho}
\end{equation}
which ensures that
\begin{align}
\|A(u_k)-b\| \le \rho,
\qquad \forall k\in [k_0:k_1+1].
\label{eq:uniform feas larger interval}
\end{align}
In order to interpret (\ref{eq:final bnd on G},\ref{eq:feas bnd semi final},\ref{eq:suff cnd on rho}), we next estimate $B_{K}$ in \eqref{eq:defn of B}. To that end, let us first control the growth of the dual sequence $\{y_k\}_k$. Recalling \eqref{eq:y update recall}, for every $k\in [k_0:k_1+1]$ we write
\begin{align}
\|y_k\| & \le \|y_{k_0}\|+ \sum_{i=k_0+1}^k \frac{\|A(u_i)-b\|}{\sigma_i}
\qquad \text{(see \eqref{eq:y update recall})}
\nonumber\\
& \le \|y_{k_0}\|+ \sum_{i=k_0+1}^k \frac{\rho}{\beta i}
\qquad \text{(see (\ref{eq:beta n sigma},\ref{eq:uniform feas larger interval}))} \nonumber\\
& \le \|y_{k_0}\|+ \frac{2\rho \log k}{\beta}.
\label{eq:dual growth}
\end{align}
With the growth of the dual sequence uncovered, we evaluate $B_{K}$ as
\begin{align}
B_{K} & = \frac{1}{|K|}\sum_{k=k_0}^{{k_1}} \frac{ \|y_{k}\|^2 }{k+1}
\qquad \text{(see \eqref{eq:defn of B})}
\nonumber\\
& \le \frac{1}{|K|} \sum_{k=k_0}^{{k_1}} \frac{1}{k+1}\left( \|y_{k_0}\|+ \frac{2\rho \log k}{\beta} \right)^2
\qquad \text{(see (\ref{eq:dual growth}))}
\nonumber\\
& \le \frac{1}{|K|} \sum_{k=k_0}^{{k_1}} \frac{2\|y_{k_0}\|^2}{k+1}+ \frac{8\rho^2 \log^2 k}{\beta^2 (k+1)}
\qquad ((a+b)^2 \le 2a^2+2b^2)
\nonumber\\
& \le \frac{4\|y_{k_0}\|^2 \log ({k_1}+1)}{|K|} + \frac{16 \rho^2 \log^3 ({k_1}+1)}{|K|} \nonumber\\
& \le \frac{16 \log^3(k_1+1)}{ |K|} \left( \|y_{k_0}\|^2+\rho^2 \right).
\label{eq:Bk evaluated}
\end{align}
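For the record, the last two lines above rest on the elementary estimates
\begin{align*}
\sum_{k=k_0}^{k_1} \frac{1}{k+1} \le \log(k_1+1),
\qquad
\sum_{k=k_0}^{k_1} \frac{\log^2 k}{k+1} \le \log^2 (k_1) \sum_{k=k_0}^{k_1} \frac{1}{k+1} \le \log^3(k_1+1),
\end{align*}
valid for $k_0\ge 2$.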
In order to interpret (\ref{eq:final bnd on G},\ref{eq:feas bnd semi final},\ref{eq:suff cnd on rho}), it still remains to estimate $\mu'$ in \eqref{eq:defn of muprime}. To that end, we first derive a lower bound on the step sizes $\{\gamma_k\}_k$. To invoke \eqref{eq:moreover}, we in turn need to gauge how smooth the augmented Lagrangian $\mathcal{L}_{\beta_k}(\cdot,y_k)$ is. For every $k\in [k_0:k_1+1]$, note that
\begin{align}
\lambda_{\beta_k} & \le \lambda_h+ \sqrt{m}\lambda_A \left( \|y_k\| + \frac{\|A(u_k)-b\|}{\beta_k} \right)+ \frac{\|DA(u_k)\|_F^2}{\beta_k}
\qquad \text{(see \eqref{eq:smoothness of Lagrangian})} \nonumber\\
& \le \lambda_h+ \sqrt{m}\lambda_A \left( \|y_{k_0}\|+ \frac{2\rho \log (k_1+1)}{\beta_k} + \frac{\rho}{\beta_k} \right) + \frac{ m \eta_{\max}^2}{\beta_k}
\qquad \text{(see (\ref{eq:defn lambdap n etaMax},\ref{eq:uniform feas larger interval},\ref{eq:dual growth}))} \nonumber\\
& = \lambda_h+\sqrt{m}\lambda_A \|y_{k_0}\| + \frac{ 1}{\beta_k}(2\sqrt{m}\lambda_A\rho \log(k_1+1) + \sqrt{m}\lambda_A \rho + m\eta_{\max}^2 ) \nonumber\\
& \le \frac{ 2}{\beta_k}(2\sqrt{m}\lambda_A\rho \log(k_1+1) + \sqrt{m}\lambda_A \rho + m\eta_{\max}^2 ),
\label{eq:smoothness at k}
\end{align}
where the last line holds if $k_0$ is sufficiently large. We are now in a position to invoke \eqref{eq:moreover} by writing that
\begin{align}
\gamma_k & \ge \frac{\theta}{\lambda_{\beta_k}} \qquad \text{(see \eqref{eq:moreover})}\nonumber\\
& \ge \frac{\theta \beta_k}{ 4\sqrt{m}\lambda_A\rho \log(k_1+1) + 2\sqrt{m}\lambda_A \rho + 2m\eta_{\max}^2 }
\qquad \text{(see \eqref{eq:smoothness at k})} \nonumber\\
& = \frac{\theta \beta}{ (4\sqrt{m}\lambda_A\rho \log(k_1+1) + 2\sqrt{m}\lambda_A \rho + 2m\eta_{\max}^2 ) \sqrt{k} },
\qquad \text{(see \eqref{eq:beta n sigma})}
\label{eq:low bnd on gammas}
\end{align}
for every $k\in [k_0:k_1+1]$.
In particular, the lower bound on $\gamma_{k_1+1}$ above allows us to estimate $\mu'$ by writing that
\begin{align}
\mu' & = \frac{24\beta \log(k_1+1) \|G_{k_1+1}\|^2 }{\eta_{\min}^2 (k_1+1)} + \mu
\qquad \text{(see \eqref{eq:defn of muprime})} \nonumber\\
& = \frac{24\beta \log(k_1+1) \|u_{k_1+2}-u_{k_1+1}\|^2 }{\eta_{\min}^2 (k_1+1) \gamma_{k_1+1}^2} + \mu
\qquad \text{(see \eqref{eq:grad map recall})} \nonumber\\
& \le
\frac{24\beta \log(k_1+1) \text{diam}(C)^2 }{\eta_{\min}^2 (k_1+1) \gamma_{k_1+1}^2} + \mu
\qquad \left( \{u_k\} \subset C \right)
\nonumber\\
& \le
\frac{24\beta \log(k_1+1) \text{diam}(C)^2 (4\sqrt{m}\lambda_A\rho \log(k_1+1) + 2\sqrt{m}\lambda_A \rho + 2m\eta_{\max}^2 )^2 }{\eta_{\min}^2 \theta^2 \beta^2}
+2\mu
\qquad \text{(see \eqref{eq:low bnd on gammas})}
\nonumber\\
& =: \mu''.
\label{eq:mup to mupp}
\end{align}
Having estimated $B_K$ and $\mu'$, we can rewrite the bound on the feasibility gap as
\begin{align}
\min_{k\in K}\|A(u_k)-b\|^2
& \le \frac{48\beta^2 \log^2(k_1+1)}{\eta_{\min}^2|K|} \left( \lambda'^2_h+ \eta_{\max}^2 |K|B_K \right) + \frac{\beta\mu'}{|K|}
\qquad \text{(see \eqref{eq:feas bnd semi final})} \nonumber\\
& \le \frac{48\beta^2 \log^2(k_1+1)}{\eta_{\min}^2|K|} \left( \lambda'^2_h+ 16 \eta_{\max}^2 (\|y_{k_0}\|^2 +\rho^2) \log^3(k_1+1) \right) + \frac{\beta\mu''}{|K|}.
\qquad \text{(see (\ref{eq:Bk evaluated},\ref{eq:mup to mupp}))}
\end{align}
Moreover, we can simplify the assumption in \eqref{eq:suff cnd on rho}. To be specific, thanks to (\ref{eq:Bk evaluated},\ref{eq:mup to mupp}), we can replace \eqref{eq:suff cnd on rho} with the assumption
\begin{align}
\|A(u_{k_0})-b\|\le \rho,
\qquad
\frac{48\beta^2 \log^2(k_1+1)}{\eta_{\min}^2} \left( \lambda'^2_h+ 16 \eta_{\max}^2 (\|y_{k_0}\|^2+\rho^2) \log^3(k_1+1) \right) + \beta\mu'' \le \rho^2.
\label{eq:suff cnd 2nd time}
\end{align}
The lower bound on the step sizes in \eqref{eq:low bnd on gammas} has two other consequences. First, we find that \eqref{eq:to be used later on} automatically holds if $k_0$ is sufficiently large. Second, it allows us to improve \eqref{eq:bnd on gamma G} by writing that
\begin{align}
& \min_{ k\in K } \| G_k\|^2 \nonumber\\
& \le \frac{\min_{k \in K} \gamma_k \|G_k\|^2}{\min_{k\in K} \gamma_k } \nonumber\\
& \le \frac{1}{\min_{k\in K} \gamma_k}
\left( \frac{192\beta \log^2({k_1}+1)}{\eta_{\min}^2|K|} \left( \lambda'^2_h + \eta_{\max}^2 |K|B_K \right)
+ \frac{4\mu'}{|K|} \right)
\qquad \text{(see \eqref{eq:bnd on gamma G})}
\nonumber\\
& \le \frac{1}{\min_{k\in K} \gamma_k}
\left( \frac{192\beta \log^2({k_1}+1)}{\eta_{\min}^2|K|} \left( \lambda'^2_h + 16 \eta_{\max}^2 (\|y_{k_0}\|^2+\rho^2) \log^3 (k_1+1) \right)
+ \frac{4\mu''}{|K|} \right)
\qquad \text{(see (\ref{eq:Bk evaluated},\ref{eq:mup to mupp}))}
\nonumber\\
& \le \frac{ (4\sqrt{m}\lambda_A\rho \log(k_1+1) + 2\sqrt{m}\lambda_A \rho + 2m\eta_{\max}^2 ) \sqrt{k_1}}{\theta \beta}
\left( \frac{192\beta \log^2({k_1}+1)}{\eta_{\min}^2|K|} \left( \lambda'^2_h + 16 \eta_{\max}^2 (\|y_{k_0}\|^2+\rho^2) \log^3 (k_1+1) \right)
+ \frac{4\mu''}{|K|} \right)
\quad \text{(see \eqref{eq:low bnd on gammas})} \nonumber\\
& \le \frac{ 4\sqrt{m}\lambda_A\rho \log(k_1+1) + 2\sqrt{m}\lambda_A \rho + 2m\eta_{\max}^2 }{c\theta \beta \sqrt{k_1}}
\left( \frac{192\beta \log^2({k_1}+1)}{\eta_{\min}^2 } \left( \lambda'^2_h + 16 \eta_{\max}^2 (\|y_{k_0}\|^2+\rho^2) \log^3 (k_1+1) \right)
+ 4\mu'' \right),
\end{align}
where the last line holds if there exists $c>0$ for which $k_1-k_0+1 \ge ck_1 $.
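For completeness, let us spell out why \eqref{eq:to be used later on} holds for sufficiently large $k_0$. By \eqref{eq:low bnd on gammas}, \eqref{eq:to be used later on} is implied by
\begin{align*}
\frac{60\beta}{\eta_{\min}^2 k} \le \frac{\theta \beta}{2\left( 4\sqrt{m}\lambda_A\rho \log(k_1+1) + 2\sqrt{m}\lambda_A \rho + 2m\eta_{\max}^2 \right)\sqrt{k}},
\qquad \forall k \in K,
\end{align*}
which holds as soon as
$\sqrt{k_0} \ge \frac{120\left( 4\sqrt{m}\lambda_A\rho \log(k_1+1) + 2\sqrt{m}\lambda_A \rho + 2m\eta_{\max}^2 \right)}{\theta \eta_{\min}^2}$.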
Note that the $\log$ factors in the convergence proof might be shaved off, up to constant factors, by choosing a more conservative step size for the dual variable, which automatically bounds the norm of the dual sequence. A possible choice is
\begin{align}\label{eq:yk bounded}
\sigma_k \geq \max\left(\beta \sqrt{k}, \beta k \log^2(k+1) \| A(u_k) - b \|\right),
\end{align}
which immediately yields
\begin{align}
\|y_k\| & \le \|y_{k_0}\|+ \sum_{i=k_0+1}^k \frac{\|A(u_i)-b\|}{\sigma_i}
\qquad \text{(see \eqref{eq:y update recall})}
\nonumber\\
& \le \|y_{k_0}\|+ \sum_{i=k_0+1}^k \frac{\rho}{\beta i \log^2(i+1)}
\qquad \text{(see (\ref{eq:yk bounded}))} \nonumber\\
& \le \|y_{k_0}\|+ \frac{c' \rho}{\beta} \nonumber\\
& =: y_{\max}.
\label{eq:dual bounded}
\end{align}
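Here, $c'$ is an absolute constant, obtained by the integral test:
\begin{align*}
\sum_{i=2}^{\infty} \frac{1}{i\log^2(i+1)} \le \frac{1}{2\log^2 3} + \int_2^{\infty} \frac{d\kappa}{\kappa \log^2 \kappa} = \frac{1}{2\log^2 3} + \frac{1}{\log 2} < \infty,
\end{align*}
and we may take $c'$ to be any upper bound on this sum.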
