\edita{Various problems in engineering and computational sciences can be cast as non-linear optimization programs, and the design of efficient numerical algorithms to provably solve such problems is therefore of fundamental importance. cite?
%Non-linear programming is a broad discipline in applied mathematics.
In this paper, we are particularly interested in solving the optimization program
\begin{equation}
\label{prob:01}
\begin{cases}
\min_{u} h(u),\\
A(u) = b,\\
u\in C,
\end{cases}
\end{equation}
where $h:\mathbb{R}^d\rightarrow\mathbb{R}$ and $A:\mathbb{R}^d\rightarrow\mathbb{R}^m$ satisfy
for every $u,u'\in\mathbb{R}^d$. Above, $\nabla h(u)\in\mathbb{R}^d$ is the gradient of $h$ and $DA(u)\in\mathbb{R}^{m\times d}$ is the Jacobian of $A$.
%where $h\in \mathbb{L}_{d}(\lambda_h)$ is a continuously-differentiable function from $\mathbb{R}^d$ to $\RR$ with $\lambda_h$ Lipschitz-continuous gradient, $A: \mathbb{R}^d\rightarrow\mathbb{R}^m$ is a non-linear function, with each component $A_i\in \mathbb{L}_d(\lambda_A) $ for $i\in \{1,\cdots,m\}$.
Moreover,
$C\subset\RR^d$ is non-empty, closed, and convex.
%\edita{Program \eqref{prob:01} is typically} non-convex and \edita{can be considered as a standard non-linear program, where the set $C$
%is modeled by inequalities constraints, namely, $C = \menge{u}{g(u) \leq 0}$ for some $g\colon\RR^d\to \RR^s$.}
Variants of Program~\eqref{prob:01} naturally arise in a broad range of applications in ?? \notea{Please add some representative applications above alongside some references. } For the sake of brevity, we showcase here one instance of Program \eqref{prob:01}.}
Let $\mathbb{S}^{d'\times d'}$ be the space of $d'\times d'$ symmetric matrices, equipped with the standard inner product $\langle x|y\rangle=\text{tr}(x^*y)$. In particular, when $x\in\mathbb{S}^{d'\times d'}$ is positive semi-definite, we write that $x\succeq0$.
% Consider $\mathcal{C}'\subseteq \mathcal{X}$, and let $h_0\colon\mathcal{X}\to \RR$ be a differentiable
%convex function, with $\mathbb{L}_{0}$ Lipschitz-continuous gradient.}
%
\edita{Consider the program
\begin{equation}
\label{e:fac}
\begin{cases}
\min_x h'(x) \\
A'(x) = b'\\
x\in C'\\
x \succeq 0 ,
\end{cases}
\end{equation}
where $h': \mathbb{S}^{d'\times d'} \to\RR$, $A'\colon\mathbb{S}^{d'\times d'}\to\RR^m$, $b'\in\RR^m$, and $C' \subseteq\mathbb{S}^{d'\times d'}$.
Variants of Program \eqref{e:fac} are popular in matrix completion and sensing \cite{park2016provable}, with a broad range of applications to problems in collaborative filtering, geophysics, and imaging, among others~\cite{Burer2005,Burer2003,tu2014practical}. Two common choices for $C'$ in Program \eqref{e:fac} are
$C' =\{x: x \ge0\}$ and $C' =\{x: \text{tr}(x)\le1\}$~\cite{mixon2016clustering}.}
\edita{Solving Program \eqref{e:fac} with semi-definite programming is not scalable, becoming increasingly cumbersome as the dimension $d'$ grows.
To overcome this {computational bottleneck}, the factorized technique sets $x = uu^\top$ for $u\in\mathbb{R}^{d'\times r}$ and a sufficiently large $r$. The resulting non-convex program is then solved with respect to the much lower-dimensional variable $u$. If we also replace the constraint $uu^\top\in C'$ with $u\in C$ for a properly chosen convex set, the new problem in $u$ matches Program \eqref{prob:01} with $h(u)= h'(uu^\top)$ and $A(u)= A'(uu^\top)$. For our examples of $C'$ above, we might choose $C=\{u: u\ge0\}$ and $C=\{u: \|u\|_F^2\le1\}$, respectively. Here, $\|\cdot\|_F$ stands for the Frobenius norm.
}
\end{example}
\edita{The \emph{augmented Lagrangian method}~\cite{luenberger1984linear} is a powerful approach to solve Program \eqref{prob:01}, see Section \ref{sec:related work} for a review of the related literature as well as other approaches to solve Program \eqref{prob:01}}. \edita{ Indeed, for positive $\beta$, it is easy to verify that Program \eqref{prob:01} is equivalent to
\begin{align}
\min_{u\in C}\max_y \,\mathcal{L}_\beta(u,y),
\label{eq:minmax}
\end{align}
where
\begin{align}
\label{eq:Lagrangian}
\mathcal{L}_\beta(u,y) := h(u) + \langle A(u)-b, y \rangle + \frac{\|A(u)-b\|^2}{2\beta},
\end{align}
is the augmented Lagrangian corresponding to Program \eqref{prob:01}. The equivalent formulation in Program \eqref{eq:minmax} naturally suggests the following algorithm to solve Program \eqref{prob:01}:}
In fact, when the penalty parameter $\beta$ is sufficiently small, the augmented Lagrangian has
a local minimum point near the true optimal point. However, we do not know exactly how small $\beta$ is.
Hence, the choice of $\beta$ plays a central role in practice. \notea{Is the last claim really true? Programs \eqref{prob:01} and \eqref{eq:minmax} seem to be equivalent. }
\edita{In our nonlinear framework, updating $u$ in the augmented Lagrangian method requires solving the non-convex Program \eqref{e:exac} to global optimality, which is often intractable. }\notea{We should discuss fixes to this issue, if any, and explain why they are not satisfactory.}\edita{The key contribution of this paper is to provably and efficiently address this challenge.}
\edita{
\paragraph{\emph{\textbf{Contributions.}}}
In order to solve Program \eqref{prob:01}, this paper proposes to replace the (intractable) Program \eqref{e:exac} with the update
for carefully selected sequences $\{\beta_k,\gamma_k\}_k$. Here, $P_C$ is the orthogonal projection onto the convex set $C$ which is often easy to compute in various applications and consequently the update in \eqref{eq:new update} is inexpensive and fast.
Put differently, instead of fully solving Program \eqref{e:exac}, this paper proposes to apply one iteration of the projected gradient algorithm for every update. We provide the convergence guarantees for this fast and scalable new algorithm.
}\notea{We should summarize the guarantees.}
%%%%%%%%%%%%%%%%%%%%%
\section{ Preliminaries}
\notea{I think the whole of this section should move down. The actual results are hidden deep in the paper!}
\paragraph{\textbf{\emph{Notation.}}}
We use the notations $\scal{\cdot}{\cdot}$ and $\|\cdot\|$ for the \edita{standard inner} product and \edita{the} associated norm on $\RR^d$\edita{, respectively}.
\edita{The adjoint of \edita{a} linear operator is denoted by the superscript $\top$.}
Let $C\subset\mathbb{R}^d$ be nonempty, closed, \edita{and convex}. The indicator function of $C$ is denoted by $\iota_{C}$, and the projection onto $C$ is denoted by $P_C$.
%The distance function is $d_{\mathcal{C}}\colon u\mapsto \inf_{a\in\mathcal{C}}\|u-a\|$. The projection of $x$ onto $C$ is denoted by $P_Cx$.
\edita{For $u\in C$, the tangent cone to $C$ at $u$ is
\begin{equation}
T_{C}(u) = \left\{v\in\RR^d : \exists t > 0 \text{ such that } u+t v \in C\right\}.
\end{equation}
The corresponding normal cone $N_C(u)$ at $u$ is the polar of the tangent cone, namely,
\edita{Necessary optimality conditions} for \edita{Program}~\eqref{prob:01} are well studied in the literature \cite[Corollary 6.15]{rockafellar2009variational}. \edita{Indeed, $u$ is a (first-order) stationary point of Program \eqref{prob:01} if there exists $y$ for which
\begin{align}
\begin{cases}
-\nabla h(u) - DA(u)^\top y \in N_C(u)\\
A(u) = b.
\end{cases}
\label{e:inclu1}
\end{align}
Here, $DA(u)$ is the Jacobian of $A$ at $u$. Recalling \eqref{eq:Lagrangian}, we observe that \eqref{e:inclu1} is equivalent to
\begin{align}
\begin{cases}
-\nabla_u \mathcal{L}_\beta(u,y) \in N_C(u)\\
A(u) = b,
\end{cases}
\label{e:inclu2}
\end{align}
which is in turn the necessary optimality condition for Program \eqref{eq:minmax}.
}
%
%Let $\overline{u}$ be a locally optimal and suppose that there no vector $y\not=0$ such that
%\begin{equation}
%-\nabla L(\overline{u})^* y \in N_C(\overline{u}).
%\end{equation}
%Then, the first order optimality condition for $\overline{u}$ is
In nonconvex optimization, the relation between the gradient mapping and stationarity is well-understood \cite{Lan16,Hare2009,bolte2014proximal}, \edita{which we review here for completeness.}
\begin{definition}\label{def:grad map} Given $u$ and $\gamma >0$, define
\edita{In particular, if we remove the constraints of Program \eqref{prob:01}, the gradient mapping reduces to $G_{\beta,\gamma}(u,y)=\nabla h(u)$. The gradient mapping is closely related to $\mathcal{L}_\beta$. The following standard result is proved in Appendix~\ref{sec:proof of smoothness}.}
\begin{lemma}\label{lem:11}
\edita{For fixed $y\in\RR^m$, suppose that $\nabla_u \mathcal{L}_{\beta}(\cdot, y)$ is $\lambda_\beta$-Lipschitz continuous. For $u\in C$ and $\gamma\in(0, 1/\lambda_\beta)$, it holds that }
%\begin{proof} For a fixed $y$, the function $u\mapsto h(u)+F_{\beta}(u;y)$ is $(\mathbb{L}_h+ \mathbb{L}_{\beta})$-Lipschitz. Hence, the results follows from
In \edita{practice}, the Lipschitz constant $\lambda_{\beta}$ is often hard to evaluate exactly \edita{and we might resort to the classic line search technique, reviewed below and proved in Appendix \ref{sec:proof of eval Lipsc cte} for completeness.}
\edita{
\begin{lemma}\label{lem:eval Lipsc cte} Fix $\rho\in(0,1)$ and ${\gamma}_0$. For $\gamma'>0$, let $u^+_{\gamma'} = P_C(u -\gamma' \nabla\mathcal{L}_\beta(u,y))$ and define
%In particular, by taking $\delta =1/2$, we obtain \eqref{e:deslem}.
\edita{Optimality conditions in Section \ref{sec:opt cnds} can also be expressed in terms of the gradient mapping. Indeed, it is straightforward to verify that $u^+$ is a first-order stationary point of Program \eqref{prob:01} if
\begin{align}
\begin{cases}
G_{\beta,\gamma}(u,y) = 0\\
A(u^+) = b.
\end{cases}
\label{eq:opt grad map}
\end{align}
}
%
%
%
%\begin{lemma}
%\label{l:solv}
% Suppose that $\mathcal{L}_{\beta,\gamma}\in \mathbb{L}_d(\lambda_\beta)$. For $\gamma \in (0, 1/\lambda_\beta)$, $u^+=P_C(u-\gamma \nabla \mathcal{L}_\beta(u,y))$ is a first-order stationary point of Program (\ref{prob:01}) if
%
% $\nabla g_{\beta}(\cdot, y)$ is $\mathbb{L}_{\beta}$-Lipschitz continuous. Let $u\in C$ and
%$\gamma \in ]0, 1/(\mathbb{L}_h+ \mathbb{L}_{\beta})[$. Suppose that $Lu^+=b$ and $\| G_{\beta,\gamma}(u;y)\|^{2} =0$.
%Then $u^+$ is a stationary point, i.e., the first order optimality condition \eqref{e:inclu1}, for $ \overline{u} =u^+$, is satisfied.
%\end{lemma}
%\begin{proof}
%Since $u^+ = P_{C}(u-\gamma \nabla F_{\beta}(u;y))$. Then
Sufficient optimality conditions for Program \eqref{prob:01} are also well understood in the literature~\cite{luenberger1984linear,rockafellar2009variational,mordukhovich2006variational,gfrerer2015complete}.
Indeed, $u$ is a local minimizer of Program \eqref{prob:01} if there exists $y$ for which
\notea{Why does above look different from sufficient cnds for Lagrangian? Suppose to be equivalent problems.}
%For simple, let us assume that $C = \menge{u}{g(u)\leq c}$ for some $\mathcal{C}^2$-function $g\colon\RR^d\to\RR$. In this section we assume that $h$ and $L$ are too $\mathcal{C}^2$-functions.
%Let $\overline{u}$ be a point such that $g(\overline{u})=c$ and $0\not\in\nabla g(\overline{u})$. Then, \cite[Proposition 10.3]{rockafellar2009variational} implies that
%Now suppose that the first oder optimality condition \eqref{e:inclu1} is satisfied for $\overline{u}$. Then there exists $\mu \geq 0$ and $y \in\RR^m$ such that
which implies that $b_0>-\infty$ whenever $(u_k)_{k\in\NN}$ or $\dom(f)$ is bounded.
\end{remark}
\subsection{Convergence}
In view of Lemma \ref{l:solv}, we need to estimate the gradient mapping $\|G_{\beta_{k},\gamma_{k}}(u_k,y_{k})\|$ as well as the feasibility gap $\|Lu_{k+1}-b\|^2$.
\begin{theorem}
\label{t:1}
Suppose that $b_0=\inf_{k\in\NN} \mathcal{L}_{\beta_{k}}(u_{k+1},y_{k}) > -\infty$ and that $z_0=\sum_{k=1}^\infty\frac{d_{k,s_k}}{(1+k)^{1+\epsilon_1}} <+\infty$,
where $s_k$ is the smallest index such that $\beta_{k,s_k} < c/(k+1)^{\alpha}$.
% \eqref{e:mapp2} and \eqref{e:feas2} follow directly
% from \eqref{e:mapp1} and \eqref{e:feas1}, respectively.
% \end{proof}
% \begin{corollary} \label{c:2}
% Under the same condition as in Theorem \ref{t:1}.
% The sequence $(F_{\beta_k}(u_k,y_{k}))_{k\in\NN}$ converges to a $F^\star$. Moreover,
% if $(\|y_{k-1}\| \sqrt{\beta_k})_{k\in\NN}$ and $(\beta_{k+1}/\beta_{k})_{k\in\NN}$ are bounded by $M$, then $(h(u_k))_{k\in\NN}$ converges to $F^\star$.
% \end{corollary}
% \begin{proof} Note that $(F_{\beta_k}(u_{k+1},y_k))_{n\in\NN}$ is bounded below. Moreover, the proof of Theorem \ref{t:1} show that
% which is \eqref{e:fr1} for $\overline{u} = u^*$ and $\overline{y} = y^*$. By assumption, $u^*$ is local minimum and
% $h(u_{n_k}) \to h(u^*)$. Therefore, $F^\star = h(u^*)$. Using Corollary \ref{c:2}, we get $h(u_k) \to h(\overline{u})$.
% \end{proof}
\section{Related Work \label{sec:related work}}
To the best of our knowledge, the proposed method is new and different from the existing methods in the literature.
As mentioned in the Introduction, our method is connected to the augmented Lagrangian method,
but it differs significantly: we perform
only one projected gradient step on the primal variable $u$, instead of fully minimizing the augmented Lagrangian function.
Furthermore, we update the penalty parameter $\beta$ adaptively to ensure that the feasibility gap decreases
faster than the gradient mapping.
In the case when $h=0$, a modification of
Chambolle-Pock's method is investigated in \cite{Valkonen14}, and a preconditioned ADMM in \cite{Matin17},
where the convergence of the iterates is proved under strong assumptions that are not fulfilled in our setting.
%\noindent{\bf Connection to Linearized Alternating Direction Method}.\\
ADMM is the classic method proposed for solving Problem~\eqref{prob:01} in the case where $L$ is a linear operator and $h$ is zero \cite{gabay1976dual}.
This method is an application of the Douglas-Rachford method to the dual problem \cite{Gabay83}.
One of the main drawbacks of ADMM is the appearance of the term $Lu$ in the update rule of $u_{k+1}$. To overcome this issue, several strategies have been suggested.
The first strategy, proposed in \cite{shefi2014rate} and refined in \cite{banert2016fixing}, is known as the alternating direction proximal method of multipliers.
The second strategy is to use a linearization technique \cite{lin2011linearized}.
We show here that our proposed method
is closely related to the updating rule of the linearized alternating direction method \cite{lin2011linearized}. Assume that $h\equiv0$ and $L$ is a linear operator.
which is a variant of the linearized ADMM \cite{lin2011linearized}.
%\noindent
%{\bf Connection to ALBUM3 in \cite{bolte2018nonconvex}}\\
Very recently, \cite{bolte2018nonconvex} proposed a framework for solving Problem~\eqref{prob:01} with $C=\RR^d$.
In particular, a special case of this work, ALBUM3 (Proximal Linearized Alternating Minimization),
is closely related to ours; however, their conditions are verifiable only when $L$ is linear. Moreover,
unlike our update of $\beta_{k}$, the one in \cite{bolte2018nonconvex} depends on the smallest eigenvalue of $L^*L$. For nonlinear $L$, the application of their method remains a challenge.
%\noindent{\bf Connection to the deflected subgradient method}\\
The deflected subgradient method investigated in \cite{burachik2010deflected} can be used to solve a special case of Problem~\eqref{prob:01} for
a compact subset $\mathcal{C}$ of $\mathcal{X}$. The basic step of the deflected subgradient method solves: given $\beta, v$,
where $\boldsymbol{\sigma}$ is a continuous penalty function such as $\|\cdot\|$, and $K$ is a bounded linear operator. In general, there is no
closed-form expression for $u^*$ since it does not split $f$, $h$, and $L$ individually. Hence, the deflected subgradient method is hard to implement. This is also a common drawback of the classic penalty method and its related works
\cite{gasimov2002augmented,burachik2010primal}.
\section{Numerical experiments}
\subsection{Hanging chain}
%\begin{thebibliography}{}
%
% and use \bibitem to create references. Consult the Instructions
\section{Proof of Lemma \ref{lem:11}\label{sec:proof of smoothness}}
Note that \eqref{e:deslem} follows immediately from an application of \cite[Lemma 3.2, Remark 3.2(i)]{bolte2014proximal}. It only remains to compute the smoothness parameter of $\mathcal{L}_\beta(\cdot,y)$, namely, $\lambda_\beta$. To that end, note that
For an integer $k_0$ to be set later, suppose that $k\ge k_0$.
In order to apply Lemma \ref{lem:11} in every iteration, we set $\gamma_k$ to be the output of the line search subroutine in Lemma \ref{lem:eval Lipsc cte}.
%assume that
%\begin{align}
%\gamma_k \le \ol{\gamma}_k,
%\qquad \forall k \ge k_0,
%\label{eq:smoothness emp}
%\end{align}
%We can make the assumption in \eqref{eq:smoothness emp} more concrete as follows. As a consequence of (\ref{eq:smoothness of Lagrangian},\ref{eq:moreover}), note that
where, in the last line above, we used the inequality $2ab\le ca^2+ c^{-1}b^2$ for scalars $a,b$ and $c> 0$. In \eqref{eq:long chain}, we also assumed that
%That is, Volkan's assumption \eqref{eq:assumption} successfully bounds both the gradient mapping and the feasibility gap. One question is the interplay between $\{\beta_k,\gamma_k\}_k$ to ensure the validity of Volkan's assumption, which feels like some sort of \emph{uncertainty principle}.
Note that \eqref{eq:raw} bounds the gradient mapping with the feasibility gap. We next find a converse, thus bounding the feasibility gap with the gradient mapping. To that end, let $T_C(u)$ and $P_{T_C(u)}$ be the tangent cone of $C$ at $u\in C$ and orthogonal projection onto this subspace, respectively. Likewise, let $N_C(u)$ and $P_{N_{C}(u)}$ be the normal cone of $C$ at $u$ and the corresponding orthogonal projection.
The update rule for $u_k$ in \eqref{eq:update uk recall} immediately implies that
Roughly speaking, \eqref{eq:bnd on Ak final} states that the feasibility gap is itself bounded by the gradient map. In order to apply Lemma \ref{lem:bnd bnd Ak}, let us assume that there exists $k_0$ such that (\ref{eq:good neighb},\ref{eq:cnd for well cnd}) hold. Therefore, Lemma \ref{lem:bnd bnd Ak} is in force with such $k_0$. After taking $K\ge k_0$, we may now substitute \eqref{eq:bnd on Ak final} back into \eqref{eq:raw} to find that
%where the infinite series above converges, thanks to \eqref{eq:cnd on sigma gamma}.
%By our assumption in \eqref{eq:cnd on sigma gamma}, the series in the last line above is convergent and we find a final bound on the gradient mapping, namely
%In particular, if the dual sequence $\{y_k\}$ converges, we can derive a convergence rate. We can take $\sigma_k = k^{-\frac{3}{2}-\epsilon}$ and $\gamma_k = k^{-\frac{1}{2}-2\epsilon}$ and $\beta_k = \beta$. Then
%Our argument is as follows. We select sequences $\{\beta_k,\sigma_k,\gamma_k\}_k$ that satisfy \eqref{eq:recall 2nd major cnds}.
%Then we multiply both sides of \eqref{eq:recall smoothness cnd on gamma} by $\sqrt{k}\sigma_k$ and apply the third condition in \eqref{eq:recall 2nd major cnds} to find that
where $\wedge$ stands for minimum. We impose the following geometric requirements on the constraints. Let $P_{T_C(u)}$ and $P_{N_C(u)}$ denote the orthogonal projection onto the tangent and normal cones at $u\in C$, respectively.
Consider a subspace $S_{k_0}\subseteq\mathbb{R}^d$ such that
\begin{align}
S_{k_0}\supseteq\bigcup_{k\ge k_0} T_C(u_k),
\end{align}
and, with some abuse of notation, let $S_{k_0}$ also denote an orthonormal basis for this subspace. For $\eta_{\min},\rho>0$, we assume that the nonconvex Slater's condition holds, namely,
where $\partial C$ is the boundary of $C$, and $\eta_{\min}$ returns the smallest singular value.
We say that Program \eqref{prob:01} satisfies the Slater's condition if $\psi_{A,C}>0$.
\end{definition}
As a sanity check, we have the following result.
\begin{proposition}\label{prop:convex}
The nonconvex Slater's condition for Program (\ref{prob:01}) implies the standard Slater's condition when $A$ is a linear operator and Program (\ref{prob:01}) is feasible.
\end{proposition}
\begin{proof}
Suppose that the standard Slater's condition does not hold, namely, that
\begin{equation}
\relint(\Null(A) \cap C) = \Null(A)\cap\relint(C) = \emptyset.
\label{eq:no feas}
\end{equation}
Since Program \eqref{prob:01} is feasible, there exists a feasible $u$, namely, $Au=0$ and $u\in C$. By \eqref{eq:no feas}, it must be that $u\in\partial C$ and that $\Null(A)$ supports $C$ at $u$\textbf{Why?}. In particular, it follows that $\Null(A)\cap T_C(u)\ne\{0\}$ or, equivalently,
$\row(A)\cap N_C(u)\ne\{0\}$. That is, there exists a unit-norm vector $v$ such that
\begin{align}
P_{T_C(u)}A^\top v=0,
\label{eq:existence}
\end{align}
and consequently
\begin{align}
\eta_{\min}(P_{T_C(u)}A^\top) = 0.
\end{align}
Because $\eta_{\min}(P_{T_C(u)}A^\top)$ is a continuous function of $u$\textbf{(Why?)}, we conclude that $\psi_{A,C}=0$, namely, the nonconvex Slater's condition also does not hold, thereby completing the proof of Proposition \ref{prop:convex}.
%
%Consider now the curve $g: [0,1]\rightarrow \partial C$ passing through $u$ with the direction of $A^\dagger v$, namely,
%\begin{equation}
%g(0) = u,
%\qquad
%\frac{dg}{dt}(0) = \lim_{t\rightarrow0^+}\frac{g(t)-g(0)}{t} = (A^\dagger)^\top v.
%\end{equation}
%\textbf{Does the above curve exist even when $C$ is not smooth?} Then note that
%\paragraph{\textbf{Intuition in the convex case}}
%We give some qualitative discussion about the last condition in \eqref{eq:conditions} here. Since this condition only pertains the feasibility, we consider the convex feasibility program
%\begin{equation}
%\begin{cases}
%\min 0\\
%A u = 0\\
%u\in C.
%\end{cases}
%\end{equation}
%Then consider the algorithm
%\begin{equation}
%u_{k+1} = P_C (u_k - \gamma A^T Au_ k ).
%\end{equation}
%To build intuition, assume that $A$ has orthonormal columns. Then, if $\gamma=1$, the algorithm above is an instance of the alternative projection algorithm. To further simplify the setup, let us assume that $C$ is the null space of a matrix$B$, namely,
%\begin{equation}
%\begin{cases}
%\min 0\\
%A u = 0\\
%B u= 0.
%\end{cases}
%\end{equation}
%Our algorithm simplifies to
%\begin{equation}
%u_{k+1} = (I - B^T B) (I - A^T A ) u_k.
%\end{equation}
%In this exposition, we assume that $\text{dim}(\text{null}(A))+\text{dim}(\text{null}(B))= d$ for simplicity. All these assumptions are for the sake of simplicity of our argument and can be relaxed.
%The convergence is dictated by the corresponding spectral norm:
%where $\eta_{\min}$ returns the smallest singular value. The term on the far left is exactly $ \eta_{\min}$ in the condition \eqref{eq:conditions}. As $ \eta_{\min} $ reduces, the convergence rate slows down. This is also easy to visualize with two lines in the plane. \textbf{To summarize, ignoring the objective function $h$, we can think of our algorithm as alternative projections in the convex setting. There, the convergence rate is exactly determined with the $\eta_{\min}$ in \eqref{eq:conditions}.}
\section{Proof of Lemma \ref{lem:bnd bnd Ak}}
If $A(u_k)=b$, then \eqref{eq:bnd on Ak final} holds trivially. Otherwise,
%we show that $P_{T_C(u_{k+1})} DA(u_k)^\top$ is a well-conditioned matrix, in order to lower bound \eqref{eq:bnd on Ak raw}. More specifically,
for an integer $k_0$, let
\begin{align}
S _{k_0}= \bigcup_{k\ge k_0} T_{C}(u_k),
\label{eq:defn of S}
\end{align}
and let $S_{k_0}$ with orthonormal columns denote a basis for this subspace, with some abuse of notation. We then assume that
\begin{align}
0 < \eta_{\min} :=
\begin{cases}
\min_u \,\left\| S_{k_0}^\top P_{T_C(u)} ( DA(u)^\top v ) \right\|\\
\|v\|=1\\
\|A(u)-b\|\le\rho\\
u\in C.
\end{cases}
\label{eq:new slater proof}
\end{align}
If $\sup_{k\ge k_0} \|A(u_k)-b\|\le\rho\in[0,\infty]$,
then \eqref{eq:new slater proof} is in force and, for every $k\ge k_0$, we may write that
all the conditions in Theorem \ref{t:1} are satisfied. Furthermore,
Suppose that for some $k_0$, $(u_{k})_{k\geq k_0} \subset B(\overline{u};\epsilon)$ and $(y_{k})_{k\geq k_0} \subset B(\overline{y};\epsilon)$, and
\eqref{e:maic} is satisfied for every $\beta\in(\beta_k)_{k\geq k_0}$. If $\rho_1 > 0$ or $\rho_2 > 0$, then $u_{k}\to\overline{u}$ or $y_k \to\overline{y}$.
\end{theorem}
\begin{proof} Without loss of generality, assume that $k_0=0$.
It follows from Theorem \ref{t:1} that $\gamma_k\|(u_{k+1}-u_k)/\gamma_k\|^2\to0$. Since $(\gamma_k)_{k\in\NN}$
is bounded below by $\underline{\gamma}$, we obtain $G_{\beta_k,\gamma_k}(u_k)\to0$.
Since $\beta_{k+1}^{-1}\|Lu_{k+1}-b\|^2\to0$ and $(\beta_{k})_{k\in\NN}$ is bounded above by $\overline{\beta}=c$,
we get $\|Lu_{k+1}-b\|^2\to0$. Since $L\overline{u} =b$, we can rewrite \eqref{e:fr1} as $-\nabla F_{\beta}(\overline{u},\overline{y})\in N_{C}(\overline{u})$. Hence