
%!TEX root = ../main.tex
\section{Optimality Conditions}
\label{sec:opt_cnds}
First-order necessary optimality conditions for \eqref{prob:01} are well-studied. Indeed, $x\in \RR^d$ is a first-order stationary point of~\eqref{prob:01} if there exists $y\in \RR^m$ such that
%
%%% I am cutting this to save space. put it back if necessary (VC)
%\begin{align}
%%\begin{cases}
%-\nabla f(x) - DA(x)^\top y \in \partial g(x), \qquad \,\, A(x) = 0,
%%\end{cases}
%\label{e:inclu1}
%\end{align}
%
%where $DA(x)$ is the Jacobian of $A$ at $x$. Recalling \eqref{eq:Lagrangian}, we observe that \eqref{e:inclu1} is equivalent to
\begin{align}
%\begin{cases}
-\nabla_x \mathcal{L}_\beta(x,y) \in \partial g(x),\qquad A(x) = 0,
%\end{cases}
\label{e:inclu2}
\end{align}
which is in turn the necessary optimality condition for \eqref{eq:minmax}.
%Note that \eqref{e:inclu2} is equivalent to
%\begin{align}
%\left[
%\begin{array}{c}
%\nabla_x \L_\b(x,y) \\
%\nabla_y \L_\b(x,y)
%\end{array}
%\right]
%\in
%\left\{
%\left[
%\begin{array}{c}
% v\\
%0
%\end{array}
%\right]
%: v\in \partial g(x) \right\}
%,
%\end{align}
%which rephrases the stationarity condition in terms of the gradient of the duality gap of Program~\eqref{eq:Lagrangian}.
%\editf{mfs: check approx. optimality conditions, how they apply in this setting.}
Inspired by this, we say that $x$ is an $(\epsilon_f,\b)$ first-order stationary point of \eqref{eq:minmax} if there exists $y \in \RR^m$ such that
\begin{align}
%\begin{cases}
\dist(-\nabla_x \mathcal{L}_\beta(x,y), \partial g(x)) \leq \epsilon_f, \qquad \| A(x) \| \leq \epsilon_f,
%\end{cases}
\label{eq:inclu3app}
\end{align}
for $\epsilon_f\ge 0$.
In light of \eqref{eq:inclu3app}, a metric for evaluating the stationarity of a pair $(x,y)\in \RR^d\times \RR^m$ is
\begin{align}
\dist\left(-\nabla_x \mathcal{L}_\beta(x,y), \partial g(x) \right) + \|A(x)\| ,
\label{eq:cvg metricapp}
\end{align}
which we use as the first-order stopping criterion.
%When $g=0$, it is also not difficult to verify that the expression in \eqref{eq:cvg metric} is the norm of the gradient of the duality gap in \eqref{eq:minmax}.
As an example, for a convex set $\mathcal{X}\subset\RR^d$, suppose that $g = \delta_\mathcal{X}$ is the indicator function on $\mathcal{X}$.
Let $T_\mathcal{X}(x) \subseteq \RR^d$ denote the tangent cone to $\mathcal{X}$ at $x$, and let $P_{T_\mathcal{X}(x)}:\RR^d\rightarrow\RR^d$ denote the orthogonal projection onto this tangent cone. Then, for $u\in\RR^d$, it is not difficult to verify that
\begin{align}\label{eq:dist_subgrad}
\dist\left(u, \partial g(x) \right) = \| P_{T_\mathcal{X}(x)}(u) \|.
\end{align}
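For instance (an illustrative special case, not used in the sequel), take $\mathcal{X} = \RR^d_+$ to be the nonnegative orthant. For $x\in\RR^d_+$, the tangent cone is $T_\mathcal{X}(x) = \{u\in\RR^d : u_i \ge 0 \text{ whenever } x_i = 0\}$, so \eqref{eq:dist_subgrad} can be evaluated coordinate-wise as
\begin{align*}
\dist\left(u, \partial g(x) \right)^2 = \| P_{T_\mathcal{X}(x)}(u) \|^2 = \sum_{i \,:\, x_i > 0} u_i^2 + \sum_{i \,:\, x_i = 0} \max(u_i, 0)^2,
\end{align*}
and the first term of \eqref{eq:cvg metricapp} is then available in closed form.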
%
When $g = 0$, a first-order stationary point $x\in \RR^d$ of \eqref{prob:01} is also second-order stationary if
%
%\vspace{-.5cm}
\begin{equation}
\lambda_{\text{min}}(\nabla _{xx} \mathcal{L}_{\beta}(x,y))\ge 0,
\end{equation}
%\vspace{-.5cm}
%
where $\nabla_{xx}\mathcal{L}_\b$ is the Hessian of $\mathcal{L}_\b$ with respect to $x$, and $\lambda_{\text{min}}(\cdot)$ returns the smallest eigenvalue of its argument.
Analogously, $x$ is an $(\epsilon_f, \epsilon_s,\b)$ second-order stationary point if, in addition to \eqref{eq:inclu3}, it holds that
%
%\vspace{-.5cm}
\begin{equation}\label{eq:sec_opt}
\lambda_{\text{min}}(\nabla _{xx} \mathcal{L}_{\beta}(x,y)) \ge -\epsilon_s,
\end{equation}
for $\epsilon_s\ge 0$.
%\vspace{-.5cm}
%
Naturally, for second-order stationarity, we use $\lambda_{\text{min}}(\nabla _{xx} \mathcal{L}_{\beta}(x,y))$ as the stopping criterion.
\section{Total Complexity Results}
\subsection{First-Order Optimality \label{sec:first-o-opt}}
%\textbf{AE: we go back and forth between "subroutine" and "solver". for consistency, I'm just using "solver" everywhere.}
Let us first consider the case where the solver in Step~2 is the first-order algorithm APGM, described in detail in~\cite{ghadimi2016accelerated}. At a high level,
APGM makes use of $\nabla_x \mathcal{L}_{\beta}(x,y)$ in \eqref{eq:Lagrangian}, the proximal operator $\text{prox}_g$, and the classical Nesterov acceleration~\cite{nesterov1983method} to reach first-order stationarity for the subproblem in~\eqref{e:exac}.
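To make this concrete, a prototypical accelerated proximal-gradient iteration for the subproblem in~\eqref{e:exac}, written with step size $1/\lambda_\b$ (the Lipschitz constant of $\nabla_x \mathcal{L}_\b(\cdot,y)$), takes the form
\begin{align*}
z^{t} = x^{t} + \alpha_t \left( x^{t} - x^{t-1} \right),
\qquad
x^{t+1} = \text{prox}_{g/\lambda_\b} \left( z^{t} - \frac{1}{\lambda_\b} \nabla_x \mathcal{L}_\b(z^{t}, y) \right),
\end{align*}
where $\alpha_t\in[0,1)$ is a momentum parameter and $\text{prox}_{g/\lambda_\b}(v) \in \mathop{\mathrm{arg\,min}}_u \big( g(u) + \tfrac{\lambda_\b}{2}\|u-v\|^2 \big)$. This display is only a sketch; the precise momentum and step-size schedule of~\cite{ghadimi2016accelerated} for the nonconvex setting differs in its details.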
% \textbf{AE: Ahmet to give a brief description of APGM here. }
%First, we characterize the iteration complexity of Algorithm~\ref{Algo:2} for finding a first-order stationary point of~\eqref{prob:01}.
%We propose to use the standard accelerated proximal method (APGM), guarantees of which are analyzed in~\cite{ghadimi2016accelerated} for nonconvex problems of the form~\eqref{e:exac}.
Suppose that $g=\delta_\mathcal{X}$ is the indicator function on a bounded convex set $\mathcal{X}\subset \RR^d$ and let
\begin{align}
\rho= \max_{x\in \mathcal{X}}\|x\|,
\end{align}
be the radius of the smallest ball centered at the origin that contains $\mathcal{X}$.
Then, adapting the results in~\cite{ghadimi2016accelerated} to our setup, APGM reaches $x_{k+1}$ in Step 2 of Algorithm~\ref{Algo:2} after
\begin{equation}
\mathcal{O}\left ( \frac{\lambda_{\beta_{k}}^2 \rho^{2} }{\epsilon_{k+1}} \right)
\label{eq:iter_1storder}
\end{equation}
(inner) iterations, where $\lambda_{\beta_{k}}$ denotes the Lipschitz constant of $\nabla_x \mathcal{L}_{\beta_{k}}(x, y)$, bounded in~\eqref{eq:smoothness of Lagrangian}. For clarity of presentation, we have used a looser bound in \eqref{eq:iter_1storder} than the one in~\cite{ghadimi2016accelerated}.
Using \eqref{eq:iter_1storder}, we derive the following corollary, describing the total iteration complexity of Algorithm~\ref{Algo:2} in terms of the number of calls made to the first-order oracle in APGM. %\textbf{AE: we haven't talked about oracle before.}
\begin{corollary}\label{cor:first_supp}
For $b>1$, let $\beta_k = b^k$ for every $k$. If we use APGM from~\cite{ghadimi2016accelerated} for Step~2 of Algorithm~\ref{Algo:2}, the algorithm finds an $(\epsilon_f,\b_k)$ first-order stationary point of \eqref{eq:minmax} after $T$ calls to the first-order oracle, where
%
\begin{equation}
T = \mathcal{O}\left( \frac{Q^3 \rho^2}{\epsilon^{3}}\log_b{\left( \frac{Q}{\epsilon} \right)} \right) = \tilde{\mathcal{O}}\left( \frac{Q^{3} \rho^2}{\epsilon^{3}} \right).
\end{equation}
\end{corollary}
\begin{proof}
Let $K$ denote the number of (outer) iterations of Algorithm~\ref{Algo:2} and let $\epsilon_{f}$ denote the desired accuracy of Algorithm~\ref{Algo:2}; see~(\ref{eq:inclu3}). Recalling Theorem~\ref{thm:main}, we can then write that
\begin{equation}
\epsilon_{f} = \frac{Q}{\b_{K}},
\label{eq:acc_to_b}
\end{equation}
or, equivalently, $\b_{K} = Q/\epsilon_{f}$.
%where $K$ denotes the last outer iterate and $\epsilon$ is the final accuracy we would like to get for the optimality conditions given in~\eqref{eq:inclu3}.
We now count the total number $T$ of (inner) iterations of Algorithm~\ref{Algo:2} needed to reach the accuracy $\epsilon_{f}$. Recall from \eqref{eq:smoothness of Lagrangian} that, for sufficiently large $k$, the smoothness parameter of the augmented Lagrangian satisfies $\lambda_{\b_k}\le \lambda'' \b_k$. Then, using \eqref{eq:iter_1storder} and summing over the outer iterations, we bound $T$ as
\begin{align}\label{eq: tk_bound}
T &= \sum_{k=1}^K\mathcal{O}\left ( \frac{\lambda_{\beta_{k-1}}^2 \rho^2 }{\epsilon_k} \right) \nonumber\\
& = \sum_{k=1}^K\mathcal{O}\left (\beta_{k-1}^3 \rho^2 \right)
\qquad \text{(Step 1 of Algorithm \ref{Algo:2})}
\nonumber\\
& \leq \mathcal{O} \left(K\beta_{K-1}^3 \rho^2 \right)
\qquad \left( \{\b_k\}_k \text{ is increasing} \right)
\nonumber\\
& \le \mathcal{O}\left( \frac{K Q^{{3}} \rho^2}{\epsilon_{f}^{{3}}} \right).
\qquad \text{(see \eqref{eq:acc_to_b})}
\end{align}
In addition, if we specify $\beta_k=b^k$ for all $k$, we can further refine $T$. Indeed,
\begin{equation}
\beta_K = b^K~~ \Longrightarrow~~ K = \log_b \left( \frac{Q}{\epsilon_f} \right),
\end{equation}
which, after substitution into~\eqref{eq: tk_bound}, gives the final bound in Corollary~\ref{cor:first}.
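Explicitly,
\begin{equation*}
T \le \mathcal{O}\left( \frac{Q^{3} \rho^2}{\epsilon_{f}^{3}} \log_b\left( \frac{Q}{\epsilon_f} \right) \right)
= \tilde{\mathcal{O}}\left( \frac{Q^{3} \rho^2}{\epsilon_{f}^{3}} \right),
\end{equation*}
which matches the statement with $\epsilon = \epsilon_f$.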
\end{proof}
%\textbf{AE: if we find some time, i'll add a couple of lines for how 1st order opt implies 2nd order opt for smooth constraints.}
\subsection{Second-Order Optimality \label{sec:second-o-opt}}
%{\color{red}{Ahmet (note to myself): not sure of the constants of trust-region, check again}} \\
Let us now consider the second-order optimality case, where the solver in Step~2 is the trust region method developed in~\cite{cartis2012complexity}.
The trust region method minimizes a quadratic approximation of the objective within a dynamically updated trust-region radius.
The second-order trust region method that we consider in this section makes use of the Hessian (or an approximation of the Hessian) of the augmented Lagrangian, in addition to first-order oracles.
As shown in~\cite{nouiehed2018convergence}, finding approximate second-order stationary points of convex-constrained problems is in general NP-hard. For this reason, we focus in this section on the special case of~\eqref{prob:01} with $g=0$.
%We first give a theorem to show the convergence rate of the algorithm for second order stationarity: \\
%{\color{red}{Ahmet: I think that it is possible to remove sufficiently large k0 assumption here. }} \textbf{AE: not worth it really}
%\begin{corollary}
%\label{thm:main_second}
%Under the assumptions of Theorem~\ref{thm:main}, the output of Algorithm~\ref{Algo:2} satisfies
%\begin{align}
%%& \dist^2( -\nabla_x \L_{\b_{k-1}}(x_k,y_k),\partial g(x_k)) + \| A(x_k)\|^2 \nonumber\\
%%& = \frac{1}{\b_{k-1}^2}\left( \frac{8 \lambda'_A\s_k^2 + 4}{ \nu^2} ( \lambda'_f+\lambda'_A y_{\max})^2 + 2\right) \nonumber \\
%%&=: \frac{Q}{\beta_{k-1}^2}.
%\lambda_{\text{min}}(\nabla _{xx}\mathcal{L}_{\beta_{k-1}}(x_k, y_k)) \geq - \frac{C}{\beta_{k-1}} - \epsilon_{k-1}.
%\end{align}
%\end{corollary}
%
%We consider \textbf{AE: Ahmet to add a brief description of the algorithm.}
Let us compute the total computational complexity of Algorithm~\ref{Algo:2} with the trust region method in Step~2, in terms of the number of calls made to the second-order oracle. By adapting the result in~\cite{cartis2012complexity} to our setup, we find that the number of (inner) iterations required in Step~2 of Algorithm~\ref{Algo:2} to produce $x_{k+1}$ is
%
\begin{equation}
\mathcal{O}\left ( \frac{\lambda_{\beta_{k}, H}^2 (\mathcal{L}_{\beta_{k}}(x_1, y) - \min_{x}\mathcal{L}_{\beta_k}(x, y))}{\epsilon_k^3} \right),
\label{eq:sec_inn_comp}
\end{equation}
%
where $\lambda_{\beta_k, H}$ is the Lipschitz constant of the Hessian of the augmented Lagrangian, which is of the order of $\beta_k$, as can be proven similarly to Lemma~\ref{lem:smoothness}, and $x_1$ is the initial iterate of the given outer loop.
In~\cite{cartis2012complexity}, the term $\mathcal{L}_{\beta_k}(x_1, y) - \min_{x}\mathcal{L}_{\beta_k}(x, y)$ is bounded by a constant independent of $\epsilon_k$.
We assume a uniform bound on this quantity for every $\beta_k$, instead of for a single value of $\beta_k$ as in~\cite{cartis2012complexity}. Using \eqref{eq:sec_inn_comp} and Theorem~\ref{thm:main}, we arrive at the following result; the proof follows the same counting argument as Corollary~\ref{cor:first}, and a sketch is given after the statement.
%
\begin{corollary}\label{cor:second_supp}
For $b>1$, let $\beta_k =b^k $ for every $k$.
We assume that
\begin{equation}
\mathcal{L}_{\beta}(x_1, y) - \min_{x}\mathcal{L}_{\beta}(x, y) \leq L_{u},\qquad \forall \beta.
\end{equation}
If we use the trust region method from~\cite{cartis2012complexity} for Step~2 of Algorithm~\ref{Algo:2}, the algorithm finds an $\epsilon$-second-order stationary point of~\eqref{prob:01} in $T$ calls to the second-order oracle, where
%
\begin{equation}
T = \mathcal{O}\left( \frac{L_u Q'^{5}}{\epsilon^{5}} \log_b{\left( \frac{Q'}{\epsilon} \right)} \right) = \widetilde{\mathcal{O}}\left( \frac{L_u Q'^{5}}{\epsilon^{5}} \right).
\end{equation}
\end{corollary}
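For completeness, here is a sketch of the counting argument behind Corollary~\ref{cor:second_supp}, mirroring the proof of Corollary~\ref{cor:first_supp} above: using $\lambda_{\b_{k-1}, H} = \mathcal{O}(\b_{k-1})$, $\epsilon_k = 1/\b_{k-1}$ (Step 1 of Algorithm~\ref{Algo:2}), and the uniform bound $L_u$, we have
\begin{align*}
T = \sum_{k=1}^K \mathcal{O}\left( \frac{\lambda_{\b_{k-1}, H}^2 \, L_u}{\epsilon_k^3} \right)
= \sum_{k=1}^K \mathcal{O}\left( \b_{k-1}^{5} L_u \right)
\le \mathcal{O}\left( K \b_{K-1}^{5} L_u \right)
\le \mathcal{O}\left( \frac{K L_u Q'^{5}}{\epsilon^{5}} \right),
\end{align*}
where the last step uses $\b_K = Q'/\epsilon$ from Theorem~\ref{thm:main}, with $Q'$ denoting the corresponding constant for the second-order guarantee there. With $\b_k = b^k$, we have $K = \log_b(Q'/\epsilon)$, which gives the stated bound.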
%
%\notea{I can't remember: what is $Q'$ and why isn't it defined?}
Before closing this section, we note that the remark after Corollary~\ref{cor:first} applies here as well.
\subsection{Approximate Optimality of \eqref{prob:01}}
%{\paragraph{Approximate optimality of \eqref{prob:01}. }
Corollary \ref{cor:first} establishes the iteration complexity of Algorithm~\ref{Algo:2} to reach approximate first-order stationarity for the equivalent formulation of \eqref{prob:01} presented in \eqref{eq:minmax}. Unlike the exact case, approximate first-order stationarity in \eqref{eq:minmax} does not immediately translate into approximate stationarity in \eqref{prob:01}. The study of approximate stationarity for the penalized problem (a special case of our setting with the dual variable set to $0$) also has precedent in~\cite{bhojanapalli2018smoothed}. %\textbf{AE: I think the reference is wrong... It's important to mention some precedent for what we do here to strengthen our argument.}
However, it is not difficult to verify that, with the more aggressive regime $\epsilon_{k+1}=1/\b_k^2$ in Step 1 of Algorithm~\ref{Algo:2}, one can achieve $\epsilon$-first-order stationarity for \eqref{prob:01} with an iteration complexity of $T=\tilde{O}(Q^3\rho^2/\epsilon^6)$ in Corollary~\ref{cor:first}.
Note that this conversion relies on a naive computation with loose bounds, rather than on duality arguments that would yield a tight conversion.
For a precedent in convex optimization that relates convergence of the augmented Lagrangian to the constrained problem via duality, see~\cite{tran2018smooth}.
For the second-order case, it is in general not possible to establish approximate second-order optimality for \eqref{eq:minmax} from Corollary~\ref{cor:second}, with the exception of linear constraints.
\section{Proof of Theorem \ref{thm:main} \label{sec:theory}}
For every $k\ge2$, recall from (\ref{eq:Lagrangian}) and Step~2 of Algorithm~\ref{Algo:2} that $x_{k}$ satisfies
\begin{align}
& \dist(-\nabla f(x_k) - DA(x_k)^\top y_{k-1} \nonumber\\
& \qquad - \b_{k-1} DA(x_{k})^\top A(x_k) ,\partial g(x_k) ) \nonumber\\
& = \dist(-\nabla_x \L_{\b_{k-1}} (x_k ,y_{k-1}) ,\partial g(x_k) ) \le \epsilon_{k}.
\end{align}
With an application of the triangle inequality, it follows that
\begin{align}
& \dist( -\b_{k-1} DA(x_k)^\top A(x_k) , \partial g(x_k) ) \nonumber\\
& \qquad \le \| \nabla f(x_k )\| + \| DA(x_k)^\top y_{k-1}\| +
\epsilon_k,
\end{align}
which in turn implies that
\begin{align}
& \dist( -DA(x_k)^\top A(x_k) , \partial g(x_k)/ \b_{k-1} ) \nonumber\\
& \le \frac{ \| \nabla f(x_k )\|}{\b_{k-1} } + \frac{\| DA(x_k)^\top y_{k-1}\|}{\b_{k-1} } +
\frac{\epsilon_k}{\b_{k-1} } \nonumber\\
& \le \frac{\lambda'_f+\lambda'_A \|y_{k-1}\|+\epsilon_k}{\b_{k-1}} ,
\label{eq:before_restriction}
\end{align}
where $\lambda'_f,\lambda'_A$ were defined in \eqref{eq:defn_restricted_lipsichtz}.
%For example, when $g=\delta_\mathcal{X}$ is the indicator , then \eqref{eq:before_restriction} reads as
%\begin{align}
%& \| P_{T_C(x_k)} DA(x_k)^\top A(x_k) \| \nonumber\\
%& \qquad \le \frac{ \lambda'_f+ \lambda'_A \|y_{k-1}\| + \epsilon_k }{\b_{k-1} } .
%\label{eq:before}
%\end{align}
We next translate \eqref{eq:before_restriction} into a bound on the feasibility gap $\|A(x_k)\|$. Using the regularity condition \eqref{eq:regularity}, the left-hand side of \eqref{eq:before_restriction} can be bounded below as
\begin{align}
& \dist( -DA(x_k)^\top A(x_k) , \partial g(x_k)/ \b_{k-1} ) \ge \nu \|A(x_k) \|.
\qquad \text{(see (\ref{eq:regularity}))}
\label{eq:restrited_pre}
\end{align}
%provided that $\rho$ satisfy
%\begin{align}
%\max_{k\in K} \|A(x_k)\| \le \rho.
%\label{eq:to_be_checked_later}
%\end{align}
By substituting \eqref{eq:restrited_pre} back into \eqref{eq:before_restriction}, we find that
\begin{align}
\|A(x_k)\| \le \frac{ \lambda'_f + \lambda'_A \|y_{k-1}\| + \epsilon_k}{\nu \b_{k-1} }.
\label{eq:before_dual_controlled}
\end{align}
In words, the feasibility gap is directly controlled by the dual sequence $\{y_k\}_k$. We next establish that the dual sequence is bounded. Indeed, for every $k\in K$, note that
\begin{align}
\|y_k\| & = \| y_0 + \sum_{i=1}^{k} \s_i A(x_i) \|
\quad \text{(Step 5 of Algorithm \ref{Algo:2})}
\nonumber\\
& \le \|y_0\|+ \sum_{i=1}^k \s_i \|A(x_i)\|
\qquad \text{(triangle inequality)} \nonumber\\
& \le \|y_0\|+ \sum_{i=1}^k \frac{ \|A(x_1)\| \log^2 2 }{ i \log^2(i+1)}
\quad \text{(Step 4)}
\nonumber\\
& \le \|y_0\|+ c \|A(x_1) \| \log^2 2 =: y_{\max},
\label{eq:dual growth}
\end{align}
where
\begin{align}
c \ge \sum_{i=1}^{\infty} \frac{1}{i \log^2 (i+1)}.
\end{align}
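Such a finite $c$ exists because the series converges; for instance, assuming $\log$ denotes the natural logarithm, comparison with an integral gives
\begin{align*}
\sum_{i=1}^{\infty} \frac{1}{i \log^2 (i+1)}
\le \frac{1}{\log^2 2} + \sum_{i=2}^{\infty} \frac{1}{i \log^2 i}
\le \frac{3}{2\log^2 2} + \int_{2}^{\infty} \frac{\mathrm{d}t}{t \log^2 t}
= \frac{3}{2\log^2 2} + \frac{1}{\log 2}.
\end{align*}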
Substituting \eqref{eq:dual growth} back into \eqref{eq:before_dual_controlled}, we reach
\begin{align}
\|A(x_k)\| & \le \frac{ \lambda'_f + \lambda'_A y_{\max} + \epsilon_k}{\nu \b_{k-1} } \nonumber\\
& \le \frac{ 2\lambda'_f +2 \lambda'_A y_{\max} }{\nu \b_{k-1} } ,
\label{eq:cvg metric part 2}
\end{align}
where the second line above holds if $k_0$ is large enough, which in turn guarantees that $\epsilon_k=1/\b_{k-1}$ is sufficiently small, since $\{\b_k\}_k$ is increasing and unbounded.
%Let us now revisit and simplify \eqref{eq:to_be_checked_later}.
%Note that $\rho'$ automatically satisfies the second inequality there, owing to Step~3 of Algorithm~\ref{Algo:2}.
%Note that $\rho$ satisfies \eqref{eq:to_be_checked_later} if
%\begin{align}
%\frac{\lambda'_f+\lambda'_A y_{\max}}{ \nu_A \b_1} \le \rho/2,
%\qquad \text{(see \eqref{eq:cvg metric part 2})}
%\end{align}
%provided that
%\begin{align}
%\beta_1 \ge \b_0 := \left(\lambda'_f + \lambda'_A y_{\max} \right)^{-1}.
%\label{eq:beta-0}
%\end{align}
%\begin{align}
%\|A(x_k)\| & \le \frac{ \lambda'_f + \lambda'_A y_{\max} + \epsilon_k}{\nu \b_{k-1} }
%\qquad \text{(see \eqref{eq:cvg metric part 2})}
%\nonumber\\
%& \le \frac{ 2( \lambda'_f + \lambda'_A y_{\max})}{\nu \b_1 }
%\qquad \text{(see \eqref{eq:dual growth})},
%\end{align}
%where the last line above holds when $k_0$ is large enough, because $\b_k \ge \b_1$ and $\lim_{k\rightarrow\infty} \epsilon_k=0$.
It remains to control the first term in \eqref{eq:cvg metric}. To that end, after recalling Step 2 of Algorithm~\ref{Algo:2} and applying the triangle inequality, we can write that
\begin{align}
& \dist( -\nabla_x \L_{\b_{k-1}} (x_k,y_{k}), \partial g(x_{k}) ) \nonumber\\
& \le \dist( -\nabla_x \L_{\b_{k-1}} (x_k,y_{k-1}) , \partial g(x_{k}) ) \nonumber\\
& + \| \nabla_x \L_{\b_{k-1}} (x_k,y_{k})-\nabla_x \L_{\b_{k-1}} (x_k,y_{k-1}) \|.
\label{eq:cvg metric part 1 brk down}
\end{align}
The first term on the right-hand side above is bounded by $\epsilon_k$, by Step 2 of Algorithm~\ref{Algo:2}.
For the second term on the right-hand side of \eqref{eq:cvg metric part 1 brk down}, we write that
\begin{align}
& \| \nabla_x \L_{\b_{k-1}} (x_k,y_{k})-\nabla_x \L_{\b_{k-1}} (x_k,y_{k-1}) \| \nonumber\\
& = \| DA(x_k)^\top (y_k - y_{k-1}) \|
\qquad \text{(see \eqref{eq:Lagrangian})}
\nonumber\\
& \le \lambda'_A \|y_k- y_{k-1}\|
\qquad \text{(see \eqref{eq:defn_restricted_lipsichtz})} \nonumber\\
& = \lambda'_A \s_k \|A (x_k) \|
\qquad \text{(see Step 5 of Algorithm \ref{Algo:2})} \nonumber\\
& \le \frac{2\lambda'_A \s_k }{\nu \b_{k-1} }( \lambda'_f+ \lambda'_Ay_{\max}) .
\qquad \text{(see \eqref{eq:cvg metric part 2})}
\label{eq:part_1_2}
\end{align}
By combining (\ref{eq:cvg metric part 1 brk down},\ref{eq:part_1_2}), we find that
\begin{align}
& \dist( -\nabla_x \L_{\b_{k-1}} (x_k,y_{k}), \partial g(x_{k}) ) \nonumber\\
& \le \frac{2\lambda'_A \s_k }{\nu \b_{k-1} }( \lambda'_f+ \lambda'_Ay_{\max}) + \epsilon_k.
\label{eq:cvg metric part 1}
\end{align}
By combining (\ref{eq:cvg metric part 2},\ref{eq:cvg metric part 1}), we find that
\begin{align}
& \dist( -\nabla_x \L_{\b_{k-1}}(x_k,y_k),\partial g(x_k)) + \| A(x_k)\| \nonumber\\
& \le \left( \frac{2\lambda'_A \s_k }{\nu \b_{k-1} }( \lambda'_f+ \lambda'_Ay_{\max}) + \epsilon_k \right) \nonumber\\
& \qquad + 2\left( \frac{ \lambda'_f + \lambda'_A y_{\max}}{\nu \b_{k-1} } \right).
\end{align}
Applying $\s_k\le \s_1$, we find that
\begin{align}
& \dist( -\nabla_x \L_{\b_{k-1}}(x_k,y_k),\partial g(x_k)) + \| A(x_k)\| \nonumber\\
& \le \frac{ 2\lambda'_A\s_1 + 2}{ \nu\b_{k-1}} ( \lambda'_f+\lambda'_A y_{\max}) + \epsilon_k.
\end{align}
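Since $\epsilon_k = 1/\b_{k-1}$ by Step 1 of Algorithm~\ref{Algo:2}, both terms above decay at the rate $1/\b_{k-1}$, and collecting them gives
\begin{align*}
& \dist( -\nabla_x \L_{\b_{k-1}}(x_k,y_k),\partial g(x_k)) + \| A(x_k)\| \\
& \le \frac{1}{\b_{k-1}} \left( \frac{(2\lambda'_A\s_1 + 2)( \lambda'_f+\lambda'_A y_{\max})}{\nu} + 1 \right),
\end{align*}
which establishes the first claim of Theorem~\ref{thm:main}, with $Q$ standing for a constant of the order of the bracketed factor.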
For the second part of the theorem, we use Weyl's inequality and Step 5 of Algorithm~\ref{Algo:2} to write
\begin{align}\label{eq:sec}
\lambda_{\text{min}} &(\nabla_{xx} \mathcal{L}_{\beta_{k-1}}(x_k, y_{k-1})) \geq \lambda_{\text{min}} (\nabla_{xx} \mathcal{L}_{\beta_{k-1}}(x_k, y_{k})) \notag \\&- \sigma_k \| \sum_{i=1}^m A_i(x_k) \nabla^2 A_i(x_k) \|.
\end{align}
The first term on the right-hand side is lower bounded by $-\epsilon_{k-1}$ by Step 2 of Algorithm~\ref{Algo:2}. We next bound the second term on the right-hand side above as
\begin{align*}
& \sigma_k \| \sum_{i=1}^m A_i(x_k) \nabla^2 A_i(x_k) \| \\
&\le \sigma_k \sum_{i=1}^m |A_i(x_k)| \, \| \nabla^2 A_i(x_k)\| \\
&\le \sigma_k \sqrt{m} \, \|A(x_k)\| \max_{i} \| \nabla^2 A_i(x_k)\| \\
&\le \sigma_k \sqrt{m} \lambda_A \frac{ 2\lambda'_f +2 \lambda'_A y_{\max} }{\nu \b_{k-1} },
\end{align*}
where the second inequality is the Cauchy--Schwarz inequality and the last inequality is due to~(\ref{eq:smoothness basic},\ref{eq:cvg metric part 2}).
Plugging into~\eqref{eq:sec} gives
\begin{align*}
& \lambda_{\text{min}}(\nabla_{xx} \mathcal{L}_{\beta_{k-1}}(x_k, y_{k-1}))\nonumber\\
& \geq -\epsilon_{k-1} - \sigma_k \sqrt{m} \lambda_A \frac{ 2\lambda'_f +2 \lambda'_A y_{\max} }{\nu \b_{k-1} },
\end{align*}
which completes the proof of Theorem \ref{thm:main}.
