\section{Proof of Theorem \ref{thm:main} \label{sec:theory}}
For every $k\ge2$, recall from (\ref{eq:Lagrangian}) and Step~2 of Algorithm~\ref{Algo:2} that $x_{k}$ satisfies
\begin{align}
& \dist(-\nabla f(x_k) - DA(x_k)^\top y_{k-1} \nonumber\\
& \qquad - \b_{k-1} DA(x_{k})^\top A(x_k) ,\partial g(x_k) ) \nonumber\\
& = \dist(-\nabla_x \L_{\b_{k-1}} (x_k ,y_{k-1}) ,\partial g(x_k) ) \le \epsilon_{k}.
\end{align}
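For ease of reference in what follows, we also recall the (assumed) standard form of the augmented Lagrangian in \eqref{eq:Lagrangian}, consistent with the gradient displayed above, with the nonsmooth term $g$ handled separately through its subdifferential:
\begin{align*}
\L_\b(x,y) &= f(x) + \langle A(x), y \rangle + \frac{\b}{2} \| A(x)\|^2, \\
\nabla_x \L_\b(x,y) &= \nabla f(x) + DA(x)^\top y + \b \, DA(x)^\top A(x).
\end{align*}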
With an application of the triangle inequality, it follows that
\begin{align}
& \dist( -\b_{k-1} DA(x_k)^\top A(x_k) , \partial g(x_k) ) \nonumber\\
& \qquad \le \| \nabla f(x_k )\| + \| DA(x_k)^\top y_{k-1}\| +
\epsilon_k,
\end{align}
which in turn implies that
\begin{align}
& \dist( -DA(x_k)^\top A(x_k) , \partial g(x_k)/ \b_{k-1} ) \nonumber\\
& \le \frac{ \| \nabla f(x_k )\|}{\b_{k-1} } + \frac{\| DA(x_k)^\top y_{k-1}\|}{\b_{k-1} } +
\frac{\epsilon_k}{\b_{k-1} } \nonumber\\
& \le \frac{\lambda'_f+\lambda'_A \|y_{k-1}\|+\epsilon_k}{\b_{k-1}} ,
\label{eq:before_restriction}
\end{align}
where $\lambda'_f,\lambda'_A$ were defined in \eqref{eq:defn_restricted_lipsichtz}.
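For completeness, the last two displays rely on the following elementary facts: for any nonempty set $S$, any points $u,v$, and any $\b>0$,
\begin{align*}
\dist(u+v, S) \le \dist(u,S) + \|v\|,
\qquad
\dist\left( \frac{u}{\b}, \frac{S}{\b} \right) = \frac{\dist(u,S)}{\b}.
\end{align*}
The first fact is applied with $S = \partial g(x_k)$, $u = -\nabla_x \L_{\b_{k-1}}(x_k,y_{k-1})$ and $v = \nabla f(x_k) + DA(x_k)^\top y_{k-1}$, and the second with $u = -\b_{k-1} DA(x_k)^\top A(x_k)$ and $\b = \b_{k-1}$.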
%For example, when $g=\delta_\mathcal{X}$ is the indicator , then \eqref{eq:before_restriction} reads as
%\begin{align}
%& \| P_{T_C(x_k)} DA(x_k)^\top A(x_k) \| \nonumber\\
%& \qquad \le \frac{ \lambda'_f+ \lambda'_A \|y_{k-1}\| + \epsilon_k }{\b_{k-1} } .
%\label{eq:before}
%\end{align}
We next translate \eqref{eq:before_restriction} into a bound on the feasibility gap $\|A(x_k)\|$. Using the regularity condition \eqref{eq:regularity}, the left-hand side of \eqref{eq:before_restriction} can be bounded below as
\begin{align}
& \dist( -DA(x_k)^\top A(x_k) , \partial g(x_k)/ \b_{k-1} ) \nonumber\\
& \ge \nu \|A(x_k) \|,
\qquad \text{(see (\ref{eq:regularity}))}
\label{eq:restrited_pre}
\end{align}
provided that $\rho,\rho'$ satisfy
\begin{align}
\max_{k\in K} \|A(x_k)\| \le \rho,
\qquad
\max_{k\in K} \|x_k\| \le \rho'.
\label{eq:to_be_checked_later}
\end{align}
By substituting \eqref{eq:restrited_pre} back into \eqref{eq:before_restriction}, we find that
\begin{align}
\|A(x_k)\| \le \frac{ \lambda'_f + \lambda'_A \|y_{k-1}\| + \epsilon_k}{\nu \b_{k-1} }.
\label{eq:before_dual_controlled}
\end{align}
In words, the feasibility gap is directly controlled by the dual sequence $\{y_k\}_k$. We next establish that the dual sequence is bounded. Indeed, for every $k\in K$, note that
\begin{align}
\|y_k\| & = \| y_0 + \sum_{i=1}^{k} \s_i A(x_i) \|
\quad \text{(Step 5 of Algorithm \ref{Algo:2})}
\nonumber\\
& \le \|y_0\|+ \sum_{i=1}^k \s_i \|A(x_i)\|
\qquad \text{(triangle inequality)} \nonumber\\
& \le \|y_0\|+ \sum_{i=1}^k \frac{ \|A(x_1)\| \log^2 2 }{ i \log^2(i+1)}
\quad \text{(Step 4)}
\nonumber\\
& \le \|y_0\|+ c \|A(x_1) \| \log^2 2 =: y_{\max},
\label{eq:dual growth}
\end{align}
where
\begin{align}
c \ge \sum_{i=1}^{\infty} \frac{1}{i \log^2 (i+1)}.
\end{align}
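Such a finite constant $c$ exists because the series above converges; one way to verify this is the integral test, using that $t \mapsto 1/(t \log^2(t+1))$ is decreasing on $[1,\infty)$ and that $t \ge (t+1)/2$ for $t \ge 1$:
\begin{align*}
\sum_{i=1}^{\infty} \frac{1}{i \log^2 (i+1)}
& \le \frac{1}{\log^2 2} + \int_{1}^{\infty} \frac{\mathrm{d}t}{t \log^2 (t+1)} \\
& \le \frac{1}{\log^2 2} + \int_{1}^{\infty} \frac{2 \, \mathrm{d}t}{(t+1) \log^2 (t+1)}
= \frac{1}{\log^2 2} + \frac{2}{\log 2} < \infty.
\end{align*}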
Substituting \eqref{eq:dual growth} back into \eqref{eq:before_dual_controlled}, we reach
\begin{align}
\|A(x_k)\| & \le \frac{ \lambda'_f + \lambda'_A y_{\max} + \epsilon_k}{\nu \b_{k-1} } \nonumber\\
& \le \frac{ 2\lambda'_f +2 \lambda'_A y_{\max} }{\nu \b_{k-1} } ,
\label{eq:cvg metric part 2}
\end{align}
where the second line above holds if $k_0$ is large enough, which in turn guarantees that $\epsilon_k=1/\b_{k-1}$ is sufficiently small, since $\{\b_k\}_k$ is increasing and unbounded.
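To make the requirement on $k_0$ concrete, note that with $\epsilon_k=1/\b_{k-1}$ the second line above holds for every $k$ such that
\begin{align*}
\epsilon_k = \frac{1}{\b_{k-1}} \le \lambda'_f + \lambda'_A y_{\max},
\qquad \text{i.e.,} \qquad
\b_{k-1} \ge \frac{1}{\lambda'_f + \lambda'_A y_{\max}},
\end{align*}
which is eventually the case because $\{\b_k\}_k$ is increasing and unbounded.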
Let us now revisit and simplify \eqref{eq:to_be_checked_later}. Note that the second inequality there holds automatically, owing to Step~3 of Algorithm~\ref{Algo:2}. Moreover, the first inequality in \eqref{eq:to_be_checked_later} holds if
\begin{align}
\frac{\lambda'_f+\lambda'_A y_{\max}}{ \nu \b_1} \le \rho/2,
\end{align}
and $k_0$ is large enough. Indeed, this claim follows directly from \eqref{eq:cvg metric part 2} and the fact that $\b_{k-1}\ge \b_1$.
%\begin{align}
%\|A(x_k)\| & \le \frac{ \lambda'_f + \lambda'_A y_{\max} + \epsilon_k}{\nu \b_{k-1} }
%\qquad \text{(see \eqref{eq:cvg metric part 2})}
%\nonumber\\
%& \le \frac{ 2( \lambda'_f + \lambda'_A y_{\max})}{\nu \b_1 }
%\qquad \text{(see \eqref{eq:dual growth})},
%\end{align}
%where the last line above holds when $k_0$ is large enough, because $\b_k \ge \b_1$ and $\lim_{k\rightarrow\infty} \epsilon_k=0$.
It remains to control the first term in \eqref{eq:cvg metric}. To that end, after recalling Step 2 of Algorithm~\ref{Algo:2} and applying the triangle inequality, we can write that
\begin{align}
& \dist( -\nabla_x \L_{\b_{k-1}} (x_k,y_{k}), \partial g(x_{k}) ) \nonumber\\
& \le \dist( -\nabla_x \L_{\b_{k-1}} (x_k,y_{k-1}) , \partial g(x_{k}) ) \nonumber\\
& + \| \nabla_x \L_{\b_{k-1}} (x_k,y_{k})-\nabla_x \L_{\b_{k-1}} (x_k,y_{k-1}) \|.
\label{eq:cvg metric part 1 brk down}
\end{align}
The first term on the right-hand side above is bounded by $\epsilon_k$, by Step 2 of Algorithm~\ref{Algo:2}.
For the second term on the right-hand side of \eqref{eq:cvg metric part 1 brk down}, we write that
\begin{align}
& \| \nabla_x \L_{\b_{k-1}} (x_k,y_{k})-\nabla_x \L_{\b_{k-1}} (x_k,y_{k-1}) \| \nonumber\\
& = \| DA(x_k)^\top (y_k - y_{k-1}) \|
\qquad \text{(see \eqref{eq:Lagrangian})}
\nonumber\\
& \le \lambda'_A \|y_k- y_{k-1}\|
\qquad \text{(see \eqref{eq:defn_restricted_lipsichtz})} \nonumber\\
& = \lambda'_A \s_k \|A (x_k) \|
\qquad \text{(see Step 5 of Algorithm \ref{Algo:2})} \nonumber\\
& \le \frac{2\lambda'_A \s_k }{\nu \b_{k-1} }( \lambda'_f+ \lambda'_Ay_{\max}) .
\qquad \text{(see \eqref{eq:cvg metric part 2})}
\label{eq:part_1_2}
\end{align}
By combining (\ref{eq:cvg metric part 1 brk down},\ref{eq:part_1_2}), we find that
\begin{align}
& \dist( -\nabla_x \L_{\b_{k-1}} (x_k,y_{k}), \partial g(x_{k}) ) \nonumber\\
& \le \frac{2\lambda'_A \s_k }{\nu \b_{k-1} }( \lambda'_f+ \lambda'_Ay_{\max}) + \epsilon_k.
\label{eq:cvg metric part 1}
\end{align}
By combining (\ref{eq:cvg metric part 1},\ref{eq:cvg metric part 2}), we find that
\begin{align}
& \dist^2( -\nabla_x \L_{\b_{k-1}}(x_k,y_k),\partial g(x_k)) + \| A(x_k)\|^2 \nonumber\\
& \le \left( \frac{2\lambda'_A \s_k }{\nu \b_{k-1} }( \lambda'_f+ \lambda'_Ay_{\max}) + \epsilon_k \right)^2 \nonumber\\
& \qquad + 4\left( \frac{ \lambda'_f + \lambda'_A y_{\max}}{\nu \b_{k-1} } \right)^2.
\end{align}
Applying the inequalities $(a+b)^2 \le 2a^2+2b^2$ and $\s_k\le \s_1$, we find that
\begin{align}
& \dist^2( -\nabla_x \L_{\b_{k-1}}(x_k,y_k),\partial g(x_k)) + \| A(x_k)\|^2 \nonumber\\
& \le \frac{8 {\lambda'_A}^2 \s_1^2 + 4}{ \nu^2\b_{k-1}^2} ( \lambda'_f+\lambda'_A y_{\max})^2 + 2\epsilon_k^2.
\end{align}
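In more detail, the squared term from two displays above is handled via $(a+b)^2 \le 2a^2+2b^2$ and $\s_k \le \s_1$:
\begin{align*}
\left( \frac{2\lambda'_A \s_k }{\nu \b_{k-1} }( \lambda'_f+ \lambda'_A y_{\max}) + \epsilon_k \right)^2
& \le \frac{8 {\lambda'_A}^2 \s_k^2 }{\nu^2 \b_{k-1}^2 }( \lambda'_f+ \lambda'_A y_{\max})^2 + 2\epsilon_k^2 \\
& \le \frac{8 {\lambda'_A}^2 \s_1^2 }{\nu^2 \b_{k-1}^2 }( \lambda'_f+ \lambda'_A y_{\max})^2 + 2\epsilon_k^2,
\end{align*}
which, together with the remaining term $4( \lambda'_f+ \lambda'_A y_{\max})^2/(\nu \b_{k-1})^2$, yields the constant $8 {\lambda'_A}^2 \s_1^2 + 4$ above.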
For the second part of the theorem, we use Weyl's inequality and Step 5 of Algorithm~\ref{Algo:2} to write
\begin{align}\label{eq:sec}
\lambda_{\text{min}} &(\nabla_{xx} \mathcal{L}_{\beta_{k-1}}(x_k, y_{k})) \geq \lambda_{\text{min}} (\nabla_{xx} \mathcal{L}_{\beta_{k-1}}(x_k, y_{k-1})) \notag \\&- \| \sum_{i=1}^m \nabla^2 A_i(x_k) \sigma_k A_i(x_k) \|.
\end{align}
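To justify \eqref{eq:sec}: under the (assumed) standard form of the augmented Lagrangian recalled earlier in this section, the two Hessians differ only through the dual variable, namely
\begin{align*}
& \nabla_{xx} \mathcal{L}_{\beta_{k-1}}(x_k, y_{k}) - \nabla_{xx} \mathcal{L}_{\beta_{k-1}}(x_k, y_{k-1}) \\
& \qquad = \sum_{i=1}^m (y_k - y_{k-1})_i \nabla^2 A_i(x_k)
= \sigma_k \sum_{i=1}^m A_i(x_k) \nabla^2 A_i(x_k),
\end{align*}
where the second identity uses the dual update $y_k = y_{k-1} + \s_k A(x_k)$ in Step 5, and \eqref{eq:sec} then follows from Weyl's inequality $\lambda_{\text{min}}(M+P) \ge \lambda_{\text{min}}(M) - \|P\|$.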
The first term on the right-hand side of \eqref{eq:sec} is bounded from below by $-\epsilon_{k-1}$, owing to Step 2 of Algorithm~\ref{Algo:2}; it remains to bound the second term there.
\begin{align*}
&\Big\| \sum_{i=1}^m \nabla^2 A_i(x_k) \sigma_k A_i(x_k) \Big\|
\le \sigma_k \max_{i} \| \nabla^2 A_i(x_k)\| \sum_{i=1}^m |A_i(x_k)| \\
&\le \sigma_k \sqrt{m} \max_{i} \| \nabla^2 A_i(x_k)\| \, \| A(x_k)\|
\le \sigma_k \sqrt{m} \, \lambda_A \, \frac{ 2\lambda'_f +2 \lambda'_A y_{\max} }{\nu \b_{k-1} },
\end{align*}
where the second inequality is the Cauchy--Schwarz bound $\sum_{i=1}^m |A_i(x_k)| \le \sqrt{m} \, \| A(x_k)\|$, and the last inequality is due to~\eqref{eq:cvg metric part 2} and~\eqref{eq:smoothness basic}.
Plugging this into~\eqref{eq:sec} gives
\begin{align*}
\lambda_{\text{min}} &(\nabla_{xx} \mathcal{L}_{\beta_{k-1}}(x_k, y_{k})) \\
&\geq -\epsilon_{k-1} - \sigma_k \sqrt{m} \, \lambda_A \, \frac{ 2\lambda'_f +2 \lambda'_A y_{\max} }{\nu \b_{k-1} }.
\end{align*}
This completes the proof of Theorem \ref{thm:main}.
