\section{Proof of Theorem \ref{thm:main} \label{sec:theory}}
For every $k\ge2$, recall from (\ref{eq:Lagrangian}) and Step~2 of Algorithm~\ref{Algo:2} that $x_{k}$ satisfies
\begin{align}
& \dist(-\nabla f(x_k) - DA(x_k)^\top y_{k-1} \nonumber\\
& \qquad - \b_{k-1} DA(x_{k})^\top A(x_k) ,\partial g(x_k) ) \nonumber\\
& = \dist(-\nabla_x \L_{\b_{k-1}} (x_k ,y_{k-1}) ,\partial g(x_k) ) \le \epsilon_{k}.
\end{align}
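For ease of reference in what follows, we also recall the (assumed) standard form of the augmented Lagrangian in \eqref{eq:Lagrangian}, consistent with the gradient displayed above, with the nonsmooth term $g$ handled separately through its subdifferential:
\begin{align*}
\L_\b(x,y) &= f(x) + \langle A(x), y \rangle + \frac{\b}{2} \| A(x)\|^2, \\
\nabla_x \L_\b(x,y) &= \nabla f(x) + DA(x)^\top y + \b \, DA(x)^\top A(x).
\end{align*}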
With an application of the triangle inequality, it follows that
\begin{align}
& \dist( -\b_{k-1} DA(x_k)^\top A(x_k) , \partial g(x_k) ) \nonumber\\
& \qquad \le \| \nabla f(x_k )\| + \| DA(x_k)^\top y_{k-1}\| +
\epsilon_k,
\end{align}
which in turn implies that
\begin{align}
& \dist( -DA(x_k)^\top A(x_k) , \partial g(x_k)/ \b_{k-1} ) \nonumber\\
& \le \frac{ \| \nabla f(x_k )\|}{\b_{k-1} } + \frac{\| DA(x_k)^\top y_{k-1}\|}{\b_{k-1} } +
\frac{\epsilon_k}{\b_{k-1} } \nonumber\\
& \le \frac{\lambda'_f+\lambda'_A \|y_{k-1}\|+\epsilon_k}{\b_{k-1}} ,
\label{eq:before_restriction}
\end{align}
where $\lambda'_f,\lambda'_A$ were defined in \eqref{eq:defn_restricted_lipsichtz}.
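For completeness, the last two displays rely on the following elementary facts: for any nonempty set $S$, any points $u,v$, and any $\b>0$,
\begin{align*}
\dist(u+v, S) \le \dist(u,S) + \|v\|,
\qquad
\dist\left( \frac{u}{\b}, \frac{S}{\b} \right) = \frac{\dist(u,S)}{\b}.
\end{align*}
The first fact is applied with $S = \partial g(x_k)$, $u = -\nabla_x \L_{\b_{k-1}}(x_k,y_{k-1})$ and $v = \nabla f(x_k) + DA(x_k)^\top y_{k-1}$, and the second with $u = -\b_{k-1} DA(x_k)^\top A(x_k)$ and $\b = \b_{k-1}$.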
%For example, when $g=\delta_\mathcal{X}$ is the indicator , then \eqref{eq:before_restriction} reads as
%\begin{align}
%& \| P_{T_C(x_k)} DA(x_k)^\top A(x_k) \| \nonumber\\
%& \qquad \le \frac{ \lambda'_f+ \lambda'_A \|y_{k-1}\| + \epsilon_k }{\b_{k-1} } .
%\label{eq:before}
%\end{align}
We next translate \eqref{eq:before_restriction} into a bound on the feasibility gap $\|A(x_k)\|$. Using the regularity condition \eqref{eq:regularity}, the left-hand side of \eqref{eq:before_restriction} can be bounded below as
\begin{align}
& \dist( -DA(x_k)^\top A(x_k) , \partial g(x_k)/ \b_{k-1} ) \nonumber\\
& \ge \nu \|A(x_k) \|,
\qquad \text{(see (\ref{eq:regularity}))}
\label{eq:restrited_pre}
\end{align}
provided that $\rho,\rho'$ satisfy
\begin{align}
\max_{k\in K} \|A(x_k)\| \le \rho,
\qquad
\max_{k\in K} \|x_k\| \le \rho'.
\label{eq:to_be_checked_later}
\end{align}
By substituting \eqref{eq:restrited_pre} back into \eqref{eq:before_restriction}, we find that
\begin{align}
\|A(x_k)\| \le \frac{ \lambda'_f + \lambda'_A \|y_{k-1}\| + \epsilon_k}{\nu \b_{k-1} }.
\label{eq:before_dual_controlled}
\end{align}
In words, the feasibility gap is directly controlled by the dual sequence $\{y_k\}_k$. We next establish that the dual sequence is bounded. Indeed, for every $k\in K$, note that
\begin{align}
\|y_k\| & = \| y_0 + \sum_{i=1}^{k} \s_i A(x_i) \|
\quad \text{(Step 5 of Algorithm \ref{Algo:2})}
\nonumber\\
& \le \|y_0\|+ \sum_{i=1}^k \s_i \|A(x_i)\|
\qquad \text{(triangle inequality)} \nonumber\\
& \le \|y_0\|+ \sum_{i=1}^k \frac{ \|A(x_1)\| \log^2 2 }{ i \log^2(i+1)}
\quad \text{(Step 4)}
\nonumber\\
& \le \|y_0\|+ c \|A(x_1) \| \log^2 2 =: y_{\max},
\label{eq:dual growth}
\end{align}
where
\begin{align}
c \ge \sum_{i=1}^{\infty} \frac{1}{i \log^2 (i+1)}.
\end{align}
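Such a finite constant $c$ exists because the series above converges; one way to verify this is the integral test, using that $t \mapsto 1/(t \log^2(t+1))$ is decreasing on $[1,\infty)$ and that $t \ge (t+1)/2$ for $t \ge 1$:
\begin{align*}
\sum_{i=1}^{\infty} \frac{1}{i \log^2 (i+1)}
& \le \frac{1}{\log^2 2} + \int_{1}^{\infty} \frac{\mathrm{d}t}{t \log^2 (t+1)} \\
& \le \frac{1}{\log^2 2} + \int_{1}^{\infty} \frac{2 \, \mathrm{d}t}{(t+1) \log^2 (t+1)}
= \frac{1}{\log^2 2} + \frac{2}{\log 2} < \infty.
\end{align*}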
Substituting \eqref{eq:dual growth} back into \eqref{eq:before_dual_controlled}, we reach
\begin{align}
\|A(x_k)\| & \le \frac{ \lambda'_f + \lambda'_A y_{\max} + \epsilon_k}{\nu \b_{k-1} } \nonumber\\
& \le \frac{ 2\lambda'_f +2 \lambda'_A y_{\max} }{\nu \b_{k-1} } ,
\label{eq:cvg metric part 2}
\end{align}
where the second line above holds if $k_0$ is large enough, which in turn guarantees that $\epsilon_k=1/\b_{k-1}$ is sufficiently small, since $\{\b_k\}_k$ is increasing and unbounded.
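To make the requirement on $k_0$ concrete, note that with $\epsilon_k=1/\b_{k-1}$ the second line above holds for every $k$ such that
\begin{align*}
\epsilon_k = \frac{1}{\b_{k-1}} \le \lambda'_f + \lambda'_A y_{\max},
\qquad \text{i.e.,} \qquad
\b_{k-1} \ge \frac{1}{\lambda'_f + \lambda'_A y_{\max}},
\end{align*}
which is eventually the case because $\{\b_k\}_k$ is increasing and unbounded.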
Let us now revisit and simplify \eqref{eq:to_be_checked_later}. Note that the second inequality there holds automatically, owing to Step~3 of Algorithm~\ref{Algo:2}. Moreover, the first inequality in \eqref{eq:to_be_checked_later} holds if
\begin{align}
\frac{\lambda'_f+\lambda'_A y_{\max}}{ \nu \b_1} \le \rho/2,
\end{align}
and $k_0$ is large enough. Indeed, this claim follows directly from \eqref{eq:cvg metric part 2} and the fact that $\b_{k-1}\ge \b_1$.
%\begin{align}
%\|A(x_k)\| & \le \frac{ \lambda'_f + \lambda'_A y_{\max} + \epsilon_k}{\nu \b_{k-1} }
%\qquad \text{(see \eqref{eq:cvg metric part 2})}
%\nonumber\\
%& \le \frac{ 2( \lambda'_f + \lambda'_A y_{\max})}{\nu \b_1 }
%\qquad \text{(see \eqref{eq:dual growth})},
%\end{align}
%where the last line above holds when $k_0$ is large enough, because $\b_k \ge \b_1$ and $\lim_{k\rightarrow\infty} \epsilon_k=0$.
It remains to control the first term in \eqref{eq:cvg metric}. To that end, after recalling Step 2 of Algorithm~\ref{Algo:2} and applying the triangle inequality, we can write that
\begin{align}
& \dist( -\nabla_x \L_{\b_{k-1}} (x_k,y_{k}), \partial g(x_{k}) ) \nonumber\\
& \le \dist( -\nabla_x \L_{\b_{k-1}} (x_k,y_{k-1}) , \partial g(x_{k}) ) \nonumber\\
& + \| \nabla_x \L_{\b_{k-1}} (x_k,y_{k})-\nabla_x \L_{\b_{k-1}} (x_k,y_{k-1}) \|.
\label{eq:cvg metric part 1 brk down}
\end{align}
The first term on the right-hand side above is bounded by $\epsilon_k$, by Step 2 of Algorithm~\ref{Algo:2}.
For the second term on the right-hand side of \eqref{eq:cvg metric part 1 brk down}, we write that
\begin{align}
& \| \nabla_x \L_{\b_{k-1}} (x_k,y_{k})-\nabla_x \L_{\b_{k-1}} (x_k,y_{k-1}) \| \nonumber\\
& = \| DA(x_k)^\top (y_k - y_{k-1}) \|
\qquad \text{(see \eqref{eq:Lagrangian})}
\nonumber\\
& \le \lambda'_A \|y_k- y_{k-1}\|
\qquad \text{(see \eqref{eq:defn_restricted_lipsichtz})} \nonumber\\
& = \lambda'_A \s_k \|A (x_k) \|
\qquad \text{(see Step 5 of Algorithm \ref{Algo:2})} \nonumber\\
& \le \frac{2\lambda'_A \s_k }{\nu \b_{k-1} }( \lambda'_f+ \lambda'_Ay_{\max}) .
\qquad \text{(see \eqref{eq:cvg metric part 2})}
\label{eq:part_1_2}
\end{align}
By combining (\ref{eq:cvg metric part 1 brk down},\ref{eq:part_1_2}), we find that
\begin{align}
& \dist( -\nabla_x \L_{\b_{k-1}} (x_k,y_{k}), \partial g(x_{k}) ) \nonumber\\
& \le \frac{2\lambda'_A \s_k }{\nu \b_{k-1} }( \lambda'_f+ \lambda'_Ay_{\max}) + \epsilon_k.
\label{eq:cvg metric part 1}
\end{align}
By combining (\ref{eq:cvg metric part 1},\ref{eq:cvg metric part 2}), we find that
\begin{align}
& \dist^2( -\nabla_x \L_{\b_{k-1}}(x_k,y_k),\partial g(x_k)) + \| A(x_k)\|^2 \nonumber\\
& \le \left( \frac{2\lambda'_A \s_k }{\nu \b_{k-1} }( \lambda'_f+ \lambda'_Ay_{\max}) + \epsilon_k \right)^2 \nonumber\\
& \qquad + 4\left( \frac{ \lambda'_f + \lambda'_A y_{\max}}{\nu \b_{k-1} } \right)^2.
\end{align}
Applying the inequalities $(a+b)^2 \le 2a^2+2b^2$ and $\s_k\le \s_1$, we find that
\begin{align}
& \dist^2( -\nabla_x \L_{\b_{k-1}}(x_k,y_k),\partial g(x_k)) + \| A(x_k)\|^2 \nonumber\\
& \le \frac{8 {\lambda'_A}^2 \s_1^2 + 4}{ \nu^2\b_{k-1}^2} ( \lambda'_f+\lambda'_A y_{\max})^2 + 2\epsilon_k^2.
\end{align}
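In more detail, the squared term from two displays above is handled via $(a+b)^2 \le 2a^2+2b^2$ and $\s_k \le \s_1$:
\begin{align*}
\left( \frac{2\lambda'_A \s_k }{\nu \b_{k-1} }( \lambda'_f+ \lambda'_A y_{\max}) + \epsilon_k \right)^2
& \le \frac{8 {\lambda'_A}^2 \s_k^2 }{\nu^2 \b_{k-1}^2 }( \lambda'_f+ \lambda'_A y_{\max})^2 + 2\epsilon_k^2 \\
& \le \frac{8 {\lambda'_A}^2 \s_1^2 }{\nu^2 \b_{k-1}^2 }( \lambda'_f+ \lambda'_A y_{\max})^2 + 2\epsilon_k^2,
\end{align*}
which, together with the remaining term $4( \lambda'_f+ \lambda'_A y_{\max})^2/(\nu \b_{k-1})^2$, yields the constant $8 {\lambda'_A}^2 \s_1^2 + 4$ above.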
For the second part of the theorem, we use Weyl's inequality and Step 5 of Algorithm~\ref{Algo:2} to write
\begin{align}\label{eq:sec}
\lambda_{\text{min}} &(\nabla_{xx} \mathcal{L}_{\beta_{k-1}}(x_k, y_{k})) \geq \lambda_{\text{min}} (\nabla_{xx} \mathcal{L}_{\beta_{k-1}}(x_k, y_{k-1})) \notag \\&- \| \sum_{i=1}^m \nabla^2 A_i(x_k) \sigma_k A_i(x_k) \|.
\end{align}
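To justify \eqref{eq:sec}: under the (assumed) standard form of the augmented Lagrangian recalled earlier in this section, the two Hessians differ only through the dual variable, namely
\begin{align*}
& \nabla_{xx} \mathcal{L}_{\beta_{k-1}}(x_k, y_{k}) - \nabla_{xx} \mathcal{L}_{\beta_{k-1}}(x_k, y_{k-1}) \\
& \qquad = \sum_{i=1}^m (y_k - y_{k-1})_i \nabla^2 A_i(x_k)
= \sigma_k \sum_{i=1}^m A_i(x_k) \nabla^2 A_i(x_k),
\end{align*}
where the second identity uses the dual update $y_k = y_{k-1} + \s_k A(x_k)$ in Step 5, and \eqref{eq:sec} then follows from Weyl's inequality $\lambda_{\text{min}}(M+P) \ge \lambda_{\text{min}}(M) - \|P\|$.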
The first term on the right-hand side of \eqref{eq:sec} is bounded from below by $-\epsilon_{k-1}$, owing to Step 2 of Algorithm~\ref{Algo:2}; it remains to bound the second term there.
\begin{align*}
&\Big\| \sum_{i=1}^m \nabla^2 A_i(x_k) \sigma_k A_i(x_k) \Big\|
\le \sigma_k \max_{i} \| \nabla^2 A_i(x_k)\| \sum_{i=1}^m |A_i(x_k)| \\
&\le \sigma_k \sqrt{m} \max_{i} \| \nabla^2 A_i(x_k)\| \, \| A(x_k)\|
\le \sigma_k \sqrt{m} \, \lambda_A \, \frac{ 2\lambda'_f +2 \lambda'_A y_{\max} }{\nu \b_{k-1} },
\end{align*}
where the second inequality is the Cauchy--Schwarz bound $\sum_{i=1}^m |A_i(x_k)| \le \sqrt{m} \, \| A(x_k)\|$, and the last inequality is due to~\eqref{eq:cvg metric part 2} and~\eqref{eq:smoothness basic}.
Plugging this into~\eqref{eq:sec} gives
\begin{align*}
\lambda_{\text{min}} &(\nabla_{xx} \mathcal{L}_{\beta_{k-1}}(x_k, y_{k})) \\
&\geq -\epsilon_{k-1} - \sigma_k \sqrt{m} \, \lambda_A \, \frac{ 2\lambda'_f +2 \lambda'_A y_{\max} }{\nu \b_{k-1} }.
\end{align*}
This completes the proof of Theorem \ref{thm:main}.
