%!TEX root = ../main.tex
\section{Convergence Rate \label{sec:cvg rate}}
In this section, we detail the convergence rate of Algorithm~\ref{Algo:2} for finding first-order and second-order stationary points, along with the iteration complexity results.
All proofs are deferred to Appendix~\ref{sec:theory} for clarity.
Theorem~\ref{thm:main} below characterizes the convergence rate of Algorithm~\ref{Algo:2} for finding stationary points in terms of the number of outer iterations.\\
\begin{theorem}\textbf{\emph{(convergence rate)}}
\label{thm:main}
Suppose that $f$ and $A$ are smooth in the sense specified in (\ref{eq:smoothness basic}). For $\rho'>0$, let
\begin{align}
\lambda'_f = \max_{\|x\|\le \rho'} \|\nabla f(x)\|,
~~
\lambda'_A = \max_{\|x\| \le \rho'} \|DA(x)\|,
\label{eq:defn_restricted_lipsichtz}
\end{align}
be the (restricted) Lipschitz constants of $f$ and $A$, respectively.
For sufficiently large integers $k_0\le k_1$, consider the interval $K=[k_0:k_1]$, and let $\{x_k\}_{k\in K}$ be the output sequence of Algorithm~\ref{Algo:2} on the interval $K$. For $\nu>0$, assume that
\begin{align}
\nu \|A(x_k)\|
& \le \dist\left( -DA(x_k)^\top A(x_k) , \frac{\partial g(x_k)}{ \b_{k-1}} \right),
\label{eq:regularity}
\end{align}
for every $k\in K$. We consider two cases:
\begin{itemize}[leftmargin=*]
\item If a first-order solver is used in Step~2, then $x_k$ is an $(\epsilon_{k,f},\b_k)$ first-order stationary point of (\ref{prob:01}) with
%\begin{align}
%& \dist^2( -\nabla_x \L_{\b_{k-1}}(x_k,y_k),\partial g(x_k)) + \| A(x_k)\|^2 \nonumber\\
%& = \frac{1}{\b_{k-1}^2}\left( \frac{8 \lambda'_A\s_1^2 + 4}{ \nu^2} ( \lambda'_f+\lambda'_A y_{\max})^2 + 2\right) \nonumber \\
%&=: \frac{Q(f,g,A)}{\beta_{k-1}^2},
%\end{align}
\begin{align}
%\epsilon_{k,f} & = \frac{1}{\beta_{k-1}} \sqrt{ \frac{8 \lambda'_A\s_1^2 + 4}{ \nu^2} ( \lambda'_f+\lambda'_A y_{\max})^2 + 2 } \nonumber \\
%&=: \frac{Q(f,g,A,\s_1)}{\beta_{k-1}},
\epsilon_{k,f} & = \frac{1}{\beta_{k-1}} \left(\frac{2(\lambda'_f+\lambda'_A y_{\max}) (1+\lambda_A' \sigma_k)}{\nu}+1\right) \nonumber \\
&=: \frac{Q(f,g,A,\s_1)}{\beta_{k-1}},
\label{eq:stat_prec_first}
\end{align}
for every $k\in K$, where the expression for $y_{\max}=y_{\max}(x_1,y_0,\s_1)$ is given in (\ref{eq:dual growth}).
\item If a second-order solver is used in Step~2, then $x_k$ is an $(\epsilon_{k,f}, \epsilon_{k,s},\b_k)$ second-order stationary point of~(\ref{prob:01}) with $\epsilon_{k,f}$ specified above and with
\begin{align}
\epsilon_{k,s} &= \epsilon_{k-1} + \sigma_k \sqrt{m} \lambda_A \frac{ 2\lambda'_f +2 \lambda'_A y_{\max} }{\nu \b_{k-1}} \nonumber\\
&= \frac{\nu + 2\sigma_k \sqrt{m} \lambda_A (\lambda'_f + \lambda'_A y_{\max}) }{\nu \b_{k-1}} =: \frac{Q'}{\beta_{k-1}}.
\end{align}
\end{itemize}
\end{theorem}
%A few remarks are in order about Theorem~\ref{thm:main}.
%\textbf{Tradeoff.}
%Roughly speaking, Theorem~\ref{thm:main} states that the iterates of Algorithm~\ref{Algo:2} approach first-order stationarity for~\eqref{prob:01} at the rate of $1/{\b_k}$, where $\b_k$ is the penalty weight in AL in iteration $k$ as in~\eqref{eq:Lagrangian}. Note the inherent tradeoff here: For a faster sequence $\{\b_k\}_k$, the iterates approach stationarity faster but the computational cost of the inexact primal subroutine (Step 2) increases since a higher accuracy is required and Lipschitz constant of the unconstrained problem is also affected by $\beta_k$. In words, there is a tradeoff between the number of outer and inner loop iterations of Algorithm~\ref{Algo:2}. We highlight this tradeoff for a number of subroutines below.
Loosely speaking, Theorem~\ref{thm:main} states that Algorithm~\ref{Algo:2} converges to a first- or second-order stationary point of \eqref{prob:01} at the rate of $1/\b_k$.
A few remarks are in order.
\textbf{Regularity.} The key condition in Theorem~\ref{thm:main} is \eqref{eq:regularity} which, broadly speaking, controls the problem geometry.
As the penalty weight $\b_k$ grows, the primal solver in Step~2 places an increasing emphasis on reducing the feasibility gap and \eqref{eq:regularity} formalizes this intuition.
In contrast to most conditions in the nonconvex optimization literature, such as~\cite{flores2012complete}, our condition in~\eqref{eq:regularity} appears to be easier to verify, as we see in Section~\ref{sec:experiments}.
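As a simple illustration of how $\nu$ can be estimated (a toy case given here only for intuition, not part of the formal development), suppose that $g=0$ and that $A(x)=Bx$ is linear. Then $\partial g(x_k)=\{0\}$, $DA(x_k)=B$, and the right-hand side of \eqref{eq:regularity} satisfies
\begin{align}
\dist\left( -B^\top B x_k , \{0\} \right) = \|B^\top B x_k\| \ge \sigma_{\min,+}(B)\, \|Bx_k\| = \sigma_{\min,+}(B)\, \|A(x_k)\|,
\end{align}
where $\sigma_{\min,+}(B)$ denotes the smallest nonzero singular value of $B$ and we used $Bx_k\in\text{range}(B)$. In this illustrative case, \eqref{eq:regularity} therefore holds with $\nu=\sigma_{\min,+}(B)$.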
We now argue that such a condition is necessary for controlling the feasibility gap of~\eqref{prob:01} and ensuring the success of Algorithm~\ref{Algo:2}. Consider the case where $f=0$, $g=\delta_\mathcal{X}$ is the indicator function of a convex set $\mathcal{X}$, and $A$ is a linear operator. In this case, solving \eqref{prob:01} amounts to finding a point in $\mathcal{X}\cap \text{null}(A)$, where $\text{null}(A)=\{ x\in\RR^d: A(x) = 0 \}$ is the null space of $A$.
Here, Slater's condition requires that
\begin{equation}
\text{relint} (\mathcal{X}) \cap \text{null}(A)\neq \emptyset.
\end{equation}
In general, Slater's condition plays a key role in convex optimization as a sufficient condition for strong duality and, as a result, guarantees the success of a variety of primal-dual algorithms for linearly-constrained convex programs~\cite{bauschke2011convex}.
Intuitively, Slater's condition here rules out pathological cases by ensuring that the subspace $\text{null}(A)$ is not tangent to $\mathcal{X}$; see Figure~\ref{fig:convex_slater}. In such pathological cases, solving~\eqref{prob:01}, namely, finding a point in $\mathcal{X}\cap \text{null}(A)$, can be particularly difficult. For instance, the alternating projection algorithm (which iteratively projects onto $\mathcal{X}$ and $\text{null}(A)$) can converge arbitrarily slowly; see Figure~\ref{fig:convex_slater}.
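As a concrete toy instance of this pathology (for illustration only), take $d=2$ and
\begin{align}
\mathcal{X} = \left\{ x\in\RR^2 : (x_1-1)^2 + x_2^2 \le 1 \right\},
\qquad
\text{null}(A) = \left\{ x \in \RR^2 : x_1 = 0 \right\}.
\end{align}
Then $\mathcal{X}\cap \text{null}(A) = \{0\}$ while $\text{relint}(\mathcal{X}) \cap \text{null}(A) = \emptyset$: the line is tangent to the disk at the origin, Slater's condition fails, and alternating projections approach the unique feasible point only sublinearly.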
\begin{figure}
\begin{center}
\includegraphics[scale=.5]{Slater.pdf}
\end{center}
\caption{Solving \eqref{prob:01} can be particularly difficult, even when \eqref{prob:01} is a convex program. We depict a pathological geometry in which Slater's condition does not hold. See the first remark after Theorem~\ref{thm:main} for more details. \label{fig:convex_slater}}
\end{figure}
\textbf{Computational complexity.} Theorem~\ref{thm:main} allows us to specify the number of iterations that Algorithm~\ref{Algo:2} requires to reach a near-stationary point of Program~\eqref{prob:01} with a prescribed precision and, in particular, specifies the number of calls made to the solver in Step~2. In this sense, Theorem~\ref{thm:main} does not fully capture the computational complexity of Algorithm~\ref{Algo:2}, as it does not take into account the computational cost of the solver in Step~2.
To better understand the total complexity of Algorithm~\ref{Algo:2}, we consider two scenarios in what follows. In the first scenario, we take the solver in Step~2 to be the Accelerated Proximal Gradient Method (APGM), a well-known first-order algorithm~\cite{ghadimi2016accelerated}. In the second scenario, we use the second-order trust region method developed in~\cite{cartis2012complexity}.
\subsection{First-Order Optimality}
Let us first consider the first-order optimality case where the solver in Step~2 is APGM~\cite{ghadimi2016accelerated}.
APGM makes use of $\nabla_x \mathcal{L}_{\beta}(x,y)$, $\text{prox}_g$, and the classical Nesterov acceleration step~\cite{nesterov1983method} to obtain first-order stationarity guarantees for solving~\eqref{e:exac}.
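Schematically, and deferring the precise step size and momentum schedule to~\cite{ghadimi2016accelerated}, one inner iteration of such an accelerated proximal scheme applied to \eqref{e:exac} takes the form
\begin{align}
z_{t} = x_{t} + \tau_t (x_{t} - x_{t-1}),
\qquad
x_{t+1} = \text{prox}_{\gamma_t g}\left( z_{t} - \gamma_t \nabla_x \mathcal{L}_{\beta_k}(z_{t}, y_k) \right),
\end{align}
where $t$ indexes the inner iterations, $\gamma_t$ is a step size on the order of the inverse Lipschitz constant of $\nabla_x \mathcal{L}_{\beta_k}(\cdot, y_k)$, and $\tau_t\in[0,1)$ is an extrapolation weight; when $g=\delta_\mathcal{X}$, as assumed below, the proximal map reduces to the projection onto $\mathcal{X}$.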
%First, we characterize the iteration complexity of Algorithm~\ref{Algo:2} for finding a first-order stationary point of~\eqref{prob:01}.
%We propose to use the standard accelerated proximal method (APGM), guarantees of which are analyzed in~\cite{ghadimi2016accelerated} for nonconvex problems of the form~\eqref{e:exac}.
Suppose that $g=\delta_\mathcal{X}$ is the indicator function of a bounded convex set $\mathcal{X}\subset \RR^d$ and let
\begin{align}
x_{\max}= \max_{x\in \mathcal{X}}\|x\|,
\end{align}
be the radius of a ball centered at the origin that includes $\mathcal{X}$.
Then, adapting the results in~\cite{ghadimi2016accelerated} to our setup, APGM reaches $x_{k}$ in Step 2 of Algorithm~\ref{Algo:2} after
\begin{equation}
\mathcal{O}\left ( \frac{\lambda_{\beta_{k}}^2 x_{\max}^2 }{\epsilon_{k+1}} \right)
\label{eq:iter_1storder}
\end{equation}
(inner) iterations, where $\lambda_{\beta_{k}}$ denotes the Lipschitz constant of $\nabla_x{\mathcal{L}_{\beta_{k}}(x, y)}$, bounded in~\eqref{eq:smoothness of Lagrangian}. We note that for simplicity, we use a looser bound in \eqref{eq:iter_1storder} than~\cite{ghadimi2016accelerated}.
Using \eqref{eq:iter_1storder}, we derive the following corollary, describing the total computational complexity of Algorithm~\ref{Algo:2} in terms of the calls to the first-order oracle in APGM.
\begin{corollary}\label{cor:first}
For $b>1$, let $\beta_k =b^k $ for every $k$. If we use APGM from~\cite{ghadimi2016accelerated} for Step~2 of Algorithm~\ref{Algo:2}, the algorithm finds an $(\epsilon_f,\b_k)$ first-order stationary point, see (\ref{eq:inclu3}),
after $T$ calls to the first-order oracle, where
%
\begin{equation}
T = \mathcal{O}\left( \frac{Q^3 x_{\max}^2}{\epsilon^{3}}\log_b{\left( \frac{Q}{\epsilon} \right)} \right) = \tilde{\mathcal{O}}\left( \frac{Q^{3} x_{\max}^2}{\epsilon^{3}} \right).
\end{equation}
\end{corollary}
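The following back-of-the-envelope accounting (a sketch only, with all constants suppressed) indicates where the exponents in Corollary~\ref{cor:first} come from; it assumes $\lambda_{\beta_k} = \mathcal{O}(\beta_k)$, in line with \eqref{eq:smoothness of Lagrangian}, and inner accuracies $\epsilon_k$ on the order of $1/\beta_k$. By \eqref{eq:stat_prec_first}, reaching the target accuracy $\epsilon_f$ requires $\beta_K \gtrsim Q/\epsilon_f$, hence roughly $K \approx \log_b(Q/\epsilon_f)$ outer iterations when $\beta_k = b^k$. Bounding each inner solve in \eqref{eq:iter_1storder} by the most expensive one then gives
\begin{align}
T &\le \sum_{k=1}^{K} \mathcal{O}\left( \frac{\lambda_{\beta_k}^2 x_{\max}^2}{\epsilon_{k+1}} \right)
= \sum_{k=1}^{K} \mathcal{O}\left( \beta_k^3 x_{\max}^2 \right) \nonumber\\
&\le K\, \mathcal{O}\left( \beta_K^3 x_{\max}^2 \right)
= \mathcal{O}\left( \frac{Q^3 x_{\max}^2}{\epsilon_f^{3}} \log_b\!\left( \frac{Q}{\epsilon_f} \right) \right),
\end{align}
which matches Corollary~\ref{cor:first} up to constants.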
For Algorithm~\ref{Algo:2} to reach a near-stationary point with an accuracy of $\epsilon_f$ in the sense of \eqref{eq:inclu3} and with the lowest computational cost, we therefore need to perform only one iteration of Algorithm~\ref{Algo:2}, with $\b_1$ specified as a function of $\epsilon_f$ by \eqref{eq:stat_prec_first} in Theorem~\ref{thm:main}. In general, however, the constants in \eqref{eq:stat_prec_first} are unknown and this approach is intractable. Instead, the homotopy approach taken by Algorithm~\ref{Algo:2} achieves the desired accuracy by gradually increasing the penalty weight. This homotopy approach increases the computational cost of Algorithm~\ref{Algo:2} only by a factor logarithmic in $\epsilon_f$, as detailed in the proof of Corollary~\ref{cor:first}.
\subsection{Second-Order Optimality}
Let us now consider the second-order optimality case where the solver in Step~2 is the trust region method developed in~\cite{cartis2012complexity}.
The trust region method minimizes a quadratic approximation of the function within a dynamically updated trust-region radius.
The second-order trust region method that we consider in this section makes use of the Hessian (or an approximation thereof) of the augmented Lagrangian, in addition to first-order oracles.
As shown in~\cite{nouiehed2018convergence}, finding approximate second-order stationary points of convex-constrained problems is in general NP-hard. For this reason, we focus in this section on the special case of~\eqref{prob:01} with $g=0$.
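Schematically, with $g=0$, each inner iteration of this method builds a quadratic model of the augmented Lagrangian around the current inner iterate $x_t$ and approximately solves the trust-region subproblem
\begin{align}
\min_{\|s\|\le \Delta_t} ~ \left\langle \nabla_x \mathcal{L}_{\beta_k}(x_t, y_k), s \right\rangle + \frac{1}{2} \left\langle s, \nabla_{xx} \mathcal{L}_{\beta_k}(x_t, y_k)\, s \right\rangle,
\end{align}
where $\Delta_t$ is the trust-region radius; the step is accepted and $\Delta_t$ enlarged or shrunk depending on how well the model predicts the actual decrease. The precise acceptance rules and the resulting guarantees are those of~\cite{cartis2012complexity}.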
%We first give a theorem to show the convergence rate of the algorithm for second order stationarity: \\
%\begin{corollary}
%\label{thm:main_second}
%Under the assumptions of Theorem~\ref{thm:main}, the output of Algorithm~\ref{Algo:2} satisfies
%\begin{align}
%%& \dist^2( -\nabla_x \L_{\b_{k-1}}(x_k,y_k),\partial g(x_k)) + \| A(x_k)\|^2 \nonumber\\
%%& = \frac{1}{\b_{k-1}^2}\left( \frac{8 \lambda'_A\s_k^2 + 4}{ \nu^2} ( \lambda'_f+\lambda'_A y_{\max})^2 + 2\right) \nonumber \\
%%&=: \frac{Q}{\beta_{k-1}^2}.
%\lambda_{\text{min}}(\nabla _{xx}\mathcal{L}_{\beta_{k-1}}(x_k, y_k)) \geq - \frac{C}{\beta_{k-1}} - \epsilon_{k-1}.
%\end{align}
%\end{corollary}
%
Let us compute the total computational complexity of Algorithm~\ref{Algo:2} with the trust region method in Step~2, in terms of the number of calls made to the second-order oracle. By adapting the result in~\cite{cartis2012complexity} to our setup, we find that the number of (inner) iterations required in Step~2 of Algorithm~\ref{Algo:2} to produce $x_{k+1}$ is
%
\begin{equation}
\mathcal{O}\left ( \frac{\lambda_{\beta_{k}, H}^2 (\mathcal{L}_{\beta_{k}}(x_1, y) - \min_{x}\mathcal{L}_{\beta_k}(x, y))}{\epsilon_k^3} \right),
\label{eq:sec_inn_comp}
\end{equation}
%
where $\lambda_{\beta_k, H}$ is the Lipschitz constant of the Hessian of the augmented Lagrangian, which is of the order of $\beta_k$, as can be proved in a similar fashion to Lemma~\ref{lem:smoothness}, and $x_1$ is the initial iterate of the given outer loop.
In~\cite{cartis2012complexity}, the term $\mathcal{L}_{\beta_k}(x_1, y) - \min_{x}\mathcal{L}_{\beta_k}(x, y)$ is bounded by a constant independent of $\epsilon_k$.
We assume a uniform bound on this quantity for all $\beta_k$, instead of for a single value of $\beta_k$ as in~\cite{cartis2012complexity}. Using \eqref{eq:sec_inn_comp} and Theorem~\ref{thm:main}, we arrive at the following result; the proof is very similar to that of Corollary~\ref{cor:first} and hence omitted here.
%
\begin{corollary}\label{cor:second}
For $b>1$, let $\beta_k =b^k $ for every $k$.
We assume that
\begin{equation}
\mathcal{L}_{\beta}(x_1, y) - \min_{x}\mathcal{L}_{\beta}(x, y) \leq L_{u},~~ \forall \beta.
\end{equation}
If we use the trust region method from~\cite{cartis2012complexity} for Step~2 of Algorithm~\ref{Algo:2}, the algorithm finds an $\epsilon$-second-order stationary point of~(\ref{prob:01}) in $T$ calls to the second-order oracle where
%
\begin{equation}
T \leq \mathcal{O}\left( \frac{L_u Q'^{5}}{\epsilon^{5}} \log_b{\left( \frac{Q'}{\epsilon} \right)} \right) = \widetilde{\mathcal{O}}\left( \frac{L_u Q'^{5}}{\epsilon^{5}} \right).
\end{equation}
\end{corollary}
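A similar back-of-the-envelope accounting (again a sketch, with constants suppressed) explains the exponents in Corollary~\ref{cor:second}; it assumes $\lambda_{\beta_k, H} = \mathcal{O}(\beta_k)$ and inner accuracies $\epsilon_k$ on the order of $1/\beta_k$. Reaching accuracy $\epsilon$ requires $\beta_K \gtrsim Q'/\epsilon$, hence roughly $K \approx \log_b(Q'/\epsilon)$ outer iterations, and bounding each inner solve in \eqref{eq:sec_inn_comp} by the most expensive one gives
\begin{align}
T &\le \sum_{k=1}^{K} \mathcal{O}\left( \frac{\lambda_{\beta_k, H}^2\, L_u}{\epsilon_k^3} \right)
= \sum_{k=1}^{K} \mathcal{O}\left( L_u \beta_k^{5} \right) \nonumber\\
&\le K\, \mathcal{O}\left( L_u \beta_K^{5} \right)
= \mathcal{O}\left( \frac{L_u Q'^{5}}{\epsilon^{5}} \log_b\!\left( \frac{Q'}{\epsilon} \right) \right),
\end{align}
in agreement with Corollary~\ref{cor:second}.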
%
Before closing this section, we note that the remark after Corollary~\ref{cor:first} applies here as well.
%
