Page MenuHomec4science

guarantees.tex
No OneTemporary

File Metadata

Created
Wed, Sep 4, 18:13

guarantees.tex

%!TEX root = ../main.tex
\section{Convergence Rate \label{sec:cvg rate}}
In this section, we derive the total iteration complexity of Algorithm~\ref{Algo:2} for finding first and second-order stationary points of problem \eqref{prob:01}.
All the proofs are deferred to Appendix~\ref{sec:theory}.
Theorem~\ref{thm:main} characterizes the convergence rate of Algorithm~\ref{Algo:2} for finding stationary points in terms of the number of outer iterations.\\
%<<<<<<< HEAD
%{\color{red}{Ahmet: Maybe instead of sufficiently large k0 we can say k0 such that beta is bigger than 1, it makes the proof go thru, it would be a bit more explanatory}}
%=======
%>>>>>>> b39eea61625954bc9d3858590b7b37a182a6af4f
\begin{theorem}\textbf{\emph{(convergence rate)}}
\label{thm:main} For integers $2 \le k_0\le k_1$, consider the interval $K=[k_0:k_1]$, and let $\{x_k\}_{k\in K}$ be the output sequence of Algorithm~\ref{Algo:2} on the interval $K$.\footnote{The choice of $k_1 = \infty$ is valid here too.} Let also $\rho:=\sup_{k\in [K]} \|x_k\|$.\footnote{If necessary, to ensure that $\rho<\infty$, one can add a small factor of $\|x\|^2$ to $\mathcal{L}_{\b}$ in \eqref{eq:Lagrangian}. Then it is easy to verify that the iterates of Algorithm \ref{Algo:2} remain bounded, provided that the penalty weight $\b$ is large enough, $\sup_x \|\nabla f(x)\|/\|x\|< \infty$, $\sup_x \|A(x)\| < \infty$, and $\sup_x \|D A(x)\| <\infty$. } Suppose that $f$ and $A$ satisfy (\ref{eq:smoothness basic}) and let
\begin{align}
\lambda'_f = \max_{\|x\|\le \rho} \|\nabla f(x)\|,\qquad
\lambda'_A = \max_{\|x\| \le \rho} \|DA(x)\|,
\label{eq:defn_restricted_lipsichtz}
\end{align}
be the (restricted) Lipschitz constants of $f$ and $A$, respectively.
%Suppose that $\b_1 \ge \b_0$, where the expression for $\b_0(f,g,A,\s_1)$ is given in (\ref{eq:beta-0}).
With $\nu>0$, assume that
\begin{align}
\nu \|A(x_k)\|
& \le \dist\left( -DA(x_k)^\top A(x_k) , \frac{\partial g(x_k)}{ \b_{k-1}} \right),
\label{eq:regularity}
\end{align}
for every $k\in K$. We consider two cases:
%{\color{red} Ahmet: I removed the squares and showed the rate on dist + feasibility, since it is the measure for the stationarity, using squares confuses complexity analysis...}
\begin{itemize}[leftmargin=*]
\item If a first-order solver is used in Step~2, then $x_k$ is an $(\epsilon_{k,f},\b_k)$ first-order stationary point of (\ref{prob:01}) with
\begin{align}
\epsilon_{k,f} & = \frac{1}{\beta_{k-1}} \left(\frac{2(\lambda'_f+\lambda'_A y_{\max}) (1+\lambda_A' \sigma_k)}{\nu}+1\right) =: \frac{Q(f,g,A,\s_1)}{\beta_{k-1}},
\label{eq:stat_prec_first}
\end{align}
for every $k\in K$, where $y_{\max}(x_1,y_0,\s_1)$ is specified in (\ref{eq:dual growth}) due to the limited space.
\item If a second-order solver is used in Step~2, then $x_k$ is an $(\epsilon_{k,f}, \epsilon_{k,s},\b_k)$ second-order stationary point of~(\ref{prob:01}) with $\epsilon_{k,s}$ specified above and with
\begin{align}
\epsilon_{k,s} &= \epsilon_{k-1} + \sigma_k \sqrt{m} \lambda_A \frac{ 2\lambda'_f +2 \lambda'_A y_{\max} }{\nu \b_{k-1}} = \frac{\nu + \sigma_k \sqrt{m} \lambda_A 2\lambda'_f +2 \lambda'_A y_{\max} }{\nu \b_{k-1}} =: \frac{Q'(f,g,A,\s_1)}{\beta_{k-1}}.
\end{align}
\end{itemize}
\end{theorem}
%A few remarks are in order about Theorem~\ref{thm:main}.
%\textbf{Tradeoff.}
%Roughly speaking, Theorem~\ref{thm:main} states that the iterates of Algorithm~\ref{Algo:2} approach first-order stationarity for~\eqref{prob:01} at the rate of $1/{\b_k}$, where $\b_k$ is the penalty weight in AL in iteration $k$ as in~\eqref{eq:Lagrangian}. Note the inherent tradeoff here: For a faster sequence $\{\b_k\}_k$, the iterates approach stationarity faster but the computational cost of the inexact primal subroutine (Step 2) increases since a higher accuracy is required and Lipschitz constant of the unconstrained problem is also affected by $\beta_k$. In words, there is a tradeoff between the number of outer and inner loop iterations of Algorithm~\ref{Algo:2}. We highlight this tradeoff for a number of subroutines below.
Loosely speaking, Theorem~\ref{thm:main} states that Algorithm~\ref{Algo:2} converges to a (first- or second-) order stationary point of \eqref{prob:01} at the rate of $1/\b_k$, further specified in Sections \ref{sec:first-o-opt} and \ref{sec:second-o-opt}. A few remarks are in order about Theorem \ref{thm:main}.
\textbf{Regularity.} The key geometric condition in Theorem~\ref{thm:main} is \eqref{eq:regularity} which, broadly speaking, ensures that the primal updates of Algorithm \ref{Algo:2} reduce the feasibility gap as the penalty weight $\b_k$ grows. We will verify this condition for several examples in Section \ref{sec:experiments}.
This condition in \eqref{eq:regularity} is closely related to those in the existing literature. In the special case where $g=0$ in~\eqref{prob:01}, it is easy to verify that \eqref{eq:regularity} reduces to the Polyak-Lojasiewicz (PL) condition for minimizing $\|A(x)\|^2$~\cite{karimi2016linear}. PL condition itself is a special case of Kurdyka-Lojasiewicz with $\theta = 1/2$, see \cite[Definition 1.1]{xu2017globally}. When $g=0$, it is also easy to see that \eqref{eq:regularity} is weaker than the Mangasarian-Fromovitz (MF) condition in nonlinear optimization \cite[Assumption 1]{bolte2018nonconvex}.
{Moreover, \edita{when $g$ is the indicator on a convex set,} \eqref{eq:regularity} is a consequence of the \textit{basic constraint qualification} in \cite{rockafellar1993lagrange}, which itself generalizes the MF condition to the case when $g$ is an indicator function of a convex set.}
%\notea{I'm not sure if the claim about basic constraint qualification is true because our condition should locally hold rather globally. Could you add the exact equation number in [55] and double check this? Also consider double checking other claims in this paragraph.}
Loosely speaking, we may think of \eqref{eq:regularity} as a local condition, which should hold within a neighborhood of the constraint set $\{x:A(x)=0\}$ rather than everywhere in $\mathbb{R}^d$. There is a constant complexity algorithm in \cite{bolte2018nonconvex} to reach this so-called ``information zone'', which supplements Theorem \ref{thm:main}.
Lastly, in contrast to most conditions in the nonconvex optimization literature, such as~\cite{flores2012complete}, the condition in~\eqref{eq:regularity} appears to be easier to verify, as we see in Section~\ref{sec:experiments}.
%\edita{\textbf{AE: Fatih, I think you had added the following two references to our response for icml. Could you discuss their relevance here? [2] Rockafellar, Lagrange Multipliers and Optimality, 1993
%[3] Bertsekas, On penalty and multiplier methods for constrained minimization. 1996}}
%\edita{\textbf{AE: the spars review talks about the "Pong-Li" work. Fatih, do you know what is that?}}
%
%\editf{(mfs:) I do not think this work is relevant to our condition but the template seems similar. We can mention in the related works instead.}
%\notea{Ok. Yes, consider adding it to the related work.}
%We now argue that such a condition is necessary for controlling the feasibility gap of~\eqref{prob:01} and ensuring the success of Algorithm~\ref{Algo:2}.
%To provide further insight about \eqref{eq:regularity}, consider the convex case where $f=0$ and $g=\delta_\mathcal{X}$, where $\mathcal{X}$ is a convex set and $A$ is a linear operator. In this case, solving \eqref{prob:01} finds a point in the convex set $\mathcal{X}\cap \text{null}(A)$, where the subspace $\text{null}(A)=\{ x\in\mathbb{R}^d: A(x) = 0 \} \subset \RR^d$ is the null space of $A$.
%Here, the Slater's condition~\cite{boyd2004convex} requires that
%\begin{equation}
%\text{relint} (\mathcal{X}) \cap \text{null}(A)\neq \emptyset.
%\end{equation}
%%<<<<<<< HEAD
%Recall that the Slater's condition plays a key role in convex optimization as a sufficient condition for strong duality and, as a result, guarantees the success of a variety of primal-dual algorithms for linearly-constrained convex programs~\cite{bauschke2011convex}.
%Intuitively, the Slater's condition achieves this by removing pathological cases and ensuring that the subspace $\text{null}(A)$ is not tangent to $\mathcal{X}$, see Figure~\ref{fig:convex_slater}. In such pathological cases, solving~\eqref{prob:01} and finding a point in $\mathcal{X}\cap \text{null}(A)$ can be particularly slow. For instance, in Figure \ref{fig:convex_slater}, the alternating projection algorithm (which iteratively projects onto $\mathcal{X}$ and $\text{null}(A)$) converges to the intersection point at the suboptimal rate of $1/\sqrt{k}$ rather than the standard $1/k$ rate.
%{\color{red} Ahmet: a reference would be cool here}
%=======
%In general, the Slater's condition plays a key role in convex optimization as a sufficient condition for strong duality and, as a result, guarantees the success of a variety of primal-dual algorithms for linearly-constrained convex programs~\cite{bauschke2011convex}. Intuitively, the Slater's condition here removes any pathological cases by ensuring that the subspace $\text{null}(A)$ is not tangent to $\mathcal{X}$, see Figure~\ref{fig:convex_slater}. In such pathological cases, solving~\eqref{prob:01}, namely, finding a point in $\mathcal{X}\cap \text{null}(A)$, can be particularly difficult. For instance, the alternating projection algorithm (which iteratively projects onto $\mathcal{X}$ and $\text{null}(A)$) has arbitrarily slow convergence, see Figure~\ref{fig:convex_slater}.
%>>>>>>> 795311da274f2d8ab56d215c2fd481073e616732
%\begin{figure}
%\begin{center}
%\includegraphics[scale=.5]{Slater.pdf}
%\end{center}
%\caption{In pathological cases where the Slater's condition does not apply, solving \eqref{prob:01} can be particularly slow, even when \eqref{prob:01} is a convex program. See the first remark after Theorem~\ref{thm:main} for more details. \label{fig:convex_slater}}
%\end{figure}
\paragraph{Penalty method.}
A classical algorithm to solve \eqref{prob:01} is the penalty method, which is characterized by the absence of the dual variable ($y=0$) in \eqref{eq:Lagrangian}. Indeed, ALM can be interpreted as an adaptive penalty or smoothing method with a variable center determined by the dual variable. It is worth noting that, with the same proof technique, one can establish the same convergence rate of Theorem \ref{thm:main} for the penalty method. However, while both methods have the same convergence rate in theory, iALM outperforms the penalty method in practice by virtue of its variable center and has been excluded from this presentation for this reason.
\textbf{Computational complexity.} Theorem~\ref{thm:main} specifies the number of (outer) iterations that Algorithm~\ref{Algo:2} requires to reach a near-stationary point of problem~\eqref{prob:01} with a prescribed precision and, in particular, specifies the number of calls made to the solver in Step~2. In this sense, Theorem~\ref{thm:main} does not fully capture the computational complexity of Algorithm~\ref{Algo:2}, as it does not take into account the computational cost of the solver in Step~2.
To better understand the total iteration complexity of Algorithm~\ref{Algo:2}, we consider two scenarios in the following. In the first scenario, we take the solver in Step~2 to be the Accelerated Proximal Gradient Method (APGM), a well-known first-order algorithm~\cite{ghadimi2016accelerated}. In the second scenario, we will use the second-order trust region method developed in~\cite{cartis2012complexity}.
\subsection{First-Order Optimality \label{sec:first-o-opt}}
%\textbf{AE: we go back and forth between "subroutine" and "solver". for consistency, I'm just using "solver" everywhere.}
Let us first consider the case where the solver in Step~2 is is the first-order algorithm APGM, described in detail in ~\cite{ghadimi2016accelerated}. At a high level,
APGM makes use of $\nabla_x \mathcal{L}_{\beta}(x,y)$ in \eqref{eq:Lagrangian}, the proximal operator $\text{prox}_g$, and the classical Nesterov acceleration~\cite{nesterov1983method} to reach first-order stationarity for the subproblem in~\eqref{e:exac}.
% \textbf{AE: Ahmet to give a brief description of APGM here. }
%First, we characterize the iteration complexity of Algorithm~\ref{Algo:2} for finding a first-order stationary point of~\eqref{prob:01}.
%We propose to use the standard accelerated proximal method (APGM), guarantees of which are analyzed in~\cite{ghadimi2016accelerated} for nonconvex problems of the form~\eqref{e:exac}.
Suppose that $g=\delta_\mathcal{X}$ is the indicator function on a bounded convex set $\mathcal{X}\subset \RR^d$ and let
\begin{align}
\rho= \max_{x\in \mathcal{X}}\|x\|,
\end{align}
be the radius of a ball centered at the origin that includes $\mathcal{X}$.
Then, adapting the results in~\cite{ghadimi2016accelerated} to our setup, APGM reaches $x_{k}$ in Step 2 of Algorithm~\ref{Algo:2} after
\begin{equation}
\mathcal{O}\left ( \frac{\lambda_{\beta_{k}}^2 \rho^{2} }{\epsilon_{k+1}} \right)
\label{eq:iter_1storder}
\end{equation}
(inner) iterations, where $\lambda_{\beta_{k}}$ denotes the Lipschitz constant of $\nabla_x{\mathcal{L}_{\beta_{k}}(x, y)}$, bounded in~\eqref{eq:smoothness of Lagrangian}. For the clarity of the presentation, we have used a looser bound in \eqref{eq:iter_1storder} compared to~\cite{ghadimi2016accelerated}.
Using \eqref{eq:iter_1storder}, we derive the following corollary, describing the total iteration complexity of Algorithm~\ref{Algo:2} in terms of the number calls made to the first-order oracle in APGM. %\textbf{AE: we haven't talked about oracle before.}
\begin{corollary}\label{cor:first}
For $b>1$, let $\beta_k =b^k $ for every $k$. If we use APGM from~\cite{ghadimi2016accelerated} for Step~2 of Algorithm~\ref{Algo:2}, the algorithm finds an $(\epsilon_f,\b_k)$ first-order stationary point,
after $T$ calls to the first-order oracle, where
%
\begin{equation}
T = \mathcal{O}\left( \frac{Q^3 \rho^2}{\epsilon^{3}}\log_b{\left( \frac{Q}{\epsilon} \right)} \right) = \tilde{\mathcal{O}}\left( \frac{Q^{3} \rho^2}{\epsilon^{3}} \right).
\end{equation}
\end{corollary}
For Algorithm~\ref{Algo:2} to reach a near-stationary point with an accuracy of $\epsilon_f$ in the sense of \eqref{eq:inclu3} and with the lowest computational cost, we therefore need to perform only one iteration of Algorithm~\ref{Algo:2}, with $\b_1$ specified as a function of $\epsilon_f$ by \eqref{eq:stat_prec_first} in Theorem~\ref{thm:main}. In general, however, the constants in \eqref{eq:stat_prec_first} are unknown and this approach is thus not feasible. Instead, the homotopy approach taken by Algorithm~\ref{Algo:2} ensures achieving the desired accuracy by gradually increasing the penalty weight.\footnote{In this context, homotopy loosely corresponds to the gradual enforcement of the constraints by increasing the penalty weight.} This homotopy approach increases the computational cost of Algorithm~\ref{Algo:2} only by a factor logarithmic in the $\epsilon_f$, as detailed in the proof of Corollary~\ref{cor:first}.
%\textbf{AE: if we find some time, i'll add a couple of lines for how 1st order opt implies 2nd order opt for smooth constraints.}
\subsection{Second-Order Optimality \label{sec:second-o-opt}}
%{\color{red}{Ahmet (note to myself): not sure of the constants of trust-region, check again}} \\
Let us now consider the second-order optimality case where the solver in Step~2 is the the trust region method developed in~\cite{cartis2012complexity}.
Trust region method minimizes a quadratic approximation of the function within a dynamically updated trust-region radius.
Second-order trust region method that we consider in this section makes use of Hessian (or an approximation of Hessian) of the augmented Lagrangian in addition to first order oracles.
As shown in~\cite{nouiehed2018convergence}, finding approximate second-order stationary points of convex-constrained problems is in general NP-hard. For this reason, we focus in this section on the special case of~\eqref{prob:01} with $g=0$.
%We first give a theorem to show the convergence rate of the algorithm for second order stationarity: \\
%{\color{red}{Ahmet: I think that it is possible to remove sufficiently large k0 assumption here. }} \textbf{AE: not worth it really}
%\begin{corollary}
%\label{thm:main_second}
%Under the assumptions of Theorem~\ref{thm:main}, the output of Algorithm~\ref{Algo:2} satisfies
%\begin{align}
%%& \dist^2( -\nabla_x \L_{\b_{k-1}}(x_k,y_k),\partial g(x_k)) + \| A(x_k)\|^2 \nonumber\\
%%& = \frac{1}{\b_{k-1}^2}\left( \frac{8 \lambda'_A\s_k^2 + 4}{ \nu^2} ( \lambda'_f+\lambda'_A y_{\max})^2 + 2\right) \nonumber \\
%%&=: \frac{Q}{\beta_{k-1}^2}.
%\lambda_{\text{min}}(\nabla _{xx}\mathcal{L}_{\beta_{k-1}}(x_k, y_k)) \geq - \frac{C}{\beta_{k-1}} - \epsilon_{k-1}.
%\end{align}
%\end{corollary}
%
%We consider \textbf{AE: Ahmet to add a brief description of the algorithm.}
Let us compute the total computational complexity of Algorithm~\ref{Algo:2} with the trust region method in Step~2, in terms of the number of calls made to the second-order oracle. By adapting the result in~\cite{cartis2012complexity} to our setup, we find that the number of (inner) iterations required in Step~2 of Algorithm~\ref{Algo:2} to produce $x_{k+1}$ is
%
\begin{equation}
\mathcal{O}\left ( \frac{\lambda_{\beta_{k}, H}^2 (\mathcal{L}_{\beta_{k}}(x_1, y) - \min_{x}\mathcal{L}_{\beta_k}(x, y))}{\epsilon_k^3} \right),
\label{eq:sec_inn_comp}
\end{equation}
%
%<<<<<<< HEAD
where $\lambda_{\beta, H}$ is the Lipschitz constant of the Hessian of the augmented Lagrangian, which is of the order of $\beta$, as can be proven similar to Lemma~\ref{lem:smoothness} and $x_1$ is the initial iterate of the given outer loop.
In~\cite{cartis2012complexity}, the term $\mathcal{L}_{\beta}(x_1, y) - \min_{x}\mathcal{L}_{\beta}(x, y)$ is bounded by a constant independent of $\epsilon$.
We assume a uniform bound for this quantity for every $ \beta_k$, instead of for one value of $\beta_k$ as in~\cite{cartis2012complexity}. Using \eqref{eq:sec_inn_comp} and Theorem~\ref{thm:main}, we arrive at the following:
%=======
%where $\lambda_{\beta_k, H}$ is the Lipschitz constant of the Hessian of the augmented Lagrangian, which is of the order of $\beta_k$, as can be proven similar to Lemma~\ref{lem:smoothness} and $x_1$ is the initial iterate of the given outer loop.
%In~\cite{cartis2012complexity}, the term $\mathcal{L}_{\beta_k}(x_1, y) - \min_{x}\mathcal{L}_{\beta_k}(x, y)$ is bounded by a constant independent of $\epsilon_k$.
%We assume a uniform bound for this quantity for all $\b$, instead of for one value of $\beta_k$ as in~\cite{cartis2012complexity}. Using \eqref{eq:sec_inn_comp} and Theorem~\ref{thm:main}, we arrive at the following result. The proof is very similar to that of Corollary~\ref{cor:first} and hence omitted here.
%>>>>>>> 795311da274f2d8ab56d215c2fd481073e616732
%
\begin{corollary}\label{cor:second}
For $b>1$, let $\beta_k =b^k $ for every $k$.
We assume that
\begin{equation}
\mathcal{L}_{\beta}(x_1, y) - \min_{x}\mathcal{L}_{\beta}(x, y) \leq L_{u},\qquad \forall \beta.
\end{equation}
If we use the trust region method from~\cite{cartis2012complexity} for Step~2 of Algorithm~\ref{Algo:2}, the algorithm finds an $\epsilon$-second-order stationary point of~(\ref{prob:01}) in $T$ calls to the second-order oracle where
%
\begin{equation}
T = \mathcal{O}\left( \frac{L_u Q'^{5}}{\epsilon^{5}} \log_b{\left( \frac{Q'}{\epsilon} \right)} \right) = \widetilde{\mathcal{O}}\left( \frac{L_u Q'^{5}}{\epsilon^{5}} \right).
\end{equation}
\end{corollary}
%
%\notea{I can't remember: what is $Q'$ and why isn't it defined?}
Before closing this section, we note that the remark after Corollary~\ref{cor:first} applies here as well.
%

Event Timeline