Page MenuHomec4science

guarantees.tex
No OneTemporary

File Metadata

Created
Sat, May 18, 12:54

guarantees.tex

%!TEX root = ../main.tex
\section{Convergence Rate}\label{sec:cvg rate}
This section presents the total iteration complexity of Algorithm~\ref{Algo:2} for finding first- and second-order stationary points of problem \eqref{eq:minmax}.
All the proofs are deferred to Appendix~\ref{sec:theory}.
Theorem~\ref{thm:main} characterizes the convergence rate of Algorithm~\ref{Algo:2} for finding stationary points in the number of outer iterations.
\begin{theorem}\textbf{\emph{(convergence rate)}}
\label{thm:main} For integers $2 \le k_0\le k_1$, consider the interval $K=[k_0:k_1]$, and let $\{x_k\}_{k\in K}$ be the output sequence of Algorithm~\ref{Algo:2} on the interval $K$.\footnote{The choice of $k_1 = \infty$ is valid here too.} Let also $\rho:=\sup_{k\in K} \|x_k\|$.\footnote{If necessary, to ensure that $\rho<\infty$, one can add a small factor of $\|x\|^2$ to $\mathcal{L}_{\b}$ in \eqref{eq:Lagrangian}. Then it is easy to verify that the iterates of Algorithm \ref{Algo:2} remain bounded, provided that the initial penalty weight $\beta_0$ is large enough, $\sup_x \|\nabla f(x)\|/\|x\|< \infty$, $\sup_x \|A(x)\| < \infty$, and $\sup_x \|D A(x)\| <\infty$. } Suppose that $f$ and $A$ satisfy (\ref{eq:smoothness basic}) and let
\begin{align}
\lambda'_f = \max_{\|x\|\le \rho} \|\nabla f(x)\|,\qquad
\lambda'_A = \max_{\|x\| \le \rho} \|DA(x)\|,
\label{eq:defn_restricted_lipsichtz}
\end{align}
be the (restricted) Lipschitz constants of $f$ and $A$, respectively.
%Suppose that $\b_1 \ge \b_0$, where the expression for $\b_0(f,g,A,\s_1)$ is given in (\ref{eq:beta-0}).
With $\nu>0$, assume that
\begin{align}
\nu \|A(x_k)\|
& \le \dist\left( -DA(x_k)^\top A(x_k) , \frac{\partial g(x_k)}{ \b_{k-1}} \right),
\label{eq:regularity}
\end{align}
for every $k\in K$. We consider two cases:
%{\color{red} Ahmet: I removed the squares and showed the rate on dist + feasibility, since it is the measure for the stationarity, using squares confuses complexity analysis...}
\begin{itemize}[leftmargin=*]
\item If a first-order solver is used in Step~2, then $x_k$ is an $(\epsilon_{k,f},\b_k)$ first-order stationary point of (\ref{eq:minmax}) with
\begin{align}
\epsilon_{k,f} & = \frac{1}{\beta_{k-1}} \left(\frac{2(\lambda'_f+\lambda'_A y_{\max}) (1+\lambda_A' \sigma_k)}{\nu}+1\right) =: \frac{Q(f,g,A,\s_1)}{\beta_{k-1}},
\label{eq:stat_prec_first}
\end{align}
for every $k\in K$, where $y_{\max}(x_1,y_0,\s_1)$ is specified in (\ref{eq:dual growth}); its explicit expression is omitted here due to limited space.
\item If a second-order solver is used in Step~2, then $x_k$ is an $(\epsilon_{k,f}, \epsilon_{k,s},\b_k)$ second-order stationary point of~(\ref{eq:minmax}) with $\epsilon_{k,f}$ specified above and with
\begin{align}
\epsilon_{k,s} &= \epsilon_{k-1} + \sigma_k \sqrt{m} \lambda_A \frac{ 2(\lambda'_f + \lambda'_A y_{\max}) }{\nu \b_{k-1}} = \frac{\nu + 2\sigma_k \sqrt{m} \lambda_A (\lambda'_f + \lambda'_A y_{\max}) }{\nu \b_{k-1}} =: \frac{Q'(f,g,A,\s_1)}{\beta_{k-1}}.
\end{align}
\end{itemize}
\end{theorem}
%A few remarks are in order about Theorem~\ref{thm:main}.
%\textbf{Tradeoff.}
%Roughly speaking, Theorem~\ref{thm:main} states that the iterates of Algorithm~\ref{Algo:2} approach first-order stationarity for~\eqref{prob:01} at the rate of $1/{\b_k}$, where $\b_k$ is the penalty weight in AL in iteration $k$ as in~\eqref{eq:Lagrangian}. Note the inherent tradeoff here: For a faster sequence $\{\b_k\}_k$, the iterates approach stationarity faster but the computational cost of the inexact primal subroutine (Step 2) increases since a higher accuracy is required and Lipschitz constant of the unconstrained problem is also affected by $\beta_k$. In words, there is a tradeoff between the number of outer and inner loop iterations of Algorithm~\ref{Algo:2}. We highlight this tradeoff for a number of subroutines below.
Theorem~\ref{thm:main} states that Algorithm~\ref{Algo:2} converges to a (first- or second-order) stationary point of \eqref{eq:minmax} at the rate of $1/\b_k$, further specified in
Corollary \ref{cor:first} and Corollary \ref{cor:second}.
%Sections \ref{sec:first-o-opt} and \ref{sec:second-o-opt}.
A few remarks are in order about Theorem \ref{thm:main}.
\paragraph{Regularity.} The key geometric condition in Theorem~\ref{thm:main} is \eqref{eq:regularity} which, broadly speaking, ensures that the primal updates of Algorithm \ref{Algo:2} reduce the feasibility gap as the penalty weight $\b_k$ grows. We will verify this condition for several examples in Section \ref{sec:experiments}.
This condition in \eqref{eq:regularity} is closely related to those in the existing literature. In the special case where $g=0$ in~\eqref{prob:01}, it is easy to verify that \eqref{eq:regularity} reduces to the Polyak-{\L}ojasiewicz (PL) condition for minimizing $\|A(x)\|^2$~\cite{karimi2016linear}. The PL condition itself is a special case of the Kurdyka-{\L}ojasiewicz condition with $\theta = 1/2$, see \cite[Definition 1.1]{xu2017globally}. When $g=0$, it is also easy to see that \eqref{eq:regularity} is weaker than the Mangasarian-Fromovitz (MF) condition in nonlinear optimization \cite[Assumption 1]{bolte2018nonconvex}.
{Moreover, {when $g$ is the indicator on a convex set,} \eqref{eq:regularity} is a consequence of the \textit{basic constraint qualification} in \cite{rockafellar1993lagrange}, which itself generalizes the MF condition to the case when $g$ is an indicator function of a convex set.}
%\notea{I'm not sure if the claim about basic constraint qualification is true because our condition should locally hold rather globally. Could you add the exact equation number in [55] and double check this? Also consider double checking other claims in this paragraph.}
We may think of \eqref{eq:regularity} as a local condition, which should hold within a neighborhood of the constraint set $\{x:A(x)=0\}$ rather than everywhere in $\mathbb{R}^d$. There is a constant complexity algorithm in \cite{bolte2018nonconvex} to reach this so-called ``information zone'', which supplements Theorem \ref{thm:main}.
Lastly, in contrast to most conditions in the nonconvex optimization literature, such as~\cite{flores2012complete}, the condition in~\eqref{eq:regularity} appears to be easier to verify, as we see in the sequel.
%\begin{exmp}[Clustering]
%Clustering is an important application
%\end{exmp}
%\edita{\textbf{AE: Fatih, I think you had added the following two references to our response for icml. Could you discuss their relevance here? [2] Rockafellar, Lagrange Multipliers and Optimality, 1993
%[3] Bertsekas, On penalty and multiplier methods for constrained minimization. 1996}}
%\edita{\textbf{AE: the spars review talks about the "Pong-Li" work. Fatih, do you know what is that?}}
%
%\editf{(mfs:) I do not think this work is relevant to our condition but the template seems similar. We can mention in the related works instead.}
%\notea{Ok. Yes, consider adding it to the related work.}
%We now argue that such a condition is necessary for controlling the feasibility gap of~\eqref{prob:01} and ensuring the success of Algorithm~\ref{Algo:2}.
%To provide further insight about \eqref{eq:regularity}, consider the convex case where $f=0$ and $g=\delta_\mathcal{X}$, where $\mathcal{X}$ is a convex set and $A$ is a linear operator. In this case, solving \eqref{prob:01} finds a point in the convex set $\mathcal{X}\cap \text{null}(A)$, where the subspace $\text{null}(A)=\{ x\in\mathbb{R}^d: A(x) = 0 \} \subset \RR^d$ is the null space of $A$.
%Here, the Slater's condition~\cite{boyd2004convex} requires that
%\begin{equation}
%\text{relint} (\mathcal{X}) \cap \text{null}(A)\neq \emptyset.
%\end{equation}
%%<<<<<<< HEAD
%Recall that the Slater's condition plays a key role in convex optimization as a sufficient condition for strong duality and, as a result, guarantees the success of a variety of primal-dual algorithms for linearly-constrained convex programs~\cite{bauschke2011convex}.
%Intuitively, the Slater's condition achieves this by removing pathological cases and ensuring that the subspace $\text{null}(A)$ is not tangent to $\mathcal{X}$, see Figure~\ref{fig:convex_slater}. In such pathological cases, solving~\eqref{prob:01} and finding a point in $\mathcal{X}\cap \text{null}(A)$ can be particularly slow. For instance, in Figure \ref{fig:convex_slater}, the alternating projection algorithm (which iteratively projects onto $\mathcal{X}$ and $\text{null}(A)$) converges to the intersection point at the suboptimal rate of $1/\sqrt{k}$ rather than the standard $1/k$ rate.
%{\color{red} Ahmet: a reference would be cool here}
%=======
%In general, the Slater's condition plays a key role in convex optimization as a sufficient condition for strong duality and, as a result, guarantees the success of a variety of primal-dual algorithms for linearly-constrained convex programs~\cite{bauschke2011convex}. Intuitively, the Slater's condition here removes any pathological cases by ensuring that the subspace $\text{null}(A)$ is not tangent to $\mathcal{X}$, see Figure~\ref{fig:convex_slater}. In such pathological cases, solving~\eqref{prob:01}, namely, finding a point in $\mathcal{X}\cap \text{null}(A)$, can be particularly difficult. For instance, the alternating projection algorithm (which iteratively projects onto $\mathcal{X}$ and $\text{null}(A)$) has arbitrarily slow convergence, see Figure~\ref{fig:convex_slater}.
%>>>>>>> 795311da274f2d8ab56d215c2fd481073e616732
%\begin{figure}
%\begin{center}
%\includegraphics[scale=.5]{Slater.pdf}
%\end{center}
%\caption{In pathological cases where the Slater's condition does not apply, solving \eqref{prob:01} can be particularly slow, even when \eqref{prob:01} is a convex program. See the first remark after Theorem~\ref{thm:main} for more details. \label{fig:convex_slater}}
%\end{figure}
\paragraph{Penalty method.}
A classical algorithm to solve \eqref{prob:01} is the penalty method, which is characterized by the absence of the dual variable ($y=0$) in \eqref{eq:Lagrangian}. Indeed, ALM can be interpreted as an adaptive penalty or smoothing method with a variable center determined by the dual variable. It is worth noting that, with the same proof technique, one can establish the same convergence rate of Theorem \ref{thm:main} for the penalty method. However, while both methods have the same convergence rate in theory, we ignore the uncompetitive penalty method since it is significantly outperformed by iALM in practice. % outperforms the penalty method significantly in practice. % by virtue of its variable center and has been excluded from this presentation for this reason.
\paragraph{Computational complexity.} Theorem~\ref{thm:main} specifies the number of (outer) iterations that Algorithm~\ref{Algo:2} requires to reach a near-stationary point of problem~\eqref{eq:Lagrangian} with a prescribed precision and, in particular, specifies the number of calls made to the solver in Step~2. In this sense, Theorem~\ref{thm:main} does not fully capture the computational complexity of Algorithm~\ref{Algo:2}, as it does not take into account the computational cost of the solver in Step~2.
To better understand the total iteration complexity of Algorithm~\ref{Algo:2}, we consider two scenarios in the following. In the first scenario, we take the solver in Step~2 to be the Accelerated Proximal Gradient Method (APGM), a well-known first-order algorithm~\cite{ghadimi2016accelerated}. In the second scenario, we will use the second-order trust region method developed in~\cite{cartis2012complexity}.
We have the following two corollaries showing the total complexity of our algorithm to reach first and second-order stationary points.
Appendix \ref{sec:opt_cnds} contains the proofs and more detailed discussion for the complexity results.
%
\begin{corollary}[First-order optimality]\label{cor:first}
For $b>1$, let $\beta_k =b^k $ for every $k$. If we use APGM from~\cite{ghadimi2016accelerated} for Step~2 of Algorithm~\ref{Algo:2}, the algorithm finds an $(\epsilon_f,\b_k)$ first-order stationary point of~\eqref{eq:minmax},
after $T$ calls to the first-order oracle, where
%
\begin{equation}
T = \mathcal{O}\left( \frac{Q^3 \rho^2}{\epsilon_f^{3}}\log_b{\left( \frac{Q}{\epsilon_f} \right)} \right) = \tilde{\mathcal{O}}\left( \frac{Q^{3} \rho^2}{\epsilon_f^{3}} \right).
\end{equation}
\end{corollary}
%For Algorithm~\ref{Algo:2} to reach a near-stationary point with an accuracy of $\epsilon_f$ in the sense of \eqref{eq:inclu3} and with the lowest computational cost, we therefore need to perform only one iteration of Algorithm~\ref{Algo:2}, with $\b_1$ specified as a function of $\epsilon_f$ by \eqref{eq:stat_prec_first} in Theorem~\ref{thm:main}. In general, however, the constants in \eqref{eq:stat_prec_first} are unknown and this approach is thus not feasible. Instead, the homotopy approach taken by Algorithm~\ref{Algo:2} ensures achieving the desired accuracy by gradually increasing the penalty weight.\footnote{In this context, homotopy loosely corresponds to the gradual enforcement of the constraints by increasing the penalty weight.} This homotopy approach increases the computational cost of Algorithm~\ref{Algo:2} only by a factor logarithmic in the $\epsilon_f$, as detailed in the proof of Corollary~\ref{cor:first}.
For Algorithm~\ref{Algo:2} to reach a near-stationary point with an accuracy of $\epsilon_f$ in the sense of \eqref{eq:inclu3} and with the lowest computational cost, we therefore need to perform only one iteration of Algorithm~\ref{Algo:2}, with $\b_1$ specified as a function of $\epsilon_f$ by \eqref{eq:stat_prec_first} in Theorem~\ref{thm:main}. In general, however, the constants in \eqref{eq:stat_prec_first} are unknown and this approach is thus not feasible. Instead, the homotopy approach taken by Algorithm~\ref{Algo:2} ensures achieving the desired accuracy by gradually increasing the penalty weight. This homotopy approach increases the computational cost of Algorithm~\ref{Algo:2} only by a factor logarithmic in $\epsilon_f$, as detailed in the proof of Corollary~\ref{cor:first}.
\begin{corollary}[Second-order optimality]\label{cor:second}
For $b>1$, let $\beta_k =b^k $ for every $k$.
We assume that
\begin{equation}
\mathcal{L}_{\beta}(x_1, y) - \min_{x}\mathcal{L}_{\beta}(x, y) \leq L_{u},\qquad \forall \beta.
\end{equation}
If we use the trust region method from~\cite{cartis2012complexity} for Step~2 of Algorithm~\ref{Algo:2}, the algorithm finds an $\epsilon$-second-order stationary point of~\eqref{eq:minmax} in $T$ calls to the second-order oracle where
%
\begin{equation}
T = \mathcal{O}\left( \frac{L_u Q'^{5}}{\epsilon^{5}} \log_b{\left( \frac{Q'}{\epsilon} \right)} \right) = \widetilde{\mathcal{O}}\left( \frac{L_u Q'^{5}}{\epsilon^{5}} \right).
\end{equation}
\end{corollary}
\paragraph{Remark.}
These complexity results are for first- and second-order stationarity with respect to~\eqref{eq:Lagrangian}.
We note that these complexities match~\cite{cartis2018optimality} and~\cite{birgin2016evaluation}.
However, the stationarity criteria and the definition of dual variable in these papers differ from ours.
We include more discussion on this in the Appendix.

Event Timeline