Page MenuHomec4science

guarantees.tex
No OneTemporary

File Metadata

Created
Sat, May 18, 12:54

guarantees.tex

%!TEX root = ../main.tex
\section{Convergence Rate}\label{sec:cvg rate}
This section presents the total iteration complexity of Algorithm~\ref{Algo:2} for finding first- and second-order stationary points of problem \eqref{eq:minmax}.
All the proofs are deferred to Appendix~\ref{sec:theory}.
Theorem~\ref{thm:main} characterizes the convergence rate of Algorithm~\ref{Algo:2} for finding stationary points in the number of outer iterations.
\begin{theorem}\textbf{\emph{(convergence rate)}}
\label{thm:main} For integers $2 \le k_0\le k_1$, consider the interval $K=[k_0:k_1]$, and let $\{x_k\}_{k\in K}$ be the output sequence of Algorithm~\ref{Algo:2} on the interval $K$.\footnote{The choice of $k_1 = \infty$ is valid here too.} Let also $\rho:=\sup_{k\in K} \|x_k\|$.\footnote{If necessary, to ensure that $\rho<\infty$, one can add a small factor of $\|x\|^2$ to $\mathcal{L}_{\b}$ in \eqref{eq:Lagrangian}. Then it is easy to verify that the iterates of Algorithm \ref{Algo:2} remain bounded, provided that the initial penalty weight $\beta_0$ is large enough, $\sup_x \|\nabla f(x)\|/\|x\|< \infty$, $\sup_x \|A(x)\| < \infty$, and $\sup_x \|D A(x)\| <\infty$. } Suppose that $f$ and $A$ satisfy (\ref{eq:smoothness basic}) and let
\begin{align}
\lambda'_f = \max_{\|x\|\le \rho} \|\nabla f(x)\|,\qquad
\lambda'_A = \max_{\|x\| \le \rho} \|DA(x)\|,
\label{eq:defn_restricted_lipsichtz}
\end{align}
be the (restricted) Lipschitz constants of $f$ and $A$, respectively.
%Suppose that $\b_1 \ge \b_0$, where the expression for $\b_0(f,g,A,\s_1)$ is given in (\ref{eq:beta-0}).
With $\nu>0$, assume that
\begin{align}
\nu \|A(x_k)\|
& \le \dist\left( -DA(x_k)^\top A(x_k) , \frac{\partial g(x_k)}{ \b_{k-1}} \right),
\label{eq:regularity}
\end{align}
for every $k\in K$. We consider two cases:
%{\color{red} Ahmet: I removed the squares and showed the rate on dist + feasibility, since it is the measure for the stationarity, using squares confuses complexity analysis...}
\begin{itemize}[leftmargin=*]
\item If a first-order solver is used in Step~2, then $x_k$ is an $(\epsilon_{k,f},\b_k)$ first-order stationary point of (\ref{eq:minmax}) with
\begin{align}
\epsilon_{k,f} & = \frac{1}{\beta_{k-1}} \left(\frac{2(\lambda'_f+\lambda'_A y_{\max}) (1+\lambda_A' \sigma_k)}{\nu}+1\right) =: \frac{Q(f,g,A,\s_1)}{\beta_{k-1}},
\label{eq:stat_prec_first}
\end{align}
for every $k\in K$, where $y_{\max}(x_1,y_0,\s_1)$ is specified in (\ref{eq:dual growth}); its explicit expression is omitted here due to limited space.
\item If a second-order solver is used in Step~2, then $x_k$ is an $(\epsilon_{k,f}, \epsilon_{k,s},\b_k)$ second-order stationary point of~(\ref{eq:minmax}) with $\epsilon_{k,f}$ specified above and with
\begin{align}
\epsilon_{k,s} &= \epsilon_{k-1} + \sigma_k \sqrt{m} \lambda_A \frac{ 2(\lambda'_f + \lambda'_A y_{\max}) }{\nu \b_{k-1}} = \frac{\nu + 2\sigma_k \sqrt{m} \lambda_A (\lambda'_f + \lambda'_A y_{\max}) }{\nu \b_{k-1}} =: \frac{Q'(f,g,A,\s_1)}{\beta_{k-1}}.
\end{align}
\end{itemize}
\end{theorem}
%A few remarks are in order about Theorem~\ref{thm:main}.
%\textbf{Tradeoff.}
%Roughly speaking, Theorem~\ref{thm:main} states that the iterates of Algorithm~\ref{Algo:2} approach first-order stationarity for~\eqref{prob:01} at the rate of $1/{\b_k}$, where $\b_k$ is the penalty weight in AL in iteration $k$ as in~\eqref{eq:Lagrangian}. Note the inherent tradeoff here: For a faster sequence $\{\b_k\}_k$, the iterates approach stationarity faster but the computational cost of the inexact primal subroutine (Step 2) increases since a higher accuracy is required and Lipschitz constant of the unconstrained problem is also affected by $\beta_k$. In words, there is a tradeoff between the number of outer and inner loop iterations of Algorithm~\ref{Algo:2}. We highlight this tradeoff for a number of subroutines below.
Theorem~\ref{thm:main} states that Algorithm~\ref{Algo:2} converges to a (first- or second-order) stationary point of \eqref{eq:minmax} at the rate of $1/\b_k$, further specified in
Corollary \ref{cor:first} and Corollary \ref{cor:second}.
%Sections \ref{sec:first-o-opt} and \ref{sec:second-o-opt}.
A few remarks are in order about Theorem \ref{thm:main}.
\paragraph{Regularity.} The key geometric condition in Theorem~\ref{thm:main} is \eqref{eq:regularity} which, broadly speaking, ensures that the primal updates of Algorithm \ref{Algo:2} reduce the feasibility gap as the penalty weight $\b_k$ grows. We will verify this condition for several examples in Section \ref{sec:experiments}.
This condition in \eqref{eq:regularity} is closely related to those in the existing literature. In the special case where $g=0$ in~\eqref{prob:01}, it is easy to verify that \eqref{eq:regularity} reduces to the Polyak-{\L}ojasiewicz (PL) condition for minimizing $\|A(x)\|^2$~\cite{karimi2016linear}. The PL condition itself is a special case of the Kurdyka-{\L}ojasiewicz condition with $\theta = 1/2$, see \cite[Definition 1.1]{xu2017globally}. When $g=0$, it is also easy to see that \eqref{eq:regularity} is weaker than the Mangasarian-Fromovitz (MF) condition in nonlinear optimization \cite[Assumption 1]{bolte2018nonconvex}.
{Moreover, {when $g$ is the indicator on a convex set,} \eqref{eq:regularity} is a consequence of the \textit{basic constraint qualification} in \cite{rockafellar1993lagrange}, which itself generalizes the MF condition to the case when $g$ is an indicator function of a convex set.}
%\notea{I'm not sure if the claim about basic constraint qualification is true because our condition should locally hold rather globally. Could you add the exact equation number in [55] and double check this? Also consider double checking other claims in this paragraph.}
We may think of \eqref{eq:regularity} as a local condition, which should hold within a neighborhood of the constraint set $\{x:A(x)=0\}$ rather than everywhere in $\mathbb{R}^d$. There is a constant complexity algorithm in \cite{bolte2018nonconvex} to reach this so-called ``information zone'', which supplements Theorem \ref{thm:main}.
Lastly, in contrast to most conditions in the nonconvex optimization literature, such as~\cite{flores2012complete}, the condition in~\eqref{eq:regularity} appears to be easier to verify, as we see in the sequel.
%\begin{exmp}[Clustering]
%Clustering is an important application
%\end{exmp}
%\edita{\textbf{AE: Fatih, I think you had added the following two references to our response for icml. Could you discuss their relevance here? [2] Rockafellar, Lagrange Multipliers and Optimality, 1993
%[3] Bertsekas, On penalty and multiplier methods for constrained minimization. 1996}}
%\edita{\textbf{AE: the spars review talks about the "Pong-Li" work. Fatih, do you know what is that?}}
%
%\editf{(mfs:) I do not think this work is relevant to our condition but the template seems similar. We can mention in the related works instead.}
%\notea{Ok. Yes, consider adding it to the related work.}
%We now argue that such a condition is necessary for controlling the feasibility gap of~\eqref{prob:01} and ensuring the success of Algorithm~\ref{Algo:2}.
%To provide further insight about \eqref{eq:regularity}, consider the convex case where $f=0$ and $g=\delta_\mathcal{X}$, where $\mathcal{X}$ is a convex set and $A$ is a linear operator. In this case, solving \eqref{prob:01} finds a point in the convex set $\mathcal{X}\cap \text{null}(A)$, where the subspace $\text{null}(A)=\{ x\in\mathbb{R}^d: A(x) = 0 \} \subset \RR^d$ is the null space of $A$.
%Here, the Slater's condition~\cite{boyd2004convex} requires that
%\begin{equation}
%\text{relint} (\mathcal{X}) \cap \text{null}(A)\neq \emptyset.
%\end{equation}
%%<<<<<<< HEAD
%Recall that the Slater's condition plays a key role in convex optimization as a sufficient condition for strong duality and, as a result, guarantees the success of a variety of primal-dual algorithms for linearly-constrained convex programs~\cite{bauschke2011convex}.
%Intuitively, the Slater's condition achieves this by removing pathological cases and ensuring that the subspace $\text{null}(A)$ is not tangent to $\mathcal{X}$, see Figure~\ref{fig:convex_slater}. In such pathological cases, solving~\eqref{prob:01} and finding a point in $\mathcal{X}\cap \text{null}(A)$ can be particularly slow. For instance, in Figure \ref{fig:convex_slater}, the alternating projection algorithm (which iteratively projects onto $\mathcal{X}$ and $\text{null}(A)$) converges to the intersection point at the suboptimal rate of $1/\sqrt{k}$ rather than the standard $1/k$ rate.
%{\color{red} Ahmet: a reference would be cool here}
%=======
%In general, the Slater's condition plays a key role in convex optimization as a sufficient condition for strong duality and, as a result, guarantees the success of a variety of primal-dual algorithms for linearly-constrained convex programs~\cite{bauschke2011convex}. Intuitively, the Slater's condition here removes any pathological cases by ensuring that the subspace $\text{null}(A)$ is not tangent to $\mathcal{X}$, see Figure~\ref{fig:convex_slater}. In such pathological cases, solving~\eqref{prob:01}, namely, finding a point in $\mathcal{X}\cap \text{null}(A)$, can be particularly difficult. For instance, the alternating projection algorithm (which iteratively projects onto $\mathcal{X}$ and $\text{null}(A)$) has arbitrarily slow convergence, see Figure~\ref{fig:convex_slater}.
%>>>>>>> 795311da274f2d8ab56d215c2fd481073e616732
%\begin{figure}
%\begin{center}
%\includegraphics[scale=.5]{Slater.pdf}
%\end{center}
%\caption{In pathological cases where the Slater's condition does not apply, solving \eqref{prob:01} can be particularly slow, even when \eqref{prob:01} is a convex program. See the first remark after Theorem~\ref{thm:main} for more details. \label{fig:convex_slater}}
%\end{figure}
\paragraph{Penalty method.}
A classical algorithm to solve \eqref{prob:01} is the penalty method, which is characterized by the absence of the dual variable ($y=0$) in \eqref{eq:Lagrangian}. Indeed, ALM can be interpreted as an adaptive penalty or smoothing method with a variable center determined by the dual variable. It is worth noting that, with the same proof technique, one can establish the same convergence rate of Theorem \ref{thm:main} for the penalty method. However, while both methods have the same convergence rate in theory, we ignore the uncompetitive penalty method since it is significantly outperformed by iALM in practice. % outperforms the penalty method significantly in practice. % by virtue of its variable center and has been excluded from this presentation for this reason.
\paragraph{Computational complexity.} Theorem~\ref{thm:main} specifies the number of (outer) iterations that Algorithm~\ref{Algo:2} requires to reach a near-stationary point of problem~\eqref{eq:Lagrangian} with a prescribed precision and, in particular, specifies the number of calls made to the solver in Step~2. In this sense, Theorem~\ref{thm:main} does not fully capture the computational complexity of Algorithm~\ref{Algo:2}, as it does not take into account the computational cost of the solver in Step~2.
To better understand the total iteration complexity of Algorithm~\ref{Algo:2}, we consider two scenarios in the following. In the first scenario, we take the solver in Step~2 to be the Accelerated Proximal Gradient Method (APGM), a well-known first-order algorithm~\cite{ghadimi2016accelerated}. In the second scenario, we will use the second-order trust region method developed in~\cite{cartis2012complexity}.
We have the following two corollaries showing the total complexity of our algorithm to reach first and second-order stationary points.
Appendix \ref{sec:opt_cnds} contains the proofs and more detailed discussion for the complexity results.
%
\begin{corollary}[First-order optimality]\label{cor:first}
For $b>1$, let $\beta_k =b^k $ for every $k$. If we use APGM from~\cite{ghadimi2016accelerated} for Step~2 of Algorithm~\ref{Algo:2}, the algorithm finds an $(\epsilon_f,\b_k)$ first-order stationary point of~\eqref{eq:minmax},
after $T$ calls to the first-order oracle, where
%
\begin{equation}
T = \mathcal{O}\left( \frac{Q^3 \rho^2}{\epsilon_f^{3}}\log_b{\left( \frac{Q}{\epsilon_f} \right)} \right) = \tilde{\mathcal{O}}\left( \frac{Q^{3} \rho^2}{\epsilon_f^{3}} \right).
\end{equation}
\end{corollary}
%For Algorithm~\ref{Algo:2} to reach a near-stationary point with an accuracy of $\epsilon_f$ in the sense of \eqref{eq:inclu3} and with the lowest computational cost, we therefore need to perform only one iteration of Algorithm~\ref{Algo:2}, with $\b_1$ specified as a function of $\epsilon_f$ by \eqref{eq:stat_prec_first} in Theorem~\ref{thm:main}. In general, however, the constants in \eqref{eq:stat_prec_first} are unknown and this approach is thus not feasible. Instead, the homotopy approach taken by Algorithm~\ref{Algo:2} ensures achieving the desired accuracy by gradually increasing the penalty weight.\footnote{In this context, homotopy loosely corresponds to the gradual enforcement of the constraints by increasing the penalty weight.} This homotopy approach increases the computational cost of Algorithm~\ref{Algo:2} only by a factor logarithmic in the $\epsilon_f$, as detailed in the proof of Corollary~\ref{cor:first}.
For Algorithm~\ref{Algo:2} to reach a near-stationary point with an accuracy of $\epsilon_f$ in the sense of \eqref{eq:inclu3} and with the lowest computational cost, we therefore need to perform only one iteration of Algorithm~\ref{Algo:2}, with $\b_1$ specified as a function of $\epsilon_f$ by \eqref{eq:stat_prec_first} in Theorem~\ref{thm:main}. In general, however, the constants in \eqref{eq:stat_prec_first} are unknown and this approach is thus not feasible. Instead, the homotopy approach taken by Algorithm~\ref{Algo:2} ensures achieving the desired accuracy by gradually increasing the penalty weight. This homotopy approach increases the computational cost of Algorithm~\ref{Algo:2} only by a factor logarithmic in $\epsilon_f$, as detailed in the proof of Corollary~\ref{cor:first}.
\begin{corollary}[Second-order optimality]\label{cor:second}
For $b>1$, let $\beta_k =b^k $ for every $k$.
We assume that
\begin{equation}
\mathcal{L}_{\beta}(x_1, y) - \min_{x}\mathcal{L}_{\beta}(x, y) \leq L_{u},\qquad \forall \beta.
\end{equation}
If we use the trust region method from~\cite{cartis2012complexity} for Step~2 of Algorithm~\ref{Algo:2}, the algorithm finds an $\epsilon$-second-order stationary point of~\eqref{eq:minmax} in $T$ calls to the second-order oracle where
%
\begin{equation}
T = \mathcal{O}\left( \frac{L_u Q'^{5}}{\epsilon^{5}} \log_b{\left( \frac{Q'}{\epsilon} \right)} \right) = \widetilde{\mathcal{O}}\left( \frac{L_u Q'^{5}}{\epsilon^{5}} \right).
\end{equation}
\end{corollary}
\paragraph{Remark.}
These complexity results are for first- and second-order stationarity with respect to~\eqref{eq:Lagrangian}.
We note that these complexities match~\cite{cartis2018optimality} and~\cite{birgin2016evaluation}.
However, the stationarity criteria and the definition of dual variable in these papers differ from ours.
We include more discussion on this in the Appendix.

Event Timeline