diff --git a/NeurIPS 19/main.tex b/NeurIPS 19/main.tex index cec33a4..06a1d87 100644 --- a/NeurIPS 19/main.tex +++ b/NeurIPS 19/main.tex @@ -1,215 +1,217 @@ \documentclass{article} %\usepackage[a4paper,top=3cm,bottom=2cm,left=3cm,right=3cm,marginparwidth=1.75cm]{geometry} %%% if you need to pass options to natbib, use, e.g.: %%% \PassOptionsToPackage{numbers, compress}{natbib} %%% before loading neurips_2019 %% %%% ready for submission \usepackage[final]{neurips_2019} %% %%% to compile a preprint version, e.g., for submission to arXiv, add add the %%% [preprint] option: %%% \usepackage[preprint]{neurips_2019} %% %%% to compile a camera-ready version, add the [final] option, e.g.: % \usepackage[final]{neurips_2019} %% %%% to avoid loading the natbib package, add option nonatbib: %%% \usepackage[nonatbib]{neurips_2019} % % \usepackage[utf8]{inputenc} % allow utf-8 input \usepackage[T1]{fontenc} % use 8-bit T1 fonts \usepackage{hyperref} % hyperlinks \usepackage{url} % simple URL typesetting \usepackage{booktabs} % professional-quality tables \usepackage{amsfonts} % blackboard math symbols \usepackage{nicefrac} % compact symbols for 1/2, etc. \usepackage{microtype} % microtypography \usepackage{amsmath} \usepackage{graphicx} \usepackage{multirow} \usepackage{amsthm} %%%%%%% HCGM formatting FROM HERE %%%%%%% %%\documentclass[11pt]{article} %% %%\usepackage{microtype} %%\usepackage{graphicx} %%\usepackage{subfigure} %%\usepackage{booktabs} %% %%\usepackage{hyperref} %%\usepackage{multirow} %% %%\newcommand{\theHalgorithm}{\arabic{algorithm}} %% %%\synctex=1 %% %%\newcommand\hmmax{0} %%\newcommand\bmmax{0} %% %%\usepackage{amsmath,amsbsy,amsgen,amscd,amssymb,amsthm,amsfonts,stmaryrd} %%\usepackage{mathtools} %% %%\usepackage{bm} %% %%\usepackage{microtype} %% %%\usepackage{hyperref} %%\usepackage{url} %% %%\usepackage[usenames,dvipsnames]{xcolor} %%\usepackage{graphicx} %% %%\graphicspath{{art/}} %% %%\DeclareFontFamily{OT1}{pzc}{} %%\DeclareFontShape{OT1}{pzc}{m}{it}{<-> s * [1.200] pzcmi7t}{} %%\DeclareMathAlphabet{\mathpzc}{OT1}{pzc}{m}{it} %%\usepackage[margin=1in]{geometry} %%\usepackage{scrlayer-scrpage} %% %% %%\newcommand\blfootnote[1]{% %% \begingroup %% \renewcommand\thefootnote{}\footnote{#1}% %% \addtocounter{footnote}{-1}% %% \endgroup %%} %%%%%%%%% UNTIL HERE %%%%%%%% \input{preamble} \title{An Inexact Augmented Lagrangian Framework for \\ Nonconvex Optimization with Nonlinear Constraints} % The \author macro works with any number of authors. There are two commands % used to separate the names and addresses of multiple authors: \And and \AND. % % Using \And between authors leaves it to LaTeX to determine where to break the % lines. Using \AND forces a line break at that point. So, if LaTeX puts 3 of 4 % authors names on the first line, and the last on the second line, try using % \AND instead of \And before the third author name. 
\author{Mehmet Fatih Sahin\\ \texttt{mehmet.sahin@epfl.ch} \And Armin Eftekhari \\ \texttt{armin.eftekhari@epfl.ch} \And Ahmet Alacaoglu \\ \texttt{ahmet.alacaoglu@epfl.ch} \And Fabian Latorre \\ \texttt{fabian.latorre@epfl.ch} \And Volkan Cevher \\ \texttt{volkan.cevher@epfl.ch} \And \\ LIONS, Ecole Polytechnique F\'ed\'erale de Lausanne, Switzerland %\texttt{{\footnotesize{\{mehmet.sahin, armin.eftekhari, ahmet.alacaoglu, fabian.latorregomez, volkan.cevher\}@epfl.ch}}} %\thanks{Correspondence to: Mehmet Fatih Sahin <\texttt{mehmet.sahin@epfl.ch}>} } %\author{Mehmet Fatih Sahin %$\qquad$Armin Eftekhari %$\qquad$Ahmet Alacaoglu\\~\\ %Fabian Latorre %$\qquad$Volkan Cevher\\ %~\\ %LIONS, Ecole Polytechnique F\'ed\'erale de Lausanne, Switzerland \\ %%\texttt{{\footnotesize{\{mehmet.sahin, armin.eftekhari, ahmet.alacaoglu, fabian.latorregomez, volkan.cevher\}@epfl.ch}}} %\blfootnote{Correspondence to: Volkan Cevher $<$\texttt{volkan.cevher@epfl.ch}$>$ } %} % %\date{} %\ofoot{asdfads} % %\author{% % David S.~Hippocampus\thanks{Use footnote for providing further information % about author (webpage, alternative address)---\emph{not} for acknowledging % funding agencies.} \\ % Department of Computer Science\\ % Cranberry-Lemon University\\ % Pittsburgh, PA 15213 \\ % \texttt{hippo@cs.cranberry-lemon.edu} \\ % % examples of more authors % % \And % % Coauthor \\ % % Affiliation \\ % % Address \\ % % \texttt{email} \\ % % \AND % % Coauthor \\ % % Affiliation \\ % % Address \\ % % \texttt{email} \\ % % \And % % Coauthor \\ % % Affiliation \\ % % Address \\ % % \texttt{email} \\ % % \And % % Coauthor \\ % % Affiliation \\ % % Address \\ % % \texttt{email} \\ %} \begin{document} \maketitle \input{sections/abstract.tex} \input{sections/introduction.tex} \input{sections/preliminaries.tex} \input{sections/AL.tex} \input{sections/guarantees.tex} \input{sections/related_works.tex} \input{sections/experiments.tex} \input{sections/conclusion.tex} \section*{Acknowledgements} \vspace{-2mm} +The authors would like to thank Nicolas Boumal and Nadav Hallak for the helpful suggestions. \\ +~\\ This project has received funding from the European Research Council (ERC) under the European Union's Horizon $2020$ research and innovation programme (grant agreement n$^{\circ}$ $725594$ - time-data) and was supported by the Swiss National Science Foundation (SNSF) under grant number $200021\_178865 / 1$. This project was also sponsored by the Department of the Navy, Office of Naval Research (ONR) under a grant number N$62909$-$17$-$1$-$2111$ and was supported by Hasler Foundation Program: Cyber Human Systems (project number $16066$). This research was supported by the PhD fellowship program of the Swiss Data Science Center (SDSC) under grant lD number P18-07. \bibliographystyle{abbrv} % basic style, author-year citations \bibliography{bibliography.bib} % name your BibTeX data base \newpage \appendix \input{sections/AL_theory.tex} \input{sections/appendix.tex} %\subsubsection*{Acknowledgments} % %Use unnumbered third level headings for the acknowledgments. All acknowledgments %go at the end of the paper. Do not include acknowledgments in the anonymized %submission, only in the final paper. % %\section*{References} % %References follow the acknowledgments. Use unnumbered first-level heading for %the references. Any choice of citation style is acceptable as long as you are %consistent. It is permissible to reduce the font size to \verb+small+ (9 point) %when listing the references. 
{\bf Remember that you can use more than eight % pages as long as the additional pages contain \emph{only} cited references.} %\medskip % %\small % %[1] Alexander, J.A.\ \& Mozer, M.C.\ (1995) Template-based algorithms for %connectionist rule extraction. In G.\ Tesauro, D.S.\ Touretzky and T.K.\ Leen %(eds.), {\it Advances in Neural Information Processing Systems 7}, %pp.\ 609--616. Cambridge, MA: MIT Press. % %[2] Bower, J.M.\ \& Beeman, D.\ (1995) {\it The Book of GENESIS: Exploring % Realistic Neural Models with the GEneral NEural SImulation System.} New York: %TELOS/Springer--Verlag. % %[3] Hasselmo, M.E., Schnell, E.\ \& Barkai, E.\ (1995) Dynamics of learning and %recall at excitatory recurrent synapses and cholinergic modulation in rat %hippocampal region CA3. {\it Journal of Neuroscience} {\bf 15}(7):5249-5262. \end{document} \ No newline at end of file diff --git a/NeurIPS 19/sections/AL_theory.tex b/NeurIPS 19/sections/AL_theory.tex index efc9a88..529cb15 100644 --- a/NeurIPS 19/sections/AL_theory.tex +++ b/NeurIPS 19/sections/AL_theory.tex @@ -1,365 +1,367 @@ %!TEX root = ../main.tex %\section{Optimality Conditions} %\label{sec:opt_cnds} %{First-order necessary optimality conditions} for \eqref{prob:01} are well-studied. {Indeed, $x\in \RR^d$ is a first-order stationary point of~\eqref{prob:01} if there exists $y\in \RR^m$ such that %% %%%% I am cutting this to save space. put it back if necessary (VC) %%\begin{align} %%%\begin{cases} %%-\nabla f(x) - DA(x)^\top y \in \partial g(x), \qquad \,\, A(x) = 0, %%%\end{cases} %%\label{e:inclu1} %%\end{align} %% %%where $DA(x)$ is the Jacobian of $A$ at $x$. Recalling \eqref{eq:Lagrangian}, we observe that \eqref{e:inclu1} is equivalent to %\begin{align} %%\begin{cases} %-\nabla_x \mathcal{L}_\beta(x,y) \in \partial g(x),\qquad A(x) = 0, %%\end{cases} %\label{e:inclu2} %\end{align} %which is in turn the necessary optimality condition for \eqref{eq:minmax}.} %%Note that \eqref{e:inclu2} is equivalent to %%\begin{align} %%\left[ %%\begin{array}{c} %%\nabla_x \L_\b(x,y) \\ %%\nabla_y \L_\b(x,y) %%\end{array} %%\right] %%\in %%\left\{ %%\left[ %%\begin{array}{c} %% v\\ %%0 %%\end{array} %%\right] %%: v\in \partial g(x) \right\} %%, %%\end{align} %%which rephrases the stationarity condition in terms of the gradient of the duality gap of Program~\eqref{eq:Lagrangian}. %%\editf{mfs: check approx. optimality conditions, how they apply in this setting.} %Inspired by this, we say that $x$ is an $(\epsilon_f,\b)$ first-order stationary point of \eqref{eq:minmax} if {there exists a $y \in \RR^m$} such that %\begin{align} %%\begin{cases} %\dist(-\nabla_x \mathcal{L}_\beta(x,y), \partial g(x)) \leq \epsilon_f, \qquad \| A(x) \| \leq \epsilon_f, %%\end{cases} %\label{eq:inclu3app} %\end{align} %for $\epsilon_f\ge 0$. %In light of \eqref{eq:inclu3app}, a metric for evaluating the stationarity of a pair $(x,y)\in \RR^d\times \RR^m$ is %\begin{align} %\dist\left(-\nabla_x \mathcal{L}_\beta(x,y), \partial g(x) \right) + \|A(x)\| , %\label{eq:cvg metricapp} %\end{align} %which we use as the first-order stopping criterion. %%When $g=0$, it is also not difficult to verify that the expression in \eqref{eq:cvg metric} is the norm of the gradient of the duality gap in \eqref{eq:minmax}. %As an example, for a convex set $\mathcal{X}\subset\RR^d$, suppose that $g = \delta_\mathcal{X}$ is the indicator function on $\mathcal{X}$. 
%Let also $T_\mathcal{X}(x) \subseteq \RR^d$ denote the tangent cone to $\mathcal{X}$ at $x$, and with $P_{T_\mathcal{X}(x)}:\RR^d\rightarrow\RR^d$ we denote the orthogonal projection onto this tangent cone. Then, for $u\in\RR^d$, it is not difficult to verify that %\begin{align}\label{eq:dist_subgrad} %\dist\left(u, \partial g(x) \right) = \| P_{T_\mathcal{X}(x)}(u) \|. %\end{align} %% %When $g = 0$, a first-order stationary point $x\in \RR^d$ of \eqref{prob:01} is also second-order stationary if %% %%\vspace{-.5cm} %\begin{equation} %\lambda_{\text{min}}(\nabla _{xx} \mathcal{L}_{\beta}(x,y))\ge 0, %\end{equation} %%\vspace{-.5cm} %% %where $\nabla_{xx}\mathcal{L}_\b$ is the Hessian of $\mathcal{L}_\b$ with respect to $x$, and $\lambda_{\text{min}}(\cdot)$ returns the smallest eigenvalue of its argument. %Analogously, $x$ is an $(\epsilon_f, \epsilon_s,\b)$ second-order stationary point if, in addition to \eqref{eq:inclu3}, it holds that %% %%\vspace{-.5cm} %\begin{equation}\label{eq:sec_opt} %\lambda_{\text{min}}(\nabla _{xx} \mathcal{L}_{\beta}(x,y)) \ge -\epsilon_s, %\end{equation} %for $\epsilon_s\ge 0$. %%\vspace{-.5cm} %% %Naturally, for second-order stationarity, we use $\lambda_{\text{min}}(\nabla _{xx} \mathcal{L}_{\beta}(x,y))$ as the stopping criterion. % \section{Complexity Results}\label{sec:opt_cnds} \subsection{First-Order Optimality \label{sec:first-o-opt}} %\textbf{AE: we go back and forth between "subroutine" and "solver". for consistency, I'm just using "solver" everywhere.} Let us first consider the case where the solver in Step~2 is the first-order algorithm APGM, described in detail in~\cite{ghadimi2016accelerated}. At a high level, APGM makes use of $\nabla_x \mathcal{L}_{\beta}(x,y)$ in \eqref{eq:Lagrangian}, the proximal operator $\text{prox}_g$, and the classical Nesterov acceleration~\cite{nesterov1983method} to reach first-order stationarity for the subproblem in~\eqref{e:exac}. % \textbf{AE: Ahmet to give a brief description of APGM here. } %First, we characterize the iteration complexity of Algorithm~\ref{Algo:2} for finding a first-order stationary point of~\eqref{prob:01}. %We propose to use the standard accelerated proximal method (APGM), guarantees of which are analyzed in~\cite{ghadimi2016accelerated} for nonconvex problems of the form~\eqref{e:exac}. Suppose that $g=\delta_\mathcal{X}$ is the indicator function on a bounded convex set $\mathcal{X}\subset \RR^d$ and let \begin{align} \rho= \max_{x\in \mathcal{X}}\|x\|, \end{align} be the radius of a ball centered at the origin that includes $\mathcal{X}$. - Then, adapting the results in~\cite{ghadimi2016accelerated} to our setup, APGM reaches $x_{k}$ in Step 2 of Algorithm~\ref{Algo:2} after + Then, adapting the results in~\cite{ghadimi2016accelerated} to our setup, APGM reaches $x_{k}$ in Step 2 of Algorithm~\ref{Algo:2} {\color{blue}after \begin{equation} -\mathcal{O}\left ( \frac{\lambda_{\beta_{k}}^2 \rho^{2} }{\epsilon_{k+1}} \right) +\mathcal{O}\left ( \frac{\lambda_{\beta_{k}}^2 \rho^{2} }{\epsilon_{k+1}^2} \right) \label{eq:iter_1storder} \end{equation} +} (inner) iterations, where $\lambda_{\beta_{k}}$ denotes the Lipschitz constant of $\nabla_x{\mathcal{L}_{\beta_{k}}(x, y)}$, bounded in~\eqref{eq:smoothness of Lagrangian}. For clarity of presentation, we have used a looser bound in \eqref{eq:iter_1storder} compared to~\cite{ghadimi2016accelerated}.
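For concreteness, one pass of a generic accelerated proximal gradient scheme for the subproblem in Step~2 takes the schematic form
\begin{align*}
x^{t+1} = \text{prox}_{g/\lambda_{\beta_k}}\left( u^{t} - \frac{1}{\lambda_{\beta_k}} \nabla_x \mathcal{L}_{\beta_k}(u^{t}, y_k) \right),
\qquad
u^{t+1} = x^{t+1} + \theta_t \left( x^{t+1} - x^{t} \right),
\end{align*}
where the inner loop is stopped as soon as $\dist(-\nabla_x \mathcal{L}_{\beta_k}(x^{t}, y_k), \partial g(x^{t})) \le \epsilon_{k+1}$. We stress that this display is only a sketch: the step size $1/\lambda_{\beta_k}$, the auxiliary sequence $\{u^t\}_t$, and the momentum parameters $\{\theta_t\}_t$ are illustrative placeholders and do not reproduce the exact parameter choices of~\cite{ghadimi2016accelerated}.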
Using \eqref{eq:iter_1storder}, we derive the following corollary, describing the total iteration complexity of Algorithm~\ref{Algo:2} in terms of the number of calls made to the first-order oracle in APGM. %\textbf{AE: we haven't talked about oracle before.} \begin{corollary}\label{cor:first_supp} For $b>1$, let $\beta_k =b^k $ for every $k$. If we use APGM from~\cite{ghadimi2016accelerated} for Step~2 of Algorithm~\ref{Algo:2}, the algorithm finds an $(\epsilon_f,\b_k)$ first-order stationary point after $T$ calls to the first-order oracle, where % +{\color{blue} \begin{equation} -T = \mathcal{O}\left( \frac{Q^3 \rho^2}{\epsilon^{3}}\log_b{\left( \frac{Q}{\epsilon} \right)} \right) = \tilde{\mathcal{O}}\left( \frac{Q^{3} \rho^2}{\epsilon^{3}} \right). -\end{equation} +T = \mathcal{O}\left( \frac{Q^3 \rho^2}{\epsilon^{4}}\log_b{\left( \frac{Q}{\epsilon} \right)} \right) = \tilde{\mathcal{O}}\left( \frac{Q^{3} \rho^2}{\epsilon^{4}} \right). +\end{equation}} \end{corollary} \begin{proof} Let $K$ denote the number of (outer) iterations of Algorithm~\ref{Algo:2} and let $\epsilon_{f}$ denote the desired accuracy of Algorithm~\ref{Algo:2}, see~(\ref{eq:inclu3}). Recalling Theorem~\ref{thm:main}, we can then write that \begin{equation} \epsilon_{f} = \frac{Q}{\b_{K}}, \label{eq:acc_to_b} \end{equation} or, equivalently, $\b_{K} = Q/\epsilon_{f}$. %where $K$ denotes the last outer iterate and $\epsilon$ is the final accuracy we would like to get for the optimality conditions given in~\eqref{eq:inclu3}. We now count the total number of (inner) iterations $T$ of Algorithm~\ref{Algo:2} to reach the accuracy $\epsilon_{f}$. From \eqref{eq:smoothness of Lagrangian} and for sufficiently large $k$, recall that $\lambda_{\b_k}\le \lambda'' \b_k$ is the smoothness parameter of the augmented Lagrangian. Then, from \eqref{eq:iter_1storder} and by summing over the outer iterations, we bound the total number of (inner) iterations of Algorithm~\ref{Algo:2} as +{\color{blue} \begin{align}\label{eq: tk_bound} -T &= \sum_{k=1}^K\mathcal{O}\left ( \frac{\lambda_{\beta_{k-1}}^2 \rho^2 }{\epsilon_k} \right) \nonumber\\ -& = \sum_{k=1}^K\mathcal{O}\left (\beta_{k-1}^3 \rho^2 \right) +T &= \sum_{k=1}^K\mathcal{O}\left ( \frac{\lambda_{\beta_{k-1}}^2 \rho^2 }{\epsilon_k^2} \right) \nonumber\\ +& = \sum_{k=1}^K\mathcal{O}\left (\beta_{k-1}^4 \rho^2 \right) \qquad \text{(Step 1 of Algorithm \ref{Algo:2})} \nonumber\\ -& \leq \mathcal{O} \left(K\beta_{K-1}^3 \rho^2 \right) +& \leq \mathcal{O} \left(K\beta_{K-1}^4 \rho^2 \right) \qquad \left( \{\b_k\}_k \text{ is increasing} \right) \nonumber\\ - & \le \mathcal{O}\left( \frac{K Q^{{3}} \rho^2}{\epsilon_{f}^{{3}}} \right). + & \le \mathcal{O}\left( \frac{K Q^{{3}} \rho^2}{\epsilon_{f}^{{4}}} \right). \qquad \text{(see \eqref{eq:acc_to_b})} \end{align} In addition, if we specify $\beta_k=b^k$ for all $k$, we can further refine $T$. Indeed, \begin{equation} \beta_K = b^K~~ \Longrightarrow~~ K = \log_b \left( \frac{Q}{\epsilon_f} \right), \end{equation} which, after substituting into~\eqref{eq: tk_bound}, gives the final bound in Corollary~\ref{cor:first}.
+} \end{proof} %\textbf{AE: if we find some time, i'll add a couple of lines for how 1st order opt implies 2nd order opt for smooth constraints.} \subsection{Second-Order Optimality \label{sec:second-o-opt}} %{\color{red}{Ahmet (note to myself): not sure of the constants of trust-region, check again}} \\ Let us now consider the second-order optimality case where the solver in Step~2 is the trust region method developed in~\cite{cartis2012complexity}. The trust region method minimizes a quadratic approximation of the function within a dynamically updated trust-region radius. The second-order trust region method that we consider in this section makes use of the Hessian (or an approximation of the Hessian) of the augmented Lagrangian, in addition to first-order oracles. As shown in~\cite{nouiehed2018convergence}, finding approximate second-order stationary points of convex-constrained problems is in general NP-hard. For this reason, we focus in this section on the special case of~\eqref{prob:01} with $g=0$. %We first give a theorem to show the convergence rate of the algorithm for second order stationarity: \\ %{\color{red}{Ahmet: I think that it is possible to remove sufficiently large k0 assumption here. }} \textbf{AE: not worth it really} %\begin{corollary} %\label{thm:main_second} %Under the assumptions of Theorem~\ref{thm:main}, the output of Algorithm~\ref{Algo:2} satisfies %\begin{align} %%& \dist^2( -\nabla_x \L_{\b_{k-1}}(x_k,y_k),\partial g(x_k)) + \| A(x_k)\|^2 \nonumber\\ %%& = \frac{1}{\b_{k-1}^2}\left( \frac{8 \lambda'_A\s_k^2 + 4}{ \nu^2} ( \lambda'_f+\lambda'_A y_{\max})^2 + 2\right) \nonumber \\ %%&=: \frac{Q}{\beta_{k-1}^2}. %\lambda_{\text{min}}(\nabla _{xx}\mathcal{L}_{\beta_{k-1}}(x_k, y_k)) \geq - \frac{C}{\beta_{k-1}} - \epsilon_{k-1}. %\end{align} %\end{corollary} % %We consider \textbf{AE: Ahmet to add a brief description of the algorithm.} Let us compute the total computational complexity of Algorithm~\ref{Algo:2} with the trust region method in Step~2, in terms of the number of calls made to the second-order oracle. By adapting the result in~\cite{cartis2012complexity} to our setup, we find that the number of (inner) iterations required in Step~2 of Algorithm~\ref{Algo:2} to produce $x_{k+1}$ is % \begin{equation} \mathcal{O}\left ( \frac{\lambda_{\beta_{k}, H}^2 (\mathcal{L}_{\beta_{k}}(x_1, y) - \min_{x}\mathcal{L}_{\beta_k}(x, y))}{\epsilon_k^3} \right), \label{eq:sec_inn_comp} \end{equation} % %<<<<<<< HEAD where $\lambda_{\beta, H}$ is the Lipschitz constant of the Hessian of the augmented Lagrangian, which is of the order of $\beta$, as can be proven similarly to Lemma~\ref{lem:smoothness}, and $x_1$ is the initial iterate of the given outer loop. In~\cite{cartis2012complexity}, the term $\mathcal{L}_{\beta}(x_1, y) - \min_{x}\mathcal{L}_{\beta}(x, y)$ is bounded by a constant independent of $\epsilon$. We assume a uniform bound for this quantity for every $\beta_k$, instead of for one value of $\beta_k$ as in~\cite{cartis2012complexity}. Using \eqref{eq:sec_inn_comp} and Theorem~\ref{thm:main}, we arrive at the following:
%We assume a uniform bound for this quantity for all $\b$, instead of for one value of $\beta_k$ as in~\cite{cartis2012complexity}. Using \eqref{eq:sec_inn_comp} and Theorem~\ref{thm:main}, we arrive at the following result. The proof is very similar to that of Corollary~\ref{cor:first} and hence omitted here. %>>>>>>> 795311da274f2d8ab56d215c2fd481073e616732 % \begin{corollary}\label{cor:second_supp} For $b>1$, let $\beta_k =b^k $ for every $k$. We assume that \begin{equation} \mathcal{L}_{\beta}(x_1, y) - \min_{x}\mathcal{L}_{\beta}(x, y) \leq L_{u},\qquad \forall \beta. \end{equation} If we use the trust region method from~\cite{cartis2012complexity} for Step~2 of Algorithm~\ref{Algo:2}, the algorithm finds an $\epsilon$-second-order stationary point of~(\ref{prob:01}) in $T$ calls to the second-order oracle where % \begin{equation} T = \mathcal{O}\left( \frac{L_u Q'^{5}}{\epsilon^{5}} \log_b{\left( \frac{Q'}{\epsilon} \right)} \right) = \widetilde{\mathcal{O}}\left( \frac{L_u Q'^{5}}{\epsilon^{5}} \right). \end{equation} \end{corollary} % %\notea{I can't remember: what is $Q'$ and why isn't it defined?} Before closing this section, we note that the remark after Corollary~\ref{cor:first} applies here as well. \subsection{Approximate optimality of \eqref{prob:01}.} %{\paragraph{Approximate optimality of \eqref{prob:01}. } Corollary \ref{cor:first} establishes the iteration complexity of Algorithm~\ref{Algo:2} to reach approximate first-order stationarity for the equivalent formulation of \eqref{prob:01} presented in \eqref{eq:minmax}. Unlike the exact case, approximate first-order stationarity in \eqref{eq:minmax} does not immediately lend itself to approximate stationarity in \eqref{prob:01}, and the study of approximate stationarity for the penalized problem (special case of our setting with dual variable set to $0$) has also precedent in~\cite{bhojanapalli2018smoothed}. %\textbf{AE: I think the reference is wrong... It's important to mention some precedent for what we do here to strengthen our argument.} - -However, it is not difficult to verify that, with the more aggressive regime of $\epsilon_{k+1}=1/\b_k^2$ in Step 1 of Algorithm~\ref{Algo:2}, one can achieve $\epsilon$-first-order stationarity for \eqref{prob:01} with the iteration complexity of $T=\tilde{O}(Q^3\rho^2/\epsilon^6)$ in Corollary~\ref{cor:first}. -Note that this conversion is by a naive computation using loose bounds rather than using duality arguments for a tight conversion. +%However, it is not difficult to verify that, with the more aggressive regime of $\epsilon_{k+1}=1/\b_k^2$ in Step 1 of Algorithm~\ref{Algo:2}, one can achieve $\epsilon$-first-order stationarity for \eqref{prob:01} with the iteration complexity of $T=\tilde{O}(Q^3\rho^2/\epsilon^6)$ in Corollary~\ref{cor:first}. +%Note that this conversion is by a naive computation using loose bounds rather than using duality arguments for a tight conversion. For a precedent in convex optimization for relating the convergence in augmented Lagrangian to the constrained problem using duality, see~\cite{tran2018smooth}. - -For the second-order case, it is in general not possible to establish approximate second-order optimality for \eqref{eq:minmax} from Corollary~\ref{cor:second}, with the exception of linear constraints. +For the second-order case, it is in general not possible to establish approximate second-order optimality for \eqref{eq:minmax} from Corollary~\ref{cor:second}, with the exception of linear constraints. 
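To see why linear constraints are the exception, consider for illustration the special case where $g=0$ and $A(x) = Ax - b$ for a matrix $A\in\RR^{m\times d}$; this simple instance is only meant to sketch the argument. Since $\nabla^2 A_i \equiv 0$ for every $i$, we have
\begin{align*}
\nabla_{xx} \mathcal{L}_{\beta}(x,y) = \nabla^2 f(x) + \beta A^\top A,
\end{align*}
and, for every feasible direction $v$ with $Av = 0$, it holds that $v^\top \nabla_{xx} \mathcal{L}_{\beta}(x,y)\, v = v^\top \nabla^2 f(x)\, v$. Consequently, $\lambda_{\text{min}}(\nabla_{xx}\mathcal{L}_{\beta}(x,y)) \ge -\epsilon_s$ implies $v^\top \nabla^2 f(x)\, v \ge -\epsilon_s \|v\|^2$ on the tangent space of the constraints, which is the classical approximate second-order condition for linearly-constrained problems.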
\cite{nouiehed2018convergence} provides a hardness result by showing that checking approximate second-order stationarity is NP-hard. \section{Proof of Theorem \ref{thm:main} \label{sec:theory}} For every $k\ge2$, recall from (\ref{eq:Lagrangian}) and Step~2 of Algorithm~\ref{Algo:2} that $x_{k}$ satisfies \begin{align} & \dist(-\nabla f(x_k) - DA(x_k)^\top y_{k-1} \nonumber\\ & \qquad - \b_{k-1} DA(x_{k})^\top A(x_k) ,\partial g(x_k) ) \nonumber\\ & = \dist(-\nabla_x \L_{\b_{k-1}} (x_k ,y_{k-1}) ,\partial g(x_k) ) \le \epsilon_{k}. \end{align} With an application of the triangle inequality, it follows that \begin{align} & \dist( -\b_{k-1} DA(x_k)^\top A(x_k) , \partial g(x_k) ) \nonumber\\ & \qquad \le \| \nabla f(x_k )\| + \| DA(x_k)^\top y_{k-1}\| + \epsilon_k, \end{align} which in turn implies that \begin{align} & \dist( -DA(x_k)^\top A(x_k) , \partial g(x_k)/ \b_{k-1} ) \nonumber\\ & \le \frac{ \| \nabla f(x_k )\|}{\b_{k-1} } + \frac{\| DA(x_k)^\top y_{k-1}\|}{\b_{k-1} } + \frac{\epsilon_k}{\b_{k-1} } \nonumber\\ & \le \frac{\lambda'_f+\lambda'_A \|y_{k-1}\|+\epsilon_k}{\b_{k-1}} , \label{eq:before_restriction} \end{align} where $\lambda'_f,\lambda'_A$ were defined in \eqref{eq:defn_restricted_lipsichtz}. %For example, when $g=\delta_\mathcal{X}$ is the indicator , then \eqref{eq:before_restriction} reads as %\begin{align} %& \| P_{T_C(x_k)} DA(x_k)^\top A(x_k) \| \nonumber\\ %& \qquad \le \frac{ \lambda'_f+ \lambda'_A \|y_{k-1}\| + \epsilon_k }{\b_{k-1} } . %\label{eq:before} %\end{align} We next translate \eqref{eq:before_restriction} into a bound on the feasibility gap $\|A(x_k)\|$. Using the regularity condition \eqref{eq:regularity}, the left-hand side of \eqref{eq:before_restriction} can be bounded below as \begin{align} & \dist( -DA(x_k)^\top A(x_k) , \partial g(x_k)/ \b_{k-1} ) \ge \nu \|A(x_k) \|. \qquad \text{(see (\ref{eq:regularity}))} \label{eq:restrited_pre} \end{align} %provided that $\rho$ satisfy %\begin{align} %\max_{k\in K} \|A(x_k)\| \le \rho. %\label{eq:to_be_checked_later} %\end{align} By substituting \eqref{eq:restrited_pre} back into \eqref{eq:before_restriction}, we find that \begin{align} \|A(x_k)\| \le \frac{ \lambda'_f + \lambda'_A \|y_{k-1}\| + \epsilon_k}{\nu \b_{k-1} }. \label{eq:before_dual_controlled} \end{align} In words, the feasibility gap is directly controlled by the dual sequence $\{y_k\}_k$. We next establish that the dual sequence is bounded. Indeed, for every $k\in K$, note that \begin{align} \|y_k\| & = \| y_0 + \sum_{i=1}^{k} \s_i A(x_i) \| \quad \text{(Step 5 of Algorithm \ref{Algo:2})} \nonumber\\ & \le \|y_0\|+ \sum_{i=1}^k \s_i \|A(x_i)\| \qquad \text{(triangle inequality)} \nonumber\\ & \le \|y_0\|+ \sum_{i=1}^k \frac{ \|A(x_1)\| \log^2 2 }{ i \log^2(i+1)} \quad \text{(Step 4)} \nonumber\\ & \le \|y_0\|+ c \|A(x_1) \| \log^2 2 =: y_{\max}, \label{eq:dual growth} \end{align} where \begin{align} c \ge \sum_{i=1}^{\infty} \frac{1}{i \log^2 (i+1)}. \end{align} Substituting \eqref{eq:dual growth} back into \eqref{eq:before_dual_controlled}, we reach \begin{align} \|A(x_k)\| & \le \frac{ \lambda'_f + \lambda'_A y_{\max} + \epsilon_k}{\nu \b_{k-1} } \nonumber\\ & \le \frac{ 2\lambda'_f +2 \lambda'_A y_{\max} }{\nu \b_{k-1} } , \label{eq:cvg metric part 2} \end{align} where the second line above holds if $k_0$ is large enough, which in turn guarantees that $\epsilon_k=1/\b_{k-1}$ is sufficiently small since $\{\b_k\}_k$ is increasing and unbounded. %Let us now revisit and simplify \eqref{eq:to_be_checked_later}.
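For completeness, the constant $c$ appearing in \eqref{eq:dual growth} is indeed finite: the summand is positive and decreasing, so a standard integral comparison (sketched below, valid for any fixed logarithm base larger than one) gives
\begin{align*}
\sum_{i=1}^{\infty} \frac{1}{i \log^2 (i+1)} \le \frac{1}{\log^2 2} + \int_{1}^{\infty} \frac{\text{d}t}{t \log^2 (t+1)} < \infty.
\end{align*}
In particular, $y_{\max}$ is a finite quantity that does not depend on the iteration counter $k$.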
%Note that $\rho'$ automatically satisfies the second inequality there, owing to Step~3 of Algorithm~\ref{Algo:2}. %Note that $\rho$ satisfies \eqref{eq:to_be_checked_later} if %\begin{align} %\frac{\lambda'_f+\lambda'_A y_{\max}}{ \nu_A \b_1} \le \rho/2, %\qquad \text{(see \eqref{eq:cvg metric part 2})} %\end{align} %provided that %\begin{align} %\beta_1 \ge \b_0 := \left(\lambda'_f + \lambda'_A y_{\max} \right)^{-1}. %\label{eq:beta-0} %\end{align} %\begin{align} %\|A(x_k)\| & \le \frac{ \lambda'_f + \lambda'_A y_{\max} + \epsilon_k}{\nu \b_{k-1} } %\qquad \text{(see \eqref{eq:cvg metric part 2})} %\nonumber\\ %& \le \frac{ 2( \lambda'_f + \lambda'_A y_{\max})}{\nu \b_1 } %\qquad \text{(see \eqref{eq:dual growth})}, %\end{align} %where the last line above holds when $k_0$ is large enough, because $\b_k \ge \b_1$ and $\lim_{k\rightarrow\infty} \epsilon_k=0$. It remains to control the first term in \eqref{eq:cvg metric}. To that end, after recalling Step 2 of Algorithm~\ref{Algo:2} and applying the triangle inequality, we can write that \begin{align} & \dist( -\nabla_x \L_{\b_{k-1}} (x_k,y_{k}), \partial g(x_{k}) ) \nonumber\\ & \le \dist( -\nabla_x \L_{\b_{k-1}} (x_k,y_{k-1}) , \partial g(x_{k}) ) \nonumber\\ & + \| \nabla_x \L_{\b_{k-1}} (x_k,y_{k})-\nabla_x \L_{\b_{k-1}} (x_k,y_{k-1}) \|. \label{eq:cvg metric part 1 brk down} \end{align} The first term on the right-hand side above is bounded by $\epsilon_k$, by Step 2 of Algorithm~\ref{Algo:2}. For the second term on the right-hand side of \eqref{eq:cvg metric part 1 brk down}, we write that \begin{align} & \| \nabla_x \L_{\b_{k-1}} (x_k,y_{k})-\nabla_x \L_{\b_{k-1}} (x_k,y_{k-1}) \| \nonumber\\ & = \| DA(x_k)^\top (y_k - y_{k-1}) \| \qquad \text{(see \eqref{eq:Lagrangian})} \nonumber\\ & \le \lambda'_A \|y_k- y_{k-1}\| \qquad \text{(see \eqref{eq:defn_restricted_lipsichtz})} \nonumber\\ & = \lambda'_A \s_k \|A (x_k) \| \qquad \text{(see Step 5 of Algorithm \ref{Algo:2})} \nonumber\\ & \le \frac{2\lambda'_A \s_k }{\nu \b_{k-1} }( \lambda'_f+ \lambda'_Ay_{\max}) . \qquad \text{(see \eqref{eq:cvg metric part 2})} \label{eq:part_1_2} \end{align} By combining (\ref{eq:cvg metric part 1 brk down},\ref{eq:part_1_2}), we find that \begin{align} & \dist( -\nabla_x \L_{\b_{k-1}} (x_k,y_{k}), \partial g(x_{k}) ) \nonumber\\ & \le \frac{2\lambda'_A \s_k }{\nu \b_{k-1} }( \lambda'_f+ \lambda'_Ay_{\max}) + \epsilon_k. \label{eq:cvg metric part 1} \end{align} By combining (\ref{eq:cvg metric part 2},\ref{eq:cvg metric part 1}), we find that \begin{align} & \dist( -\nabla_x \L_{\b_{k-1}}(x_k,y_k),\partial g(x_k)) + \| A(x_k)\| \nonumber\\ & \le \left( \frac{2\lambda'_A \s_k }{\nu \b_{k-1} }( \lambda'_f+ \lambda'_Ay_{\max}) + \epsilon_k \right) \nonumber\\ & \qquad + 2\left( \frac{ \lambda'_f + \lambda'_A y_{\max}}{\nu \b_{k-1} } \right). \end{align} Applying $\s_k\le \s_1$, we find that \begin{align} & \dist( -\nabla_x \L_{\b_{k-1}}(x_k,y_k),\partial g(x_k)) + \| A(x_k)\| \nonumber\\ & \le \frac{ 2\lambda'_A\s_1 + 2}{ \nu\b_{k-1}} ( \lambda'_f+\lambda'_A y_{\max}) + \epsilon_k. \end{align} For the second part of the theorem, we use Weyl's inequality and Step 5 of Algorithm~\ref{Algo:2} to write \begin{align}\label{eq:sec} \lambda_{\text{min}} &(\nabla_{xx} \mathcal{L}_{\beta_{k-1}}(x_k, y_{k-1})) \geq \lambda_{\text{min}} (\nabla_{xx} \mathcal{L}_{\beta_{k-1}}(x_k, y_{k})) \notag \\&- \sigma_k \| \sum_{i=1}^m A_i(x_k) \nabla^2 A_i(x_k) \|.
\end{align} The first term on the right-hand side is lower bounded by $-\epsilon_{k-1}$ by Step 2 of Algorithm~\ref{Algo:2}. We next bound the second term on the right-hand side above as \begin{align*} & \sigma_k \| \sum_{i=1}^m A_i(x_k) \nabla^2 A_i(x_k) \| \\ &\le \sigma_k \sqrt{m} \max_{i} \| A_i(x_k)\| \| \nabla^2 A_i(x_k)\| \\ &\le \sigma_k \sqrt{m} \lambda_A \frac{ 2\lambda'_f +2 \lambda'_A y_{\max} }{\nu \b_{k-1} }, \end{align*} where the last inequality is due to~(\ref{eq:smoothness basic},\ref{eq:cvg metric part 2}). Plugging into~\eqref{eq:sec} gives \begin{align*} & \lambda_{\text{min}}(\nabla_{xx} \mathcal{L}_{\beta_{k-1}}(x_k, y_{k-1}))\nonumber\\ & \geq -\epsilon_{k-1} - \sigma_k \sqrt{m} \lambda_A \frac{ 2\lambda'_f +2 \lambda'_A y_{\max} }{\nu \b_{k-1} }, \end{align*} which completes the proof of Theorem \ref{thm:main}. diff --git a/NeurIPS 19/sections/abstract.tex b/NeurIPS 19/sections/abstract.tex index 5faaf74..23d9884 100644 --- a/NeurIPS 19/sections/abstract.tex +++ b/NeurIPS 19/sections/abstract.tex @@ -1,20 +1,22 @@ %!TEX root = ../main.tex \begin{abstract} %We consider a canonical nonlinear-constrained nonconvex problem template with broad applications in machine learning, theoretical computer science, and signal processing. %This template involves the Burer-Monteiro splitting for semidefinite programming as a special case. We propose a practical inexact augmented Lagrangian method (iALM) for nonconvex problems with nonlinear constraints. %involving nonlinear operators. We characterize the total computational complexity of our method subject to a verifiable geometric condition, which is closely related to the Polyak-Lojasiewicz and Mangasarian-Fromovitz conditions. \vspace{1mm} -In particular, when a first-order solver is used for the inner iterates, we prove that iALM finds a first-order stationary point with $\tilde{\mathcal{O}}(1/\epsilon^3)$ calls to the first-order oracle. {If, in addition, the problem is smooth and} a second-order solver is used for the inner iterates, iALM finds a second-order stationary point with $\tilde{\mathcal{O}}(1/\epsilon^5)$ calls to the second-order oracle. +In particular, when a first-order solver is used for the inner iterates, we prove that iALM finds a first-order stationary point with {\color{blue} $\tilde{\mathcal{O}}(1/\epsilon^4)$} calls to the first-order oracle. {If, in addition, the problem is smooth and} a second-order solver is used for the inner iterates, iALM finds a second-order stationary point with $\tilde{\mathcal{O}}(1/\epsilon^5)$ calls to the second-order oracle. These complexity results match the known theoretical results in the literature. % with a simple, implementable and versatile algorithm. \vspace{1mm} We also provide strong numerical evidence on large-scale machine learning problems, including the Burer-Monteiro factorization of semidefinite programs, and a novel nonconvex relaxation of the standard basis pursuit template. For these examples, we also show how to verify our geometric condition. %For these problems and under suitable assumptions, our algorithm in fact achieves global optimality for the underlying convex SDP. +{\color{blue} This paper was updated on 22 December 2020 due to an error in the proof of Corollary \ref{cor:first}. The gradient complexity for obtaining a first-order stationary point is now $\tilde{\mathcal{O}}( 1/{\epsilon^4})$}.
+ %\textbf{AE: we should add something about gans if we do that in time.} \end{abstract} diff --git a/NeurIPS 19/sections/guarantees.tex b/NeurIPS 19/sections/guarantees.tex index 94cc86a..8214308 100644 --- a/NeurIPS 19/sections/guarantees.tex +++ b/NeurIPS 19/sections/guarantees.tex @@ -1,162 +1,165 @@ %!TEX root = ../main.tex \section{Convergence Rate \label{sec:cvg rate}} This section presents the total iteration complexity of Algorithm~\ref{Algo:2} for finding first- and second-order stationary points of problem \eqref{eq:minmax}. All the proofs are deferred to Appendix~\ref{sec:theory}. Theorem~\ref{thm:main} characterizes the convergence rate of Algorithm~\ref{Algo:2} for finding stationary points, in terms of the number of outer iterations. \begin{theorem}\textbf{\emph{(convergence rate)}} \label{thm:main} For integers $2 \le k_0\le k_1$, consider the interval $K=[k_0:k_1]$, and let $\{x_k\}_{k\in K}$ be the output sequence of Algorithm~\ref{Algo:2} on the interval $K$.\footnote{The choice of $k_1 = \infty$ is valid here too.} Let also $\rho:=\sup_{k\in K} \|x_k\|$.\footnote{If necessary, to ensure that $\rho<\infty$, one can add a small factor of $\|x\|^2$ to $\mathcal{L}_{\b}$ in \eqref{eq:Lagrangian}. Then it is easy to verify that the iterates of Algorithm \ref{Algo:2} remain bounded, provided that the initial penalty weight $\beta_0$ is large enough, $\sup_x \|\nabla f(x)\|/\|x\|< \infty$, $\sup_x \|A(x)\| < \infty$, and $\sup_x \|D A(x)\| <\infty$. } Suppose that $f$ and $A$ satisfy (\ref{eq:smoothness basic}) and let \begin{align} \lambda'_f = \max_{\|x\|\le \rho} \|\nabla f(x)\|,\qquad \lambda'_A = \max_{\|x\| \le \rho} \|DA(x)\|, \label{eq:defn_restricted_lipsichtz} \end{align} be the (restricted) Lipschitz constants of $f$ and $A$, respectively. %Suppose that $\b_1 \ge \b_0$, where the expression for $\b_0(f,g,A,\s_1)$ is given in (\ref{eq:beta-0}). With $\nu>0$, assume that \begin{align} \nu \|A(x_k)\| & \le \dist\left( -DA(x_k)^\top A(x_k) , \frac{\partial g(x_k)}{ \b_{k-1}} \right), \label{eq:regularity} \end{align} for every $k\in K$. We consider two cases: %{\color{red} Ahmet: I removed the squares and showed the rate on dist + feasibility, since it is the measure for the stationarity, using squares confuses complexity analysis...} \begin{itemize}[leftmargin=*] \item If a first-order solver is used in Step~2, then $x_k$ is an $(\epsilon_{k,f},\b_k)$ first-order stationary point of (\ref{eq:minmax}) with \begin{align} \epsilon_{k,f} & = \frac{1}{\beta_{k-1}} \left(\frac{2(\lambda'_f+\lambda'_A y_{\max}) (1+\lambda_A' \sigma_k)}{\nu}+1\right) =: \frac{Q(f,g,A,\s_1)}{\beta_{k-1}}, \label{eq:stat_prec_first} \end{align} for every $k\in K$, where $y_{\max}(x_1,y_0,\s_1):=\|y_0\|+ c \|A(x_1) \|$. \item If a second-order solver is used in Step~2, then $x_k$ is an $(\epsilon_{k,f}, \epsilon_{k,s},\b_k)$ second-order stationary point of~(\ref{eq:minmax}) with $\epsilon_{k,f}$ specified above and with \begin{align} \epsilon_{k,s} &= \epsilon_{k-1} + \sigma_k \sqrt{m} \lambda_A \frac{ 2\lambda'_f +2 \lambda'_A y_{\max} }{\nu \b_{k-1}} = \frac{\nu + \sigma_k \sqrt{m} \lambda_A \left(2\lambda'_f +2 \lambda'_A y_{\max}\right) }{\nu \b_{k-1}} =: \frac{Q'(f,g,A,\s_1)}{\beta_{k-1}}. \end{align} \end{itemize} \end{theorem} %A few remarks are in order about Theorem~\ref{thm:main}.
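As a concrete reading of \eqref{eq:stat_prec_first}, with the geometric schedule $\beta_k = b^k$ used later in Corollaries~\ref{cor:first} and~\ref{cor:second}, the first-order stationarity measure decays geometrically with the outer iteration counter,
\begin{align*}
\epsilon_{k,f} = \frac{Q(f,g,A,\s_1)}{\beta_{k-1}} = \frac{Q(f,g,A,\s_1)}{b^{k-1}},
\end{align*}
so that a target accuracy $\epsilon_f$ is reached after $\mathcal{O}\left(\log_b (Q/\epsilon_f)\right)$ outer iterations; this is the simple computation behind the logarithmic factors in the corollaries below.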
%\textbf{Tradeoff.} %Roughly speaking, Theorem~\ref{thm:main} states that the iterates of Algorithm~\ref{Algo:2} approach first-order stationarity for~\eqref{prob:01} at the rate of $1/{\b_k}$, where $\b_k$ is the penalty weight in AL in iteration $k$ as in~\eqref{eq:Lagrangian}. Note the inherent tradeoff here: For a faster sequence $\{\b_k\}_k$, the iterates approach stationarity faster but the computational cost of the inexact primal subroutine (Step 2) increases since a higher accuracy is required and Lipschitz constant of the unconstrained problem is also affected by $\beta_k$. In words, there is a tradeoff between the number of outer and inner loop iterations of Algorithm~\ref{Algo:2}. We highlight this tradeoff for a number of subroutines below. Theorem~\ref{thm:main} states that Algorithm~\ref{Algo:2} converges to a (first- or second-) order stationary point of \eqref{eq:minmax} at the rate of $1/\b_k$, further specified in Corollary \ref{cor:first} and Corollary \ref{cor:second}. %Sections \ref{sec:first-o-opt} and \ref{sec:second-o-opt}. A few remarks are in order about Theorem \ref{thm:main}. \paragraph{Regularity.} The key geometric condition in Theorem~\ref{thm:main} is \eqref{eq:regularity} which, broadly speaking, ensures that the primal updates of Algorithm \ref{Algo:2} reduce the feasibility gap as the penalty weight $\b_k$ grows. We will verify this condition for several examples in Appendices \ref{sec:verification2} and \ref{sec:adexp}. This condition in \eqref{eq:regularity} is closely related to those in the existing literature. In the special case where $g=0$ in~\eqref{prob:01}, \eqref{eq:regularity} reduces to \begin{align} \|DA(x)^\top A(x)\| \geq \nu \|A(x)\|. \label{eq:regularity_special} \end{align} ~~~~\emph{Polyak-Lojasiewicz (PL) condition~\cite{karimi2016linear}.} Consider the problem with a $\lambda_{\tilde{f}}$-smooth objective, \begin{align*} \min_{x\in\RR^d} \tilde{f}(x). \end{align*} $\tilde{f}(x)$ satisfies the PL inequality if the following holds for some $\mu > 0$: \begin{equation} \frac{1}{2} \| \nabla \tilde{f}(x) \|^2 \geq \mu (\tilde{f}(x) - \tilde{f}^*),\quad \forall x. \tag{PL inequality} \end{equation} This inequality implies that the gradient grows faster than a quadratic as we move away from the optimum. Assuming that the feasible set $\{ x: A(x) = 0\}$ is non-empty, it is easy to verify that \eqref{eq:regularity_special} is equivalent to the PL condition for minimizing $\tilde{f}(x) = \frac{1}{2}\|A(x)\|^2$ with $\nu = \sqrt{2 \mu}$~\cite{karimi2016linear}. The PL condition itself is a special case of the Kurdyka-Lojasiewicz condition with $\theta = 1/2$, see \cite[Definition 1.1]{xu2017globally}. When $g=0$, it is also easy to see that \eqref{eq:regularity} is weaker than the Mangasarian-Fromovitz (MF) condition in nonlinear optimization \cite[Assumption 1]{bolte2018nonconvex}. {Moreover, {when $g$ is the indicator on a convex set,} \eqref{eq:regularity} is a consequence of the \textit{basic constraint qualification} in \cite{rockafellar1993lagrange}, which itself generalizes the MF condition to the case when $g$ is an indicator function of a convex set.} %\notea{I'm not sure if the claim about basic constraint qualification is true because our condition should locally hold rather globally. Could you add the exact equation number in [55] and double check this?
Also consider double checking other claims in this paragraph.} We may think of \eqref{eq:regularity} as a local condition, which should hold within a neighborhood of the constraint set $\{x:A(x)=0\}$ rather than everywhere in $\mathbb{R}^d$. Indeed, the iteration count $k$ appears in \eqref{eq:regularity} to reflect this local nature of the condition. Similar kind of arguments on the regularity condition also appear in \cite{bolte2018nonconvex}. There is also a constant complexity algorithm in \cite{bolte2018nonconvex} to reach so-called ``information zone'', which supplements Theorem \ref{thm:main}. %Lastly, in contrast to most conditions in the nonconvex optimization literature, such as~\cite{flores2012complete}, the condition in~\eqref{eq:regularity} appears to be easier to verify, as we see in the sequel. %\begin{exmp}[Clustering] %Clustering is an important application %\end{exmp} %\edita{\textbf{AE: Fatih, I think you had added the following two references to our response for icml. Could you discuss their relevance here? [2] Rockafellar, Lagrange Multipliers and Optimality, 1993 %[3] Bertsekas, On penalty and multiplier methods for constrained minimization. 1996}} %\edita{\textbf{AE: the spars review talks about the "Pong-Li" work. Fatih, do you know what is that?}} % %\editf{(mfs:) I do not think this work is relevant to our condition but the template seems similar. We can mention in the related works instead.} %\notea{Ok. Yes, consider adding it to the related work.} %We now argue that such a condition is necessary for controlling the feasibility gap of~\eqref{prob:01} and ensuring the success of Algorithm~\ref{Algo:2}. %To provide further insight about \eqref{eq:regularity}, consider the convex case where $f=0$ and $g=\delta_\mathcal{X}$, where $\mathcal{X}$ is a convex set and $A$ is a linear operator. In this case, solving \eqref{prob:01} finds a point in the convex set $\mathcal{X}\cap \text{null}(A)$, where the subspace $\text{null}(A)=\{ x\in\mathbb{R}^d: A(x) = 0 \} \subset \RR^d$ is the null space of $A$. %Here, the Slater's condition~\cite{boyd2004convex} requires that %\begin{equation} %\text{relint} (\mathcal{X}) \cap \text{null}(A)\neq \emptyset. %\end{equation} %%<<<<<<< HEAD %Recall that the Slater's condition plays a key role in convex optimization as a sufficient condition for strong duality and, as a result, guarantees the success of a variety of primal-dual algorithms for linearly-constrained convex programs~\cite{bauschke2011convex}. %Intuitively, the Slater's condition achieves this by removing pathological cases and ensuring that the subspace $\text{null}(A)$ is not tangent to $\mathcal{X}$, see Figure~\ref{fig:convex_slater}. In such pathological cases, solving~\eqref{prob:01} and finding a point in $\mathcal{X}\cap \text{null}(A)$ can be particularly slow. For instance, in Figure \ref{fig:convex_slater}, the alternating projection algorithm (which iteratively projects onto $\mathcal{X}$ and $\text{null}(A)$) converges to the intersection point at the suboptimal rate of $1/\sqrt{k}$ rather than the standard $1/k$ rate. %{\color{red} Ahmet: a reference would be cool here} %======= %In general, the Slater's condition plays a key role in convex optimization as a sufficient condition for strong duality and, as a result, guarantees the success of a variety of primal-dual algorithms for linearly-constrained convex programs~\cite{bauschke2011convex}. 
Intuitively, the Slater's condition here removes any pathological cases by ensuring that the subspace $\text{null}(A)$ is not tangent to $\mathcal{X}$, see Figure~\ref{fig:convex_slater}. In such pathological cases, solving~\eqref{prob:01}, namely, finding a point in $\mathcal{X}\cap \text{null}(A)$, can be particularly difficult. For instance, the alternating projection algorithm (which iteratively projects onto $\mathcal{X}$ and $\text{null}(A)$) has arbitrarily slow convergence, see Figure~\ref{fig:convex_slater}. %>>>>>>> 795311da274f2d8ab56d215c2fd481073e616732 %\begin{figure} %\begin{center} %\includegraphics[scale=.5]{Slater.pdf} %\end{center} %\caption{In pathological cases where the Slater's condition does not apply, solving \eqref{prob:01} can be particularly slow, even when \eqref{prob:01} is a convex program. See the first remark after Theorem~\ref{thm:main} for more details. \label{fig:convex_slater}} %\end{figure} \paragraph{Penalty method.} A classical algorithm to solve \eqref{prob:01} is the penalty method, which is characterized by the absence of the dual variable ($y=0$) in \eqref{eq:Lagrangian}. Indeed, ALM can be interpreted as an adaptive penalty or smoothing method with a variable center determined by the dual variable. It is worth noting that, with the same proof technique, one can establish the same convergence rate of Theorem \ref{thm:main} for the penalty method. However, while both methods have the same convergence rate in theory, we ignore the uncompetitive penalty method since it is significantly outperformed by iALM in practice. % outperforms the penalty method significantly in practice. % by virtue of its variable center and has been excluded from this presentation for this reason. \paragraph{Computational complexity.} Theorem~\ref{thm:main} specifies the number of (outer) iterations that Algorithm~\ref{Algo:2} requires to reach a near-stationary point of problem~\eqref{eq:Lagrangian} with a prescribed precision and, in particular, specifies the number of calls made to the solver in Step~2. In this sense, Theorem~\ref{thm:main} does not fully capture the computational complexity of Algorithm~\ref{Algo:2}, as it does not take into account the computational cost of the solver in Step~2. To better understand the total iteration complexity of Algorithm~\ref{Algo:2}, we consider two scenarios in the following. In the first scenario, we take the solver in Step~2 to be the Accelerated Proximal Gradient Method (APGM), a well-known first-order algorithm~\cite{ghadimi2016accelerated}. In the second scenario, we will use the second-order trust region method developed in~\cite{cartis2012complexity}. We have the following two corollaries showing the total complexity of our algorithm to reach first and second-order stationary points. Appendix \ref{sec:opt_cnds} contains the proofs and more detailed discussion for the complexity results. % +{\color{blue} \begin{corollary}[First-order optimality]\label{cor:first} For $b>1$, let $\beta_k =b^k $ for every $k$. If we use APGM from~\cite{ghadimi2016accelerated} for Step~2 of Algorithm~\ref{Algo:2}, the algorithm finds an $(\epsilon_f,\b_k)$ first-order stationary point of~\eqref{eq:minmax}, after $T$ calls to the first-order oracle, where % \begin{equation} -T = \mathcal{O}\left( \frac{Q^3 \rho^2}{\epsilon^{3}}\log_b{\left( \frac{Q}{\epsilon} \right)} \right) = \tilde{\mathcal{O}}\left( \frac{Q^{3} \rho^2}{\epsilon^{3}} \right). 
+T = \mathcal{O}\left( \frac{Q^3 \rho^2}{\epsilon^{4}}\log_b{\left( \frac{Q}{\epsilon} \right)} \right) = \tilde{\mathcal{O}}\left( \frac{Q^{3} \rho^2}{\epsilon^{4}} \right). \end{equation} \end{corollary} + +} %For Algorithm~\ref{Algo:2} to reach a near-stationary point with an accuracy of $\epsilon_f$ in the sense of \eqref{eq:inclu3} and with the lowest computational cost, we therefore need to perform only one iteration of Algorithm~\ref{Algo:2}, with $\b_1$ specified as a function of $\epsilon_f$ by \eqref{eq:stat_prec_first} in Theorem~\ref{thm:main}. In general, however, the constants in \eqref{eq:stat_prec_first} are unknown and this approach is thus not feasible. Instead, the homotopy approach taken by Algorithm~\ref{Algo:2} ensures achieving the desired accuracy by gradually increasing the penalty weight.\footnote{In this context, homotopy loosely corresponds to the gradual enforcement of the constraints by increasing the penalty weight.} This homotopy approach increases the computational cost of Algorithm~\ref{Algo:2} only by a factor logarithmic in the $\epsilon_f$, as detailed in the proof of Corollary~\ref{cor:first}. For Algorithm~\ref{Algo:2} to reach a near-stationary point with an accuracy of $\epsilon_f$ in the sense of \eqref{eq:inclu3} and with the lowest computational cost, we therefore need to perform only one iteration of Algorithm~\ref{Algo:2}, with $\b_1$ specified as a function of $\epsilon_f$ by \eqref{eq:stat_prec_first} in Theorem~\ref{thm:main}. In general, however, the constants in \eqref{eq:stat_prec_first} are unknown and this approach is thus not feasible. Instead, the homotopy approach taken by Algorithm~\ref{Algo:2} ensures achieving the desired accuracy by gradually increasing the penalty weight. This homotopy approach increases the computational cost of Algorithm~\ref{Algo:2} only by a factor logarithmic in $\epsilon_f$, as detailed in the proof of Corollary~\ref{cor:first}. \begin{corollary}[Second-order optimality]\label{cor:second} For $b>1$, let $\beta_k =b^k $ for every $k$. We assume that \begin{equation} \mathcal{L}_{\beta}(x_1, y) - \min_{x}\mathcal{L}_{\beta}(x, y) \leq L_{u},\qquad \forall \beta. \end{equation} If we use the trust region method from~\cite{cartis2012complexity} for Step~2 of Algorithm~\ref{Algo:2}, the algorithm finds an $\epsilon$-second-order stationary point of~\eqref{eq:minmax} in $T$ calls to the second-order oracle where % \begin{equation} T = \mathcal{O}\left( \frac{L_u Q'^{5}}{\epsilon^{5}} \log_b{\left( \frac{Q'}{\epsilon} \right)} \right) = \widetilde{\mathcal{O}}\left( \frac{L_u Q'^{5}}{\epsilon^{5}} \right). \end{equation} \end{corollary} \paragraph{Remark.} These complexity results are for first- and second-order stationarity with respect to~\eqref{eq:Lagrangian}. We note that these complexities match~\cite{cartis2018optimality} and~\cite{birgin2016evaluation}. However, the stationarity criteria and the definition of the dual variable in these papers differ from ours. We include more discussion on this in the Appendix. \paragraph{Effect of $\beta_k$ in \eqref{eq:regularity}.} We consider two cases. When $g$ is the indicator of a convex set (or $g=0$), the subdifferential set is a cone (or $\{0\}$), and thus $\beta_k$ has no effect. On the other hand, when $g$ is a convex and Lipschitz continuous function defined on the whole space, the subdifferential set is bounded \cite[Theorem 23.4]{rockafellar1970convex}. This introduces an error term in \eqref{eq:regularity} that is of the order of $1/\beta_k$.
One can see that the choice $\beta_k = b^k$ causes a linear (geometric) decrease in this error term. In fact, all the examples in this paper fall into the first case.
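As an illustration of the second case (not one of the examples studied in this paper), suppose for the sake of the sketch that $g = \lambda_g \|\cdot\|_1$ for some $\lambda_g>0$. Every subgradient of $g$ then has Euclidean norm at most $\lambda_g\sqrt{d}$, so
\begin{align*}
\dist\left( -DA(x)^\top A(x) , \frac{\partial g(x)}{\beta_k} \right)
\ge \|DA(x)^\top A(x)\| - \frac{\lambda_g \sqrt{d}}{\beta_k}.
\end{align*}
Compared with \eqref{eq:regularity_special}, the left-hand side of \eqref{eq:regularity} is therefore perturbed by an additive error of order $1/\beta_k$, which shrinks geometrically under the choice $\beta_k = b^k$.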