diff --git a/NeurIPS 19/main.tex b/NeurIPS 19/main.tex index 54739f1..b2412bd 100644 --- a/NeurIPS 19/main.tex +++ b/NeurIPS 19/main.tex @@ -1,131 +1,216 @@ \documentclass{article} - -% if you need to pass options to natbib, use, e.g.: -% \PassOptionsToPackage{numbers, compress}{natbib} -% before loading neurips_2019 - -% ready for submission - \usepackage{neurips_2019} - -% to compile a preprint version, e.g., for submission to arXiv, add add the -% [preprint] option: -% \usepackage[preprint]{neurips_2019} - -% to compile a camera-ready version, add the [final] option, e.g.: +%\usepackage[a4paper,top=3cm,bottom=2cm,left=3cm,right=3cm,marginparwidth=1.75cm]{geometry} +%%% if you need to pass options to natbib, use, e.g.: +%%% \PassOptionsToPackage{numbers, compress}{natbib} +%%% before loading neurips_2019 +%% +%%% ready for submission + \usepackage[final]{neurips_2019} +%% +%%% to compile a preprint version, e.g., for submission to arXiv, add add the +%%% [preprint] option: +%%% \usepackage[preprint]{neurips_2019} +%% +%%% to compile a camera-ready version, add the [final] option, e.g.: % \usepackage[final]{neurips_2019} - -% to avoid loading the natbib package, add option nonatbib: -% \usepackage[nonatbib]{neurips_2019} - - +%% +%%% to avoid loading the natbib package, add option nonatbib: +%%% \usepackage[nonatbib]{neurips_2019} +% +% \usepackage[utf8]{inputenc} % allow utf-8 input \usepackage[T1]{fontenc} % use 8-bit T1 fonts \usepackage{hyperref} % hyperlinks \usepackage{url} % simple URL typesetting \usepackage{booktabs} % professional-quality tables \usepackage{amsfonts} % blackboard math symbols \usepackage{nicefrac} % compact symbols for 1/2, etc. 
\usepackage{microtype} % microtypography \usepackage{amsmath} \usepackage{graphicx} \usepackage{multirow} \usepackage{amsthm} +%%%%%%% HCGM formatting FROM HERE %%%%%%% + +%%\documentclass[11pt]{article} +%% +%%\usepackage{microtype} +%%\usepackage{graphicx} +%%\usepackage{subfigure} +%%\usepackage{booktabs} +%% +%%\usepackage{hyperref} +%%\usepackage{multirow} +%% +%%\newcommand{\theHalgorithm}{\arabic{algorithm}} +%% +%%\synctex=1 +%% +%%\newcommand\hmmax{0} +%%\newcommand\bmmax{0} +%% +%%\usepackage{amsmath,amsbsy,amsgen,amscd,amssymb,amsthm,amsfonts,stmaryrd} +%%\usepackage{mathtools} +%% +%%\usepackage{bm} +%% +%%\usepackage{microtype} +%% +%%\usepackage{hyperref} +%%\usepackage{url} +%% +%%\usepackage[usenames,dvipsnames]{xcolor} +%%\usepackage{graphicx} +%% +%%\graphicspath{{art/}} +%% +%%\DeclareFontFamily{OT1}{pzc}{} +%%\DeclareFontShape{OT1}{pzc}{m}{it}{<-> s * [1.200] pzcmi7t}{} +%%\DeclareMathAlphabet{\mathpzc}{OT1}{pzc}{m}{it} +%%\usepackage[margin=1in]{geometry} +%%\usepackage{scrlayer-scrpage} +%% +%% +%%\newcommand\blfootnote[1]{% +%% \begingroup +%% \renewcommand\thefootnote{}\footnote{#1}% +%% \addtocounter{footnote}{-1}% +%% \endgroup +%%} + +%%%%%%%%% UNTIL HERE %%%%%%%% + + + + \input{preamble} \title{An Inexact Augmented Lagrangian Framework for \\ Nonconvex Optimization with Nonlinear Constraints} % The \author macro works with any number of authors. There are two commands % used to separate the names and addresses of multiple authors: \And and \AND. % % Using \And between authors leaves it to LaTeX to determine where to break the % lines. Using \AND forces a line break at that point. So, if LaTeX puts 3 of 4 % authors names on the first line, and the last on the second line, try using % \AND instead of \And before the third author name. 
-\author{% - David S.~Hippocampus\thanks{Use footnote for providing further information - about author (webpage, alternative address)---\emph{not} for acknowledging - funding agencies.} \\ - Department of Computer Science\\ - Cranberry-Lemon University\\ - Pittsburgh, PA 15213 \\ - \texttt{hippo@cs.cranberry-lemon.edu} \\ - % examples of more authors - % \And - % Coauthor \\ - % Affiliation \\ - % Address \\ - % \texttt{email} \\ - % \AND - % Coauthor \\ - % Affiliation \\ - % Address \\ - % \texttt{email} \\ - % \And - % Coauthor \\ - % Affiliation \\ - % Address \\ - % \texttt{email} \\ - % \And - % Coauthor \\ - % Affiliation \\ - % Address \\ - % \texttt{email} \\ +\author{Mehmet Fatih Sahin +\And Armin Eftekhari +\And Ahmet Alacaoglu +\And Fabian Latorre G\'omez +\And Volkan Cevher~~~~~~~~~~~~~\\ +~\\ +LIONS, Ecole Polytechnique F\'ed\'erale de Lausanne, Switzerland \\ +~\\ +\texttt{{\footnotesize{\{mehmet.sahin, armin.eftekhari, ahmet.alacaoglu, fabian.latorregomez, volkan.cevher\}@epfl.ch}}} +%\thanks{Correspondence to: Mehmet Fatih Sahin <\texttt{mehmet.sahin@epfl.ch}>} } +%\author{Mehmet Fatih Sahin +%$\qquad$Armin Eftekhari +%$\qquad$Ahmet Alacaoglu\\~\\ +%Fabian Latorre +%$\qquad$Volkan Cevher\\ +%~\\ +%LIONS, Ecole Polytechnique F\'ed\'erale de Lausanne, Switzerland \\ +%%\texttt{{\footnotesize{\{mehmet.sahin, armin.eftekhari, ahmet.alacaoglu, fabian.latorregomez, volkan.cevher\}@epfl.ch}}} +%\blfootnote{Correspondence to: Volkan Cevher $<$\texttt{volkan.cevher@epfl.ch}$>$ } +%} +% +%\date{} + +%\ofoot{asdfads} + +% +%\author{% +% David S.~Hippocampus\thanks{Use footnote for providing further information +% about author (webpage, alternative address)---\emph{not} for acknowledging +% funding agencies.} \\ +% Department of Computer Science\\ +% Cranberry-Lemon University\\ +% Pittsburgh, PA 15213 \\ +% \texttt{hippo@cs.cranberry-lemon.edu} \\ +% % examples of more authors +% % \And +% % Coauthor \\ +% % Affiliation \\ +% % Address \\ +% % \texttt{email} 
\\ +% % \AND +% % Coauthor \\ +% % Affiliation \\ +% % Address \\ +% % \texttt{email} \\ +% % \And +% % Coauthor \\ +% % Affiliation \\ +% % Address \\ +% % \texttt{email} \\ +% % \And +% % Coauthor \\ +% % Affiliation \\ +% % Address \\ +% % \texttt{email} \\ +%} + \begin{document} \maketitle \input{sections/abstract.tex} \input{sections/introduction.tex} \input{sections/preliminaries.tex} \input{sections/AL.tex} \input{sections/guarantees.tex} \input{sections/related_works.tex} \input{sections/experiments.tex} \input{sections/conclusion.tex} +\section*{Acknowledgements} +\vspace{-2mm} +This project has received funding from the European Research Council (ERC) under the European Union's Horizon 2020 research and innovation programme (grant agreement n$^{\circ}$ 725594 - time-data) and was supported by the Swiss National Science Foundation (SNSF) under grant number 200021\_178865 / 1. + \bibliographystyle{abbrv} % basic style, author-year citations \bibliography{bibliography.bib} % name your BibTeX data base \newpage \appendix \input{sections/AL_theory.tex} \input{sections/appendix.tex} %\subsubsection*{Acknowledgments} % %Use unnumbered third level headings for the acknowledgments. All acknowledgments %go at the end of the paper. Do not include acknowledgments in the anonymized %submission, only in the final paper. % %\section*{References} % %References follow the acknowledgments. Use unnumbered first-level heading for %the references. Any choice of citation style is acceptable as long as you are %consistent. It is permissible to reduce the font size to \verb+small+ (9 point) %when listing the references. {\bf Remember that you can use more than eight % pages as long as the additional pages contain \emph{only} cited references.} %\medskip % %\small % %[1] Alexander, J.A.\ \& Mozer, M.C.\ (1995) Template-based algorithms for %connectionist rule extraction. 
In G.\ Tesauro, D.S.\ Touretzky and T.K.\ Leen %(eds.), {\it Advances in Neural Information Processing Systems 7}, %pp.\ 609--616. Cambridge, MA: MIT Press. % %[2] Bower, J.M.\ \& Beeman, D.\ (1995) {\it The Book of GENESIS: Exploring % Realistic Neural Models with the GEneral NEural SImulation System.} New York: %TELOS/Springer--Verlag. % %[3] Hasselmo, M.E., Schnell, E.\ \& Barkai, E.\ (1995) Dynamics of learning and %recall at excitatory recurrent synapses and cholinergic modulation in rat %hippocampal region CA3. {\it Journal of Neuroscience} {\bf 15}(7):5249-5262. \end{document} \ No newline at end of file diff --git a/NeurIPS 19/sections/AL.tex b/NeurIPS 19/sections/AL.tex index fa4ce99..3eb1fff 100644 --- a/NeurIPS 19/sections/AL.tex +++ b/NeurIPS 19/sections/AL.tex @@ -1,58 +1,58 @@ %!TEX root = ../main.tex \vspace{-4mm} \section{Algorithm \label{sec:AL algorithm}} To solve the equivalent formulation of \eqref{prob:01} presented in \eqref{eq:minmax}, we propose the inexact ALM (iALM), detailed in Algorithm~\ref{Algo:2}. At the $k^{\text{th}}$ iteration, Step 2 of Algorithm~\ref{Algo:2} calls a solver that finds an approximate stationary point of the augmented Lagrangian $\L_{\b_k}(\cdot,y_k)$ with the accuracy of $\epsilon_{k+1}$, and this accuracy gradually increases in a controlled fashion. The increasing sequence of penalty weights $\{\b_k\}_k$ and the dual update (Steps 4 and 5) are responsible for continuously enforcing the constraints in~\eqref{prob:01}. The appropriate choice for $\{\b_k\}_k$ will be specified in Sections \ref{sec:first-o-opt} and \ref{sec:second-o-opt}. %{Appendix \ref{sec:opt_cnds} defines the approximate stationarity measures we used throughout the paper.} The particular choice of the dual step sizes $\{\s_k\}_k$ in Algorithm~\ref{Algo:2} ensures that the dual variable $y_k$ remains bounded, see~\cite{bertsekas1976penalty} in the ALM literature where a similar dual step size is considered. 
%Step 3 of Algorithm~\ref{Algo:2} removes pathological cases with divergent iterates. As an example, suppose that $g=\delta_\mathcal{X}$ in \eqref{prob:01} is the indicator function for a bounded convex set $\mathcal{X}\subset \RR^d$ and take $\rho' > \max_{x\in \mathcal{X}} \|x\|$. Then, for sufficiently large $k$ \editf{[(mfs:)we may consider removing this "sufficiently large" phrase which appears a few times or clearly explain what we meant for it] }, it is not difficult to verify that all the iterates of Algorithm~\ref{Algo:2} automatically satisfy $\|x_k\|\le \rho'$ without the need to execute Step 3. \begin{algorithm}[h!] \begin{algorithmic} \STATE \textbf{Input:} Non-decreasing, positive, unbounded sequence $\{\b_k\}_{k\ge 1}$, stopping thresholds $\tau_f, \tau_s > 0$. \vspace{2pt} \STATE \textbf{Initialization:} Primal variable $x_{1}\in \RR^d$, dual variable $y_0\in \RR^m$, dual step size $\s_1>0$. %For $k=1,2,\ldots$, execute\\ \vspace{2pt} \FOR{$k=1,2,\dots$} \STATE \begin{enumerate}%[leftmargin=*] \item \textbf{(Update tolerance)} $\epsilon_{k+1} = 1/\b_k$. %\begin{align} %\beta_k = \frac{\beta_{1}}{\log 2}\sqrt{k}\log^2(k+1) . %\end{align} \item \textbf{(Inexact primal solution)} Obtain $x_{k+1}\in \RR^d$ such that \begin{equation*} \dist(-\nabla_x \L_{\beta_k} (x_{k+1},y_k), \partial g(x_{k+1}) ) \le \epsilon_{k+1} \end{equation*} for first-order stationarity \begin{equation*} \lambda_{\text{min}}(\nabla _{xx}\mathcal{L}_{\beta_k}(x_{k+1}, y_k)) \ge -\epsilon_{k+1} \end{equation*} for second-order-stationarity, if $g=0$ in \eqref{prob:01}. %\item \textbf{(Control)} If necessary, project $x_{k+1}$ to ensure that $\|x_{k+1}\|\le \rho'$.\editf{[(mfs:) can we remove this step? If we append this to 2nd step of algo., can we still make sure is solution(or approx. solution) is within this ball?]}\\ \item \textbf{(Update dual step size)} \begin{align*} \s_{k+1} & = \s_{1} \min\Big( \frac{\|A(x_1)\| \log^2 2 }{\|A(x_{k+1})\| (k+1)\log^2(k+2)} ,1 \Big). 
\end{align*} \item \textbf{(Dual ascent)} $y_{k+1} = y_{k} + \sigma_{k+1}A(x_{k+1})$. \item \textbf{(Stopping criterion)} If \begin{align*} & \dist(-\nabla_x \L_{\b_k}(x_{k+1}),\partial g(x_{k+1})) + \|A(x_{k+1})\| \le \tau_f,\nonumber \end{align*} for first-order stationarity and if also $\lambda_{\text{min}}(\nabla _{xx}\mathcal{L}_{\beta_{k}}(x_{k+1}, y_k)) \geq -\tau_s$ for second-order stationarity, - then quit and return $x_{k+1}$ as an (approximate) stationary point. + then quit and return $x_{k+1}$ as an (approximate) stationary point of \eqref{eq:minmax}. \end{enumerate} \ENDFOR \end{algorithmic} -\caption{Inexact ALM for solving~\eqref{prob:01}} +\caption{Inexact ALM} \label{Algo:2} \end{algorithm} diff --git a/NeurIPS 19/sections/AL_theory.tex b/NeurIPS 19/sections/AL_theory.tex index db94c59..efc9a88 100644 --- a/NeurIPS 19/sections/AL_theory.tex +++ b/NeurIPS 19/sections/AL_theory.tex @@ -1,365 +1,365 @@ %!TEX root = ../main.tex %\section{Optimality Conditions} %\label{sec:opt_cnds} %{First-order necessary optimality conditions} for \eqref{prob:01} are well-studied. {Indeed, $x\in \RR^d$ is a first-order stationary point of~\eqref{prob:01} if there exists $y\in \RR^m$ such that %% %%%% I am cutting this to save space. put it back if necessary (VC) %%\begin{align} %%%\begin{cases} %%-\nabla f(x) - DA(x)^\top y \in \partial g(x), \qquad \,\, A(x) = 0, %%%\end{cases} %%\label{e:inclu1} %%\end{align} %% %%where $DA(x)$ is the Jacobian of $A$ at $x$. 
Recalling \eqref{eq:Lagrangian}, we observe that \eqref{e:inclu1} is equivalent to %\begin{align} %%\begin{cases} %-\nabla_x \mathcal{L}_\beta(x,y) \in \partial g(x),\qquad A(x) = 0, %%\end{cases} %\label{e:inclu2} %\end{align} %which is in turn the necessary optimality condition for \eqref{eq:minmax}.} %%Note that \eqref{e:inclu2} is equivalent to %%\begin{align} %%\left[ %%\begin{array}{c} %%\nabla_x \L_\b(x,y) \\ %%\nabla_y \L_\b(x,y) %%\end{array} %%\right] %%\in %%\left\{ %%\left[ %%\begin{array}{c} %% v\\ %%0 %%\end{array} %%\right] %%: v\in \partial g(x) \right\} %%, %%\end{align} %%which rephrases the stationarity condition in terms of the gradient of the duality gap of Program~\eqref{eq:Lagrangian}. %%\editf{mfs: check approx. optimality conditions, how they apply in this setting.} %Inspired by this, we say that $x$ is an $(\epsilon_f,\b)$ first-order stationary point of \eqref{eq:minmax} if {there exists a $y \in \RR^m$} such that %\begin{align} %%\begin{cases} %\dist(-\nabla_x \mathcal{L}_\beta(x,y), \partial g(x)) \leq \epsilon_f, \qquad \| A(x) \| \leq \epsilon_f, %%\end{cases} %\label{eq:inclu3app} %\end{align} %for $\epsilon_f\ge 0$. %In light of \eqref{eq:inclu3app}, a metric for evaluating the stationarity of a pair $(x,y)\in \RR^d\times \RR^m$ is %\begin{align} %\dist\left(-\nabla_x \mathcal{L}_\beta(x,y), \partial g(x) \right) + \|A(x)\| , %\label{eq:cvg metricapp} %\end{align} %which we use as the first-order stopping criterion. %%When $g=0$, it is also not difficult to verify that the expression in \eqref{eq:cvg metric} is the norm of the gradient of the duality gap in \eqref{eq:minmax}. %As an example, for a convex set $\mathcal{X}\subset\RR^d$, suppose that $g = \delta_\mathcal{X}$ is the indicator function on $\mathcal{X}$. %Let also $T_\mathcal{X}(x) \subseteq \RR^d$ denote the tangent cone to $\mathcal{X}$ at $x$, and with $P_{T_\mathcal{X}(x)}:\RR^d\rightarrow\RR^d$ we denote the orthogonal projection onto this tangent cone. 
Then, for $u\in\RR^d$, it is not difficult to verify that %\begin{align}\label{eq:dist_subgrad} %\dist\left(u, \partial g(x) \right) = \| P_{T_\mathcal{X}(x)}(u) \|. %\end{align} %% %When $g = 0$, a first-order stationary point $x\in \RR^d$ of \eqref{prob:01} is also second-order stationary if %% %%\vspace{-.5cm} %\begin{equation} %\lambda_{\text{min}}(\nabla _{xx} \mathcal{L}_{\beta}(x,y))\ge 0, %\end{equation} %%\vspace{-.5cm} %% %where $\nabla_{xx}\mathcal{L}_\b$ is the Hessian of $\mathcal{L}_\b$ with respect to $x$, and $\lambda_{\text{min}}(\cdot)$ returns the smallest eigenvalue of its argument. %Analogously, $x$ is an $(\epsilon_f, \epsilon_s,\b)$ second-order stationary point if, in addition to \eqref{eq:inclu3}, it holds that %% %%\vspace{-.5cm} %\begin{equation}\label{eq:sec_opt} %\lambda_{\text{min}}(\nabla _{xx} \mathcal{L}_{\beta}(x,y)) \ge -\epsilon_s, %\end{equation} %for $\epsilon_s\ge 0$. %%\vspace{-.5cm} %% %Naturally, for second-order stationarity, we use $\lambda_{\text{min}}(\nabla _{xx} \mathcal{L}_{\beta}(x,y))$ as the stopping criterion. % -\section{Complexity Results} +\section{Complexity Results}\label{sec:opt_cnds} \subsection{First-Order Optimality \label{sec:first-o-opt}} %\textbf{AE: we go back and forth between "subroutine" and "solver". for consistency, I'm just using "solver" everywhere.} Let us first consider the case where the solver in Step~2 is the first-order algorithm APGM, described in detail in~\cite{ghadimi2016accelerated}. At a high level, APGM makes use of $\nabla_x \mathcal{L}_{\beta}(x,y)$ in \eqref{eq:Lagrangian}, the proximal operator $\text{prox}_g$, and the classical Nesterov acceleration~\cite{nesterov1983method} to reach first-order stationarity for the subproblem in~\eqref{e:exac}. % \textbf{AE: Ahmet to give a brief description of APGM here. } %First, we characterize the iteration complexity of Algorithm~\ref{Algo:2} for finding a first-order stationary point of~\eqref{prob:01}. 
%We propose to use the standard accelerated proximal method (APGM), guarantees of which are analyzed in~\cite{ghadimi2016accelerated} for nonconvex problems of the form~\eqref{e:exac}. Suppose that $g=\delta_\mathcal{X}$ is the indicator function on a bounded convex set $\mathcal{X}\subset \RR^d$ and let \begin{align} \rho= \max_{x\in \mathcal{X}}\|x\|, \end{align} be the radius of a ball centered at the origin that includes $\mathcal{X}$. Then, adapting the results in~\cite{ghadimi2016accelerated} to our setup, APGM reaches $x_{k}$ in Step 2 of Algorithm~\ref{Algo:2} after \begin{equation} \mathcal{O}\left ( \frac{\lambda_{\beta_{k}}^2 \rho^{2} }{\epsilon_{k+1}} \right) \label{eq:iter_1storder} \end{equation} (inner) iterations, where $\lambda_{\beta_{k}}$ denotes the Lipschitz constant of $\nabla_x{\mathcal{L}_{\beta_{k}}(x, y)}$, bounded in~\eqref{eq:smoothness of Lagrangian}. For the clarity of the presentation, we have used a looser bound in \eqref{eq:iter_1storder} compared to~\cite{ghadimi2016accelerated}. Using \eqref{eq:iter_1storder}, we derive the following corollary, describing the total iteration complexity of Algorithm~\ref{Algo:2} in terms of the number of calls made to the first-order oracle in APGM. %\textbf{AE: we haven't talked about oracle before.} \begin{corollary}\label{cor:first_supp} For $b>1$, let $\beta_k =b^k $ for every $k$. If we use APGM from~\cite{ghadimi2016accelerated} for Step~2 of Algorithm~\ref{Algo:2}, the algorithm finds an $(\epsilon_f,\b_k)$ first-order stationary point, after $T$ calls to the first-order oracle, where % \begin{equation} T = \mathcal{O}\left( \frac{Q^3 \rho^2}{\epsilon^{3}}\log_b{\left( \frac{Q}{\epsilon} \right)} \right) = \tilde{\mathcal{O}}\left( \frac{Q^{3} \rho^2}{\epsilon^{3}} \right). \end{equation} \end{corollary} \begin{proof} Let $K$ denote the number of (outer) iterations of Algorithm~\ref{Algo:2} and let $\epsilon_{f}$ denote the desired accuracy of Algorithm~\ref{Algo:2}, see~(\ref{eq:inclu3}). 
Recalling Theorem~\ref{thm:main}, we can then write that \begin{equation} \epsilon_{f} = \frac{Q}{\b_{K}}, \label{eq:acc_to_b} \end{equation} or, equivalently, $\b_{K} = Q/\epsilon_{f}$. %where $K$ denotes the last outer iterate and $\epsilon$ is the final accuracy we would like to get for the optimality conditions given in~\eqref{eq:inclu3}. We now count the number of total (inner) iterations $T$ of Algorithm~\ref{Algo:2} to reach the accuracy $\epsilon_{f}$. From \eqref{eq:smoothness of Lagrangian} and for sufficiently large $k$, recall that $\lambda_{\b_k}\le \lambda'' \b_k$ is the smoothness parameter of the augmented Lagrangian. Then, from \eqref{eq:iter_1storder} and by summing over the outer iterations, we bound the total number of (inner) iterations of Algorithm~\ref{Algo:2} as \begin{align}\label{eq: tk_bound} T &= \sum_{k=1}^K\mathcal{O}\left ( \frac{\lambda_{\beta_{k-1}}^2 \rho^2 }{\epsilon_k} \right) \nonumber\\ & = \sum_{k=1}^K\mathcal{O}\left (\beta_{k-1}^3 \rho^2 \right) \qquad \text{(Step 1 of Algorithm \ref{Algo:2})} \nonumber\\ & \leq \mathcal{O} \left(K\beta_{K-1}^3 \rho^2 \right) \qquad \left( \{\b_k\}_k \text{ is increasing} \right) \nonumber\\ & \le \mathcal{O}\left( \frac{K Q^{{3}} \rho^2}{\epsilon_{f}^{{3}}} \right). \qquad \text{(see \eqref{eq:acc_to_b})} \end{align} In addition, if we specify $\beta_k=b^k$ for all $k$, we can further refine $T$. Indeed, \begin{equation} \beta_K = b^K~~ \Longrightarrow~~ K = \log_b \left( \frac{Q}{\epsilon_f} \right), \end{equation} which, after substituting into~\eqref{eq: tk_bound}, gives the final bound in Corollary~\ref{cor:first}. 
\end{proof} %\textbf{AE: if we find some time, i'll add a couple of lines for how 1st order opt implies 2nd order opt for smooth constraints.} \subsection{Second-Order Optimality \label{sec:second-o-opt}} %{\color{red}{Ahmet (note to myself): not sure of the constants of trust-region, check again}} \\ Let us now consider the second-order optimality case where the solver in Step~2 is the trust region method developed in~\cite{cartis2012complexity}. The trust region method minimizes a quadratic approximation of the function within a dynamically updated trust-region radius. The second-order trust region method that we consider in this section makes use of the Hessian (or an approximation of the Hessian) of the augmented Lagrangian in addition to first order oracles. As shown in~\cite{nouiehed2018convergence}, finding approximate second-order stationary points of convex-constrained problems is in general NP-hard. For this reason, we focus in this section on the special case of~\eqref{prob:01} with $g=0$. %We first give a theorem to show the convergence rate of the algorithm for second order stationarity: \\ %{\color{red}{Ahmet: I think that it is possible to remove sufficiently large k0 assumption here. }} \textbf{AE: not worth it really} %\begin{corollary} %\label{thm:main_second} %Under the assumptions of Theorem~\ref{thm:main}, the output of Algorithm~\ref{Algo:2} satisfies %\begin{align} %%& \dist^2( -\nabla_x \L_{\b_{k-1}}(x_k,y_k),\partial g(x_k)) + \| A(x_k)\|^2 \nonumber\\ %%& = \frac{1}{\b_{k-1}^2}\left( \frac{8 \lambda'_A\s_k^2 + 4}{ \nu^2} ( \lambda'_f+\lambda'_A y_{\max})^2 + 2\right) \nonumber \\ %%&=: \frac{Q}{\beta_{k-1}^2}. %\lambda_{\text{min}}(\nabla _{xx}\mathcal{L}_{\beta_{k-1}}(x_k, y_k)) \geq - \frac{C}{\beta_{k-1}} - \epsilon_{k-1}. 
%\end{align} %\end{corollary} % %We consider \textbf{AE: Ahmet to add a brief description of the algorithm.} Let us compute the total computational complexity of Algorithm~\ref{Algo:2} with the trust region method in Step~2, in terms of the number of calls made to the second-order oracle. By adapting the result in~\cite{cartis2012complexity} to our setup, we find that the number of (inner) iterations required in Step~2 of Algorithm~\ref{Algo:2} to produce $x_{k+1}$ is % \begin{equation} \mathcal{O}\left ( \frac{\lambda_{\beta_{k}, H}^2 (\mathcal{L}_{\beta_{k}}(x_1, y) - \min_{x}\mathcal{L}_{\beta_k}(x, y))}{\epsilon_k^3} \right), \label{eq:sec_inn_comp} \end{equation} % %<<<<<<< HEAD where $\lambda_{\beta, H}$ is the Lipschitz constant of the Hessian of the augmented Lagrangian, which is of the order of $\beta$, as can be proven similar to Lemma~\ref{lem:smoothness} and $x_1$ is the initial iterate of the given outer loop. In~\cite{cartis2012complexity}, the term $\mathcal{L}_{\beta}(x_1, y) - \min_{x}\mathcal{L}_{\beta}(x, y)$ is bounded by a constant independent of $\epsilon$. We assume a uniform bound for this quantity for every $ \beta_k$, instead of for one value of $\beta_k$ as in~\cite{cartis2012complexity}. Using \eqref{eq:sec_inn_comp} and Theorem~\ref{thm:main}, we arrive at the following: %======= %where $\lambda_{\beta_k, H}$ is the Lipschitz constant of the Hessian of the augmented Lagrangian, which is of the order of $\beta_k$, as can be proven similar to Lemma~\ref{lem:smoothness} and $x_1$ is the initial iterate of the given outer loop. %In~\cite{cartis2012complexity}, the term $\mathcal{L}_{\beta_k}(x_1, y) - \min_{x}\mathcal{L}_{\beta_k}(x, y)$ is bounded by a constant independent of $\epsilon_k$. %We assume a uniform bound for this quantity for all $\b$, instead of for one value of $\beta_k$ as in~\cite{cartis2012complexity}. Using \eqref{eq:sec_inn_comp} and Theorem~\ref{thm:main}, we arrive at the following result. 
The proof is very similar to that of Corollary~\ref{cor:first} and hence omitted here. %>>>>>>> 795311da274f2d8ab56d215c2fd481073e616732 % \begin{corollary}\label{cor:second_supp} For $b>1$, let $\beta_k =b^k $ for every $k$. We assume that \begin{equation} \mathcal{L}_{\beta}(x_1, y) - \min_{x}\mathcal{L}_{\beta}(x, y) \leq L_{u},\qquad \forall \beta. \end{equation} If we use the trust region method from~\cite{cartis2012complexity} for Step~2 of Algorithm~\ref{Algo:2}, the algorithm finds an $\epsilon$-second-order stationary point of~(\ref{prob:01}) in $T$ calls to the second-order oracle where % \begin{equation} T = \mathcal{O}\left( \frac{L_u Q'^{5}}{\epsilon^{5}} \log_b{\left( \frac{Q'}{\epsilon} \right)} \right) = \widetilde{\mathcal{O}}\left( \frac{L_u Q'^{5}}{\epsilon^{5}} \right). \end{equation} \end{corollary} % %\notea{I can't remember: what is $Q'$ and why isn't it defined?} Before closing this section, we note that the remark after Corollary~\ref{cor:first} applies here as well. \subsection{Approximate optimality of \eqref{prob:01}.} %{\paragraph{Approximate optimality of \eqref{prob:01}. } Corollary \ref{cor:first} establishes the iteration complexity of Algorithm~\ref{Algo:2} to reach approximate first-order stationarity for the equivalent formulation of \eqref{prob:01} presented in \eqref{eq:minmax}. Unlike the exact case, approximate first-order stationarity in \eqref{eq:minmax} does not immediately lend itself to approximate stationarity in \eqref{prob:01}, and the study of approximate stationarity for the penalized problem (special case of our setting with dual variable set to $0$) has also precedent in~\cite{bhojanapalli2018smoothed}. %\textbf{AE: I think the reference is wrong... 
It's important to mention some precedent for what we do here to strengthen our argument.} However, it is not difficult to verify that, with the more aggressive regime of $\epsilon_{k+1}=1/\b_k^2$ in Step 1 of Algorithm~\ref{Algo:2}, one can achieve $\epsilon$-first-order stationarity for \eqref{prob:01} with the iteration complexity of $T=\tilde{O}(Q^3\rho^2/\epsilon^6)$ in Corollary~\ref{cor:first}. Note that this conversion is by a naive computation using loose bounds rather than using duality arguments for a tight conversion. For a precedent in convex optimization for relating the convergence in augmented Lagrangian to the constrained problem using duality, see~\cite{tran2018smooth}. For the second-order case, it is in general not possible to establish approximate second-order optimality for \eqref{eq:minmax} from Corollary~\ref{cor:second}, with the exception of linear constraints. } \section{Proof of Theorem \ref{thm:main} \label{sec:theory}} For every $k\ge2$, recall from (\ref{eq:Lagrangian}) and Step~2 of Algorithm~\ref{Algo:2} that $x_{k}$ satisfies \begin{align} & \dist(-\nabla f(x_k) - DA(x_k)^\top y_{k-1} \nonumber\\ & \qquad - \b_{k-1} DA(x_{k})^\top A(x_k) ,\partial g(x_k) ) \nonumber\\ & = \dist(-\nabla_x \L_{\b_{k-1}} (x_k ,y_{k-1}) ,\partial g(x_k) ) \le \epsilon_{k}. \end{align} With an application of the triangle inequality, it follows that \begin{align} & \dist( -\b_{k-1} DA(x_k)^\top A(x_k) , \partial g(x_k) ) \nonumber\\ & \qquad \le \| \nabla f(x_k )\| + \| DA(x_k)^\top y_{k-1}\| + \epsilon_k, \end{align} which in turn implies that \begin{align} & \dist( -DA(x_k)^\top A(x_k) , \partial g(x_k)/ \b_{k-1} ) \nonumber\\ & \le \frac{ \| \nabla f(x_k )\|}{\b_{k-1} } + \frac{\| DA(x_k)^\top y_{k-1}\|}{\b_{k-1} } + \frac{\epsilon_k}{\b_{k-1} } \nonumber\\ & \le \frac{\lambda'_f+\lambda'_A \|y_{k-1}\|+\epsilon_k}{\b_{k-1}} , \label{eq:before_restriction} \end{align} where $\lambda'_f,\lambda'_A$ were defined in \eqref{eq:defn_restricted_lipsichtz}. 
%For example, when $g=\delta_\mathcal{X}$ is the indicator , then \eqref{eq:before_restriction} reads as %\begin{align} %& \| P_{T_C(x_k)} DA(x_k)^\top A(x_k) \| \nonumber\\ %& \qquad \le \frac{ \lambda'_f+ \lambda'_A \|y_{k-1}\| + \epsilon_k }{\b_{k-1} } . %\label{eq:before} %\end{align} We next translate \eqref{eq:before_restriction} into a bound on the feasibility gap $\|A(x_k)\|$. Using the regularity condition \eqref{eq:regularity}, the left-hand side of \eqref{eq:before_restriction} can be bounded below as \begin{align} & \dist( -DA(x_k)^\top A(x_k) , \partial g(x_k)/ \b_{k-1} ) \ge \nu \|A(x_k) \|. \qquad \text{(see (\ref{eq:regularity}))} \label{eq:restrited_pre} \end{align} %provided that $\rho$ satisfy %\begin{align} %\max_{k\in K} \|A(x_k)\| \le \rho. %\label{eq:to_be_checked_later} %\end{align} By substituting \eqref{eq:restrited_pre} back into \eqref{eq:before_restriction}, we find that \begin{align} \|A(x_k)\| \le \frac{ \lambda'_f + \lambda'_A \|y_{k-1}\| + \epsilon_k}{\nu \b_{k-1} }. \label{eq:before_dual_controlled} \end{align} In words, the feasibility gap is directly controlled by the dual sequence $\{y_k\}_k$. We next establish that the dual sequence is bounded. Indeed, for every $k\in K$, note that \begin{align} \|y_k\| & = \| y_0 + \sum_{i=1}^{k} \s_i A(x_i) \| \quad \text{(Step 5 of Algorithm \ref{Algo:2})} \nonumber\\ & \le \|y_0\|+ \sum_{i=1}^k \s_i \|A(x_i)\| \qquad \text{(triangle inequality)} \nonumber\\ & \le \|y_0\|+ \sum_{i=1}^k \frac{ \|A(x_1)\| \log^2 2 }{ i \log^2(i+1)} \quad \text{(Step 4)} \nonumber\\ & \le \|y_0\|+ c \|A(x_1) \| \log^2 2 =: y_{\max}, \label{eq:dual growth} \end{align} where \begin{align} c \ge \sum_{i=1}^{\infty} \frac{1}{i \log^2 (i+1)}. 
\end{align} Substituting \eqref{eq:dual growth} back into \eqref{eq:before_dual_controlled}, we reach \begin{align} \|A(x_k)\| & \le \frac{ \lambda'_f + \lambda'_A y_{\max} + \epsilon_k}{\nu \b_{k-1} } \nonumber\\ & \le \frac{ 2\lambda'_f +2 \lambda'_A y_{\max} }{\nu \b_{k-1} } , \label{eq:cvg metric part 2} \end{align} where the second line above holds if $k_0$ is large enough, which in turn guarantees that $\epsilon_k=1/\b_{k-1}$ is sufficiently small since $\{\b_k\}_k$ is increasing and unbounded. %Let us now revisit and simplify \eqref{eq:to_be_checked_later}. %Note that $\rho'$ automatically satisfies the second inequality there, owing to Step~3 of Algorithm~\ref{Algo:2}. %Note that $\rho$ satisfies \eqref{eq:to_be_checked_later} if %\begin{align} %\frac{\lambda'_f+\lambda'_A y_{\max}}{ \nu_A \b_1} \le \rho/2, %\qquad \text{(see \eqref{eq:cvg metric part 2})} %\end{align} %provided that %\begin{align} %\beta_1 \ge \b_0 := \left(\lambda'_f + \lambda'_A y_{\max} \right)^{-1}. %\label{eq:beta-0} %\end{align} %\begin{align} %\|A(x_k)\| & \le \frac{ \lambda'_f + \lambda'_A y_{\max} + \epsilon_k}{\nu \b_{k-1} } %\qquad \text{(see \eqref{eq:cvg metric part 2})} %\nonumber\\ %& \le \frac{ 2( \lambda'_f + \lambda'_A y_{\max})}{\nu \b_1 } %\qquad \text{(see \eqref{eq:dual growth})}, %\end{align} %where the last line above holds when $k_0$ is large enough, because $\b_k \ge \b_1$ and $\lim_{k\rightarrow\infty} \epsilon_k=0$. It remains to control the first term in \eqref{eq:cvg metric}. To that end, after recalling Step 2 of Algorithm~\ref{Algo:2} and applying the triangle inequality, we can write that \begin{align} & \dist( -\nabla_x \L_{\b_{k-1}} (x_k,y_{k}), \partial g(x_{k}) ) \nonumber\\ & \le \dist( -\nabla_x \L_{\b_{k-1}} (x_k,y_{k-1}) , \partial g(x_{k}) ) \nonumber\\ & + \| \nabla_x \L_{\b_{k-1}} (x_k,y_{k})-\nabla_x \L_{\b_{k-1}} (x_k,y_{k-1}) \|. 
\label{eq:cvg metric part 1 brk down} \end{align} The first term on the right-hand side above is bounded by $\epsilon_k$, by Step 5 of Algorithm~\ref{Algo:2}. For the second term on the right-hand side of \eqref{eq:cvg metric part 1 brk down}, we write that \begin{align} & \| \nabla_x \L_{\b_{k-1}} (x_k,y_{k})-\nabla_x \L_{\b_{k-1}} (x_k,y_{k-1}) \| \nonumber\\ & = \| DA(x_k)^\top (y_k - y_{k-1}) \| \qquad \text{(see \eqref{eq:Lagrangian})} \nonumber\\ & \le \lambda'_A \|y_k- y_{k-1}\| \qquad \text{(see \eqref{eq:defn_restricted_lipsichtz})} \nonumber\\ & = \lambda'_A \s_k \|A (x_k) \| \qquad \text{(see Step 5 of Algorithm \ref{Algo:2})} \nonumber\\ & \le \frac{2\lambda'_A \s_k }{\nu \b_{k-1} }( \lambda'_f+ \lambda'_Ay_{\max}) . \qquad \text{(see \eqref{eq:cvg metric part 2})} \label{eq:part_1_2} \end{align} By combining (\ref{eq:cvg metric part 1 brk down},\ref{eq:part_1_2}), we find that \begin{align} & \dist( \nabla_x \L_{\b_{k-1}} (x_k,y_{k}), \partial g(x_{k}) ) \nonumber\\ & \le \frac{2\lambda'_A \s_k }{\nu \b_{k-1} }( \lambda'_f+ \lambda'_Ay_{\max}) + \epsilon_k. \label{eq:cvg metric part 1} \end{align} By combining (\ref{eq:cvg metric part 2},\ref{eq:cvg metric part 1}), we find that \begin{align} & \dist( -\nabla_x \L_{\b_{k-1}}(x_k,y_k),\partial g(x_k)) + \| A(x_k)\| \nonumber\\ & \le \left( \frac{2\lambda'_A \s_k }{\nu \b_{k-1} }( \lambda'_f+ \lambda'_Ay_{\max}) + \epsilon_k \right) \nonumber\\ & \qquad + 2\left( \frac{ \lambda'_f + \lambda'_A y_{\max}}{\nu \b_{k-1} } \right). \end{align} Applying $\s_k\le \s_1$, we find that \begin{align} & \dist( -\nabla_x \L_{\b_{k-1}}(x_k,y_k),\partial g(x_k)) + \| A(x_k)\| \nonumber\\ & \le \frac{ 2\lambda'_A\s_1 + 2}{ \nu\b_{k-1}} ( \lambda'_f+\lambda'_A y_{\max}) + \epsilon_k. 
For the second part of the theorem, we use Weyl's inequality and Step 5 of Algorithm~\ref{Algo:2} to write
Recalling Theorem~\ref{thm:main}, we can then write that %\begin{equation} % \epsilon_{f} = \frac{Q}{\b_{K}}, % \label{eq:acc_to_b} %\end{equation} %or, equivalently, $\b_{K} = Q/\epsilon_{f}$. %%where $K$ denotes the last outer iterate and $\epsilon$ is the final accuracy we would like to get for the optimality conditions given in~\eqref{eq:inclu3}. %We now count the number of total (inner) iterations $T$ of Algorithm~\ref{Algo:2} to reach the accuracy $\epsilon_{f}$. From \eqref{eq:smoothness of Lagrangian} and for sufficiently large $k$, recall that $\lambda_{\b_k}\le \lambda'' \b_k$ is the smoothness parameter of the augmented Lagrangian. Then, from \eqref{eq:iter_1storder} ad by summing over the outer iterations, we bound the total number of (inner) iterations of Algorithm~\ref{Algo:2} as %\begin{align}\label{eq: tk_bound} %T &= \sum_{k=1}^K\mathcal{O}\left ( \frac{\lambda_{\beta_{k-1}}^2 \rho^2 }{\epsilon_k} \right) \nonumber\\ %& = \sum_{k=1}^K\mathcal{O}\left (\beta_{k-1}^3 \rho^2 \right) %\qquad \text{(Step 1 of Algorithm \ref{Algo:2})} %\nonumber\\ %& \leq \mathcal{O} \left(K\beta_{K-1}^3 \rho^2 \right) %\qquad \left( \{\b_k\}_k \text{ is increasing} \right) % \nonumber\\ % & \le \mathcal{O}\left( \frac{K Q^{{3}} \rho^2}{\epsilon_{f}^{{3}}} \right). % \qquad \text{(see \eqref{eq:acc_to_b})} %\end{align} %In addition, if we specify $\beta_k=b^k$ for all $k$, we can further refine $T$. Indeed, %\begin{equation} %\beta_K = b^K~~ \Longrightarrow~~ K = \log_b \left( \frac{Q}{\epsilon_f} \right), %\end{equation} %which, after substituting into~\eqref{eq: tk_bound} gives the final bound in Corollary~\ref{cor:first}. %\end{proof} %\begin{equation} %T \leq \mathcal{O}\left( \frac{Q^{\frac{3}{2}+\frac{1}{2b}} x_{\max}^2}{\epsilon_f^{\frac{3}{2}+\frac{1}{2b}}} \right), %\end{equation} %which completes the proof of Corollary~\ref{cor:first}. 
%\section{Analysis of different rates for $\beta_k$ and $\epsilon_k$} % %We check the iteration complexity analysis by decoupling $\beta_k$ and $\epsilon_k$. %\begin{equation} %\beta_k = k^b, ~~~~ \epsilon_k = k^{-e}. %\end{equation} % %By denoting $B_k = \dist( -\nabla_x \L_{\b_{k-1}}(x_k,y_k),\partial g(x_k)) + \| A(x_k)\|$, the algorithm bound becomes, % %\begin{equation} %B_k \leq \frac{1}{\beta_k} + \epsilon_k = k^{-b} + k^{-e}. %\end{equation} % %Total iteration number is % %\begin{equation} %T_K = \sum_{k=1}^K \frac{\beta_k^2}{\epsilon_k} \leq K^{2b+e+1}. %\end{equation} % %We now consider two different relations between $b$ and $e$ to see what is going on. % %\textbf{Case 1:} $b\geq e$: % %Bound for the algorithm: % %\begin{equation} %B_k \leq \frac{1}{\beta_k} + \epsilon_k = k^{-b} + k^{-e} \leq \frac{2}{k^e} = \epsilon, %\end{equation} % %which gives the relation $K = \left( \frac{2}{\epsilon}\right)^{1/e}$. %Writing down the total number of iterations and plugging in $K$, % %\begin{equation} %T_K = \sum_{k=1}^K \frac{\beta_k^2}{\epsilon_k} \leq K^{2b+e+1} \leq \left(\frac{2}{\epsilon}\right)^{\frac{2b}{e}+1+\frac{1}{e}}. %\end{equation} % %To get the least number of total iterations for a given accuracy $\epsilon$, one needs to pick $e$ as large as possible and $b$ as small as possible. %Since in this case we had $b\geq e$, this suggests picking $b=e$ for the optimal iteration complexity. % %\textbf{Case 2:} $b\leq e$: % %Same calculations yield the following bound on the total number of iterations: % %\begin{equation} %T_K = \sum_{k=1}^K \frac{\beta_k^2}{\epsilon_k} \leq K^{2b+e+1} \leq \left(\frac{2}{\epsilon}\right)^{2+\frac{e}{b}+\frac{1}{b}}. %\end{equation} % %Given that $b\leq e$, the bound suggests picking $e$ as small as possible and $b$ as big as possible. % %To minimize the total number of iterations in both cases with flexible $b$ and $e$, the bounds suggest to pick $b=e=\alpha$ and take this value to be as large as possible. 
+ \b \sum_{i=1}^m A_i(x) \nabla A_i(x)
An application of Lemma \ref{lem:smoothness} yields that %\begin{align} %\L_\b(x^+,y)+g(x^+) & \le \L_\b(x,y)+ \langle x^+-x,\nabla_x \L_\b(x,y) \rangle %+ \frac{\lambda_\b}{2} \|x^+ - x\|^2 + g(x^+) \nonumber\\ %& = \L_\b(x,y)-\g \langle G ,\nabla_x \L_\b(x,y) \rangle %+ \frac{\g^2 \lambda_\b }{2} \|G\|^2 + g(x^+) %\label{eq:descent pr 1} %\end{align} %Since $x^+ = P_g(x - \g \nabla_x \L_\b(x,y))$, we also have that %\begin{align} %\g (G - \nabla_x \L_\b(x,y)) = \xi \in \partial g(x^+). %\label{eq:opt of prox} %\end{align} %By combining (\ref{eq:descent pr 1},\ref{eq:opt of prox}), we find that %\begin{align} %\L_\b(x^+,y)+g(x^+) & %\le \L_\b(x,y) -\g \|G\|^2 + \g \langle G, \xi \rangle + \frac{\g^2 \lambda_\b}{2}\|G\|^2 + g(x^+) \nonumber\\ %& = \L_\b(x,y) -\g \|G\|^2 + \langle x- x^+ , \xi \rangle + \frac{\g^2 \lambda_\b}{2}\|G\|^2 + g(x^+) \nonumber\\ %& \le \L_\b(x,y) + g(x) - \g\left( 1-\frac{\g\lambda_\b}{2}\right) \|G\|^2, %\end{align} %where the last line above uses the convexity of $g$. Recalling that $\g\le 1/\lambda_\b$ completes the proof of Lemma \ref{lem:11}. % % %\section{Proof of Lemma \ref{lem:eval Lipsc cte}\label{sec:proof of linesearch}} % % %Recalling $x^+_{\gamma}$ in \eqref{eq:defn of gamma line search}, we note that %\begin{equation} %x^+_{\gamma} - x +\gamma \nabla_x \mathcal{L}_\beta(x,y) = -\xi \in -\partial g (x^+_{\gamma}). 
w_{n,1}x_{n}^\top & \cdots & w_{n,n} x_{n}^\top
Consequently, \begin{align} DA(x_k)^\top A(x_k) & = \left[ \begin{array}{c} (V_k^\top V_k - I_n) V_k^\top \mathbf{1}\\ \vdots\\ (V_k^\top V_k - I_n) V_k^\top \mathbf{1} \end{array} \right] \nonumber\\ & \qquad + \left[ \begin{array}{c} x_{k,1} (V_k V_k^\top \mathbf{1}- \mathbf{1})_1 \\ \vdots \\ x_{k,n} (V_k V_k^\top \mathbf{1}- \mathbf{1})_n \end{array} \right], \end{align} where $I_n\in \RR^{n\times n}$ is the identity matrix. Let us make a number of simplifying assumptions. First, we assume that $\|x_k\|< \sqrt{s}$ (which can be enforced in the iterates by replacing $C$ with $(1-\epsilon)C$ for a small positive $\epsilon$ in the subproblems). Under this assumption, it follows that \begin{align} (\partial g(x_k))_i = \begin{cases} 0 & (x_k)_i > 0\\ \{a: a\le 0\} & (x_k)_i = 0, \end{cases} \qquad i\le d. \label{eq:exp-subgrad-cluster} \end{align} Second, we assume that $V_k$ has nearly orthonormal columns, namely, $V_k^\top V_k \approx I_n$. This can also be enforced in each iterate of Algorithm~\ref{Algo:2} and naturally corresponds to well-separated clusters. While a more fine-tuned argument can remove these assumptions, they will help us simplify the presentation here. Under these assumptions, the (squared) right-hand side of \eqref{eq:regularity} becomes \begin{align} & \dist\left( -DA(x_k)^\top A(x_k) , \frac{\partial g(x_k)}{ \b_{k-1}} \right)^2 \nonumber\\ & = \left\| \left( -DA(x_k)^\top A(x_k) \right)_+\right\|^2 \qquad (a_+ = \max(a,0)) \nonumber\\ & = \left\| \left[ \begin{array}{c} x_{k,1} (V_k V_k^\top \mathbf{1}- \mathbf{1})_1 \\ \vdots \\ x_{k,n} (V_k V_k^\top \mathbf{1}- \mathbf{1})_n \end{array} \right] \right\|^2 \qquad (x_k\in C \Rightarrow x_k\ge 0) \nonumber\\ & = \sum_{i=1}^n \| x_{k,i}\|^2 (V_kV_k^\top \mathbf{1}-\mathbf{1})_i^2 \nonumber\\ & \ge \min_i \| x_{k,i}\|^2 \cdot \sum_{i=1}^n (V_kV_k^\top \mathbf{1}-\mathbf{1})_i^2 \nonumber\\ & = \min_i \| x_{k,i}\|^2 \cdot \| V_kV_k^\top \mathbf{1}-\mathbf{1} \|^2. 
\label{eq:final-cnd-cluster} \end{align} Therefore, given a prescribed $\nu$, ensuring $\min_i \|x_{k,i}\| \ge \nu$ guarantees \eqref{eq:regularity}. When the algorithm is initialized close enough to the constraint set, there is indeed no need to separately enforce \eqref{eq:final-cnd-cluster}. In practice, often $n$ exceeds the number of true clusters and a more intricate analysis is required to establish \eqref{eq:regularity} by restricting the argument to a particular subspace of $\RR^n$. %\section{Basis Pursuit \label{sec:verification1}} %We only verify the regularity condition in \eqref{eq:regularity} for \eqref{prob:01} with $f,A,g$ specified in \eqref{eq:bp-equiv}. Note that %\begin{align} %DA(x) = 2 \overline{B} \text{diag}(x), %\label{eq:jacob-bp} %\end{align} %where $\text{diag}(x)\in\RR^{2d\times 2d}$ is the diagonal matrix formed by $x$. The left-hand side of \eqref{eq:regularity} then reads as %\begin{align} %& \text{dist} \left( -DA(x_k)^\top A(x_k) , \frac{\partial g(x_k)}{\b_{k-1}} \right) \nonumber\\ %& = \text{dist} \left( -DA(x_k)^\top A(x_k) , \{0\} \right) \qquad (g\equiv 0)\nonumber\\ %& = \|DA(x_k)^\top A(x_k) \| \nonumber\\ %& =2 \| \text{diag}(x_k) \overline{B}^\top ( \overline{B}x_k^{\circ 2} -b) \|. %\qquad \text{(see \eqref{eq:jacob-bp})} %\label{eq:cnd-bp-pre} %\end{align} %To bound the last line above, let $x_*$ be a solution of~\eqref{prob:01} and note that $\overline{B} x_*^{\circ 2} = b $ by definition. Let also $z_k,z_*\in \RR^d$ denote the vectors corresponding to $x_k,x_*$. Corresponding to $x_k$, also define $u_{k,1},u_{k,2}$ naturally and let $|z_k| = u_{k,1}^{\circ 2} + u_{k,2}^{\circ 2} \in \RR^d$ be the vector of amplitudes of $z_k$. To simplify matters, let us assume also that $B$ is full-rank. 
%We then rewrite the norm in the last line of \eqref{eq:cnd-bp-pre} as %\begin{align} %& \| \text{diag}(x_k) \overline{B}^\top ( \overline{B}x_k^{\circ 2} -b) \|^2 \nonumber\\ %& = \| \text{diag}(x_k) \overline{B}^\top \overline{B} (x_k^{\circ 2} -x_*^{\circ 2}) \|^2 %\qquad (\overline{B} x_*^{\circ 2} = b) %\nonumber\\ %& = \| \text{diag}(x_k)\overline{B}^\top B (x_k - x_*) \|^2 \nonumber\\ %& = \| \text{diag}(u_{k,1})B^\top B (z_k - z_*) \|^2 \nonumber\\ %& \qquad + \| \text{diag}(u_{k,2})B^\top B (z_k - z_*) \|^2 \nonumber\\ %& = \| \text{diag}(u_{k,1}^{\circ 2}+ u_{k,2}^{\circ 2}) B^\top B (z_k - z_*) \|^2 \nonumber\\ %& = \| \text{diag}(|z_k|) B^\top B (z_k - z_*) \|^2 \nonumber\\ %& \ge %\eta_n ( B \text{diag}(|z_k|) )^2 %\| B(z_k - z_*) \|^2 \nonumber\\ %& = %\eta_n ( B \text{diag}(|z_k|) )^2 %\| B z_k -b \|^2 %\qquad ( Bz_* = \ol{B} x^{\circ2}_* = b) \nonumber\\ %& \ge \min_{|T|=n}\eta_n(B_T)\cdot |z_{k,(n)}|^2 \|Bz_k - b\|^2, %\end{align} %where $\eta_n(\cdot)$ returns the $n$th largest singular value of its argument. In the last line above, $B_T$ is the restriction of $B$ to the columns indexed by $T$ of size $n$. Moreover, $z_{k,(n)}$ is the $n$th largest entry of $z$ in magnitude. %Given a prescribed $\nu$, \eqref{eq:regularity} therefore holds if %\begin{align} %|z_{k,(n)} | \ge \frac{\nu}{2 \sqrt{\min_{|T|=n} \eta_n(B_T)}} , %\label{eq:final-bp-cnd} %\end{align} %for every iteration $k$. If Algorithm \ref{Algo:2} is initialized close enough to the solution $z^*$ and the entries of $z^*$ are sufficiently large in magnitude, there will be no need to directly enforce \eqref{eq:final-bp-cnd}. %\paragraph{Discussion} %The true potential of the reformulation of BP in \eqref{eq:bp-equiv} is in dealing with more structured norms than $\ell_1$, where computing the proximal operator is often intractable. 
namely, Chambolle-Pock~\cite{chambolle2011first}, ASGARD~\cite{tran2018smooth} and ASGARD-DL~\cite{tran2018adaptive}.
Then the measurement vector is created as $b = Bz_* + \epsilon$, where $\epsilon$ is the noise vector
where $\{\Omega_i\}_{i=1}^I$ are (not necessarily disjoint) index sets of $\{1,\cdots,d\}$.
Moreover, $z_{k,(n)}$ is the $n$th largest entry of $z_k$ in magnitude.
\subsection{Generalized Eigenvalue Problem}{\label{sec:geig}} -\begin{figure}[h] +\begin{figure}[!h] %\begin{table}[h] \begin{tabular}{l|l|l} ~~~~~~~~~(i) $C:$ Gaussian iid & ~~~~~(ii) $C:$ Polynomial decay & ~~~~~(iii) $C:$ Exponential decay \\ \includegraphics[width=.3\columnwidth]{figs/gen_eig/gen_eig_case1_convergence.pdf}& \includegraphics[width=.3\columnwidth]{figs/gen_eig/gen_eig_case2_convergence.pdf}& \includegraphics[width=.3\columnwidth]{figs/gen_eig/gen_eig_case3_convergence.pdf}\\ \includegraphics[width=.31\columnwidth]{figs/gen_eig/gen_eig_case1_eigenvalues.pdf} & \includegraphics[width=.31\columnwidth]{figs/gen_eig/gen_eig_case2_eigenvalues.pdf} & \includegraphics[width=.31\columnwidth]{figs/gen_eig/gen_eig_case3_eigenvalues.pdf} \\ \hline ~~~~~~~~~~~~~~~~~~~~(iv) & ~~~~~~~~~~~~~~~~~~~~(v) & ~~~~~~~~~~~~~~~~~~~(vi) \\ \includegraphics[width=.3\columnwidth]{figs/gen_eig/gen_eig_case4_convergence.pdf}& \includegraphics[width=.3\columnwidth]{figs/gen_eig/gen_eig_case6_convergence.pdf}& \includegraphics[width=.3\columnwidth]{figs/gen_eig/gen_eig_case7_convergence.pdf}\\ \includegraphics[width=.31\columnwidth]{figs/gen_eig/gen_eig_case4_eigenvalues.pdf} & \includegraphics[width=.31\columnwidth]{figs/gen_eig/gen_eig_case6_eigenvalues.pdf} & \includegraphics[width=.31\columnwidth]{figs/gen_eig/gen_eig_case7_eigenvalues.pdf} \end{tabular} %\end{table} \caption{{\it{(Top)}} Objective convergence for calculating top generalized eigenvalue and eigenvector of $B$ and $C$. {\it{(Bottom)}} Eigenvalue structure of the matrices. For (i),(ii) and (iii), $C$ is positive semidefinite; for (iv), (v) and (vi), $C$ contains negative eigenvalues. {[(i): Generated by taking symmetric part of iid Gaussian matrix. (ii): Generated by randomly rotating diag($1^{-p}, 2^{-p}, \cdots, 1000^{-p}$)($p=1$). 
It is also equivalent to computing the top eigenvector of symmetric matrix $S = B^{-1/2}CB^{-1/2}$ and multiplying the resulting vector by $B^{-1/2}$. However, for large values of $n$, computing $B^{-1/2}$ is extremely expensive. The natural convex SDP relaxation for~\eqref{eq:eig} involves lifting $Y = xx^\top$ and removing the nonconvex rank$(Y) = 1$ constraint, namely, \begin{align} \begin{cases} \underset{Y \in \RR^{n \times n}}{\min} \text{tr}(CY)\\ \text{tr}(BY) = 1, \quad Y \succeq 0. \end{cases} \label{eq:eig-sdp} \end{align}
Therefore, for a prescribed $\nu$, the regularity condition in \eqref{eq:regularity} holds with $\|x_k\| \ge \nu /\eta_{\min}(B)$ for every $k$.
\subsection{Quadratic assignment problem}{\label{sec:qap}} Let $K$, $L$ be $n \times n$ symmetric matrices.
\text{tr}((K\otimes L) (\text{vec}(P)\text{vec}(P)^\top)), \end{align*} where $\otimes$ denotes the Kronecker product. Therefore, we can recast \eqref{eq:qap1} as \begin{align} \text{tr}((K\otimes L) Y) \,\,\,\,\text{subject to} \,\,\, Y = \text{vec}(P)\text{vec}(P)^\top, \label{eq:qapkron} \end{align}
%\end{enumerate}
\begin{align}
\max\,\,\, & \langle C, X \rangle \nonumber \\%\,\text{trace}(K\otimes L)
\text{subject to} \,\,\,& P1 = 1, \,\, 1^\top P = 1,\,\, P\geq 0 \nonumber \\
& \text{trace}_1(Y) = I \,\,\, \text{trace}_2(Y) = I \nonumber \\
& \text{vec}(P) = \text{diag}(Y) \nonumber \\
& \text{trace}(Y) = n \,\, \begin{bmatrix} 1 & \text{vec}(P)^\top\\ \text{vec}(P) & Y \end{bmatrix} \succeq 0,
\label{eq:qap_sdp}
\end{align}
where $\text{trace}_1(.)$ and $\text{trace}_2(.)$ are partial traces satisfying,
$$ \text{trace}_1(K \otimes L) = \text{trace}(K)L \,\,\,\,\,\, \text{and} \,\,\,\,\, \text{trace}_2(K\otimes L)= K \text{trace}(L) $$
$$ \text{trace}_1^*(T) = I \otimes T \,\,\,\,\,\, \text{and} \,\,\,\,\, \text{trace}_2^*(T)= T \otimes I $$
The first set of equalities is due to the fact that permutation matrices are doubly stochastic. The second set of equalities ensures that permutation matrices are orthogonal, i.e., $PP^\top = P^\top P=I$. The third set of equalities enforces that every individual entry of the permutation matrix takes either $0$ or $1$, i.e., $X_{1, i} = X_{i,i} \,\,\, \forall i \in [1, n^2+1]$. The trace constraint in the last line is to bound the problem domain. By concatenating the $B_k$'s in \eqref{eq:qapcons}, we can rewrite \eqref{eq:qap_sdp} in standard SDP form as
\begin{align}
\max\,\,\, & \langle C, X \rangle \nonumber \\%\,\text{trace}(K\otimes L)
\text{subject to} \,\,\,& B(X) = {\bf b}, \,\,\, {\bf b} \in \RR^m \nonumber \\
& \text{trace}(X) = n+1 \nonumber \\
& X_{ij} \geq 0, \,\,\,\, i,j \in \mathcal{G}\nonumber \\
& X\succeq0,
\end{align}
where $\mathcal{G}$ represents the index set for which we introduce the nonnegativities. When $\mathcal{G}$ covers the whole set of indices, we get the best approximation to the original problem. However, it becomes computationally undesirable as the problem dimension increases.
Hence, we remove the redundant nonnegativity constraints and enforce it for the indices where Kronecker product between $K$ and $L$ is nonzero. We penalize the non-negativity constraints and add it to the augmented Lagrangian objective since a projection to the positive orthant approach in the low rank space as we did for the clustering does not work here. We take \cite{ferreira2018semidefinite} as the baseline. This is an SDP based approach for solving QAP problems containing a sparse graph. We compare against the best feasible upper bounds reported in \cite{ferreira2018semidefinite} for the given instances. Here, optimality gap is defined as $$ \% \text{Gap} = \frac{|\text{bound} - \text{optimal}|}{\text{optimal}} \times 100 $$ We used a (relatively) sparse graph data set from the QAP library. We run our low rank algorithm for different rank values. $r_m$ in each instance corresponds to the smallest integer satisfying the Pataki bound~\cite{pataki1998rank, barvinok1995problems}. Results are shown in Table \ref{tb:qap}. Primal feasibility values except for the last instance $esc128$ is less than $10^{-5}$ and we obtained bounds at least as good as the ones reported in \cite{ferreira2018semidefinite} for these problems. -For $esc128$, the primal feasibility is $\approx 10^{-1}$, hence, we could not manage to obtain a good optimality gap due to limited time. +For $esc128$, the primal feasibility is $\approx 10^{-1}$, hence, we could not manage to obtain a good optimality gap.% due to limited time. %% not yet complete %\begin{table}[] %\begin{tabular}{|l|l|l|l|l|l|l|} %\hline % &rank &Primal feasibility &Feasible upper-bound & Opt. 
Gap & ADMM & SparseQAP \\ \hline %\multirow{4}{*}{esc132} &$\sqrt{m}$ & & & & & \\ \cline{2-7} % & $10$& & & & & \\ \cline{2-7} % & $25$ & & & & & \\ \cline{2-7} % & $50$ & & & & & \\ \cline{2-7} \hline %\multirow{4}{*}{esc64a} & $\sqrt{m}$ & & & & & \\ \cline{2-7} % & $10$ & & & & & \\ \cline{2-7} % & $25$& & & & & \\ \cline{2-7} % &$50$ & & & & & \\ \cline{2-7} \hline %\multirow{4}{*}{esc16a} & $\sqrt{m}$ & & & & & \\ \cline{2-7} % & $10$ & & & & & \\ \cline{2-7} % & $25$& & & & & \\ \cline{2-7} % & $50$ & & & & & \\ \hline %\multirow{4}{*}{esc16b} & $\sqrt{m}$ & & & & & \\ \cline{2-7} % & $10$ & & & & & \\ \cline{2-7} % & $25$& & & & & \\ \cline{2-7} % & $50$ & & & & & \\ \hline %\multirow{4}{*}{esc16c} & $\sqrt{m}$ & & & & & \\ \cline{2-7} % & $10$& & & & & \\ \cline{2-7} % & $25$& & & & & \\ \cline{2-7} % & $50$ & & & & & \\ \hline %\multirow{4}{*}{esc16d} & $\sqrt{m}$& & & & & \\ \cline{2-7} % & $10$ & & & & & \\ \cline{2-7} % & $25$& & & & & \\ \cline{2-7} % &$50$ & & & & & \\ \hline %\multirow{4}{*}{esc16e} & $\sqrt{m}$ & & & & & \\ \cline{2-7} % & $10$ & & & & & \\ \cline{2-7} % & $25$ & & & & & \\ \cline{2-7} % & $50$& & & & & \\ \hline %\end{tabular} %\end{table} \begin{table}[] \begin{tabular}{|l|l|l|l|l|l|l|l|} \hline & \multicolumn{1}{l|}{} & \multicolumn{6}{c|}{Optimality Gap ($\%$)} \\ \hline \multicolumn{1}{|c|}{\multirow{2}{*}{Data}} & \multicolumn{1}{c|}{\multirow{2}{*}{Optimal Value}} & \multicolumn{1}{c|}{\multirow{2}{*}{Sparse QAP~\cite{ferreira2018semidefinite}}} & \multicolumn{5}{c|}{iAL} \\ \cline{4-8} \multicolumn{1}{|c|}{} & \multicolumn{1}{c|}{} & \multicolumn{1}{c|}{} & $r=10$ & $r=25$ & $r=50$ & $r=r_m$ & $r_m$ \\ \hline esc16a & 68 & 8.8 & 11.8 & $\mathbf{0}$ & $\mathbf{0}$ & 5.9 & 157 \\ \hline esc16b & 292 & $\mathbf{0}$ & $\mathbf{0}$ & $\mathbf{0}$ & $\mathbf{0}$ & $\mathbf{0}$ & 224 \\ \hline esc16c & 160 & 5 & 5.0 & 5.0 & $\mathbf{2.5}$ & 3.8 & 177 \\ \hline esc16d & 16 & 12.5 & 37.5 & $\mathbf{0}$ & $\mathbf{0}$ & 25.0 & 126 \\ 
\hline esc16e & 28 & 7.1 & 7.1 & $\mathbf{0}$ & 14.3 & 7.1 & 126 \\ \hline esc16g & 26 & $\mathbf{0}$ & 23.1 & 7.7 & $\mathbf{0}$ & $\mathbf{0}$ & 126 \\ \hline esc16h & 996 & $\mathbf{0}$ & $\mathbf{0}$ & $\mathbf{0}$ & $\mathbf{0}$ & $\mathbf{0}$ & 224 \\ \hline esc16i & 14 & $\mathbf{0}$ & $\mathbf{0}$ & $\mathbf{0}$ & 14.3 & $\mathbf{0}$ & 113 \\ \hline esc16j & 8 & $\mathbf{0}$ & $\mathbf{0}$ & $\mathbf{0}$ & $\mathbf{0}$ & $\mathbf{0}$ & 106 \\ \hline esc32a & 130 & 93.8 & 129.2 & 109.2 & 104.6 &$\mathbf{83.1}$ & 433 \\ \hline esc32b & 168 & 88.1 & 111.9 & 92.9 & 97.6 & $\mathbf{69.0}$ & 508 \\ \hline esc32c & 642 & 7.8 & 15.6 & 14.0 & 15.0 & $\mathbf{4.0}$ & 552 \\ \hline esc32d & 200 & 21 & 28.0 & 28.0 & 29.0 &$\mathbf{17.0}$ & 470 \\ \hline esc32e & 2 & $\mathbf{0}$ & $\mathbf{0}$ & $\mathbf{0}$ & $\mathbf{0}$ & $\mathbf{0}$ & 220 \\ \hline esc32g & 6 & $\mathbf{0}$ & 33.3 &$\mathbf{0}$ & $\mathbf{0}$ & $\mathbf{0}$ & 234 \\ \hline esc32h & 438 & 18.3 & 25.1 & 19.6 & 25.1 & $\mathbf{13.2}$ & 570 \\ \hline esc64a & 116 & 53.4 & 62.1 & 51.7 & 58.6 & $\mathbf{34.5}$ & 899 \\ \hline esc128 & 64 & $\mathbf{175}$ & 256.3 & 193.8 & 243.8 & 215.6 & 2045 \\ \hline \end{tabular} \caption{Comparison between upper bounds on the problems from the QAP library with (relatively) sparse $L$.} \label{tb:qap} \end{table} diff --git a/NeurIPS 19/sections/experiments.tex b/NeurIPS 19/sections/experiments.tex index 8d52a8d..b96f65b 100644 --- a/NeurIPS 19/sections/experiments.tex +++ b/NeurIPS 19/sections/experiments.tex @@ -1,285 +1,285 @@ %!TEX root = ../main.tex \section{Numerical Evidence \label{sec:experiments}} We first begin with a caveat: It is known that quasi-Newton methods, such as BFGS and lBFGS, might not converge for nonconvex problems~\cite{dai2002convergence, mascarenhas2004bfgs}. 
For this reason, we have used the trust region method as the second-order solver in our analysis in Section~\ref{sec:cvg rate}, which is well-studied for nonconvex problems~\cite{cartis2012complexity}. Empirically, however, BFGS and lBFGS are extremely successful and we have therefore opted for those solvers in this section since the subroutine does not affect Theorem~\ref{thm:main} as long as the subsolver performs well in practice.

\subsection{Clustering}
Given data points $\{z_i\}_{i=1}^n $, the entries of the corresponding Euclidean distance matrix $D \in \RR^{n\times n}$ are $ D_{i, j} = \left\| z_i - z_j\right\|^2 $. Clustering is then the problem of finding a co-association matrix $Y\in \RR^{n\times n}$ such that $Y_{ij} = 1$ if points $z_i$ and $z_j$ are within the same cluster and $Y_{ij} = 0$ otherwise. In~\cite{Peng2007}, the authors provide an SDP relaxation of the clustering problem, specified as
\begin{align}
%\begin{cases}
\underset{Y \in \RR^{n\times n}}{\min} \text{tr}(DY) \subjto Y\mathbf{1} = \mathbf{1}, ~\text{tr}(Y) = s,~ Y\succeq 0,~Y \geq 0,
%\end{cases}
\label{eq:sdp_svx}
\end{align}
where $s$ is the number of clusters and $Y $ is both positive semidefinite and has nonnegative entries. Standard SDP solvers do not scale well with the number of data points~$n$, since they often require projection onto the semidefinite cone with the complexity of $\mathcal{O}(n^3)$. We instead use the BM factorization to solve \eqref{eq:sdp_svx}, sacrificing convexity to reduce the computational complexity. More specifically, we solve the program
\begin{align}
\label{eq:nc_cluster}
%\begin{cases}
\underset{V \in \RR^{n\times r}}{\min} \text{tr}(DVV^{\top}) \subjto VV^{\top}\mathbf{1} = \mathbf{1},~~ \|V\|_F^2 \le s, ~~V \geq 0,
%\end{cases}
\end{align}
where $\mathbf{1}\in \RR^n$ is the vector of all ones.
-Note that $Y \geq 0$ in \eqref{eq:sdp_svx} is replaced above by the much stronger but easier-to-enforce constraint $V \geq 0$ in \eqref{eq:nc_cluster}, see~\citep{kulis2007fast} for the reasoning behind this relaxation. +Note that $Y \geq 0$ in \eqref{eq:sdp_svx} is replaced above by the much stronger but easier-to-enforce constraint $V \geq 0$ in \eqref{eq:nc_cluster}, see~\cite{kulis2007fast} for the reasoning behind this relaxation. %. Trace constraint translates to a Frobenius norm constraint in factorized space. Semidefinite constraint is naturally removed due to factorization $Y=VV^{\top}$. %See~\citep{kulis2007fast} for the details of the relaxation. Now, we can cast~\eqref{eq:nc_cluster} as an instance of~\eqref{prob:01}. Indeed, for every $i\le n$, let $x_i \in \RR^r$ denote the $i$th row of $V$. We next form $x \in \RR^d$ with $d = nr$ by expanding the factorized variable $V$, namely, $ x := [x_1^{\top}, \cdots, x_n^{\top}]^{\top} \in \RR^d, $ and then set \begin{align*} f(x) =\sum_{i,j=1}^n D_{i, j} \left\langle x_i, x_j \right\rangle, \qquad g = \delta_C, \qquad A(x) = [x_1^{\top}\sum_{j=1}^n x_j -1, \cdots, x_n^{\top}\sum_{j=1}^n x_j-1]^{\top}, \end{align*} where $C$ is the intersection of the positive orthant in $\RR^d$ with the Euclidean ball of radius $\sqrt{s}$. In Appendix~\ref{sec:verification2}, {we verify that Theorem~\ref{thm:main} applies to~\eqref{prob:01} with $f,g,A$ specified above. } In our simulations, we use two different solvers for Step~2 of Algorithm~\ref{Algo:2}, namely, APGM and lBFGS. APGM is a solver for nonconvex problems of the form~\eqref{e:exac} with convergence guarantees to first-order stationarity, as discussed in Section~\ref{sec:cvg rate}. lBFGS is a limited-memory version of BFGS algorithm in~\cite{fletcher2013practical} that approximately leverages the second-order information of the problem. 
We compare our approach against the following convex methods: \begin{itemize} \item HCGM: Homotopy-based Conditional Gradient Method in~\cite{yurtsever2018conditional} which directly solves~\eqref{eq:sdp_svx}. \item SDPNAL+: A second-order augmented Lagrangian method for solving SDP's with nonnegativity constraints~\cite{yang2015sdpnal}. \end{itemize} %First, we extract the meaningful features from this dataset using a simple two-layer neural network with a sigmoid activation function. Then, we apply this neural network to 1000 test samples from the same dataset, which gives us a vector of length $10$ for each data point, where each entry represents the posterior probability for each class. Then, we form the $\ell_2$ distance matrix ${D}$ from these probability vectors. %The solution rank for the template~\eqref{eq:sdp_svx} is known and it is equal to number of clusters $k$ \cite[Theorem~1]{kulis2007fast}. As discussed in~\cite{tepper2018clustering}, setting rank $r>k$ leads more accurate reconstruction in expense of speed. %Therefore, we set the rank to 20. The results are depicted in Figure~\ref{fig:clustering}. % As for the dataset, our experimental setup is similar to that described by~\cite{mixon2016clustering}. We use the publicly-available fashion-MNIST data in \cite{xiao2017/online}, which is released as a possible replacement for the MNIST handwritten digits. Each data point is a $28\times 28$ gray-scale image, associated with a label from ten classes, labeled from $0$ to $9$. First, we extract the meaningful features from this dataset using a simple two-layer neural network with a sigmoid activation function. Then, we apply this neural network to 1000 test samples from the same dataset, which gives us a vector of length $10$ for each data point, where each entry represents the posterior probability for each class. Then, we form the $\ell_2$ distance matrix ${D}$ from these probability vectors. 
The solution rank for the template~\eqref{eq:sdp_svx} is known and it is equal to number of clusters $k$ \cite[Theorem~1]{kulis2007fast}. As discussed in~\cite{tepper2018clustering}, setting rank $r>k$ leads more accurate reconstruction in expense of speed. Therefore, we set the rank to 20. The results are depicted in Figure~\ref{fig:clustering}. We implemented 3 algorithms on MATLAB and used the software package for SDPNAL+ which contains mex files. It is predictable that the performance of our nonconvex approach would even improve by using mex files. %\begin{figure*}[ht] %% \includegraphics[width=0.4\textwidth,natwidth=1300,natheight=1300]{bp_fig1.pdf} %\begin{center} %{\includegraphics[width=.4\columnwidth]{figs/clustering_fig4_times_linearbeta_last.pdf}} %{\includegraphics[width=.4\columnwidth]{figs/clustering_fig4_iter_linearbeta_last.pdf}} %%\centerline{\includegraphics[width=1\columnwidth]{figs/bp_fig2_small.pdf}} %\caption{Convergence of different algorithms for clustering with fashion-MNIST dataset. Here, we set the rank as $r=20$ for the nonconvex approaches. The solution rank for the template~\eqref{eq:sdp_svx} is the number of clusters $s$ \cite[Theorem 1]{kulis2007fast}. However, as discussed in~\cite{tepper2018clustering}, setting rank $r>s$ leads more accurate reconstruction at the expense of speed, hence our choice of $r=20$. } %\label{fig:clustering} %%(left:$[n=100$, $d=10^4]$, right:$[n=34$, $d=10^2$])} %\end{center} %\end{figure*} %\begin{figure}[] %% \includegraphics[width=0.4\textwidth,natwidth=1300,natheight=1300]{bp_fig1.pdf} %%\begin{center} %\centering %\begin{minipage}{.48\textwidth} %{\includegraphics[width=.49\columnwidth]{figs/clustering_fig4_times_linearbeta_last.pdf}} %{\includegraphics[width=.49\columnwidth]{figs/cluster_feas_time.pdf}} %\caption{Clustering running time comparison. 
} %\label{fig:clustering} %\end{minipage} %\begin{minipage}{.48\textwidth} %{\includegraphics[width=.49\columnwidth]{figs/bp_all_obj.pdf}} %{\includegraphics[width=.49\columnwidth]{figs/bp_all_feas.pdf}} %\caption{Basis Pursuit} %\label{fig:bp1} %\end{minipage} %\end{figure} \begin{figure}[] % \includegraphics[width=0.4\textwidth,natwidth=1300,natheight=1300]{bp_fig1.pdf} \begin{center} {\includegraphics[width=.4\columnwidth]{figs/clustering_fig4_times_linearbeta_last.pdf}} {\includegraphics[width=.4\columnwidth]{figs/cluster_feas_time.pdf}} %\centerline{\includegraphics[width=1\columnwidth]{figs/bp_fig2_small.pdf}} \caption{Clustering running time comparison. } \label{fig:clustering} %(left:$[n=100$, $d=10^4]$, right:$[n=34$, $d=10^2$])} \end{center} \end{figure} \subsection{Additional demonstrations} We provide several additional experiments in Appendix \ref{sec:adexp}. Section \ref{sec:bp} discusses a novel nonconvex relaxation of the standard basis pursuit template which performs comparable to the state of the art convex solvers. In Section \ref{sec:geig}, we provide fast numerical solutions to the generalized eigenvalue problem. In Section \ref{sec:gan}, we give a contemporary application example that our template applies, namely, denoising with generative adversarial networks. Finally, we provide improved bounds for sparse quadratic assignment problem instances in Section \ref{sec:qap}. %\subsection{Basis Pursuit} %Basis Pursuit (BP) finds sparsest solutions of an under-determined system of linear equations by solving %\begin{align} %%\begin{cases} %\min_{z} \|z\|_1 \subjto %Bz = b, %%\end{cases} %\label{eq:bp_main} %\end{align} %where $B \in \RR^{n \times d}$ and $b \in \RR^{n}$. %%BP has found many applications in machine learning, statistics and signal processing \cite{chen2001atomic, candes2008introduction, arora2018compressed}. 
%Various primal-dual convex optimization algorithms are available in the literature to solve BP, including~\cite{tran2018adaptive,chambolle2011first}. %%There also exists a line of work \cite{beck2009fast} that handles sparse regression via regularization with $\ell_1$ norm, known as Lasso regression, which we do not compare with here due to their need for tuning the regularization term. %We compare our algorithm against state-of-the-art primal-dual convex methods for solving \eqref{eq:bp_main}, namely, Chambole-Pock~\cite{chambolle2011first}, ASGARD~\cite{tran2018smooth} and ASGARD-DL~\cite{tran2018adaptive}. % %%\textbf{AE: Fatih, maybe mention a few other algorithms including asgard and also say why we can't use fista and nista.} % % % %Here, we take a different approach and cast~(\ref{eq:bp_main}) as an instance of~\eqref{prob:01}. Note that any $z \in \RR^d$ can be decomposed as $z = z^+ - z^-$, where $z^+,z^-\in \RR^d$ are the positive and negative parts of $z$, respectively. Then consider the change of variables $z^+ = u_1^{\circ 2}$ and $z^-= u_2^{\circ 2} \in \RR^d$, where $\circ$ denotes element-wise power. Next, we concatenate $u_1$ and $u_2$ as $x := [ u_1^{\top}, u_2^{\top} ]^{\top} \in \RR^{2d}$ and define $\overline{B} := [B, -B] \in \RR^{n \times 2d}$. Then, \eqref{eq:bp_main} is equivalent to \eqref{prob:01} with %\begin{align} %f(x) =& \|x\|^2, \quad g(x) = 0,\subjto %A(x) = \overline{B}x^{\circ 2}- b. %\label{eq:bp-equiv} %\end{align} %In Appendix~\ref{sec:verification1}, we verify with minimal details that Theorem~\ref{thm:main} indeed applies to~\eqref{prob:01} with the above $f,A$. % % % % %%Let $\mu(B)$ denote the \emph{coherence} of ${B}$, namely, %%\begin{align} %%\mu = \max_{i,j} | \langle B_i, B_j \rangle |, %%\end{align} %%where $B_i\in \RR^n$ is the $i$th column of $B$. Let also $z_k,z_*\in \RR^d$ denote the vectors corresponding to $x_k,x_*$. 
We rewrite the last line of \eqref{eq:cnd-bp-pre} as %%\begin{align} %%& 2 \| \text{diag}(x_k) \overline{B}^\top ( \overline{B}x^{\circ 2} -b) \| \nonumber\\ %%& = 2 \| \text{diag}(x_k) \overline{B}^\top \overline{B} (x^{\circ 2} -x_*^{\circ 2}) \| \nonumber\\ %%& = 2 \| \text{diag}(x_k) {B}^\top B(x_k- x_*) \| \nonumber\\ %%& = 2 \| \text{diag}(x_k) {B}^\top B(x_k- x_*) \|_{\infty} \nonumber\\ %%& \ge 2 \| \text{diag}(x_k) \text{diag}({B}^\top B) (x_k- x_*) \|_{\infty}\nonumber\\ %%& \qquad - 2 \| \text{diag}(x_k) \text{hollow}({B}^\top B) (x_k- x_*) \|_{\infty}, %%\end{align} %%where $\text{hollow}$ returns the hollow part of th %% %% and let $\overline{B}=U\Sigma V^\top$ be its thin singular value decomposition, where $U\in \RR^{n\times n}, V\in \RR^{d\times n}$ have orthonormal columns and the diagonal matrix $\Sigma\in \RR^{n\times n}$ contains the singular values $\{\sigma_i\}_{i=1}^r$. Let also $x_{k,i},U_i$ be the $i$th entry of $x_k$ and the $i$th row of $U_i$, respectively. Then we bound the last line above as %%\begin{align} %%& 2 \| \text{diag}(x_k) \overline{B}^\top ( \overline{B}x^{\circ 2} -b) \| \nonumber\\ %%& = 2 \| \text{diag}(x_k) U \Sigma^2 U^\top ( x^{\circ 2} - x_*^{\circ 2}) \| \nonumber\\ %%& \ge 2 \| \text{diag}(x_k) U \Sigma^2 U^\top ( x^{\circ 2} - x_*^{\circ 2}) \|_{\infty} \nonumber\\ %%& = \max_i |x_{k,i}| \cdot | U_i ^\top \Sigma \cdot \Sigma U^\top ( x^{\circ 2} - x_*^{\circ 2}) | \nonumber\\ %%& %%\end{align} %% %%where $\tilde{b}_{i,j} = (\tilde{B})_{ij},~ i\in [1:n]$ and $j \in [1:2d]$. %%\begin{align*} %%-DA(x)^\top(A(x) - b) = -2x \odot (\tilde{B}^\top (A(x) - b)), %%\end{align*} %%where $\odot$ denotes hadamard product. 
%%\begin{align*} %%& \text{dist} \left( -DA(x)^\top (A(x)-b), \frac{\partial g(x)}{\b} \right) \nonumber\\ %%& = \text{dist} \left( -DA(x)^\top (A(x)-b), \{0\} \right) \nonumber\\ %%& = \left\| -DA(x)^\top (A(x)-b) \right\| \nonumber\\ %%& \ge ?????, %%\end{align*} %%Hence, this completes the proof for regularity condition. % %We draw the entries of $B$ independently from a zero-mean and unit-variance Gaussian distribution. For a fixed sparsity level $k$, the support of $z_*\in \RR^d$ and its nonzero amplitudes are also drawn from the standard Gaussian distribution. %%We then pick a sparsity level $k$ and choose $k$ indexes, i.e, $\Omega \subset [1:d]$, which are corresponding to nonzero entries of $z$. %%We then assign values from normal distribution to those entries. %Then the measurement vector is created as $b = Bz + \epsilon$, where $\epsilon$ is the noise vector with entries drawn independently from the zero-mean Gaussian distribution with variance $\sigma^2=10^{-6}$. % %%\edita{\textbf{AE: the image sizes throughout the paper are inconsistent which is not nice. The font size in Fig 3 is also very different from the rest of the paper which is not nice. please change.}} %%\editf{(mfs:) I equated the figure sizes and will make further modifications to the inconsistent font size in this figure.} % %%\begin{figure}[ht] %%\begin{center} %%{\includegraphics[width = 12cm, height = 5cm]{figs/bp_fig1_algos.pdf}} %%\caption{Convergence with different subsolvers for the aforementioned nonconvex relaxation. %%} %%\label{fig:bp1} %%%(left:$[n=100$, $d=10^4]$, right:$[n=34$, $d=10^2$])} %%\end{center} %%\end{figure} %%\vspace{-3mm} % %The results are compiled in Figure~\ref{fig:bp1}. Clearly, the performance of Algorithm~\ref{Algo:2} with a second-order solver for BP is comparable to the rest. %%Figure~\ref{fig:bp1} compiles our results for the proposed relaxation. 
%It is, indeed, interesting to see that these type of nonconvex relaxations gives the solution of convex one and first order methods succeed. % % % %\paragraph{Discussion:} %The true potential of our reformulation is in dealing with more structured norms rather than $\ell_1$, where computing the proximal operator is often intractable. One such case is the latent group lasso norm~\cite{obozinski2011group}, defined as %\begin{align*} %\|z\|_{\Omega} = \sum_{i=1}^I \| z_{\Omega_i} \|, %\end{align*} %{where $\{\Omega_i\}_{i=1}^I$ are (not necessarily disjoint) index sets of $\{1,\cdots,d\}$. Although not studied here, we believe that the nonconvex framework presented in this paper can serve to solve more complicated problems, such as the latent group lasso. We leave this research direction for future work. %\subsection{Adversarial Denoising with GANs} %In the appendix, we provide a contemporary application example that our template applies. %This relaxation transformed a non-smooth objective into a smooth one while loosing the linearity on the equality constraint. %\subsection{Latent Group Lasso \label{sec:latent lasso}} % %For a collection of subsets $\Omega=\{\Omega_i\}_{i=1}^{I}\subset [1:p]$, the latent group Lasso norm on $\RR^p$ takes $z\in \RR^p$ to %\begin{align} %\|z\|_{\Omega,1} = \sum_{i=1}^I \| z_{\Omega_i} \|. %\end{align} %Note that we do not require $\Omega_i$ to be disjoint. For $B\in \RR^{n\times p}$, $b\in \RR^n$, and $\lambda>0$, consider the latent group lasso as %\begin{align} %\min_{z\in \RR^d} \frac{1}{2}\| Bz - b\|_2^2+ \lambda \| z\|_{\Omega,1}. %\label{eq:group_lasso} %\end{align} %Because $\Omega$ is not a partition of $[1:p]$, computing the proximal operator of $\|\cdot\|_{\Omega}$ is often intractable, ruling out proximal gradient descent and similar algorithms for solving Program~\eqref{eq:group_lasso}. Instead, often Program~\eqref{eq:group_lasso} is solved by Alternating Direction Method of Multipliers (ADMM). More concretely, ??? 
% %We take a radically different approach here and cast Program~\eqref{eq:group_lasso} as an instance of Program~\eqref{prob:01}. More specifically, let $z^+,z^-\in \RR^p$ be positive and negative parts of $z$, so that $z=z^+-z^-$. Let us introduce the nonnegative $u\in \RR^p$ such that $z^++z^- = u^{\circ 2}$, where we used $\circ$ notation to show entrywise power. We may now write that %\begin{align} %\| z^++z^- \|_{\Omega,1} %& = \| u^{\circ 2} \|_{\Omega,1 } =: \|u\|_{\Omega,2}^2, %\end{align} %Unlike $\|\cdot\|_{\Omega,1}$, note that $\|\cdot\|_{\Omega,2}^2$ is differentiable and, in fact, there exists a positive semidefinite matrix $Q\in \RR^{d\times d}$ such that $\|u\|_{\Omega,2}^2=u^\top Q_\Omega u$. Let us form $x=[(z^+)^\top,(z^-)^\top, u^\top]^\top\in \RR^{3d}$ and set %\begin{align*} %f(x) = \frac{1}{2}\| Bz^+-Bz^- -b\|_1+ \|u\|_{\Omega,2}^2, %\end{align*} %\begin{align} %g(x) = 0, \qquad A(x) = z^++z^--u^{\circ 2}. %\end{align} %We can now apply Algorithm~\ref{Algo:2} to solve Program~\eqref{prob:01} with $f,g,A$ specified above. % % %\paragraph{Convergence rate.} %Clearly, $f,A$ are strongly smooth in the sense that \eqref{eq:smoothness basic} holds with proper $\lambda_f,\lambda_A$. Likewise, both $f$ and the Jacobian $DA$ are continuous and the restricted Lipschitz constants $\lambda'_f,\lambda'_A$ in \eqref{eq:defn_lambda'} are consequently well-defined and finite for a fixed $\rho'>0$. We next verify the key regularity condition in Theorem~\ref{thm:main}, namely, \eqref{eq:regularity}. 
Note that %\begin{align*} %DA(x) & = %\left[ %\begin{array}{ccc} %I_p & I_p & -2\text{diag}(u) %\end{array} %\right]\in \RR^{d\times 3d}, %\end{align*} %\begin{align*} %-DA(x)^\top A(x) = %\left[ %\begin{array}{c} %-z^+-z^-+u^{\circ 2} \\ %-z^+-z^-+u^{\circ 2}\\ %2\text{diag}(u)( z^++z^--u^{\circ 2}) %\end{array} %\right], %\end{align*} %\begin{align*} %& \text{dist} \left( -DA(x)^\top A(x), \frac{\partial g(x)}{\b} \right) \nonumber\\ %& = \text{dist} \left( -DA(x)^\top A(x), \{0\} \right) \nonumber\\ %& = \left\| -DA(x)^\top A(x) \right\| \nonumber\\ %& \ge \sqrt{2} \| A(x)\|, %\end{align*} %namely, \eqref{eq:regularity} holds with $\nu=1$. diff --git a/NeurIPS 19/sections/guarantees.tex b/NeurIPS 19/sections/guarantees.tex index f7795e7..c482c88 100644 --- a/NeurIPS 19/sections/guarantees.tex +++ b/NeurIPS 19/sections/guarantees.tex @@ -1,139 +1,139 @@ %!TEX root = ../main.tex \section{Convergence Rate \label{sec:cvg rate}} -This section presents the total iteration complexity of Algorithm~\ref{Algo:2} for finding first and second-order stationary points of problem \eqref{prob:01}. +This section presents the total iteration complexity of Algorithm~\ref{Algo:2} for finding first and second-order stationary points of problem \eqref{eq:minmax}. All the proofs are deferred to Appendix~\ref{sec:theory}. Theorem~\ref{thm:main} characterizes the convergence rate of Algorithm~\ref{Algo:2} for finding stationary points in the number of outer iterations. \begin{theorem}\textbf{\emph{(convergence rate)}} \label{thm:main} For integers $2 \le k_0\le k_1$, consider the interval $K=[k_0:k_1]$, and let $\{x_k\}_{k\in K}$ be the output sequence of Algorithm~\ref{Algo:2} on the interval $K$.\footnote{The choice of $k_1 = \infty$ is valid here too.} Let also $\rho:=\sup_{k\in [K]} \|x_k\|$.\footnote{If necessary, to ensure that $\rho<\infty$, one can add a small factor of $\|x\|^2$ to $\mathcal{L}_{\b}$ in \eqref{eq:Lagrangian}. 
Then it is easy to verify that the iterates of Algorithm \ref{Algo:2} remain bounded, provided that the initial penalty weight $\beta_0$ is large enough, $\sup_x \|\nabla f(x)\|/\|x\|< \infty$, $\sup_x \|A(x)\| < \infty$, and $\sup_x \|D A(x)\| <\infty$. } Suppose that $f$ and $A$ satisfy (\ref{eq:smoothness basic}) and let \begin{align} \lambda'_f = \max_{\|x\|\le \rho} \|\nabla f(x)\|,\qquad \lambda'_A = \max_{\|x\| \le \rho} \|DA(x)\|, \label{eq:defn_restricted_lipsichtz} \end{align} be the (restricted) Lipschitz constants of $f$ and $A$, respectively. %Suppose that $\b_1 \ge \b_0$, where the expression for $\b_0(f,g,A,\s_1)$ is given in (\ref{eq:beta-0}). With $\nu>0$, assume that \begin{align} \nu \|A(x_k)\| & \le \dist\left( -DA(x_k)^\top A(x_k) , \frac{\partial g(x_k)}{ \b_{k-1}} \right), \label{eq:regularity} \end{align} for every $k\in K$. We consider two cases: %{\color{red} Ahmet: I removed the squares and showed the rate on dist + feasibility, since it is the measure for the stationarity, using squares confuses complexity analysis...} \begin{itemize}[leftmargin=*] -\item If a first-order solver is used in Step~2, then $x_k$ is an $(\epsilon_{k,f},\b_k)$ first-order stationary point of (\ref{prob:01}) with +\item If a first-order solver is used in Step~2, then $x_k$ is an $(\epsilon_{k,f},\b_k)$ first-order stationary point of (\ref{eq:minmax}) with \begin{align} \epsilon_{k,f} & = \frac{1}{\beta_{k-1}} \left(\frac{2(\lambda'_f+\lambda'_A y_{\max}) (1+\lambda_A' \sigma_k)}{\nu}+1\right) =: \frac{Q(f,g,A,\s_1)}{\beta_{k-1}}, \label{eq:stat_prec_first} \end{align} for every $k\in K$, where $y_{\max}(x_1,y_0,\s_1)$ is specified in (\ref{eq:dual growth}) due to the limited space. 
-\item If a second-order solver is used in Step~2, then $x_k$ is an $(\epsilon_{k,f}, \epsilon_{k,s},\b_k)$ second-order stationary point of~(\ref{prob:01}) with $\epsilon_{k,s}$ specified above and with +\item If a second-order solver is used in Step~2, then $x_k$ is an $(\epsilon_{k,f}, \epsilon_{k,s},\b_k)$ second-order stationary point of~(\ref{eq:minmax}) with $\epsilon_{k,s}$ specified above and with \begin{align} \epsilon_{k,s} &= \epsilon_{k-1} + \sigma_k \sqrt{m} \lambda_A \frac{ 2\lambda'_f +2 \lambda'_A y_{\max} }{\nu \b_{k-1}} = \frac{\nu + \sigma_k \sqrt{m} \lambda_A 2\lambda'_f +2 \lambda'_A y_{\max} }{\nu \b_{k-1}} =: \frac{Q'(f,g,A,\s_1)}{\beta_{k-1}}. \end{align} \end{itemize} \end{theorem} %A few remarks are in order about Theorem~\ref{thm:main}. %\textbf{Tradeoff.} %Roughly speaking, Theorem~\ref{thm:main} states that the iterates of Algorithm~\ref{Algo:2} approach first-order stationarity for~\eqref{prob:01} at the rate of $1/{\b_k}$, where $\b_k$ is the penalty weight in AL in iteration $k$ as in~\eqref{eq:Lagrangian}. Note the inherent tradeoff here: For a faster sequence $\{\b_k\}_k$, the iterates approach stationarity faster but the computational cost of the inexact primal subroutine (Step 2) increases since a higher accuracy is required and Lipschitz constant of the unconstrained problem is also affected by $\beta_k$. In words, there is a tradeoff between the number of outer and inner loop iterations of Algorithm~\ref{Algo:2}. We highlight this tradeoff for a number of subroutines below. -Theorem~\ref{thm:main} states that Algorithm~\ref{Algo:2} converges to a (first- or second-) order stationary point of \eqref{prob:01} at the rate of $1/\b_k$, further specified in +Theorem~\ref{thm:main} states that Algorithm~\ref{Algo:2} converges to a (first- or second-) order stationary point of \eqref{eq:minmax} at the rate of $1/\b_k$, further specified in Corollary \ref{cor:first} and Corollary \ref{cor:second}. 
%Sections \ref{sec:first-o-opt} and \ref{sec:second-o-opt}. A few remarks are in order about Theorem \ref{thm:main}. -\textbf{Regularity.} The key geometric condition in Theorem~\ref{thm:main} is \eqref{eq:regularity} which, broadly speaking, ensures that the primal updates of Algorithm \ref{Algo:2} reduce the feasibility gap as the penalty weight $\b_k$ grows. We will verify this condition for several examples in Section \ref{sec:experiments}. +\paragraph{Regularity.} The key geometric condition in Theorem~\ref{thm:main} is \eqref{eq:regularity} which, broadly speaking, ensures that the primal updates of Algorithm \ref{Algo:2} reduce the feasibility gap as the penalty weight $\b_k$ grows. We will verify this condition for several examples in Section \ref{sec:experiments}. This condition in \eqref{eq:regularity} is closely related to those in the existing literature. In the special case where $g=0$ in~\eqref{prob:01}, it is easy to verify that \eqref{eq:regularity} reduces to the Polyak-Lojasiewicz (PL) condition for minimizing $\|A(x)\|^2$~\cite{karimi2016linear}. PL condition itself is a special case of Kurdyka-Lojasiewicz with $\theta = 1/2$, see \cite[Definition 1.1]{xu2017globally}. When $g=0$, it is also easy to see that \eqref{eq:regularity} is weaker than the Mangasarian-Fromovitz (MF) condition in nonlinear optimization \cite[Assumption 1]{bolte2018nonconvex}. {Moreover, {when $g$ is the indicator on a convex set,} \eqref{eq:regularity} is a consequence of the \textit{basic constraint qualification} in \cite{rockafellar1993lagrange}, which itself generalizes the MF condition to the case when $g$ is an indicator function of a convex set.} %\notea{I'm not sure if the claim about basic constraint qualification is true because our condition should locally hold rather globally. Could you add the exact equation number in [55] and double check this? 
Also consider double checking other claims in this paragraph.} We may think of \eqref{eq:regularity} as a local condition, which should hold within a neighborhood of the constraint set $\{x:A(x)=0\}$ rather than everywhere in $\mathbb{R}^d$. There is a constant complexity algorithm in \cite{bolte2018nonconvex} to reach this so-called ``information zone'', which supplements Theorem \ref{thm:main}. Lastly, in contrast to most conditions in the nonconvex optimization literature, such as~\cite{flores2012complete}, the condition in~\eqref{eq:regularity} appears to be easier to verify, as we see in the sequel. %\begin{exmp}[Clustering] %Clustering is an important application %\end{exmp} %\edita{\textbf{AE: Fatih, I think you had added the following two references to our response for icml. Could you discuss their relevance here? [2] Rockafellar, Lagrange Multipliers and Optimality, 1993 %[3] Bertsekas, On penalty and multiplier methods for constrained minimization. 1996}} %\edita{\textbf{AE: the spars review talks about the "Pong-Li" work. Fatih, do you know what is that?}} % %\editf{(mfs:) I do not think this work is relevant to our condition but the template seems similar. We can mention in the related works instead.} %\notea{Ok. Yes, consider adding it to the related work.} %We now argue that such a condition is necessary for controlling the feasibility gap of~\eqref{prob:01} and ensuring the success of Algorithm~\ref{Algo:2}. %To provide further insight about \eqref{eq:regularity}, consider the convex case where $f=0$ and $g=\delta_\mathcal{X}$, where $\mathcal{X}$ is a convex set and $A$ is a linear operator. In this case, solving \eqref{prob:01} finds a point in the convex set $\mathcal{X}\cap \text{null}(A)$, where the subspace $\text{null}(A)=\{ x\in\mathbb{R}^d: A(x) = 0 \} \subset \RR^d$ is the null space of $A$. %Here, the Slater's condition~\cite{boyd2004convex} requires that %\begin{equation} %\text{relint} (\mathcal{X}) \cap \text{null}(A)\neq \emptyset. 
%\end{equation} %%<<<<<<< HEAD %Recall that the Slater's condition plays a key role in convex optimization as a sufficient condition for strong duality and, as a result, guarantees the success of a variety of primal-dual algorithms for linearly-constrained convex programs~\cite{bauschke2011convex}. %Intuitively, the Slater's condition achieves this by removing pathological cases and ensuring that the subspace $\text{null}(A)$ is not tangent to $\mathcal{X}$, see Figure~\ref{fig:convex_slater}. In such pathological cases, solving~\eqref{prob:01} and finding a point in $\mathcal{X}\cap \text{null}(A)$ can be particularly slow. For instance, in Figure \ref{fig:convex_slater}, the alternating projection algorithm (which iteratively projects onto $\mathcal{X}$ and $\text{null}(A)$) converges to the intersection point at the suboptimal rate of $1/\sqrt{k}$ rather than the standard $1/k$ rate. %{\color{red} Ahmet: a reference would be cool here} %======= %In general, the Slater's condition plays a key role in convex optimization as a sufficient condition for strong duality and, as a result, guarantees the success of a variety of primal-dual algorithms for linearly-constrained convex programs~\cite{bauschke2011convex}. Intuitively, the Slater's condition here removes any pathological cases by ensuring that the subspace $\text{null}(A)$ is not tangent to $\mathcal{X}$, see Figure~\ref{fig:convex_slater}. In such pathological cases, solving~\eqref{prob:01}, namely, finding a point in $\mathcal{X}\cap \text{null}(A)$, can be particularly difficult. For instance, the alternating projection algorithm (which iteratively projects onto $\mathcal{X}$ and $\text{null}(A)$) has arbitrarily slow convergence, see Figure~\ref{fig:convex_slater}. 
%>>>>>>> 795311da274f2d8ab56d215c2fd481073e616732 %\begin{figure} %\begin{center} %\includegraphics[scale=.5]{Slater.pdf} %\end{center} %\caption{In pathological cases where the Slater's condition does not apply, solving \eqref{prob:01} can be particularly slow, even when \eqref{prob:01} is a convex program. See the first remark after Theorem~\ref{thm:main} for more details. \label{fig:convex_slater}} %\end{figure} \paragraph{Penalty method.} A classical algorithm to solve \eqref{prob:01} is the penalty method, which is characterized by the absence of the dual variable ($y=0$) in \eqref{eq:Lagrangian}. Indeed, ALM can be interpreted as an adaptive penalty or smoothing method with a variable center determined by the dual variable. It is worth noting that, with the same proof technique, one can establish the same convergence rate of Theorem \ref{thm:main} for the penalty method. However, while both methods have the same convergence rate in theory, we ignore the uncompetitive penalty method since it is significantly outperformed by iALM in practice. % outperforms the penalty method significantly in practice. % by virtue of its variable center and has been excluded from this presentation for this reason. -\textbf{Computational complexity.} Theorem~\ref{thm:main} specifies the number of (outer) iterations that Algorithm~\ref{Algo:2} requires to reach a near-stationary point of problem~\eqref{eq:Lagrangian} with a prescribed precision and, in particular, specifies the number of calls made to the solver in Step~2. In this sense, Theorem~\ref{thm:main} does not fully capture the computational complexity of Algorithm~\ref{Algo:2}, as it does not take into account the computational cost of the solver in Step~2. 
+\paragraph{Computational complexity.} Theorem~\ref{thm:main} specifies the number of (outer) iterations that Algorithm~\ref{Algo:2} requires to reach a near-stationary point of problem~\eqref{eq:Lagrangian} with a prescribed precision and, in particular, specifies the number of calls made to the solver in Step~2. In this sense, Theorem~\ref{thm:main} does not fully capture the computational complexity of Algorithm~\ref{Algo:2}, as it does not take into account the computational cost of the solver in Step~2. To better understand the total iteration complexity of Algorithm~\ref{Algo:2}, we consider two scenarios in the following. In the first scenario, we take the solver in Step~2 to be the Accelerated Proximal Gradient Method (APGM), a well-known first-order algorithm~\cite{ghadimi2016accelerated}. In the second scenario, we will use the second-order trust region method developed in~\cite{cartis2012complexity}. We have the following two corollaries showing the total complexity of our algorithm to reach first and second-order stationary points. Appendix \ref{sec:opt_cnds} contains the proofs and more detailed discussion for the complexity results. % \begin{corollary}[First-order optimality]\label{cor:first} -For $b>1$, let $\beta_k =b^k $ for every $k$. If we use APGM from~\cite{ghadimi2016accelerated} for Step~2 of Algorithm~\ref{Algo:2}, the algorithm finds an $(\epsilon_f,\b_k)$ first-order stationary point, +For $b>1$, let $\beta_k =b^k $ for every $k$. If we use APGM from~\cite{ghadimi2016accelerated} for Step~2 of Algorithm~\ref{Algo:2}, the algorithm finds an $(\epsilon_f,\b_k)$ first-order stationary point of~\eqref{eq:minmax}, after $T$ calls to the first-order oracle, where % \begin{equation} T = \mathcal{O}\left( \frac{Q^3 \rho^2}{\epsilon^{3}}\log_b{\left( \frac{Q}{\epsilon} \right)} \right) = \tilde{\mathcal{O}}\left( \frac{Q^{3} \rho^2}{\epsilon^{3}} \right). 
\end{equation} \end{corollary} %For Algorithm~\ref{Algo:2} to reach a near-stationary point with an accuracy of $\epsilon_f$ in the sense of \eqref{eq:inclu3} and with the lowest computational cost, we therefore need to perform only one iteration of Algorithm~\ref{Algo:2}, with $\b_1$ specified as a function of $\epsilon_f$ by \eqref{eq:stat_prec_first} in Theorem~\ref{thm:main}. In general, however, the constants in \eqref{eq:stat_prec_first} are unknown and this approach is thus not feasible. Instead, the homotopy approach taken by Algorithm~\ref{Algo:2} ensures achieving the desired accuracy by gradually increasing the penalty weight.\footnote{In this context, homotopy loosely corresponds to the gradual enforcement of the constraints by increasing the penalty weight.} This homotopy approach increases the computational cost of Algorithm~\ref{Algo:2} only by a factor logarithmic in the $\epsilon_f$, as detailed in the proof of Corollary~\ref{cor:first}. For Algorithm~\ref{Algo:2} to reach a near-stationary point with an accuracy of $\epsilon_f$ in the sense of \eqref{eq:inclu3} and with the lowest computational cost, we therefore need to perform only one iteration of Algorithm~\ref{Algo:2}, with $\b_1$ specified as a function of $\epsilon_f$ by \eqref{eq:stat_prec_first} in Theorem~\ref{thm:main}. In general, however, the constants in \eqref{eq:stat_prec_first} are unknown and this approach is thus not feasible. Instead, the homotopy approach taken by Algorithm~\ref{Algo:2} ensures achieving the desired accuracy by gradually increasing the penalty weight. This homotopy approach increases the computational cost of Algorithm~\ref{Algo:2} only by a factor logarithmic in the $\epsilon_f$, as detailed in the proof of Corollary~\ref{cor:first}. \begin{corollary}[Second-order optimality]\label{cor:second} For $b>1$, let $\beta_k =b^k $ for every $k$. 
We assume that \begin{equation} \mathcal{L}_{\beta}(x_1, y) - \min_{x}\mathcal{L}_{\beta}(x, y) \leq L_{u},\qquad \forall \beta. \end{equation} -If we use the trust region method from~\cite{cartis2012complexity} for Step~2 of Algorithm~\ref{Algo:2}, the algorithm finds an $\epsilon$-second-order stationary point of~(\ref{prob:01}) in $T$ calls to the second-order oracle where +If we use the trust region method from~\cite{cartis2012complexity} for Step~2 of Algorithm~\ref{Algo:2}, the algorithm finds an $\epsilon$-second-order stationary point of~\eqref{eq:minmax} in $T$ calls to the second-order oracle where % \begin{equation} T = \mathcal{O}\left( \frac{L_u Q'^{5}}{\epsilon^{5}} \log_b{\left( \frac{Q'}{\epsilon} \right)} \right) = \widetilde{\mathcal{O}}\left( \frac{L_u Q'^{5}}{\epsilon^{5}} \right). \end{equation} \end{corollary} \paragraph{Remark.} These complexity results are for first- and second-order stationarity with respect to~\eqref{eq:Lagrangian}. We note that these complexities match~\cite{cartis2018optimality} and~\cite{birgin2016evaluation}. However, the stationarity criteria and the definition of the dual variable in these papers differ from ours. We include more discussion on this in the Appendix. diff --git a/NeurIPS 19/sections/introduction.tex b/NeurIPS 19/sections/introduction.tex index 666a0c9..6754f47 100644 --- a/NeurIPS 19/sections/introduction.tex +++ b/NeurIPS 19/sections/introduction.tex @@ -1,95 +1,97 @@ %!TEX root = ../main.tex \vspace{-5mm} \section{Introduction}\vspace{-3mm} \label{intro} We study the nonconvex optimization problem % \begin{equation} \label{prob:01} %\begin{cases} \underset{x\in \mathbb{R}^d}{\min}\,\, f(x)+g(x) \subjto A(x) = 0, %\end{cases} \end{equation} % where $f:\mathbb{R}^d\rightarrow\mathbb{R}$ is a {continuously-differentiable} nonconvex function and $A:\mathbb{R}^d\rightarrow\mathbb{R}^m$ is a nonlinear operator. %For clarity of notation, we take $b=0$ in the sequel, the extension to any $b$ is trivial.
%We assume that $g:\mathbb{R}^d\rightarrow\mathbb{R}$ is a possibly nonsmooth but proximal-friendly convex function \cite{parikh2014proximal}. We assume that $g:\mathbb{R}^d\rightarrow\mathbb{R}$ is a proximal-friendly convex function \cite{parikh2014proximal}. % (not necessarily smooth). %For instance, for a convex set $\mathcal{X}\subset\RR^d$, letting $g=\delta_\mathcal{X}$ to be the indicator function on $\mathcal{X}$, ~\eqref{prob:01} is a nonconvex optimization problem with the convex constraint $x\in \mathcal{X}$. %Our other requirement for $g$ is that computing its proximal operator is inexpensive, see Section~\ref{sec:preliminaries}. A host of problems in computer science \cite{khot2011grothendieck, lovasz2003semidefinite, zhao1998semidefinite}, machine learning \cite{mossel2015consistency, song2007dependence}, and signal processing~\cite{singer2011angular, singer2011three} naturally fall under the template~\eqref{prob:01}, including max-cut, clustering, generalized eigenvalue decomposition, as well as the {quadratic assignment problem (QAP) \cite{zhao1998semidefinite}}.% \textbf{AE: We need to add some more specifics to this paragraph to convince people that this is an important template. What are the names of the problems you're citing? Max-cut, sparse regression, statistical learning with deep networks as prior \cite{Bora2017}, what else we can think of?} To solve \eqref{prob:01}, we %builds up on the vast literature on the classical inexact augmented Lagrangian framework and propose an intuitive and easy-to-implement {augmented Lagrangian} algorithm, and provide its total iteration complexity under an interpretable geometric condition. 
Before we elaborate on the results, let us first motivate~\eqref{prob:01} with an application to semidefinite programming (SDP): -\vspace{-3mm} +%\vspace{-1mm} \paragraph{Vignette: Burer-Monteiro splitting.} A powerful convex relaxation for max-cut, clustering, and many others is provided by the SDP \begin{equation} \label{eq:sdp} %\begin{cases} \underset{X\in\mathbb{S}^{d \times d}}{\min} \langle C, X \rangle \subjto B(X) = b, \,\, X \succeq 0, %\end{cases} \end{equation} % where $C\in \RR^{d\times d}$, $X$ is a positive semidefinite $d\times d$ matrix, %$\mathbb{S}^{d \times d}$ denotes the set of real symmetric matrices, and ${B}: \mathbb{S}^{d\times d} \to \mathbb{R}^m$ is a linear operator. If the unique-games conjecture is true, the SDP \eqref{eq:sdp} obtains the best possible approximation for the underlying discrete problem~\cite{raghavendra2008optimal}. Since $d$ is often large, many first- and second-order methods for solving such SDP's are immediately ruled out, not only due to their high computational complexity, but also due to their storage requirements, which are $\mathcal{O}(d^2)$. A contemporary challenge in optimization is therefore to solve SDPs using little space and in a scalable fashion. The recent homotopy conditional gradient method, which is based on linear minimization oracles (LMOs), can solve \eqref{eq:sdp} in a small space via sketching \cite{yurtsever2018conditional}. However, such LMO-based methods are extremely slow in obtaining accurate solutions. %In practice, $d$ is often very large which makes interior point methods, with their poor scaling in $d$, an unattractive option for solving ~\eqref{eq:sdp}. Attempts to resolve this issue prompted extensive research in computationally- and memory-efficient SDP solvers. The first such solvers relied on the so-called Linear Minimization Oracle (LMO), reviewed in Section~\ref{sec:related work}, alongside other scalabe SDP solvers. 
-A different approach for solving \eqref{prob:01}, dating back to~\cite{burer2003nonlinear, burer2005local}, is the so-called Burer-Monteiro (BM) factorization $X=UU^\top$, where $U\in\mathbb{R}^{d\times r}$ and $r$ is selected according to the guidelines in~\cite{pataki1998rank, barvinok1995problems}, which is tight~\cite{waldspurger2018rank}. +A different approach for solving \eqref{eq:sdp}, dating back to~\cite{burer2003nonlinear, burer2005local}, is the so-called Burer-Monteiro (BM) factorization $X=UU^\top$, where $U\in\mathbb{R}^{d\times r}$ and $r$ is selected according to the guidelines in~\cite{pataki1998rank, barvinok1995problems}, which is tight~\cite{waldspurger2018rank}. %It has been shown that these bounds on the rank, which are shown to be optimal~\cite{waldspurger2018rank}, under some assumptions removing the spurious local minima of the nonconvex factorized problem~\cite{boumal2016non}. %so as to remove spurious local minima of the nonconvex factorized problem. Evidently, BM splitting has the advantage of lower storage and faster iterations. The BM factorization leads to the following nonconvex problem in the template~\eqref{prob:01}: \begin{equation} \label{prob:nc} %\begin{cases} \underset{U\in\mathbb{R}^{d \times r}}{\min} \langle C, UU^\top \rangle \subjto B(UU^\top) = b, %\end{cases} \end{equation} %which can be easily written in the form of~\eqref{prob:01}. -The BM factorization does not introduce any extraneous local minima~\cite{burer2005local}. Moreover,~\cite{boumal2016non} established the connection between the local minimizers of the factorized problem~\eqref{prob:nc} and the global minimizers for~\eqref{eq:sdp}. +The BM factorization does not introduce any extraneous local minima~\cite{burer2005local}. Moreover,~\cite{boumal2016non} establishes the connection between the local minimizers of the factorized problem~\eqref{prob:nc} and the global minimizers for~\eqref{eq:sdp}. %\textbf{Vignette 2: Latent group lasso} %See Section ?
for several examples. \textbf{We should explain this better and give more high level examples to motivate this .} %An example of our template in \eqref{prob:01} is semi-definite ming which provides powerful relaxations to above problems. Denote the space of $d'\times d'$ symmetric matrices by $\mathbb{S}^{d'\times d'}$ and consider % %\begin{equation} % \label{sdp} % \min_x \{h'(x): A'(x) = b', ~~x \in \mathcal{C'}~~\text{and}~~x\succeq0 \} %\end{equation} % %where $h': \mathbb{S}^{d'\times d'} \to \RR$, $A'\colon\mathbb{S}^{d'\times d'}\to\RR^m$, $b'\in\RR^m$, and $C' \subseteq \mathbb{R}^{d'\times d'}$. This template clearly can be put to the form of \eqref{prob:01} by using \emph{Burer-Monteiro factorization} \cite{burer2003nonlinear, burer2005local}. To solve \eqref{prob:nc}, the inexact Augmented Lagrangian method (iALM) is widely used~\cite{burer2003nonlinear, burer2005local, kulis2007fast}, due to its cheap per iteration cost and its empirical success. -Every (outer) iteration of iALM calls a solver to solve an intermediate augmented Lagrangian subproblem to near stationarity. The choices include first-order methods, such as the proximal gradient descent \cite{parikh2014proximal}, or second-order methods, such as the trust region method and BFGS~\cite{nocedal2006numerical}.\footnote{Strictly speaking, BFGS is in fact a quasi-Newton method that emulates second-order information.} +Every (outer) iteration of iALM calls a solver to solve an intermediate augmented Lagrangian subproblem to near stationarity. 
The choices include first-order methods, such as the proximal gradient descent \cite{parikh2014proximal}, or second-order methods, such as the trust region method and BFGS~\cite{nocedal2006numerical}.\footnote{BFGS is in fact a quasi-Newton method that emulates second-order information.} -Unlike its convex counterpart~\cite{nedelcu2014computational,lan2016iteration,xu2017inexact}, the convergence rate and the complexity of iALM for ~\eqref{prob:nc} are not well-understood, see Section~\ref{sec:related work} for a review of the related literature. Indeed, addressing this important theoretical gap is one of the contributions of our work. In addition +Unlike its convex counterpart~\cite{nedelcu2014computational,lan2016iteration,xu2017inexact}, the convergence rate and the complexity of iALM for~\eqref{prob:nc} are not well-understood; see Section~\ref{sec:related work} for a review of the related literature. Indeed, addressing this important theoretical gap is one of the contributions of our work. In addition, { %{\textbf{Summary of contributions:} \\[2mm] $\triangleright$ { -We derive the convergence rate of iALM for solving \eqref{prob:01} to first- or second-order optimality, and find the total iteration complexity of iALM using different solvers for the augmented Lagrangian subproblems. Our complexity bounds match the best theoretical results in optimization, see Section~\ref{sec:related work}.} \\[2mm] +We derive the convergence rate of iALM +%for solving \eqref{prob:01} +to first- or second-order optimality, and find the total iteration complexity of iALM using different solvers for the augmented Lagrangian subproblems. Our complexity bounds match the best theoretical results in optimization, see Section~\ref{sec:related work}.} \\[2mm] $\triangleright$ Our iALM framework is future-proof in the sense that different subsolvers can be substituted.
\\[2mm] $\triangleright$ We propose a geometric condition that simplifies the algorithmic analysis for iALM, and clarify its connection to well-known Polyak-Lojasiewicz \cite{karimi2016linear} and Mangasarian-Fromovitz \cite{bertsekas1982constrained} conditions. %\edita{please add citations for these conditions here.} We also verify this condition for key problems in Section~\ref{sec:experiments}. %\edita{how about the KL condition spars'19 reviewers pointed out?} %\textbf{Roadmap.} Section~\ref{sec:preliminaries} collects the main tools and our notation. % that we utilize. %We present the iALM in Section~\ref{sec:AL algorithm} and obtain its convergence rate to first- and second-order stationary points in Section~\ref{sec:cvg rate}, alongside their iteration complexities. %We provide a comprehensive review of the literature and highlight our key differences in % with the precedents in %Section~\ref{sec:related work}. %Section~\ref{sec:experiments} presents the numerical evidence and comparisons with the state-of-the-art techniques. % methods. diff --git a/NeurIPS 19/sections/preliminaries.tex b/NeurIPS 19/sections/preliminaries.tex index 7d01977..371ae0b 100644 --- a/NeurIPS 19/sections/preliminaries.tex +++ b/NeurIPS 19/sections/preliminaries.tex @@ -1,207 +1,207 @@ %!TEX root = ../main.tex %\begin{comment} \vspace{-4mm} \section{Preliminaries \label{sec:preliminaries}}\vspace{-3mm} %\editf{Try to simplify this section as much as possible. We need to shorthen the paper.} \paragraph{\textbf{{Notation.}}} We use the notation $\langle\cdot ,\cdot \rangle $ and $\|\cdot\|$ for the {standard inner} product and the norm on $\RR^d$. For matrices, $\|\cdot\|$ and $\|\cdot\|_F$ denote the spectral and the Frobenius norms, respectively. %For a matrix $M$, its row and null spaces are denoted by $\row(A)$ and $\text{null}(A)$, respectively. 
For the convex function $g:\RR^d\rightarrow\RR$, the subdifferential set at $x\in \RR^d$ is denoted by $\partial g(x)$ and we will occasionally use the notation $\partial g(x)/\b = \{ z/\b : z\in \partial g(x)\}$. When presenting iteration complexity results, we often use $\widetilde{O}(\cdot)$ which suppresses the logarithmic dependencies. %The proximal operator $P_g:\RR^d\rightarrow\RR^d$ takes $x$ to %\begin{align} %P_g(x) = \underset{y}{\argmin} \, g(y) + \frac{1}{2}\|x-y\|^2, %\label{eq:defn of prox} %\end{align} %and, if $g=1_C$ is the indicator function of a convex set $C$, we will use the shorthand $P_C=P_{1_C}$ to show the orthogonal projection onto $C$. We denote $\delta_\mathcal{X}:\RR^d\rightarrow\RR$ as the indicator function of a set $\mathcal{X}\subset\RR^d$. %, which takes $x$ to %\begin{align} %\delta_\mathcal{X}(x) = %\begin{cases} %0 & x \in \mathcal{X}\\ %\infty & x\notin \mathcal{X}. %\end{cases} %\label{eq:indicator} %\end{align} % %The tangent cone of $C$ at $x\in C$ is denoted by $T_C(x)$ and The distance function from a point $x$ to $\mathcal{X}$ is denoted by $\dist(x,\mathcal{X}) = \min_{z\in \mathcal{X}} \|x-z\|$. For integers $k_0 \le k_1$, we use the notation $[k_0:k_1]=\{k_0,\ldots,k_1\}$. For an operator $A:\RR^d\rightarrow\RR^m$ with components $\{A_i\}_{i=1}^m$, $DA(x) \in \mathbb{R}^{m\times d}$ denotes the Jacobian of $A$, where the $i$th row of $DA(x)$ is the vector $\nabla A_i(x) \in \RR^d$. %We denote the Hessian of the augmented Lagrangian with respect to $x$ as $\nabla _{xx} \mathcal{L}_{\beta}(x,y)$. \\ %{\color{red} define $A_i$ rigorously.} -\textbf{Smoothness.} +\paragraph{Smoothness.} We assume smooth $f:\RR^d\rightarrow\RR$ and $A:\RR^d\rightarrow \RR^m$; i.e., there exist $\lambda_f,\lambda_A\ge 0$ s.t. % %\vspace{-.3cm} \begin{align} \| \nabla f(x) - \nabla f(x')\| \le \lambda_f \|x-x'\|, \quad \| DA(x) - DA(x') \| \le \lambda_A \|x-x'\|, \quad \forall x, x' \in \RR^d . 
\label{eq:smoothness basic} \end{align} %\edita{the above format is more professional imo. using and sign is not common. } %\end{comment} -\textbf{Augmented Lagrangian method (ALM)}. +\paragraph{Augmented Lagrangian method (ALM).} ALM is a classical algorithm, which first appeared in~\cite{hestenes1969multiplier, powell1969method} and extensively studied afterwards in~\cite{bertsekas1982constrained, birgin2014practical}. For solving \eqref{prob:01}, ALM suggests solving the problem % %\vspace{-.3cm} \begin{equation} \min_{x} \max_y \,\,\mathcal{L}_\beta(x,y) + g(x), \label{eq:minmax} \end{equation} %\vspace{-.5cm} % where, for penalty weight $\b>0$, $\mathcal{L}_\b$ is the corresponding augmented Lagrangian, defined as \begin{align} \label{eq:Lagrangian} \mathcal{L}_\beta(x,y) := f(x) + \langle A(x), y \rangle + \frac{\beta}{2}\|A(x)\|^2. \end{align} %\textbf{AE: Adding the bias $b$ doesn't seem to add much except making the presentation more clunky.} %\vspace{-.5cm} The minimax formulation in \eqref{eq:minmax} naturally suggests the following algorithm for solving \eqref{prob:01}: \begin{equation}\label{e:exac} x_{k+1} \in \underset{x}{\argmin} \,\, \mathcal{L}_{\beta}(x,y_k)+g(x), \end{equation} \begin{equation*} y_{k+1} = y_k+\s_k A(x_{k+1}), \end{equation*} where the dual step sizes are denoted as $\{\s_k\}_k$. However, computing $x_{k+1}$ above requires solving the nonconvex problem~\eqref{e:exac} to optimality, which is typically intractable. Instead, it is often easier to find an approximate first- or second-order stationary point of~\eqref{e:exac}. %We therefore consider an algorithm that only requires approximate stationarity in every iteration. -Hence, we argue that by gradually improving the stationarity precision and increasing the penalty weight $\b$ above, we can reach a stationary point of the main problem in~\eqref{prob:01}, as detailed in Section~\ref{sec:AL algorithm}. 
+Hence, we argue that by gradually improving the stationarity precision and increasing the penalty weight $\b$ above, we can reach a stationary point of the main problem in~\eqref{eq:minmax}, as detailed in Section~\ref{sec:AL algorithm}. \paragraph{{\textbf{Optimality conditions.}}} {First-order necessary optimality conditions} for \eqref{prob:01} are well-studied. {Indeed, $x\in \RR^d$ is a first-order stationary point of~\eqref{prob:01} if there exists $y\in \RR^m$ such that % %%% I am cutting this to save space. put it back if necessary (VC) %\begin{align} %%\begin{cases} %-\nabla f(x) - DA(x)^\top y \in \partial g(x), \qquad \,\, A(x) = 0, %%\end{cases} %\label{e:inclu1} %\end{align} % %where $DA(x)$ is the Jacobian of $A$ at $x$. Recalling \eqref{eq:Lagrangian}, we observe that \eqref{e:inclu1} is equivalent to \begin{align} %\begin{cases} -\nabla_x \mathcal{L}_\beta(x,y) \in \partial g(x),\qquad A(x) = 0, %\end{cases} \label{e:inclu2} \end{align} which is in turn the necessary optimality condition for \eqref{eq:minmax}.} %Note that \eqref{e:inclu2} is equivalent to %\begin{align} %\left[ %\begin{array}{c} %\nabla_x \L_\b(x,y) \\ %\nabla_y \L_\b(x,y) %\end{array} %\right] %\in %\left\{ %\left[ %\begin{array}{c} % v\\ %0 %\end{array} %\right] %: v\in \partial g(x) \right\} %, %\end{align} %which rephrases the stationarity condition in terms of the gradient of the duality gap of Program~\eqref{eq:Lagrangian}. %\editf{mfs: check approx. optimality conditions, how they apply in this setting.} Inspired by this, we say that $x$ is an $(\epsilon_f,\b)$ first-order stationary point of \eqref{eq:minmax} if {there exists a $y \in \RR^m$} such that \begin{align} %\begin{cases} \dist(-\nabla_x \mathcal{L}_\beta(x,y), \partial g(x)) \leq \epsilon_f, \qquad \| A(x) \| \leq \epsilon_f, %\end{cases} -\label{eq:inclu3app} +\label{eq:inclu3} \end{align} for $\epsilon_f\ge 0$. 
-In light of \eqref{eq:inclu3app}, a metric for evaluating the stationarity of a pair $(x,y)\in \RR^d\times \RR^m$ is +In light of \eqref{eq:inclu3}, a metric for evaluating the stationarity of a pair $(x,y)\in \RR^d\times \RR^m$ is \begin{align} \dist\left(-\nabla_x \mathcal{L}_\beta(x,y), \partial g(x) \right) + \|A(x)\| , -\label{eq:cvg metricapp} +\label{eq:cvg metric} \end{align} which we use as the first-order stopping criterion. %When $g=0$, it is also not difficult to verify that the expression in \eqref{eq:cvg metric} is the norm of the gradient of the duality gap in \eqref{eq:minmax}. As an example, for a convex set $\mathcal{X}\subset\RR^d$, suppose that $g = \delta_\mathcal{X}$ is the indicator function on $\mathcal{X}$. Let also $T_\mathcal{X}(x) \subseteq \RR^d$ denote the tangent cone to $\mathcal{X}$ at $x$, and with $P_{T_\mathcal{X}(x)}:\RR^d\rightarrow\RR^d$ we denote the orthogonal projection onto this tangent cone. Then, for $u\in\RR^d$, it is not difficult to verify that \begin{align}\label{eq:dist_subgrad} \dist\left(u, \partial g(x) \right) = \| P_{T_\mathcal{X}(x)}(u) \|. \end{align} % When $g = 0$, a first-order stationary point $x\in \RR^d$ of \eqref{prob:01} is also second-order stationary if % %\vspace{-.5cm} \begin{equation} \lambda_{\text{min}}(\nabla _{xx} \mathcal{L}_{\beta}(x,y))\ge 0, \end{equation} %\vspace{-.5cm} % where $\nabla_{xx}\mathcal{L}_\b$ is the Hessian of $\mathcal{L}_\b$ with respect to $x$, and $\lambda_{\text{min}}(\cdot)$ returns the smallest eigenvalue of its argument. Analogously, $x$ is an $(\epsilon_f, \epsilon_s,\b)$ second-order stationary point if, in addition to \eqref{eq:inclu3}, it holds that % %\vspace{-.5cm} \begin{equation}\label{eq:sec_opt} \lambda_{\text{min}}(\nabla _{xx} \mathcal{L}_{\beta}(x,y)) \ge -\epsilon_s, \end{equation} for $\epsilon_s\ge 0$. 
%\vspace{-.5cm} % Naturally, for second-order stationarity, we use $\lambda_{\text{min}}(\nabla _{xx} \mathcal{L}_{\beta}(x,y))$ as the stopping criterion. %\paragraph{{\textbf{Stationarity.}}}$x$ is an $(\epsilon_f,\b)$ {\textit{first-order stationary}} point of \eqref{eq:minmax} if {there exists a $y \in \RR^m$} such that %\begin{align} %%\begin{cases} %\dist(-\nabla_x \mathcal{L}_\beta(x,y), \partial g(x)) \leq \epsilon_f, \qquad \| A(x) \| \leq \epsilon_f, %%\end{cases} %\label{eq:inclu3} %\end{align} %for $\epsilon_f\ge 0$. %In light of \eqref{eq:inclu3}, a metric for evaluating the stationarity of a pair $(x,y)\in \RR^d\times \RR^m$ is %\begin{align} %\dist\left(-\nabla_x \mathcal{L}_\beta(x,y), \partial g(x) \right) + \|A(x)\| , %\label{eq:cvg metric} %\end{align} %which we use as the first-order stopping criterion. % %When $g = 0$, a first-order stationary point $x\in \RR^d$ of \eqref{prob:01} is also \textit{second-order stationary} if %% %%\vspace{-.5cm} %\begin{equation} %\lambda_{\text{min}}(\nabla _{xx} \mathcal{L}_{\beta}(x,y))\ge 0, %\end{equation} %%\vspace{-.5cm} %% %where $\nabla_{xx}\mathcal{L}_\b$ is the Hessian of $\mathcal{L}_\b$ with respect to $x$, and $\lambda_{\text{min}}(\cdot)$ returns the smallest eigenvalue of its argument. %{\textbf{{{Optimality conditions.}}} \paragraph{{\textbf{Smoothness lemma.}}} This next result controls the smoothness of $\L_\b(\cdot,y)$ for a fixed $y$. The proof is standard but nevertheless is included in Appendix~\ref{sec:proof of smoothness lemma} for completeness. 
\begin{lemma}[\textbf{smoothness}]\label{lem:smoothness} For fixed $y\in \RR^m$ and $\rho,\rho'\ge 0$, it holds that \begin{align} \| \nabla_x \mathcal{L}_{\beta}(x, y)- \nabla_x \mathcal{L}_{\beta}(x', y) \| \le \lambda_\b \|x-x'\|, \end{align} for every $x,x' \in \{ x'': \|x''\|\le \rho, \|A(x'') \|\le \rho'\}$, where \begin{align} \lambda_\beta \le \lambda_f + \sqrt{m}\lambda_A \|y\| + (\sqrt{m}\lambda_A\rho' + d \lambda'^2_A )\b =: \lambda_f + \sqrt{m}\lambda_A \|y\| + \lambda''(A,\rho,\rho') \b. \label{eq:smoothness of Lagrangian} \end{align} Above, $\lambda_f,\lambda_A$ were defined in (\ref{eq:smoothness basic}) and \begin{align} \lambda'_A := \max_{\|x\|\le \rho}\|DA(x)\|. \end{align} \end{lemma}