diff --git a/ICML19/Reviews/iALM_with_applications_to_sdps_nadav.pdf b/ICML19/Reviews/iALM_with_applications_to_sdps_nadav.pdf new file mode 100644 index 0000000..5f136ae Binary files /dev/null and b/ICML19/Reviews/iALM_with_applications_to_sdps_nadav.pdf differ diff --git a/NeurIPS 19/main.aux b/NeurIPS 19/main.aux index 727cc09..2c395e4 100644 --- a/NeurIPS 19/main.aux +++ b/NeurIPS 19/main.aux @@ -1,194 +1,193 @@ \relax \providecommand\hyper@newdestlabel[2]{} \providecommand\HyperFirstAtBeginDocument{\AtBeginDocument} \HyperFirstAtBeginDocument{\ifx\hyper@anchor\@undefined \global\let\oldcontentsline\contentsline \gdef\contentsline#1#2#3#4{\oldcontentsline{#1}{#2}{#3}} \global\let\oldnewlabel\newlabel \gdef\newlabel#1#2{\newlabelxx{#1}#2} \gdef\newlabelxx#1#2#3#4#5#6{\oldnewlabel{#1}{{#2}{#3}}} \AtEndDocument{\ifx\hyper@anchor\@undefined \let\contentsline\oldcontentsline \let\newlabel\oldnewlabel \fi} \fi} \global\let\hyper@last\relax \gdef\HyperFirstAtBeginDocument#1{#1} \providecommand\HyField@AuxAddToFields[1]{} \providecommand\HyField@AuxAddToCoFields[2]{} \citation{khot2011grothendieck,lovasz2003semidefinite,zhao1998semidefinite} \citation{mossel2015consistency,song2007dependence} \citation{singer2011angular,singer2011three} \citation{raghavendra2008optimal} \@writefile{toc}{\contentsline {section}{\numberline {1}Introduction}{1}{section.1}\protected@file@percent } \newlabel{intro}{{1}{1}{Introduction}{section.1}{}} \newlabel{prob:01}{{1}{1}{Introduction}{equation.1.1}{}} \@writefile{toc}{\contentsline {paragraph}{Vignette: Burer-Monteiro splitting.}{1}{section*.1}\protected@file@percent } \newlabel{eq:sdp}{{2}{1}{Vignette: Burer-Monteiro splitting}{equation.1.2}{}} \citation{yurtsever2018conditional} \citation{burer2003nonlinear,burer2005local} \citation{pataki1998rank,barvinok1995problems} \citation{waldspurger2018rank} \citation{burer2005local} \citation{boumal2016non} \citation{burer2003nonlinear,burer2005local,kulis2007fast} 
\citation{parikh2014proximal} \citation{nocedal2006numerical} \citation{nedelcu2014computational,lan2016iteration,xu2017inexact} \newlabel{prob:nc}{{3}{2}{Vignette: Burer-Monteiro splitting}{equation.1.3}{}} \newlabel{sec:preliminaries}{{2}{2}{Preliminaries}{section.2}{}} \@writefile{toc}{\contentsline {section}{\numberline {2}Preliminaries }{2}{section.2}\protected@file@percent } \@writefile{toc}{\contentsline {paragraph}{\textbf {{Notation.}}}{2}{section*.2}\protected@file@percent } \citation{hestenes1969multiplier,powell1969method} \citation{bertsekas1982constrained,birgin2014practical} \newlabel{eq:indicator}{{4}{3}{\textbf {{Notation.}}}{equation.2.4}{}} \newlabel{eq:smoothness basic}{{5}{3}{\textbf {{Notation.}}}{equation.2.5}{}} \newlabel{eq:minmax}{{6}{3}{\textbf {{Notation.}}}{equation.2.6}{}} \newlabel{eq:Lagrangian}{{7}{3}{\textbf {{Notation.}}}{equation.2.7}{}} \newlabel{e:exac}{{8}{3}{\textbf {{Notation.}}}{equation.2.8}{}} \newlabel{sec:opt cnds}{{2}{3}{\textbf {{Notation.}}}{equation.2.8}{}} \newlabel{e:inclu1}{{9}{3}{\textbf {{Notation.}}}{equation.2.9}{}} \newlabel{e:inclu2}{{10}{3}{\textbf {{Notation.}}}{equation.2.10}{}} \newlabel{eq:inclu3}{{11}{3}{\textbf {{Notation.}}}{equation.2.11}{}} \newlabel{eq:cvg metric}{{12}{3}{\textbf {{Notation.}}}{equation.2.12}{}} \citation{bertsekas1976penalty} \newlabel{eq:dist_subgrad}{{13}{4}{Vignette: Burer-Monteiro splitting}{equation.2.13}{}} \newlabel{eq:sec_opt}{{15}{4}{Vignette: Burer-Monteiro splitting}{equation.2.15}{}} \@writefile{toc}{\contentsline {paragraph}{{\textbf {Smoothness lemma.}}}{4}{section*.3}\protected@file@percent } \newlabel{lem:smoothness}{{2.1}{4}{\textbf {smoothness}}{theorem.2.1}{}} \newlabel{eq:smoothness of Lagrangian}{{17}{4}{\textbf {smoothness}}{equation.2.17}{}} \newlabel{sec:AL algorithm}{{3}{4}{Algorithm}{section.3}{}} \@writefile{toc}{\contentsline {section}{\numberline {3}Algorithm }{4}{section.3}\protected@file@percent } \newlabel{sec:cvg rate}{{4}{4}{Convergence 
Rate}{section.4}{}} \@writefile{toc}{\contentsline {section}{\numberline {4}Convergence Rate }{4}{section.4}\protected@file@percent } \@writefile{loa}{\contentsline {algorithm}{\numberline {1}{\ignorespaces Inexact ALM for solving\nobreakspace {}\textup {\hbox {\mathsurround \z@ \normalfont (\ignorespaces \ref {prob:01}\unskip \@@italiccorr )}}}}{5}{algorithm.1}\protected@file@percent } \newlabel{Algo:2}{{1}{5}{Algorithm}{algorithm.1}{}} \newlabel{thm:main}{{4.1}{5}{Convergence Rate}{theorem.4.1}{}} \newlabel{eq:defn_restricted_lipsichtz}{{19}{5}{Convergence Rate}{equation.4.19}{}} \newlabel{eq:regularity}{{20}{5}{Convergence Rate}{equation.4.20}{}} \newlabel{eq:stat_prec_first}{{21}{5}{Convergence Rate}{equation.4.21}{}} \citation{karimi2016linear} \citation{xu2017globally} \citation{bolte2018nonconvex} \citation{bolte2018nonconvex} \citation{flores2012complete} \citation{ghadimi2016accelerated} \citation{cartis2012complexity} \citation{ghadimi2016accelerated} \citation{nesterov1983method} \citation{ghadimi2016accelerated} \@writefile{toc}{\contentsline {paragraph}{Penalty method.}{6}{section*.4}\protected@file@percent } \newlabel{sec:first-o-opt}{{4.1}{6}{First-Order Optimality}{subsection.4.1}{}} \@writefile{toc}{\contentsline {subsection}{\numberline {4.1}First-Order Optimality }{6}{subsection.4.1}\protected@file@percent } \citation{ghadimi2016accelerated} \citation{ghadimi2016accelerated} \citation{cartis2012complexity} \citation{nouiehed2018convergence} \citation{cartis2012complexity} \citation{cartis2012complexity} \citation{cartis2012complexity} \citation{cartis2012complexity} \newlabel{eq:iter_1storder}{{24}{7}{First-Order Optimality}{equation.4.24}{}} \newlabel{cor:first}{{4.2}{7}{First-Order Optimality}{theorem.4.2}{}} \newlabel{sec:second-o-opt}{{4.2}{7}{Second-Order Optimality}{subsection.4.2}{}} \@writefile{toc}{\contentsline {subsection}{\numberline {4.2}Second-Order Optimality }{7}{subsection.4.2}\protected@file@percent } 
\newlabel{eq:sec_inn_comp}{{26}{7}{Second-Order Optimality}{equation.4.26}{}} \newlabel{cor:second}{{4.3}{7}{Second-Order Optimality}{theorem.4.3}{}} \citation{hestenes1969multiplier,powell1969method} \citation{lan2016iteration,nedelcu2014computational,tran2018adaptive,xu2017inexact} \citation{bertsekas2014constrained} \citation{fernandez2012local} \citation{birgin2016evaluation} \citation{cartis2018optimality} \citation{birgin2016evaluation,cartis2018optimality} \citation{cartis2011evaluation} \citation{bolte2018nonconvex} \citation{bolte2017error} \citation{clason2018acceleration} \citation{chambolle2011first} \citation{burer2003nonlinear,burer2005local} \citation{bhojanapalli2016dropping,park2016provable} \citation{bhojanapalli2018smoothed} \citation{boumal2014manopt,boumal2016global,boumal2016non} -\newlabel{sec:related work}{{5}{8}{Related works}{section.5}{}} -\@writefile{toc}{\contentsline {section}{\numberline {5}Related works }{8}{section.5}\protected@file@percent } -\newlabel{eq:pen_sdp}{{29}{8}{Related works}{equation.5.29}{}} -\newlabel{eq:pen_nc}{{30}{8}{Related works}{equation.5.30}{}} +\newlabel{sec:related work}{{5}{8}{Related Work}{section.5}{}} +\@writefile{toc}{\contentsline {section}{\numberline {5}Related Work }{8}{section.5}\protected@file@percent } +\newlabel{eq:pen_sdp}{{29}{8}{Related Work}{equation.5.29}{}} +\newlabel{eq:pen_nc}{{30}{8}{Related Work}{equation.5.30}{}} \citation{boumal2016global} \citation{nesterov2009primal,yurtsever2015universal,yurtsever2018conditional} \citation{jaggi2013revisiting} \citation{dai2002convergence,mascarenhas2004bfgs} \citation{cartis2012complexity} \citation{Peng2007} \citation{kulis2007fast} \citation{fletcher2013practical} \newlabel{sec:experiments}{{6}{9}{Numerical Evidence}{section.6}{}} \@writefile{toc}{\contentsline {section}{\numberline {6}Numerical Evidence }{9}{section.6}\protected@file@percent } \@writefile{toc}{\contentsline {subsection}{\numberline 
{6.1}Clustering}{9}{subsection.6.1}\protected@file@percent } \newlabel{eq:sdp_svx}{{31}{9}{Clustering}{equation.6.31}{}} \newlabel{eq:nc_cluster}{{32}{9}{Clustering}{equation.6.32}{}} \citation{yurtsever2018conditional} \citation{yang2015sdpnal} \citation{mixon2016clustering} \citation{xiao2017/online} \citation{kulis2007fast} \citation{tepper2018clustering} \citation{kulis2007fast} \citation{tepper2018clustering} \citation{chen2001atomic,candes2008introduction,arora2018compressed} \citation{tran2018adaptive,chambolle2011first} \citation{beck2009fast} -\@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces Convergence of different algorithms for k-Means clustering with fashion MNIST dataset. Here, we set the rank to be equal to 20 for the non-convex approaches. The solution rank for the template\nobreakspace {}\textup {\hbox {\mathsurround \z@ \normalfont (\ignorespaces \ref {eq:sdp_svx}\unskip \@@italiccorr )}} is known and it is equal to number of clusters $k$ (Theorem 1. \cite {kulis2007fast}). As discussed in\nobreakspace {}\cite {tepper2018clustering}, setting rank $r>k$ leads more accurate reconstruction in expense of speed. Therefore, we set the rank to 20.}}{10}{figure.1}\protected@file@percent } -\newlabel{fig:clustering}{{1}{10}{Convergence of different algorithms for k-Means clustering with fashion MNIST dataset. Here, we set the rank to be equal to 20 for the non-convex approaches. The solution rank for the template~\eqref {eq:sdp_svx} is known and it is equal to number of clusters $k$ (Theorem 1. \cite {kulis2007fast}). As discussed in~\cite {tepper2018clustering}, setting rank $r>k$ leads more accurate reconstruction in expense of speed. Therefore, we set the rank to 20}{figure.1}{}} +\@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces Convergence of different algorithms for clustering with fashion-MNIST dataset. Here, we set the rank as $r=20$ for the nonconvex approaches. 
The solution rank for the template\nobreakspace {}\textup {\hbox {\mathsurround \z@ \normalfont (\ignorespaces \ref {eq:sdp_svx}\unskip \@@italiccorr )}} is the number of clusters $s$ \cite [Theorem 1]{kulis2007fast}. However, as discussed in\nobreakspace {}\cite {tepper2018clustering}, setting rank $r>s$ leads more accurate reconstruction at the expense of speed, hence our choice of $r=20$. }}{10}{figure.1}\protected@file@percent } +\newlabel{fig:clustering}{{1}{10}{Convergence of different algorithms for clustering with fashion-MNIST dataset. Here, we set the rank as $r=20$ for the nonconvex approaches. The solution rank for the template~\eqref {eq:sdp_svx} is the number of clusters $s$ \cite [Theorem 1]{kulis2007fast}. However, as discussed in~\cite {tepper2018clustering}, setting rank $r>s$ leads more accurate reconstruction at the expense of speed, hence our choice of $r=20$}{figure.1}{}} \@writefile{toc}{\contentsline {subsection}{\numberline {6.2}Basis Pursuit}{10}{subsection.6.2}\protected@file@percent } \newlabel{eq:bp_main}{{34}{10}{Basis Pursuit}{equation.6.34}{}} \newlabel{eq:bp-equiv}{{35}{10}{Basis Pursuit}{equation.6.35}{}} \citation{obozinski2011group} \bibstyle{abbrv} \bibdata{bibliography.bib} -\@writefile{lof}{\contentsline {figure}{\numberline {2}{\ignorespaces Convergence with different subsolvers for the aforementioned non-convex relaxation. }}{11}{figure.2}\protected@file@percent } -\newlabel{fig:bp1}{{2}{11}{Convergence with different subsolvers for the aforementioned non-convex relaxation}{figure.2}{}} +\@writefile{lof}{\contentsline {figure}{\numberline {2}{\ignorespaces Convergence with different subsolvers for the aforementioned nonconvex relaxation. 
}}{11}{figure.2}\protected@file@percent } +\newlabel{fig:bp1}{{2}{11}{Convergence with different subsolvers for the aforementioned nonconvex relaxation}{figure.2}{}} \@writefile{toc}{\contentsline {paragraph}{Discussion:}{11}{section*.5}\protected@file@percent } -\@writefile{toc}{\contentsline {subsection}{\numberline {6.3}Adversarial Denoising with GANs}{11}{subsection.6.3}\protected@file@percent } \newlabel{sec:theory}{{A}{12}{Proof of Theorem \ref {thm:main}}{appendix.A}{}} \@writefile{toc}{\contentsline {section}{\numberline {A}Proof of Theorem \ref {thm:main} }{12}{appendix.A}\protected@file@percent } \newlabel{eq:before_restriction}{{38}{12}{Proof of Theorem \ref {thm:main}}{equation.A.38}{}} \newlabel{eq:restrited_pre}{{39}{12}{Proof of Theorem \ref {thm:main}}{equation.A.39}{}} \newlabel{eq:before_dual_controlled}{{40}{12}{Proof of Theorem \ref {thm:main}}{equation.A.40}{}} \newlabel{eq:dual growth}{{41}{12}{Proof of Theorem \ref {thm:main}}{equation.A.41}{}} \newlabel{eq:cvg metric part 2}{{43}{12}{Proof of Theorem \ref {thm:main}}{equation.A.43}{}} \newlabel{eq:cvg metric part 1 brk down}{{44}{13}{Proof of Theorem \ref {thm:main}}{equation.A.44}{}} \newlabel{eq:part_1_2}{{45}{13}{Proof of Theorem \ref {thm:main}}{equation.A.45}{}} \newlabel{eq:cvg metric part 1}{{46}{13}{Proof of Theorem \ref {thm:main}}{equation.A.46}{}} \newlabel{eq:sec}{{49}{13}{Proof of Theorem \ref {thm:main}}{equation.A.49}{}} \@writefile{toc}{\contentsline {section}{\numberline {B}Proof of Corollary\nobreakspace {}\ref {cor:first}}{14}{appendix.B}\protected@file@percent } \newlabel{eq:acc_to_b}{{50}{14}{Proof of Corollary~\ref {cor:first}}{equation.B.50}{}} \newlabel{eq: tk_bound}{{51}{14}{Proof of Corollary~\ref {cor:first}}{equation.B.51}{}} \newlabel{sec:proof of smoothness lemma}{{C}{14}{Proof of Lemma \ref {lem:smoothness}}{appendix.C}{}} \@writefile{toc}{\contentsline {section}{\numberline {C}Proof of Lemma \ref {lem:smoothness}}{14}{appendix.C}\protected@file@percent } 
\newlabel{sec:verification2}{{D}{15}{Clustering}{appendix.D}{}} \@writefile{toc}{\contentsline {section}{\numberline {D}Clustering }{15}{appendix.D}\protected@file@percent } \newlabel{eq:Jacobian clustering}{{59}{15}{Clustering}{equation.D.59}{}} \newlabel{eq:exp-subgrad-cluster}{{61}{15}{Clustering}{equation.D.61}{}} \citation{obozinski2011group} \newlabel{sec:verification1}{{E}{16}{Basis Pursuit}{appendix.E}{}} \@writefile{toc}{\contentsline {section}{\numberline {E}Basis Pursuit }{16}{appendix.E}\protected@file@percent } \newlabel{eq:jacob-bp}{{63}{16}{Basis Pursuit}{equation.E.63}{}} \newlabel{eq:cnd-bp-pre}{{64}{16}{Basis Pursuit}{equation.E.64}{}} \@writefile{toc}{\contentsline {paragraph}{Discussion}{16}{section*.7}\protected@file@percent } \citation{Ilyas2017} \citation{Goodfellow2014} \citation{ge2016efficient} \citation{boumal2016non} \citation{boumal2016global} \citation{ge2016efficient} \citation{manopt} \@writefile{toc}{\contentsline {subsection}{\numberline {E.1}$\ell _\infty $ Denoising with a Generative Prior}{17}{subsection.E.1}\protected@file@percent } \newlabel{fig:comparison_fab}{{E.1}{17}{$\ell _\infty $ Denoising with a Generative Prior}{equation.E.67}{}} \@writefile{lof}{\contentsline {figure}{\numberline {3}{\ignorespaces Augmented Lagrangian vs Adam for $\ell _\infty $ denoising (left). 
$\ell _2$ vs $\ell _\infty $ denoising as defense against adversarial examples}}{17}{figure.3}\protected@file@percent } \@writefile{toc}{\contentsline {subsection}{\numberline {E.2}Generalized Eigenvalue Problem}{17}{subsection.E.2}\protected@file@percent } \newlabel{eq:eig}{{68}{17}{Generalized Eigenvalue Problem}{equation.E.68}{}} \newlabel{eq:eig-sdp}{{69}{17}{Generalized Eigenvalue Problem}{equation.E.69}{}} \newlabel{eq:eig-equiv}{{70}{17}{Generalized Eigenvalue Problem}{equation.E.70}{}} \@writefile{lof}{\contentsline {figure}{\numberline {4}{\ignorespaces {\it {(Top)}} Objective convergence for calculating top generalized eigenvalue and eigenvector of $B$ and $C$. {\it {(Bottom)}} Eigenvalue structure of the matrices. For (i),(ii) and (iii), $C$ is positive semidefinite; for (iv), (v) and (vi), $C$ contains negative eigenvalues. {[(i): Generated by taking symmetric part of iid Gaussian matrix. (ii): Generated by randomly rotating diag($1^{-p}, 2^{-p}, \cdots , 1000^{-p}$)($p=1$). (iii): Generated by randomly rotating diag($10^{-p}, 10^{-2p}, \cdots , 10^{-1000p}$)($p=0.0025$).]} }}{18}{figure.4}\protected@file@percent } \newlabel{fig:geig1}{{4}{18}{{\it {(Top)}} Objective convergence for calculating top generalized eigenvalue and eigenvector of $B$ and $C$. {\it {(Bottom)}} Eigenvalue structure of the matrices. For (i),(ii) and (iii), $C$ is positive semidefinite; for (iv), (v) and (vi), $C$ contains negative eigenvalues. {[(i): Generated by taking symmetric part of iid Gaussian matrix. (ii): Generated by randomly rotating diag($1^{-p}, 2^{-p}, \cdots , 1000^{-p}$)($p=1$). 
(iii): Generated by randomly rotating diag($10^{-p}, 10^{-2p}, \cdots , 10^{-1000p}$)($p=0.0025$).]}}{figure.4}{}} diff --git a/NeurIPS 19/main.out b/NeurIPS 19/main.out index 4bd5aa9..4f24a3b 100644 --- a/NeurIPS 19/main.out +++ b/NeurIPS 19/main.out @@ -1,18 +1,17 @@ \BOOKMARK [1][-]{section.1}{Introduction}{}% 1 \BOOKMARK [1][-]{section.2}{Preliminaries }{}% 2 \BOOKMARK [1][-]{section.3}{Algorithm }{}% 3 \BOOKMARK [1][-]{section.4}{Convergence Rate }{}% 4 \BOOKMARK [2][-]{subsection.4.1}{First-Order Optimality }{section.4}% 5 \BOOKMARK [2][-]{subsection.4.2}{Second-Order Optimality }{section.4}% 6 -\BOOKMARK [1][-]{section.5}{Related works }{}% 7 +\BOOKMARK [1][-]{section.5}{Related Work }{}% 7 \BOOKMARK [1][-]{section.6}{Numerical Evidence }{}% 8 \BOOKMARK [2][-]{subsection.6.1}{Clustering}{section.6}% 9 \BOOKMARK [2][-]{subsection.6.2}{Basis Pursuit}{section.6}% 10 -\BOOKMARK [2][-]{subsection.6.3}{Adversarial Denoising with GANs}{section.6}% 11 -\BOOKMARK [1][-]{appendix.A}{Proof of Theorem 4.1 }{}% 12 -\BOOKMARK [1][-]{appendix.B}{Proof of Corollary 4.2}{}% 13 -\BOOKMARK [1][-]{appendix.C}{Proof of Lemma 2.1}{}% 14 -\BOOKMARK [1][-]{appendix.D}{Clustering }{}% 15 -\BOOKMARK [1][-]{appendix.E}{Basis Pursuit }{}% 16 -\BOOKMARK [2][-]{subsection.E.1}{ Denoising with a Generative Prior}{appendix.E}% 17 -\BOOKMARK [2][-]{subsection.E.2}{Generalized Eigenvalue Problem}{appendix.E}% 18 +\BOOKMARK [1][-]{appendix.A}{Proof of Theorem 4.1 }{}% 11 +\BOOKMARK [1][-]{appendix.B}{Proof of Corollary 4.2}{}% 12 +\BOOKMARK [1][-]{appendix.C}{Proof of Lemma 2.1}{}% 13 +\BOOKMARK [1][-]{appendix.D}{Clustering }{}% 14 +\BOOKMARK [1][-]{appendix.E}{Basis Pursuit }{}% 15 +\BOOKMARK [2][-]{subsection.E.1}{ Denoising with a Generative Prior}{appendix.E}% 16 +\BOOKMARK [2][-]{subsection.E.2}{Generalized Eigenvalue Problem}{appendix.E}% 17 diff --git a/NeurIPS 19/main.pdf b/NeurIPS 19/main.pdf index 5830986..1e9f4b6 100644 Binary files a/NeurIPS 19/main.pdf and b/NeurIPS 19/main.pdf 
differ diff --git a/NeurIPS 19/main.synctex.gz b/NeurIPS 19/main.synctex.gz index 70b500e..3fe662c 100644 Binary files a/NeurIPS 19/main.synctex.gz and b/NeurIPS 19/main.synctex.gz differ diff --git a/NeurIPS 19/sections/appendix.tex b/NeurIPS 19/sections/appendix.tex index ca50708..8ade199 100644 --- a/NeurIPS 19/sections/appendix.tex +++ b/NeurIPS 19/sections/appendix.tex @@ -1,451 +1,450 @@ %!TEX root = ../main.tex \section{Proof of Corollary~\ref{cor:first}} %Denote $B_k = \dist( -\nabla_x \L_{\b_{k-1}}(x_k,y_k),\partial g(x_k)) + \| A(x_k)\|$ for every $k$. Using the convergence proof of the outer algorithm, we have the following bound Let $K$ denote the number of (outer) iterations of Algorithm~\ref{Algo:2} and let $\epsilon_{f}$ denote the desired accuracy of Algorithm~\ref{Algo:2}, see~(\ref{eq:inclu3}). Recalling Theorem~\ref{thm:main}, we can then write that \begin{equation} \epsilon_{f} = \frac{Q}{\b_{K}}, \label{eq:acc_to_b} \end{equation} or, equivalently, $\b_{K} = Q/\epsilon_{f}$. %where $K$ denotes the last outer iterate and $\epsilon$ is the final accuracy we would like to get for the optimality conditions given in~\eqref{eq:inclu3}. We now count the number of total (inner) iterations $T$ of Algorithm~\ref{Algo:2} to reach the accuracy $\epsilon_{f}$. From \eqref{eq:smoothness of Lagrangian} and for sufficiently large $k$, recall that $\lambda_{\b_k}\le \lambda'' \b_k$ is the smoothness parameter of the augmented Lagrangian. 
Then, from \eqref{eq:iter_1storder} and by summing over the outer iterations, we bound the total number of (inner) iterations of Algorithm~\ref{Algo:2} as \begin{align}\label{eq: tk_bound} T &= \sum_{k=1}^K\mathcal{O}\left ( \frac{\lambda_{\beta_{k-1}}^2 \rho'^2 }{\epsilon_k} \right) \nonumber\\ & = \sum_{k=1}^K\mathcal{O}\left (\beta_{k-1}^3 \rho'^2 \right) \qquad \text{(Step 1 of Algorithm \ref{Algo:2})} \nonumber\\ & \leq \mathcal{O} \left(K\beta_{K-1}^3 \rho'^2 \right) \qquad \left( \{\b_k\}_k \text{ is increasing} \right) \nonumber\\ & \le \mathcal{O}\left( \frac{K Q^{{3}} \rho'^2}{\epsilon_{f}^{{3}}} \right). \qquad \text{(see \eqref{eq:acc_to_b})} \end{align} In addition, if we specify $\beta_k=b^k$ for all $k$, we can further refine $T$. Indeed, \begin{equation} \beta_K = b^K~~ \Longrightarrow~~ K = \log_b \left( \frac{Q}{\epsilon_f} \right), \end{equation} which, after substituting into~\eqref{eq: tk_bound} gives the final bound in Corollary~\ref{cor:first}. %\begin{equation} %T \leq \mathcal{O}\left( \frac{Q^{\frac{3}{2}+\frac{1}{2b}} x_{\max}^2}{\epsilon_f^{\frac{3}{2}+\frac{1}{2b}}} \right), %\end{equation} %which completes the proof of Corollary~\ref{cor:first}. %\section{Analysis of different rates for $\beta_k$ and $\epsilon_k$} % %We check the iteration complexity analysis by decoupling $\beta_k$ and $\epsilon_k$. %\begin{equation} %\beta_k = k^b, ~~~~ \epsilon_k = k^{-e}. %\end{equation} % %By denoting $B_k = \dist( -\nabla_x \L_{\b_{k-1}}(x_k,y_k),\partial g(x_k)) + \| A(x_k)\|$, the algorithm bound becomes, % %\begin{equation} %B_k \leq \frac{1}{\beta_k} + \epsilon_k = k^{-b} + k^{-e}. %\end{equation} % %Total iteration number is % %\begin{equation} %T_K = \sum_{k=1}^K \frac{\beta_k^2}{\epsilon_k} \leq K^{2b+e+1}. %\end{equation} % %We now consider two different relations between $b$ and $e$ to see what is going on.
% %\textbf{Case 1:} $b\geq e$: % %Bound for the algorithm: % %\begin{equation} %B_k \leq \frac{1}{\beta_k} + \epsilon_k = k^{-b} + k^{-e} \leq \frac{2}{k^e} = \epsilon, %\end{equation} % %which gives the relation $K = \left( \frac{2}{\epsilon}\right)^{1/e}$. %Writing down the total number of iterations and plugging in $K$, % %\begin{equation} %T_K = \sum_{k=1}^K \frac{\beta_k^2}{\epsilon_k} \leq K^{2b+e+1} \leq \left(\frac{2}{\epsilon}\right)^{\frac{2b}{e}+1+\frac{1}{e}}. %\end{equation} % %To get the least number of total iterations for a given accuracy $\epsilon$, one needs to pick $e$ as large as possible and $b$ as small as possible. %Since in this case we had $b\geq e$, this suggests picking $b=e$ for the optimal iteration complexity. % %\textbf{Case 2:} $b\leq e$: % %Same calculations yield the following bound on the total number of iterations: % %\begin{equation} %T_K = \sum_{k=1}^K \frac{\beta_k^2}{\epsilon_k} \leq K^{2b+e+1} \leq \left(\frac{2}{\epsilon}\right)^{2+\frac{e}{b}+\frac{1}{b}}. %\end{equation} % %Given that $b\leq e$, the bound suggests picking $e$ as small as possible and $b$ as big as possible. % %To minimize the total number of iterations in both cases with flexible $b$ and $e$, the bounds suggest to pick $b=e=\alpha$ and take this value to be as large as possible. \section{Proof of Lemma \ref{lem:smoothness}\label{sec:proof of smoothness lemma}} Note that \begin{align} \mathcal{L}_{\beta}(x,y) = f(x) + \sum_{i=1}^m y_i A_i (x) + \frac{\b}{2} \sum_{i=1}^m (A_i(x))^2, \end{align} which implies that \begin{align} & \nabla_x \mathcal{L}_\beta(x,y) \nonumber\\ & = \nabla f(x) + \sum_{i=1}^m y_i \nabla A_i(x) + \frac{\b}{2} \sum_{i=1}^m A_i(x) \nabla A_i(x) \nonumber\\ & = \nabla f(x) + DA(x)^\top y + \b DA(x)^\top A(x), \end{align} where $DA(x)$ is the Jacobian of $A$ at $x$. 
By taking another derivative with respect to $x$, we reach \begin{align} \nabla^2_x \mathcal{L}_\beta(x,y) & = \nabla^2 f(x) + \sum_{i=1}^m \left( y_i + \b A_i(x) \right) \nabla^2 A_i(x) \nonumber\\ & \qquad +\b \sum_{i=1}^m \nabla A_i(x) \nabla A_i(x)^\top. \end{align} It follows that \begin{align} & \|\nabla_x^2 \mathcal{L}_\beta(x,y)\|\nonumber\\ & \le \| \nabla^2 f(x) \| + \max_i \| \nabla^2 A_i(x)\| \left (\|y\|_1+\b \|A(x)\|_1 \right) \nonumber\\ & \qquad +\beta\sum_{i=1}^m \|\nabla A_i(x)\|^2 \nonumber\\ & \le \lambda_h+ \sqrt{m} \lambda_A \left (\|y\|+\b \|A(x)\| \right) + \b \|DA(x)\|^2_F. \end{align} -For every $x$ such that $\|A(x)\|\le \rho$ and $\|x\|\le \rho'$, we conclude that +For every $x$ such that $\|x\|\le \rho$ and $\|A(x)\|\le \rho'$, we conclude that \begin{align} \|\nabla_x^2 \mathcal{L}_\beta(x,y)\| -& \le \lambda_f + \sqrt{m}\lambda_A \left(\|y\| + \b\rho \right) \nonumber\\ -& \qquad + \b \max_{\|x\|\le \rho'}\|DA(x)\|_F^2, +& \le \lambda_f + \sqrt{m}\lambda_A \left(\|y\| + \b\rho' \right)+ \b \max_{\|x\|\le \rho}\|DA(x)\|_F^2, \end{align} which completes the proof of Lemma \ref{lem:smoothness}. %\section{Proof of Lemma \ref{lem:11}\label{sec:proof of descent lemma}} % %Throughout, let %\begin{align} %G = G_{\b,\g}(x,y) = \frac{x-x^+}{\g}, %\end{align} %for short. %Suppose that $\|A(x)\|\le \rho$, $\|x\|\le \rho$, and similarly $\|A(x^+)\|\le \rho$, $\|x^+\|\le \rho'$. An application of Lemma \ref{lem:smoothness} yields that %\begin{align} %\L_\b(x^+,y)+g(x^+) & \le \L_\b(x,y)+ \langle x^+-x,\nabla_x \L_\b(x,y) \rangle %+ \frac{\lambda_\b}{2} \|x^+ - x\|^2 + g(x^+) \nonumber\\ %& = \L_\b(x,y)-\g \langle G ,\nabla_x \L_\b(x,y) \rangle %+ \frac{\g^2 \lambda_\b }{2} \|G\|^2 + g(x^+) %\label{eq:descent pr 1} %\end{align} %Since $x^+ = P_g(x - \g \nabla_x \L_\b(x,y))$, we also have that %\begin{align} %\g (G - \nabla_x \L_\b(x,y)) = \xi \in \partial g(x^+).
%\label{eq:opt of prox} %\end{align} %By combining (\ref{eq:descent pr 1},\ref{eq:opt of prox}), we find that %\begin{align} %\L_\b(x^+,y)+g(x^+) & %\le \L_\b(x,y) -\g \|G\|^2 + \g \langle G, \xi \rangle + \frac{\g^2 \lambda_\b}{2}\|G\|^2 + g(x^+) \nonumber\\ %& = \L_\b(x,y) -\g \|G\|^2 + \langle x- x^+ , \xi \rangle + \frac{\g^2 \lambda_\b}{2}\|G\|^2 + g(x^+) \nonumber\\ %& \le \L_\b(x,y) + g(x) - \g\left( 1-\frac{\g\lambda_\b}{2}\right) \|G\|^2, %\end{align} %where the last line above uses the convexity of $g$. Recalling that $\g\le 1/\lambda_\b$ completes the proof of Lemma \ref{lem:11}. % % %\section{Proof of Lemma \ref{lem:eval Lipsc cte}\label{sec:proof of linesearch}} % % %Recalling $x^+_{\gamma}$ in \eqref{eq:defn of gamma line search}, we note that %\begin{equation} %x^+_{\gamma} - x +\gamma \nabla_x \mathcal{L}_\beta(x,y) = -\xi \in -\partial g (x^+_{\gamma}). %\label{eq:optimality of uplus} %\end{equation} %Lastly, $\gamma$ by definition in \eqref{eq:defn of gamma line search} satisfies %\begin{align} %& \mathcal{L}_{\beta}(x^+_{\gamma},y) + g(x_\g^+) \nonumber\\ % & \le \mathcal{L}_\beta(x,y) + \g \left\langle %x^+_{\gamma} - x , \nabla_x \mathcal{L}_\beta (x,y) %\right\rangle + \frac{1}{2\gamma}\|x^+_{\gamma} - x\|^2 %+ g(x_\g^+) %\nonumber\\ %& = \mathcal{L}_\beta(x,y) + \left\langle %x - x^+_{\gamma} ,\xi %\right\rangle %- \frac{1}{2\gamma}\|x^+_{\gamma} - x\|^2 + g(x_\g^+)\nonumber\\ %& \le \mathcal{L}_\beta(x,y) %- \frac{1}{2\gamma}\|x^+_{\gamma} - x\|^2 + g(x) - g(x^+_\g) %\qquad (\text{convexity of } g) \nonumber\\ %& = \mathcal{L}_\beta(x,y) - \frac{\gamma}{2} \|G_{\beta,\gamma}(x,y)\|^2 +g(x) - g(x^+_\g), %\qquad \text{(see Definition \ref{def:grad map})} %\end{align} %which completes the proof of Lemma \ref{lem:eval Lipsc cte}. % % % \section{Clustering \label{sec:verification2}} We only verify the condition in~\eqref{eq:regularity}. 
Note that \begin{align} A(x) = VV^\top \mathbf{1}- \mathbf{1}, \end{align} \begin{align} DA(x) & = \left[ \begin{array}{cccc} w_{1,1} x_1^\top & \cdots & w_{1,n} x_{1}^\top\\ %x_2^\top p& \cdots & x_{2}^\top\\ \vdots\\ w_{n,1}x_{n}^\top & \cdots & w_{n,n} x_{n}^\top \end{array} \right] \nonumber\\ & = \left[ \begin{array}{ccc} V & \cdots & V \end{array} \right] + \left[ \begin{array}{ccc} x_1^\top & \\ & \ddots & \\ & & x_n^\top \end{array} \right], \label{eq:Jacobian clustering} \end{align} %where $I_n\in\RR^{n\times n}$ is the identity matrix, where $w_{i,i}=2$ and $w_{i,j}=1$ for $i\ne j$. In the last line above, $n$ copies of $V$ appear and the last matrix above is block-diagonal. For $x_k$, define $V_k$ as in the example and let $x_{k,i}$ be the $i$th row of $V_k$. Consequently, \begin{align} DA(x_k)^\top A(x_k) & = \left[ \begin{array}{c} (V_k^\top V_k - I_n) V_k^\top \mathbf{1}\\ \vdots\\ (V_k^\top V_k - I_n) V_k^\top \mathbf{1} \end{array} \right] \nonumber\\ & \qquad + \left[ \begin{array}{c} x_{k,1} (V_k V_k^\top \mathbf{1}- \mathbf{1})_1 \\ \vdots \\ x_{k,n} (V_k V_k^\top \mathbf{1}- \mathbf{1})_n \end{array} \right], \end{align} where $I_n\in \RR^{n\times n}$ is the identity matrix. Let us make a number of simplifying assumptions. First, we assume that $\|x_k\|< \sqrt{s}$, which can be easily enforced in the iterates. Under this assumption, it follows that \begin{align} (\partial g(x_k))_i = \begin{cases} 0 & (x_k)_i > 0\\ \{a: a\le 0\} & (x_k)_i = 0, \end{cases} \qquad i\le d. \label{eq:exp-subgrad-cluster} \end{align} Second, we assume that $V_k$ has nearly orthonormal columns, namely, $V_k^\top V_k \approx I_n$. This can also be easily enforced in each iterate of Algorithm~\ref{Algo:2} and naturally corresponds to well-separated clusters. While a more fine-tuned argument can remove these assumptions, they will help us simplify the presentation here.
Under these assumptions, the (squared) right-hand side of \eqref{eq:regularity} becomes \begin{align} & \dist\left( -DA(x_k)^\top A(x_k) , \frac{\partial g(x_k)}{ \b_{k-1}} \right)^2 \nonumber\\ & = \left\| \left( -DA(x_k)^\top A(x_k) \right)_+\right\|^2 \qquad (a_+ = \max(a,0)) \nonumber\\ & = \left\| \left[ \begin{array}{c} x_{k,1} (V_k V_k^\top \mathbf{1}- \mathbf{1})_1 \\ \vdots \\ x_{k,n} (V_k V_k^\top \mathbf{1}- \mathbf{1})_n \end{array} \right] \right\|^2 \qquad (x_k\in C \Rightarrow x_k\ge 0) \nonumber\\ & = \sum_{i=1}^n \| x_{k,i}\|^2 (V_kV_k^\top \mathbf{1}-\mathbf{1})_i^2 \nonumber\\ & \ge \min_i \| x_{k,i}\|^2 \cdot \sum_{i=1}^n (V_kV_k^\top \mathbf{1}-\mathbf{1})_i^2 \nonumber\\ & = \min_i \| x_{k,i}\|^2 \cdot \| V_kV_k^\top \mathbf{1}-\mathbf{1} \|^2. \end{align} Given a prescribed $\nu$, ensuring $\|x_{k,i}\| \ge \nu$ guarantees \eqref{eq:regularity}. This requirement corresponds again to well-separated clusters. When the clusters are sufficiently separated and the algorithm is initialized close enough to the constraint set, there is indeed no need to separately enforce this condition. In practice, often $n$ exceeds the number of true clusters and a more fine-tuned analysis is required to establish \eqref{eq:regularity} by restricting the argument to a particular subspace of $\RR^n$. \section{Basis Pursuit \label{sec:verification1}} We only verify the regularity condition in \eqref{eq:regularity} for \eqref{prob:01} with $f,A,g$ specified in \eqref{eq:bp-equiv}. Note that \begin{align} DA(x) = 2 \overline{B} \text{diag}(x), \label{eq:jacob-bp} \end{align} where $\text{diag}(x)\in\RR^{2d\times 2d}$ is the diagonal matrix formed by $x$. 
The left-hand side of \eqref{eq:regularity} then reads as \begin{align} & \text{dist} \left( -DA(x_k)^\top A(x_k) , \frac{\partial g(x_k)}{\b_{k-1}} \right) \nonumber\\ & = \text{dist} \left( -DA(x_k)^\top A(x_k) , \{0\} \right) \qquad (g\equiv 0)\nonumber\\ & = \|DA(x_k)^\top A(x_k) \| \nonumber\\ & =2 \| \text{diag}(x_k) \overline{B}^\top ( \overline{B}x_k^{\circ 2} -b) \|. \qquad \text{(see \eqref{eq:jacob-bp})} \label{eq:cnd-bp-pre} \end{align} To bound the last line above, let $x_*$ be a solution of~\eqref{prob:01} and note that $\overline{B} x_*^{\circ 2} = b $ by definition. Let also $z_k,z_*\in \RR^d$ denote the vectors corresponding to $x_k,x_*$. Corresponding to $x_k$, also define $u_{k,1},u_{k,2}$ naturally and let $|z_k| = u_{k,1}^{\circ 2} + u_{k,2}^{\circ 2} \in \RR^d$ be the amplitudes of $z_k$. To simplify matters, let us assume also that $B$ is full-rank. We then rewrite the last line of \eqref{eq:cnd-bp-pre} as \begin{align} & \| \text{diag}(x_k) \overline{B}^\top ( \overline{B}x_k^{\circ 2} -b) \|^2 \nonumber\\ & = \| \text{diag}(x_k) \overline{B}^\top \overline{B} (x_k^{\circ 2} -x_*^{\circ 2}) \|^2 \qquad (\overline{B} x_*^{\circ 2} = b) \nonumber\\ & = \| \text{diag}(x_k)\overline{B}^\top B (x_k - x_*) \|^2 \nonumber\\ & = \| \text{diag}(u_{k,1})B^\top B (z_k - z_*) \|^2 \nonumber\\ & \qquad + \| \text{diag}(u_{k,2})B^\top B (z_k - z_*) \|^2 \nonumber\\ & = \| \text{diag}(u_{k,1}^{\circ 2}+ u_{k,2}^{\circ 2}) B^\top B (z_k - z_*) \|^2 \nonumber\\ & = \| \text{diag}(|z_k|) B^\top B (z_k - z_*) \|^2 \nonumber\\ & \ge \eta_n ( B \text{diag}(|z_k|) )^2 \| B(z_k - z_*) \|^2 \nonumber\\ & = \eta_n ( B \text{diag}(|z_k|) )^2 \| B z_k -b \|^2 \qquad ( Bz_* = \ol{B} x^{\circ2}_* = b) \nonumber\\ & \ge \min_{|T|=n}\eta_n(B_T)\cdot |z_{k,(n)}|^2 \|Bz_k - b\|^2, \end{align} where $\eta_n(\cdot)$ returns the $n$th largest singular value of its argument. In the last line above, $B_T$ is the restriction of $B$ to the columns indexed by $T$ of size $n$. 
Moreover, $z_{k,(n)}$ is the $n$th largest entry of $z_k$ in magnitude. Given a prescribed $\nu$, \eqref{eq:regularity} therefore holds if \begin{align} |z_{k,(n)} | \ge \sqrt{\frac{\nu}{ \min_{|T|=n} \eta_n(B_T) \cdot \| Bz_k - b\|^2}}, \end{align} for every iteration $k$. If Algorithm~\ref{Algo:2} is initialized close enough to the solution $z_*$, there will be no need to directly enforce this condition.
% \includegraphics[width=0.4\textwidth,natwidth=1300,natheight=1300]{bp_fig1.pdf} \begin{center} {\includegraphics[width=.4\columnwidth]{figs/example_denoising_fab.pdf}} %\centerline{\includegraphics[width=1\columnwidth]{figs/bp_fig2_small.pdf}} \label{fig:comparison_fab} \caption{Augmented Lagrangian vs Adam for $\ell_\infty$ denoising (left). $\ell_2$ vs $\ell_\infty$ denoising as defense against adversarial examples} \end{center} \end{figure} We use a pretrained generator for the MNIST dataset, given by a standard deconvolutional neural network architecture. We compare the succesful optimizer Adam against our method. Our algorithm involves two forward/backward passes through the network, as oposed to Adam that requires only one. For this reason we let our algorithm run for 4000 iterations, and Adam for 8000 iterations. For a particular example, we plot the objective value vs iteration count in figure \ref{fig:comparison_fab}. Our method successfully minimizes the objective value, while Adam does not succeed. 
{\editf{ \subsection{Generalized Eigenvalue Problem} \begin{figure}[h] %\begin{table}[h] \begin{tabular}{l|l|l} ~~~~~~~~~(i) $C:$ Gaussian iid & ~~~~~(ii) $C:$ Polynomial decay & ~~~~~(iii) $C:$ Exponential decay \\ \includegraphics[width=.3\columnwidth]{figs/gen_eig/gen_eig_case1_convergence.pdf}& \includegraphics[width=.3\columnwidth]{figs/gen_eig/gen_eig_case2_convergence.pdf}& \includegraphics[width=.3\columnwidth]{figs/gen_eig/gen_eig_case3_convergence.pdf}\\ \includegraphics[width=.31\columnwidth]{figs/gen_eig/gen_eig_case1_eigenvalues.pdf} & \includegraphics[width=.31\columnwidth]{figs/gen_eig/gen_eig_case2_eigenvalues.pdf} & \includegraphics[width=.31\columnwidth]{figs/gen_eig/gen_eig_case3_eigenvalues.pdf} \\ \hline %~~~~~~~~~~~~~~~~~~~~(iv) & ~~~~~~~~~~~~~~~~~~~~(v) & ~~~~~~~~~~~~~~~~~~~(vi) \\ %\includegraphics[width=.3\columnwidth]{figs/gen_eig/gen_eig_case4_convergence.pdf}& %\includegraphics[width=.3\columnwidth]{figs/gen_eig/gen_eig_case6_convergence.pdf}& %\includegraphics[width=.3\columnwidth]{figs/gen_eig/gen_eig_case7_convergence.pdf}\\ % % %\includegraphics[width=.31\columnwidth]{figs/gen_eig/gen_eig_case4_eigenvalues.pdf} & %\includegraphics[width=.31\columnwidth]{figs/gen_eig/gen_eig_case6_eigenvalues.pdf} & %\includegraphics[width=.31\columnwidth]{figs/gen_eig/gen_eig_case7_eigenvalues.pdf} \end{tabular} %\end{table} \caption{{\it{(Top)}} Objective convergence for calculating top generalized eigenvalue and eigenvector of $B$ and $C$. {\it{(Bottom)}} Eigenvalue structure of the matrices. For (i),(ii) and (iii), $C$ is positive semidefinite; for (iv), (v) and (vi), $C$ contains negative eigenvalues. {[(i): Generated by taking symmetric part of iid Gaussian matrix. (ii): Generated by randomly rotating diag($1^{-p}, 2^{-p}, \cdots, 1000^{-p}$)($p=1$). 
(iii): Generated by randomly rotating diag($10^{-p}, 10^{-2p}, \cdots, 10^{-1000p}$)($p=0.0025$).]} } \label{fig:geig1} \end{figure} Generalized eigenvalue problem has extensive applications in machine learning, statistics and data analysis~\cite{ge2016efficient}. The well-known nonconvex formulation of the problem is~\cite{boumal2016non}. \begin{align} \begin{cases} \underset{x\in\mathbb{R}^n}{\min} x^\top C x \\ x^\top B x = 1, \end{cases} \label{eq:eig} \end{align} where $B, C \in \RR^{n \times n}$ are symmetric matrices and $B$ is positive definite, i.e. $B \succ 0$. %Due to the invertibilty of $B$, we have $\langle x, Bx \rangle \geq \frac{\| x\|_F^2}{\| B^{-1} \|}$, which implies the constraint $\| x \|_F ^2 \leq \| B^{-1} \|$. The generalized eigenvector computation is equivalent to performing principal component analysis (PCA) of $C$ in the norm $B$. Moreover, it is also equivalent to computing the top eigenvector of symmetric matrix $S = B^{-1/2}CB^{1-2}$ and multiplying the resulting vector by $B^{-1/2}$. However, for sufficiently large $n$, computing $B^{-1/2}$ is extremely expensive. The natural convex sdp relaxation for~\eqref{eq:eig} involves lifting $Y = xx^\top$ and removes the non-convex rank$(Y) = 1$ constraint, \begin{align} \begin{cases} \underset{Y \in \RR^{n \times n}}{\min} \text{tr}(CY)\\ \text{tr}(BY) = 1, \quad X \succeq 0. \end{cases} \label{eq:eig-sdp} \end{align} Here, we solve~\eqref{eq:eig} because it directly fits into our template with, \begin{align} f(x) =& x^\top C x, \quad g(x) = 0\nonumber\\ A(x) =& x^\top B x - 1. \label{eq:eig-equiv} \end{align} We compare our approach against 3 different methods. Manifold based Riemannian gradient descent and Riemannian trust region methods in\cite{boumal2016global} and generalized eigenvector via linear system solver (abbrevated as. GenELin) in~\cite{ge2016efficient}. We have used Manopt software package in \cite{manopt} for the manifold based methods. 
For GenELin, we have utilized Matlab's backslash operator as the linear solver. The results are compiled in Figure \ref{fig:geig1}. }} %\editf{ %\subsection{Quadratic assginment problem} %(mfs): Problem description here... %\begin{figure*}[ht] %% \includegraphics[width=0.4\textwidth,natwidth=1300,natheight=1300]{bp_fig1.pdf} %\begin{center} %{\includegraphics[width=.7\columnwidth]{figs/qap/r4esc16i}} %{\includegraphics[width=.7\columnwidth]{figs/qap/r4esc64a}} %%\centerline{\includegraphics[width=1\columnwidth]{figs/bp_fig2_small.pdf}} %\label{fig:qap} %\caption{Convergence of feasibility(left) and feasible upper bound after rounding(right) for sparse quadratic assignment data from QAPLIB.} %\end{center} %\end{figure*} % %} \ No newline at end of file diff --git a/NeurIPS 19/sections/experiments.tex b/NeurIPS 19/sections/experiments.tex index 2edd902..faf0b59 100644 --- a/NeurIPS 19/sections/experiments.tex +++ b/NeurIPS 19/sections/experiments.tex @@ -1,234 +1,234 @@ %!TEX root = ../main.tex \section{Numerical Evidence \label{sec:experiments}} -We first begin with a caveat: It is known that quasi-Newton methods, such as BFGS and lBFGS, might not converge for non-convex problems~\cite{dai2002convergence, mascarenhas2004bfgs}. For this reason, we have used the trust region method as the second-order solver in our analysis in Section~\ref{sec:cvg rate}, which is well-studied for non-convex problems~\cite{cartis2012complexity}. Empirically, however, BFGS and lBGFS are extremely successful and we have therefore opted for those solvers in this section since the subroutine does not affect Theorem~\ref{thm:main} as long as the subsolver performs well in practice. +We first begin with a caveat: It is known that quasi-Newton methods, such as BFGS and lBFGS, might not converge for nonconvex problems~\cite{dai2002convergence, mascarenhas2004bfgs}. 
For this reason, we have used the trust region method as the second-order solver in our analysis in Section~\ref{sec:cvg rate}, which is well-studied for nonconvex problems~\cite{cartis2012complexity}. Empirically, however, BFGS and lBGFS are extremely successful and we have therefore opted for those solvers in this section since the subroutine does not affect Theorem~\ref{thm:main} as long as the subsolver performs well in practice. \subsection{Clustering} Given data points $\{z_i\}_{i=1}^n $, the entries of the corresponding Euclidean distance matrix $D \in \RR^{nxn}$ are $ D_{i, j} = \left\| z_i - z_j\right\|^2 $. Clustering is then the problem of finding a co-association matrix $Y\in \RR^{n\times n}$ such that $Y_{ij} = 1$ if points $z_i$ and $z_j$ are within the same cluster and $Y_{ij} = 0$ otherwise. In~\cite{Peng2007}, the authors provide a SDP relaxation of the clustering problem, specified as \begin{align} \begin{cases} \underset{Y \in \RR^{nxn}}{\min} \text{tr}(DY) \\ Y\mathbf{1} = \mathbf{1}, ~\text{tr}(Y) = s,~ Y\succeq 0,~Y \geq 0, \end{cases} \label{eq:sdp_svx} \end{align} where $s$ is the number of clusters and $Y $ is both positive semidefinite and has nonnegative entries. Standard SDP solvers do not scale well with the number of data points~$n$, since they often require projection onto the semidefinite cone with the complexity of $\mathcal{O}(n^3)$. We instead use the Burer-Monteiro factorization to solve \eqref{eq:sdp_svx}, sacrificing convexity to reduce the computational complexity. More specifically, we solve the program \begin{align} \label{eq:nc_cluster} \begin{cases} \underset{V \in \RR^{nxr}}{\min} \text{tr}(DVV^{\top}) \\ VV^{\top}\mathbf{1} = \mathbf{1},~~ \|V\|_F^2 \le s, ~~V \geq 0, \end{cases} \end{align} where $\mathbf{1}\in \RR^n$ is the vector of all ones. 
Note that $Y \geq 0$ in \eqref{eq:sdp_svx} is replaced above by the much stronger but easier-to-enforce constraint $V \geq 0$ in \eqref{eq:nc_cluster}, see~\citep{kulis2007fast} for the reasoning behind this relaxation. %. Trace constraint translates to a Frobenius norm constraint in factorized space. Semidefinite constraint is naturally removed due to factorization $Y=VV^{\top}$. %See~\citep{kulis2007fast} for the details of the relaxation. Now, we can cast~\eqref{eq:nc_cluster} as an instance of~\eqref{prob:01}. Indeed, for every $i\le n$, let $x_i \in \RR^r$ denote the $i$th row of $V$. We next form $x \in \RR^d$ with $d = nr$ by expanding the factorized variable $V$, namely, \begin{align*} x = [x_1^{\top}, \cdots, x_n^{\top}]^{\top} \in \RR^d, \end{align*} and then set \begin{align*} f(x) =\sum_{i,j=1}^n D_{i, j} \left\langle x_i, x_j \right\rangle, \qquad g = \delta_C, \end{align*} \begin{align} A(x) = [x_1^{\top}\sum_{j=1}^n x_j -1, \cdots, x_n^{\top}\sum_{j=1}^n x_j-1]^{\top}, \end{align} where $C$ is the intersection of the positive orthant in $\RR^d$ with the Euclidean ball of radius $\sqrt{s}$. In Appendix~\ref{sec:verification2}, we somewhat informally verify that Theorem~\ref{thm:main} applies to~\eqref{prob:01} with $f,g,A$ specified above. -In our simulations, we use two different solvers for Step~2 of Algorithm~\ref{Algo:2}, namely, APGM and lBFGS. APGM is a solver for non-convex problems of the form~\eqref{e:exac} with convergence guarantees to first-order stationarity, as discussed in Section~\ref{sec:cvg rate}. lBFGS is a limited-memory version of BFGS algorithm in~\cite{fletcher2013practical} that approximately leverages the second-order information of the problem. +In our simulations, we use two different solvers for Step~2 of Algorithm~\ref{Algo:2}, namely, APGM and lBFGS. APGM is a solver for nonconvex problems of the form~\eqref{e:exac} with convergence guarantees to first-order stationarity, as discussed in Section~\ref{sec:cvg rate}. 
lBFGS is a limited-memory version of BFGS algorithm in~\cite{fletcher2013practical} that approximately leverages the second-order information of the problem. We compare our approach against the following convex methods: \begin{itemize} -\item HCGM: Homotopy-based Conditional Gradient Method in\cite{yurtsever2018conditional} which directly solves~\eqref{eq:sdp_svx}. +\item HCGM: Homotopy-based Conditional Gradient Method in~\cite{yurtsever2018conditional} which directly solves~\eqref{eq:sdp_svx}. \item SDPNAL+: A second-order augmented Lagrangian method for solving SDP's with nonnegativity constraints~\cite{yang2015sdpnal}. \end{itemize} As for the dataset, our experimental setup is similar to that described by~\cite{mixon2016clustering}. We use the publicly-available fashion-MNIST data in \cite{xiao2017/online}, which is released as a possible replacement for the MNIST handwritten digits. Each data point is a $28\times 28$ gray-scale image, associated with a label from ten classes, labeled from $0$ to $9$. First, we extract the meaningful features from this dataset using a simple two-layer neural network with a sigmoid activation function. Then, we apply this neural network to 1000 test samples from the same dataset, which gives us a vector of length $10$ for each data point, where each entry represents the posterior probability for each class. Then, we form the $\ell_2$ distance matrix ${D}$ from these probability vectors. The results are depicted in Figure~\ref{fig:clustering}. -We implemented 3 algorithms on MATLAB and used the software package for SDPNAL+ which contains mex files. Performance of the nonconvex approach would be much better if we also used mex files. +We implemented 3 algorithms on MATLAB and used the software package for SDPNAL+ which contains mex files. It is predictable that the performance of our nonconvex approach would also improve by using mex files. 
\begin{figure*}[ht] % \includegraphics[width=0.4\textwidth,natwidth=1300,natheight=1300]{bp_fig1.pdf} \begin{center} {\includegraphics[width=.4\columnwidth]{figs/clustering_fig4_times_linearbeta_last.pdf}} {\includegraphics[width=.4\columnwidth]{figs/clustering_fig4_iter_linearbeta_last.pdf}} %\centerline{\includegraphics[width=1\columnwidth]{figs/bp_fig2_small.pdf}} -\caption{Convergence of different algorithms for k-Means clustering with fashion MNIST dataset. Here, we set the rank to be equal to 20 for the non-convex approaches. The solution rank for the template~\eqref{eq:sdp_svx} is known and it is equal to number of clusters $k$ (Theorem 1. \cite{kulis2007fast}). As discussed in~\cite{tepper2018clustering}, setting rank $r>k$ leads more accurate reconstruction in expense of speed. Therefore, we set the rank to 20.} +\caption{Convergence of different algorithms for clustering with fashion-MNIST dataset. Here, we set the rank as $r=20$ for the nonconvex approaches. The solution rank for the template~\eqref{eq:sdp_svx} is the number of clusters $s$ \cite[Theorem 1]{kulis2007fast}. However, as discussed in~\cite{tepper2018clustering}, setting rank $r>s$ leads more accurate reconstruction at the expense of speed, hence our choice of $r=20$. } \label{fig:clustering} %(left:$[n=100$, $d=10^4]$, right:$[n=34$, $d=10^2$])} \end{center} \end{figure*} \subsection{Basis Pursuit} Basis Pursuit (BP) finds sparsest solutions of an under-determined system of linear equations, namely, \begin{align} \begin{cases} \min_{z} \|z\|_1 \\ Bz = b, \end{cases} \label{eq:bp_main} \end{align} where $B \in \RR^{n \times d}$ and $b \in \RR^{n}$. BP has found many applications in machine learning, statistics and signal processing \cite{chen2001atomic, candes2008introduction, arora2018compressed}. A plethora of primal-dual convex optimization algorithms are available in the literature to solve BP, including~\cite{tran2018adaptive,chambolle2011first}. 
There also exists a line of work \cite{beck2009fast} that handles sparse regression via regularization with $\ell_1$ norm. %\textbf{AE: Fatih, maybe mention a few other algorithms including asgard and also say why we can't use fista and nista.} Here, we take a different approach and cast~(\ref{eq:bp_main}) as an instance of~\eqref{prob:01}. Note that any $z \in \RR^d$ can be decomposed as $z = z^+ - z^-$, where $z^+,z^-\in \RR^d$ are the positive and negative parts of $z$, respectively. Then consider the change of variables $z^+ = u_1^{\circ 2}$ and $z^-= u_2^{\circ 2} \in \RR^d$, where $\circ$ denotes element-wise power. Next, we concatenate $u_1$ and $u_2$ as $x := [ u_1^{\top}, u_2^{\top} ]^{\top} \in \RR^{2d}$ and define $\overline{B} := [B, -B] \in \RR^{n \times 2d}$. Then, \eqref{eq:bp_main} is equivalent to \eqref{prob:01} with \begin{align} f(x) =& \|x\|^2, \quad g(x) = 0\nonumber\\ A(x) =& \overline{B}x^{\circ 2}- b. \label{eq:bp-equiv} \end{align} In Appendix~\ref{sec:verification1}, we verify with minimal detail that Theorem~\ref{thm:main} indeed applies to~\eqref{prob:01} with the above $f,A$. %Let $\mu(B)$ denote the \emph{coherence} of ${B}$, namely, %\begin{align} %\mu = \max_{i,j} | \langle B_i, B_j \rangle |, %\end{align} %where $B_i\in \RR^n$ is the $i$th column of $B$. Let also $z_k,z_*\in \RR^d$ denote the vectors corresponding to $x_k,x_*$. 
We rewrite the last line of \eqref{eq:cnd-bp-pre} as %\begin{align} %& 2 \| \text{diag}(x_k) \overline{B}^\top ( \overline{B}x^{\circ 2} -b) \| \nonumber\\ %& = 2 \| \text{diag}(x_k) \overline{B}^\top \overline{B} (x^{\circ 2} -x_*^{\circ 2}) \| \nonumber\\ %& = 2 \| \text{diag}(x_k) {B}^\top B(x_k- x_*) \| \nonumber\\ %& = 2 \| \text{diag}(x_k) {B}^\top B(x_k- x_*) \|_{\infty} \nonumber\\ %& \ge 2 \| \text{diag}(x_k) \text{diag}({B}^\top B) (x_k- x_*) \|_{\infty}\nonumber\\ %& \qquad - 2 \| \text{diag}(x_k) \text{hollow}({B}^\top B) (x_k- x_*) \|_{\infty}, %\end{align} %where $\text{hollow}$ returns the hollow part of th % % and let $\overline{B}=U\Sigma V^\top$ be its thin singular value decomposition, where $U\in \RR^{n\times n}, V\in \RR^{d\times n}$ have orthonormal columns and the diagonal matrix $\Sigma\in \RR^{n\times n}$ contains the singular values $\{\sigma_i\}_{i=1}^r$. Let also $x_{k,i},U_i$ be the $i$th entry of $x_k$ and the $i$th row of $U_i$, respectively. Then we bound the last line above as %\begin{align} %& 2 \| \text{diag}(x_k) \overline{B}^\top ( \overline{B}x^{\circ 2} -b) \| \nonumber\\ %& = 2 \| \text{diag}(x_k) U \Sigma^2 U^\top ( x^{\circ 2} - x_*^{\circ 2}) \| \nonumber\\ %& \ge 2 \| \text{diag}(x_k) U \Sigma^2 U^\top ( x^{\circ 2} - x_*^{\circ 2}) \|_{\infty} \nonumber\\ %& = \max_i |x_{k,i}| \cdot | U_i ^\top \Sigma \cdot \Sigma U^\top ( x^{\circ 2} - x_*^{\circ 2}) | \nonumber\\ %& %\end{align} % %where $\tilde{b}_{i,j} = (\tilde{B})_{ij},~ i\in [1:n]$ and $j \in [1:2d]$. %\begin{align*} %-DA(x)^\top(A(x) - b) = -2x \odot (\tilde{B}^\top (A(x) - b)), %\end{align*} %where $\odot$ denotes hadamard product. %\begin{align*} %& \text{dist} \left( -DA(x)^\top (A(x)-b), \frac{\partial g(x)}{\b} \right) \nonumber\\ %& = \text{dist} \left( -DA(x)^\top (A(x)-b), \{0\} \right) \nonumber\\ %& = \left\| -DA(x)^\top (A(x)-b) \right\| \nonumber\\ %& \ge ?????, %\end{align*} %Hence, this completes the proof for regularity condition. 
We draw the entries of $B$ independently from a zero-mean and unit-variance Gaussian distribution. For a fixed sparsity level $k$, the support of $z_*\in \RR^d$ and its nonzero amplitudes are also drawn from the standard Gaussian distribution. %We then pick a sparsity level $k$ and choose $k$ indexes, i.e, $\Omega \subset [1:d]$, which are corresponding to nonzero entries of $z$. %We then assign values from normal distribution to those entries. Then the measurement vector is created as $b = Bz + \epsilon$, where $\epsilon$ is the noise vector with entries drawn independently from the zero-mean Gaussian distribution with variance $\sigma^2=10^{-6}$. \edita{\textbf{AE: the image sizes throughout the paper are inconsistent which is not nice. The font size in Fig 3 is also very different from the rest of the paper which is not nice. please change.}} \begin{figure}[ht] \begin{center} {\includegraphics[width=1\columnwidth]{figs/bp_fig1_subsolvers.pdf}} -\caption{Convergence with different subsolvers for the aforementioned non-convex relaxation. +\caption{Convergence with different subsolvers for the aforementioned nonconvex relaxation. } \label{fig:bp1} %(left:$[n=100$, $d=10^4]$, right:$[n=34$, $d=10^2$])} \end{center} \end{figure} \vspace{-3mm} -Figure~\ref{fig:bp1} compiles our results for the proposed relaxation. It is, indeed, interesting to see that these type of non-convex relaxations gives the solution of convex one and first order methods succeed. +Figure~\ref{fig:bp1} compiles our results for the proposed relaxation. It is, indeed, interesting to see that these type of nonconvex relaxations gives the solution of convex one and first order methods succeed. \paragraph{Discussion:} The true potential of our reformulation is in dealing with more structured norms rather than $\ell_1$, where computing the proximal operator is often intractable. 
One such case is the latent group lasso norm~\cite{obozinski2011group}, defined as \begin{align*} \|z\|_{\Omega} = \sum_{i=1}^I \| z_{\Omega_i} \|, \end{align*} -{where $\{\Omega_i\}_{i=1}^I$ are (not necessarily disjoint) index sets of $\{1,\cdots,d\}$. Although not studied here, we believe that the non-convex framework presented in this paper can serve to solve more complicated problems, such as the latent group lasso. We leave this research direction for future work. +{where $\{\Omega_i\}_{i=1}^I$ are (not necessarily disjoint) index sets of $\{1,\cdots,d\}$. Although not studied here, we believe that the nonconvex framework presented in this paper can serve to solve more complicated problems, such as the latent group lasso. We leave this research direction for future work. -\subsection{Adversarial Denoising with GANs} -In the appendix, we provide a contemporary application example that our template applies. +%\subsection{Adversarial Denoising with GANs} +%In the appendix, we provide a contemporary application example that our template applies. %This relaxation transformed a non-smooth objective into a smooth one while loosing the linearity on the equality constraint. %\subsection{Latent Group Lasso \label{sec:latent lasso}} % %For a collection of subsets $\Omega=\{\Omega_i\}_{i=1}^{I}\subset [1:p]$, the latent group Lasso norm on $\RR^p$ takes $z\in \RR^p$ to %\begin{align} %\|z\|_{\Omega,1} = \sum_{i=1}^I \| z_{\Omega_i} \|. %\end{align} %Note that we do not require $\Omega_i$ to be disjoint. For $B\in \RR^{n\times p}$, $b\in \RR^n$, and $\lambda>0$, consider the latent group lasso as %\begin{align} %\min_{z\in \RR^d} \frac{1}{2}\| Bz - b\|_2^2+ \lambda \| z\|_{\Omega,1}. %\label{eq:group_lasso} %\end{align} %Because $\Omega$ is not a partition of $[1:p]$, computing the proximal operator of $\|\cdot\|_{\Omega}$ is often intractable, ruling out proximal gradient descent and similar algorithms for solving Program~\eqref{eq:group_lasso}. 
Instead, often Program~\eqref{eq:group_lasso} is solved by Alternating Direction Method of Multipliers (ADMM). More concretely, ??? % %We take a radically different approach here and cast Program~\eqref{eq:group_lasso} as an instance of Program~\eqref{prob:01}. More specifically, let $z^+,z^-\in \RR^p$ be positive and negative parts of $z$, so that $z=z^+-z^-$. Let us introduce the nonnegative $u\in \RR^p$ such that $z^++z^- = u^{\circ 2}$, where we used $\circ$ notation to show entrywise power. We may now write that %\begin{align} %\| z^++z^- \|_{\Omega,1} %& = \| u^{\circ 2} \|_{\Omega,1 } =: \|u\|_{\Omega,2}^2, %\end{align} %Unlike $\|\cdot\|_{\Omega,1}$, note that $\|\cdot\|_{\Omega,2}^2$ is differentiable and, in fact, there exists a positive semidefinite matrix $Q\in \RR^{d\times d}$ such that $\|u\|_{\Omega,2}^2=u^\top Q_\Omega u$. Let us form $x=[(z^+)^\top,(z^-)^\top, u^\top]^\top\in \RR^{3d}$ and set %\begin{align*} %f(x) = \frac{1}{2}\| Bz^+-Bz^- -b\|_1+ \|u\|_{\Omega,2}^2, %\end{align*} %\begin{align} %g(x) = 0, \qquad A(x) = z^++z^--u^{\circ 2}. %\end{align} %We can now apply Algorithm~\ref{Algo:2} to solve Program~\eqref{prob:01} with $f,g,A$ specified above. % % %\paragraph{Convergence rate.} %Clearly, $f,A$ are strongly smooth in the sense that \eqref{eq:smoothness basic} holds with proper $\lambda_f,\lambda_A$. Likewise, both $f$ and the Jacobian $DA$ are continuous and the restricted Lipschitz constants $\lambda'_f,\lambda'_A$ in \eqref{eq:defn_lambda'} are consequently well-defined and finite for a fixed $\rho'>0$. We next verify the key regularity condition in Theorem~\ref{thm:main}, namely, \eqref{eq:regularity}. 
Note that %\begin{align*} %DA(x) & = %\left[ %\begin{array}{ccc} %I_p & I_p & -2\text{diag}(u) %\end{array} %\right]\in \RR^{d\times 3d}, %\end{align*} %\begin{align*} %-DA(x)^\top A(x) = %\left[ %\begin{array}{c} %-z^+-z^-+u^{\circ 2} \\ %-z^+-z^-+u^{\circ 2}\\ %2\text{diag}(u)( z^++z^--u^{\circ 2}) %\end{array} %\right], %\end{align*} %\begin{align*} %& \text{dist} \left( -DA(x)^\top A(x), \frac{\partial g(x)}{\b} \right) \nonumber\\ %& = \text{dist} \left( -DA(x)^\top A(x), \{0\} \right) \nonumber\\ %& = \left\| -DA(x)^\top A(x) \right\| \nonumber\\ %& \ge \sqrt{2} \| A(x)\|, %\end{align*} %namely, \eqref{eq:regularity} holds with $\nu=1$. diff --git a/NeurIPS 19/sections/guarantees.tex b/NeurIPS 19/sections/guarantees.tex index 2e8eeda..02c02e3 100644 --- a/NeurIPS 19/sections/guarantees.tex +++ b/NeurIPS 19/sections/guarantees.tex @@ -1,193 +1,195 @@ %!TEX root = ../main.tex \section{Convergence Rate \label{sec:cvg rate}} In this section, we detail the convergence rate of Algorithm~\ref{Algo:2} for finding first-order and second-order stationary points, along with total iteration complexity results. All the proofs are deferred to Appendix~\ref{sec:theory}. Theorem~\ref{thm:main} below characterizes the convergence rate of Algorithm~\ref{Algo:2} for finding stationary points in terms of the number of outer iterations.\\ %<<<<<<< HEAD %{\color{red}{Ahmet: Maybe instead of sufficiently large k0 we can say k0 such that beta is bigger than 1, it makes the proof go thru, it would be a bit more explanatory}} %======= %>>>>>>> b39eea61625954bc9d3858590b7b37a182a6af4f \begin{theorem}\textbf{\emph{(convergence rate)}} \label{thm:main} Let $\rho:=\sup_{k\in [K]} \|x\|$.\footnote{If necessary, to ensure that $\rho<\infty$, one can add a small factor of $\|x\|^2$ to $\mathcal{L}_{\b}$ in \eqref{eq:Lagrangian}. 
Then it is easy to verify that the iterates of Algorithm \ref{Algo:2} remain bounded, provided that the penalty weight $\b$ is large enough, $\sup_x \|\nabla f(x)\|/\|x\|< \infty$ and $\sup_x \|A(x)\| < \infty$. } Suppose that $f$ and $A$ satisfy (\ref{eq:smoothness basic}) and let \begin{align} \lambda'_f = \max_{\|x\|\le \rho} \|\nabla f(x)\|, ~~ \lambda'_A = \max_{\|x\| \le \rho} \|DA(x)\|, \label{eq:defn_restricted_lipsichtz} \end{align} be the (restricted) Lipschitz constants of $f$ and $A$, respectively. %Suppose that $\b_1 \ge \b_0$, where the expression for $\b_0(f,g,A,\s_1)$ is given in (\ref{eq:beta-0}). For integers $2 \le k_0\le k_1$, consider the interval $K=[k_0:k_1]$, and let $\{x_k\}_{k\in K}$ be the output sequence of Algorithm~\ref{Algo:2} on the interval $K$.\footnote{The choice of $k_1 = \infty$ is valid here too.} For $\nu>0$, assume that \begin{align} \nu \|A(x_k)\| & \le \dist\left( -DA(x_k)^\top A(x_k) , \frac{\partial g(x_k)}{ \b_{k-1}} \right), \label{eq:regularity} \end{align} for every $k\in K$. We consider two cases: %{\color{red} Ahmet: I removed the squares and showed the rate on dist + feasibility, since it is the measure for the stationarity, using squares confuses complexity analysis...} \begin{itemize}[leftmargin=*] \item If a first-order solver is used in Step~2, then $x_k$ is an $(\epsilon_{k,f},\b_k)$ first-order stationary point of (\ref{prob:01}) with \begin{align} \epsilon_{k,f} & = \frac{1}{\beta_{k-1}} \left(\frac{2(\lambda'_f+\lambda'_A y_{\max}) (1+\lambda_A' \sigma_k)}{\nu}+1\right) \nonumber \\ &=: \frac{Q(f,g,A,\s_1)}{\beta_{k-1}}, \label{eq:stat_prec_first} \end{align} -for every $k\in K$, where the expression for $y_{\max}(x_1,y_0,\s_1)$ is given in (\ref{eq:dual growth}). +for every $k\in K$, where the expression for $y_{\max}(x_1,y_0,\s_1)$ is given in (\ref{eq:dual growth}) due to the limited space. 
\item If a second-order solver is used in Step~2, then $x_k$ is an $(\epsilon_{k,f}, \epsilon_{k,s},\b_k)$ second-order stationary point of~(\ref{prob:01}) with $\epsilon_{k,s}$ specified above and with \begin{align} \epsilon_{k,s} &= \epsilon_{k-1} + \sigma_k \sqrt{m} \lambda_A \frac{ 2\lambda'_f +2 \lambda'_A y_{\max} }{\nu \b_{k-1}} \nonumber\\ &= \frac{\nu + \sigma_k \sqrt{m} \lambda_A 2\lambda'_f +2 \lambda'_A y_{\max} }{\nu \b_{k-1}} =: \frac{Q'(f,g,A,\s_1)}{\beta_{k-1}}. \end{align} \end{itemize} \end{theorem} %A few remarks are in order about Theorem~\ref{thm:main}. %\textbf{Tradeoff.} %Roughly speaking, Theorem~\ref{thm:main} states that the iterates of Algorithm~\ref{Algo:2} approach first-order stationarity for~\eqref{prob:01} at the rate of $1/{\b_k}$, where $\b_k$ is the penalty weight in AL in iteration $k$ as in~\eqref{eq:Lagrangian}. Note the inherent tradeoff here: For a faster sequence $\{\b_k\}_k$, the iterates approach stationarity faster but the computational cost of the inexact primal subroutine (Step 2) increases since a higher accuracy is required and Lipschitz constant of the unconstrained problem is also affected by $\beta_k$. In words, there is a tradeoff between the number of outer and inner loop iterations of Algorithm~\ref{Algo:2}. We highlight this tradeoff for a number of subroutines below. Loosely speaking, Theorem~\ref{thm:main} states that Algorithm~\ref{Algo:2} converges to a (first- or second-) order stationary point of \eqref{prob:01} at the rate of $1/\b_k$. A few remarks are in order. \textbf{Regularity.} The key geometric condition in Theorem~\ref{thm:main} is \eqref{eq:regularity} which, broadly speaking, ensures that the primal updates of Algorithm \ref{Algo:2} reduce the feasibility gap as the penalty weight $\b_k$ grows. We will verify this condition for several examples in Section \ref{sec:experiments}. This condition is closely related to those in the existing literature. 
In the special case where $g=0$ in~\eqref{prob:01}, it is easy to verify that \eqref{eq:regularity} reduces to the Polyak-Lojasiewicz (PL) condition for minimizing $\|A(x)\|^2$~\cite{karimi2016linear}. PL condition itself is a special case of Kurdyka-Lojasiewicz with $\theta = 1/2$, see \cite[Definition 1.1]{xu2017globally}. When $g=0$, it is also easy to see that \eqref{eq:regularity} is weaker than the Mangasarian-Fromovitz condition in nonlinear optimization \cite{bolte2018nonconvex}. By its definition, we may think of \eqref{eq:regularity} as a local condition, which should hold within a neighborhood of the constraint set $\{x:A(x)=0\}$ rather than everywhere in $\mathbb{R}^d$. There is a constant complexity algorithm in \cite{bolte2018nonconvex} to reach this so-called ``information zone'', which supplements Theorem \ref{thm:main}. Moreover, in contrast to most conditions in the nonconvex optimization literature, such as~\cite{flores2012complete}, the condition in~\eqref{eq:regularity} appears to be easier to verify, as we see in Section~\ref{sec:experiments}. \edita{\textbf{AE: Fatih, I think you had added the following two references to our response for icml. Could you discuss their relevance here? [2] Rockafellar, Lagrange Multipliers and Optimality, 1993 [3] Bertsekas, On penalty and multiplier methods for constrained minimization. 1996}} +\edita{\textbf{AE: the spars review talks about the "Pong-Li" work. Fatih, do you know what is that?}} + %We now argue that such a condition is necessary for controlling the feasibility gap of~\eqref{prob:01} and ensuring the success of Algorithm~\ref{Algo:2}. %To provide further insight about \eqref{eq:regularity}, consider the convex case where $f=0$ and $g=\delta_\mathcal{X}$, where $\mathcal{X}$ is a convex set and $A$ is a linear operator. 
In this case, solving \eqref{prob:01} finds a point in the convex set $\mathcal{X}\cap \text{null}(A)$, where the subspace $\text{null}(A)=\{ x\in\mathbb{R}^d: A(x) = 0 \} \subset \RR^d$ is the null space of $A$. %Here, the Slater's condition~\cite{boyd2004convex} requires that %\begin{equation} %\text{relint} (\mathcal{X}) \cap \text{null}(A)\neq \emptyset. %\end{equation} %%<<<<<<< HEAD %Recall that the Slater's condition plays a key role in convex optimization as a sufficient condition for strong duality and, as a result, guarantees the success of a variety of primal-dual algorithms for linearly-constrained convex programs~\cite{bauschke2011convex}. %Intuitively, the Slater's condition achieves this by removing pathological cases and ensuring that the subspace $\text{null}(A)$ is not tangent to $\mathcal{X}$, see Figure~\ref{fig:convex_slater}. In such pathological cases, solving~\eqref{prob:01} and finding a point in $\mathcal{X}\cap \text{null}(A)$ can be particularly slow. For instance, in Figure \ref{fig:convex_slater}, the alternating projection algorithm (which iteratively projects onto $\mathcal{X}$ and $\text{null}(A)$) converges to the intersection point at the suboptimal rate of $1/\sqrt{k}$ rather than the standard $1/k$ rate. %{\color{red} Ahmet: a reference would be cool here} %======= %In general, the Slater's condition plays a key role in convex optimization as a sufficient condition for strong duality and, as a result, guarantees the success of a variety of primal-dual algorithms for linearly-constrained convex programs~\cite{bauschke2011convex}. Intuitively, the Slater's condition here removes any pathological cases by ensuring that the subspace $\text{null}(A)$ is not tangent to $\mathcal{X}$, see Figure~\ref{fig:convex_slater}. In such pathological cases, solving~\eqref{prob:01}, namely, finding a point in $\mathcal{X}\cap \text{null}(A)$, can be particularly difficult. 
For instance, the alternating projection algorithm (which iteratively projects onto $\mathcal{X}$ and $\text{null}(A)$) has arbitrarily slow convergence, see Figure~\ref{fig:convex_slater}. %>>>>>>> 795311da274f2d8ab56d215c2fd481073e616732 %\begin{figure} %\begin{center} %\includegraphics[scale=.5]{Slater.pdf} %\end{center} %\caption{In pathological cases where the Slater's condition does not apply, solving \eqref{prob:01} can be particularly slow, even when \eqref{prob:01} is a convex program. See the first remark after Theorem~\ref{thm:main} for more details. \label{fig:convex_slater}} %\end{figure} \paragraph{Penalty method.} A classical algorithm to solve \eqref{prob:01} is the penalty method, which is characterized by the absence of the dual variable ($y=0$) in \eqref{eq:Lagrangian}. Indeed, ALM can be interpreted as an adaptive penalty or smoothing method with a variable center as determined by the dual variable. It is worth noting that, with the same proof technique, one can establish the same convergence rate of Theorem \ref{thm:main} for the penalty method. However, while both methods have the same convergence rate in theory, iALM outperforms the penalty method in practice by virtue of its variable center; the penalty method has therefore been excluded from this presentation. \textbf{Computational complexity.} Theorem~\ref{thm:main} allows us to specify the number of (outer) iterations that Algorithm~\ref{Algo:2} requires to reach a near-stationary point of problem~\eqref{prob:01} with a prescribed precision and, in particular, specifies the number of calls made to the solver in Step~2. In this sense, Theorem~\ref{thm:main} does not fully capture the computational complexity of Algorithm~\ref{Algo:2}, as it does not take into account the computational cost of the solver in Step~2. To better understand the total iteration complexity of Algorithm~\ref{Algo:2}, we consider two scenarios in the following. 
In the first scenario, we take the solver in Step~2 to be the Accelerated Proximal Gradient Method (APGM), a well-known first-order algorithm~\cite{ghadimi2016accelerated}. In the second scenario, we will use the second-order trust region method developed in~\cite{cartis2012complexity}. \subsection{First-Order Optimality \label{sec:first-o-opt}} %\textbf{AE: we go back and forth between "subroutine" and "solver". for consistency, I'm just using "solver" everywhere.} Let us first consider the case where the solver in Step~2 is the first-order algorithm APGM, described in detail in~\cite{ghadimi2016accelerated}. At a high level, APGM makes use of $\nabla_x \mathcal{L}_{\beta}(x,y)$, the proximal operator $\text{prox}_g$ and classical Nesterov acceleration for the iterates~\cite{nesterov1983method} to reach first-order stationarity for the first update in~\eqref{e:exac}. % \textbf{AE: Ahmet to give a brief description of APGM here. } %First, we characterize the iteration complexity of Algorithm~\ref{Algo:2} for finding a first-order stationary point of~\eqref{prob:01}. %We propose to use the standard accelerated proximal method (APGM), guarantees of which are analyzed in~\cite{ghadimi2016accelerated} for nonconvex problems of the form~\eqref{e:exac}. Suppose that $g=\delta_\mathcal{X}$ is the indicator function on a bounded convex set $\mathcal{X}\subset \RR^d$ and let \begin{align} \rho'= \max_{x\in \mathcal{X}}\|x\|, \end{align} be the radius of a ball centered at the origin that includes $\mathcal{X}$. Then, adapting the results in~\cite{ghadimi2016accelerated} to our setup, APGM reaches $x_{k}$ in Step 2 of Algorithm~\ref{Algo:2} after \begin{equation} \mathcal{O}\left ( \frac{\lambda_{\beta_{k}}^2 \rho'^{2} }{\epsilon_{k+1}} \right) \label{eq:iter_1storder} \end{equation} (inner) iterations, where $\lambda_{\beta_{k}}$ denotes the Lipschitz constant of $\nabla_x{\mathcal{L}_{\beta_{k}}(x, y)}$, bounded in~\eqref{eq:smoothness of Lagrangian}. 
For the clarity of the presentation, we have used a looser bound in \eqref{eq:iter_1storder} compared to~\cite{ghadimi2016accelerated}. Using \eqref{eq:iter_1storder}, we derive the following corollary, describing the total iteration complexity of Algorithm~\ref{Algo:2} in terms of the number of calls made to the first-order oracle in APGM. %\textbf{AE: we haven't talked about oracle before.} \begin{corollary}\label{cor:first} For $b>1$, let $\beta_k =b^k $ for every $k$. If we use APGM from~\cite{ghadimi2016accelerated} for Step~2 of Algorithm~\ref{Algo:2}, the algorithm finds an $(\epsilon_f,\b_k)$ first-order stationary point, after $T$ calls to the first-order oracle, where % \begin{equation} T = \mathcal{O}\left( \frac{Q^3 \rho'^2}{\epsilon^{3}}\log_b{\left( \frac{Q}{\epsilon} \right)} \right) = \tilde{\mathcal{O}}\left( \frac{Q^{3} \rho'^2}{\epsilon^{3}} \right). \end{equation} \end{corollary} -For Algorithm~\ref{Algo:2} to reach a near-stationary point with an accuracy of $\epsilon_f$ in the sense of \eqref{eq:inclu3} and with the lowest computational cost, we therefore need to perform only one iteration of Algorithm~\ref{Algo:2}, with $\b_1$ specified as a function of $\epsilon_f$ by \eqref{eq:stat_prec_first} in Theorem~\ref{thm:main}. In general, however, the constants in \eqref{eq:stat_prec_first} are unknown and this approach is intractable. Instead, the homotopy approach taken by Algorithm~\ref{Algo:2} ensures achieving the desired accuracy by gradually increasing the penalty weight.\footnote{In this context, homotopy loosely corresponds to the gradual enforcement of the constraints by increasing the penalty weight.} This homotopy approach increases the computational cost of Algorithm~\ref{Algo:2} only by a factor logarithmic in the $\epsilon_f$, as detailed in the proof of Corollary~\ref{cor:first}. 
+For Algorithm~\ref{Algo:2} to reach a near-stationary point with an accuracy of $\epsilon_f$ in the sense of \eqref{eq:inclu3} and with the lowest computational cost, we therefore need to perform only one iteration of Algorithm~\ref{Algo:2}, with $\b_1$ specified as a function of $\epsilon_f$ by \eqref{eq:stat_prec_first} in Theorem~\ref{thm:main}. In general, however, the constants in \eqref{eq:stat_prec_first} are unknown and this approach is thus not feasible. Instead, the homotopy approach taken by Algorithm~\ref{Algo:2} ensures achieving the desired accuracy by gradually increasing the penalty weight.\footnote{In this context, homotopy loosely corresponds to the gradual enforcement of the constraints by increasing the penalty weight.} This homotopy approach increases the computational cost of Algorithm~\ref{Algo:2} only by a factor logarithmic in the $\epsilon_f$, as detailed in the proof of Corollary~\ref{cor:first}. %\textbf{AE: if we find some time, i'll add a couple of lines for how 1st order opt implies 2nd order opt for smooth constraints.} \subsection{Second-Order Optimality \label{sec:second-o-opt}} %{\color{red}{Ahmet (note to myself): not sure of the constants of trust-region, check again}} \\ Let us now consider the second-order optimality case where the solver in Step~2 is the trust region method developed in~\cite{cartis2012complexity}. The trust region method minimizes a quadratic approximation of the function within a dynamically updated trust-region radius. The second-order trust region method that we consider in this section makes use of the Hessian (or an approximation of the Hessian) of the augmented Lagrangian in addition to first-order oracles. As shown in~\cite{nouiehed2018convergence}, finding approximate second-order stationary points of convex-constrained problems is in general NP-hard. For this reason, we focus in this section on the special case of~\eqref{prob:01} with $g=0$. 
%We first give a theorem to show the convergence rate of the algorithm for second order stationarity: \\ %{\color{red}{Ahmet: I think that it is possible to remove sufficiently large k0 assumption here. }} \textbf{AE: not worth it really} %\begin{corollary} %\label{thm:main_second} %Under the assumptions of Theorem~\ref{thm:main}, the output of Algorithm~\ref{Algo:2} satisfies %\begin{align} %%& \dist^2( -\nabla_x \L_{\b_{k-1}}(x_k,y_k),\partial g(x_k)) + \| A(x_k)\|^2 \nonumber\\ %%& = \frac{1}{\b_{k-1}^2}\left( \frac{8 \lambda'_A\s_k^2 + 4}{ \nu^2} ( \lambda'_f+\lambda'_A y_{\max})^2 + 2\right) \nonumber \\ %%&=: \frac{Q}{\beta_{k-1}^2}. %\lambda_{\text{min}}(\nabla _{xx}\mathcal{L}_{\beta_{k-1}}(x_k, y_k)) \geq - \frac{C}{\beta_{k-1}} - \epsilon_{k-1}. %\end{align} %\end{corollary} % %We consider \textbf{AE: Ahmet to add a brief description of the algorithm.} Let us compute the total computational complexity of Algorithm~\ref{Algo:2} with the trust region method in Step~2, in terms of the number of calls made to the second-order oracle. By adapting the result in~\cite{cartis2012complexity} to our setup, we find that the number of (inner) iterations required in Step~2 of Algorithm~\ref{Algo:2} to produce $x_{k+1}$ is % \begin{equation} \mathcal{O}\left ( \frac{\lambda_{\beta_{k}, H}^2 (\mathcal{L}_{\beta_{k}}(x_1, y) - \min_{x}\mathcal{L}_{\beta_k}(x, y))}{\epsilon_k^3} \right), \label{eq:sec_inn_comp} \end{equation} % %<<<<<<< HEAD where $\lambda_{\beta, H}$ is the Lipschitz constant of the Hessian of the augmented Lagrangian, which is of the order of $\beta$, as can be proven similar to Lemma~\ref{lem:smoothness} and $x_1$ is the initial iterate of the given outer loop. In~\cite{cartis2012complexity}, the term $\mathcal{L}_{\beta}(x_1, y) - \min_{x}\mathcal{L}_{\beta}(x, y)$ is bounded by a constant independent of $\epsilon$. We assume a uniform bound for this quantity $\forall \beta_k$, instead of for one value of $\beta_k$ as in~\cite{cartis2012complexity}. 
Using \eqref{eq:sec_inn_comp} and Theorem~\ref{thm:main}, we arrive at the following: %======= %where $\lambda_{\beta_k, H}$ is the Lipschitz constant of the Hessian of the augmented Lagrangian, which is of the order of $\beta_k$, as can be proven similar to Lemma~\ref{lem:smoothness} and $x_1$ is the initial iterate of the given outer loop. %In~\cite{cartis2012complexity}, the term $\mathcal{L}_{\beta_k}(x_1, y) - \min_{x}\mathcal{L}_{\beta_k}(x, y)$ is bounded by a constant independent of $\epsilon_k$. %We assume a uniform bound for this quantity for all $\b$, instead of for one value of $\beta_k$ as in~\cite{cartis2012complexity}. Using \eqref{eq:sec_inn_comp} and Theorem~\ref{thm:main}, we arrive at the following result. The proof is very similar to that of Corollary~\ref{cor:first} and hence omitted here. %>>>>>>> 795311da274f2d8ab56d215c2fd481073e616732 % \begin{corollary}\label{cor:second} For $b>1$, let $\beta_k =b^k $ for every $k$. We assume that \begin{equation} \mathcal{L}_{\beta}(x_1, y) - \min_{x}\mathcal{L}_{\beta}(x, y) \leq L_{u},~~ \forall \beta. \end{equation} If we use the trust region method from~\cite{cartis2012complexity} for Step~2 of Algorithm~\ref{Algo:2}, the algorithm finds an $\epsilon$-second-order stationary point of~(\ref{prob:01}) in $T$ calls to the second-order oracle where % \begin{equation} T \leq \mathcal{O}\left( \frac{L_u Q'^{5}}{\epsilon^{5}} \log_b{\left( \frac{Q'}{\epsilon} \right)} \right) = \widetilde{\mathcal{O}}\left( \frac{L_u Q'^{5}}{\epsilon^{5}} \right). \end{equation} \end{corollary} % Before closing this section, we note that the remark after Corollary~\ref{cor:first} applies here as well. 
% diff --git a/NeurIPS 19/sections/preliminaries.tex b/NeurIPS 19/sections/preliminaries.tex index caf5af5..0d98724 100644 --- a/NeurIPS 19/sections/preliminaries.tex +++ b/NeurIPS 19/sections/preliminaries.tex @@ -1,183 +1,183 @@ %!TEX root = ../main.tex \vspace{-3mm} \section{Preliminaries \label{sec:preliminaries}} %\editf{Try to simplify this section as much as possible. We need to shorthen the paper.} \paragraph{\textbf{{Notation.}}} We use the notation $\langle\cdot ,\cdot \rangle $ and $\|\cdot\|$ for the {standard inner} product and the norm on $\RR^d$. For matrices, $\|\cdot\|$ and $\|\cdot\|_F$ denote the spectral and the Frobenius norms, respectively. %For a matrix $M$, its row and null spaces are denoted by $\row(A)$ and $\text{null}(A)$, respectively. For the convex function $g:\RR^d\rightarrow\RR$, the subdifferential set at $x\in \RR^d$ is denoted by $\partial g(x)$ and we will occasionally use the notation $\partial g(x)/\b = \{ z/\b : z\in \partial g(x)\}$. When presenting iteration complexity results, we often use $\widetilde{O}(\cdot)$ which suppresses the logarithmic dependencies. %The proximal operator $P_g:\RR^d\rightarrow\RR^d$ takes $x$ to %\begin{align} %P_g(x) = \underset{y}{\argmin} \, g(y) + \frac{1}{2}\|x-y\|^2, %\label{eq:defn of prox} %\end{align} %and, if $g=1_C$ is the indicator function of a convex set $C$, we will use the shorthand $P_C=P_{1_C}$ to show the orthogonal projection onto $C$. We use the indicator function $\delta_\mathcal{X}:\RR^d\rightarrow\RR$ of a set $\mathcal{X}\subset\RR^d$, which takes $x$ to \begin{align} \delta_\mathcal{X}(x) = \begin{cases} 0 & x \in \mathcal{X}\\ \infty & x\notin \mathcal{X}. \end{cases} \label{eq:indicator} \end{align} %The tangent cone of $C$ at $x\in C$ is denoted by $T_C(x)$ and The distance function from a point $x$ to $\mathcal{X}$ is denoted by $\dist(x,\mathcal{X}) = \min_{z\in \mathcal{X}} \|x-z\|$. For integers $k_0 \le k_1$, we denote $[k_0:k_1]=\{k_0,\ldots,k_1\}$. 
For an operator $A:\RR^d\rightarrow\RR^m$ with components $\{A_i\}_{i=1}^m$, we let $DA(x) \in \mathbb{R}^{m\times d}$ denote the Jacobian of $A$, where the $i$th row of $DA(x)$ is the gradient vector $\nabla A_i(x) \in \RR^d$. %We denote the Hessian of the augmented Lagrangian with respect to $x$ as $\nabla _{xx} \mathcal{L}_{\beta}(x,y)$. \\ %{\color{red} define $A_i$ rigorously.} \textbf{Smoothness.} We require $f:\RR^d\rightarrow\RR$ and $A:\RR^d\rightarrow \RR^m$ to be smooth, namely, there exist $\lambda_f,\lambda_A\ge 0$ such that % %\vspace{-.3cm} \begin{align} \| \nabla f(x) - \nabla f(x')\| & \le \lambda_f \|x-x'\|, \nonumber\\ \| DA(x) - DA(x') \| & \le \lambda_A \|x-x'\|, \label{eq:smoothness basic} \end{align} for every $x,x'\in \mathbb{R}^d$. \textbf{Augmented Lagrangian method (ALM)}. ALM is a classical algorithm, which first appeared in~\cite{hestenes1969multiplier, powell1969method} and extensively studied afterwards in~\cite{bertsekas1982constrained, birgin2014practical}. For solving \eqref{prob:01}, ALM suggests solving the problem % %\vspace{-.3cm} \begin{equation} \min_{x} \max_y \,\,\mathcal{L}_\beta(x,y) + g(x), \label{eq:minmax} \end{equation} %\vspace{-.5cm} % where, for penalty weight $\b>0$, $\mathcal{L}_\b$ is the corresponding augmented Lagrangian, defined as \begin{align} \label{eq:Lagrangian} \mathcal{L}_\beta(x,y) := f(x) + \langle A(x), y \rangle + \frac{\beta}{2}\|A(x)\|^2. \end{align} %\textbf{AE: Adding the bias $b$ doesn't seem to add much except making the presentation more clunky.} %\vspace{-.5cm} The minimax formulation in \eqref{eq:minmax} naturally suggests the following algorithm for solving \eqref{prob:01}. For dual step sizes $\{\s_k\}_k$, consider the iterations \begin{equation}\label{e:exac} x_{k+1} \in \underset{x}{\argmin} \,\, \mathcal{L}_{\beta}(x,y_k)+g(x), \end{equation} \begin{equation*} y_{k+1} = y_k+\s_k A(x_{k+1}). 
\end{equation*} However, computing $x_{k+1}$ above requires solving the nonconvex problem~\eqref{e:exac} to optimality, which is typically intractable. Instead, it is often easier to find an approximate first- or second-order stationary point of~\eqref{e:exac}. %We therefore consider an algorithm that only requires approximate stationarity in every iteration. Hence, we argue that by gradually improving the stationarity precision and increasing the penalty weight $\b$ above, we can reach a stationary point of the main problem in~\eqref{prob:01}, as detailed in Section~\ref{sec:AL algorithm}. {\textbf{{{Optimality conditions.}}} \label{sec:opt cnds}} {First-order necessary optimality conditions} for \eqref{prob:01} are well-studied. {Indeed, $x\in \RR^d$ is a first-order stationary point of~\eqref{prob:01} if there exists $y\in \RR^m$ such that \begin{align} \begin{cases} -\nabla f(x) - DA(x)^\top y \in \partial g(x)\\ A(x) = 0, \end{cases} \label{e:inclu1} \end{align} where $DA(x)$ is the Jacobian of $A$ at $x$. Recalling \eqref{eq:Lagrangian}, we observe that \eqref{e:inclu1} is equivalent to \begin{align} \begin{cases} -\nabla_x \mathcal{L}_\beta(x,y) \in \partial g(x)\\ A(x) = 0, \end{cases} \label{e:inclu2} \end{align} which is in turn the necessary optimality condition for \eqref{eq:minmax}.} %Note that \eqref{e:inclu2} is equivalent to %\begin{align} %\left[ %\begin{array}{c} %\nabla_x \L_\b(x,y) \\ %\nabla_y \L_\b(x,y) %\end{array} %\right] %\in %\left\{ %\left[ %\begin{array}{c} % v\\ %0 %\end{array} %\right] %: v\in \partial g(x) \right\} %, %\end{align} %which rephrases the stationarity condition in terms of the gradient of the duality gap of Program~\eqref{eq:Lagrangian}. \editf{mfs: check approx. 
optimality conditions, how they apply in this setting.} Inspired by this, we say that $x$ is an $(\epsilon_f,\b)$ first-order stationary point of \eqref{eq:minmax} if {there exists a $y \in \RR^m$} such that \begin{align} \begin{cases} \dist(-\nabla_x \mathcal{L}_\beta(x,y), \partial g(x)) \leq \epsilon_f \\ \| A(x) \| \leq \epsilon_f, \end{cases} \label{eq:inclu3} \end{align} for $\epsilon_f\ge 0$. In light of \eqref{eq:inclu3}, a suitable metric for evaluating the stationarity of a pair $(x,y)\in \RR^d\times \RR^m$ is \begin{align} \dist\left(-\nabla_x \mathcal{L}_\beta(x,y), \partial g(x) \right) + \|A(x)\| , \label{eq:cvg metric} \end{align} which we use as the first-order stopping criterion. %When $g=0$, it is also not difficult to verify that the expression in \eqref{eq:cvg metric} is the norm of the gradient of the duality gap in \eqref{eq:minmax}. As an example, for a convex set $\mathcal{X}\subset\RR^d$, suppose that $g = \delta_\mathcal{X}$ is the indicator function on $\mathcal{X}$.} Let also $T_\mathcal{X}(x) \subseteq \RR^d$ denote the tangent cone to $\mathcal{X}$ at $x$, and with $P_{T_\mathcal{X}(x)}:\RR^d\rightarrow\RR^d$ we denote the orthogonal projection onto this tangent cone. Then, for $u\in\RR^d$, it is not difficult to verify that \begin{align}\label{eq:dist_subgrad} \dist\left(u, \partial g(x) \right) = \| P_{T_\mathcal{X}(x)}(u) \|. \end{align} % When $g = 0$, a first-order stationary point $x\in \RR^d$ of \eqref{prob:01} is also second-order stationary if % %\vspace{-.5cm} \begin{equation} \lambda_{\text{min}}(\nabla _{xx} \mathcal{L}_{\beta}(x,y))\ge 0, \end{equation} %\vspace{-.5cm} % where $\nabla_{xx}\mathcal{L}_\b$ is the Hessian of $\mathcal{L}_\b$ with respect to $x$, and $\lambda_{\text{min}}(\cdot)$ returns the smallest eigenvalue of its argument. 
Analogously, $x$ is an $(\epsilon_f, \epsilon_s,\b)$ second-order stationary point if, in addition to \eqref{eq:inclu3}, it holds that % %\vspace{-.5cm} \begin{equation}\label{eq:sec_opt} \lambda_{\text{min}}(\nabla _{xx} \mathcal{L}_{\beta}(x,y)) \ge -\epsilon_s, \end{equation} for $\epsilon_s\ge 0$. %\vspace{-.5cm} % Naturally, for second-order stationarity, we use $\lambda_{\text{min}}(\nabla _{xx} \mathcal{L}_{\beta}(x,y))$ as the stopping criterion. \paragraph{{\textbf{Smoothness lemma.}}} This next result controls the smoothness of $\L_\b(\cdot,y)$ for a fixed $y$. The proof is standard but nevertheless is included in Appendix~\ref{sec:proof of smoothness lemma} for completeness. \begin{lemma}[\textbf{smoothness}]\label{lem:smoothness} For fixed $y\in \RR^m$ and $\rho,\rho'\ge 0$, it holds that \begin{align} \| \nabla_x \mathcal{L}_{\beta}(x, y)- \nabla_x \mathcal{L}_{\beta}(x', y) \| \le \lambda_\b \|x-x'\|, \end{align} - for every $x,x' \in \{ x'': \|A(x'') \|\le \rho, \|x''\|\le \rho'\}$, where + for every $x,x' \in \{ x'': \|x''\|\le \rho, \|A(x'') \|\le \rho'\}$, where \begin{align} \lambda_\beta -& \le \lambda_f + \sqrt{m}\lambda_A \|y\| + (\sqrt{m}\lambda_A\rho + d \lambda'^2_A )\b \nonumber\\ +& \le \lambda_f + \sqrt{m}\lambda_A \|y\| + (\sqrt{m}\lambda_A\rho' + d \lambda'^2_A )\b \nonumber\\ & =: \lambda_f + \sqrt{m}\lambda_A \|y\| + \lambda''(A,\rho,\rho') \b. \label{eq:smoothness of Lagrangian} \end{align} Above, $\lambda_f,\lambda_A$ were defined in (\ref{eq:smoothness basic}) and \begin{align} -\lambda'_A := \max_{\|x\|\le \rho'}\|DA(x)\|. +\lambda'_A := \max_{\|x\|\le \rho}\|DA(x)\|. 
\end{align} \end{lemma} diff --git a/NeurIPS 19/sections/related_works.tex b/NeurIPS 19/sections/related_works.tex index 562a826..d5727b4 100644 --- a/NeurIPS 19/sections/related_works.tex +++ b/NeurIPS 19/sections/related_works.tex @@ -1,87 +1,87 @@ %!TEX root = ../main.tex -\section{Related works \label{sec:related work}} +\section{Related Work \label{sec:related work}} ALM has a long history in the optimization literature, dating back to~\cite{hestenes1969multiplier, powell1969method}. In the special case of~\eqref{prob:01} with a convex function $f$ and a linear operator $A$, standard, inexact and linearized versions of ALM have been extensively studied~\cite{lan2016iteration,nedelcu2014computational,tran2018adaptive,xu2017inexact}. Classical works on ALM focused on the general template of~\eqref{prob:01} with nonconvex $f$ and nonlinear $A$, with arguably stronger assumptions and required exact solutions to the subproblems of the form \eqref{e:exac}, which appear in Step 2 of Algorithm~\ref{Algo:2}, see for instance \cite{bertsekas2014constrained}. A similar analysis was conducted in~\cite{fernandez2012local} for the general template of~\eqref{prob:01}. The authors considered inexact ALM and proved convergence rates for the outer iterates, under specific assumptions on the initialization of the dual variable. However, unlike our results, the authors did not analyze how to solve the subproblems inexactly and they did not provide total complexity results and verifiable conditions. %\textbf{AE: Mention if this paper was convex or nonconvex?} Problem~\eqref{prob:01} with similar assumptions to us is also studied in~\cite{birgin2016evaluation} and~\cite{cartis2018optimality} for first-order and second-order stationarity, respectively, with explicit iteration complexity analysis. As we have mentioned in Section~\ref{sec:cvg rate}, our iteration complexity results matches these theoretical algorithms with a simpler algorithm and a simpler analysis. 
In addition, these algorithms require setting final accuracies since they utilize this information in the algorithm. In contrast to \cite{birgin2016evaluation,cartis2018optimality}, Algorithm~\ref{Algo:2} does not set accuracies a priori. \cite{cartis2011evaluation} also considers the same template~\eqref{prob:01} for first-order stationarity with a penalty-type method instead of ALM. Even though the authors show $\mathcal{O}(1/\epsilon^2)$ complexity, this result is obtained by assuming that the penalty parameter remains bounded. We note that such an assumption can also be used to match our complexity results. \cite{bolte2018nonconvex} studies the general template~\eqref{prob:01} with specific assumptions involving local error bound conditions for~\eqref{prob:01}. These conditions are studied in detail in~\cite{bolte2017error}, but their validity for general SDPs~\eqref{eq:sdp} has never been established. This work also lacks the total iteration complexity analysis presented here. % that we can compare. Another work~\cite{clason2018acceleration} focused on solving~\eqref{prob:01} by adapting the primal-dual method of Chambolle and Pock~\cite{chambolle2011first}. The authors proved the convergence of the method and provided a convergence rate by imposing error bound conditions on the objective function that do not hold for standard SDPs.%~\eqref{eq:sdp}. %Some works also considered the application of ALM/ADMM to nonconvex problems~\cite{wang2015global, liu2017linearized}. %These works assume that the operator in~\eqref{prob:01} is linear, therefore, they do not apply to our setting. %since we have a nonlinear constraint in addition to a nonconvex objective function. \cite{burer2003nonlinear, burer2005local} are the first works that propose the splitting $X=UU^\top$ for solving SDPs of the form~\eqref{eq:sdp}. 
Following these works, the literature on Burer-Monteiro (BM) splitting for the large part focused on using ALM for solving the reformulated problem~\eqref{prob:nc}. However, this approach has a few drawbacks: First, it requires exact solutions in Step 2 of Algorithm~\ref{Algo:2} in theory, which in practice is replaced with inexact solutions. Second, their results only establish convergence without providing the rates. In this sense, our work provides a theoretical understanding of the BM splitting with inexact solutions to Step 2 of Algorithm~\ref{Algo:2} and complete iteration complexities. %\textbf{AE: Ahmet you discuss global optimality below but are you sure that's relevant to our work? in the sense that we just give an algorithm to solve to local optimality.} %Their practical stopping condition is also not analyzed theoretically. %In addition, Burer and Monteiro in~\cite{burer2005local} needed to bound the primal iterates they have to put an artificial bound to the primal domain which will be ineffective in practice; which is impossible to do without knowing the norm of the solution. % and their results do not extend to the case where projection in primal domain are required in each iteration. \cite{bhojanapalli2016dropping, park2016provable} are among the earliest efforts to show convergence rates for BM splitting, focusing on the special case of SDPs without any linear constraints. For these specific problems, they prove the convergence of gradient descent to global optima with convergence rates, assuming favorable initialization. These results, however, do not apply to general SDPs of the form~\eqref{eq:sdp} where the difficulty arises due to the linear constraints. 
%{\color{red} Ahmet: I have to cite this result bc it is very relevant.} %\textbf{AE: also if you do want to talk about global optimality you could also talk about the line of works that show no spurious local optima.} %In their followup work~, the authors considered projected gradient descent, but only for restricted strongly convex functions. %Their results are not able to extend to linear constraints and general convex functions. %, therefore not applicable to general SDPs. %They do not apply to the general semidefinite programming since $f(U)=\langle C, UU^\ast \rangle$. %, these conditions are not satisfied. %{\color{red} Ahmet: I have to cite this result bc it is relevant and the reviewers will complain if I don't} \cite{bhojanapalli2018smoothed} focused on the quadratic penalty formulation of~\eqref{prob:01}, namely, \begin{equation}\label{eq:pen_sdp} \min_{X\succeq 0} \langle C, X\rangle + \frac{\mu}{2} \| B(X)-b\|^2, \end{equation} which after BM splitting becomes \begin{equation}\label{eq:pen_nc} \min_{U\in\mathbb{R}^{d\times r}} \langle C, UU^\top \rangle + \frac{\mu}{2} \|B(UU^\top)-b\|^2, \end{equation} for which they study the optimality of the second-order stationary points. These results are for establishing a connection between the stationary points of~\eqref{eq:pen_nc} and global optima of~\eqref{eq:pen_sdp}. In contrast, we focus on the relation of the stationary points of~\eqref{eq:minmax} to the constrained problem~\eqref{prob:01}. %\textbf{AE: so i'm not sure why we are comparing with them. their work establish global optimality of local optima but we just find local optima. it seems to add to our work rather compare against it. you also had a paragraph for global optimality with close initialization earlier. 
could you put the two next to each other?} Another popular method for solving SDPs is due to~\cite{boumal2014manopt, boumal2016global, boumal2016non}, focusing on the case where the constraints in~\eqref{prob:01} can be written as a Riemannian manifold after BM splitting. In this case, the authors apply the Riemannian gradient descent and Riemannian trust region methods for obtaining first- and second-order stationary points, respectively. They obtain~$\mathcal{O}(1/\epsilon^2)$ complexity for finding first-order stationary points and~$\mathcal{O}(1/\epsilon^3)$ complexity for finding second-order stationary points. While these complexities appear better than ours, the smooth manifold requirement in these works is indeed restrictive. In particular, this requirement holds for max-cut and generalized eigenvalue problems, but it is not satisfied for other important SDPs such as quadratic programming (QAP), optimal power flow and clustering with general affine constraints. In addition, as noted in~\cite{boumal2016global}, the per-iteration cost of their method for the max-cut problem is an astronomical~$\mathcal{O}({d^6})$. % which is astronomical. %ly larger than our cost of $\mathcal{O}(n^2r)$ where $r \ll n$. %\cite{birgin2016evaluation} can handle the same problem but their algorithm is much more complicated than ours. Lastly, there also exists a line of work for solving SDPs in their original convex formulation, in a storage efficient way~\cite{nesterov2009primal, yurtsever2015universal, yurtsever2018conditional}. These works have global optimality guarantees by their virtue of directly solving the convex formulation. On the downside, these works require the use of eigenvalue routines and exhibit significantly slower convergence as compared to nonconvex approaches~\cite{jaggi2013revisiting}.