diff --git a/ICML19/Drafts/bibliography.bib b/ICML19/Drafts/bibliography.bib index 46bf2fb..8658e2c 100644 --- a/ICML19/Drafts/bibliography.bib +++ b/ICML19/Drafts/bibliography.bib @@ -1,709 +1,746 @@ %% This BibTeX bibliography file was created using BibDesk. %% http://bibdesk.sourceforge.net/ %% Saved with string encoding Unicode (UTF-8) % Created by Mehmet Fatih Sahin for nonconvex inexact augmented lagrangian paper % December 14, Friday. 01:52 am +@book{bauschke2011convex, + title={Convex analysis and monotone operator theory in Hilbert spaces}, + author={Bauschke, Heinz H and Combettes, Patrick L and others}, + volume={408}, + year={2011}, + publisher={Springer} +} + + +@article{waldspurger2018rank, + title={Rank optimality for the Burer-Monteiro factorization}, + author={Waldspurger, Ir{\`e}ne and Waters, Alden}, + journal={arXiv preprint arXiv:1812.03046}, + year={2018} +} + + +@inproceedings{jaggi2013revisiting, + title={Revisiting Frank-Wolfe: Projection-Free Sparse Convex Optimization.}, + author={Jaggi, Martin}, + booktitle={ICML (1)}, + pages={427--435}, + year={2013} +} + + + +@inproceedings{nesterov1983method, + title={A method for solving the convex programming problem with convergence rate O (1/k\^{} 2)}, + author={Nesterov, Yurii E}, + booktitle={Dokl. Akad. Nauk SSSR}, + volume={269}, + pages={543--547}, + year={1983} +} + + @inproceedings{raghavendra2008optimal, title={Optimal algorithms and inapproximability results for every CSP?}, author={Raghavendra, Prasad}, booktitle={Proceedings of the fortieth annual ACM symposium on Theory of computing}, pages={245--254}, year={2008}, organization={ACM} } @online{xiao2017/online, author = {Han Xiao and Kashif Rasul and Roland Vollgraf}, title = {Fashion-MNIST: a Novel Image Dataset for Benchmarking Machine Learning Algorithms}, date = {2017-08-28}, year = {2017}, eprintclass = {cs.LG}, eprinttype = {arXiv}, eprint = {cs.LG/1708.07747}, } @article{cartis2011evaluation, title={On the evaluation complexity of composite function minimization with applications to nonconvex nonlinear programming}, author={Cartis, Coralia and Gould, Nicholas IM and Toint, Philippe L}, journal={SIAM Journal on Optimization}, volume={21}, number={4}, pages={1721--1739}, year={2011}, publisher={SIAM} } @article{bolte2017error, title={From error bounds to the complexity of first-order descent methods for convex functions}, author={Bolte, J{\'e}r{\^o}me and Nguyen, Trong Phong and Peypouquet, Juan and Suter, Bruce W}, journal={Mathematical Programming}, volume={165}, number={2}, pages={471--507}, year={2017}, publisher={Springer} } @article{obozinski2011group, title={Group lasso with overlaps: the latent group lasso approach}, author={Obozinski, Guillaume and Jacob, Laurent and Vert, Jean-Philippe}, journal={arXiv preprint arXiv:1110.0413}, year={2011} } @article{tran2018smooth, title={A smooth primal-dual optimization framework for nonsmooth composite convex minimization}, author={Tran-Dinh, Quoc and Fercoq, Olivier and Cevher, Volkan}, journal={SIAM Journal on Optimization}, volume={28}, number={1}, pages={96--134}, year={2018}, publisher={SIAM} } @article{tran2018adaptive, title={An Adaptive Primal-Dual Framework for Nonsmooth Convex Minimization}, author={Tran-Dinh, Quoc and Alacaoglu, Ahmet and Fercoq, Olivier and Cevher, Volkan}, journal={arXiv preprint arXiv:1808.04648}, year={2018} } @article{arora2018compressed, title={A compressed sensing view of unsupervised text embeddings, bag-of-n-grams, and LSTMs}, author={Arora, Sanjeev and Khodak, Mikhail and Saunshi, Nikunj and Vodrahalli, Kiran}, year={2018} } @article{candes2008introduction, title={An introduction to compressive sampling}, author={Cand{\`e}s, Emmanuel J and Wakin, Michael B}, journal={IEEE signal processing magazine}, volume={25}, number={2}, pages={21--30}, year={2008}, publisher={IEEE} } @article{chen2001atomic, title={Atomic decomposition by basis pursuit}, author={Chen, Scott Shaobing and Donoho, David L and Saunders, Michael A}, journal={SIAM review}, volume={43}, number={1}, pages={129--159}, year={2001}, publisher={SIAM} } @article{flores2012complete, title={A complete characterization of strong duality in nonconvex optimization with a single constraint}, author={Flores-Baz{\'a}n, Fabi{\'a}n and Flores-Baz{\'a}n, Fernando and Vera, Cristi{\'a}n}, journal={Journal of Global Optimization}, volume={53}, number={2}, pages={185--201}, year={2012}, publisher={Springer} } @article{dai2002convergence, title={Convergence properties of the BFGS algoritm}, author={Dai, Yu-Hong}, journal={SIAM Journal on Optimization}, volume={13}, number={3}, pages={693--701}, year={2002}, publisher={SIAM} } @article{yang2015sdpnal, title={SDPNAL+: a majorized semismooth Newton-CG augmented Lagrangian method for semidefinite programming with nonnegative constraints}, author={Yang, Liuqin and Sun, Defeng and Toh, Kim-Chuan}, journal={Mathematical Programming Computation}, volume={7}, number={3}, pages={331--366}, year={2015}, publisher={Springer} } @article{mascarenhas2004bfgs, title={The BFGS method with exact line searches fails for non-convex objective functions}, author={Mascarenhas, Walter F}, journal={Mathematical Programming}, volume={99}, number={1}, pages={49--61}, year={2004}, publisher={Springer} } @inproceedings{boumal2016non, title={The non-convex Burer-Monteiro approach works on smooth semidefinite programs}, author={Boumal, Nicolas and Voroninski, Vlad and Bandeira, Afonso}, booktitle={Advances in Neural Information Processing Systems}, pages={2757--2765}, year={2016} } @inproceedings{bhojanapalli2016dropping, title={Dropping convexity for faster semi-definite optimization}, author={Bhojanapalli, Srinadh and Kyrillidis, Anastasios and Sanghavi, Sujay}, booktitle={Conference on Learning Theory}, pages={530--582}, year={2016} } @inproceedings{nesterov1983method, title={A method for solving the convex programming problem with convergence rate O (1/k\^{} 2)}, author={Nesterov, Yurii E}, booktitle={Dokl. Akad. Nauk SSSR}, volume={269}, pages={543--547}, year={1983} } @article{shi2017penalty, title={Penalty dual decomposition method for nonsmooth nonconvex optimization}, author={Shi, Qingjiang and Hong, Mingyi and Fu, Xiao and Chang, Tsung-Hui}, journal={arXiv preprint arXiv:1712.04767}, year={2017} } @article{fernandez2012local, title={Local convergence of exact and inexact augmented Lagrangian methods under the second-order sufficient optimality condition}, author={Fern{\'a}ndez, Dami{\'a}n and Solodov, Mikhail V}, journal={SIAM Journal on Optimization}, volume={22}, number={2}, pages={384--407}, year={2012}, publisher={SIAM} } @article{bolte2018nonconvex, title={Nonconvex Lagrangian-based optimization: monitoring schemes and global convergence}, author={Bolte, J{\'e}r{\^o}me and Sabach, Shoham and Teboulle, Marc}, journal={Mathematics of Operations Research}, year={2018}, publisher={INFORMS} } @article{nouiehed2018convergence, title={Convergence to Second-Order Stationarity for Constrained Non-Convex Optimization}, author={Nouiehed, Maher and Lee, Jason D and Razaviyayn, Meisam}, journal={arXiv preprint arXiv:1810.02024}, year={2018} } @article{cartis2018optimality, title={Optimality of orders one to three and beyond: characterization and evaluation complexity in constrained nonconvex optimization}, author={Cartis, Coralia and Gould, Nicholas IM and Toint, Ph L}, journal={Journal of Complexity}, year={2018}, publisher={Elsevier} } @article{nesterov2009primal, title={Primal-dual subgradient methods for convex problems}, author={Nesterov, Yurii}, journal={Mathematical programming}, volume={120}, number={1}, pages={221--259}, year={2009}, publisher={Springer} } @inproceedings{yurtsever2015universal, title={A universal primal-dual convex optimization framework}, author={Yurtsever, Alp and Dinh, Quoc Tran and Cevher, Volkan}, booktitle={Advances in Neural Information Processing Systems}, pages={3150--3158}, year={2015} } @article{yurtsever2018conditional, title={A Conditional Gradient Framework for Composite Convex Minimization with Applications to Semidefinite Programming}, author={Yurtsever, Alp and Fercoq, Olivier and Locatello, Francesco and Cevher, Volkan}, journal={arXiv preprint arXiv:1804.08544}, year={2018} } @article{bertsekas1976penalty, title={On penalty and multiplier methods for constrained minimization}, author={Bertsekas, Dimitri P}, journal={SIAM Journal on Control and Optimization}, volume={14}, number={2}, pages={216--235}, year={1976}, publisher={SIAM} } @article{tran2018adaptive, title={An Adaptive Primal-Dual Framework for Nonsmooth Convex Minimization}, author={Tran-Dinh, Quoc and Alacaoglu, Ahmet and Fercoq, Olivier and Cevher, Volkan}, journal={arXiv preprint arXiv:1808.04648}, year={2018} } @article{hestenes1969multiplier, title={Multiplier and gradient methods}, author={Hestenes, Magnus R}, journal={Journal of optimization theory and applications}, volume={4}, number={5}, pages={303--320}, year={1969}, publisher={Springer} } @article{powell1969method, title={A method for nonlinear constraints in minimization problems}, author={Powell, Michael JD}, journal={Optimization}, pages={283--298}, year={1969}, publisher={Academic Press} } @book{bertsekas2014constrained, title={Constrained optimization and Lagrange multiplier methods}, author={Bertsekas, Dimitri P}, year={2014}, publisher={Academic press} } @inproceedings{kulis2007fast, title={Fast low-rank semidefinite programming for embedding and clustering}, author={Kulis, Brian and Surendran, Arun C and Platt, John C}, booktitle={Artificial Intelligence and Statistics}, pages={235--242}, year={2007} } @article{xu2017inexact, title={Iteration complexity of inexact augmented Lagrangian methods for constrained convex programming}, author={Xu, Yangyang}, journal={arXiv preprint arXiv:1711.05812v2}, year={2017} } @article{nedelcu2014computational, title={Computational complexity of inexact gradient augmented Lagrangian methods: application to constrained MPC}, author={Nedelcu, Valentin and Necoara, Ion and Tran-Dinh, Quoc}, journal={SIAM Journal on Control and Optimization}, volume={52}, number={5}, pages={3109--3134}, year={2014}, publisher={SIAM} } @article{pataki1998rank, title={On the rank of extreme matrices in semidefinite programs and the multiplicity of optimal eigenvalues}, author={Pataki, G{\'a}bor}, journal={Mathematics of operations research}, volume={23}, number={2}, pages={339--358}, year={1998}, publisher={INFORMS} } @article{Barvinok1995problems, title={Problems of distance geometry and convex properties of quadratic maps}, author={Barvinok, Alexander I.}, journal={Discrete \& Computational Geometry}, volume={13}, number={2}, pages={189--202}, year={1995}, publisher={Springer} } @article{cartis2012complexity, title={Complexity bounds for second-order optimality in unconstrained optimization}, author={Cartis, Coralia and Gould, Nicholas IM and Toint, Ph L}, journal={Journal of Complexity}, volume={28}, number={1}, pages={93--108}, year={2012}, publisher={Elsevier} } @article{ghadimi2016accelerated, title={Accelerated gradient methods for nonconvex nonlinear and stochastic programming}, author={Ghadimi, Saeed and Lan, Guanghui}, journal={Mathematical Programming}, volume={156}, number={1-2}, pages={59--99}, year={2016}, publisher={Springer} } @article{boumal2014manopt, Author = {Boumal, Nicolas and Mishra, Bamdev and Absil, P-A and Sepulchre, Rodolphe}, Journal = {The Journal of Machine Learning Research}, Number = {1}, Pages = {1455--1459}, Publisher = {JMLR. org}, Title = {Manopt, a Matlab toolbox for optimization on manifolds}, Volume = {15}, Year = {2014}} @article{boumal2016global, Author = {Boumal, Nicolas and Absil, P-A and Cartis, Coralia}, Journal = {arXiv preprint arXiv:1605.08101}, Title = {Global rates of convergence for nonconvex optimization on manifolds}, Year = {2016}} @inproceedings{ge2016efficient, Author = {Ge, Rong and Jin, Chi and Netrapalli, Praneeth and Sidford, Aaron and others}, Booktitle = {International Conference on Machine Learning}, Pages = {2741--2750}, Title = {Efficient algorithms for large-scale generalized eigenvector computation and canonical correlation analysis}, Year = {2016}} @inproceedings{boumal2016non, Author = {Boumal, Nicolas and Voroninski, Vlad and Bandeira, Afonso}, Booktitle = {Advances in Neural Information Processing Systems}, Pages = {2757--2765}, Title = {The non-convex Burer-Monteiro approach works on smooth semidefinite programs}, Year = {2016}} @article{davis2011university, Author = {Davis, Timothy A and Hu, Yifan}, Journal = {ACM Transactions on Mathematical Software (TOMS)}, Number = {1}, Pages = {1}, Publisher = {ACM}, Title = {The University of Florida sparse matrix collection}, Volume = {38}, Year = {2011}} @article{clason2018acceleration, Author = {Clason, Christian and Mazurenko, Stanislav and Valkonen, Tuomo}, Journal = {arXiv preprint arXiv:1802.03347}, Title = {Acceleration and global convergence of a first-order primal--dual method for nonconvex problems}, Year = {2018}} @article{chambolle2011first, Author = {Chambolle, Antonin and Pock, Thomas}, Journal = {Journal of mathematical imaging and vision}, Number = {1}, Pages = {120--145}, Publisher = {Springer}, Title = {A first-order primal-dual algorithm for convex problems with applications to imaging}, Volume = {40}, Year = {2011}} @article{tran2018smooth, Author = {Tran-Dinh, Quoc and Fercoq, Olivier and Cevher, Volkan}, Journal = {SIAM Journal on Optimization}, Number = {1}, Pages = {96--134}, Publisher = {SIAM}, Title = {A smooth primal-dual optimization framework for nonsmooth composite convex minimization}, Volume = {28}, Year = {2018}} @article{bhojanapalli2018smoothed, Author = {Bhojanapalli, Srinadh and Boumal, Nicolas and Jain, Prateek and Netrapalli, Praneeth}, Journal = {arXiv preprint arXiv:1803.00186}, Title = {Smoothed analysis for low-rank solutions to semidefinite programs in quadratic penalty form}, Year = {2018}} @article{park2016provable, Author = {Park, Dohyung and Kyrillidis, Anastasios and Bhojanapalli, Srinadh and Caramanis, Constantine and Sanghavi, Sujay}, Journal = {arXiv preprint arXiv:1606.01316}, Title = {Provable Burer-Monteiro factorization for a class of norm-constrained matrix problems}, Year = {2016}} @article{mixon2016clustering, Author = {Mixon, Dustin G and Villar, Soledad and Ward, Rachel}, Journal = {arXiv preprint arXiv:1602.06612}, Title = {Clustering subgaussian mixtures by semidefinite programming}, Year = {2016}} @article{yurtsever2018conditional, Author = {Yurtsever, Alp and Fercoq, Olivier and Locatello, Francesco and Cevher, Volkan}, Journal = {arXiv preprint arXiv:1804.08544}, Title = {A Conditional Gradient Framework for Composite Convex Minimization with Applications to Semidefinite Programming}, Year = {2018}} @inproceedings{jin2017escape, Author = {Jin, Chi and Ge, Rong and Netrapalli, Praneeth and Kakade, Sham M and Jordan, Michael I}, Booktitle = {International Conference on Machine Learning}, Pages = {1724--1732}, Title = {How to Escape Saddle Points Efficiently}, Year = {2017}} @article{birgin2016evaluation, title={Evaluation complexity for nonlinear constrained optimization using unscaled KKT conditions and high-order models}, author={Birgin, Ernesto G and Gardenghi, JL and Martínez, Jos{\'e} Mario and Santos, SA and Toint, Ph L}, journal={SIAM Journal on Optimization}, volume={26}, number={2}, pages={951--967}, year={2016}, publisher={SIAM} } @article{liu2017linearized, Author = {Liu, Qinghua and Shen, Xinyue and Gu, Yuantao}, Journal = {arXiv preprint arXiv:1705.02502}, Title = {Linearized admm for non-convex non-smooth optimization with convergence analysis}, Year = {2017}} @article{wang2015global, Author = {Wang, Yu and Yin, Wotao and Zeng, Jinshan}, Journal = {arXiv preprint arXiv:1511.06324}, Title = {Global convergence of ADMM in nonconvex nonsmooth optimization}, Year = {2015}} @book{bertsekas2014constrained, title={Constrained optimization and Lagrange multiplier methods}, author={Bertsekas, Dimitri P}, year={2014}, publisher={Academic press} } @article{lan2016iteration, Author = {Lan, Guanghui and Monteiro, Renato DC}, Journal = {Mathematical Programming}, Number = {1-2}, Pages = {511--547}, Publisher = {Springer}, Title = {Iteration-complexity of first-order augmented Lagrangian methods for convex programming}, Volume = {155}, Year = {2016}} @article{nedelcu2014computational, title={Computational complexity of inexact gradient augmented Lagrangian methods: application to constrained MPC}, author={Nedelcu, Valentin and Necoara, Ion and Tran-Dinh, Quoc}, journal={SIAM Journal on Control and Optimization}, volume={52}, number={5}, pages={3109--3134}, year={2014}, publisher={SIAM} } @book{bertsekas1999nonlinear, title={Nonlinear programming}, author={Bertsekas, Dimitri P} } @article{hestenes1969multiplier, title={Multiplier and gradient methods}, author={Hestenes, Magnus R}, journal={Journal of optimization theory and applications}, volume={4}, number={5}, pages={303--320}, year={1969}, publisher={Springer} } @incollection{powell1978fast, title={A fast algorithm for nonlinearly constrained optimization calculations}, author={Powell, Michael JD}, booktitle={Numerical analysis}, pages={144--157}, year={1978}, publisher={Springer} } @article{burer2005local, title={Local minima and convergence in low-rank semidefinite programming}, author={Burer, Samuel and Monteiro, Renato DC}, journal={Mathematical Programming}, volume={103}, number={3}, pages={427--444}, year={2005}, publisher={Springer} } @article{burer2003nonlinear, title={A nonlinear programming algorithm for solving semidefinite programs via low-rank factorization}, author={Burer, Samuel and Monteiro, Renato DC}, journal={Mathematical Programming}, volume={95}, number={2}, pages={329--357}, year={2003}, publisher={Springer} } @article{singer2011angular, title={Angular synchronization by eigenvectors and semidefinite programming}, author={Singer, Amit}, journal={Applied and computational harmonic analysis}, volume={30}, number={1}, pages={20}, year={2011}, publisher={NIH Public Access} } @article{singer2011three, title={Three-dimensional structure determination from common lines in cryo-EM by eigenvectors and semidefinite programming}, author={Singer, Amit and Shkolnisky, Yoel}, journal={SIAM journal on imaging sciences}, volume={4}, number={2}, pages={543--572}, year={2011}, publisher={SIAM} } @inproceedings{song2007dependence, title={A dependence maximization view of clustering}, author={Song, Le and Smola, Alex and Gretton, Arthur and Borgwardt, Karsten M}, booktitle={Proceedings of the 24th international conference on Machine learning}, pages={815--822}, year={2007}, organization={ACM} } @inproceedings{mossel2015consistency, title={Consistency thresholds for the planted bisection model}, author={Mossel, Elchanan and Neeman, Joe and Sly, Allan}, booktitle={Proceedings of the forty-seventh annual ACM symposium on Theory of computing}, pages={69--75}, year={2015}, organization={ACM} } @incollection{lovasz2003semidefinite, title={Semidefinite programs and combinatorial optimization}, author={Lov{\'a}sz, L{\'a}szl{\'o}}, booktitle={Recent advances in algorithms and combinatorics}, pages={137--194}, year={2003}, publisher={Springer} } @article{khot2011grothendieck, title={Grothendieck-type inequalities in combinatorial optimization}, author={Khot, Subhash and Naor, Assaf}, journal={arXiv preprint arXiv:1108.2464}, year={2011} } @article{park2016provable, title={Provable Burer-Monteiro factorization for a class of norm-constrained matrix problems}, author={Park, Dohyung and Kyrillidis, Anastasios and Bhojanapalli, Srinadh and Caramanis, Constantine and Sanghavi, Sujay}, journal={arXiv preprint arXiv:1606.01316}, year={2016} } @article{mixon2016clustering, title={Clustering subgaussian mixtures by semidefinite programming}, author={Mixon, Dustin G and Villar, Soledad and Ward, Rachel}, journal={arXiv preprint arXiv:1602.06612}, year={2016} } @inproceedings{kulis2007fast, title={Fast low-rank semidefinite programming for embedding and clustering}, author={Kulis, Brian and Surendran, Arun C and Platt, John C}, booktitle={Artificial Intelligence and Statistics}, pages={235--242}, year={2007} } @article{Peng2007, Author = {Peng, J. and Wei, Y.}, Date-Added = {2017-10-26 15:22:37 +0000}, Date-Modified = {2017-10-26 15:30:53 +0000}, Journal = {SIAM J. Optim.}, Number = {1}, Pages = {186--205}, Title = {Approximating {K}--means--type clustering via semidefinite programming}, Volume = {18}, Year = {2007}} @book{fletcher2013practical, title={Practical methods of optimization}, author={Fletcher, Roger}, year={2013}, publisher={John Wiley \& Sons} } @article{Bora2017, title = {Compressed {Sensing} using {Generative} {Models}}, url = {http://arxiv.org/abs/1703.03208}, urldate = {2018-10-25}, journal = {arXiv:1703.03208 [cs, math, stat]}, author = {Bora, Ashish and Jalal, Ajil and Price, Eric and Dimakis, Alexandros G.}, month = mar, year = {2017}, note = {arXiv: 1703.03208}, keywords = {Statistics - Machine Learning, Computer Science - Machine Learning, Computer Science - Information Theory}, } @book{nocedal2006numerical, title={Numerical Optimization}, author={Nocedal, J. and Wright, S.}, isbn={9780387400655}, lccn={2006923897}, series={Springer Series in Operations Research and Financial Engineering}, url={https://books.google.ch/books?id=VbHYoSyelFcC}, year={2006}, publisher={Springer New York} } @article{parikh2014proximal, title={Proximal algorithms}, author={Parikh, Neal and Boyd, Stephen and others}, journal={Foundations and Trends{\textregistered} in Optimization}, volume={1}, number={3}, pages={127--239}, year={2014}, publisher={Now Publishers, Inc.} } \ No newline at end of file diff --git a/ICML19/Drafts/main.tex b/ICML19/Drafts/main.tex index 81590dd..62feec4 100644 --- a/ICML19/Drafts/main.tex +++ b/ICML19/Drafts/main.tex @@ -1,173 +1,173 @@ %%%%%%%% ICML 2019 EXAMPLE LATEX SUBMISSION FILE %%%%%%%%%%%%%%%%% \documentclass{article} % Recommended, but optional, packages for figures and better typesetting: \usepackage{microtype} \usepackage{graphicx} \usepackage{subfigure} \usepackage{booktabs} % for professional tables % hyperref makes hyperlinks in the resulting PDF. % If your build breaks (sometimes temporarily if a hyperlink spans a page) % please comment out the following usepackage line and replace % \usepackage{icml2019} with \usepackage[nohyperref]{icml2019} above. \usepackage{hyperref} % Attempt to make hyperref and algorithmic work together better: \newcommand{\theHalgorithm}{\arabic{algorithm}} % Use the following line for the initial blind version submitted for review: \usepackage{icml2019} % If accepted, instead use the following line for the camera-ready submission: %\usepackage[accepted]{icml2019} % The \icmltitle you define below is probably too long as a header. % Therefore, a short form for the running title is supplied here: -\icmltitlerunning{Inexact Augmented Lagrangian Framework for Non-Convex Optimization} +\icmltitlerunning{Inexact Augmented Lagrangian Framework for Nonconvex Optimization} % Our own packages \usepackage{amsthm} \usepackage{amsmath} \usepackage{amssymb} \usepackage{graphicx} \usepackage{algorithm} \usepackage{algorithmic} \usepackage{color} %\usepackage{cite} \usepackage{enumitem} % AE's commands \newcommand{\RR}{\mathbb{R}} \newcommand{\edita}[1]{{\color{blue} #1}} \newcommand{\notea}[1]{{\color{magenta} \textbf{Note:AE: #1}}} \newcommand{\ol}{\overline} \newcommand{\Null}{\operatorname{null}} \newcommand{\relint}{\operatorname{relint}} \newcommand{\row}{\operatorname{row}} \newcommand{\s}{\sigma} \newcommand{\editaa}[1]{{\color{red} #1}} \renewcommand{\b}{\beta} \renewcommand{\L}{\mathcal{L}} % Lagrangian \renewcommand{\i}{\iota} \newcommand{\g}{\gamma} \newcommand{\cone}{\operatorname{cone}} \newcommand{\argmin}{\operatorname{argmin}} \newcommand{\dist}{\operatorname{dist}} % MFS's commands \newcommand{\editf}[1]{{\color[rgb]{1,0.4,0.4} #1}} %\DeclareMathOperator*{\argmax}{argmax} % thin space, limits underneath in displays % Theorem style \newtheorem{theorem}{Theorem}[section] % reset theorem numbering for each section \newtheorem{question}[theorem]{Question} \newtheorem{lemma}[theorem]{Lemma} \newtheorem{corollary}[theorem]{Corollary} \newtheorem{definition}{Definition} \newtheorem{proposition}{Proposition} \begin{document} \twocolumn[ -\icmltitle{An Inexact Augmented Lagrangian Framework \\ for Non-Convex Optimization with Nonlinear Constraints} +\icmltitle{An Inexact Augmented Lagrangian Framework \\ for Nonconvex Optimization with Nonlinear Constraints} % It is OKAY to include author information, even for blind % submissions: the style file will automatically remove it for you % unless you've provided the [accepted] option to the icml2019 % package. % List of affiliations: The first argument should be a (short) % identifier you will use later to specify author affiliations % Academic affiliations should list Department, University, City, Region, Country % Industry affiliations should list Company, City, Region, Country % You can specify symbols, otherwise they are numbered in order. % Ideally, you should not use this facility. Affiliations will be numbered % in order of appearance and this is the preferred way. \icmlsetsymbol{equal}{*} \begin{icmlauthorlist} \icmlauthor{Aeiau Zzzz}{equal,to} \icmlauthor{Bauiu C.~Yyyy}{equal,to,goo} \icmlauthor{Cieua Vvvvv}{goo} \icmlauthor{Iaesut Saoeu}{ed} \icmlauthor{Fiuea Rrrr}{to} \icmlauthor{Tateu H.~Yasehe}{ed,to,goo} \icmlauthor{Aaoeu Iasoh}{goo} \icmlauthor{Buiui Eueu}{ed} \icmlauthor{Aeuia Zzzz}{ed} \icmlauthor{Bieea C.~Yyyy}{to,goo} \icmlauthor{Teoau Xxxx}{ed} \icmlauthor{Eee Pppp}{ed} \end{icmlauthorlist} \icmlaffiliation{to}{Department of Computation, University of Torontoland, Torontoland, Canada} \icmlaffiliation{goo}{Googol ShallowMind, New London, Michigan, USA} \icmlaffiliation{ed}{School of Computation, University of Edenborrow, Edenborrow, United Kingdom} %\icmlcorrespondingauthor{Cieua Vvvvv}{c.vvvvv@googol.com} %\icmlcorrespondingauthor{Eee Pppp}{ep@eden.co.uk} % You may provide any keywords that you % find helpful for describing your paper; these are used to populate % the "keywords" metadata in the PDF but will not be shown in the document \icmlkeywords{Machine Learning, ICML} \vskip 0.3in ] % this must go after the closing bracket ] following \twocolumn[ ... % This command actually creates the footnote in the first column % listing the affiliations and the copyright notice. % The command takes one argument, which is text to display at the start of the footnote. % The \icmlEqualContribution command is standard text for equal contribution. % Remove it (just {}) if you do not need this facility. %\printAffiliationsAndNotice{} % leave blank if no need to mention equal contribution \printAffiliationsAndNotice{\icmlEqualContribution} % otherwise use the standard text. %\input{preamble.tex} \input{sections/abstract.tex} \input{sections/introduction.tex} \input{sections/preliminaries.tex} \input{sections/AL.tex} \input{sections/guarantees.tex} %\input{sections/guarantees2.tex} \input{sections/related_works.tex} \input{sections/experiments.tex} %\begin{acknowledgements} %If you'd like to thank anyone, place your comments here %and remove the percent signs. %\end{acknowledgements} % BibTeX users please use one of \newpage -%\bibliographystyle{icml2019} % basic style, author-year citations -%\bibliography{bibliography.bib} % name your BibTeX data base +\bibliographystyle{icml2019} % basic style, author-year citations +\bibliography{bibliography.bib} % name your BibTeX data base \newpage \appendix \input{sections/AL_theory.tex} \input{sections/appendix.tex} \end{document} % end of file template.tex diff --git a/ICML19/Drafts/sections/AL.tex b/ICML19/Drafts/sections/AL.tex index 868450f..ece7094 100644 --- a/ICML19/Drafts/sections/AL.tex +++ b/ICML19/Drafts/sections/AL.tex @@ -1,53 +1,55 @@ \section{Algorithm \label{sec:AL algorithm}} -To solve the equivalent formulation presented in \eqref{eq:minmax}, we propose the inexact ALM (iALM), detailed in Algorithm~\ref{Algo:2}. +To solve the formulation presented in \eqref{eq:minmax}, we propose the inexact ALM (iALM), detailed in Algorithm~\ref{Algo:2}. -An iteration $k$ of Algorithm~\ref{Algo:2} calls in Step 2 a solver that finds an approximate stationary point of the augmented Lagrangian $\L_{\b_k}(\cdot,y_k)$ with the accuracy of $\epsilon_{k+1}$, and this accuracy gradually increases in a controlled fashion. +An iteration $k$, Algorithm~\ref{Algo:2} calls in Step 2 a solver that finds an approximate stationary point of the augmented Lagrangian $\L_{\b_k}(\cdot,y_k)$ with the accuracy of $\epsilon_{k+1}$, and this accuracy gradually increases in a controlled fashion. -The increasing sequence of penalty weights $\{\b_k\}_k$ and the dual update (Steps 4 and 5) are responsible for continuously enforcing the constraints in~\eqref{prob:01}. As we will see in the convergence analysis, the particular choice of the dual step size $\s_k$ in Algorithm~\ref{Algo:2} ensures that the dual variable $y_k$ remains bounded; see~\cite{bertsekas1976penalty} for a precedent in the ALM literature. +The increasing sequence of penalty weights $\{\b_k\}_k$ and the dual update (Steps 4 and 5) are responsible for continuously enforcing the constraints in~\eqref{prob:01}. As we will see in the convergence analysis, the particular choice of the dual step size $\s_k$ in Algorithm~\ref{Algo:2} ensures that the dual variable $y_k$ remains bounded; see~\cite{bertsekas1976penalty} for a precedent in the ALM literature where a similar choice for $\sigma_k$ is considered. -Step 3 of Algorithm~\ref{Algo:2} removes pathological cases with divergent iterates. As an example, suppose that $g=1_\mathcal{X}$ in \eqref{prob:01} is the indicator function for a bounded convex set $\mathcal{X}\subset \RR^d$ and take $\rho' > \max_{x\in \mathcal{X}} \|x\|$. Then, for sufficiently large $k$, it is not difficult to verify that all the iterates of Algorithm~\ref{Algo:2} automatically satisfy $\|x_k\|\le \rho'$ without the need to execute Step 3. +Step 3 of Algorithm~\ref{Algo:2} removes pathological cases with divergent iterates. As an example, suppose that $g=\delta_\mathcal{X}$ in \eqref{prob:01} is the indicator function for a bounded convex set $\mathcal{X}\subset \RR^d$ and take $\rho' > \max_{x\in \mathcal{X}} \|x\|$. Then, for sufficiently large $k$, it is not difficult to verify that all the iterates of Algorithm~\ref{Algo:2} automatically satisfy $\|x_k\|\le \rho'$ without the need to execute Step 3. \begin{algorithm}[h!] \begin{algorithmic} -\STATE \textbf{Input:} $\rho,\rho',\rho''>0$. A non-decreasing, positive, unbounded sequence $\{\b_k\}_{k\ge 1}$, stopping threshold $\tau$. +\STATE \textbf{Input:} $\rho,\rho',\rho''>0$. A non-decreasing, positive, unbounded sequence $\{\b_k\}_{k\ge 1}$, stopping threshold $\tau_f$ and $\tau_s$. \STATE \textbf{Initialization:} $x_{1}\in \RR^d$ such that $\|A(x_1)\|\le \rho$ and $\|x_1\|\le \rho'$, $y_0\in \RR^m$, $\s_1$. %For $k=1,2,\ldots$, execute\\ \FOR{$k=1,2,\dots$} \STATE \begin{enumerate}[leftmargin=*] \item \textbf{(Update tolerance)} $\epsilon_{k+1} = 1/\b_k$. %\begin{align} %\beta_k = \frac{\beta_{1}}{\log 2}\sqrt{k}\log^2(k+1) . %\end{align} \item \textbf{(Inexact primal solution)} Obtain $x_{k+1}\in \RR^d$ such that \begin{equation*} \dist(-\nabla_x \L_{\beta_k} (x_{k+1},y_k), \partial g(x_{k+1}) ) \le \epsilon_{k+1} \end{equation*} for first-order stationarity and, in addition, \begin{equation*} \lambda_{\text{min}}(\nabla _{xx}\mathcal{L}_{\beta_k}(x_{k+1}, y_k)) \ge -\epsilon_{k+1} \end{equation*} for second-order-stationarity. \item \textbf{(Control)} If necessary, project $x_{k+1}$ to ensure that $\|x_{k+1}\|\le \rho'$.\\ \item \textbf{(Update dual step size)} \begin{align*} \s_{k+1} & = \s_{1} \min\Big( \frac{\|A(x_1)\| \log^2 2 }{\|A(x_{k+1})\| (k+1)\log^2(k+2)} ,1 \Big). \end{align*} \item \textbf{(Dual ascent)} $y_{k+1} = y_{k} + \sigma_{k+1}A(x_{k+1})$. \item \textbf{(Stopping criterion)} If \begin{align*} -& \dist^2(-\nabla_x \L_{\b_k}(x_{k+1}),\partial g(x_{k+1}) \nonumber\\ -& \qquad + \s_{k+1} \|A(x_{k+1})\|^2 \le \tau, +& \dist(-\nabla_x \L_{\b_k}(x_{k+1}),\partial g(x_{k+1}) \nonumber\\ +& \qquad + \s_{k+1} \|A(x_{k+1})\| \le \tau_f, \end{align*} + +(and $\lambda_{\text{min}}(\nabla _{xx}\mathcal{L}_{\beta_{k}}(x_{k+1}, y_k)) \geq -\tau_s$ if second-order stationarity is required), then quit and return $x_{k+1}$ as an (approximate) stationary point of~\eqref{prob:01}. \end{enumerate} \ENDFOR \end{algorithmic} \caption{Inexact AL for solving~\eqref{prob:01}} \label{Algo:2} \end{algorithm} diff --git a/ICML19/Drafts/sections/AL_theory.tex b/ICML19/Drafts/sections/AL_theory.tex index 3d95abb..63ecac7 100644 --- a/ICML19/Drafts/sections/AL_theory.tex +++ b/ICML19/Drafts/sections/AL_theory.tex @@ -1,144 +1,144 @@ \section{Proof of Theorem \ref{thm:main} \label{sec:theory}} -{\color{red} Ahmet: I'll modify with $Ax-b$, and also remove the assumption on $k_0$, also add the small proof for second order stationarity we discussed with Armin} \textbf{AE: IMHO, the first two are not worth the effort right now.} + For every $k\ge2$, recall from (\ref{eq:Lagrangian}) and Step~2 of Algorithm~\ref{Algo:2} that $x_{k}$ satisfies \begin{align} & \dist(-\nabla f(x_k) - DA(x_k)^\top y_{k-1} \nonumber\\ & \qquad - \b_{k-1} DA(x_{k})^\top A(x_k) ,\partial g(x_k) ) \nonumber\\ & = \dist(-\nabla_x \L_{\b_{k-1}} (x_k ,y_{k-1}) ,\partial g(x_k) ) \le \epsilon_{k}. \end{align} With an application of the triangle inequality, it follows that \begin{align} & \dist( -\b_{k-1} DA(x_k)^\top A(x_k) , \partial g(x_k) ) \nonumber\\ & \qquad \le \| \nabla f(x_k )\| + \| DA(x_k)^\top y_{k-1}\| + \epsilon_k, \end{align} which in turn implies that \begin{align} & \dist( -DA(x_k)^\top A(x_k) , \partial g(x_k)/ \b_{k-1} ) \nonumber\\ & \le \frac{ \| \nabla f(x_k )\|}{\b_{k-1} } + \frac{\| DA(x_k)^\top y_{k-1}\|}{\b_{k-1} } + \frac{\epsilon_k}{\b_{k-1} } \nonumber\\ & \le \frac{\lambda'_f+\lambda'_A \|y_{k-1}\|+\epsilon_k}{\b_{k-1}} , \label{eq:before_restriction} \end{align} where $\lambda'_f,\lambda'_A$ were defined in \eqref{eq:defn_restricted_lipsichtz}. %For example, when $g=\delta_\mathcal{X}$ is the indicator , then \eqref{eq:before_restriction} reads as %\begin{align} %& \| P_{T_C(x_k)} DA(x_k)^\top A(x_k) \| \nonumber\\ %& \qquad \le \frac{ \lambda'_f+ \lambda'_A \|y_{k-1}\| + \epsilon_k }{\b_{k-1} } . %\label{eq:before} %\end{align} We next translate \eqref{eq:before_restriction} into a bound on the feasibility gap $\|A(x_k)\|$. Using the regularity condition \eqref{eq:regularity}, the left-hand side of \eqref{eq:before_restriction} can be bounded below as \begin{align} & \dist( -DA(x_k)^\top A(x_k) , \partial g(x_k)/ \b_{k-1} ) \nonumber\\ & \ge \nu \|A(x_k) \|, \qquad \text{(see (\ref{eq:regularity}))} \label{eq:restrited_pre} \end{align} provided that $\rho,\rho'$ satisfy \begin{align} \max_{k\in K} \|A(x_k)\| \le \rho, \qquad \max_{k\in K} \|x_k\| \le \rho'. \label{eq:to_be_checked_later} \end{align} By substituting \eqref{eq:restrited_pre} back into \eqref{eq:before_restriction}, we find that \begin{align} \|A(x_k)\| \le \frac{ \lambda'_f + \lambda'_A \|y_{k-1}\| + \epsilon_k}{\nu \b_{k-1} }. \label{eq:before_dual_controlled} \end{align} In words, the feasibility gap is directly controlled by the dual sequence $\{y_k\}_k$. We next establish that the dual sequence is bounded. Indeed, for every $k\in K$, note that \begin{align} \|y_k\| & = \| y_0 + \sum_{i=1}^{k} \s_i A(x_i) \| \quad \text{(Step 5 of Algorithm \ref{Algo:2})} \nonumber\\ & \le \|y_0\|+ \sum_{i=1}^k \s_i \|A(x_i)\| \qquad \text{(triangle inequality)} \nonumber\\ & \le \|y_0\|+ \sum_{i=1}^k \frac{ \|A(x_1)\| \log^2 2 }{ k \log^2(k+1)} \quad \text{(Step 4)} \nonumber\\ & \le \|y_0\|+ c \|A(x_1) \| \log^2 2 =: y_{\max}, \label{eq:dual growth} \end{align} where \begin{align} c \ge \sum_{i=1}^{\infty} \frac{1}{k \log^2 (k+1)}. \end{align} Substituting \eqref{eq:dual growth} back into \eqref{eq:before_dual_controlled}, we reach \begin{align} \|A(x_k)\| & \le \frac{ \lambda'_f + \lambda'_A y_{\max} + \epsilon_k}{\nu \b_{k-1} } \nonumber\\ & \le \frac{ 2\lambda'_f +2 \lambda'_A y_{\max} }{\nu \b_{k-1} } , \label{eq:cvg metric part 2} \end{align} where the second line above holds if $k_0$ is large enough, which would in turn guarantees that $\epsilon_k=1/\b_{k-1}$ is sufficiently small since $\{\b_k\}_k$ is increasing and unbounded. Let us now revisit and simplify \eqref{eq:to_be_checked_later}. Note that $\rho'$ automatically satisfies the second inequality there, owing to Step~3 of Algorithm~\ref{Algo:2}. Also, $\rho$ satisfies the first inequality in \eqref{eq:to_be_checked_later} if \begin{align} \frac{\lambda'_f+\lambda'_A y_{\max}}{ \nu_A \b_1} \le \rho/2, \end{align} and $k_0$ is large enough. Indeed, this claim follows directly from \eqref{eq:cvg metric part 2}. %\begin{align} %\|A(x_k)\| & \le \frac{ \lambda'_f + \lambda'_A y_{\max} + \epsilon_k}{\nu \b_{k-1} } %\qquad \text{(see \eqref{eq:cvg metric part 2})} %\nonumber\\ %& \le \frac{ 2( \lambda'_f + \lambda'_A y_{\max})}{\nu \b_1 } %\qquad \text{(see \eqref{eq:dual growth})}, %\end{align} %where the last line above holds when $k_0$ is large enough, because $\b_k \ge \b_1$ and $\lim_{k\rightarrow\infty} \epsilon_k=0$. It remains to control the first term in \eqref{eq:cvg metric}. To that end, after recalling Step 2 of Algorithm~\ref{Algo:2} and applying the triangle inequality, we can write that \begin{align} & \dist( -\nabla_x \L_{\b_{k-1}} (x_k,y_{k}), \partial g(x_{k}) ) \nonumber\\ & \le \dist( -\nabla_x \L_{\b_{k-1}} (x_k,y_{k-1}) , \partial g(x_{k-1}) ) \nonumber\\ & + \| \nabla_x \L_{\b_{k-1}} (x_k,y_{k})-\nabla_x \L_{\b_{k-1}} (x_k,y_{k-1}) \|. \label{eq:cvg metric part 1 brk down} \end{align} The first term on the right-hand side above is bounded by $\epsilon_k$, by Step 5 of Algorithm~\ref{Algo:2}. For the second term on the right-hand side of \eqref{eq:cvg metric part 1 brk down}, we write that \begin{align} & \| \nabla_x \L_{\b_{k-1}} (x_k,y_{k})-\nabla_x \L_{\b_{k-1}} (x_k,y_{k-1}) \| \nonumber\\ & = \| DA(x_k)^\top (y_k - y_{k-1}) \| \qquad \text{(see \eqref{eq:Lagrangian})} \nonumber\\ & \le \lambda'_A \|y_k- y_{k-1}\| \qquad \text{(see \eqref{eq:defn_restricted_lipsichtz})} \nonumber\\ & = \lambda'_A \s_k \|A (x_k) \| \qquad \text{(see Step 5 of Algorithm \ref{Algo:2})} \nonumber\\ & \le \frac{2\lambda'_A \s_k }{\nu \b_{k-1} }( \lambda'_f+ \lambda'_Ay_{\max}) . \qquad \text{(see \eqref{eq:cvg metric part 2})} \label{eq:part_1_2} \end{align} By combining (\ref{eq:cvg metric part 1 brk down},\ref{eq:part_1_2}), we find that \begin{align} & \dist( \nabla_x \L_{\b_{k-1}} (x_k,y_{k}), \partial g(x_{k}) ) \nonumber\\ & \le \frac{2\lambda'_A \s_k }{\nu \b_{k-1} }( \lambda'_f+ \lambda'_Ay_{\max}) + \epsilon_k. \label{eq:cvg metric part 1} \end{align} By combining (\ref{eq:cvg metric part 1},\ref{eq:cvg metric part 2}), we find that \begin{align} & \dist^2( -\nabla_x \L_{\b_{k-1}}(x_k,y_k),\partial g(x_k)) + \| A(x_k)\|^2 \nonumber\\ & \le \left( \frac{2\lambda'_A \s_k }{\nu \b_{k-1} }( \lambda'_f+ \lambda'_Ay_{\max}) + \epsilon_k \right)^2 \nonumber\\ & \qquad + 4\left( \frac{ \lambda'_f + \lambda'_A y_{\max}}{\nu \b_{k-1} } \right)^2. \end{align} Applying the inequalities $(a+b)^2 \le 2a^2+2b^2$ and $\s_k\le \s_1$, we find that \begin{align} & \dist^2( -\nabla_x \L_{\b_{k-1}}(x_k,y_k),\partial g(x_k)) + \| A(x_k)\|^2 \nonumber\\ & \le \frac{8 \lambda'_A\s_1^2 + 4}{ \nu^2\b_{k-1}^2} ( \lambda'_f+\lambda'_A y_{\max})^2 + 2\epsilon_k^2, \end{align} For the second part of the theorem, we use the Weyl's inequality and Step 5 of Algorithm~\ref{Algo:2} to write \begin{align}\label{eq:sec} \lambda_{\text{min}} &(\nabla_{xx} \mathcal{L}_{\beta_{k-1}}(x_k, y_{k-1})) \geq \lambda_{\text{min}} (\nabla_{xx} \mathcal{L}_{\beta_{k-1}}(x_k, y_{k})) \notag \\&- \| \sum_{i=1}^m \nabla^2 A_i(x_k) \sigma_k A_i(x_k) \|. \end{align} The first term on the right hand side is lower bounded by $-\epsilon_{k-1}$ by Step 2 of Algorithm~\ref{Algo:2}, we next have to bound the second term on the right hand side. \begin{align*} &\| \sum_{i=1}^m \nabla^2 A_i(x_k) \sigma_k A_i(x_k) \| \leq \\ &\sigma_k \sqrt{m} \max_{i} \| \nabla^2 A_i(x_k)\| \| A_i(x_k)\| \leq \\ &\sigma_k \sqrt{m} \lambda_A \frac{ 2\lambda'_f +2 \lambda'_A y_{\max} }{\nu \b_{k-1} }, \end{align*} where the last inequality is due to~\eqref{eq:cvg metric part 2} and~\eqref{eq:smoothness basic}. Plugging into~\eqref{eq:sec} gives \begin{align*} \lambda_{\text{min}} &(\nabla_{xx} \mathcal{L}_{\beta_{k-1}}(x_k, y_{k-1})) \geq -\epsilon_{k-1} \\&- \sigma_k \sqrt{m} \lambda_A \frac{ 2\lambda'_f +2 \lambda'_A y_{\max} }{\nu \b_{k-1} }. \end{align*} This completes the proof of Theorem \ref{thm:main}. diff --git a/ICML19/Drafts/sections/abstract.tex b/ICML19/Drafts/sections/abstract.tex index eae1689..b215b47 100644 --- a/ICML19/Drafts/sections/abstract.tex +++ b/ICML19/Drafts/sections/abstract.tex @@ -1,16 +1,17 @@ +%!TEX root = ../main.tex + + \begin{abstract} %We consider a canonical nonlinear-constrained nonconvex problem template with broad applications in machine learning, theoretical computer science, and signal processing. %This template involves the Burer-Monteiro splitting for semidefinite programming as a special case. -We propose a practical inexact augmented Lagrangian method (iALM) for non-convex problems with nonlinear constrains. +We propose a practical inexact augmented Lagrangian method (iALM) for nonconvex problems with nonlinear constrains. %involving nonlinear operators. We characterize the total computational complexity of our method subject to a verifiable geometric condition. - In particular, when a first-order solver is used for the inner iterates, we prove that iALM finds a first-order stationary point with $\tilde{\mathcal{O}}(1/\epsilon^3)$ calls to the first-order oracle. Likewise, when a second-order solver is used for the inner iterates, we prove that iALM finds a second-order stationary point with $\tilde{\mathcal{O}}(1/\epsilon^5)$ calls to the second-order oracle. -These complexity results marry the best known theoretical results in the literature with a simple, implementable and versatile algorithm. - +These complexity results marry the known theoretical results in the literature with a simple, implementable and versatile algorithm. -We provide numerical evidence on large-scale machine learning problems, including the Burer-Monteiro factorization of Semidefinite Programming (SDP) relaxations, for which we verify our geometric condition in specific cases. For these problems and under suitable assumptions, our algorithm in fact achieves global optimality for the convex SDP. +We provide numerical evidence on large-scale machine learning problems, including the Burer-Monteiro factorization of standard form Semidefinite Programming (SDP) relaxations, for which we verify our geometric condition in specific cases. For these problems and under suitable assumptions, our algorithm in fact achieves global optimality for the convex SDP. \textbf{AE: we should add something about gans if we do that in time.} \end{abstract} diff --git a/ICML19/Drafts/sections/appendix.tex b/ICML19/Drafts/sections/appendix.tex index 2d11ce4..53d42e9 100644 --- a/ICML19/Drafts/sections/appendix.tex +++ b/ICML19/Drafts/sections/appendix.tex @@ -1,293 +1,293 @@ \section{Proof of Corollary~\ref{cor:first}} %Denote $B_k = \dist( -\nabla_x \L_{\b_{k-1}}(x_k,y_k),\partial g(x_k)) + \| A(x_k)\|$ for every $k$. Using the convergence proof of the outer algorithm, we have the following bound Let $K$ denote the number of (outer) iterations of Algorithm~\ref{Algo:2} and let $\epsilon_{f}$ denote the desired accuracy of Algorithm~\ref{Algo:2}, see~(\ref{eq:inclu3}). Recalling Theorem~\ref{thm:main}, we can then write that \begin{equation} - \epsilon_{f} \le \frac{Q}{\b_{K-1}^2}, + \epsilon_{f} = \frac{Q}{\b_{K}}, \label{eq:acc_to_b} \end{equation} -or, equivalently, $\b_{K-1} \le \sqrt{Q/\epsilon_{f}}$. +or, equivalently, $\b_{K} = Q/\epsilon_{f}$. %where $K$ denotes the last outer iterate and $\epsilon$ is the final accuracy we would like to get for the optimality conditions given in~\eqref{eq:inclu3}. We now count the number of total (inner) iterations $T$ of Algorithm~\ref{Algo:2} to reach the accuracy $\epsilon_{f}$. From \eqref{eq:smoothness of Lagrangian} and for sufficiently large $k$, recall that $\lambda_{\b_k}\le \lambda'' \b_k$ is the smoothness parameter of the augmented Lagrangian. Then, from \eqref{eq:iter_1storder} ad by summing over the outer iterations, we bound the total number of (inner) iterations of Algorithm~\ref{Algo:2} as \begin{align}\label{eq: tk_bound} T &= \sum_{k=1}^K\mathcal{O}\left ( \frac{\lambda_{\beta_{k-1}}^2 x_{\max}^2 }{\epsilon_k} \right) \nonumber\\ & = \sum_{k=1}^K\mathcal{O}\left (\beta_{k-1}^3 x_{\max}^2 \right) \qquad \text{(Step 1 of Algorithm \ref{Algo:2})} \nonumber\\ & \leq \mathcal{O} \left(K\beta_{K-1}^3 x_{\max}^2 \right) \qquad \left( \{\b_k\}_k \text{ is increasing} \right) \nonumber\\ - & \le \mathcal{O}\left( \frac{K Q^{\frac{3}{2}} x_{\max}^2}{\epsilon_{f}^{\frac{3}{2}}} \right). + & \le \mathcal{O}\left( \frac{K Q^{{3}} x_{\max}^2}{\epsilon_{f}^{{3}}} \right). \qquad \text{(see \eqref{eq:acc_to_b})} \end{align} -In addition, if we specify $\beta_k=k^b$ for all $k$, we can further refine $T$. Indeed, +In addition, if we specify $\beta_k=b^k$ for all $k$, we can further refine $T$. Indeed, \begin{equation} -\beta_K = K^b~~ \Longrightarrow~~ K = \left( \frac{Q}{\epsilon_f} \right)^{\frac{1}{2b}}, +\beta_K = b^K~~ \Longrightarrow~~ K = \log_b \left( \frac{Q}{\epsilon_f} \right), \end{equation} -which, after substituting into~\eqref{eq: tk_bound} gives the final bound -\begin{equation} -T \leq \mathcal{O}\left( \frac{Q^{\frac{3}{2}+\frac{1}{2b}} x_{\max}^2}{\epsilon_f^{\frac{3}{2}+\frac{1}{2b}}} \right), -\end{equation} -which completes the proof of Corollary~\ref{cor:first}. +which, after substituting into~\eqref{eq: tk_bound} gives the final bound in Corollary~\ref{cor:first}. +%\begin{equation} +%T \leq \mathcal{O}\left( \frac{Q^{\frac{3}{2}+\frac{1}{2b}} x_{\max}^2}{\epsilon_f^{\frac{3}{2}+\frac{1}{2b}}} \right), +%\end{equation} +%which completes the proof of Corollary~\ref{cor:first}. %\section{Analysis of different rates for $\beta_k$ and $\epsilon_k$} % %We check the iteration complexity analysis by decoupling $\beta_k$ and $\epsilon_k$. %\begin{equation} %\beta_k = k^b, ~~~~ \epsilon_k = k^{-e}. %\end{equation} % %By denoting $B_k = \dist( -\nabla_x \L_{\b_{k-1}}(x_k,y_k),\partial g(x_k)) + \| A(x_k)\|$, the algorithm bound becomes, % %\begin{equation} %B_k \leq \frac{1}{\beta_k} + \epsilon_k = k^{-b} + k^{-e}. %\end{equation} % %Total iteration number is % %\begin{equation} %T_K = \sum_{k=1}^K \frac{\beta_k^2}{\epsilon_k} \leq K^{2b+e+1}. %\end{equation} % %We now consider two different relations between $b$ and $e$ to see what is going on. % %\textbf{Case 1:} $b\geq e$: % %Bound for the algorithm: % %\begin{equation} %B_k \leq \frac{1}{\beta_k} + \epsilon_k = k^{-b} + k^{-e} \leq \frac{2}{k^e} = \epsilon, %\end{equation} % %which gives the relation $K = \left( \frac{2}{\epsilon}\right)^{1/e}$. %Writing down the total number of iterations and plugging in $K$, % %\begin{equation} %T_K = \sum_{k=1}^K \frac{\beta_k^2}{\epsilon_k} \leq K^{2b+e+1} \leq \left(\frac{2}{\epsilon}\right)^{\frac{2b}{e}+1+\frac{1}{e}}. %\end{equation} % %To get the least number of total iterations for a given accuracy $\epsilon$, one needs to pick $e$ as large as possible and $b$ as small as possible. %Since in this case we had $b\geq e$, this suggests picking $b=e$ for the optimal iteration complexity. % %\textbf{Case 2:} $b\leq e$: % %Same calculations yield the following bound on the total number of iterations: % %\begin{equation} %T_K = \sum_{k=1}^K \frac{\beta_k^2}{\epsilon_k} \leq K^{2b+e+1} \leq \left(\frac{2}{\epsilon}\right)^{2+\frac{e}{b}+\frac{1}{b}}. %\end{equation} % %Given that $b\leq e$, the bound suggests picking $e$ as small as possible and $b$ as big as possible. % %To minimize the total number of iterations in both cases with flexible $b$ and $e$, the bounds suggest to pick $b=e=\alpha$ and take this value to be as large as possible. \section{Proof of Lemma \ref{lem:smoothness}\label{sec:proof of smoothness lemma}} Note that \begin{align} \mathcal{L}_{\beta}(x,y) = f(x) + \sum_{i=1}^m y_i A_i (x) + \frac{\b}{2} \sum_{i=1}^m (A_i(x))^2, \end{align} which implies that \begin{align} & \nabla_x \mathcal{L}_\beta(x,y) \nonumber\\ & = \nabla f(x) + \sum_{i=1}^m y_i \nabla A_i(x) + \frac{\b}{2} \sum_{i=1}^m A_i(x) \nabla A_i(x) \nonumber\\ & = \nabla f(x) + DA(x)^\top y + \b DA(x)^\top A(x), \end{align} where $DA(x)$ is the Jacobian of $A$ at $x$. By taking another derivative with respect to $x$, we reach \begin{align} \nabla^2_x \mathcal{L}_\beta(x,y) & = \nabla^2 f(x) + \sum_{i=1}^m \left( y_i + \b A_i(x) \right) \nabla^2 A_i(x) \nonumber\\ & \qquad +\b \sum_{i=1}^m \nabla A_i(x) \nabla A_i(x)^\top. \end{align} It follows that \begin{align} & \|\nabla_x^2 \mathcal{L}_\beta(x,y)\|\nonumber\\ & \le \| \nabla^2 f(x) \| + \max_i \| \nabla^2 A_i(x)\| \left (\|y\|_1+\b \|A(x)\|_1 \right) \nonumber\\ & \qquad +\beta\sum_{i=1}^m \|\nabla A_i(x)\|^2 \nonumber\\ & \le \lambda_h+ \sqrt{m} \lambda_A \left (\|y\|+\b \|A(x)\| \right) + \b \|DA(x)\|^2_F. \end{align} For every $x$ such that $\|A(x)\|\le \rho$ and $\|x\|\le \rho'$, we conclude that \begin{align} \|\nabla_x^2 \mathcal{L}_\beta(x,y)\| & \le \lambda_f + \sqrt{m}\lambda_A \left(\|y\| + \b\rho \right) \nonumber\\ & \qquad + \b \max_{\|x\|\le \rho'}\|DA(x)\|_F^2, \end{align} which completes the proof of Lemma \ref{lem:smoothness}. %\section{Proof of Lemma \ref{lem:11}\label{sec:proof of descent lemma}} % %Throughout, let %\begin{align} %G = G_{\b,\g}(x,y) = \frac{x-x^+}{\g}, %\end{align} %for short. %Suppose that $\|A(x)\|\le \rho$, $\|x\|\le \rho$, and similarly $\|A(x^+)\|\le \rho$, $\|x^+\|\le \rho'$. An application of Lemma \ref{lem:smoothness} yields that %\begin{align} %\L_\b(x^+,y)+g(x^+) & \le \L_\b(x,y)+ \langle x^+-x,\nabla_x \L_\b(x,y) \rangle %+ \frac{\lambda_\b}{2} \|x^+ - x\|^2 + g(x^+) \nonumber\\ %& = \L_\b(x,y)-\g \langle G ,\nabla_x \L_\b(x,y) \rangle %+ \frac{\g^2 \lambda_\b }{2} \|G\|^2 + g(x^+) %\label{eq:descent pr 1} %\end{align} %Since $x^+ = P_g(x - \g \nabla_x \L_\b(x,y))$, we also have that %\begin{align} %\g (G - \nabla_x \L_\b(x,y)) = \xi \in \partial g(x^+). %\label{eq:opt of prox} %\end{align} %By combining (\ref{eq:descent pr 1},\ref{eq:opt of prox}), we find that %\begin{align} %\L_\b(x^+,y)+g(x^+) & %\le \L_\b(x,y) -\g \|G\|^2 + \g \langle G, \xi \rangle + \frac{\g^2 \lambda_\b}{2}\|G\|^2 + g(x^+) \nonumber\\ %& = \L_\b(x,y) -\g \|G\|^2 + \langle x- x^+ , \xi \rangle + \frac{\g^2 \lambda_\b}{2}\|G\|^2 + g(x^+) \nonumber\\ %& \le \L_\b(x,y) + g(x) - \g\left( 1-\frac{\g\lambda_\b}{2}\right) \|G\|^2, %\end{align} %where the last line above uses the convexity of $g$. Recalling that $\g\le 1/\lambda_\b$ completes the proof of Lemma \ref{lem:11}. % % %\section{Proof of Lemma \ref{lem:eval Lipsc cte}\label{sec:proof of linesearch}} % % %Recalling $x^+_{\gamma}$ in \eqref{eq:defn of gamma line search}, we note that %\begin{equation} %x^+_{\gamma} - x +\gamma \nabla_x \mathcal{L}_\beta(x,y) = -\xi \in -\partial g (x^+_{\gamma}). %\label{eq:optimality of uplus} %\end{equation} %Lastly, $\gamma$ by definition in \eqref{eq:defn of gamma line search} satisfies %\begin{align} %& \mathcal{L}_{\beta}(x^+_{\gamma},y) + g(x_\g^+) \nonumber\\ % & \le \mathcal{L}_\beta(x,y) + \g \left\langle %x^+_{\gamma} - x , \nabla_x \mathcal{L}_\beta (x,y) %\right\rangle + \frac{1}{2\gamma}\|x^+_{\gamma} - x\|^2 %+ g(x_\g^+) %\nonumber\\ %& = \mathcal{L}_\beta(x,y) + \left\langle %x - x^+_{\gamma} ,\xi %\right\rangle %- \frac{1}{2\gamma}\|x^+_{\gamma} - x\|^2 + g(x_\g^+)\nonumber\\ %& \le \mathcal{L}_\beta(x,y) %- \frac{1}{2\gamma}\|x^+_{\gamma} - x\|^2 + g(x) - g(x^+_\g) %\qquad (\text{convexity of } g) \nonumber\\ %& = \mathcal{L}_\beta(x,y) - \frac{\gamma}{2} \|G_{\beta,\gamma}(x,y)\|^2 +g(x) - g(x^+_\g), %\qquad \text{(see Definition \ref{def:grad map})} %\end{align} %which completes the proof of Lemma \ref{lem:eval Lipsc cte}. % % % \section{Basis Pursuit \label{sec:verification1}} We only verify the regularity condition in \eqref{eq:regularity} for Program~\eqref{prob:01} with $f,A$ specified in \eqref{eq:bp-equiv}. Note that \begin{align*} DA(x) = 2 \overline{B} \text{diag}(x), \end{align*} where $\text{diag}(x)\in\RR^{2d\times 2d}$ is the diagonal matrix formed by $x$. The left-hand side of \eqref{eq:regularity} then reads as \begin{align} & \text{dist} \left( -DA(x_k)^\top A(x_k) , \frac{\partial g(x_k)}{\b_{k-1}} \right) \nonumber\\ & = \text{dist} \left( -DA(x_k)^\top A(x_k) , \{0\} \right) \nonumber\\ & = \|DA(x_k)^\top A(x_k) \| \nonumber\\ & =2 \| \text{diag}(x_k) \overline{B}^\top ( \overline{B}x_k^{\circ 2} -b) \|. \label{eq:cnd-bp-pre} \end{align} To bound the last line above, let $x_*$ be a solution of Program~\eqref{prob:01} and note that $\overline{B} x_*^{\circ 2} = b $ by definition. Let also $z_k,z_*\in \RR^d$ denote the vectors corresponding to $x_k,x_*$. Corresponding to $x_k$, also define $u_{k,1},u_{k,2}$ naturally and let $|z_k| = u_{k,1}^{\circ 2} + u_{k,2}^{\circ 2} \in \RR^d$ be the amplitudes of $z_k$. To simplify matters, let us assume also that $B$ is full-rank. We then rewrite the last line of \eqref{eq:cnd-bp-pre} as \begin{align} & \| \text{diag}(x_k) \overline{B}^\top ( \overline{B}x_k^{\circ 2} -b) \|^2 \nonumber\\ & = \| \text{diag}(x_k) \overline{B}^\top \overline{B} (x_k^{\circ 2} -x_*^{\circ 2}) \|^2 \nonumber\\ & = \| \text{diag}(x_k)\overline{B}^\top B (x_k - x_*) \|^2 \nonumber\\ & = \| \text{diag}(u_{k,1})B^\top B (x_k - x_*) \|^2 \nonumber\\ & \qquad + \| \text{diag}(u_{k,2})B^\top B (x_k - x_*) \|^2 \nonumber\\ & = \| \text{diag}(u_{k,1}^{\circ 2}+ u_{k,2}^{\circ 2}) B^\top B (x_k - x_*) \|^2 \nonumber\\ & = \| \text{diag}(|z_k|) B^\top B (x_k - x_*) \|^2 \nonumber\\ & \ge \eta_n ( B \text{diag}(|z_k|) )^2 \| B(x_k - x_*) \|^2 \nonumber\\ & = \eta_n ( B \text{diag}(|z_k|) )^2 \| B x_k -b \|^2, \end{align} where $\eta_n(\cdot)$ returns the $n$th largest singular value of its argument. We can therefore ensure that \eqref{eq:regularity} holds by enforcing that \begin{align} z_k \in \left\{ z \in \RR^d: \eta_n( B \text{diag}(|z|)) > \nu \right\}, \end{align} for every iteration $k$. \section{Clustering \label{sec:verification2}} We only verify the condition in~\eqref{eq:regularity}. Note that \begin{align} A(x) = VV^\top \mathbf{1}- \mathbf{1}, \end{align} \begin{align} DA(x) & = \left[ \begin{array}{cccc} w_{1,1} x_1^\top & \cdots & w_{1,n} x_{1}^\top\\ %x_2^\top p& \cdots & x_{2}^\top\\ \vdots\\ w_{n,1}x_{n}^\top & \cdots & w_{n,n}1 x_{n}^\top \end{array} \right] \nonumber\\ & = \left[ \begin{array}{ccc} V & \cdots & V \end{array} \right] + \left[ \begin{array}{ccc} x_1^\top & \\ & \ddots & \\ & & x_n^\top \end{array} \right], \label{eq:Jacobian clustering} \end{align} %where $I_n\in\RR^{n\times n}$ is the identity matrix, where $w_{i.i}=2$ and $w_{i,j}=1$ for $i\ne j$. In the last line above, $n$ copies of $V$ appear and the last matrix above is block-diagonal. For $x_k$, define $V_k$ as in the example and let $x_{k,i}$ be the $i$th row of $V_k$. Consequently, \begin{align} DA(x_k)^\top A(x_k) & = \left[ \begin{array}{c} V_k^\top (V_k V_k^\top- I_n ) \mathbf{1}\\ \vdots\\ V_k^\top (V_k V_k^\top- I_n ) \mathbf{1} \end{array} \right] \nonumber\\ & \qquad + \left[ \begin{array}{c} x_{k,1} (V_k V_k^\top \mathbf{1}- \mathbf{1})_1 \\ \vdots \\ x_{k,n} (V_k V_k^\top \mathbf{1}- \mathbf{1})_n \end{array} \right], \end{align} where $I_n\in \RR^{n\times n}$ is the identity matrix. Let us make a number of simplifying assumptions. First, we assume that $x_k\in \text{relint}(C)$ so that $\partial g(x_k)=\{0\}$, which can be easily enforced in the iterates. Second, we assume that $V_k$ has nearly orthonormal columns, namely, $V_k V_k^\top \approx I_n$. This can also be easily enforced in each iterate of Algorithm~\ref{Algo:2} and naturally corresponds to well-separated clusters. While a more fine-tuned argument can remove these assumptions, they will help us simplify the derivations. Under these assumptions, the squared right-hand side of \eqref{eq:regularity} becomes \begin{align} & \dist\left( -DA(x_k)^\top A(x_k) , \frac{\partial g(x_k)}{ \b_{k-1}} \right)^2 \nonumber\\ & = \dist\left( -DA(x_k)^\top A(x_k) , \{0\} \right)^2 \nonumber\\ & = \| DA(x_k)^\top A(x_k) \|^2 \nonumber\\ & = \left\| \begin{array}{c} x_{k,1} (V_k V_k^\top \mathbf{1}- \mathbf{1})_1 \\ \vdots \\ x_{k,n} (V_k V_k^\top \mathbf{1}- \mathbf{1})_n \end{array} \right\|^2 \nonumber\\ & = \sum_{i=1}^n \| x_{k,i}\|^2 (V_kV_k^\top \mathbf{1}-\mathbf{1})_i^2 \nonumber\\ & \ge \min_i \| x_{k,i}\|^2 \cdot \sum_{i=1}^n (V_kV_k^\top \mathbf{1}-\mathbf{1})_i^2 \nonumber\\ & = \min_i \| x_{k,i}\|^2 \cdot \| V_kV_k^\top \mathbf{1}-\mathbf{1} \|^2. \end{align} We can enforce the iterates to satisfy $\|x_{k,i}\| \ge \nu$, which corresponds again to well-separated clusters, and guarantee \eqref{eq:regularity}. In practice, often $n$ exceeds the number of true clusters and a more fine-tuned analysis is required to establish \eqref{eq:regularity} by restricting the argument to a particular subspace of $\RR^n$. diff --git a/ICML19/Drafts/sections/experiments.tex b/ICML19/Drafts/sections/experiments.tex index 53ee0b5..b07ff9c 100644 --- a/ICML19/Drafts/sections/experiments.tex +++ b/ICML19/Drafts/sections/experiments.tex @@ -1,205 +1,205 @@ %!TEX root = ../main.tex \begin{figure*}[ht] % \includegraphics[width=0.4\textwidth,natwidth=1300,natheight=1300]{bp_fig1.pdf} \begin{center} {\includegraphics[width=.7\columnwidth]{figs/clustering_fig4_iters.pdf}} {\includegraphics[width=.7\columnwidth]{figs/clustering_fig4_time.pdf}} %\centerline{\includegraphics[width=1\columnwidth]{figs/bp_fig2_small.pdf}} \label{fig:clustering} \caption{Convergence of different algorithms for k-Means clustering with fashion MNIST dataset} %(left:$[n=100$, $d=10^4]$, right:$[n=34$, $d=10^2$])} \end{center} \end{figure*} \section{Experiments \label{sec:experiments}} -Let us begin with a technical remark. It is known that quasi-Newton methods, such as BFGS and lBFGS, might not converge for non-convex problems~\cite{dai2002convergence, mascarenhas2004bfgs}. For this reason, we have used the trust region method as the second-order solver in our theoretical analysis in Section~\ref{sec:cvg rate}, which is well-studied for nonconvex problems. \textbf{AE: plz add citation.} Empirically, however, BFGS and lBGFS are extremely successful and we have opted for those solvers in this section. Note that the choice of subroutine does not affect Theorem~\ref{thm:main}. +Let us begin with a technical remark. It is known that quasi-Newton methods, such as BFGS and lBFGS, might not converge for nonconvex problems~\cite{dai2002convergence, mascarenhas2004bfgs}. For this reason, we have used the trust region method as the second-order solver in our theoretical analysis in Section~\ref{sec:cvg rate}, which is well-studied for nonconvex problems~\cite{cartis2012complexity}. Empirically, however, BFGS and lBGFS are extremely successful and we have also opted for those solvers in this section. Note that the choice of subroutine does not affect Theorem~\ref{thm:main}. \subsection{Basis Pursuit} Basis Pursuit (BP) finds sparsest solutions of an under-determined system of linear equations, namely, \begin{align} \begin{cases} \min_{z} \|z\|_1 \\ Bz = b, \end{cases} \label{eq:bp_main} \end{align} -where $B \in \RR^{n \times d}$ and $b \in \RR^{n}$. BP has found many applications in machine learning, statistics and signal processing \cite{chen2001atomic, candes2008introduction, arora2018compressed}. \cite{tran2018adaptive} and \cite{chambolle2011first} propose primal-dual nonsmooth convex methods to solve BP. \textbf{AE: Fatih, maybe mention a few other algorithms including asgard and also say why we can't use fista and nista.} +where $B \in \RR^{n \times d}$ and $b \in \RR^{n}$. BP has found many applications in machine learning, statistics and signal processing \cite{chen2001atomic, candes2008introduction, arora2018compressed}. A huge number of primal-dual convex optimization algorithms are proposed to solve BP, including, but not limited to~\cite{tran2018adaptive,chambolle2011first}. \textbf{AE: Fatih, maybe mention a few other algorithms including asgard and also say why we can't use fista and nista.} Here, we take a different approach and cast~(\ref{eq:bp_main}) as an instance of~\eqref{prob:01}. To that end, note that any $z \in \RR^d$ can be decomposed as $z = z^+ - z^-$, where $z^+,z^-\in \RR^d$ are the positive and negative parts of $z$, respectively. Then consider the change of variables $z^+ = u_1^{\circ 2}$ and $z^-= u_2^{\circ 2} \in \RR^d$, where $\circ$ denotes element-wise power. Next, we concatenate $u_1$ and $u_2$ as $x := [ u_1^{\top}, u_2^{\top} ]^{\top} \in \RR^{2d}$ and define $\overline{B} := [B, -B] \in \RR^{n \times 2d}$. Then,~\eqref{eq:bp_main} is equivalent to~\eqref{prob:01} with \begin{align} f(x) =& \|x\|_2^2, \quad g(x) = 0\nonumber\\ A(x) =& \overline{B}x^{\circ 2}- b. \label{eq:bp-equiv} \end{align} In Appendix~\ref{sec:verification1}, we verify with minimal detail that Theorem~\ref{thm:main} indeed applies to~\eqref{prob:01} with the above $f,A$. %Let $\mu(B)$ denote the \emph{coherence} of ${B}$, namely, %\begin{align} %\mu = \max_{i,j} | \langle B_i, B_j \rangle |, %\end{align} %where $B_i\in \RR^n$ is the $i$th column of $B$. Let also $z_k,z_*\in \RR^d$ denote the vectors corresponding to $x_k,x_*$. We rewrite the last line of \eqref{eq:cnd-bp-pre} as %\begin{align} %& 2 \| \text{diag}(x_k) \overline{B}^\top ( \overline{B}x^{\circ 2} -b) \| \nonumber\\ %& = 2 \| \text{diag}(x_k) \overline{B}^\top \overline{B} (x^{\circ 2} -x_*^{\circ 2}) \| \nonumber\\ %& = 2 \| \text{diag}(x_k) {B}^\top B(x_k- x_*) \| \nonumber\\ %& = 2 \| \text{diag}(x_k) {B}^\top B(x_k- x_*) \|_{\infty} \nonumber\\ %& \ge 2 \| \text{diag}(x_k) \text{diag}({B}^\top B) (x_k- x_*) \|_{\infty}\nonumber\\ %& \qquad - 2 \| \text{diag}(x_k) \text{hollow}({B}^\top B) (x_k- x_*) \|_{\infty}, %\end{align} %where $\text{hollow}$ returns the hollow part of th % % and let $\overline{B}=U\Sigma V^\top$ be its thin singular value decomposition, where $U\in \RR^{n\times n}, V\in \RR^{d\times n}$ have orthonormal columns and the diagonal matrix $\Sigma\in \RR^{n\times n}$ contains the singular values $\{\sigma_i\}_{i=1}^r$. Let also $x_{k,i},U_i$ be the $i$th entry of $x_k$ and the $i$th row of $U_i$, respectively. Then we bound the last line above as %\begin{align} %& 2 \| \text{diag}(x_k) \overline{B}^\top ( \overline{B}x^{\circ 2} -b) \| \nonumber\\ %& = 2 \| \text{diag}(x_k) U \Sigma^2 U^\top ( x^{\circ 2} - x_*^{\circ 2}) \| \nonumber\\ %& \ge 2 \| \text{diag}(x_k) U \Sigma^2 U^\top ( x^{\circ 2} - x_*^{\circ 2}) \|_{\infty} \nonumber\\ %& = \max_i |x_{k,i}| \cdot | U_i ^\top \Sigma \cdot \Sigma U^\top ( x^{\circ 2} - x_*^{\circ 2}) | \nonumber\\ %& %\end{align} % %where $\tilde{b}_{i,j} = (\tilde{B})_{ij},~ i\in [1:n]$ and $j \in [1:2d]$. %\begin{align*} %-DA(x)^\top(A(x) - b) = -2x \odot (\tilde{B}^\top (A(x) - b)), %\end{align*} %where $\odot$ denotes hadamard product. %\begin{align*} %& \text{dist} \left( -DA(x)^\top (A(x)-b), \frac{\partial g(x)}{\b} \right) \nonumber\\ %& = \text{dist} \left( -DA(x)^\top (A(x)-b), \{0\} \right) \nonumber\\ %& = \left\| -DA(x)^\top (A(x)-b) \right\| \nonumber\\ %& \ge ?????, %\end{align*} %Hence, this completes the proof for regularity condition. We draw the entries of $B$ independently from a zero-mean and unit-variance Gaussian distribution. For a fixed sparsity level $k$, the support of $z_*\in \RR^d$ and its nonzero amplitudes are also drawn from the standard Gaussian distribution. %We then pick a sparsity level $k$ and choose $k$ indexes, i.e, $\Omega \subset [1:d]$, which are corresponding to nonzero entries of $z$. %We then assign values from normal distribution to those entries. Then the measurement vector is created as $b = Bz + \epsilon$, where $\epsilon$ is the noise vector with entries drawn independently from the zero-mean Gaussian distribution with variance $\sigma^2=10^{-6}$. We compare our algorithm against state-of-the-art primal-dual convex methods, namely, Chambole-Pock~\cite{chambolle2011first}, ASGARD~\cite{tran2018smooth} and ASGARD-DL~\cite{tran2018adaptive}. The results are compiled in Figure~\ref{fig:bp_1}. Evidently, the performance of Algorithm~\ref{Algo:2} with a second-order solver for BP is comparable to the rest. %This relaxation transformed a non-smooth objective into a smooth one while loosing the linearity on the equality constraint. The true potential of our reformulation is in dealing with more structured norms rather than $\ell_1$, where computing the proximal operator is often intractable. One such case is the latent group lasso norm~\cite{obozinski2011group}, defined as \begin{align*} \|z\|_{\Omega} = \sum_{i=1}^I \| z_{\Omega_i} \|, \end{align*} where $\{\Omega_i\}_{i=1}^I$ are (not necessarily disjoint) index sets of $\{1,\cdots,d\}$. Although not studied here, we believe that the non-convex framework presented in this paper can serve to solve more complicated problems, such as the latent group lasso. We leave this research direction for future work. %\subsection{Latent Group Lasso \label{sec:latent lasso}} % %For a collection of subsets $\Omega=\{\Omega_i\}_{i=1}^{I}\subset [1:p]$, the latent group Lasso norm on $\RR^p$ takes $z\in \RR^p$ to %\begin{align} %\|z\|_{\Omega,1} = \sum_{i=1}^I \| z_{\Omega_i} \|. %\end{align} %Note that we do not require $\Omega_i$ to be disjoint. For $B\in \RR^{n\times p}$, $b\in \RR^n$, and $\lambda>0$, consider the latent group lasso as %\begin{align} %\min_{z\in \RR^d} \frac{1}{2}\| Bz - b\|_2^2+ \lambda \| z\|_{\Omega,1}. %\label{eq:group_lasso} %\end{align} %Because $\Omega$ is not a partition of $[1:p]$, computing the proximal operator of $\|\cdot\|_{\Omega}$ is often intractable, ruling out proximal gradient descent and similar algorithms for solving Program~\eqref{eq:group_lasso}. Instead, often Program~\eqref{eq:group_lasso} is solved by Alternating Direction Method of Multipliers (ADMM). More concretely, ??? % %We take a radically different approach here and cast Program~\eqref{eq:group_lasso} as an instance of Program~\eqref{prob:01}. More specifically, let $z^+,z^-\in \RR^p$ be positive and negative parts of $z$, so that $z=z^+-z^-$. Let us introduce the nonnegative $u\in \RR^p$ such that $z^++z^- = u^{\circ 2}$, where we used $\circ$ notation to show entrywise power. We may now write that %\begin{align} %\| z^++z^- \|_{\Omega,1} %& = \| u^{\circ 2} \|_{\Omega,1 } =: \|u\|_{\Omega,2}^2, %\end{align} %Unlike $\|\cdot\|_{\Omega,1}$, note that $\|\cdot\|_{\Omega,2}^2$ is differentiable and, in fact, there exists a positive semidefinite matrix $Q\in \RR^{d\times d}$ such that $\|u\|_{\Omega,2}^2=u^\top Q_\Omega u$. Let us form $x=[(z^+)^\top,(z^-)^\top, u^\top]^\top\in \RR^{3d}$ and set %\begin{align*} %f(x) = \frac{1}{2}\| Bz^+-Bz^- -b\|_1+ \|u\|_{\Omega,2}^2, %\end{align*} %\begin{align} %g(x) = 0, \qquad A(x) = z^++z^--u^{\circ 2}. %\end{align} %We can now apply Algorithm~\ref{Algo:2} to solve Program~\eqref{prob:01} with $f,g,A$ specified above. % % %\paragraph{Convergence rate.} %Clearly, $f,A$ are strongly smooth in the sense that \eqref{eq:smoothness basic} holds with proper $\lambda_f,\lambda_A$. Likewise, both $f$ and the Jacobian $DA$ are continuous and the restricted Lipschitz constants $\lambda'_f,\lambda'_A$ in \eqref{eq:defn_lambda'} are consequently well-defined and finite for a fixed $\rho'>0$. We next verify the key regularity condition in Theorem~\ref{thm:main}, namely, \eqref{eq:regularity}. Note that %\begin{align*} %DA(x) & = %\left[ %\begin{array}{ccc} %I_p & I_p & -2\text{diag}(u) %\end{array} %\right]\in \RR^{d\times 3d}, %\end{align*} %\begin{align*} %-DA(x)^\top A(x) = %\left[ %\begin{array}{c} %-z^+-z^-+u^{\circ 2} \\ %-z^+-z^-+u^{\circ 2}\\ %2\text{diag}(u)( z^++z^--u^{\circ 2}) %\end{array} %\right], %\end{align*} %\begin{align*} %& \text{dist} \left( -DA(x)^\top A(x), \frac{\partial g(x)}{\b} \right) \nonumber\\ %& = \text{dist} \left( -DA(x)^\top A(x), \{0\} \right) \nonumber\\ %& = \left\| -DA(x)^\top A(x) \right\| \nonumber\\ %& \ge \sqrt{2} \| A(x)\|, %\end{align*} %namely, \eqref{eq:regularity} holds with $\nu=1$. \subsection{k-Means Clustering} Given data points $\{z_i\}_{i=1}^n $, the entries of the corresponding Euclidean distance matrix $D \in \RR^{nxn}$ are $ D_{i, j} = \left\| z_i - z_j\right\|^2 $. Clustering is then the problem of finding a co-association matrix $Y\in \RR^{n\times n}$ such that $Y_{ij} = 1$ if points $z_i$ and $z_j$ are within the same cluster and $Y_{ij} = 0$ otherwise. In~\cite{Peng2007}, the authors provide a SDP relaxation of the clustering problem, specified as \begin{align} \begin{cases} \underset{Y \in \RR^{nxn}}{\min} \text{tr}(DY) \\ Y\mathbf{1} = \mathbf{1}, ~\text{tr}(Y) = k,~ Y\succeq 0,~Y \geq 0, \end{cases} \label{eq:sdp_svx} \end{align} where $k$ is the number of clusters and $Y $ is both positive semidefinite and has nonnegative entries. Standard SDP solvers do not scale well with the number of data points~$n$, since they often require projection onto the semidefinite cone with the complexity of $\mathcal{O}(n^3)$. We instead use the Burer-Monteiro splitting, sacrificing convexity to reduce the computational complexity. More specifically, we solve the program \begin{align} \label{eq:nc_cluster} \begin{cases} \underset{V \in \RR^{nxr}}{\min} \text{tr}(DVV^{\top}) \\ VV^{\top}\mathbf{1} = \mathbf{1},~~ \|V\|_F^2 \le k, ~~V \geq 0, \end{cases} \end{align} where $\mathbf{1}\in \RR^n$ is the vector of all ones. Note that $Y \geq 0$ in \eqref{eq:sdp_svx} is replaced above by the much stronger but easier to enforce constraint $V \geq 0$ constraint above, see~\citep{kulis2007fast} for the reasoning behind this relaxation. %. Trace constraint translates to a Frobenius norm constraint in factorized space. Semidefinite constraint is naturally removed due to factorization $Y=VV^{\top}$. %See~\citep{kulis2007fast} for the details of the relaxation. Now, we can cast~\eqref{eq:nc_cluster} as an instance of~\eqref{prob:01}. Indeed, for every $i\le n$, let $x_i \in \RR^r$ denote the $i$th row of $V$. We next form $x \in \RR^d$ with $d = nr$ by expanding the factorized variable $V$, namely, \begin{align*} x = [x_1^{\top}, \cdots, x_n^{\top}]^{\top} \in \RR^d, \end{align*} and then set \begin{align*} f(x) =\sum_{i,j=1}^n D_{i, j} \left\langle x_i, x_j \right\rangle, \qquad g = \delta_C, \end{align*} \begin{align} A(x) = [x_1^{\top}\sum_{j=1}^n x_j -1, \cdots, x_n^{\top}\sum_{j=1}^n x_j-1]^{\top}, \end{align} where $C$ is the intersection of the positive orthant in $\RR^d$ with the Euclidean ball of radius $\sqrt{k}$. In Appendix~\ref{sec:verification2}, we somewhat informally verify that Theorem~\ref{thm:main} applies to~\eqref{prob:01} with $f,g,A$ specified above. In our simulations, we use two different solvers for Step~2 of Algorithm~\ref{Algo:2}, namely, APGM and lBFGS. APGM is a solver for non-convex problems of the form~\eqref{e:exac} with convergence guarantees to first-order stationarity, as discussed in Section~\ref{sec:cvg rate}. lBFGS is a limited-memory version of BFGS algorithm in~\cite{fletcher2013practical} that approximately leverages the second-order information of the problem. We compare our approach against the following convex methods: \begin{itemize} \item HCGM: Homotopy-based Conditional Gradient Method in\cite{yurtsever2018conditional} which directly solves~\eqref{eq:sdp_svx}. \item SDPNAL+: A second-order augmented Lagrangian method for solving SDP's with nonnegativity constraints~\cite{yang2015sdpnal}. \end{itemize} As for the dataset, our expreminetal setup is similar to that described by~\cite{mixon2016clustering}. We use the publicly-available fashion-MNIST data in \cite{xiao2017/online}, which is released as a possible replacement for the MNIST handwritten digits. Each data point is a $28\times 28$ grayscale image, associated with a label from ten classes, labeled from $0$ to $9$. First, we extract the meaningful features from this dataset using a simple two-layer neural network with a sigmoid activation function. Then, we apply this neural network to 1000 test samples from the same dataset, which gives us a vector of length $10$ for each data point, where each entry represents the posterior probability for each class. Then, we form the $\ell_2$ distance matrix ${D}$ from these probability vectors. The results are depicted in Figure~\ref{fig:clustering}. diff --git a/ICML19/Drafts/sections/guarantees.tex b/ICML19/Drafts/sections/guarantees.tex index b5244a4..09e3afc 100644 --- a/ICML19/Drafts/sections/guarantees.tex +++ b/ICML19/Drafts/sections/guarantees.tex @@ -1,174 +1,177 @@ +%!TEX root = ../main.tex + \section{Convergence Rate \label{sec:cvg rate}} In this section, we detail the convergence rate of Algorithm~\ref{Algo:2} for finding first-order and second-order stationary points, along with the iteration complexity results. All the proofs are deferred to Appendix~\ref{sec:theory} for the clarity of exposition. Theorem~\ref{thm:main} below characterizes the convergence rate of Algorithm~\ref{Algo:2} for finding stationary points in terms of the number of outer iterations.\\ -{\color{red}{Ahmet: I think that it is possible to remove sufficiently large k0 assumption here. }} \textbf{AE: You're likely right but it might not be worth your time right now.} +{\color{red}{Ahmet: Maybe instead of sufficiently large k0 we can say k0 such that beta is bigger than 1, it makes the proof go thru, it would be a bit more explanatory}} \begin{theorem}\textbf{\emph{(convergence rate)}} \label{thm:main} Suppose that $f$ and $A$ are smooth in the sense specified in (\ref{eq:smoothness basic}). For $\rho'>0$, let \begin{align} \lambda'_f = \max_{\|x\|\le \rho'} \|\nabla f(x)\|, ~~ \lambda'_A = \max_{\|x\| \le \rho'} \|DA(x)\|, \label{eq:defn_restricted_lipsichtz} \end{align} be the (restricted) Lipschitz constants of $f$ and $A$, respectively. For sufficiently large integers $k_0\le k_1$, consider the interval $K=[k_0:k_1]$, and let $\{x_k\}_{k\in K}$ be the output sequence of Algorithm~\ref{Algo:2} on the interval $K$. For $\nu>0$, assume that \begin{align} \nu \|A(x_k)\| & \le \dist\left( -DA(x_k)^\top A(x_k) , \frac{\partial g(x_k)}{ \b_{k-1}} \right), \label{eq:regularity} \end{align} for every $k\in K$. We consider two cases: +{\color{red} Ahmet: I removed the squares and showed the rate on dist + feasibility, since it is the measure for the stationarity, using squares confuses complexity analysis...} \begin{itemize}[leftmargin=*] \item If a first-order solver is used in Step~2, then $x_k$ is an $(\epsilon_{k,f},\b_k)$ first-order stationary point of (\ref{prob:01}) with %\begin{align} %& \dist^2( -\nabla_x \L_{\b_{k-1}}(x_k,y_k),\partial g(x_k)) + \| A(x_k)\|^2 \nonumber\\ %& = \frac{1}{\b_{k-1}^2}\left( \frac{8 \lambda'_A\s_1^2 + 4}{ \nu^2} ( \lambda'_f+\lambda'_A y_{\max})^2 + 2\right) \nonumber \\ %&=: \frac{Q(f,g,A)}{\beta_{k-1}^2}, %\end{align} \begin{align} -\epsilon_{k,f} & = \frac{1}{\beta_{k-1}^2} \left( \frac{8 \lambda'_A\s_1^2 + 4}{ \nu^2} ( \lambda'_f+\lambda'_A y_{\max})^2 + 2 \right) \nonumber \\ -&=: \frac{Q(f,g,A,\s_1)}{\beta_{k-1}^2}, +%\epsilon_{k,f} & = \frac{1}{\beta_{k-1}} \sqrt{ \frac{8 \lambda'_A\s_1^2 + 4}{ \nu^2} ( \lambda'_f+\lambda'_A y_{\max})^2 + 2 } \nonumber \\ +%&=: \frac{Q(f,g,A,\s_1)}{\beta_{k-1}}, +\epsilon_{k,f} & = \frac{1}{\beta_{k-1}} \left(\frac{(\lambda'_f+\lambda'_A y_{\max}) (1+\lambda_A' \sigma_k)}{\nu}+1\right) \nonumber \\ +&=: \frac{Q(f,g,A,\s_1)}{\beta_{k-1}}, \label{eq:stat_prec_first} \end{align} for every $k\in K$, where the expression for $y_{\max}=y_{\max}(x_1,y_0,\s_1)$ is given in (\ref{eq:dual growth}). \item If a second-order solver is used in Step~2, then $x_k$ is an $(\epsilon_{k,f}, \epsilon_{k,s},\b_k)$ second-order stationary point of~(\ref{prob:01}) with $\epsilon_{k,s}$ specified above and with \begin{align} \epsilon_{k,s} &= \epsilon_{k-1} + \sigma_k \sqrt{m} \lambda_A \frac{ 2\lambda'_f +2 \lambda'_A y_{\max} }{\nu \b_{k-1}} \\ &= \frac{\nu + \sigma_k \sqrt{m} \lambda_A 2\lambda'_f +2 \lambda'_A y_{\max} }{\nu \b_{k-1}} =: \frac{C}{\beta_{k-1}}. \end{align} -\textbf{We should try to get the constant C since we get the constants elsewhere for consistency.} \end{itemize} \end{theorem} %A few remarks are in order about Theorem~\ref{thm:main}. %\textbf{Tradeoff.} %Roughly speaking, Theorem~\ref{thm:main} states that the iterates of Algorithm~\ref{Algo:2} approach first-order stationarity for~\eqref{prob:01} at the rate of $1/{\b_k}$, where $\b_k$ is the penalty weight in AL in iteration $k$ as in~\eqref{eq:Lagrangian}. Note the inherent tradeoff here: For a faster sequence $\{\b_k\}_k$, the iterates approach stationarity faster but the computational cost of the inexact primal subroutine (Step 2) increases since a higher accuracy is required and Lipschitz constant of the unconstrained problem is also affected by $\beta_k$. In words, there is a tradeoff between the number of outer and inner loop iterations of Algorithm~\ref{Algo:2}. We highlight this tradeoff for a number of subroutines below. -Loosely speaking, Theorem~\ref{thm:main} states that Algorithm~\ref{Algo:2} converges to a (first- or second-) order stationary point of \eqref{prob:01} at the rate of $1/\b_k^2$. +Loosely speaking, Theorem~\ref{thm:main} states that Algorithm~\ref{Algo:2} converges to a (first- or second-) order stationary point of \eqref{prob:01} at the rate of $1/\b_k$. A few remarks are in order. \textbf{Regularity.} The key condition in Theorem~\ref{thm:main} is \eqref{eq:regularity} which, broadly speaking, controls the geometry of the problem. As the penalty weight $\b_k$ grows, the primal solver in Step~2 places an increasing emphasis on reducing the feasibility gap and \eqref{eq:regularity} formalizes this intuition. -In contrast to most conditions in the non-convex optimization literature, such as~\cite{flores2012complete}, our condition in~\eqref{eq:regularity} appears to be easier to verify, as we see in a few applications in Section~\ref{sec:experiments}. +In contrast to most conditions in the nonconvex optimization literature, such as~\cite{flores2012complete}, our condition in~\eqref{eq:regularity} appears to be easier to verify, as we see in a few applications in Section~\ref{sec:experiments}. -With an example, let us argue that such a condition is necessary for controlling the feasibility gap of~\eqref{prob:01} and ensuring the success of Algorithm~\ref{Algo:2}. Consider the case where $f=0$ and $g=\delta_\mathcal{X}$, where $\mathcal{X}$ is a convex set and $A$ is a linear operator. In this case, solving \eqref{prob:01} finds a point in $\mathcal{X}\cap \text{null}(A)$, where the subspace $\text{null}(A)=\{ x\in\mathbb{R}^d: A(x) = 0 \} \subset \RR^d$ is the null space of $A$. +With an example, let us argue that such a condition is necessary for controlling the feasibility gap of~\eqref{prob:01} and ensuring the success of Algorithm~\ref{Algo:2}. Consider the case where $f=0$ and $g=\delta_\mathcal{X}$, where $\mathcal{X}$ is a convex set, $A$ is a linear operator. In this case, solving \eqref{prob:01} finds a point in $\mathcal{X}\cap \text{null}(A)$, where the subspace $\text{null}(A)=\{ x\in\mathbb{R}^d: A(x) = 0 \} \subset \RR^d$ is the null space of $A$. Here, the Slater's condition requires that \begin{equation} \text{relint} (\mathcal{X}) \cap \text{null}(A)\neq \emptyset. \end{equation} -In general, the Slater's condition plays a key role in convex optimization as a sufficient condition for strong duality and, as a result, guarantees the success of a variety of primal-dual algorithms for linearly-constrained convex programs. Intuitively, the Slater's condition here removes any pathological cases by ensuring that the subspace $\text{null}(A)$ is not tangent to $\mathcal{X}$, see Figure~\ref{fig:convex_slater}. In such pathological cases, solving~\eqref{prob:01}, namely, finding a point in $\mathcal{X}\cap \text{null}(A)$, can be particularly difficult. For instance, the alternating projection algorithm (which iteratively projects onto $\mathcal{X}$ and $\text{null}(A)$) has arbitrarily slow convergence, see Figure~\ref{fig:convex_slater}. +In general, the Slater's condition plays a key role in convex optimization as a sufficient condition for strong duality and, as a result, guarantees the success of a variety of primal-dual algorithms for linearly-constrained convex programs~\cite{bauschke2011convex}. Intuitively, the Slater's condition here removes any pathological cases by ensuring that the subspace $\text{null}(A)$ is not tangent to $\mathcal{X}$, see Figure~\ref{fig:convex_slater}. In such pathological cases, solving~\eqref{prob:01}, namely, finding a point in $\mathcal{X}\cap \text{null}(A)$, can be particularly difficult. For instance, the alternating projection algorithm (which iteratively projects onto $\mathcal{X}$ and $\text{null}(A)$) has arbitrarily slow convergence, see Figure~\ref{fig:convex_slater}. +{\color{red} Ahmet: a reference would be cool here} \begin{figure} \begin{center} \includegraphics[scale=.5]{Slater.pdf} \end{center} \caption{A pathological example where solving \eqref{prob:01} can be particularly difficult, even when \eqref{prob:01} is a convex program. See the first remark after Theorem~\ref{thm:main} for more details. \label{fig:convex_slater}} \end{figure} \textbf{Computational complexity.} Theorem~\ref{thm:main} allows us to specify the number of iterations that Algorithm~\ref{Algo:2} requires to reach a near-stationary point of Program~\eqref{prob:01} with a prescribed precision and, in particular, specifies the number of calls made to the solver in Step~2. In this sense, Theorem~\ref{thm:main} does not fully capture the computational complexity of Algorithm~\ref{Algo:2}, as it does not take into account the computational cost of the solver in Step~2. To better understand the complexity of Algorithm~\ref{Algo:2}, we consider two scenarios in the following. In the first scenario, we take the solver in Step~2 to be the Accelerated Proximal Gradient Method (APGM), a well-known first-order algorithm~\cite{ghadimi2016accelerated}. In the second scenario, we will use the second-order trust region method developed in~\cite{cartis2012complexity}. \subsection{First-Order Optimality} %\textbf{AE: we go back and forth between "subroutine" and "solver". for consistency, I'm just using "solver" everywhere.} -Let us first consider the first-order optimality case where the solver in Step~2 is the Accelerated Proximal Gradient Method (APGM)~\cite{ghadimi2016accelerated}. % \textbf{AE: Ahmet to give a brief description of APGM here. } +Let us first consider the first-order optimality case where the solver in Step~2 is APGM~\cite{ghadimi2016accelerated}. +APGM makes use of $\nabla_x \mathcal{L}_{\beta}(x,y)$, $\text{prox}_g$ and classical Nesterov acceleration step for the iterates~\cite{nesterov1983method} to obtain first order stationarity guarantees for solving~\eqref{e:exac}. + % \textbf{AE: Ahmet to give a brief description of APGM here. } %First, we characterize the iteration complexity of Algorithm~\ref{Algo:2} for finding a first-order stationary point of~\eqref{prob:01}. %We propose to use the standard accelerated proximal method (APGM), guarantees of which are analyzed in~\cite{ghadimi2016accelerated} for nonconvex problems of the form~\eqref{e:exac}. -Suppose that $g=1_\mathcal{X}$ is the indicator function on a bounded convex set $\mathcal{X}\subset \RR^d$ and let +Suppose that $g=\delta_\mathcal{X}$ is the indicator function on a bounded convex set $\mathcal{X}\subset \RR^d$ and let \begin{align} -x_{\max}= \max_{x\in \mathcal{X}}\|x\| +x_{\max}= \max_{x\in \mathcal{X}}\|x\|, \end{align} be the radius of a ball centered at the origin that includes $\mathcal{X}$. Then, adapting the results in~\cite{ghadimi2016accelerated} to our setup, APGM reaches $x_{k}$ in Step 2 of Algorithm~\ref{Algo:2} after \begin{equation} -\mathcal{O}\left ( \frac{\lambda_{\beta_{k-1}}^2 x_{\max}^2 }{\epsilon_k} \right) +\mathcal{O}\left ( \frac{\lambda_{\beta_{k}}^2 x_{\max}^2 }{\epsilon_{k+1}} \right) \label{eq:iter_1storder} \end{equation} -(inner) iterations, where $\lambda_{\beta_{k-1}}$ denotes the Lipschitz constant of $\nabla_x{\mathcal{L}_{\beta_{k-1}}(x, y)}$, bounded in~\eqref{eq:smoothness of Lagrangian}. We note that for simplicity, we use a looser bound in \eqref{eq:iter_1storder}. +(inner) iterations, where $\lambda_{\beta_{k}}$ denotes the Lipschitz constant of $\nabla_x{\mathcal{L}_{\beta_{k}}(x, y)}$, bounded in~\eqref{eq:smoothness of Lagrangian}. We note that for simplicity, we use a looser bound in \eqref{eq:iter_1storder} than~\cite{ghadimi2016accelerated}. Using \eqref{eq:iter_1storder}, we derive the following corollary, describing the total computational complexity of Algorithm~\ref{Algo:2} in terms of the calls to the first-order oracle in APGM. %\textbf{AE: we haven't talked about oracle before.} \begin{corollary}\label{cor:first} For $b>1$, let $\beta_k =b^k $ for every $k$. If we use APGM from~\cite{ghadimi2016accelerated} for Step~2 of Algorithm~\ref{Algo:2}, the algorithm finds an $(\epsilon_f,\b_k)$ first-order stationary point, see (\ref{eq:inclu3}), after $T$ calls to the first-order oracle, where % \begin{equation} -T = \mathcal{O}\left( \frac{Q^{\frac{3}{2}+\frac{1}{2b}} x_{\max}^2}{\epsilon^{\frac{3}{2}+\frac{1}{2b}}} \right) = \widetilde{\mathcal{O}}\left( \frac{Q^{\frac{3}{2}} x_{\max}^2}{\epsilon^{\frac{3}{2}}} \right). +T = \mathcal{O}\left( \frac{Q^3 x_{\max}^2}{\epsilon^{3}}\log_b{\left( \frac{Q}{\epsilon} \right)} \right) = \tilde{\mathcal{O}}\left( \frac{Q^{3} x_{\max}^2}{\epsilon^{3}} \right). \end{equation} \end{corollary} -For Algorithm~\ref{Algo:2}, to reach a near-stationary point with an accuracy of $\epsilon_f$ in the sense of \eqref{eq:inclu3} and with the lowest computational cost, we therefore need to perform only one iteration of Algorithm~\ref{Algo:2}, with $\b_1=b$ specified as a function of $\epsilon_f$ by \eqref{eq:stat_prec_first} in Theorem~\ref{thm:main}. In general, however, the constants in \eqref{eq:stat_prec_first} are unknown and this approach is intractable. Instead, the homotopy approach taken by Algorithm~\ref{Algo:2} ensures achieving the desired accuracy by gradually increasing the penalty weight. This homotopy approach increases the computational cost of Algorithm~\ref{Algo:2} only by a factor logarithmic in the $\epsilon_f$, as detailed in the proof of Corollary~\ref{cor:first}. +For Algorithm~\ref{Algo:2} to reach a near-stationary point with an accuracy of $\epsilon_f$ in the sense of \eqref{eq:inclu3} and with the lowest computational cost, we therefore need to perform only one iteration of Algorithm~\ref{Algo:2}, with $\b_1=b$ specified as a function of $\epsilon_f$ by \eqref{eq:stat_prec_first} in Theorem~\ref{thm:main}. In general, however, the constants in \eqref{eq:stat_prec_first} are unknown and this approach is intractable. Instead, the homotopy approach taken by Algorithm~\ref{Algo:2} ensures achieving the desired accuracy by gradually increasing the penalty weight. This homotopy approach increases the computational cost of Algorithm~\ref{Algo:2} only by a factor logarithmic in the $\epsilon_f$, as detailed in the proof of Corollary~\ref{cor:first}. %\textbf{AE: if we find some time, i'll add a couple of lines for how 1st order opt implies 2nd order opt for smooth constraints.} \subsection{Second-Order Optimality} %{\color{red}{Ahmet (note to myself): not sure of the constants of trust-region, check again}} \\ -Let us now consider the second-order optimality case where the solver in Step~2 is the the trust region method developed in~\cite{cartis2012complexity}. +Let us now consider the second-order optimality case where the solver in Step~2 is the the trust region method developed in~\cite{cartis2012complexity}. +Trust region method minimizes quadratic approximation of the function within a dynamically updated trust-region radius. +Second order trust region method that we consider in this section makes use of Hessian (or an approximation of Hessian) of the augmented Lagrangian in addition to first order oracles. As shown in~\cite{nouiehed2018convergence}, finding approximate second-order stationary points of convex-constrained problems is in general NP-hard. For this reason, we focus in this section on the special case of~\eqref{prob:01} with $g=0$. %We first give a theorem to show the convergence rate of the algorithm for second order stationarity: \\ %{\color{red}{Ahmet: I think that it is possible to remove sufficiently large k0 assumption here. }} \textbf{AE: not worth it really} %\begin{corollary} %\label{thm:main_second} %Under the assumptions of Theorem~\ref{thm:main}, the output of Algorithm~\ref{Algo:2} satisfies %\begin{align} %%& \dist^2( -\nabla_x \L_{\b_{k-1}}(x_k,y_k),\partial g(x_k)) + \| A(x_k)\|^2 \nonumber\\ %%& = \frac{1}{\b_{k-1}^2}\left( \frac{8 \lambda'_A\s_k^2 + 4}{ \nu^2} ( \lambda'_f+\lambda'_A y_{\max})^2 + 2\right) \nonumber \\ %%&=: \frac{Q}{\beta_{k-1}^2}. %\lambda_{\text{min}}(\nabla _{xx}\mathcal{L}_{\beta_{k-1}}(x_k, y_k)) \geq - \frac{C}{\beta_{k-1}} - \epsilon_{k-1}. %\end{align} %\end{corollary} % %We consider \textbf{AE: Ahmet to add a brief description of the algorithm.} -Let us compute the total computational complexity of Algorithm~\ref{Algo:2} with the trust region method in Step~2, in terms of the number of calls made to the second-order oracle. By adapting the result in~\cite{cartis2012complexity} to our setup, we find that the number of (inner) iterations required in Step~2 of Algorithm~\ref{Algo:2} to produce $x_k$ is +Let us compute the total computational complexity of Algorithm~\ref{Algo:2} with the trust region method in Step~2, in terms of the number of calls made to the second-order oracle. By adapting the result in~\cite{cartis2012complexity} to our setup, we find that the number of (inner) iterations required in Step~2 of Algorithm~\ref{Algo:2} to produce $x_{k+1}$ is % \begin{equation} -\mathcal{O}\left ( \frac{\lambda_{\beta_{k-1}, H}^2 (\mathcal{L}_{\beta_{k-1}}(x_1, y) - \min_{x}\mathcal{L}_{\beta}(x, y))}{\epsilon_k^3} \right), +\mathcal{O}\left ( \frac{\lambda_{\beta_{k}, H}^2 (\mathcal{L}_{\beta_{k}}(x_1, y) - \min_{x}\mathcal{L}_{\beta_k}(x, y))}{\epsilon_k^3} \right), \label{eq:sec_inn_comp} \end{equation} % -where $\lambda_{\beta, H}$ is the Lipschitz constant of the Hessian of the augmented Lagrangian, which is of the order of $\beta$, as can be proven similar to Lemma~\ref{lem:smoothness}. \textbf{AE: This is somewhat speculative. Make precise?} -In~\cite{cartis2012complexity}, the term $\mathcal{L}_{\beta}(x_1, y) - \min_{x}\mathcal{L}_{\beta}(x, y)$ is bounded by a constant independent of $\epsilon$. \textbf{AE: If so, there is perhaps no need to mention it in the above bound. It'd the bound.} -We have a similar assumption for this quantity $\forall \beta_k$. \textbf{AE: A bit vague. I couldn't follow...} Using \eqref{eq:sec_inn_comp} and Theorem~\ref{thm:main}, we arrive at the following result. -% -\begin{corollary}\label{cor:second} \textbf{AE: once you converge on the statement here, please rewrite to match the language of the previous corollary as much as possible for consistency.} -Let us use the following form for $\beta$ -\begin{equation*} -\beta_k = \alpha^k, ~~ \alpha > 1. -\end{equation*} +where $\lambda_{\beta, H}$ is the Lipschitz constant of the Hessian of the augmented Lagrangian, which is of the order of $\beta$, as can be proven similar to Lemma~\ref{lem:smoothness} and $x_1$ is the initial iterate of the given outer loop. +In~\cite{cartis2012complexity}, the term $\mathcal{L}_{\beta}(x_1, y) - \min_{x}\mathcal{L}_{\beta}(x, y)$ is bounded by a constant independent of $\epsilon$. +We assume a uniform bound for this quantity $\forall \beta_k$, instead of for one value of $\beta_k$ as in~\cite{cartis2012complexity}. Using \eqref{eq:sec_inn_comp} and Theorem~\ref{thm:main}, we arrive at the following result. % -We use the trust region method from~\cite{cartis2012complexity} for step 2 in Algorithm~\ref{Algo:2} and assume that +\begin{corollary}\label{cor:second} +For $b>1$, let $\beta_k =b^k $ for every $k$. +We assume that \begin{equation} -\mathcal{L}_{\beta}(x_1, y) - \min_{x}\mathcal{L}_{\beta}(x, y) +\mathcal{L}_{\beta}(x_1, y) - \min_{x}\mathcal{L}_{\beta}(x, y) \leq L_{u},~~ \forall \beta_k. \end{equation} -is upper bounded by a constant independent of $\epsilon$, for all $\beta_k$. -Algorithm~\ref{Algo:2} then finds an $\epsilon$-second-order stationary point of~\eqref{prob:01} in $T_K$ total number of calls to the second order oracle where +If we use the trust region method from~\cite{cartis2012complexity} for Step~2 of Algorithm~\ref{Algo:2}, the algorithm finds an $\epsilon$-second-order stationary point of~\eqref{prob:01} in $T_K$ total number of calls to the second order oracle where % \begin{equation} -T_K \leq \tilde{\mathcal{O}}\left( \frac{Q^{\frac{5}{2}}}{\epsilon^{5}} \right). +T_K \leq \mathcal{O}\left( \frac{L_u Q^{5}}{\epsilon^{5}} \log_b{\left( \frac{Q}{\epsilon} \right)} \right) = \tilde{\mathcal{O}}\left( \frac{L_u Q^{5}}{\epsilon^{5}} \right). \end{equation} \end{corollary} % Before closing this section, let us add that the remark after Corollary~\ref{cor:first} applies here as well. -{\color{red} Ahmet: I'll check again the constants of the corollaries, eps dependence should be correct.}\\ % diff --git a/ICML19/Drafts/sections/introduction.tex b/ICML19/Drafts/sections/introduction.tex index 4f4f4bf..64f1a2e 100644 --- a/ICML19/Drafts/sections/introduction.tex +++ b/ICML19/Drafts/sections/introduction.tex @@ -1,88 +1,91 @@ +%!TEX root = ../main.tex \section{Introduction} \label{intro} -We study the following non-convex optimization problem +We study the following nonconvex optimization problem % \begin{equation} \label{prob:01} \begin{cases} \underset{x\in \mathbb{R}^d}{\min}\,\, f(x)+g(x)\\ -A(x) = 0 , +A(x) = b, \end{cases} \end{equation} % -where $f:\mathbb{R}^d\rightarrow\mathbb{R}$ is possibly non-convex and $A:\mathbb{R}^d\rightarrow\mathbb{R}^m$ is a nonlinear operator. We assume that $g:\mathbb{R}^d\rightarrow\mathbb{R}$ is proximal-friendly convex function. % (not necessarily smooth). +where $f:\mathbb{R}^d\rightarrow\mathbb{R}$ is possibly non-convex and $A:\mathbb{R}^d\rightarrow\mathbb{R}^m$ is a nonlinear operator and $b\in\mathbb{R}^m$. +For clarity of notation, we take $b=0$ in the sequel, the extension to any $b$ is trivial. +We assume that $g:\mathbb{R}^d\rightarrow\mathbb{R}$ is proximal-friendly (possibly nonsmooth) convex function. % (not necessarily smooth). %For instance, for a convex set $\mathcal{X}\subset\RR^d$, letting $g=\delta_\mathcal{X}$ to be the indicator function on $\mathcal{X}$, ~\eqref{prob:01} is a nonconvex optimization problem with the convex constraint $x\in \mathcal{X}$. %Our other requirement for $g$ is that computing its proximal operator is inexpensive, see Section~\ref{sec:preliminaries}. A host of problems in computer science \cite{khot2011grothendieck, lovasz2003semidefinite}, machine learning \cite{mossel2015consistency, song2007dependence}, and signal processing \cite{singer2011angular, singer2011three} naturally fall under the template of ~\eqref{prob:01}, including max-cut, clustering, generalized eigenvalue, as well as community detection. % \textbf{AE: We need to add some more specifics to this paragraph to convince people that this is an important template. What are the names of the problems you're citing? Max-cut, sparse regression, statistical learning with deep networks as prior \cite{Bora2017}, what else we can think of?} -To address these applications, this paper builds up on the vast literature on the classical inexact augmented Lagrangian framework and proposes a simple, intuitive and yet easy-to-implement algorithm with a total complexity result for ~\eqref{prob:01} under an interpretable geometric condition. Before we elaborate on the results, let us first motivate ~\eqref{prob:01} with an important application to Semidefinite ming (SDP): - - +To address these applications, this paper builds up on the vast literature on the classical inexact augmented Lagrangian framework and proposes a simple, intuitive and yet easy-to-implement algorithm with total complexity results for ~\eqref{prob:01} under an interpretable geometric condition. Before we elaborate on the results, let us first motivate ~\eqref{prob:01} with an important application to semidefinite programming (SDP): \vspace{-3mm} \paragraph{Vignette: Burer-Monteiro splitting.} A powerful convex relaxation for max-cut, clustering, and several other problems above is provided by the SDP \begin{equation} \label{eq:sdp} \begin{cases} \underset{X\in\mathbb{S}^{d \times d}}{\min} \langle C, X \rangle \\ -\mathcal{A}(X) = b, \,\, X \succeq 0, +B(X) = b, \,\, X \succeq 0, \end{cases} \end{equation} % -\textbf{AE: let's keep A instead of $\mathcal{A}$ for consistency } where $C\in \RR^{d\times d}$ and $X$ is a positive semidefinite and symmetric $d\times d$ matrix, %$\mathbb{S}^{d \times d}$ denotes the set of real symmetric matrices, -and $\mathcal{A}: \mathbb{S}^{d\times d} \to \mathbb{R}^m$ is a linear operator. If the unique-games conjecture is true, SDPs achieve the best approximation for the underlying discrete problem~\cite{raghavendra2008optimal}. +and ${B}: \mathbb{S}^{d\times d} \to \mathbb{R}^m$ is a linear operator. If the unique-games conjecture is true, SDPs achieve the best approximation for the underlying discrete problem~\cite{raghavendra2008optimal}. Since $d$ is often large, many first- and second-order methods for solving such SDP's are immediately ruled out, not only due to their high computational complexity, but also due to their storage requirements, which are $\mathcal{O}(d^2)$. A contemporary challenge in optimization therefore is to solve SDP's in small space and in a scalable fashion. A recent algorithm, i.e., homotopy conditional gradient method (HCGM) based on Linear Minimization Oracles (LMO), can address this template in small space via sketching \cite{yurtsever2018conditional}; however, such LMO-based methods are extremely slow in obtaining accurate solutions. %In practice, $d$ is often very large which makes interior point methods, with their poor scaling in $d$, an unattractive option for solving ~\eqref{eq:sdp}. Attempts to resolve this issue prompted extensive research in computationally- and memory-efficient SDP solvers. The first such solvers relied on the so-called Linear Minimization Oracle (LMO), reviewed in Section~\ref{sec:related work}, alongside other scalabe SDP solvers. -A key approach for solving \eqref{prob:01}, dating back to~\cite{burer2003nonlinear, burer2005local}, is the so-called Burer-Monteiro (BR) splitting $X=UU^\top$, where $U\in\mathbb{R}^{d\times r}$ and $r$ is selected according to the guidelines in~\cite{pataki1998rank, barvinok1995problems} so as to remove spurious local minima of the non-convex factorized problem. Evidently, BR splitting has the advantage of lower storage and faster iterations. +A key approach for solving \eqref{prob:01}, dating back to~\cite{burer2003nonlinear, burer2005local}, is the so-called Burer-Monteiro (BR) splitting $X=UU^\top$, where $U\in\mathbb{R}^{d\times r}$ and $r$ is selected according to the guidelines in~\cite{pataki1998rank, barvinok1995problems}. +It has been shown that these bounds on the rank, which are shown to be optimal~\cite{waldspurger2018rank}, under some suitable assumptions remove the spurious local minima of the nonconvex factorized problem~\cite{boumal2016non}. + +%so as to remove spurious local minima of the nonconvex factorized problem. Evidently, BR splitting has the advantage of lower storage and faster iterations. -This splitting results in the non-convex +This splitting results in the nonconvex problem \begin{equation} \label{prob:nc} \begin{cases} \underset{U\in\mathbb{R}^{d \times r}}{\min} \langle C, UU^\top \rangle \\ -\mathcal{A}(UU^\top) = b, +B(UU^\top) = b, \end{cases} \end{equation} which can be written in the form of ~\eqref{prob:01}. %\textbf{Vignette 2: Latent group lasso} %See Section ? for several examples. \textbf{We should explain this better and give more high level examples to motivate this .} %An example of our template in \eqref{prob:01} is semi-definite ming which provides powerful relaxations to above problems. Denote the space of $d'\times d'$ symmetric matrices by $\mathbb{S}^{d'\times d'}$ and consider % %\begin{equation} % \label{sdp} % \min_x \{h'(x): A'(x) = b', ~~x \in \mathcal{C'}~~\text{and}~~x\succeq0 \} %\end{equation} % %where $h': \mathbb{S}^{d'\times d'} \to \RR$, $A'\colon\mathbb{S}^{d'\times d'}\to\RR^m$, $b'\in\RR^m$, and $C' \subseteq \mathbb{R}^{d'\times d'}$. This template clearly can be put to the form of \eqref{prob:01} by using \emph{Burer-Monteiro factorization} \cite{burer2003nonlinear, burer2005local}. -To solve \eqref{prob:nc}, the inexact Augmented Lagrangian Method (iALM) is widely used~\cite{burer2003nonlinear, kulis2007fast}, due to its cheap per iteration cost and also its empirical success in practice. Every (outer) iteration of iALM calls a solver to inexactly solve an intermediate augmented Lagrangian subproblem to near stationarity, and the user has freedom in choosing this solver, which could use first-order (say,~proximal gradient descent \cite{parikh2014proximal}) or second-order information, such as BFGS \cite{nocedal2006numerical}. +To solve \eqref{prob:nc}, the inexact Augmented Lagrangian Method (iALM) is widely used~\cite{burer2003nonlinear, burer2005local, kulis2007fast}, due to its cheap per iteration cost and also its empirical success in practice. Every (outer) iteration of iALM calls a solver to inexactly solve an intermediate augmented Lagrangian subproblem to near stationarity, and the user has freedom in choosing this solver, which could use first-order (say,~proximal gradient descent \cite{parikh2014proximal}) or second-order information, such as BFGS \cite{nocedal2006numerical}. We argue that unlike its convex counterpart~\cite{nedelcu2014computational,lan2016iteration,xu2017inexact}, the convergence rate and the complexity of iALM for ~\eqref{prob:nc} are not well-understood, see Section~\ref{sec:related work} for a review of the related literature. Indeed, addressing this important theoretical gap is one of the contributions of our work. {\textbf{Contributions.} \\[1mm] $\circ$ Our framework is future-proof in the sense that we obtain the convergence rate of iALM for ~\eqref{prob:01} with an arbitrary solver for finding first- and second-order stationary points. \\[2mm] -$\circ$ We investigate using different solvers for augmented Lagrangian subproblems and provide overall iteration complexity bounds for finding first-order and second-order stationary points of ~\eqref{prob:01}. Our complexity bounds match the best theoretical complexity results in the optimization literature, see Section~\ref{sec:related work}. \\[2mm] +$\circ$ We investigate using different solvers for augmented Lagrangian subproblems and provide overall iteration complexity bounds for finding first-order and second-order stationary points of ~\eqref{prob:01}. Our complexity bounds match the known theoretical complexity results in the optimization literature, see Section~\ref{sec:related work}. \\[2mm] $\circ$ We propose a novel geometric condition that simplifies the algorithmic analysis for iALM. We verify the condition for key problems described in Section~\ref{sec:experiments}. \textbf{Roadmap.} Section~\ref{sec:preliminaries} collects the main tools and our notation. % that we utilize. We present the iALM in Section~\ref{sec:AL algorithm} and obtain its convergence rate to first- and second-order stationary points in Section~\ref{sec:cvg rate}, alongside their iteration complexities. We provide a comprehensive review of the literature and highlight our key differences in % with the precedents in Section~\ref{sec:related work}. Section~\ref{sec:experiments} presents the numerical evidence and comparisons with the state-of-the-art methods. diff --git a/ICML19/Drafts/sections/preliminaries.tex b/ICML19/Drafts/sections/preliminaries.tex index 8f6ed88..f9754e4 100644 --- a/ICML19/Drafts/sections/preliminaries.tex +++ b/ICML19/Drafts/sections/preliminaries.tex @@ -1,180 +1,180 @@ \vspace{-3mm} \section{Preliminaries \label{sec:preliminaries}} \paragraph{\textbf{{Notation.}}} We use the notation $\langle\cdot ,\cdot \rangle $ and $\|\cdot\|$ for the {standard inner} product and the norm on $\RR^d$. For matrices, $\|\cdot\|$ and $\|\cdot\|_F$ denote the spectral and the Frobenius norms, respectively. %For a matrix $M$, its row and null spaces are denoted by $\row(A)$ and $\text{null}(A)$, respectively. For a convex function $g:\RR^d\rightarrow\RR$, the subdifferential set at $x\in \RR^d$ is denoted by $\partial g(x)$ and we will occasionally use the notation $\partial g(x)/\b = \{ z/\b : z\in \partial g(x)\}$. -When presenting iteration complexity results, we often $\widetilde{O}(\cdot)$ which suppresses the logarithmic dependencies. +When presenting iteration complexity results, we often use $\widetilde{O}(\cdot)$ which suppresses the logarithmic dependencies. %The proximal operator $P_g:\RR^d\rightarrow\RR^d$ takes $x$ to %\begin{align} %P_g(x) = \underset{y}{\argmin} \, g(y) + \frac{1}{2}\|x-y\|^2, %\label{eq:defn of prox} %\end{align} %and, if $g=1_C$ is the indicator function of a convex set $C$, we will use the shorthand $P_C=P_{1_C}$ to show the orthogonal projection onto $C$. We use the indicator function $\delta_\mathcal{X}:\RR^d\rightarrow\RR$ of a set $\mathcal{X}\subset\RR^d$, which takes $x$ to \begin{align} \delta_\mathcal{X}(x) = \begin{cases} 0 & x \in \mathcal{X}\\ \infty & x\notin \mathcal{X}. \end{cases} \label{eq:indicator} \end{align} %The tangent cone of $C$ at $x\in C$ is denoted by $T_C(x)$ and The distance function from a point $x$ to $\mathcal{X}$ is denoted by $\dist(x,\mathcal{X}) = \min_{z\in \mathcal{X}} \|x-z\|$. For integers $k_0 \le k_1$, we denote $[k_0:k_1]=\{k_0,\ldots,k_1\}$. For an operator $A:\RR^d\rightarrow\RR^m$ with components $\{A_i\}_{i=1}^m$, we let $DA(x) \in \mathbb{R}^{m\times d}$ denote the Jacobian of $A$, where the $i$th row of $DA(x)$ is the gradient vector $\nabla A_i(x) \in \RR^d$. -\textbf{AE: we haven't defined the augmented L yet. so i removed the hessian notation, which can be defined when it first appears.} %We denote the Hessian of the augmented Lagrangian with respect to $x$ as $\nabla _{xx} \mathcal{L}_{\beta}(x,y)$. \\ %{\color{red} define $A_i$ rigorously.} \textbf{Smoothness.} -We require $f:\RR^d\rightarrow\RR$ and $A:\RR^d\rightarrow \RR^m$ in Program~\ref{prob:01} to be smooth; i.e., there exists $\lambda_f,\lambda_A\ge 0$ such that +We require $f:\RR^d\rightarrow\RR$ and $A:\RR^d\rightarrow \RR^m$ in~\eqref{prob:01} to be smooth; i.e., there exists $\lambda_f,\lambda_A\ge 0$ such that % %\vspace{-.3cm} \begin{align} \| \nabla f(x) - \nabla f(x')\| & \le \lambda_f \|x-x'\|, \nonumber\\ \| DA(x) - DA(x') \| & \le \lambda_A \|x-x'\|, \label{eq:smoothness basic} \end{align} for every $x,x'\in \mathbb{R}^d$. \textbf{Augmented Lagrangian method}. ALM is a classical algorithm, proposed in~\cite{hestenes1969multiplier, powell1969method} and further studied in~\cite{bertsekas2014constrained}. -For solving \eqref{prob:01}, ALM suggests solving the equivalent problem +For solving \eqref{prob:01}, ALM suggests solving the problem % %\vspace{-.3cm} \begin{equation} \min_{x} \max_y \,\,\mathcal{L}_\beta(x,y) + g(x), \label{eq:minmax} \end{equation} %\vspace{-.5cm} % where, for $\b>0$, $\mathcal{L}_\b$ is the corresponding augmented Lagrangian, defined as \begin{align} \label{eq:Lagrangian} \mathcal{L}_\beta(x,y) := f(x) + \langle A(x), y \rangle + \frac{\beta}{2}\|A(x)\|^2. \end{align} %\textbf{AE: Adding the bias $b$ doesn't seem to add much except making the presentation more clunky.} %\vspace{-.5cm} The minimax formulation in \eqref{eq:minmax} naturally suggests the following algorithm for solving \eqref{prob:01}. For dual step sizes $\{\s_k\}_k$, consider the iterates \begin{equation}\label{e:exac} x_{k+1} \in \underset{x}{\argmin} \,\, \mathcal{L}_{\beta}(x,y_k)+g(x), \end{equation} \begin{equation*} y_{k+1} = y_k+\s_k A(x_{k+1}). \end{equation*} -However, updating $x$ above requires solving the non-convex Program~\eqref{e:exac} to global optimality, which is typically intractable. Instead, it is often easier to find an approximate first- or second-order stationary point of Program~\eqref{e:exac}. +However, updating $x_{k+1}$ above requires solving the nonconvex problem~\eqref{e:exac} to optimality, which is typically intractable. Instead, it is often easier to find an approximate first- or second-order stationary point of~\eqref{e:exac}. %We therefore consider an algorithm that only requires approximate stationarity in every iteration. -Hence, we argue that by gradually improving the stationarity precision and increasing the penalty weight $\b$ above, we can reach a stationary point of Program~\eqref{prob:01}, as detailed in Section~\ref{sec:AL algorithm}. +Hence, we argue that by gradually improving the stationarity precision and increasing the penalty weight $\b$ above, we can reach a stationary point of the main problem in~\eqref{prob:01}, as detailed in Section~\ref{sec:AL algorithm}. {\textbf{{{Optimality conditions.}}} \label{sec:opt cnds}} -{First-order necessary optimality conditions} for \eqref{prob:01} are well-understood. {Indeed, $x\in \RR^d$ is a first-order stationary point of Program \eqref{prob:01} if there exists $y\in \RR^m$ such that +{First-order necessary optimality conditions} for \eqref{prob:01} are well-understood. {Indeed, $x\in \RR^d$ is a first-order stationary point of~\eqref{prob:01} if there exists $y\in \RR^m$ such that \begin{align} \begin{cases} -\nabla f(x) - DA(x)^\top y \in \partial g(x)\\ A(x) = 0, \end{cases} \label{e:inclu1} \end{align} where $DA(x)$ is the Jacobian of $A$ at $x$. Recalling \eqref{eq:Lagrangian}, we observe that \eqref{e:inclu1} is equivalent to \begin{align} \begin{cases} -\nabla_x \mathcal{L}_\beta(x,y) \in \partial g(x)\\ A(x) = 0, \end{cases} \label{e:inclu2} \end{align} which is in turn the necessary optimality condition for \eqref{eq:minmax}. } %Note that \eqref{e:inclu2} is equivalent to %\begin{align} %\left[ %\begin{array}{c} %\nabla_x \L_\b(x,y) \\ %\nabla_y \L_\b(x,y) %\end{array} %\right] %\in %\left\{ %\left[ %\begin{array}{c} % v\\ %0 %\end{array} %\right] %: v\in \partial g(x) \right\} %, %\end{align} %which rephrases the stationarity condition in terms of the gradient of the duality gap of Program~\eqref{eq:Lagrangian}. Inspired by this, we say that $x$ is an $(\epsilon_f,\b)$ first-order stationary point if \begin{align} \begin{cases} \dist(-\nabla_x \mathcal{L}_\beta(x,y), \partial g(x)) \leq \epsilon_f \\ \| A(x) \| \leq \epsilon_f, \end{cases} \label{eq:inclu3} \end{align} -for $\epsilon_s\ge 0$. +for $\epsilon_f\ge 0$. In light of \eqref{eq:inclu3}, a suitable metric for evaluating the stationarity of a pair $(x,y)\in \RR^d\times \RR^m$ is \begin{align} \dist^2\left(-\nabla_x \mathcal{L}_\beta(x,y), \partial g(x) \right) + \|A(x)\|^2 , \label{eq:cvg metric} \end{align} -which we use as the stopping criterion. When $g=0$, it is also not difficult to verify that the expression in \eqref{eq:cvg metric} is the norm of the gradient of the duality gap in \eqref{eq:minmax}. +which we use as the first-order stopping criterion. +%When $g=0$, it is also not difficult to verify that the expression in \eqref{eq:cvg metric} is the norm of the gradient of the duality gap in \eqref{eq:minmax}. As an example, for a convex set $\mathcal{X}\subset\RR^d$, suppose that $g = \delta_\mathcal{X}$ is the indicator function on $\mathcal{X}$.} -Let also $T_\mathcal{X}(x) \subseteq \RR^d$ denote the tangent cone to $\mathcal{X}$ at $x$, and with $P_{T_\mathcal{X}(x)}:\RR^d\rightarrow\RR^d$ denote the orthogonal projection onto this tangent cone. Then, for $u\in\RR^d$, it is not difficult to verify that +Let also $T_\mathcal{X}(x) \subseteq \RR^d$ denote the tangent cone to $\mathcal{X}$ at $x$, and with $P_{T_\mathcal{X}(x)}:\RR^d\rightarrow\RR^d$, we denote the orthogonal projection onto this tangent cone. Then, for $u\in\RR^d$, it is not difficult to verify that \begin{align}\label{eq:dist_subgrad} \dist\left(u, \partial g(x) \right) = \| P_{T_\mathcal{X}(x)}(u) \|. \end{align} % When $g = 0$, a first-order stationary point $x\in \RR^d$ of \eqref{prob:01} is also second-order stationary if % %\vspace{-.5cm} \begin{equation} \lambda_{\text{min}}(\nabla _{xx} \mathcal{L}_{\beta}(x,y)) > 0, \end{equation} %\vspace{-.5cm} % where $\nabla_{xx}\mathcal{L}_\b$ is the Hessian with respect to $x$, and $\lambda_{\text{min}}(\cdot)$ returns the smallest eigenvalue of its argument. Analogously, $x$ is an $(\epsilon_f, \epsilon_s,\b)$ second-order stationary point if, in addition to \eqref{eq:inclu3}, it holds that % %\vspace{-.5cm} \begin{equation}\label{eq:sec_opt} \lambda_{\text{min}}(\nabla _{xx} \mathcal{L}_{\beta}(x,y)) \ge -\epsilon_s, \end{equation} for $\epsilon_s>0$. %\vspace{-.5cm} % Naturally, for second-order stationarity, we use $\lambda_{\text{min}}(\nabla _{xx} \mathcal{L}_{\beta}(x,y))$ as the stopping criterion. \paragraph{{\textbf{Smoothness lemma.}}} This next result controls the smoothness of $\L_\b(\cdot,y)$ for a fixed $y$. The proof is standard but nevertheless included in Appendix~\ref{sec:proof of smoothness lemma} for completeness. \begin{lemma}[\textbf{smoothness}]\label{lem:smoothness} For fixed $y\in \RR^m$ and $\rho,\rho'\ge 0$, it holds that \begin{align} \| \nabla_x \mathcal{L}_{\beta}(x, y)- \nabla_x \mathcal{L}_{\beta}(x', y) \| \le \lambda_\b \|x-x'\|, \end{align} for every $x,x' \in \{ x'': \|A(x'') \|\le \rho, \|x''\|\le \rho'\}$, where \begin{align} \lambda_\beta & \le \lambda_f + \sqrt{m}\lambda_A \|y\| + (\sqrt{m}\lambda_A\rho + d \lambda'^2_A )\b \nonumber\\ & =: \lambda_f + \sqrt{m}\lambda_A \|y\| + \lambda''(A,\rho,\rho') \b. \label{eq:smoothness of Lagrangian} \end{align} Above, $\lambda_f,\lambda_A$ were defined in (\ref{eq:smoothness basic}) and \begin{align} \lambda'_A := \max_{\|x\|\le \rho'}\|DA(x)\|. \end{align} \end{lemma} diff --git a/ICML19/Drafts/sections/related_works.tex b/ICML19/Drafts/sections/related_works.tex index 9e80805..52241ad 100644 --- a/ICML19/Drafts/sections/related_works.tex +++ b/ICML19/Drafts/sections/related_works.tex @@ -1,89 +1,94 @@ \begin{figure*}[ht] % \includegraphics[width=0.4\textwidth,natwidth=1300,natheight=1300]{bp_fig1.pdf} \begin{center} {\includegraphics[width=1\columnwidth]{figs/bp_fig3_notconverged.pdf} \includegraphics[width=1\columnwidth]{figs/bp_fig2_small.pdf}} %\centerline{\includegraphics[width=1\columnwidth]{figs/bp_fig2_small.pdf}} \label{fig:bp1} \caption{(\textit{Left}) Effect of using different subsolver for the aforementioned non-convex relaxation. APGM is a first order method and does not guarantee the convergence to second order critical point. LBFGS is second order method without any convergence guarantee in the non-convex programming. Hence, it also gets stuck in a local minima if not initialized properly. (\textit{Right}) Comparison of different algorithms for solving basis pursuit template. } %(left:$[n=100$, $d=10^4]$, right:$[n=34$, $d=10^2$])} \end{center} \end{figure*} \section{Related works \label{sec:related work}} ALM has a long history in the optimization literature, dating back to~\cite{hestenes1969multiplier, powell1969method}. -In the special case of~\eqref{prob:01} with a convex $f$ and a linear operator $A$, standard, inexact and linearized versions of ALM have been extensively studied~\cite{lan2016iteration,nedelcu2014computational,tran2018adaptive,xu2017inexact}. +In the special case of~\eqref{prob:01} with a convex function $f$ and a linear operator $A$, standard, inexact and linearized versions of ALM have been extensively studied~\cite{lan2016iteration,nedelcu2014computational,tran2018adaptive,xu2017inexact}. -Classical works on ALM focused on the general template of~\eqref{prob:01} with nonconvex $f$ and nonlinear $A$, had arguably stronger assumptions and required exact solutions to the subproblems of the form \eqref{e:exac}, which appear in Step 2 of Algorithm~\ref{Algo:2}, see for instance \cite{bertsekas2014constrained}. -A similar analysis was conducted in~\cite{fernandez2012local} with the same assumptions on~\eqref{prob:01}, where the authors considered inexact ALM under specific assumptions on the initialization of the dual variable with convergence rates \textbf{AE: somewhat vague. did they require certain convergence of dual variable or did they give cvg rate for the algorithm?}. However, unlike our results, the authors did not provide total complexity results and verifiable conditions. +Classical works on ALM focused on the general template of~\eqref{prob:01} with nonconvex $f$ and nonlinear $A$, with arguably stronger assumptions and required exact solutions to the subproblems of the form \eqref{e:exac}, which appear in Step 2 of Algorithm~\ref{Algo:2}, see for instance \cite{bertsekas2014constrained}. +A similar analysis was conducted in~\cite{fernandez2012local} for the general template of~\eqref{prob:01}. +The authors considered inexact ALM and proved convergence rates for the outer iterates, under specific assumptions on the initialization of the dual variable. +However, unlike our results, the authors did not analyze how to solve the subproblems inexactly and they did not provide total complexity results and verifiable conditions. %\textbf{AE: Mention if this paper was convex or nonconvex?} -Program~\eqref{prob:01} with similar assumptions is also studied in~\cite{birgin2016evaluation} and~\cite{cartis2018optimality} for first-order and second-order stationarity, respectively, with explicit iteration complexity analysis. +Problem~\eqref{prob:01} with similar assumptions to us is also studied in~\cite{birgin2016evaluation} and~\cite{cartis2018optimality} for first-order and second-order stationarity, respectively, with explicit iteration complexity analysis. As we have mentioned in Section~\ref{sec:cvg rate}, our iteration complexity results matches these theoretical algorithms with a simpler algorithm and a simpler analysis. In addition, these algorithms require setting final accuracies since they utilize this information in the algorithm. In contrast to \cite{birgin2016evaluation,cartis2018optimality}, Algorithm~\ref{Algo:2} does not require setting accuracies a priori. \cite{cartis2011evaluation} also considers the same template~\eqref{prob:01} for first-order stationarity with a penalty-type method instead of ALM. Even though the authors show $\mathcal{O}(1/\epsilon^2)$ complexity, this result is obtained by assuming that the penalty parameter remains bounded. We note that such an assumption can also be used to improve our complexities. -\cite{bolte2018nonconvex} studies the general template~\eqref{prob:01} with specific assumptions involving local error bound conditions for the program~\eqref{prob:01}. -These conditions are studied in detail in~\cite{bolte2017error}, but their validity for general SDPs~\eqref{eq:sdp} has never been established. This work also lacks the total iteration complexity presented here. % that we can compare. +\cite{bolte2018nonconvex} studies the general template~\eqref{prob:01} with specific assumptions involving local error bound conditions for the~\eqref{prob:01}. +These conditions are studied in detail in~\cite{bolte2017error}, but their validity for general SDPs~\eqref{eq:sdp} has never been established. This work also lacks the total iteration complexity analysis presented here. % that we can compare. -Another recent line of work~\cite{clason2018acceleration} \textbf{AE:when you say line of work, it suggests more than one reference.} focused on solving the program~\eqref{prob:01} by adapting the primal-dual method of Chambolle and Pock~\cite{chambolle2011first}. -The authors proved the convergence of the method and provided convergence rate by imposing conditions on the objective function that do not hold for standard SDPs.%~\eqref{eq:sdp}. +Another work~\cite{clason2018acceleration} focused on solving~\eqref{prob:01} by adapting the primal-dual method of Chambolle and Pock~\cite{chambolle2011first}. +The authors proved the convergence of the method and provided convergence rate by imposing error bound conditions on the objective function that do not hold for standard SDPs.%~\eqref{eq:sdp}. %Some works also considered the application of ALM/ADMM to nonconvex problems~\cite{wang2015global, liu2017linearized}. %These works assume that the operator in~\eqref{prob:01} is linear, therefore, they do not apply to our setting. %since we have a nonlinear constraint in addition to a nonconvex objective function. \cite{burer2003nonlinear, burer2005local} is the first work that proposes the splitting $X=UU^\top$ for solving SDPs of the form~\eqref{eq:sdp}. Following these works, the literature on Burer-Monteiro (BM) splitting for the large part focused on using ALM for solving the reformulated problem~\eqref{prob:nc}. However, this approach has a few drawbacks: First, it requires exact solutions in Step 2 of Algorithm~\ref{Algo:2} in theory, which in practice is replaced with inexact solutions. Second, their results only establish convergence without providing the rates. In this sense, our work provides a theoretical understanding of the BM splitting with inexact solutions to Step 2 of Algorithm~\ref{Algo:2} and complete iteration complexities. -\textbf{AE: Ahmet you discuss global optimality below but are you sure that's relevant to our work? in the sense that we just give an algorithm to solve to local optimality.} +%\textbf{AE: Ahmet you discuss global optimality below but are you sure that's relevant to our work? in the sense that we just give an algorithm to solve to local optimality.} %Their practical stopping condition is also not analyzed theoretically. %In addition, Burer and Monteiro in~\cite{burer2005local} needed to bound the primal iterates they have to put an artificial bound to the primal domain which will be ineffective in practice; which is impossible to do without knowing the norm of the solution. % and their results do not extend to the case where projection in primal domain are required in each iteration. -\cite{bhojanapalli2016dropping, park2016provable} are among the earliest efforts to show the global optimality of BM splitting, focusing on the special case of SDPs without any linear constraints. \textbf{AE: isn't that trivial or am I missing something?} -For these specific problems, they prove the convergence of gradient descent to global optima, assuming favorable initialization. +\cite{bhojanapalli2016dropping, park2016provable} are among the earliest efforts to show convergence rates for BM splitting, focusing on the special case of SDPs without any linear constraints. +For these specific problems, they prove the convergence of gradient descent to global optima with convergence rates, assuming favorable initialization. These results, however, do not apply to general SDPs of the form~\eqref{eq:sdp} where the difficulty arises due to the linear constraints. -\textbf{AE: also if you do want to talk about global optimality you could also talk about the line of works that show no spurious local optima.} +{\color{red} Ahmet: I have to cite this result bc it is very relevant.} +%\textbf{AE: also if you do want to talk about global optimality you could also talk about the line of works that show no spurious local optima.} %In their followup work~, the authors considered projected gradient descent, but only for restricted strongly convex functions. %Their results are not able to extend to linear constraints and general convex functions. %, therefore not applicable to general SDPs. -Another popular method for solving SDPs are due to~\cite{boumal2014manopt, boumal2016global, boumal2016non}, focusing on the case where the constraints in~\eqref{prob:01} can be written as a Riemannian manifold after BM splitting. \textbf{AE: reimannian manifold is by definition smooth.} -In this case, the authors apply the Riemannian gradient descent and Riemannian trust region methods for obtaining first- and second-order stationary points, respectively. -They obtain~$\mathcal{O}(1/\epsilon^2)$ complexity for finding first-order stationary points and~$\mathcal{O}(1/\epsilon^3)$ complexity for finding second-order stationary points. -Even though these complexities are faster than what we offer in Section~\ref{sec:cvg rate}, the smooth manifold requirement in these works is indeed restrictive. In particular, this requirement holds for max-cut and generalized eigenvalue problems, but it is not satisfied for other important SDPs such as quadratic programming (QAP), optimal power flow and clustering. \textbf{AE: to be safe, plz give citation for these applications.} -In addition, as noted in~\cite{boumal2016global}, per iteration cost of their method for max-cut problem is an astronomical~$\mathcal{O}({d^6})$. % which is astronomical. %ly larger than our cost of $\mathcal{O}(n^2r)$ where $r \ll n$. - %They do not apply to the general semidefinite programming since $f(U)=\langle C, UU^\ast \rangle$. %, these conditions are not satisfied. - +{\color{red} Ahmet: I have to cite this result bc it is relevant and the reviewers will complain if I don't} \cite{bhojanapalli2018smoothed} focused on the quadratic penalty formulation of~\eqref{prob:01}, namely, \begin{equation}\label{eq:pen_sdp} -\min_{X\succeq 0} \langle C, X\rangle + \frac{\mu}{2} \| \mathcal{A}(x)-b\|^2, +\min_{X\succeq 0} \langle C, X\rangle + \frac{\mu}{2} \| B(x)-b\|^2, \end{equation} which after BM splitting becomes \begin{equation}\label{eq:pen_nc} -\min_{U\in\mathbb{R}^{d\times r}} \langle C, UU^\top \rangle + \frac{\mu}{2} \| \mathcal{A}(UU^\top)-b\|^2, +\min_{U\in\mathbb{R}^{d\times r}} \langle C, UU^\top \rangle + \frac{\mu}{2} \|B(UU^\top)-b\|^2, \end{equation} for which they study the optimality of the second-order stationary points. These results are for establishing a connection between the stationary points of~\eqref{eq:pen_nc} and global optima of~\eqref{eq:pen_sdp}. -In contrast, we focus on the relation to the constrained problem~\eqref{eq:sdp}. -\textbf{AE: so i'm not sure why we are comparing with them. their work establish global optimality of local optima but we just find local optima. it seems to add to our work rather compare against it. you also had a paragraph for global optimality with close initialization earlier. could you put the two next to each other?} +In contrast, we focus on the relation of the stationary points of~\eqref{eq:minmax} to the constrained problem~\eqref{prob:01}. +%\textbf{AE: so i'm not sure why we are comparing with them. their work establish global optimality of local optima but we just find local optima. it seems to add to our work rather compare against it. you also had a paragraph for global optimality with close initialization earlier. could you put the two next to each other?} + + +Another popular method for solving SDPs are due to~\cite{boumal2014manopt, boumal2016global, boumal2016non}, focusing on the case where the constraints in~\eqref{prob:01} can be written as a Riemannian manifold after BM splitting. +In this case, the authors apply the Riemannian gradient descent and Riemannian trust region methods for obtaining first- and second-order stationary points, respectively. +They obtain~$\mathcal{O}(1/\epsilon^2)$ complexity for finding first-order stationary points and~$\mathcal{O}(1/\epsilon^3)$ complexity for finding second-order stationary points. +Even though these complexities are faster than what we offer in Section~\ref{sec:cvg rate}, the smooth manifold requirement in these works is indeed restrictive. In particular, this requirement holds for max-cut and generalized eigenvalue problems, but it is not satisfied for other important SDPs such as quadratic programming (QAP), optimal power flow and clustering. \textbf{AE: to be safe, plz give citation for these applications.} +In addition, as noted in~\cite{boumal2016global}, per iteration cost of their method for max-cut problem is an astronomical~$\mathcal{O}({d^6})$. % which is astronomical. %ly larger than our cost of $\mathcal{O}(n^2r)$ where $r \ll n$. + %\cite{birgin2016evaluation} can handle the same problem but their algorithm is much more complicated than ours. -Lastly, there also exists a line of work for solving SDPs in their original convex formulation, in a storage efficient way~\cite{nesterov2009primal, yurtsever2015universal, yurtsever2018conditional}. These works have global optimality guarantees by their virtue of directly solving the convex formulation. On the downside, these works require the use of eigenvalue routines and exhibit significantly slower convergence as compared to non-convex approaches. \textbf{AE: plz cite to support your claim.} +Lastly, there also exists a line of work for solving SDPs in their original convex formulation, in a storage efficient way~\cite{nesterov2009primal, yurtsever2015universal, yurtsever2018conditional}. These works have global optimality guarantees by their virtue of directly solving the convex formulation. On the downside, these works require the use of eigenvalue routines and exhibit significantly slower convergence as compared to nonconvex approaches~\cite{jaggi2013revisiting}.