diff --git a/NeurIPS 19/bibliography.bib b/NeurIPS 19/bibliography.bib index 72b3fc0..ea0913f 100644 --- a/NeurIPS 19/bibliography.bib +++ b/NeurIPS 19/bibliography.bib @@ -1,830 +1,864 @@ %% This BibTeX bibliography file was created using BibDesk. %% http://bibdesk.sourceforge.net/ %% Saved with string encoding Unicode (UTF-8) % Created by Mehmet Fatih Sahin for nonconvex inexact augmented lagrangian paper % December 14, Friday. 01:52 am +@book{birgin2014practical, + title={Practical augmented Lagrangian methods for constrained optimization}, + author={Birgin, Ernesto G and Mart{\'\i}nez, Jos{\'e} Mario}, + volume={10}, + year={2014}, + publisher={SIAM} +} + +@article{bertsekas1982constrained, + title={Constrained optimization and Lagrange Multiplier methods}, + author={Bertsekas, Dimitri P}, + journal={Computer Science and Applied Mathematics, Boston: Academic Press, 1982}, + year={1982} +} + +@inproceedings{karimi2016linear, + title={Linear convergence of gradient and proximal-gradient methods under the {P}olyak-{\L}ojasiewicz condition}, + author={Karimi, Hamed and Nutini, Julie and Schmidt, Mark}, + booktitle={Joint European Conference on Machine Learning and Knowledge Discovery in Databases}, + pages={795--811}, + year={2016}, + organization={Springer} +} + +@article{zhao1998semidefinite, + title={Semidefinite programming relaxations for the quadratic assignment problem}, + author={Zhao, Qing and Karisch, Stefan E and Rendl, Franz and Wolkowicz, Henry}, + journal={Journal of Combinatorial Optimization}, + volume={2}, + number={1}, + pages={71--109}, + year={1998}, + publisher={Springer} +} @article{Goodfellow2014, author = {{Goodfellow}, I.~J. and {Pouget-Abadie}, J. and {Mirza}, M. and {Xu}, B. and {Warde-Farley}, D. and {Ozair}, S. and {Courville}, A.
and {Bengio}, Y.}, url = {https://arxiv.org/abs/1406.2661}, title = "{Generative Adversarial Networks}", journal = {ArXiv e-prints}, archivePrefix = "arXiv", eprint = {1406.2661}, primaryClass = "stat.ML", keywords = {Statistics - Machine Learning, Computer Science - Machine Learning}, year = 2014, month = jun, adsurl = {http://adsabs.harvard.edu/abs/2014arXiv1406.2661G}, adsnote = {Provided by the SAO/NASA Astrophysics Data System} } @ARTICLE{Ilyas2017, author = {{Ilyas}, Andrew and {Jalal}, Ajil and {Asteri}, Eirini and {Daskalakis}, Constantinos and {Dimakis}, Alexandros G.}, title = "{The Robust Manifold Defense: Adversarial Training using Generative Models}", journal = {arXiv e-prints}, keywords = {Computer Science - Computer Vision and Pattern Recognition, Computer Science - Cryptography and Security, Computer Science - Machine Learning, Statistics - Machine Learning}, year = 2017, month = Dec, eid = {arXiv:1712.09196}, pages = {arXiv:1712.09196}, archivePrefix = {arXiv}, eprint = {1712.09196}, primaryClass = {cs.CV}, adsurl = {https://ui.adsabs.harvard.edu/\#abs/2017arXiv171209196I}, adsnote = {Provided by the SAO/NASA Astrophysics Data System} } @ARTICLE{Szegedy2013, author = {{Szegedy}, Christian and {Zaremba}, Wojciech and {Sutskever}, Ilya and {Bruna}, Joan and {Erhan}, Dumitru and {Goodfellow}, Ian and {Fergus}, Rob}, title = "{Intriguing properties of neural networks}", journal = {arXiv e-prints}, keywords = {Computer Science - Computer Vision and Pattern Recognition, Computer Science - Machine Learning, Computer Science - Neural and Evolutionary Computing}, year = 2013, month = Dec, eid = {arXiv:1312.6199}, pages = {arXiv:1312.6199}, archivePrefix = {arXiv}, eprint = {1312.6199}, primaryClass = {cs.CV}, adsurl = {https://ui.adsabs.harvard.edu/\#abs/2013arXiv1312.6199S}, adsnote = {Provided by the SAO/NASA Astrophysics Data System} } @article{tepper2018clustering, title={Clustering is semidefinitely not that hard: Nonnegative SDP for manifold disentangling}, author={Tepper, Mariano and Sengupta, Anirvan M and Chklovskii, Dmitri}, journal={Journal of Machine Learning Research}, volume={19}, number={82}, year={2018} } @article{beck2009fast, title={A fast iterative shrinkage-thresholding algorithm for linear inverse problems}, author={Beck, Amir and Teboulle, Marc}, journal={SIAM journal on imaging sciences}, volume={2}, number={1}, pages={183--202}, year={2009}, publisher={SIAM} } @article{tibshirani1996regression, title={Regression shrinkage and selection via the lasso}, author={Tibshirani, Robert}, journal={Journal of the Royal Statistical Society. Series B (Methodological)}, pages={267--288}, year={1996}, publisher={JSTOR} } @book{bauschke2011convex, title={Convex analysis and monotone operator theory in Hilbert spaces}, author={Bauschke, Heinz H and Combettes, Patrick L and others}, volume={408}, year={2011}, publisher={Springer} } @article{waldspurger2018rank, title={Rank optimality for the Burer-Monteiro factorization}, author={Waldspurger, Ir{\`e}ne and Waters, Alden}, journal={arXiv preprint arXiv:1812.03046}, year={2018} } @inproceedings{jaggi2013revisiting, title={Revisiting Frank-Wolfe: Projection-Free Sparse Convex Optimization.}, author={Jaggi, Martin}, booktitle={ICML (1)}, pages={427--435}, year={2013} } @inproceedings{nesterov1983method, title={A method for solving the convex programming problem with convergence rate O (1/k\^{} 2)}, author={Nesterov, Yurii E}, booktitle={Dokl. Akad. 
Nauk SSSR}, volume={269}, pages={543--547}, year={1983} } @inproceedings{raghavendra2008optimal, title={Optimal algorithms and inapproximability results for every CSP?}, author={Raghavendra, Prasad}, booktitle={Proceedings of the fortieth annual ACM symposium on Theory of computing}, pages={245--254}, year={2008}, organization={ACM} } @online{xiao2017/online, author = {Han Xiao and Kashif Rasul and Roland Vollgraf}, title = {Fashion-MNIST: a Novel Image Dataset for Benchmarking Machine Learning Algorithms}, date = {2017-08-28}, year = {2017}, eprintclass = {cs.LG}, eprinttype = {arXiv}, eprint = {cs.LG/1708.07747}, } @article{cartis2011evaluation, title={On the evaluation complexity of composite function minimization with applications to nonconvex nonlinear programming}, author={Cartis, Coralia and Gould, Nicholas IM and Toint, Philippe L}, journal={SIAM Journal on Optimization}, volume={21}, number={4}, pages={1721--1739}, year={2011}, publisher={SIAM} } @article{bolte2017error, title={From error bounds to the complexity of first-order descent methods for convex functions}, author={Bolte, J{\'e}r{\^o}me and Nguyen, Trong Phong and Peypouquet, Juan and Suter, Bruce W}, journal={Mathematical Programming}, volume={165}, number={2}, pages={471--507}, year={2017}, publisher={Springer} } @article{obozinski2011group, title={Group lasso with overlaps: the latent group lasso approach}, author={Obozinski, Guillaume and Jacob, Laurent and Vert, Jean-Philippe}, journal={arXiv preprint arXiv:1110.0413}, year={2011} } @article{tran2018smooth, title={A smooth primal-dual optimization framework for nonsmooth composite convex minimization}, author={Tran-Dinh, Quoc and Fercoq, Olivier and Cevher, Volkan}, journal={SIAM Journal on Optimization}, volume={28}, number={1}, pages={96--134}, year={2018}, publisher={SIAM} } @article{tran2018adaptive, title={An Adaptive Primal-Dual Framework for Nonsmooth Convex Minimization}, author={Tran-Dinh, Quoc and Alacaoglu, Ahmet and Fercoq, Olivier and Cevher, Volkan}, journal={arXiv preprint arXiv:1808.04648}, year={2018} } @article{arora2018compressed, title={A compressed sensing view of unsupervised text embeddings, bag-of-n-grams, and LSTMs}, author={Arora, Sanjeev and Khodak, Mikhail and Saunshi, Nikunj and Vodrahalli, Kiran}, year={2018} } @article{candes2008introduction, title={An introduction to compressive sampling}, author={Cand{\`e}s, Emmanuel J and Wakin, Michael B}, journal={IEEE signal processing magazine}, volume={25}, number={2}, pages={21--30}, year={2008}, publisher={IEEE} } @article{chen2001atomic, title={Atomic decomposition by basis pursuit}, author={Chen, Scott Shaobing and Donoho, David L and Saunders, Michael A}, journal={SIAM review}, volume={43}, number={1}, pages={129--159}, year={2001}, publisher={SIAM} } @article{flores2012complete, title={A complete characterization of strong duality in nonconvex optimization with a single constraint}, author={Flores-Baz{\'a}n, Fabi{\'a}n and Flores-Baz{\'a}n, Fernando and Vera, Cristi{\'a}n}, journal={Journal of Global Optimization}, volume={53}, number={2}, pages={185--201}, year={2012}, publisher={Springer} } @article{dai2002convergence, title={Convergence properties of the BFGS algoritm}, author={Dai, Yu-Hong}, journal={SIAM Journal on Optimization}, volume={13}, number={3}, pages={693--701}, year={2002}, publisher={SIAM} } @article{yang2015sdpnal, title={SDPNAL+: a majorized semismooth Newton-CG augmented Lagrangian method for semidefinite programming with nonnegative constraints}, author={Yang, Liuqin 
and Sun, Defeng and Toh, Kim-Chuan}, journal={Mathematical Programming Computation}, volume={7}, number={3}, pages={331--366}, year={2015}, publisher={Springer} } @article{mascarenhas2004bfgs, title={The BFGS method with exact line searches fails for non-convex objective functions}, author={Mascarenhas, Walter F}, journal={Mathematical Programming}, volume={99}, number={1}, pages={49--61}, year={2004}, publisher={Springer} } @inproceedings{boumal2016non, title={The non-convex Burer-Monteiro approach works on smooth semidefinite programs}, author={Boumal, Nicolas and Voroninski, Vlad and Bandeira, Afonso}, booktitle={Advances in Neural Information Processing Systems}, pages={2757--2765}, year={2016} } @inproceedings{bhojanapalli2016dropping, title={Dropping convexity for faster semi-definite optimization}, author={Bhojanapalli, Srinadh and Kyrillidis, Anastasios and Sanghavi, Sujay}, booktitle={Conference on Learning Theory}, pages={530--582}, year={2016} } @inproceedings{nesterov1983method, title={A method for solving the convex programming problem with convergence rate O (1/k\^{} 2)}, author={Nesterov, Yurii E}, booktitle={Dokl. Akad. Nauk SSSR}, volume={269}, pages={543--547}, year={1983} } @article{shi2017penalty, title={Penalty dual decomposition method for nonsmooth nonconvex optimization}, author={Shi, Qingjiang and Hong, Mingyi and Fu, Xiao and Chang, Tsung-Hui}, journal={arXiv preprint arXiv:1712.04767}, year={2017} } @article{fernandez2012local, title={Local convergence of exact and inexact augmented Lagrangian methods under the second-order sufficient optimality condition}, author={Fern{\'a}ndez, Dami{\'a}n and Solodov, Mikhail V}, journal={SIAM Journal on Optimization}, volume={22}, number={2}, pages={384--407}, year={2012}, publisher={SIAM} } @article{bolte2018nonconvex, title={Nonconvex Lagrangian-based optimization: monitoring schemes and global convergence}, author={Bolte, J{\'e}r{\^o}me and Sabach, Shoham and Teboulle, Marc}, journal={Mathematics of Operations Research}, year={2018}, publisher={INFORMS} } @article{nouiehed2018convergence, title={Convergence to Second-Order Stationarity for Constrained Non-Convex Optimization}, author={Nouiehed, Maher and Lee, Jason D and Razaviyayn, Meisam}, journal={arXiv preprint arXiv:1810.02024}, year={2018} } @article{cartis2018optimality, title={Optimality of orders one to three and beyond: characterization and evaluation complexity in constrained nonconvex optimization}, author={Cartis, Coralia and Gould, Nicholas IM and Toint, Ph L}, journal={Journal of Complexity}, year={2018}, publisher={Elsevier} } @article{nesterov2009primal, title={Primal-dual subgradient methods for convex problems}, author={Nesterov, Yurii}, journal={Mathematical programming}, volume={120}, number={1}, pages={221--259}, year={2009}, publisher={Springer} } @inproceedings{yurtsever2015universal, title={A universal primal-dual convex optimization framework}, author={Yurtsever, Alp and Dinh, Quoc Tran and Cevher, Volkan}, booktitle={Advances in Neural Information Processing Systems}, pages={3150--3158}, year={2015} } @article{yurtsever2018conditional, title={A Conditional Gradient Framework for Composite Convex Minimization with Applications to Semidefinite Programming}, author={Yurtsever, Alp and Fercoq, Olivier and Locatello, Francesco and Cevher, Volkan}, journal={arXiv preprint arXiv:1804.08544}, year={2018} } @article{bertsekas1976penalty, title={On penalty and multiplier methods for constrained minimization}, author={Bertsekas, Dimitri P}, journal={SIAM 
Journal on Control and Optimization}, volume={14}, number={2}, pages={216--235}, year={1976}, publisher={SIAM} } @article{tran2018adaptive, title={An Adaptive Primal-Dual Framework for Nonsmooth Convex Minimization}, author={Tran-Dinh, Quoc and Alacaoglu, Ahmet and Fercoq, Olivier and Cevher, Volkan}, journal={arXiv preprint arXiv:1808.04648}, year={2018} } @article{hestenes1969multiplier, title={Multiplier and gradient methods}, author={Hestenes, Magnus R}, journal={Journal of optimization theory and applications}, volume={4}, number={5}, pages={303--320}, year={1969}, publisher={Springer} } @article{powell1969method, title={A method for nonlinear constraints in minimization problems}, author={Powell, Michael JD}, journal={Optimization}, pages={283--298}, year={1969}, publisher={Academic Press} } @book{bertsekas2014constrained, title={Constrained optimization and Lagrange multiplier methods}, author={Bertsekas, Dimitri P}, year={2014}, publisher={Academic press} } @inproceedings{kulis2007fast, title={Fast low-rank semidefinite programming for embedding and clustering}, author={Kulis, Brian and Surendran, Arun C and Platt, John C}, booktitle={Artificial Intelligence and Statistics}, pages={235--242}, year={2007} } @article{xu2017inexact, title={Iteration complexity of inexact augmented Lagrangian methods for constrained convex programming}, author={Xu, Yangyang}, journal={arXiv preprint arXiv:1711.05812v2}, year={2017} } @article{nedelcu2014computational, title={Computational complexity of inexact gradient augmented Lagrangian methods: application to constrained MPC}, author={Nedelcu, Valentin and Necoara, Ion and Tran-Dinh, Quoc}, journal={SIAM Journal on Control and Optimization}, volume={52}, number={5}, pages={3109--3134}, year={2014}, publisher={SIAM} } @article{pataki1998rank, title={On the rank of extreme matrices in semidefinite programs and the multiplicity of optimal eigenvalues}, author={Pataki, G{\'a}bor}, journal={Mathematics of operations research}, volume={23}, number={2}, pages={339--358}, year={1998}, publisher={INFORMS} } @article{Barvinok1995problems, title={Problems of distance geometry and convex properties of quadratic maps}, author={Barvinok, Alexander I.}, journal={Discrete \& Computational Geometry}, volume={13}, number={2}, pages={189--202}, year={1995}, publisher={Springer} } @article{cartis2012complexity, title={Complexity bounds for second-order optimality in unconstrained optimization}, author={Cartis, Coralia and Gould, Nicholas IM and Toint, Ph L}, journal={Journal of Complexity}, volume={28}, number={1}, pages={93--108}, year={2012}, publisher={Elsevier} } @article{ghadimi2016accelerated, title={Accelerated gradient methods for nonconvex nonlinear and stochastic programming}, author={Ghadimi, Saeed and Lan, Guanghui}, journal={Mathematical Programming}, volume={156}, number={1-2}, pages={59--99}, year={2016}, publisher={Springer} } @article{boumal2014manopt, Author = {Boumal, Nicolas and Mishra, Bamdev and Absil, P-A and Sepulchre, Rodolphe}, Journal = {The Journal of Machine Learning Research}, Number = {1}, Pages = {1455--1459}, Publisher = {JMLR. 
org}, Title = {Manopt, a Matlab toolbox for optimization on manifolds}, Volume = {15}, Year = {2014}} @article{boumal2016global, Author = {Boumal, Nicolas and Absil, P-A and Cartis, Coralia}, Journal = {arXiv preprint arXiv:1605.08101}, Title = {Global rates of convergence for nonconvex optimization on manifolds}, Year = {2016}} @inproceedings{ge2016efficient, Author = {Ge, Rong and Jin, Chi and Netrapalli, Praneeth and Sidford, Aaron and others}, Booktitle = {International Conference on Machine Learning}, Pages = {2741--2750}, Title = {Efficient algorithms for large-scale generalized eigenvector computation and canonical correlation analysis}, Year = {2016}} @inproceedings{boumal2016non, Author = {Boumal, Nicolas and Voroninski, Vlad and Bandeira, Afonso}, Booktitle = {Advances in Neural Information Processing Systems}, Pages = {2757--2765}, Title = {The non-convex Burer-Monteiro approach works on smooth semidefinite programs}, Year = {2016}} @article{davis2011university, Author = {Davis, Timothy A and Hu, Yifan}, Journal = {ACM Transactions on Mathematical Software (TOMS)}, Number = {1}, Pages = {1}, Publisher = {ACM}, Title = {The University of Florida sparse matrix collection}, Volume = {38}, Year = {2011}} @article{clason2018acceleration, Author = {Clason, Christian and Mazurenko, Stanislav and Valkonen, Tuomo}, Journal = {arXiv preprint arXiv:1802.03347}, Title = {Acceleration and global convergence of a first-order primal--dual method for nonconvex problems}, Year = {2018}} @article{chambolle2011first, Author = {Chambolle, Antonin and Pock, Thomas}, Journal = {Journal of mathematical imaging and vision}, Number = {1}, Pages = {120--145}, Publisher = {Springer}, Title = {A first-order primal-dual algorithm for convex problems with applications to imaging}, Volume = {40}, Year = {2011}} @article{tran2018smooth, Author = {Tran-Dinh, Quoc and Fercoq, Olivier and Cevher, Volkan}, Journal = {SIAM Journal on Optimization}, Number = {1}, Pages = {96--134}, Publisher = {SIAM}, Title = {A smooth primal-dual optimization framework for nonsmooth composite convex minimization}, Volume = {28}, Year = {2018}} @article{bhojanapalli2018smoothed, Author = {Bhojanapalli, Srinadh and Boumal, Nicolas and Jain, Prateek and Netrapalli, Praneeth}, Journal = {arXiv preprint arXiv:1803.00186}, Title = {Smoothed analysis for low-rank solutions to semidefinite programs in quadratic penalty form}, Year = {2018}} @article{park2016provable, Author = {Park, Dohyung and Kyrillidis, Anastasios and Bhojanapalli, Srinadh and Caramanis, Constantine and Sanghavi, Sujay}, Journal = {arXiv preprint arXiv:1606.01316}, Title = {Provable Burer-Monteiro factorization for a class of norm-constrained matrix problems}, Year = {2016}} @article{mixon2016clustering, Author = {Mixon, Dustin G and Villar, Soledad and Ward, Rachel}, Journal = {arXiv preprint arXiv:1602.06612}, Title = {Clustering subgaussian mixtures by semidefinite programming}, Year = {2016}} @article{yurtsever2018conditional, Author = {Yurtsever, Alp and Fercoq, Olivier and Locatello, Francesco and Cevher, Volkan}, Journal = {arXiv preprint arXiv:1804.08544}, Title = {A Conditional Gradient Framework for Composite Convex Minimization with Applications to Semidefinite Programming}, Year = {2018}} @inproceedings{jin2017escape, Author = {Jin, Chi and Ge, Rong and Netrapalli, Praneeth and Kakade, Sham M and Jordan, Michael I}, Booktitle = {International Conference on Machine Learning}, Pages = {1724--1732}, Title = {How to Escape Saddle Points Efficiently}, Year = 
{2017}} @article{birgin2016evaluation, title={Evaluation complexity for nonlinear constrained optimization using unscaled KKT conditions and high-order models}, - author={Birgin, Ernesto G and Gardenghi, JL and Martínez, Jos{\'e} Mario and Santos, SA and Toint, Ph L}, + author={Birgin, Ernesto G and Gardenghi, JL and Martinez, Jos{\'e} Mario and Santos, SA and Toint, Ph L}, journal={SIAM Journal on Optimization}, volume={26}, number={2}, pages={951--967}, year={2016}, publisher={SIAM} } @article{liu2017linearized, Author = {Liu, Qinghua and Shen, Xinyue and Gu, Yuantao}, Journal = {arXiv preprint arXiv:1705.02502}, Title = {Linearized admm for non-convex non-smooth optimization with convergence analysis}, Year = {2017}} @article{wang2015global, Author = {Wang, Yu and Yin, Wotao and Zeng, Jinshan}, Journal = {arXiv preprint arXiv:1511.06324}, Title = {Global convergence of ADMM in nonconvex nonsmooth optimization}, Year = {2015}} @book{bertsekas2014constrained, title={Constrained optimization and Lagrange multiplier methods}, author={Bertsekas, Dimitri P}, year={2014}, publisher={Academic press} } @article{lan2016iteration, Author = {Lan, Guanghui and Monteiro, Renato DC}, Journal = {Mathematical Programming}, Number = {1-2}, Pages = {511--547}, Publisher = {Springer}, Title = {Iteration-complexity of first-order augmented Lagrangian methods for convex programming}, Volume = {155}, Year = {2016}} @article{nedelcu2014computational, title={Computational complexity of inexact gradient augmented Lagrangian methods: application to constrained MPC}, author={Nedelcu, Valentin and Necoara, Ion and Tran-Dinh, Quoc}, journal={SIAM Journal on Control and Optimization}, volume={52}, number={5}, pages={3109--3134}, year={2014}, publisher={SIAM} } @book{bertsekas1999nonlinear, title={Nonlinear programming}, author={Bertsekas, Dimitri P} } @article{hestenes1969multiplier, title={Multiplier and gradient methods}, author={Hestenes, Magnus R}, journal={Journal of optimization theory and applications}, volume={4}, number={5}, pages={303--320}, year={1969}, publisher={Springer} } @incollection{powell1978fast, title={A fast algorithm for nonlinearly constrained optimization calculations}, author={Powell, Michael JD}, booktitle={Numerical analysis}, pages={144--157}, year={1978}, publisher={Springer} } @article{burer2005local, title={Local minima and convergence in low-rank semidefinite programming}, author={Burer, Samuel and Monteiro, Renato DC}, journal={Mathematical Programming}, volume={103}, number={3}, pages={427--444}, year={2005}, publisher={Springer} } @article{burer2003nonlinear, title={A nonlinear programming algorithm for solving semidefinite programs via low-rank factorization}, author={Burer, Samuel and Monteiro, Renato DC}, journal={Mathematical Programming}, volume={95}, number={2}, pages={329--357}, year={2003}, publisher={Springer} } @article{singer2011angular, title={Angular synchronization by eigenvectors and semidefinite programming}, author={Singer, Amit}, journal={Applied and computational harmonic analysis}, volume={30}, number={1}, pages={20}, year={2011}, publisher={NIH Public Access} } @article{singer2011three, title={Three-dimensional structure determination from common lines in cryo-EM by eigenvectors and semidefinite programming}, author={Singer, Amit and Shkolnisky, Yoel}, journal={SIAM journal on imaging sciences}, volume={4}, number={2}, pages={543--572}, year={2011}, publisher={SIAM} } @inproceedings{song2007dependence, title={A dependence maximization view of clustering}, 
author={Song, Le and Smola, Alex and Gretton, Arthur and Borgwardt, Karsten M}, booktitle={Proceedings of the 24th international conference on Machine learning}, pages={815--822}, year={2007}, organization={ACM} } @inproceedings{mossel2015consistency, title={Consistency thresholds for the planted bisection model}, author={Mossel, Elchanan and Neeman, Joe and Sly, Allan}, booktitle={Proceedings of the forty-seventh annual ACM symposium on Theory of computing}, pages={69--75}, year={2015}, organization={ACM} } @incollection{lovasz2003semidefinite, title={Semidefinite programs and combinatorial optimization}, author={Lov{\'a}sz, L{\'a}szl{\'o}}, booktitle={Recent advances in algorithms and combinatorics}, pages={137--194}, year={2003}, publisher={Springer} } @article{khot2011grothendieck, title={Grothendieck-type inequalities in combinatorial optimization}, author={Khot, Subhash and Naor, Assaf}, journal={arXiv preprint arXiv:1108.2464}, year={2011} } @article{park2016provable, title={Provable Burer-Monteiro factorization for a class of norm-constrained matrix problems}, author={Park, Dohyung and Kyrillidis, Anastasios and Bhojanapalli, Srinadh and Caramanis, Constantine and Sanghavi, Sujay}, journal={arXiv preprint arXiv:1606.01316}, year={2016} } @article{mixon2016clustering, title={Clustering subgaussian mixtures by semidefinite programming}, author={Mixon, Dustin G and Villar, Soledad and Ward, Rachel}, journal={arXiv preprint arXiv:1602.06612}, year={2016} } @inproceedings{kulis2007fast, title={Fast low-rank semidefinite programming for embedding and clustering}, author={Kulis, Brian and Surendran, Arun C and Platt, John C}, booktitle={Artificial Intelligence and Statistics}, pages={235--242}, year={2007} } @article{Peng2007, Author = {Peng, J. and Wei, Y.}, Date-Added = {2017-10-26 15:22:37 +0000}, Date-Modified = {2017-10-26 15:30:53 +0000}, Journal = {SIAM J. Optim.}, Number = {1}, Pages = {186--205}, Title = {Approximating {K}--means--type clustering via semidefinite programming}, Volume = {18}, Year = {2007}} @book{fletcher2013practical, title={Practical methods of optimization}, author={Fletcher, Roger}, year={2013}, publisher={John Wiley \& Sons} } @article{Bora2017, title = {Compressed {Sensing} using {Generative} {Models}}, url = {http://arxiv.org/abs/1703.03208}, urldate = {2018-10-25}, journal = {arXiv:1703.03208 [cs, math, stat]}, author = {Bora, Ashish and Jalal, Ajil and Price, Eric and Dimakis, Alexandros G.}, month = mar, year = {2017}, note = {arXiv: 1703.03208}, keywords = {Statistics - Machine Learning, Computer Science - Machine Learning, Computer Science - Information Theory}, } @book{nocedal2006numerical, title={Numerical Optimization}, author={Nocedal, J. 
and Wright, S.}, isbn={9780387400655}, lccn={2006923897}, series={Springer Series in Operations Research and Financial Engineering}, url={https://books.google.ch/books?id=VbHYoSyelFcC}, year={2006}, publisher={Springer New York} } @article{parikh2014proximal, title={Proximal algorithms}, author={Parikh, Neal and Boyd, Stephen and others}, journal={Foundations and Trends{\textregistered} in Optimization}, volume={1}, number={3}, pages={127--239}, year={2014}, publisher={Now Publishers, Inc.} } \ No newline at end of file diff --git a/NeurIPS 19/sections/guarantees.tex b/NeurIPS 19/sections/guarantees.tex index 6028474..e9d3a65 100644 --- a/NeurIPS 19/sections/guarantees.tex +++ b/NeurIPS 19/sections/guarantees.tex @@ -1,194 +1,206 @@ %!TEX root = ../main.tex \section{Convergence Rate \label{sec:cvg rate}} In this section, we detail the convergence rate of Algorithm~\ref{Algo:2} for finding first-order and second-order stationary points, along with the iteration complexity results. All the proofs are deferred to Appendix~\ref{sec:theory} for clarity. Theorem~\ref{thm:main} below characterizes the convergence rate of Algorithm~\ref{Algo:2} for finding stationary points in terms of the number of outer iterations.\\ %<<<<<<< HEAD %{\color{red}{Ahmet: Maybe instead of sufficiently large k0 we can say k0 such that beta is bigger than 1, it makes the proof go thru, it would be a bit more explanatory}} %======= %>>>>>>> b39eea61625954bc9d3858590b7b37a182a6af4f \begin{theorem}\textbf{\emph{(convergence rate)}} \label{thm:main} Suppose that $f$ and $A$ are smooth in the sense specified in (\ref{eq:smoothness basic}). For $\rho'>0$, let \begin{align} \lambda'_f = \max_{\|x\|\le \rho'} \|\nabla f(x)\|, ~~ \lambda'_A = \max_{\|x\| \le \rho'} \|DA(x)\|, \label{eq:defn_restricted_lipsichtz} \end{align} be the (restricted) Lipschitz constants of $f$ and $A$, respectively. For sufficiently large integers $k_0\le k_1$, consider the interval $K=[k_0:k_1]$, and let $\{x_k\}_{k\in K}$ be the output sequence of Algorithm~\ref{Algo:2} on the interval $K$. For $\nu>0$, assume that \begin{align} \nu \|A(x_k)\| & \le \dist\left( -DA(x_k)^\top A(x_k) , \frac{\partial g(x_k)}{ \b_{k-1}} \right), \label{eq:regularity} \end{align} for every $k\in K$. We consider two cases: %{\color{red} Ahmet: I removed the squares and showed the rate on dist + feasibility, since it is the measure for the stationarity, using squares confuses complexity analysis...} \begin{itemize}%[leftmargin=*] \item If a first-order solver is used in Step~2, then $x_k$ is an $(\epsilon_{k,f},\b_k)$ first-order stationary point of (\ref{prob:01}) with \begin{align} \epsilon_{k,f} & = \frac{1}{\beta_{k-1}} \left(\frac{2(\lambda'_f+\lambda'_A y_{\max}) (1+\lambda_A' \sigma_k)}{\nu}+1\right) \nonumber \\ &=: \frac{Q(f,g,A,\s_1)}{\beta_{k-1}}, \label{eq:stat_prec_first} \end{align} for every $k\in K$, where the expression for $y_{\max}=y_{\max}(x_1,y_0,\s_1)$ is given in (\ref{eq:dual growth}). \item If a second-order solver is used in Step~2, then $x_k$ is an $(\epsilon_{k,f}, \epsilon_{k,s},\b_k)$ second-order stationary point of~(\ref{prob:01}) with $\epsilon_{k,f}$ specified above and with \begin{align} \epsilon_{k,s} &= \epsilon_{k-1} + \sigma_k \sqrt{m} \lambda_A \frac{ 2\lambda'_f +2 \lambda'_A y_{\max} }{\nu \b_{k-1}} \nonumber\\ &= \frac{\nu + \sigma_k \sqrt{m} \lambda_A \left( 2\lambda'_f +2 \lambda'_A y_{\max} \right) }{\nu \b_{k-1}} =: \frac{Q'}{\beta_{k-1}}. \end{align} \end{itemize} \end{theorem} %A few remarks are in order about Theorem~\ref{thm:main}.
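As a quick, informal consequence of \eqref{eq:stat_prec_first} (assuming, as in Corollary~\ref{cor:first} below, the geometric penalty schedule $\b_k = b^k$ with $b>1$), the number of outer iterations needed to reach a target accuracy $\epsilon_f$ is only logarithmic in $1/\epsilon_f$:
\begin{align*}
\epsilon_{k,f} = \frac{Q}{\b_{k-1}} = \frac{Q}{b^{k-1}} \le \epsilon_f
\quad \Longleftrightarrow \quad
k \ge 1 + \log_b \left( \frac{Q}{\epsilon_f} \right),
\end{align*}
so roughly $\log_b(Q/\epsilon_f)$ calls to the solver in Step~2 suffice for first-order stationarity; this is the logarithmic factor that appears in the total complexity bounds below.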
%\textbf{Tradeoff.} %Roughly speaking, Theorem~\ref{thm:main} states that the iterates of Algorithm~\ref{Algo:2} approach first-order stationarity for~\eqref{prob:01} at the rate of $1/{\b_k}$, where $\b_k$ is the penalty weight in AL in iteration $k$ as in~\eqref{eq:Lagrangian}. Note the inherent tradeoff here: For a faster sequence $\{\b_k\}_k$, the iterates approach stationarity faster but the computational cost of the inexact primal subroutine (Step 2) increases since a higher accuracy is required and Lipschitz constant of the unconstrained problem is also affected by $\beta_k$. In words, there is a tradeoff between the number of outer and inner loop iterations of Algorithm~\ref{Algo:2}. We highlight this tradeoff for a number of subroutines below. Loosely speaking, Theorem~\ref{thm:main} states that Algorithm~\ref{Algo:2} converges to a (first- or second-) order stationary point of \eqref{prob:01} at the rate of $1/\b_k$. A few remarks are in order. \textbf{Regularity.} The key condition in Theorem~\ref{thm:main} is \eqref{eq:regularity} which, broadly speaking, controls the problem geometry. As the penalty weight $\b_k$ grows, the primal solver in Step~2 places an increasing emphasis on reducing the feasibility gap and \eqref{eq:regularity} formalizes this intuition. In contrast to most conditions in the nonconvex optimization literature, such as~\cite{flores2012complete}, our condition in~\eqref{eq:regularity} appears to be easier to verify, as we see in Section~\ref{sec:experiments}. We now argue that such a condition is necessary for controlling the feasibility gap of~\eqref{prob:01} and ensuring the success of Algorithm~\ref{Algo:2}. Consider the case where $f=0$ and $g=\delta_\mathcal{X}$, where $\mathcal{X}$ is a convex set, $A$ is a linear operator. In this case, solving \eqref{prob:01} finds a point in $\mathcal{X}\cap \text{null}(A)$, where the subspace $\text{null}(A)=\{ x\in\mathbb{R}^d: A(x) = 0 \} \subset \RR^d$ is the null space of $A$. Here, the Slater's condition requires that \begin{equation} \text{relint} (\mathcal{X}) \cap \text{null}(A)\neq \emptyset. \end{equation} %<<<<<<< HEAD In general, the Slater's condition plays a key role in convex optimization as a sufficient condition for strong duality and, as a result, guarantees the success of a variety of primal-dual algorithms for linearly-constrained convex programs~\cite{bauschke2011convex}. Intuitively, the Slater's condition here removes any pathological cases by ensuring that the subspace $\text{null}(A)$ is not tangent to $\mathcal{X}$, see Figure~\ref{fig:convex_slater}. In such pathological cases, solving~\eqref{prob:01}, namely, finding a point in $\mathcal{X}\cap \text{null}(A)$, can be particularly difficult. For instance, the alternating projection algorithm (which iteratively projects onto $\mathcal{X}$ and $\text{null}(A)$) has arbitrarily slow convergence, see Figure~\ref{fig:convex_slater}. \editf{ -We now show that our geometric condition boils down to Polyak-Lojasiewicz (PL) condition when $g=0$. PL inequality is a sufficient condition to show linear convergence for gradient descent. It is weaker than the main conditions which are used to show linear convergence without strong convexity assumption. For a detailed discussion of this condition, see~\cite{karimi2016linear} and references therein. Let $h(x):= \|A(x)\|^2$. 
When $g=0$, \ref{eq:regularity} implies for $\nu > 0$, -\begin{align*} -\nu \|A(x_k)\| -& \le \left\| -DA(x_k)^\top A(x_k) \right \|^2 \\ -\nu h(x) -& \le \| \nabla h(x) \|^2 -\end{align*} -(mfs:)So what? More explanation is needed here. +We now show that our geometric condition boils down to the \textbf{Polyak-Lojasiewicz} (PL) condition when $g=0$. +The PL inequality is a sufficient condition for linear convergence of gradient descent, and it is weaker than the standard assumptions used to establish linear convergence in the absence of strong convexity. +For a detailed discussion of this condition, see~\cite{karimi2016linear} and references therein. Let $h(x):= \|A(x)\|^2$. When $g=0$, \eqref{eq:regularity} implies, for $\nu > 0$, +\begin{align} +\nu \|A(x)\| +& \le \left\| -DA(x)^\top A(x) \right \|, \label{eq:pl} \\ +\nu^2 h(x) +& \le \| \nabla h(x) \|^2, \nonumber +\end{align} +where the second inequality uses $\nabla h(x) = 2 DA(x)^\top A(x)$ and follows by squaring both sides of \eqref{eq:pl}. +The second inequality is precisely the PL inequality for the feasibility measure $h$: loosely speaking, it guarantees that first-order descent on $h$ drives the feasibility gap $\|A(x)\|$ to zero at a linear rate~\cite{karimi2016linear}, which is how \eqref{eq:regularity} allows Algorithm~\ref{Algo:2} to control the feasibility gap as $\b_k$ grows. + +A stronger, uniform version of \eqref{eq:pl} requires +\begin{align} +%\label{eq:pl} +\nu \|y\| & \le \left\| -DA(x)^\top y \right \| , \,\,\,\,\, \forall x \in \mathcal{K}, \,\, y \in \RR^m, +\end{align} +in which case $A$ is said to be uniformly regular on the set $\mathcal{K}\subseteq\RR^d$ with constant $\nu > 0$, as also discussed in \cite{bolte2018nonconvex}. If we further take $\nu := \min\{\|DA(x)^\top y\| : \|y\| = 1\} > 0$, then this is equivalent to saying that the Jacobian of $A$ is surjective at $x$, which is known as the Mangasarian-Fromovitz constraint qualification in nonlinear optimization. } %{\color{red} Ahmet: a reference would be cool here} + \begin{figure} \begin{center} \includegraphics[scale=.5]{Slater.pdf} \end{center} \caption{Solving \eqref{prob:01} can be particularly difficult, even when \eqref{prob:01} is a convex program. We present a pathological geometry where the Slater's condition does not apply. See the first remark after Theorem~\ref{thm:main} for more details. \label{fig:convex_slater}} \end{figure} \textbf{Computational complexity.} Theorem~\ref{thm:main} allows us to specify the number of iterations that Algorithm~\ref{Algo:2} requires to reach a near-stationary point of Program~\eqref{prob:01} with a prescribed precision and, in particular, specifies the number of calls made to the solver in Step~2. In this sense, Theorem~\ref{thm:main} does not fully capture the computational complexity of Algorithm~\ref{Algo:2}, as it does not take into account the computational cost of the solver in Step~2. To better understand the total complexity of Algorithm~\ref{Algo:2}, we consider two scenarios in the following.
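To make the role of the solver in Step~2 concrete before specializing it, the following minimal Python sketch shows the outer-loop structure that the accounting below refers to: an approximate primal solve of $\mathcal{L}_{\b_k}(\cdot,y_k)+g$, a dual update, and a growing penalty weight. The routine names (\texttt{approx\_solver}, \texttt{prox\_g}) and the accuracy and penalty schedules are placeholders rather than the exact choices of Algorithm~\ref{Algo:2}.
\begin{verbatim}
import numpy as np

def inexact_alm(f_grad, A, A_jac, prox_g, approx_solver,
                x0, y0, beta0=1.0, b=2.0, sigma=1.0, n_outer=20):
    """Sketch of an inexact augmented Lagrangian loop (cf. Algorithm 2).
    approx_solver(grad_L, prox_g, x, tol) may be any first- or
    second-order routine returning a near-stationary point of
    L_beta(., y) + g."""
    x, y, beta = x0.copy(), y0.copy(), beta0
    for k in range(1, n_outer + 1):
        def grad_L(z, y=y, beta=beta):
            # gradient of the augmented Lagrangian at fixed (y, beta):
            # grad f(z) + DA(z)^T (y + beta * A(z))
            return f_grad(z) + A_jac(z).T @ (y + beta * A(z))
        tol = 1.0 / beta                  # placeholder accuracy schedule
        x = approx_solver(grad_L, prox_g, x, tol)   # Step 2 (inexact solve)
        y = y + sigma * A(x)              # dual ascent step
        beta = b * beta                   # geometric penalty growth
    return x, y
\end{verbatim}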
In the first scenario, we take the solver in Step~2 to be the Accelerated Proximal Gradient Method (APGM), a well-known first-order algorithm~\cite{ghadimi2016accelerated}. In the second scenario, we will use the second-order trust region method developed in~\cite{cartis2012complexity}. \subsection{First-Order Optimality} %\textbf{AE: we go back and forth between "subroutine" and "solver". for consistency, I'm just using "solver" everywhere.} Let us first consider the first-order optimality case where the solver in Step~2 is APGM~\cite{ghadimi2016accelerated}. APGM makes use of $\nabla_x \mathcal{L}_{\beta}(x,y)$, $\text{prox}_g$ and classical Nesterov acceleration step for the iterates~\cite{nesterov1983method} to obtain first order stationarity guarantees for solving~\eqref{e:exac}. % \textbf{AE: Ahmet to give a brief description of APGM here. } %First, we characterize the iteration complexity of Algorithm~\ref{Algo:2} for finding a first-order stationary point of~\eqref{prob:01}. %We propose to use the standard accelerated proximal method (APGM), guarantees of which are analyzed in~\cite{ghadimi2016accelerated} for nonconvex problems of the form~\eqref{e:exac}. Suppose that $g=\delta_\mathcal{X}$ is the indicator function on a bounded convex set $\mathcal{X}\subset \RR^d$ and let \begin{align} x_{\max}= \max_{x\in \mathcal{X}}\|x\|, \end{align} be the radius of a ball centered at the origin that includes $\mathcal{X}$. Then, adapting the results in~\cite{ghadimi2016accelerated} to our setup, APGM reaches $x_{k}$ in Step 2 of Algorithm~\ref{Algo:2} after \begin{equation} \mathcal{O}\left ( \frac{\lambda_{\beta_{k}}^2 x_{\max}^2 }{\epsilon_{k+1}} \right) \label{eq:iter_1storder} \end{equation} (inner) iterations, where $\lambda_{\beta_{k}}$ denotes the Lipschitz constant of $\nabla_x{\mathcal{L}_{\beta_{k}}(x, y)}$, bounded in~\eqref{eq:smoothness of Lagrangian}. We note that for simplicity, we use a looser bound in \eqref{eq:iter_1storder} than~\cite{ghadimi2016accelerated}. Using \eqref{eq:iter_1storder}, we derive the following corollary, describing the total computational complexity of Algorithm~\ref{Algo:2} in terms of the calls to the first-order oracle in APGM. %\textbf{AE: we haven't talked about oracle before.} \begin{corollary}\label{cor:first} For $b>1$, let $\beta_k =b^k $ for every $k$. If we use APGM from~\cite{ghadimi2016accelerated} for Step~2 of Algorithm~\ref{Algo:2}, the algorithm finds an $(\epsilon_f,\b_k)$ first-order stationary point, see (\ref{eq:inclu3}), after $T$ calls to the first-order oracle, where % \begin{equation} T = \mathcal{O}\left( \frac{Q^3 x_{\max}^2}{\epsilon^{3}}\log_b{\left( \frac{Q}{\epsilon} \right)} \right) = \tilde{\mathcal{O}}\left( \frac{Q^{3} x_{\max}^2}{\epsilon^{3}} \right). \end{equation} \end{corollary} For Algorithm~\ref{Algo:2} to reach a near-stationary point with an accuracy of $\epsilon_f$ in the sense of \eqref{eq:inclu3} and with the lowest computational cost, we therefore need to perform only one iteration of Algorithm~\ref{Algo:2}, with $\b_1$ specified as a function of $\epsilon_f$ by \eqref{eq:stat_prec_first} in Theorem~\ref{thm:main}. In general, however, the constants in \eqref{eq:stat_prec_first} are unknown and this approach is intractable. Instead, the homotopy approach taken by Algorithm~\ref{Algo:2} ensures achieving the desired accuracy by gradually increasing the penalty weight. 
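For intuition only, here is a minimal FISTA-style sketch of such an accelerated proximal step applied to $\mathcal{L}_{\b}(\cdot,y)+g$; the fixed step size, the crude stopping proxy, and the absence of the nonconvex safeguards analyzed in~\cite{ghadimi2016accelerated} make this an illustration rather than the exact APGM variant used in the analysis.
\begin{verbatim}
import numpy as np

def apgm(grad_L, prox_g, x0, step, n_iter=1000, tol=1e-3):
    """Accelerated proximal gradient sketch for min_x L_beta(x,y) + g(x),
    with g handled through its proximal operator prox_g(v, step)."""
    x_old = x0.copy()
    z = x0.copy()                     # extrapolated point
    t = 1.0
    for _ in range(n_iter):
        x = prox_g(z - step * grad_L(z), step)      # proximal gradient step
        if np.linalg.norm(x - z) / step <= tol:     # crude stationarity proxy
            return x
        t_next = 0.5 * (1.0 + np.sqrt(1.0 + 4.0 * t * t))
        z = x + ((t - 1.0) / t_next) * (x - x_old)  # Nesterov momentum
        x_old, t = x, t_next
    return x
\end{verbatim}
A natural usage is to pass this routine as the \texttt{approx\_solver} placeholder in the outer-loop sketch above.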
This homotopy approach increases the computational cost of Algorithm~\ref{Algo:2} only by a factor logarithmic in the $\epsilon_f$, as detailed in the proof of Corollary~\ref{cor:first}. %\textbf{AE: if we find some time, i'll add a couple of lines for how 1st order opt implies 2nd order opt for smooth constraints.} \subsection{Second-Order Optimality} %{\color{red}{Ahmet (note to myself): not sure of the constants of trust-region, check again}} \\ Let us now consider the second-order optimality case where the solver in Step~2 is the the trust region method developed in~\cite{cartis2012complexity}. Trust region method minimizes quadratic approximation of the function within a dynamically updated trust-region radius. Second-order trust region method that we consider in this section makes use of Hessian (or an approximation of Hessian) of the augmented Lagrangian in addition to first order oracles. As shown in~\cite{nouiehed2018convergence}, finding approximate second-order stationary points of convex-constrained problems is in general NP-hard. For this reason, we focus in this section on the special case of~\eqref{prob:01} with $g=0$. %We first give a theorem to show the convergence rate of the algorithm for second order stationarity: \\ %{\color{red}{Ahmet: I think that it is possible to remove sufficiently large k0 assumption here. }} \textbf{AE: not worth it really} %\begin{corollary} %\label{thm:main_second} %Under the assumptions of Theorem~\ref{thm:main}, the output of Algorithm~\ref{Algo:2} satisfies %\begin{align} %%& \dist^2( -\nabla_x \L_{\b_{k-1}}(x_k,y_k),\partial g(x_k)) + \| A(x_k)\|^2 \nonumber\\ %%& = \frac{1}{\b_{k-1}^2}\left( \frac{8 \lambda'_A\s_k^2 + 4}{ \nu^2} ( \lambda'_f+\lambda'_A y_{\max})^2 + 2\right) \nonumber \\ %%&=: \frac{Q}{\beta_{k-1}^2}. %\lambda_{\text{min}}(\nabla _{xx}\mathcal{L}_{\beta_{k-1}}(x_k, y_k)) \geq - \frac{C}{\beta_{k-1}} - \epsilon_{k-1}. %\end{align} %\end{corollary} % %We consider \textbf{AE: Ahmet to add a brief description of the algorithm.} Let us compute the total computational complexity of Algorithm~\ref{Algo:2} with the trust region method in Step~2, in terms of the number of calls made to the second-order oracle. By adapting the result in~\cite{cartis2012complexity} to our setup, we find that the number of (inner) iterations required in Step~2 of Algorithm~\ref{Algo:2} to produce $x_{k+1}$ is % \begin{equation} \mathcal{O}\left ( \frac{\lambda_{\beta_{k}, H}^2 (\mathcal{L}_{\beta_{k}}(x_1, y) - \min_{x}\mathcal{L}_{\beta_k}(x, y))}{\epsilon_k^3} \right), \label{eq:sec_inn_comp} \end{equation} % %<<<<<<< HEAD where $\lambda_{\beta, H}$ is the Lipschitz constant of the Hessian of the augmented Lagrangian, which is of the order of $\beta$, as can be proven similar to Lemma~\ref{lem:smoothness} and $x_1$ is the initial iterate of the given outer loop. In~\cite{cartis2012complexity}, the term $\mathcal{L}_{\beta}(x_1, y) - \min_{x}\mathcal{L}_{\beta}(x, y)$ is bounded by a constant independent of $\epsilon$. We assume a uniform bound for this quantity $\forall \beta_k$, instead of for one value of $\beta_k$ as in~\cite{cartis2012complexity}. Using \eqref{eq:sec_inn_comp} and Theorem~\ref{thm:main}, we arrive at the following: %======= %where $\lambda_{\beta_k, H}$ is the Lipschitz constant of the Hessian of the augmented Lagrangian, which is of the order of $\beta_k$, as can be proven similar to Lemma~\ref{lem:smoothness} and $x_1$ is the initial iterate of the given outer loop. 
%In~\cite{cartis2012complexity}, the term $\mathcal{L}_{\beta_k}(x_1, y) - \min_{x}\mathcal{L}_{\beta_k}(x, y)$ is bounded by a constant independent of $\epsilon_k$. %We assume a uniform bound for this quantity for all $\b$, instead of for one value of $\beta_k$ as in~\cite{cartis2012complexity}. Using \eqref{eq:sec_inn_comp} and Theorem~\ref{thm:main}, we arrive at the following result. The proof is very similar to that of Corollary~\ref{cor:first} and hence omitted here. %>>>>>>> 795311da274f2d8ab56d215c2fd481073e616732 % \begin{corollary}\label{cor:second} For $b>1$, let $\beta_k =b^k $ for every $k$. We assume that \begin{equation} \mathcal{L}_{\beta}(x_1, y) - \min_{x}\mathcal{L}_{\beta}(x, y) \leq L_{u},~~ \forall \beta. \end{equation} If we use the trust region method from~\cite{cartis2012complexity} for Step~2 of Algorithm~\ref{Algo:2}, the algorithm finds an $\epsilon$-second-order stationary point of~(\ref{prob:01}) in $T$ calls to the second-order oracle where % \begin{equation} T \leq \mathcal{O}\left( \frac{L_u Q'^{5}}{\epsilon^{5}} \log_b{\left( \frac{Q'}{\epsilon} \right)} \right) = \widetilde{\mathcal{O}}\left( \frac{L_u Q'^{5}}{\epsilon^{5}} \right). \end{equation} \end{corollary} % Before closing this section, we note that the remark after Corollary~\ref{cor:first} applies here as well. % diff --git a/NeurIPS 19/sections/introduction.tex b/NeurIPS 19/sections/introduction.tex index 0dd9c36..3e519ca 100644 --- a/NeurIPS 19/sections/introduction.tex +++ b/NeurIPS 19/sections/introduction.tex @@ -1,97 +1,97 @@ %!TEX root = ../main.tex \vspace{-5mm} \section{Introduction} \label{intro} We study the following nonconvex optimization problem % \begin{equation} \label{prob:01} \begin{cases} \underset{x\in \mathbb{R}^d}{\min}\,\, f(x)+g(x)\\ A(x) = b, \end{cases} \end{equation} % where $f:\mathbb{R}^d\rightarrow\mathbb{R}$ is \editf{continuously differentiable} (possibly nonconvex) function, $A:\mathbb{R}^d\rightarrow\mathbb{R}^m$ is a nonlinear operator and $b\in\mathbb{R}^m$. For clarity of notation, we take $b=0$ in the sequel, the extension to any $b$ is trivial. We assume that $g:\mathbb{R}^d\rightarrow\mathbb{R}$ is proximal-friendly (possibly nonsmooth) convex function. % (not necessarily smooth). %For instance, for a convex set $\mathcal{X}\subset\RR^d$, letting $g=\delta_\mathcal{X}$ to be the indicator function on $\mathcal{X}$, ~\eqref{prob:01} is a nonconvex optimization problem with the convex constraint $x\in \mathcal{X}$. %Our other requirement for $g$ is that computing its proximal operator is inexpensive, see Section~\ref{sec:preliminaries}. A host of problems in computer science \cite{khot2011grothendieck, lovasz2003semidefinite, zhao1998semidefinite}, machine learning \cite{mossel2015consistency, song2007dependence}, and signal processing \cite{singer2011angular, singer2011three} naturally fall under the template of ~\eqref{prob:01}, including max-cut, clustering, generalized eigenvalue, as well as \editf{quadratic assignment problem (QAP)}. % \textbf{AE: We need to add some more specifics to this paragraph to convince people that this is an important template. What are the names of the problems you're citing? 
Max-cut, sparse regression, statistical learning with deep networks as prior \cite{Bora2017}, what else we can think of?} To address these applications, this paper %builds up on the vast literature on the classical inexact augmented Lagrangian framework and proposes a simple, intuitive, as well as easy-to-implement \editf{augmented Lagrangian} algorithm, with total complexity results for ~\eqref{prob:01} under an interpretable geometric condition. Before we elaborate on the results, let us first motivate ~\eqref{prob:01} with \editf{prominent} application to semidefinite programming (SDP): \vspace{-3mm} \paragraph{Vignette: Burer-Monteiro splitting.} A powerful convex relaxation for max-cut, clustering, and several other problems \editf{described} above is provided by the SDP \begin{equation} \label{eq:sdp} \begin{cases} \underset{X\in\mathbb{S}^{d \times d}}{\min} \langle C, X \rangle \\ B(X) = b, \,\, X \succeq 0, \end{cases} \end{equation} % where $C\in \RR^{d\times d}$, $X$ is a positive semidefinite $d\times d$ matrix, %$\mathbb{S}^{d \times d}$ denotes the set of real symmetric matrices, and ${B}: \mathbb{S}^{d\times d} \to \mathbb{R}^m$ is a linear operator. If the unique-games conjecture is true, SDPs achieve the best approximation for the underlying discrete problem~\cite{raghavendra2008optimal}. Since $d$ is often large, many first- and second-order methods for solving such SDP's are immediately ruled out, not only due to their high computational complexity, but also due to their storage requirements, which are $\mathcal{O}(d^2)$. A contemporary challenge in optimization therefore is to solve SDP's in small space and in a scalable fashion. A recent algorithm, i.e., homotopy conditional gradient method (HCGM) based on Linear Minimization Oracles (LMO), can address this template in small space via sketching \cite{yurtsever2018conditional}; however, such LMO-based methods are extremely slow in obtaining accurate solutions. %In practice, $d$ is often very large which makes interior point methods, with their poor scaling in $d$, an unattractive option for solving ~\eqref{eq:sdp}. Attempts to resolve this issue prompted extensive research in computationally- and memory-efficient SDP solvers. The first such solvers relied on the so-called Linear Minimization Oracle (LMO), reviewed in Section~\ref{sec:related work}, alongside other scalabe SDP solvers. A key approach for solving \eqref{prob:01}, dating back to~\cite{burer2003nonlinear, burer2005local}, is the so-called Burer-Monteiro (BM) splitting $X=UU^\top$, where $U\in\mathbb{R}^{d\times r}$ and $r$ is selected according to the guidelines in~\cite{pataki1998rank, barvinok1995problems}. \editf{It has been shown that these bounds on the rank are optimal ~\cite{waldspurger2018rank}, and factorization does not introduce any extraneous local minima~\cite{burer2005local}. Moreover, ~\cite{boumal2016non} established the connection between local minima of factorized problem~\ref{prob:nc} and global optimum for~\ref{eq:sdp}.} %It has been shown that these bounds on the rank, which are shown to be optimal~\cite{waldspurger2018rank}, under some assumptions removing the spurious local minima of the nonconvex factorized problem~\cite{boumal2016non}. %so as to remove spurious local minima of the nonconvex factorized problem. Evidently, BR splitting has the advantage of lower storage and faster iterations. 
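As a back-of-the-envelope illustration of the storage savings (using the Pataki--Barvinok rank bound behind the guidelines in~\cite{pataki1998rank, barvinok1995problems}, namely that some optimal solution has rank $r$ with $r(r+1)/2 \le m$),
\begin{align*}
r = \mathcal{O}(\sqrt{m})
\quad \Longrightarrow \quad
\underbrace{\mathcal{O}(dr) = \mathcal{O}(d\sqrt{m})}_{\text{storage for } U}
\;\ll\;
\underbrace{\mathcal{O}(d^2)}_{\text{storage for } X},
\end{align*}
whenever $m \ll d^2$; for instance, the max-cut SDP has $m=d$ constraints, so the factorized variable needs only $\mathcal{O}(d^{3/2})$ memory.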
This splitting results in the following nonconvex problem \begin{equation} \label{prob:nc} \begin{cases} \underset{U\in\mathbb{R}^{d \times r}}{\min} \langle C, UU^\top \rangle \\ B(UU^\top) = b, \end{cases} \end{equation} which can be written in the form of ~\eqref{prob:01}. %\textbf{Vignette 2: Latent group lasso} %See Section ? for several examples. \textbf{We should explain this better and give more high level examples to motivate this .} %An example of our template in \eqref{prob:01} is semi-definite ming which provides powerful relaxations to above problems. Denote the space of $d'\times d'$ symmetric matrices by $\mathbb{S}^{d'\times d'}$ and consider % %\begin{equation} % \label{sdp} % \min_x \{h'(x): A'(x) = b', ~~x \in \mathcal{C'}~~\text{and}~~x\succeq0 \} %\end{equation} % %where $h': \mathbb{S}^{d'\times d'} \to \RR$, $A'\colon\mathbb{S}^{d'\times d'}\to\RR^m$, $b'\in\RR^m$, and $C' \subseteq \mathbb{R}^{d'\times d'}$. This template clearly can be put to the form of \eqref{prob:01} by using \emph{Burer-Monteiro factorization} \cite{burer2003nonlinear, burer2005local}. -To solve \eqref{prob:nc}, the inexact Augmented Lagrangian Method (iALM) is widely used~\cite{burer2003nonlinear, burer2005local, kulis2007fast}, due to its cheap per iteration cost and its empirical success. Every (outer) iteration of iALM calls a solver to solve an intermediate augmented Lagrangian subproblem to near stationarity, and the user has freedom in choosing this solver, which could use first-order (say,~proximal gradient descent \cite{parikh2014proximal}) or second-order information, such as BFGS \cite{nocedal2006numerical}. +To solve \eqref{prob:nc}, the inexact Augmented Lagrangian Method (iALM) is widely used~\cite{burer2003nonlinear, burer2005local, kulis2007fast}, due to its cheap per iteration cost and its empirical success. Every (outer) iteration of iALM calls a solver to solve an intermediate augmented Lagrangian subproblem to near stationarity, and the user has freedom in choosing this solver, which could use first-order (e.g., proximal gradient descent \cite{parikh2014proximal}) or second-order information, such as an BFGS \cite{nocedal2006numerical}. We argue that unlike its convex counterpart~\cite{nedelcu2014computational,lan2016iteration,xu2017inexact}, the convergence rate and the complexity of iALM for ~\eqref{prob:nc} are not well-understood, see Section~\ref{sec:related work} for a review of the related literature. Indeed, addressing this important theoretical gap is one of the contributions of our work. {\textbf{A brief summary of our contributions:} \\[1mm] $\circ$ Our framework is future-proof in the sense that we obtain the convergence rate of iALM for ~\eqref{prob:01} with an arbitrary solver for finding first- and second-order stationary points. \\[2mm] $\circ$ \editf{ -We investigate using different solvers for augmented Lagrangian subproblems and provide overall iteration complexity bounds for finding first- and second-order stationary points of ~\eqref{prob:01}. Our complexity bounds match the best theoretical complexity results in optimization, see Section~\ref{sec:related work}.} \\[2mm] -$\circ$ We propose a novel geometric condition that simplifies the algorithmic analysis for iALM. We verify the condition for key problems described in Section~\ref{sec:experiments}. 
+We investigate the effect of using different solvers for augmented Lagrangian subproblems and provide overall iteration complexity bounds for finding first- and second-order stationary points of ~\eqref{prob:01}. Our complexity bounds match the best theoretical complexity results in optimization, see Section~\ref{sec:related work}.} \\[2mm] +$\circ$ We propose a novel geometric condition that simplifies the algorithmic analysis for iALM. \editf{We make its connection to Polyak-Lojasiewicz and Mangasarian-Fromovitz conditions.}We verify the condition for key problems described in Section~\ref{sec:experiments}. \textbf{Roadmap.} Section~\ref{sec:preliminaries} collects the main tools and our notation. % that we utilize. We present the iALM in Section~\ref{sec:AL algorithm} and obtain its convergence rate to first- and second-order stationary points in Section~\ref{sec:cvg rate}, alongside their iteration complexities. We provide a comprehensive review of the literature and highlight our key differences in % with the precedents in Section~\ref{sec:related work}. Section~\ref{sec:experiments} presents the numerical evidence and comparisons with the state-of-the-art. % methods. diff --git a/NeurIPS 19/sections/preliminaries.tex b/NeurIPS 19/sections/preliminaries.tex index b75591f..c92ec67 100644 --- a/NeurIPS 19/sections/preliminaries.tex +++ b/NeurIPS 19/sections/preliminaries.tex @@ -1,181 +1,183 @@ +%!TEX root = ../main.tex + \vspace{-3mm} \section{Preliminaries \label{sec:preliminaries}} %\editf{Try to simplify this section as much as possible. We need to shorthen the paper.} \paragraph{\textbf{{Notation.}}} We use the notation $\langle\cdot ,\cdot \rangle $ and $\|\cdot\|$ for the {standard inner} product and the norm on $\RR^d$. For matrices, $\|\cdot\|$ and $\|\cdot\|_F$ denote the spectral and the Frobenius norms, respectively. %For a matrix $M$, its row and null spaces are denoted by $\row(A)$ and $\text{null}(A)$, respectively. -For a convex function $g:\RR^d\rightarrow\RR$, the subdifferential set at $x\in \RR^d$ is denoted by $\partial g(x)$ and we will occasionally use the notation $\partial g(x)/\b = \{ z/\b : z\in \partial g(x)\}$. +For the convex function $g:\RR^d\rightarrow\RR$, the subdifferential set at $x\in \RR^d$ is denoted by $\partial g(x)$ and we will occasionally use the notation $\partial g(x)/\b = \{ z/\b : z\in \partial g(x)\}$. When presenting iteration complexity results, we often use $\widetilde{O}(\cdot)$ which suppresses the logarithmic dependencies. %The proximal operator $P_g:\RR^d\rightarrow\RR^d$ takes $x$ to %\begin{align} %P_g(x) = \underset{y}{\argmin} \, g(y) + \frac{1}{2}\|x-y\|^2, %\label{eq:defn of prox} %\end{align} %and, if $g=1_C$ is the indicator function of a convex set $C$, we will use the shorthand $P_C=P_{1_C}$ to show the orthogonal projection onto $C$. We use the indicator function $\delta_\mathcal{X}:\RR^d\rightarrow\RR$ of a set $\mathcal{X}\subset\RR^d$, which takes $x$ to \begin{align} \delta_\mathcal{X}(x) = \begin{cases} 0 & x \in \mathcal{X}\\ \infty & x\notin \mathcal{X}. \end{cases} \label{eq:indicator} \end{align} %The tangent cone of $C$ at $x\in C$ is denoted by $T_C(x)$ and The distance function from a point $x$ to $\mathcal{X}$ is denoted by $\dist(x,\mathcal{X}) = \min_{z\in \mathcal{X}} \|x-z\|$. For integers $k_0 \le k_1$, we denote $[k_0:k_1]=\{k_0,\ldots,k_1\}$. 
For an operator $A:\RR^d\rightarrow\RR^m$ with components $\{A_i\}_{i=1}^m$, we let $DA(x) \in \mathbb{R}^{m\times d}$ denote the Jacobian of $A$, where the $i$th row of $DA(x)$ is the gradient vector $\nabla A_i(x) \in \RR^d$. %We denote the Hessian of the augmented Lagrangian with respect to $x$ as $\nabla _{xx} \mathcal{L}_{\beta}(x,y)$. \\ %{\color{red} define $A_i$ rigorously.} \textbf{Smoothness.} -We require $f:\RR^d\rightarrow\RR$ and $A:\RR^d\rightarrow \RR^m$ in~\eqref{prob:01} to be smooth; i.e., there exists $\lambda_f,\lambda_A\ge 0$ such that +We require $f:\RR^d\rightarrow\RR$ and $A:\RR^d\rightarrow \RR^m$ be smooth; i.e., there exist $\lambda_f,\lambda_A\ge 0$ such that % %\vspace{-.3cm} \begin{align} \| \nabla f(x) - \nabla f(x')\| & \le \lambda_f \|x-x'\|, \nonumber\\ \| DA(x) - DA(x') \| & \le \lambda_A \|x-x'\|, \label{eq:smoothness basic} \end{align} for every $x,x'\in \mathbb{R}^d$. \textbf{The augmented Lagrangian method (ALM)}. -ALM is a classical algorithm, first appeared in~\cite{hestenes1969multiplier, powell1969method} and extensively studied in~\cite{bertsekas2014constrained}. +ALM is a classical algorithm, first appeared in~\cite{hestenes1969multiplier, powell1969method} and extensively studied in~\cite{bertsekas1982constrained, birgin2014practical}. For solving \eqref{prob:01}, ALM suggests solving the problem % %\vspace{-.3cm} \begin{equation} \min_{x} \max_y \,\,\mathcal{L}_\beta(x,y) + g(x), \label{eq:minmax} \end{equation} %\vspace{-.5cm} % where, for $\b>0$, $\mathcal{L}_\b$ is the corresponding augmented Lagrangian, defined as \begin{align} \label{eq:Lagrangian} \mathcal{L}_\beta(x,y) := f(x) + \langle A(x), y \rangle + \frac{\beta}{2}\|A(x)\|^2. \end{align} %\textbf{AE: Adding the bias $b$ doesn't seem to add much except making the presentation more clunky.} %\vspace{-.5cm} The minimax formulation in \eqref{eq:minmax} naturally suggests the following algorithm for solving \eqref{prob:01}. For dual step sizes $\{\s_k\}_k$, consider the iterates \begin{equation}\label{e:exac} x_{k+1} \in \underset{x}{\argmin} \,\, \mathcal{L}_{\beta}(x,y_k)+g(x), \end{equation} \begin{equation*} y_{k+1} = y_k+\s_k A(x_{k+1}). \end{equation*} However, updating $x_{k+1}$ above requires solving the nonconvex problem~\eqref{e:exac} to optimality, which is typically intractable. Instead, it is often easier to find an approximate first- or second-order stationary point of~\eqref{e:exac}. %We therefore consider an algorithm that only requires approximate stationarity in every iteration. Hence, we argue that by gradually improving the stationarity precision and increasing the penalty weight $\b$ above, we can reach a stationary point of the main problem in~\eqref{prob:01}, as detailed in Section~\ref{sec:AL algorithm}. {\textbf{{{Optimality conditions.}}} \label{sec:opt cnds}} -{First-order necessary optimality conditions} for \eqref{prob:01} are well-understood. {Indeed, $x\in \RR^d$ is a first-order stationary point of~\eqref{prob:01} if there exists $y\in \RR^m$ such that +{First-order necessary optimality conditions} for \eqref{prob:01} are well-studied. {Indeed, $x\in \RR^d$ is a first-order stationary point of~\eqref{prob:01} if there exists $y\in \RR^m$ such that \begin{align} \begin{cases} -\nabla f(x) - DA(x)^\top y \in \partial g(x)\\ A(x) = 0, \end{cases} \label{e:inclu1} \end{align} where $DA(x)$ is the Jacobian of $A$ at $x$. 
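For later use (a direct computation from the definition of $\mathcal{L}_\b$ in \eqref{eq:Lagrangian}), the gradient of the augmented Lagrangian with respect to $x$ is
\begin{align*}
\nabla_x \mathcal{L}_\beta(x,y) = \nabla f(x) + DA(x)^\top y + \b\, DA(x)^\top A(x),
\end{align*}
which reduces to $\nabla f(x) + DA(x)^\top y$ whenever $A(x)=0$.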
Recalling \eqref{eq:Lagrangian}, we observe that \eqref{e:inclu1} is equivalent to \begin{align} \begin{cases} -\nabla_x \mathcal{L}_\beta(x,y) \in \partial g(x)\\ A(x) = 0, \end{cases} \label{e:inclu2} \end{align} which is in turn the necessary optimality condition for \eqref{eq:minmax}. } %Note that \eqref{e:inclu2} is equivalent to %\begin{align} %\left[ %\begin{array}{c} %\nabla_x \L_\b(x,y) \\ %\nabla_y \L_\b(x,y) %\end{array} %\right] %\in %\left\{ %\left[ %\begin{array}{c} % v\\ %0 %\end{array} %\right] %: v\in \partial g(x) \right\} %, %\end{align} %which rephrases the stationarity condition in terms of the gradient of the duality gap of Program~\eqref{eq:Lagrangian}. -Inspired by this, we say that $x$ is an $(\epsilon_f,\b)$ first-order stationary point if +Inspired by this, we say that $x$ is an $(\epsilon_f,\b)$ first-order stationary point of \eqref{eq:minmax} if \editf{there exists $y \in \RR^m$ such that} \begin{align} \begin{cases} \dist(-\nabla_x \mathcal{L}_\beta(x,y), \partial g(x)) \leq \epsilon_f \\ \| A(x) \| \leq \epsilon_f, \end{cases} \label{eq:inclu3} \end{align} for $\epsilon_f\ge 0$. In light of \eqref{eq:inclu3}, a suitable metric for evaluating the stationarity of a pair $(x,y)\in \RR^d\times \RR^m$ is \begin{align} \dist\left(-\nabla_x \mathcal{L}_\beta(x,y), \partial g(x) \right) + \|A(x)\| , \label{eq:cvg metric} \end{align} which we use as the first-order stopping criterion. %When $g=0$, it is also not difficult to verify that the expression in \eqref{eq:cvg metric} is the norm of the gradient of the duality gap in \eqref{eq:minmax}. As an example, for a convex set $\mathcal{X}\subset\RR^d$, suppose that $g = \delta_\mathcal{X}$ is the indicator function on $\mathcal{X}$.} Let also $T_\mathcal{X}(x) \subseteq \RR^d$ denote the tangent cone to $\mathcal{X}$ at $x$, and with $P_{T_\mathcal{X}(x)}:\RR^d\rightarrow\RR^d$, we denote the orthogonal projection onto this tangent cone. Then, for $u\in\RR^d$, it is not difficult to verify that \begin{align}\label{eq:dist_subgrad} \dist\left(u, \partial g(x) \right) = \| P_{T_\mathcal{X}(x)}(u) \|. \end{align} % When $g = 0$, a first-order stationary point $x\in \RR^d$ of \eqref{prob:01} is also second-order stationary if % %\vspace{-.5cm} \begin{equation} -\lambda_{\text{min}}(\nabla _{xx} \mathcal{L}_{\beta}(x,y)) > 0, +\lambda_{\text{min}}(\nabla _{xx} \mathcal{L}_{\beta}(x,y))\ge 0, \end{equation} %\vspace{-.5cm} % where $\nabla_{xx}\mathcal{L}_\b$ is the Hessian with respect to $x$, and $\lambda_{\text{min}}(\cdot)$ returns the smallest eigenvalue of its argument. Analogously, $x$ is an $(\epsilon_f, \epsilon_s,\b)$ second-order stationary point if, in addition to \eqref{eq:inclu3}, it holds that % %\vspace{-.5cm} \begin{equation}\label{eq:sec_opt} \lambda_{\text{min}}(\nabla _{xx} \mathcal{L}_{\beta}(x,y)) \ge -\epsilon_s, \end{equation} for $\epsilon_s>0$. %\vspace{-.5cm} % Naturally, for second-order stationarity, we use $\lambda_{\text{min}}(\nabla _{xx} \mathcal{L}_{\beta}(x,y))$ as the stopping criterion. -\paragraph{{\textbf{Smoothness lemma.}}} This next result controls the smoothness of $\L_\b(\cdot,y)$ for a fixed $y$. The proof is standard but nevertheless included in Appendix~\ref{sec:proof of smoothness lemma} for completeness. +\paragraph{{\textbf{Smoothness lemma.}}} This next result controls the smoothness of $\L_\b(\cdot,y)$ for a fixed $y$. The proof is standard but is nevertheless included in Appendix~\ref{sec:proof of smoothness lemma} for completeness.
\begin{lemma}[\textbf{smoothness}]\label{lem:smoothness} For fixed $y\in \RR^m$ and $\rho,\rho'\ge 0$, it holds that \begin{align} \| \nabla_x \mathcal{L}_{\beta}(x, y)- \nabla_x \mathcal{L}_{\beta}(x', y) \| \le \lambda_\b \|x-x'\|, \end{align} for every $x,x' \in \{ x'': \|A(x'') \|\le \rho, \|x''\|\le \rho'\}$, where \begin{align} \lambda_\beta & \le \lambda_f + \sqrt{m}\lambda_A \|y\| + (\sqrt{m}\lambda_A\rho + d \lambda'^2_A )\b \nonumber\\ & =: \lambda_f + \sqrt{m}\lambda_A \|y\| + \lambda''(A,\rho,\rho') \b. \label{eq:smoothness of Lagrangian} \end{align} Above, $\lambda_f,\lambda_A$ were defined in (\ref{eq:smoothness basic}) and \begin{align} \lambda'_A := \max_{\|x\|\le \rho'}\|DA(x)\|. \end{align} \end{lemma}
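To connect the quantities above to something directly checkable in code, the sketch below (a minimal illustration, not part of the algorithm) evaluates $\nabla_x\mathcal{L}_\b(x,y)$ and the first-order measure \eqref{eq:cvg metric} in the simple case $g=0$, where the distance term reduces to a gradient norm; the callables \texttt{f\_grad}, \texttt{A}, \texttt{A\_jac} stand for a concrete instance of \eqref{prob:01}.
\begin{verbatim}
import numpy as np

def grad_aug_lagrangian(f_grad, A, A_jac, x, y, beta):
    """nabla_x L_beta(x, y) = grad f(x) + DA(x)^T (y + beta * A(x))."""
    return f_grad(x) + A_jac(x).T @ (y + beta * A(x))

def first_order_measure(f_grad, A, A_jac, x, y, beta):
    """First-order stationarity measure when g = 0:
    || nabla_x L_beta(x, y) || + || A(x) ||."""
    grad = grad_aug_lagrangian(f_grad, A, A_jac, x, y, beta)
    return np.linalg.norm(grad) + np.linalg.norm(A(x))

# Toy instance: f(x) = 0.5 ||x||^2 with one linear constraint <a, x> = 0.
a = np.array([1.0, -1.0])
f_grad = lambda x: x
A = lambda x: np.array([a @ x])
A_jac = lambda x: a.reshape(1, -1)
print(first_order_measure(f_grad, A, A_jac,
                          x=np.array([1.0, 2.0]), y=np.zeros(1), beta=10.0))
\end{verbatim}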