diff --git a/ICML19/Arxiv version/.DS_Store b/ICML19/Arxiv version/.DS_Store new file mode 100644 index 0000000..8b21086 Binary files /dev/null and b/ICML19/Arxiv version/.DS_Store differ diff --git a/ICML19/Arxiv version/bibliography.bib b/ICML19/Arxiv version/bibliography.bib index 610cb62..344e4ee 100644 --- a/ICML19/Arxiv version/bibliography.bib +++ b/ICML19/Arxiv version/bibliography.bib @@ -1,830 +1,812 @@ -%% This BibTeX bibliography file was created using BibDesk. -%% http://bibdesk.sourceforge.net/ - - -%% Saved with string encoding Unicode (UTF-8) - -% Created by Mehmet Fatih Sahin for nonconvex inexact augmented lagrangian paper -% December 14, Friday. 01:52 am - @article{Goodfellow2014, author = {{Goodfellow}, I.~J. and {Pouget-Abadie}, J. and {Mirza}, M. and {Xu}, B. and {Warde-Farley}, D. and {Ozair}, S. and {Courville}, A. and {Bengio}, Y.}, url = {https://arxiv.org/abs/1406.2661}, title = "{Generative Adversarial Networks}", journal = {ArXiv e-prints}, archivePrefix = "arXiv", eprint = {1406.2661}, primaryClass = "stat.ML", keywords = {Statistics - Machine Learning, Computer Science - Machine Learning}, year = 2014, month = jun, adsurl = {http://adsabs.harvard.edu/abs/2014arXiv1406.2661G}, adsnote = {Provided by the SAO/NASA Astrophysics Data System} } @ARTICLE{Ilyas2017, author = {{Ilyas}, Andrew and {Jalal}, Ajil and {Asteri}, Eirini and {Daskalakis}, Constantinos and {Dimakis}, Alexandros G.}, title = "{The Robust Manifold Defense: Adversarial Training using Generative Models}", journal = {arXiv e-prints}, keywords = {Computer Science - Computer Vision and Pattern Recognition, Computer Science - Cryptography and Security, Computer Science - Machine Learning, Statistics - Machine Learning}, year = 2017, month = Dec, eid = {arXiv:1712.09196}, pages = {arXiv:1712.09196}, archivePrefix = {arXiv}, eprint = {1712.09196}, primaryClass = {cs.CV}, adsurl = {https://ui.adsabs.harvard.edu/\#abs/2017arXiv171209196I}, adsnote = {Provided by the SAO/NASA Astrophysics Data System} } @ARTICLE{Szegedy2013, author = {{Szegedy}, Christian and {Zaremba}, Wojciech and {Sutskever}, Ilya and {Bruna}, Joan and {Erhan}, Dumitru and {Goodfellow}, Ian and {Fergus}, Rob}, title = "{Intriguing properties of neural networks}", journal = {arXiv e-prints}, keywords = {Computer Science - Computer Vision and Pattern Recognition, Computer Science - Machine Learning, Computer Science - Neural and Evolutionary Computing}, year = 2013, month = Dec, eid = {arXiv:1312.6199}, pages = {arXiv:1312.6199}, archivePrefix = {arXiv}, eprint = {1312.6199}, primaryClass = {cs.CV}, adsurl = {https://ui.adsabs.harvard.edu/\#abs/2013arXiv1312.6199S}, adsnote = {Provided by the SAO/NASA Astrophysics Data System} } - @article{tepper2018clustering, title={Clustering is semidefinitely not that hard: Nonnegative SDP for manifold disentangling}, author={Tepper, Mariano and Sengupta, Anirvan M and Chklovskii, Dmitri}, journal={Journal of Machine Learning Research}, volume={19}, number={82}, year={2018} } @article{beck2009fast, title={A fast iterative shrinkage-thresholding algorithm for linear inverse problems}, author={Beck, Amir and Teboulle, Marc}, journal={SIAM journal on imaging sciences}, volume={2}, number={1}, pages={183--202}, year={2009}, publisher={SIAM} } @article{tibshirani1996regression, title={Regression shrinkage and selection via the lasso}, author={Tibshirani, Robert}, journal={Journal of the Royal Statistical Society. 
Series B (Methodological)}, pages={267--288}, year={1996}, publisher={JSTOR} } @book{bauschke2011convex, title={Convex analysis and monotone operator theory in Hilbert spaces}, author={Bauschke, Heinz H and Combettes, Patrick L and others}, volume={408}, year={2011}, publisher={Springer} } @article{waldspurger2018rank, title={Rank optimality for the Burer-Monteiro factorization}, author={Waldspurger, Ir{\`e}ne and Waters, Alden}, journal={arXiv preprint arXiv:1812.03046}, year={2018} } - @inproceedings{jaggi2013revisiting, title={Revisiting Frank-Wolfe: Projection-Free Sparse Convex Optimization}, author={Jaggi, Martin}, booktitle={ICML (1)}, pages={427--435}, year={2013} } @inproceedings{nesterov1983method, title={A method for solving the convex programming problem with convergence rate {$O(1/k^2)$}}, author={Nesterov, Yurii E}, booktitle={Dokl. Akad. Nauk SSSR}, volume={269}, pages={543--547}, year={1983} } @inproceedings{raghavendra2008optimal, title={Optimal algorithms and inapproximability results for every CSP?}, author={Raghavendra, Prasad}, booktitle={Proceedings of the fortieth annual ACM symposium on Theory of computing}, pages={245--254}, year={2008}, organization={ACM} } @online{xiao2017/online, author = {Han Xiao and Kashif Rasul and Roland Vollgraf}, title = {Fashion-MNIST: a Novel Image Dataset for Benchmarking Machine Learning Algorithms}, date = {2017-08-28}, year = {2017}, eprintclass = {cs.LG}, eprinttype = {arXiv}, eprint = {cs.LG/1708.07747}, } @article{cartis2011evaluation, title={On the evaluation complexity of composite function minimization with applications to nonconvex nonlinear programming}, author={Cartis, Coralia and Gould, Nicholas IM and Toint, Philippe L}, journal={SIAM Journal on Optimization}, volume={21}, number={4}, pages={1721--1739}, year={2011}, publisher={SIAM} } @article{bolte2017error, title={From error bounds to the complexity of first-order descent methods for convex functions}, author={Bolte, J{\'e}r{\^o}me and Nguyen, Trong Phong and Peypouquet, Juan and Suter, Bruce W}, journal={Mathematical Programming}, volume={165}, number={2}, pages={471--507}, year={2017}, publisher={Springer} } - - @article{obozinski2011group, title={Group lasso with overlaps: the latent group lasso approach}, author={Obozinski, Guillaume and Jacob, Laurent and Vert, Jean-Philippe}, journal={arXiv preprint arXiv:1110.0413}, year={2011} } @article{tran2018smooth, title={A smooth primal-dual optimization framework for nonsmooth composite convex minimization}, author={Tran-Dinh, Quoc and Fercoq, Olivier and Cevher, Volkan}, journal={SIAM Journal on Optimization}, volume={28}, number={1}, pages={96--134}, year={2018}, publisher={SIAM} } @article{tran2018adaptive, title={An Adaptive Primal-Dual Framework for Nonsmooth Convex Minimization}, author={Tran-Dinh, Quoc and Alacaoglu, Ahmet and Fercoq, Olivier and Cevher, Volkan}, journal={arXiv preprint arXiv:1808.04648}, year={2018} } -@article{arora2018compressed, - title={A compressed sensing view of unsupervised text embeddings, bag-of-n-grams, and LSTMs}, - author={Arora, Sanjeev and Khodak, Mikhail and Saunshi, Nikunj and Vodrahalli, Kiran}, - year={2018} -} @article{candes2008introduction, title={An introduction to compressive sampling}, author={Cand{\`e}s, Emmanuel J and Wakin, Michael B}, journal={IEEE signal processing magazine}, volume={25}, number={2}, pages={21--30}, year={2008}, publisher={IEEE} } @article{chen2001atomic, title={Atomic decomposition by basis pursuit}, author={Chen, Scott Shaobing and Donoho, David
L and Saunders, Michael A}, journal={SIAM review}, volume={43}, number={1}, pages={129--159}, year={2001}, publisher={SIAM} } @article{flores2012complete, title={A complete characterization of strong duality in nonconvex optimization with a single constraint}, author={Flores-Baz{\'a}n, Fabi{\'a}n and Flores-Baz{\'a}n, Fernando and Vera, Cristi{\'a}n}, journal={Journal of Global Optimization}, volume={53}, number={2}, pages={185--201}, year={2012}, publisher={Springer} } @article{dai2002convergence, title={Convergence properties of the BFGS algorithm}, author={Dai, Yu-Hong}, journal={SIAM Journal on Optimization}, volume={13}, number={3}, pages={693--701}, year={2002}, publisher={SIAM} } @article{yang2015sdpnal, title={SDPNAL+: a majorized semismooth Newton-CG augmented Lagrangian method for semidefinite programming with nonnegative constraints}, author={Yang, Liuqin and Sun, Defeng and Toh, Kim-Chuan}, journal={Mathematical Programming Computation}, volume={7}, number={3}, pages={331--366}, year={2015}, publisher={Springer} } @article{mascarenhas2004bfgs, title={The BFGS method with exact line searches fails for non-convex objective functions}, author={Mascarenhas, Walter F}, journal={Mathematical Programming}, volume={99}, number={1}, pages={49--61}, year={2004}, publisher={Springer} } -@inproceedings{boumal2016non, - title={The non-convex Burer-Monteiro approach works on smooth semidefinite programs}, - author={Boumal, Nicolas and Voroninski, Vlad and Bandeira, Afonso}, - booktitle={Advances in Neural Information Processing Systems}, - pages={2757--2765}, - year={2016} -} - - - @inproceedings{bhojanapalli2016dropping, title={Dropping convexity for faster semi-definite optimization}, author={Bhojanapalli, Srinadh and Kyrillidis, Anastasios and Sanghavi, Sujay}, booktitle={Conference on Learning Theory}, pages={530--582}, year={2016} } -@inproceedings{nesterov1983method, - title={A method for solving the convex programming problem with convergence rate O (1/k\^{} 2)}, - author={Nesterov, Yurii E}, - booktitle={Dokl. Akad.
Nauk SSSR}, - volume={269}, - pages={543--547}, - year={1983} + + + +@article{arora2018compressed, + title={A compressed sensing view of unsupervised text embeddings, bag-of-n-grams, and LSTMs}, + author={Arora, Sanjeev and Khodak, Mikhail and Saunshi, Nikunj and Vodrahalli, Kiran}, + year={2018} } @article{shi2017penalty, title={Penalty dual decomposition method for nonsmooth nonconvex optimization}, author={Shi, Qingjiang and Hong, Mingyi and Fu, Xiao and Chang, Tsung-Hui}, journal={arXiv preprint arXiv:1712.04767}, year={2017} } @article{fernandez2012local, title={Local convergence of exact and inexact augmented Lagrangian methods under the second-order sufficient optimality condition}, author={Fern{\'a}ndez, Dami{\'a}n and Solodov, Mikhail V}, journal={SIAM Journal on Optimization}, volume={22}, number={2}, pages={384--407}, year={2012}, publisher={SIAM} } @article{bolte2018nonconvex, title={Nonconvex Lagrangian-based optimization: monitoring schemes and global convergence}, author={Bolte, J{\'e}r{\^o}me and Sabach, Shoham and Teboulle, Marc}, journal={Mathematics of Operations Research}, year={2018}, publisher={INFORMS} } @article{nouiehed2018convergence, title={Convergence to Second-Order Stationarity for Constrained Non-Convex Optimization}, author={Nouiehed, Maher and Lee, Jason D and Razaviyayn, Meisam}, journal={arXiv preprint arXiv:1810.02024}, year={2018} } @article{cartis2018optimality, title={Optimality of orders one to three and beyond: characterization and evaluation complexity in constrained nonconvex optimization}, author={Cartis, Coralia and Gould, Nicholas IM and Toint, Ph L}, journal={Journal of Complexity}, year={2018}, publisher={Elsevier} } @article{nesterov2009primal, title={Primal-dual subgradient methods for convex problems}, author={Nesterov, Yurii}, journal={Mathematical programming}, volume={120}, number={1}, pages={221--259}, year={2009}, publisher={Springer} } @inproceedings{yurtsever2015universal, title={A universal primal-dual convex optimization framework}, author={Yurtsever, Alp and Dinh, Quoc Tran and Cevher, Volkan}, booktitle={Advances in Neural Information Processing Systems}, pages={3150--3158}, year={2015} } @article{yurtsever2018conditional, title={A Conditional Gradient Framework for Composite Convex Minimization with Applications to Semidefinite Programming}, author={Yurtsever, Alp and Fercoq, Olivier and Locatello, Francesco and Cevher, Volkan}, journal={arXiv preprint arXiv:1804.08544}, year={2018} } @article{bertsekas1976penalty, title={On penalty and multiplier methods for constrained minimization}, author={Bertsekas, Dimitri P}, journal={SIAM Journal on Control and Optimization}, volume={14}, number={2}, pages={216--235}, year={1976}, publisher={SIAM} } -@article{tran2018adaptive, - title={An Adaptive Primal-Dual Framework for Nonsmooth Convex Minimization}, - author={Tran-Dinh, Quoc and Alacaoglu, Ahmet and Fercoq, Olivier and Cevher, Volkan}, - journal={arXiv preprint arXiv:1808.04648}, - year={2018} -} - - -@article{hestenes1969multiplier, - title={Multiplier and gradient methods}, - author={Hestenes, Magnus R}, - journal={Journal of optimization theory and applications}, - volume={4}, - number={5}, - pages={303--320}, - year={1969}, - publisher={Springer} -} - @article{powell1969method, title={A method for nonlinear constraints in minimization problems}, author={Powell, Michael JD}, journal={Optimization}, pages={283--298}, year={1969}, publisher={Academic Press} } @book{bertsekas2014constrained, title={Constrained optimization and 
Lagrange multiplier methods}, author={Bertsekas, Dimitri P}, year={2014}, publisher={Academic press} } @inproceedings{kulis2007fast, title={Fast low-rank semidefinite programming for embedding and clustering}, author={Kulis, Brian and Surendran, Arun C and Platt, John C}, booktitle={Artificial Intelligence and Statistics}, pages={235--242}, year={2007} } @article{xu2017inexact, title={Iteration complexity of inexact augmented Lagrangian methods for constrained convex programming}, author={Xu, Yangyang}, journal={arXiv preprint arXiv:1711.05812v2}, year={2017} } @article{nedelcu2014computational, title={Computational complexity of inexact gradient augmented Lagrangian methods: application to constrained MPC}, author={Nedelcu, Valentin and Necoara, Ion and Tran-Dinh, Quoc}, journal={SIAM Journal on Control and Optimization}, volume={52}, number={5}, pages={3109--3134}, year={2014}, publisher={SIAM} } @article{pataki1998rank, title={On the rank of extreme matrices in semidefinite programs and the multiplicity of optimal eigenvalues}, author={Pataki, G{\'a}bor}, journal={Mathematics of operations research}, volume={23}, number={2}, pages={339--358}, year={1998}, publisher={INFORMS} } @article{Barvinok1995problems, title={Problems of distance geometry and convex properties of quadratic maps}, author={Barvinok, Alexander I.}, journal={Discrete \& Computational Geometry}, volume={13}, number={2}, pages={189--202}, year={1995}, publisher={Springer} } @article{cartis2012complexity, title={Complexity bounds for second-order optimality in unconstrained optimization}, author={Cartis, Coralia and Gould, Nicholas IM and Toint, Ph L}, journal={Journal of Complexity}, volume={28}, number={1}, pages={93--108}, year={2012}, publisher={Elsevier} } @article{ghadimi2016accelerated, title={Accelerated gradient methods for nonconvex nonlinear and stochastic programming}, author={Ghadimi, Saeed and Lan, Guanghui}, journal={Mathematical Programming}, volume={156}, number={1-2}, pages={59--99}, year={2016}, publisher={Springer} } @article{boumal2014manopt, Author = {Boumal, Nicolas and Mishra, Bamdev and Absil, P-A and Sepulchre, Rodolphe}, Journal = {The Journal of Machine Learning Research}, Number = {1}, Pages = {1455--1459}, Publisher = {JMLR. 
org}, Title = {Manopt, a Matlab toolbox for optimization on manifolds}, Volume = {15}, Year = {2014}} @article{boumal2016global, Author = {Boumal, Nicolas and Absil, P-A and Cartis, Coralia}, Journal = {arXiv preprint arXiv:1605.08101}, Title = {Global rates of convergence for nonconvex optimization on manifolds}, Year = {2016}} @inproceedings{ge2016efficient, Author = {Ge, Rong and Jin, Chi and Netrapalli, Praneeth and Sidford, Aaron and others}, Booktitle = {International Conference on Machine Learning}, Pages = {2741--2750}, Title = {Efficient algorithms for large-scale generalized eigenvector computation and canonical correlation analysis}, Year = {2016}} @inproceedings{boumal2016non, Author = {Boumal, Nicolas and Voroninski, Vlad and Bandeira, Afonso}, Booktitle = {Advances in Neural Information Processing Systems}, Pages = {2757--2765}, Title = {The non-convex Burer-Monteiro approach works on smooth semidefinite programs}, Year = {2016}} @article{davis2011university, Author = {Davis, Timothy A and Hu, Yifan}, Journal = {ACM Transactions on Mathematical Software (TOMS)}, Number = {1}, Pages = {1}, Publisher = {ACM}, Title = {The University of Florida sparse matrix collection}, Volume = {38}, - Year = {2011}} + Year = {2011} +} @article{clason2018acceleration, Author = {Clason, Christian and Mazurenko, Stanislav and Valkonen, Tuomo}, Journal = {arXiv preprint arXiv:1802.03347}, Title = {Acceleration and global convergence of a first-order primal--dual method for nonconvex problems}, Year = {2018}} @article{chambolle2011first, Author = {Chambolle, Antonin and Pock, Thomas}, Journal = {Journal of mathematical imaging and vision}, Number = {1}, Pages = {120--145}, Publisher = {Springer}, Title = {A first-order primal-dual algorithm for convex problems with applications to imaging}, Volume = {40}, Year = {2011}} @article{tran2018smooth, Author = {Tran-Dinh, Quoc and Fercoq, Olivier and Cevher, Volkan}, Journal = {SIAM Journal on Optimization}, Number = {1}, Pages = {96--134}, Publisher = {SIAM}, Title = {A smooth primal-dual optimization framework for nonsmooth composite convex minimization}, Volume = {28}, Year = {2018}} @article{bhojanapalli2018smoothed, Author = {Bhojanapalli, Srinadh and Boumal, Nicolas and Jain, Prateek and Netrapalli, Praneeth}, Journal = {arXiv preprint arXiv:1803.00186}, Title = {Smoothed analysis for low-rank solutions to semidefinite programs in quadratic penalty form}, Year = {2018}} -@article{park2016provable, - Author = {Park, Dohyung and Kyrillidis, Anastasios and Bhojanapalli, Srinadh and Caramanis, Constantine and Sanghavi, Sujay}, - Journal = {arXiv preprint arXiv:1606.01316}, - Title = {Provable Burer-Monteiro factorization for a class of norm-constrained matrix problems}, - Year = {2016}} - -@article{mixon2016clustering, - Author = {Mixon, Dustin G and Villar, Soledad and Ward, Rachel}, - Journal = {arXiv preprint arXiv:1602.06612}, - Title = {Clustering subgaussian mixtures by semidefinite programming}, - Year = {2016}} - -@article{yurtsever2018conditional, - Author = {Yurtsever, Alp and Fercoq, Olivier and Locatello, Francesco and Cevher, Volkan}, - Journal = {arXiv preprint arXiv:1804.08544}, - Title = {A Conditional Gradient Framework for Composite Convex Minimization with Applications to Semidefinite Programming}, - Year = {2018}} @inproceedings{jin2017escape, Author = {Jin, Chi and Ge, Rong and Netrapalli, Praneeth and Kakade, Sham M and Jordan, Michael I}, Booktitle = {International Conference on Machine Learning}, Pages = {1724--1732}, Title = 
{How to Escape Saddle Points Efficiently}, Year = {2017}} @article{birgin2016evaluation, title={Evaluation complexity for nonlinear constrained optimization using unscaled KKT conditions and high-order models}, author={Birgin, Ernesto G and Gardenghi, JL and Martinez, Jos{\'e} Mario and Santos, SA and Toint, Ph L}, journal={SIAM Journal on Optimization}, volume={26}, number={2}, pages={951--967}, year={2016}, publisher={SIAM} } @article{liu2017linearized, Author = {Liu, Qinghua and Shen, Xinyue and Gu, Yuantao}, Journal = {arXiv preprint arXiv:1705.02502}, Title = {Linearized admm for non-convex non-smooth optimization with convergence analysis}, Year = {2017}} @article{wang2015global, Author = {Wang, Yu and Yin, Wotao and Zeng, Jinshan}, Journal = {arXiv preprint arXiv:1511.06324}, Title = {Global convergence of ADMM in nonconvex nonsmooth optimization}, - Year = {2015}} - -@book{bertsekas2014constrained, - title={Constrained optimization and Lagrange multiplier methods}, - author={Bertsekas, Dimitri P}, - year={2014}, - publisher={Academic press} + Year = {2015} } + @article{lan2016iteration, Author = {Lan, Guanghui and Monteiro, Renato DC}, Journal = {Mathematical Programming}, Number = {1-2}, Pages = {511--547}, Publisher = {Springer}, Title = {Iteration-complexity of first-order augmented Lagrangian methods for convex programming}, Volume = {155}, - Year = {2016}} - -@article{nedelcu2014computational, - title={Computational complexity of inexact gradient augmented Lagrangian methods: application to constrained MPC}, - author={Nedelcu, Valentin and Necoara, Ion and Tran-Dinh, Quoc}, - journal={SIAM Journal on Control and Optimization}, - volume={52}, - number={5}, - pages={3109--3134}, - year={2014}, - publisher={SIAM} + Year = {2016} } + @book{bertsekas1999nonlinear, title={Nonlinear programming}, author={Bertsekas, Dimitri P} } @article{hestenes1969multiplier, title={Multiplier and gradient methods}, author={Hestenes, Magnus R}, journal={Journal of optimization theory and applications}, volume={4}, number={5}, pages={303--320}, year={1969}, publisher={Springer} } @incollection{powell1978fast, title={A fast algorithm for nonlinearly constrained optimization calculations}, author={Powell, Michael JD}, booktitle={Numerical analysis}, pages={144--157}, year={1978}, publisher={Springer} } @article{burer2005local, title={Local minima and convergence in low-rank semidefinite programming}, author={Burer, Samuel and Monteiro, Renato DC}, journal={Mathematical Programming}, volume={103}, number={3}, pages={427--444}, year={2005}, publisher={Springer} } @article{burer2003nonlinear, title={A nonlinear programming algorithm for solving semidefinite programs via low-rank factorization}, author={Burer, Samuel and Monteiro, Renato DC}, journal={Mathematical Programming}, volume={95}, number={2}, pages={329--357}, year={2003}, publisher={Springer} } @article{singer2011angular, title={Angular synchronization by eigenvectors and semidefinite programming}, author={Singer, Amit}, journal={Applied and computational harmonic analysis}, volume={30}, number={1}, pages={20}, year={2011}, publisher={NIH Public Access} } @article{singer2011three, title={Three-dimensional structure determination from common lines in cryo-EM by eigenvectors and semidefinite programming}, author={Singer, Amit and Shkolnisky, Yoel}, journal={SIAM journal on imaging sciences}, volume={4}, number={2}, pages={543--572}, year={2011}, publisher={SIAM} } @inproceedings{song2007dependence, title={A dependence maximization view of 
clustering}, author={Song, Le and Smola, Alex and Gretton, Arthur and Borgwardt, Karsten M}, booktitle={Proceedings of the 24th international conference on Machine learning}, pages={815--822}, year={2007}, organization={ACM} } @inproceedings{mossel2015consistency, title={Consistency thresholds for the planted bisection model}, author={Mossel, Elchanan and Neeman, Joe and Sly, Allan}, booktitle={Proceedings of the forty-seventh annual ACM symposium on Theory of computing}, pages={69--75}, year={2015}, organization={ACM} } @incollection{lovasz2003semidefinite, title={Semidefinite programs and combinatorial optimization}, author={Lov{\'a}sz, L{\'a}szl{\'o}}, booktitle={Recent advances in algorithms and combinatorics}, pages={137--194}, year={2003}, publisher={Springer} } @article{khot2011grothendieck, title={Grothendieck-type inequalities in combinatorial optimization}, author={Khot, Subhash and Naor, Assaf}, journal={arXiv preprint arXiv:1108.2464}, year={2011} } @article{park2016provable, title={Provable Burer-Monteiro factorization for a class of norm-constrained matrix problems}, author={Park, Dohyung and Kyrillidis, Anastasios and Bhojanapalli, Srinadh and Caramanis, Constantine and Sanghavi, Sujay}, journal={arXiv preprint arXiv:1606.01316}, year={2016} } @article{mixon2016clustering, title={Clustering subgaussian mixtures by semidefinite programming}, author={Mixon, Dustin G and Villar, Soledad and Ward, Rachel}, journal={arXiv preprint arXiv:1602.06612}, year={2016} } -@inproceedings{kulis2007fast, - title={Fast low-rank semidefinite programming for embedding and clustering}, - author={Kulis, Brian and Surendran, Arun C and Platt, John C}, - booktitle={Artificial Intelligence and Statistics}, - pages={235--242}, - year={2007} -} @article{Peng2007, Author = {Peng, J. and Wei, Y.}, Date-Added = {2017-10-26 15:22:37 +0000}, Date-Modified = {2017-10-26 15:30:53 +0000}, Journal = {SIAM J. Optim.}, Number = {1}, Pages = {186--205}, Title = {Approximating {K}--means--type clustering via semidefinite programming}, Volume = {18}, Year = {2007}} @book{fletcher2013practical, title={Practical methods of optimization}, author={Fletcher, Roger}, year={2013}, publisher={John Wiley \& Sons} } @article{Bora2017, title = {Compressed {Sensing} using {Generative} {Models}}, url = {http://arxiv.org/abs/1703.03208}, urldate = {2018-10-25}, journal = {arXiv:1703.03208 [cs, math, stat]}, author = {Bora, Ashish and Jalal, Ajil and Price, Eric and Dimakis, Alexandros G.}, month = mar, year = {2017}, note = {arXiv: 1703.03208}, keywords = {Statistics - Machine Learning, Computer Science - Machine Learning, Computer Science - Information Theory}, } @book{nocedal2006numerical, title={Numerical Optimization}, author={Nocedal, J. 
and Wright, S.}, isbn={9780387400655}, lccn={2006923897}, series={Springer Series in Operations Research and Financial Engineering}, url={https://books.google.ch/books?id=VbHYoSyelFcC}, year={2006}, publisher={Springer New York} } @article{parikh2014proximal, title={Proximal algorithms}, author={Parikh, Neal and Boyd, Stephen and others}, journal={Foundations and Trends{\textregistered} in Optimization}, volume={1}, number={3}, pages={127--239}, year={2014}, publisher={Now Publishers, Inc.} -} \ No newline at end of file +} + +@inproceedings{Samangouei2018, +title={Defense-{GAN}: Protecting Classifiers Against Adversarial Attacks Using Generative Models}, +author={Pouya Samangouei and Maya Kabkab and Rama Chellappa}, +booktitle={International Conference on Learning Representations}, +year={2018}, +url={https://openreview.net/forum?id=BkJ3ibb0-}, +} + +@ARTICLE{Kingma2014, + author = {{Kingma}, Diederik P. and {Ba}, Jimmy}, + title = "{Adam: A Method for Stochastic Optimization}", + journal = {arXiv e-prints}, + keywords = {Computer Science - Machine Learning}, + year = 2014, + month = Dec, + eid = {arXiv:1412.6980}, + pages = {arXiv:1412.6980}, +archivePrefix = {arXiv}, + eprint = {1412.6980}, + primaryClass = {cs.LG}, + adsurl = {https://ui.adsabs.harvard.edu/\#abs/2014arXiv1412.6980K}, + adsnote = {Provided by the SAO/NASA Astrophysics Data System} +} + +@inproceedings{Madry2018, +title={Towards Deep Learning Models Resistant to Adversarial Attacks}, +author={Aleksander Madry and Aleksandar Makelov and Ludwig Schmidt and Dimitris Tsipras and Adrian Vladu}, +booktitle={International Conference on Learning Representations}, +year={2018}, +url={https://openreview.net/forum?id=rJzIBfZAb}, +} + +@inproceedings{Duchi2008, + author = {Duchi, John and Shalev-Shwartz, Shai and Singer, Yoram and Chandra, Tushar}, + title = {Efficient Projections Onto the L1-ball for Learning in High Dimensions}, + booktitle = {Proceedings of the 25th International Conference on Machine Learning}, + series = {ICML '08}, + year = {2008}, + isbn = {978-1-60558-205-4}, + location = {Helsinki, Finland}, + pages = {272--279}, + numpages = {8}, + url = {http://doi.acm.org/10.1145/1390156.1390191}, + doi = {10.1145/1390156.1390191}, + acmid = {1390191}, + publisher = {ACM}, + address = {New York, NY, USA}, +} + +@article{Kingma2013, + author = {{Kingma}, Diederik P and {Welling}, Max}, + title = "{Auto-Encoding Variational Bayes}", + journal = {arXiv e-prints}, + keywords = {Statistics - Machine Learning, Computer Science - Machine Learning}, + year = 2013, + month = Dec, + eid = {arXiv:1312.6114}, + pages = {arXiv:1312.6114}, +archivePrefix = {arXiv}, + eprint = {1312.6114}, + primaryClass = {stat.ML}, + adsurl = {https://ui.adsabs.harvard.edu/\#abs/2013arXiv1312.6114K}, + adsnote = {Provided by the SAO/NASA Astrophysics Data System} +} + diff --git a/ICML19/Arxiv version/bibliography2.bib b/ICML19/Arxiv version/bibliography2.bib new file mode 100644 index 0000000..0df9864 --- /dev/null +++ b/ICML19/Arxiv version/bibliography2.bib @@ -0,0 +1,15 @@ +%% This BibTeX bibliography file was created using BibDesk. +%% http://bibdesk.sourceforge.net/ + + +%% Saved with string encoding Unicode (UTF-8) + +% Created by Mehmet Fatih Sahin for nonconvex inexact augmented lagrangian paper +% December 14, Friday. 
01:52 am + + + + + + + diff --git a/ICML19/Arxiv version/figs/attack_error_time_clean.pdf b/ICML19/Arxiv version/figs/attack_error_time_clean.pdf new file mode 100644 index 0000000..8808acf Binary files /dev/null and b/ICML19/Arxiv version/figs/attack_error_time_clean.pdf differ diff --git a/ICML19/Arxiv version/iALM_main.pdf b/ICML19/Arxiv version/iALM_main.pdf new file mode 100644 index 0000000..d9355b7 Binary files /dev/null and b/ICML19/Arxiv version/iALM_main.pdf differ diff --git a/ICML19/Arxiv version/sections/experiments.tex b/ICML19/Arxiv version/sections/experiments.tex index 333ddb2..e173ef9 100644 --- a/ICML19/Arxiv version/sections/experiments.tex +++ b/ICML19/Arxiv version/sections/experiments.tex @@ -1,231 +1,276 @@ %!TEX root = ../iALM_main.tex

\section{Numerical evidence \label{sec:experiments}} We begin with a caveat: it is known that quasi-Newton methods, such as BFGS and lBFGS, might not converge for non-convex problems~\cite{dai2002convergence, mascarenhas2004bfgs}. For this reason, we used the trust region method as the second-order solver in our analysis in Section~\ref{sec:cvg rate}, which is well-studied for non-convex problems~\cite{cartis2012complexity}. Empirically, however, BFGS and lBFGS are extremely successful, and we have therefore opted for these solvers in this section, since the choice of subroutine does not affect Theorem~\ref{thm:main} as long as the subsolver performs well in practice.

\subsection{k-Means Clustering} Given data points $\{z_i\}_{i=1}^n$, the entries of the corresponding Euclidean distance matrix $D \in \RR^{n\times n}$ are $D_{i, j} = \left\| z_i - z_j\right\|^2$. Clustering is then the problem of finding a co-association matrix $Y\in \RR^{n\times n}$ such that $Y_{ij} = 1$ if points $z_i$ and $z_j$ are within the same cluster and $Y_{ij} = 0$ otherwise. In~\cite{Peng2007}, the authors provide an SDP relaxation of the clustering problem, specified as \begin{align} \begin{cases} \underset{Y \in \RR^{n\times n}}{\min} \text{tr}(DY) \\ Y\mathbf{1} = \mathbf{1}, ~\text{tr}(Y) = k,~ Y\succeq 0,~Y \geq 0, \end{cases} \label{eq:sdp_svx} \end{align} where $k$ is the number of clusters and $Y$ is required to be both positive semidefinite and entrywise nonnegative. Standard SDP solvers do not scale well with the number of data points~$n$, since they often require projection onto the semidefinite cone, with a complexity of $\mathcal{O}(n^3)$. We instead use the Burer-Monteiro splitting, sacrificing convexity to reduce the computational complexity. More specifically, we solve the program \begin{align} \label{eq:nc_cluster} \begin{cases} \underset{V \in \RR^{n\times r}}{\min} \text{tr}(DVV^{\top}) \\ VV^{\top}\mathbf{1} = \mathbf{1},~~ \|V\|_F^2 \le k, ~~V \geq 0, \end{cases} \end{align} where $\mathbf{1}\in \RR^n$ is the vector of all ones. Note that $Y \geq 0$ in \eqref{eq:sdp_svx} is replaced above by the much stronger, but easier to enforce, constraint $V \geq 0$; see~\cite{kulis2007fast} for the reasoning behind this relaxation. %. Trace constraint translates to a Frobenius norm constraint in factorized space. Semidefinite constraint is naturally removed due to factorization $Y=VV^{\top}$. %See~\citep{kulis2007fast} for the details of the relaxation.
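To make the relaxation explicit: under the factorization $Y = VV^{\top}$, the constraints of \eqref{eq:sdp_svx} translate as \begin{align*} \text{tr}(Y) = \text{tr}(VV^{\top}) = \|V\|_F^2, \qquad Y\mathbf{1} = VV^{\top}\mathbf{1}, \end{align*} while $Y = VV^{\top} \succeq 0$ holds automatically, so the semidefinite constraint disappears in the factorized space; the trace equality is relaxed to the inequality $\|V\|_F^2 \le k$ in \eqref{eq:nc_cluster}.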
Now, we can cast~\eqref{eq:nc_cluster} as an instance of~\eqref{prob:01}. Indeed, for every $i\le n$, let $x_i \in \RR^r$ denote the $i$th row of $V$. We next form $x \in \RR^d$ with $d = nr$ by expanding the factorized variable $V$, namely, \begin{align*} x = [x_1^{\top}, \cdots, x_n^{\top}]^{\top} \in \RR^d, \end{align*} and then set \begin{align*} f(x) =\sum_{i,j=1}^n D_{i, j} \left\langle x_i, x_j \right\rangle, \qquad g = \delta_C, \end{align*} \begin{align} A(x) = [x_1^{\top}\sum_{j=1}^n x_j -1, \cdots, x_n^{\top}\sum_{j=1}^n x_j-1]^{\top}, \end{align} where $C$ is the intersection of the positive orthant in $\RR^d$ with the Euclidean ball of radius $\sqrt{k}$. In Appendix~\ref{sec:verification2}, we somewhat informally verify that Theorem~\ref{thm:main} applies to~\eqref{prob:01} with $f,g,A$ specified above.
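As a concrete illustration, the minimal NumPy sketch below evaluates these ingredients in the factorized matrix form; the naming is ours (it is not part of our MATLAB implementation), and $V$ is assumed to be stored as an $n\times r$ array:
\begin{verbatim}
import numpy as np

def f(V, D):
    # f(x) = sum_{i,j} D_ij <x_i, x_j> = tr(D V V^T),
    # where the rows of V are the blocks x_i of x
    return np.trace(D @ V @ V.T)

def A(V):
    # i-th entry: x_i^T (sum_j x_j) - 1, i.e., V V^T 1 - 1
    ones = np.ones(V.shape[0])
    return V @ (V.T @ ones) - ones

def project_C(V, k):
    # C: positive orthant intersected with the Frobenius ball
    # of radius sqrt(k); since the orthant is a cone, projecting
    # onto it and then rescaling into the centered ball gives
    # the exact projection onto the intersection
    V = np.maximum(V, 0.0)
    nrm = np.linalg.norm(V, 'fro')
    return V if nrm <= np.sqrt(k) else V * (np.sqrt(k) / nrm)
\end{verbatim}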
In our simulations, we use two different solvers for Step~2 of Algorithm~\ref{Algo:2}, namely, APGM and lBFGS. APGM is a solver for non-convex problems of the form~\eqref{e:exac} with convergence guarantees to first-order stationarity, as discussed in Section~\ref{sec:cvg rate}. lBFGS is a limited-memory version of the BFGS algorithm in~\cite{fletcher2013practical} that approximately leverages the second-order information of the problem. We compare our approach against the following convex methods: \begin{itemize} \item HCGM: the Homotopy-based Conditional Gradient Method in~\cite{yurtsever2018conditional}, which directly solves~\eqref{eq:sdp_svx}. \item SDPNAL+: a second-order augmented Lagrangian method for solving SDPs with nonnegativity constraints~\cite{yang2015sdpnal}. \end{itemize} As for the dataset, our experimental setup is similar to that of~\cite{mixon2016clustering}. We use the publicly available fashion-MNIST data in \cite{xiao2017/online}, which was released as a possible replacement for the MNIST handwritten digits. Each data point is a $28\times 28$ gray-scale image, associated with a label from ten classes, labeled from $0$ to $9$. First, we extract meaningful features from this dataset using a simple two-layer neural network with a sigmoid activation function. Then, we apply this neural network to 1000 test samples from the same dataset, which gives us a vector of length $10$ for each data point, where each entry represents the posterior probability of the corresponding class. Finally, we form the $\ell_2$ distance matrix ${D}$ from these probability vectors. The results are depicted in Figure~\ref{fig:clustering}. We implemented three of the algorithms in MATLAB and used the software package provided for SDPNAL+, which contains mex files; the non-convex approach would converge considerably faster with a mex implementation.

\begin{figure}[] % \includegraphics[width=0.4\textwidth,natwidth=1300,natheight=1300]{bp_fig1.pdf} \begin{center} {\includegraphics[width=.4\columnwidth]{figs/clustering_fig4_times_linearbeta_last.pdf}} {\includegraphics[width=.4\columnwidth]{figs/clustering_fig4_iter_linearbeta_last.pdf}} %\centerline{\includegraphics[width=1\columnwidth]{figs/bp_fig2_small.pdf}} \caption{Convergence of different algorithms for k-means clustering on the fashion-MNIST dataset. %Here, we set the rank to be equal to 20 for the non-convex approaches. The solution rank for the template~\eqref{eq:sdp_svx} is known and equals the number of clusters $k$ (Theorem~1 of~\cite{kulis2007fast}). As discussed in~\cite{tepper2018clustering}, setting the rank $r>k$ leads to a more accurate reconstruction at the expense of speed. Therefore, we set the rank to 20.} \label{fig:clustering} %(left:$[n=100$, $d=10^4]$, right:$[n=34$, $d=10^2$])} \end{center} \end{figure}

\subsection{Basis Pursuit} Basis Pursuit (BP) finds the sparsest solution of an under-determined system of linear equations, namely, \begin{align} \begin{cases} \min_{z} \|z\|_1 \\ Bz = b, \end{cases} \label{eq:bp_main} \end{align} where $B \in \RR^{n \times d}$ and $b \in \RR^{n}$. BP has found many applications in machine learning, statistics and signal processing \cite{chen2001atomic, candes2008introduction, arora2018compressed}. A large number of primal-dual convex optimization algorithms have been proposed to solve BP, including, but not limited to,~\cite{tran2018adaptive,chambolle2011first}. There is also a long line of work \cite{beck2009fast} that handles the sparse regression problem via regularization with the $\ell_1$ norm. %\textbf{AE: Fatih, maybe mention a few other algorithms including asgard and also say why we can't use fista and nista.} Here, we take a different approach and cast~\eqref{eq:bp_main} as an instance of~\eqref{prob:01}. Note that any $z \in \RR^d$ can be decomposed as $z = z^+ - z^-$, where $z^+,z^-\in \RR^d$ are the positive and negative parts of $z$, respectively. Then consider the change of variables $z^+ = u_1^{\circ 2}$ and $z^-= u_2^{\circ 2} \in \RR^d$, where $\circ$ denotes element-wise power. Next, we concatenate $u_1$ and $u_2$ as $x := [ u_1^{\top}, u_2^{\top} ]^{\top} \in \RR^{2d}$ and define $\overline{B} := [B, -B] \in \RR^{n \times 2d}$. Then, \eqref{eq:bp_main} is equivalent to \eqref{prob:01} with \begin{align} f(x) =& \|x\|_2^2, \quad g(x) = 0\nonumber\\ A(x) =& \overline{B}x^{\circ 2}- b. \label{eq:bp-equiv} \end{align} In Appendix~\ref{sec:verification1}, we verify with minimal detail that Theorem~\ref{thm:main} indeed applies to~\eqref{prob:01} with the above $f,A$.
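To see the equivalence, note that when $z^+$ and $z^-$ are the positive and negative parts of $z$, \begin{align*} \|z\|_1 = \mathbf{1}^{\top}(z^+ + z^-) = \mathbf{1}^{\top}(u_1^{\circ 2} + u_2^{\circ 2}) = \|u_1\|_2^2 + \|u_2\|_2^2 = \|x\|_2^2, \end{align*} while $Bz = B(u_1^{\circ 2} - u_2^{\circ 2}) = \overline{B} x^{\circ 2}$, so the constraint $Bz = b$ reads $A(x) = 0$.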
%Let $\mu(B)$ denote the \emph{coherence} of ${B}$, namely, %\begin{align} %\mu = \max_{i,j} | \langle B_i, B_j \rangle |, %\end{align} %where $B_i\in \RR^n$ is the $i$th column of $B$. Let also $z_k,z_*\in \RR^d$ denote the vectors corresponding to $x_k,x_*$. We rewrite the last line of \eqref{eq:cnd-bp-pre} as %\begin{align} %& 2 \| \text{diag}(x_k) \overline{B}^\top ( \overline{B}x^{\circ 2} -b) \| \nonumber\\ %& = 2 \| \text{diag}(x_k) \overline{B}^\top \overline{B} (x^{\circ 2} -x_*^{\circ 2}) \| \nonumber\\ %& = 2 \| \text{diag}(x_k) {B}^\top B(x_k- x_*) \| \nonumber\\ %& = 2 \| \text{diag}(x_k) {B}^\top B(x_k- x_*) \|_{\infty} \nonumber\\ %& \ge 2 \| \text{diag}(x_k) \text{diag}({B}^\top B) (x_k- x_*) \|_{\infty}\nonumber\\ %& \qquad - 2 \| \text{diag}(x_k) \text{hollow}({B}^\top B) (x_k- x_*) \|_{\infty}, %\end{align} %where $\text{hollow}$ returns the hollow part of th % % and let $\overline{B}=U\Sigma V^\top$ be its thin singular value decomposition, where $U\in \RR^{n\times n}, V\in \RR^{d\times n}$ have orthonormal columns and the diagonal matrix $\Sigma\in \RR^{n\times n}$ contains the singular values $\{\sigma_i\}_{i=1}^r$. Let also $x_{k,i},U_i$ be the $i$th entry of $x_k$ and the $i$th row of $U_i$, respectively. Then we bound the last line above as %\begin{align} %& 2 \| \text{diag}(x_k) \overline{B}^\top ( \overline{B}x^{\circ 2} -b) \| \nonumber\\ %& = 2 \| \text{diag}(x_k) U \Sigma^2 U^\top ( x^{\circ 2} - x_*^{\circ 2}) \| \nonumber\\ %& \ge 2 \| \text{diag}(x_k) U \Sigma^2 U^\top ( x^{\circ 2} - x_*^{\circ 2}) \|_{\infty} \nonumber\\ %& = \max_i |x_{k,i}| \cdot | U_i ^\top \Sigma \cdot \Sigma U^\top ( x^{\circ 2} - x_*^{\circ 2}) | \nonumber\\ %& %\end{align} % %where $\tilde{b}_{i,j} = (\tilde{B})_{ij},~ i\in [1:n]$ and $j \in [1:2d]$. %\begin{align*} %-DA(x)^\top(A(x) - b) = -2x \odot (\tilde{B}^\top (A(x) - b)), %\end{align*} %where $\odot$ denotes hadamard product. %\begin{align*} %& \text{dist} \left( -DA(x)^\top (A(x)-b), \frac{\partial g(x)}{\b} \right) \nonumber\\ %& = \text{dist} \left( -DA(x)^\top (A(x)-b), \{0\} \right) \nonumber\\ %& = \left\| -DA(x)^\top (A(x)-b) \right\| \nonumber\\ %& \ge ?????, %\end{align*} %Hence, this completes the proof for regularity condition.

We draw the entries of $B$ independently from a zero-mean and unit-variance Gaussian distribution. For a fixed sparsity level $k$, we choose the support of $z_*\in \RR^d$ at random and draw its nonzero amplitudes from the standard Gaussian distribution. %We then pick a sparsity level $k$ and choose $k$ indexes, i.e, $\Omega \subset [1:d]$, which are corresponding to nonzero entries of $z$. %We then assign values from normal distribution to those entries. The measurement vector is then created as $b = Bz_* + \epsilon$, where $\epsilon$ is a noise vector with entries drawn independently from the zero-mean Gaussian distribution with variance $\sigma^2=10^{-6}$.

\begin{figure}[] \begin{center} {\includegraphics[width=1\columnwidth]{figs/bp_fig1_subsolvers.pdf}} \caption{Convergence with different subsolvers for the aforementioned non-convex relaxation.} \label{fig:bp1} %(left:$[n=100$, $d=10^4]$, right:$[n=34$, $d=10^2$])} \end{center} \end{figure} %\vspace{-3mm}

Figure~\ref{fig:bp1} compiles our results for the proposed relaxation. It is indeed interesting to see that this type of non-convex relaxation recovers the solution of the convex program and that first-order methods succeed.

\paragraph{Discussion:} The true potential of our reformulation is in dealing with norms more structured than $\ell_1$, for which computing the proximal operator is often intractable. One such case is the latent group lasso norm~\cite{obozinski2011group}, defined as \begin{align*} \|z\|_{\Omega} = \sum_{i=1}^I \| z_{\Omega_i} \|, \end{align*} where $\{\Omega_i\}_{i=1}^I$ are (not necessarily disjoint) index sets of $\{1,\cdots,d\}$. Although not studied here, we believe that the non-convex framework presented in this paper can serve to solve more complicated problems, such as the latent group lasso. We leave this research direction for future work.

\subsection{Adversarial Denoising with GANs} -In the appendix, we provide a contemporary application example that our template applies. +\label{exp:ellinf} +Projection onto the range of a neural network $G:\mathbb{R}^s \to \mathbb{R}^d$ has been considered by~\cite{Samangouei2018, Ilyas2017} as a defense mechanism against adversarial examples~\cite{Szegedy2013}. In their settings, samples are denoised by projecting onto the range of a pre-trained \textit{generator}, from either the GAN \cite{Goodfellow2014} or the VAE framework \cite{Kingma2013}, before being fed to a classifier. Even though the adversarial noise introduced is typically bounded in the $\ell_\infty$ norm \cite{Madry2018}, the projection is performed in the $\ell_2$ metric. We instead propose to project directly in the $\ell_\infty$ norm that limits the attacker, i.e., we solve the program +\begin{align} +\begin{cases} +\min_{x, z} \|x - x^\natural \|_\infty \\ +G(z) = x, +\end{cases} +\label{eq:darn} +\end{align} +with our proposed Algorithm~\ref{Algo:2} (AL). On the optimization side, the $\ell_2$ projection is usually performed by solving the non-convex program +\begin{equation} + \min_{z} \|G(z) - x^\natural \|_2^2 +\label{eq:darn2} +\end{equation} +via off-the-shelf optimizers like gradient descent (GD) or ADAM \cite{Kingma2014}. We assign an equal computational budget to ADAM and GD for solving \eqref{eq:darn2}, and to our algorithm (AL) for solving \eqref{eq:darn}. We evaluate on a test set of 100 adversarial examples from the MNIST dataset, obtained with the Projected Gradient Algorithm of \cite{Madry2018} with 30 iterations, step size 0.01 and attack size 0.2. For the classifier, we use a standard convolutional network trained on clean MNIST samples.
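For reference, the $\ell_2$ baseline \eqref{eq:darn2} can be implemented in a few lines. The PyTorch sketch below uses our own naming: the pre-trained generator \texttt{G} and the adversarial sample \texttt{x\_nat} are assumed given, and the latent dimension, step count and learning rate are illustrative defaults rather than the exact budgets used in our experiments.
\begin{verbatim}
import torch

def l2_project(G, x_nat, s=64, steps=1000, lr=1e-2):
    # solve min_z ||G(z) - x_nat||_2^2 with ADAM,
    # starting from a random latent code z
    z = torch.randn(s, requires_grad=True)
    opt = torch.optim.Adam([z], lr=lr)
    for _ in range(steps):
        opt.zero_grad()
        loss = (G(z) - x_nat).pow(2).sum()
        loss.backward()
        opt.step()
    # the denoised sample G(z) is then fed to the classifier
    return G(z).detach()
\end{verbatim}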
+\textbf{Results and discussion. } +We keep track of the denoised images along the trajectories of the algorithms, sampling 1500 equally spaced points in time, and plot the test error achieved in Figure~\ref{fig:attack_error_time}. We observe that $\ell_\infty$ denoising is the superior defense mechanism against $\ell_\infty$-bounded attacks, achieving a lower error rate and doing so faster. + +\begin{figure}[h] + \centering +\includegraphics[width=.4\columnwidth]{figs/attack_error_time_clean.pdf} +%%\centerline{\includegraphics[width=1\columnwidth]{figs/bp_fig2_small.pdf}} +\caption{Error rate on denoised adversarial examples vs.\ time.} +\label{fig:attack_error_time} +\end{figure} + %This relaxation transformed a non-smooth objective into a smooth one while losing the linearity of the equality constraint. %\subsection{Latent Group Lasso \label{sec:latent lasso}} % %For a collection of subsets $\Omega=\{\Omega_i\}_{i=1}^{I}\subset [1:p]$, the latent group Lasso norm on $\RR^p$ takes $z\in \RR^p$ to %\begin{align} %\|z\|_{\Omega,1} = \sum_{i=1}^I \| z_{\Omega_i} \|. %\end{align} %Note that we do not require $\Omega_i$ to be disjoint. For $B\in \RR^{n\times p}$, $b\in \RR^n$, and $\lambda>0$, consider the latent group lasso as %\begin{align} %\min_{z\in \RR^d} \frac{1}{2}\| Bz - b\|_2^2+ \lambda \| z\|_{\Omega,1}. %\label{eq:group_lasso} %\end{align} %Because $\Omega$ is not a partition of $[1:p]$, computing the proximal operator of $\|\cdot\|_{\Omega}$ is often intractable, ruling out proximal gradient descent and similar algorithms for solving Program~\eqref{eq:group_lasso}. Instead, often Program~\eqref{eq:group_lasso} is solved by Alternating Direction Method of Multipliers (ADMM). More concretely, ??? % %We take a radically different approach here and cast Program~\eqref{eq:group_lasso} as an instance of Program~\eqref{prob:01}. More specifically, let $z^+,z^-\in \RR^p$ be positive and negative parts of $z$, so that $z=z^+-z^-$. Let us introduce the nonnegative $u\in \RR^p$ such that $z^++z^- = u^{\circ 2}$, where we used $\circ$ notation to show entrywise power.
We may now write that %\begin{align} %\| z^++z^- \|_{\Omega,1} %& = \| u^{\circ 2} \|_{\Omega,1 } =: \|u\|_{\Omega,2}^2, %\end{align} %Unlike $\|\cdot\|_{\Omega,1}$, note that $\|\cdot\|_{\Omega,2}^2$ is differentiable and, in fact, there exists a positive semidefinite matrix $Q\in \RR^{d\times d}$ such that $\|u\|_{\Omega,2}^2=u^\top Q_\Omega u$. Let us form $x=[(z^+)^\top,(z^-)^\top, u^\top]^\top\in \RR^{3d}$ and set %\begin{align*} %f(x) = \frac{1}{2}\| Bz^+-Bz^- -b\|_1+ \|u\|_{\Omega,2}^2, %\end{align*} %\begin{align} %g(x) = 0, \qquad A(x) = z^++z^--u^{\circ 2}. %\end{align} %We can now apply Algorithm~\ref{Algo:2} to solve Program~\eqref{prob:01} with $f,g,A$ specified above. % % %\paragraph{Convergence rate.} %Clearly, $f,A$ are strongly smooth in the sense that \eqref{eq:smoothness basic} holds with proper $\lambda_f,\lambda_A$. Likewise, both $f$ and the Jacobian $DA$ are continuous and the restricted Lipschitz constants $\lambda'_f,\lambda'_A$ in \eqref{eq:defn_lambda'} are consequently well-defined and finite for a fixed $\rho'>0$. We next verify the key regularity condition in Theorem~\ref{thm:main}, namely, \eqref{eq:regularity}. Note that %\begin{align*} %DA(x) & = %\left[ %\begin{array}{ccc} %I_p & I_p & -2\text{diag}(u) %\end{array} %\right]\in \RR^{d\times 3d}, %\end{align*} %\begin{align*} %-DA(x)^\top A(x) = %\left[ %\begin{array}{c} %-z^+-z^-+u^{\circ 2} \\ %-z^+-z^-+u^{\circ 2}\\ %2\text{diag}(u)( z^++z^--u^{\circ 2}) %\end{array} %\right], %\end{align*} %\begin{align*} %& \text{dist} \left( -DA(x)^\top A(x), \frac{\partial g(x)}{\b} \right) \nonumber\\ %& = \text{dist} \left( -DA(x)^\top A(x), \{0\} \right) \nonumber\\ %& = \left\| -DA(x)^\top A(x) \right\| \nonumber\\ %& \ge \sqrt{2} \| A(x)\|, %\end{align*} %namely, \eqref{eq:regularity} holds with $\nu=1$.