diff --git a/ICML19/Arxiv version/.DS_Store b/ICML19/Arxiv version/.DS_Store index 8b21086..27f4c44 100644 Binary files a/ICML19/Arxiv version/.DS_Store and b/ICML19/Arxiv version/.DS_Store differ diff --git a/ICML19/Drafts/main.tex b/ICML19/Drafts/main.tex index 6995253..2933903 100644 --- a/ICML19/Drafts/main.tex +++ b/ICML19/Drafts/main.tex @@ -1,174 +1,175 @@ %%%%%%%% ICML 2019 EXAMPLE LATEX SUBMISSION FILE %%%%%%%%%%%%%%%%% \documentclass{article} % Recommended, but optional, packages for figures and better typesetting: \usepackage{microtype} \usepackage{graphicx} \usepackage{subfigure} \usepackage{booktabs} % for professional tables % hyperref makes hyperlinks in the resulting PDF. % If your build breaks (sometimes temporarily if a hyperlink spans a page) % please comment out the following usepackage line and replace % \usepackage{icml2019} with \usepackage[nohyperref]{icml2019} above. \usepackage{hyperref} % Attempt to make hyperref and algorithmic work together better: \newcommand{\theHalgorithm}{\arabic{algorithm}} % Use the following line for the initial blind version submitted for review: \usepackage{icml2019} % If accepted, instead use the following line for the camera-ready submission: %\usepackage[accepted]{icml2019} % The \icmltitle you define below is probably too long as a header. % Therefore, a short form for the running title is supplied here: \icmltitlerunning{Inexact Augmented Lagrangian Framework for Non-Convex Optimization} % Our own packages \usepackage{amsthm} \usepackage{amsmath} \usepackage{amssymb} \usepackage{graphicx} \usepackage{algorithm} \usepackage{algorithmic} \usepackage{color} %\usepackage{cite} \usepackage{enumitem} % AE's commands \newcommand{\RR}{\mathbb{R}} \newcommand{\edita}[1]{{\color{blue} #1}} \newcommand{\notea}[1]{{\color{magenta} \textbf{Note:AE: #1}}} \newcommand{\ol}{\overline} \newcommand{\Null}{\operatorname{null}} \newcommand{\relint}{\operatorname{relint}} \newcommand{\row}{\operatorname{row}} \newcommand{\s}{\sigma} \newcommand{\editaa}[1]{{\color{red} #1}} \renewcommand{\b}{\beta} \renewcommand{\L}{\mathcal{L}} % Lagrangian \renewcommand{\i}{\iota} \newcommand{\g}{\gamma} \newcommand{\cone}{\operatorname{cone}} \newcommand{\argmin}{\operatorname{argmin}} \newcommand{\dist}{\operatorname{dist}} % MFS's commands \newcommand{\editf}[1]{{\color[rgb]{1,0.4,0.4} #1}} %\DeclareMathOperator*{\argmax}{argmax} % thin space, limits underneath in displays % Theorem style \newtheorem{theorem}{Theorem}[section] % reset theorem numbering for each section \newtheorem{question}[theorem]{Question} \newtheorem{lemma}[theorem]{Lemma} \newtheorem{corollary}[theorem]{Corollary} \newtheorem{definition}{Definition} \newtheorem{proposition}{Proposition} \begin{document} \twocolumn[ \icmltitle{An Inexact Augmented Lagrangian Framework \\ for Non-Convex Optimization with Nonlinear Constraints} % It is OKAY to include author information, even for blind % submissions: the style file will automatically remove it for you % unless you've provided the [accepted] option to the icml2019 % package. % List of affiliations: The first argument should be a (short) % identifier you will use later to specify author affiliations % Academic affiliations should list Department, University, City, Region, Country % Industry affiliations should list Company, City, Region, Country % You can specify symbols, otherwise they are numbered in order. % Ideally, you should not use this facility. Affiliations will be numbered % in order of appearance and this is the preferred way. 
\icmlsetsymbol{equal}{*} \begin{icmlauthorlist} \icmlauthor{Aeiau Zzzz}{equal,to} \icmlauthor{Bauiu C.~Yyyy}{equal,to,goo} \icmlauthor{Cieua Vvvvv}{goo} \icmlauthor{Iaesut Saoeu}{ed} \icmlauthor{Fiuea Rrrr}{to} \icmlauthor{Tateu H.~Yasehe}{ed,to,goo} \icmlauthor{Aaoeu Iasoh}{goo} \icmlauthor{Buiui Eueu}{ed} \icmlauthor{Aeuia Zzzz}{ed} \icmlauthor{Bieea C.~Yyyy}{to,goo} \icmlauthor{Teoau Xxxx}{ed} \icmlauthor{Eee Pppp}{ed} \end{icmlauthorlist} \icmlaffiliation{to}{Department of Computation, University of Torontoland, Torontoland, Canada} \icmlaffiliation{goo}{Googol ShallowMind, New London, Michigan, USA} \icmlaffiliation{ed}{School of Computation, University of Edenborrow, Edenborrow, United Kingdom} %\icmlcorrespondingauthor{Cieua Vvvvv}{c.vvvvv@googol.com} %\icmlcorrespondingauthor{Eee Pppp}{ep@eden.co.uk} % You may provide any keywords that you % find helpful for describing your paper; these are used to populate % the "keywords" metadata in the PDF but will not be shown in the document \icmlkeywords{Machine Learning, ICML} \vskip 0.3in ] % this must go after the closing bracket ] following \twocolumn[ ... % This command actually creates the footnote in the first column % listing the affiliations and the copyright notice. % The command takes one argument, which is text to display at the start of the footnote. % The \icmlEqualContribution command is standard text for equal contribution. % Remove it (just {}) if you do not need this facility. \icmlcorrespondingauthor{}{} %\printAffiliationsAndNotice{} % leave blank if no need to mention equal contribution \printAffiliationsAndNotice{\icmlEqualContribution} % otherwise use the standard text. %\input{preamble.tex} \input{sections/abstract.tex} \input{sections/introduction.tex} \input{sections/preliminaries.tex} \input{sections/AL.tex} \input{sections/guarantees.tex} +\input{sections/slater.tex} %\input{sections/guarantees2.tex} \input{sections/related_works.tex} \input{sections/experiments.tex} %\begin{acknowledgements} %If you'd like to thank anyone, place your comments here %and remove the percent signs. %\end{acknowledgements} % BibTeX users please use one of -\newpage -\bibliographystyle{icml2019} % basic style, author-year citations -\bibliography{bibliography.bib} % name your BibTeX data base +%\newpage +%\bibliographystyle{icml2019} % basic style, author-year citations +%\bibliography{bibliography.bib} % name your BibTeX data base %\newpage -%\appendix +\appendix %\input{sections/AL_theory.tex} -%\input{sections/appendix.tex} +\input{sections/appendix.tex} \end{document} % end of file template.tex diff --git a/tex/Paper/bibliography.bib b/tex/Paper/bibliography.bib index 48592d2..87e481d 100644 --- a/tex/Paper/bibliography.bib +++ b/tex/Paper/bibliography.bib @@ -1,272 +1,280 @@ %% This BibTeX bibliography file was created using BibDesk. %% http://bibdesk.sourceforge.net/ %% Saved with string encoding Unicode (UTF-8) % Created by Mehmet Fatih Sahin for nonconvex inexact augmented lagrangian paper % December 14, Friday. 01:52 am @article{boumal2014manopt, Author = {Boumal, Nicolas and Mishra, Bamdev and Absil, P-A and Sepulchre, Rodolphe}, Journal = {The Journal of Machine Learning Research}, Number = {1}, Pages = {1455--1459}, Publisher = {JMLR. 
org}, Title = {Manopt, a Matlab toolbox for optimization on manifolds}, Volume = {15}, Year = {2014}} @article{boumal2016global, Author = {Boumal, Nicolas and Absil, P-A and Cartis, Coralia}, Journal = {arXiv preprint arXiv:1605.08101}, Title = {Global rates of convergence for nonconvex optimization on manifolds}, Year = {2016}} @inproceedings{ge2016efficient, Author = {Ge, Rong and Jin, Chi and Netrapalli, Praneeth and Sidford, Aaron and others}, Booktitle = {International Conference on Machine Learning}, Pages = {2741--2750}, Title = {Efficient algorithms for large-scale generalized eigenvector computation and canonical correlation analysis}, Year = {2016}} @inproceedings{boumal2016non, Author = {Boumal, Nicolas and Voroninski, Vlad and Bandeira, Afonso}, Booktitle = {Advances in Neural Information Processing Systems}, Pages = {2757--2765}, Title = {The non-convex Burer-Monteiro approach works on smooth semidefinite programs}, Year = {2016}} @article{davis2011university, Author = {Davis, Timothy A and Hu, Yifan}, Journal = {ACM Transactions on Mathematical Software (TOMS)}, Number = {1}, Pages = {1}, Publisher = {ACM}, Title = {The University of Florida sparse matrix collection}, Volume = {38}, Year = {2011}} @article{clason2018acceleration, Author = {Clason, Christian and Mazurenko, Stanislav and Valkonen, Tuomo}, Journal = {arXiv preprint arXiv:1802.03347}, Title = {Acceleration and global convergence of a first-order primal--dual method for nonconvex problems}, Year = {2018}} @article{chambolle2011first, Author = {Chambolle, Antonin and Pock, Thomas}, Journal = {Journal of mathematical imaging and vision}, Number = {1}, Pages = {120--145}, Publisher = {Springer}, Title = {A first-order primal-dual algorithm for convex problems with applications to imaging}, Volume = {40}, Year = {2011}} @article{tran2018smooth, Author = {Tran-Dinh, Quoc and Fercoq, Olivier and Cevher, Volkan}, Journal = {SIAM Journal on Optimization}, Number = {1}, Pages = {96--134}, Publisher = {SIAM}, Title = {A smooth primal-dual optimization framework for nonsmooth composite convex minimization}, Volume = {28}, Year = {2018}} @article{bhojanapalli2018smoothed, Author = {Bhojanapalli, Srinadh and Boumal, Nicolas and Jain, Prateek and Netrapalli, Praneeth}, Journal = {arXiv preprint arXiv:1803.00186}, Title = {Smoothed analysis for low-rank solutions to semidefinite programs in quadratic penalty form}, Year = {2018}} @article{park2016provable, Author = {Park, Dohyung and Kyrillidis, Anastasios and Bhojanapalli, Srinadh and Caramanis, Constantine and Sanghavi, Sujay}, Journal = {arXiv preprint arXiv:1606.01316}, Title = {Provable Burer-Monteiro factorization for a class of norm-constrained matrix problems}, Year = {2016}} @article{mixon2016clustering, Author = {Mixon, Dustin G and Villar, Soledad and Ward, Rachel}, Journal = {arXiv preprint arXiv:1602.06612}, Title = {Clustering subgaussian mixtures by semidefinite programming}, Year = {2016}} @article{yurtsever2018conditional, Author = {Yurtsever, Alp and Fercoq, Olivier and Locatello, Francesco and Cevher, Volkan}, Journal = {arXiv preprint arXiv:1804.08544}, Title = {A Conditional Gradient Framework for Composite Convex Minimization with Applications to Semidefinite Programming}, Year = {2018}} @inproceedings{jin2017escape, Author = {Jin, Chi and Ge, Rong and Netrapalli, Praneeth and Kakade, Sham M and Jordan, Michael I}, Booktitle = {International Conference on Machine Learning}, Pages = {1724--1732}, Title = {How to Escape Saddle Points Efficiently}, Year = 
{2017}} @article{doi:10.1137/15M1031631, Author = {Birgin, E. and Gardenghi, J. and Mart{\'\i}nez, J. and Santos, S. and Toint, P.}, Doi = {10.1137/15M1031631}, Eprint = {https://doi.org/10.1137/15M1031631}, Journal = {SIAM Journal on Optimization}, Number = {2}, Pages = {951-967}, Title = {Evaluation Complexity for Nonlinear Constrained Optimization Using Unscaled KKT Conditions and High-Order Models}, Url = {https://doi.org/10.1137/15M1031631}, Volume = {26}, Year = {2016}, Bdsk-Url-1 = {https://doi.org/10.1137/15M1031631}, Bdsk-Url-2 = {http://dx.doi.org/10.1137/15M1031631}} @article{liu2017linearized, Author = {Liu, Qinghua and Shen, Xinyue and Gu, Yuantao}, Journal = {arXiv preprint arXiv:1705.02502}, Title = {Linearized admm for non-convex non-smooth optimization with convergence analysis}, Year = {2017}} @article{wang2015global, Author = {Wang, Yu and Yin, Wotao and Zeng, Jinshan}, Journal = {arXiv preprint arXiv:1511.06324}, Title = {Global convergence of ADMM in nonconvex nonsmooth optimization}, Year = {2015}} @book{bertsekas2014constrained, title={Constrained optimization and Lagrange multiplier methods}, author={Bertsekas, Dimitri P}, year={2014}, publisher={Academic press} } @article{lan2016iteration, Author = {Lan, Guanghui and Monteiro, Renato DC}, Journal = {Mathematical Programming}, Number = {1-2}, Pages = {511--547}, Publisher = {Springer}, Title = {Iteration-complexity of first-order augmented Lagrangian methods for convex programming}, Volume = {155}, Year = {2016}} @article{nedelcu2014computational, title={Computational complexity of inexact gradient augmented Lagrangian methods: application to constrained MPC}, author={Nedelcu, Valentin and Necoara, Ion and Tran-Dinh, Quoc}, journal={SIAM Journal on Control and Optimization}, volume={52}, number={5}, pages={3109--3134}, year={2014}, publisher={SIAM} } @book{bertsekas1999nonlinear, title={Nonlinear programming}, author={Bertsekas, Dimitri P} } @article{hestenes1969multiplier, title={Multiplier and gradient methods}, author={Hestenes, Magnus R}, journal={Journal of optimization theory and applications}, volume={4}, number={5}, pages={303--320}, year={1969}, publisher={Springer} } @incollection{powell1978fast, title={A fast algorithm for nonlinearly constrained optimization calculations}, author={Powell, Michael JD}, booktitle={Numerical analysis}, pages={144--157}, year={1978}, publisher={Springer} } @article{burer2005local, title={Local minima and convergence in low-rank semidefinite programming}, author={Burer, Samuel and Monteiro, Renato DC}, journal={Mathematical Programming}, volume={103}, number={3}, pages={427--444}, year={2005}, publisher={Springer} } @article{burer2003nonlinear, title={A nonlinear programming algorithm for solving semidefinite programs via low-rank factorization}, author={Burer, Samuel and Monteiro, Renato DC}, journal={Mathematical Programming}, volume={95}, number={2}, pages={329--357}, year={2003}, publisher={Springer} } @article{singer2011angular, title={Angular synchronization by eigenvectors and semidefinite programming}, author={Singer, Amit}, journal={Applied and computational harmonic analysis}, volume={30}, number={1}, pages={20}, year={2011}, publisher={NIH Public Access} } @article{singer2011three, title={Three-dimensional structure determination from common lines in cryo-EM by eigenvectors and semidefinite programming}, author={Singer, Amit and Shkolnisky, Yoel}, journal={SIAM journal on imaging sciences}, volume={4}, number={2}, pages={543--572}, year={2011}, publisher={SIAM} } 
@inproceedings{song2007dependence, title={A dependence maximization view of clustering}, author={Song, Le and Smola, Alex and Gretton, Arthur and Borgwardt, Karsten M}, booktitle={Proceedings of the 24th international conference on Machine learning}, pages={815--822}, year={2007}, organization={ACM} } @inproceedings{mossel2015consistency, title={Consistency thresholds for the planted bisection model}, author={Mossel, Elchanan and Neeman, Joe and Sly, Allan}, booktitle={Proceedings of the forty-seventh annual ACM symposium on Theory of computing}, pages={69--75}, year={2015}, organization={ACM} } @incollection{lovasz2003semidefinite, title={Semidefinite programs and combinatorial optimization}, author={Lov{\'a}sz, L{\'a}szl{\'o}}, booktitle={Recent advances in algorithms and combinatorics}, pages={137--194}, year={2003}, publisher={Springer} } @article{khot2011grothendieck, title={Grothendieck-type inequalities in combinatorial optimization}, author={Khot, Subhash and Naor, Assaf}, journal={arXiv preprint arXiv:1108.2464}, year={2011} } @article{park2016provable, title={Provable Burer-Monteiro factorization for a class of norm-constrained matrix problems}, author={Park, Dohyung and Kyrillidis, Anastasios and Bhojanapalli, Srinadh and Caramanis, Constantine and Sanghavi, Sujay}, journal={arXiv preprint arXiv:1606.01316}, year={2016} } @article{mixon2016clustering, title={Clustering subgaussian mixtures by semidefinite programming}, author={Mixon, Dustin G and Villar, Soledad and Ward, Rachel}, journal={arXiv preprint arXiv:1602.06612}, year={2016} +} + +@article{bolte2018nonconvex, + title={Nonconvex Lagrangian-based optimization: monitoring schemes and global convergence}, + author={Bolte, J{\'e}r{\^o}me and Sabach, Shoham and Teboulle, Marc}, + journal={Mathematics of Operations Research}, + year={2018}, + publisher={INFORMS} } \ No newline at end of file diff --git a/tex/Paper/figures/Slater.pptx b/tex/Paper/figures/Slater.pptx new file mode 100644 index 0000000..6cb6377 Binary files /dev/null and b/tex/Paper/figures/Slater.pptx differ diff --git a/tex/Paper/main.tex b/tex/Paper/main.tex index b701db9..6c568ff 100644 --- a/tex/Paper/main.tex +++ b/tex/Paper/main.tex @@ -1,130 +1,131 @@ %%%%%%%%%%%%%%%%%%%%%%% file template.tex %%%%%%%%%%%%%%%%%%%%%%%%% % % This is a general template file for the LaTeX package SVJour3 % for Springer journals. Springer Heidelberg 2010/09/16 % % Copy it to a new file with a new name and use it as the basis % for your article. Delete % signs as needed. % % This template includes a few options for different layouts and % content for various journals. Please consult a previous issue of % your journal as needed. 
% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % % First comes an example EPS file -- just ignore it and % proceed on the \documentclass line % your LaTeX will extract the file if required -\begin{filecontents*}{example.eps} -%!PS-Adobe-3.0 EPSF-3.0 -%%BoundingBox: 19 19 221 221 -%%CreationDate: Mon Sep 29 1997 -%%Creator: programmed by hand (JK) -%%EndComments -gsave -newpath - 20 20 moveto - 20 220 lineto - 220 220 lineto - 220 20 lineto -closepath -2 setlinewidth -gsave - .4 setgray fill -grestore -stroke -grestore -\end{filecontents*} +%\begin{filecontents*}{example.eps} +%%!PS-Adobe-3.0 EPSF-3.0 +%%%BoundingBox: 19 19 221 221 +%%%CreationDate: Mon Sep 29 1997 +%%%Creator: programmed by hand (JK) +%%%EndComments +%gsave +%newpath +% 20 20 moveto +% 20 220 lineto +% 220 220 lineto +% 220 20 lineto +%closepath +%2 setlinewidth +%gsave +% .4 setgray fill +%grestore +%stroke +%grestore +%\end{filecontents*} % \RequirePackage{fix-cm} % %\documentclass{svjour3} % onecolumn (standard format) %\documentclass[smallcondensed]{svjour3} % onecolumn (ditto) \documentclass[smallextended]{svjour3} % onecolumn (second format) %\documentclass[twocolumn]{svjour3} % twocolumn % \smartqed % flush right qed marks, e.g. at end of proof % \usepackage{graphicx} % % \usepackage{mathptmx} % use Times fonts if available on your TeX system % % insert here the call for the packages your document requires %\usepackage{latexsym} % etc. % % please place your own definitions here and don't use \def but % \newcommand{}{} % % Insert the name of "your journal" with \journalname{Math Programming Comp.} %c \usepackage{amsmath} \usepackage{amssymb} \usepackage{graphicx} \usepackage{algorithm} \usepackage{color} \usepackage{cite} \usepackage{hyperref} \input{preamble.tex} \begin{document} \title{Linearized augmented Lagrangian framework for solving non-convex problems\thanks{This project has received funding from...} } %\subtitle{Do you have a subtitle?\\ If so, write it here} %\titlerunning{Short form of title} % if too long for running head \author{First author \and 2nd author } \authorrunning{author et. al.} % if too long for running head %\institute{F. Author \at % first address \\ % Tel.: +123-45-678910\\ % Fax: +123-45-678910\\ % \email{fauthor@example.com} % \\ % \emph{Present address:} of F. Author % if needed % \and % S. Author \at % second address % } \date{Received: / Accepted: } % The correct dates will be entered by the editor \maketitle \input{sections/abstract.tex} \input{sections/introduction.tex} \input{sections/preliminaries.tex} +\input{sections/slater.tex} \input{sections/LinAL.tex} \input{sections/ADMM.tex} \input{sections/related_works.tex} \input{sections/LinAL_theory.tex} %\input{sections/adaptive_theory.tex} \input{sections/ADMM_theory.tex} \appendix \input{sections/appendix.tex} %\begin{acknowledgements} %If you'd like to thank anyone, place your comments here %and remove the percent signs. 
%\end{acknowledgements} % BibTeX users please use one of %\bibliographystyle{spbasic} % basic style, author-year citations %\bibliographystyle{spmpsci} % mathematics and physical sciences %\bibliographystyle{spphys} % APS-like style for physics %\bibliography{bibliography.bib} % name your BibTeX data base \end{document} % end of file template.tex diff --git a/tex/Paper/preamble.tex b/tex/Paper/preamble.tex index c48a7a1..2fe5541 100644 --- a/tex/Paper/preamble.tex +++ b/tex/Paper/preamble.tex @@ -1,147 +1,150 @@ \PassOptionsToPackage{normalem}{ulem} \newcommand{\minimize}[2]{\ensuremath{\underset{\substack{{#1}}}% {\mathrm{minimize}}\;\;#2 }} \newcommand{\Argmind}[2]{\ensuremath{\underset{\substack{{#1}}}% {\mathrm{Argmin}}\;\;#2 }} \newcommand{\Frac}[2]{\displaystyle{\frac{#1}{#2}}} \newcommand{\menge}[2]{\big\{{#1} \;|\; {#2}\big\}} \newcommand{\Pair}[2]{{\big\langle{{#1},{#2}}\big\rangle}} \newcommand{\pair}[2]{{\langle{{#1},{#2}}\rangle}} \newcommand{\Scal}[2]{{\bigg\langle{{#1}\:\bigg |~{#2}}\bigg\rangle}} \newcommand{\Menge}[2]{\bigg\{{#1}~\bigg|~{#2}\bigg\}} \newcommand{\lev}[1]{\ensuremath{\mathrm{lev}_{\leq #1}\:}} \newcommand{\enve}[1]{\ensuremath{\operatorname{env}#1}} \newcommand{\emp}{\ensuremath{{\varnothing}}} \newcommand{\bdry}{\ensuremath{\operatorname{bdry}}} \newcommand{\yosi}[2]{\ensuremath{\sideset{^{#2}}{}{\operatorname{}}\!\!#1}} \newcommand{\infconv}{\ensuremath{\mbox{\small$\,\square\,$}}} \newcommand{\scal}[2]{\left\langle{#1}\mid {#2} \right\rangle} \newcommand{\pscal}[2]{\langle\langle{#1}\mid{#2}\rangle\rangle} \newcommand{\norm}[1]{\|#1\|} \newcommand{\vuo}{\ensuremath{\mbox{\footnotesize$\square$}}} \newcommand{\exi}{\ensuremath{\exists\,}} \newcommand{\zeroun}{\ensuremath{\left]0,1\right[}} \newcommand{\HH}{\ensuremath{\mathcal H}} \newcommand{\GG}{\ensuremath{\mathcal G}} \newcommand{\YY}{\ensuremath{\mathcal Y}} \newcommand{\XX}{\ensuremath{\mathcal X}} \newcommand{\bary}{\ensuremath{\widetilde{\boldsymbol{y}}}} \newcommand{\barp}{\ensuremath{\widetilde{\boldsymbol{p}}}} \newcommand{\barq}{\ensuremath{\widetilde{\boldsymbol{q}}}} \newcommand{\barx}{\ensuremath{\widetilde{\boldsymbol{x}}}} \newcommand{\BL}{\ensuremath{\EuScript B}\,} \newcommand{\BP}{\ensuremath{\EuScript P}} \newcommand{\HHH}{\ensuremath{\boldsymbol{\mathcal H}}} \newcommand{\WW}{\ensuremath{\boldsymbol{\mathcal W}}} \newcommand{\KKK}{\ensuremath{\boldsymbol{\mathcal K}}} \newcommand{\GGG}{\ensuremath{\boldsymbol{\mathcal G}}} \newcommand{\VV}{\ensuremath{\boldsymbol{V}}} \newcommand{\CCC}{\ensuremath{\boldsymbol{C}}} \newcommand{\MM}{\ensuremath{\boldsymbol{M}}} \newcommand{\SG}{\ensuremath{\boldsymbol{S}}} \newcommand{\EE}{\ensuremath{\boldsymbol{E}}} \newcommand{\XP}{\ensuremath{\mathcal X}^*} \newcommand{\sri}{\ensuremath{\operatorname{sri}}} \newcommand{\ri}{\ensuremath{\operatorname{ri}\,}} \newcommand{\RR}{\ensuremath{\mathbb R}} \newcommand{\KK}{\ensuremath{\mathcal K}} \newcommand{\RP}{\ensuremath{\left[0,+\infty\right[}} \newcommand{\RPP}{\ensuremath{\,\left]0,+\infty\right[}} \newcommand{\RX}{\ensuremath{\,\left]-\infty,+\infty\right]}} \newcommand{\NN}{\ensuremath{\mathbb N}} \newcommand{\SL}{\ensuremath{\EuScript S}\,} \newcommand{\dom}{\ensuremath{\operatorname{dom}}} \newcommand{\cont}{\ensuremath{\operatorname{cont}}} %\newcommand{\gr}{\ensuremath{\operatorname{gra}}} \newcommand{\prox}{\ensuremath{\operatorname{prox}}} \newcommand{\Prox}{\ensuremath{\operatorname{Prox}}} \newcommand{\intdom}{\ensuremath{\operatorname{int}\operatorname{dom}}\,} 
\newcommand{\inte}{\ensuremath{\operatorname{int}}} \newcommand{\cart}{\ensuremath{\mbox{\huge{$\times$}}}} \newcommand{\scart}{\ensuremath{\mbox{\LARGE{$\times$}}}} \newcommand{\WC}{\ensuremath{{\mathfrak W}}} \newcommand{\TT}{\ensuremath{{\mathbf T}}} \newcommand{\SC}{\ensuremath{{\mathfrak S}}} \newcommand{\rh}{\ensuremath{{\mathrm a}}} \newcommand{\og}{\ensuremath{{\mathrm b}}} \newcommand{\rk}{\ensuremath{{\mathsf {\mathbf a}}}} \newcommand{\ck}{\ensuremath{{\mathsf {\mathbf u}}}} \newcommand{\xk}{\ensuremath{{\mathsf{\mathbf x}}}} \newcommand{\yk}{\ensuremath{{\mathsf{\mathbf y}}}} \newcommand{\RPX}{\ensuremath{{[0,\pinf]}}} \newcommand{\card}{\ensuremath{\operatorname{card}}} \newcommand{\bd}{\ensuremath{\operatorname{bdry}}} \newcommand{\Argmin}{\ensuremath{\operatorname{Argmin}}} \newcommand{\argmin}{\ensuremath{\operatorname{argmin}}} \newcommand{\ran}{\ensuremath{\operatorname{ran}}} \newcommand{\zer}{\ensuremath{\operatorname{zer}}} \newcommand{\gra}{\ensuremath{\operatorname{gra}}} \newcommand{\conv}{\ensuremath{\operatorname{conv}}} \newcommand{\vv}{\ensuremath{\boldsymbol{v}}} \newcommand{\sss}{\ensuremath{\boldsymbol{s}}} \newcommand{\xx}{\ensuremath{\boldsymbol{x}}} \newcommand{\xs}{\ensuremath{\textsf{x}}} \newcommand{\xo}{\ensuremath{\overline{\boldsymbol{x}}}} \newcommand{\pp}{\ensuremath{\boldsymbol{p}}} \newcommand{\qq}{\ensuremath{\boldsymbol{q}}} \newcommand{\yy}{\ensuremath{\boldsymbol{y}}} \newcommand{\ff}{\ensuremath{\boldsymbol{f}}} \newcommand{\hh}{\ensuremath{\boldsymbol{h}}} \newcommand{\ttt}{\ensuremath{\boldsymbol{t}}} \newcommand{\ee}{\ensuremath{\boldsymbol{e}}} \newcommand{\rr}{\ensuremath{\boldsymbol{r}}} \newcommand{\gggg}{\ensuremath{\boldsymbol{g}}} \newcommand{\zz}{\ensuremath{\boldsymbol{z}}} \newcommand{\bb}{\ensuremath{\boldsymbol{b}}} \newcommand{\uu}{\ensuremath{\boldsymbol{u}}} \newcommand{\cc}{\ensuremath{\boldsymbol{c}}} \newcommand{\dd}{\ensuremath{\boldsymbol{d}}} \newcommand{\aaa}{\ensuremath{\boldsymbol{a}}} \newcommand{\ww}{\ensuremath{\boldsymbol{w}}} \newcommand{\BB}{\ensuremath{\boldsymbol{B}}} \newcommand{\LL}{\ensuremath{\boldsymbol{L}}} \newcommand{\PPP}{\ensuremath{\boldsymbol{\mathsf{P}}}} \newcommand{\UU}{\ensuremath{\boldsymbol{U}}} \newcommand{\EEE}{\ensuremath{\boldsymbol{E}}} \newcommand{\E}{\ensuremath{\mathbf{E}}} \newcommand{\D}{\ensuremath{\mathbf{D}}} \newcommand{\ep}{\ensuremath{\boldsymbol{\varepsilon}}} \newcommand{\RRR}{\ensuremath{\boldsymbol{R}}} %\newcommand{\RRR}{\ensuremath{\boldsymbol{R}}} \newcommand{\AAA}{\ensuremath{\boldsymbol{A}}} \newcommand{\BBB}{\ensuremath{\boldsymbol{B}}} \newcommand{\QQ}{\ensuremath{\boldsymbol{Q}}} \newcommand{\SSS}{\ensuremath{\boldsymbol{S}}} \newcommand{\DD}{\ensuremath{\boldsymbol{D}}} \newcommand{\PP}{\ensuremath{\boldsymbol{P}}} \newcommand{\FF}{\ensuremath{\boldsymbol{\mathcal{F}}}} \newcommand{\cone}{\ensuremath{\operatorname{cone}}} \newcommand{\Fix}{\ensuremath{\operatorname{Fix}}} \newcommand{\Id}{\ensuremath{\operatorname{Id}}} \newcommand{\diam}{\ensuremath{\operatorname{diam}}} \newcommand{\IId}{\ensuremath{\boldsymbol{\operatorname{Id}}}} \newcommand{\weakly}{\ensuremath{\rightharpoonup}} \newcommand{\minf}{\ensuremath{-\infty}} \newcommand{\pinf}{\ensuremath{+\infty}} \newcommand{\LLambda}{\ensuremath{\boldsymbol{\Lambda}}} \newcommand{\vva}{\ensuremath{\boldsymbol{\epsilon}}} \newcommand{\trace}{\operatorname{tr}} % AE's commands \newcommand{\edita}[1]{{\color{blue} #1}} -\newcommand{\notea}[1]{{\color{magenta} \textbf{Note:AE: #1}}} 
+\newcommand{\notea}[1]{{\color{magenta} \textbf{AE: #1}}} \newcommand{\ol}{\overline} \newcommand{\Null}{\operatorname{null}} \newcommand{\relint}{\operatorname{relint}} \newcommand{\row}{\operatorname{row}} \newcommand{\s}{\sigma} \newcommand{\editaa}[1]{{\color{red} #1}} \renewcommand{\b}{\beta} \renewcommand{\L}{\mathcal{L}} % Lagrangian \renewcommand{\i}{\iota} \newcommand{\g}{\gamma} \renewcommand{\cone}{\operatorname{cone}} +\newcommand{\dist}{\operatorname{dist}} +\renewcommand{\D}{\operatorname{D}} +\newcommand{\X}{X} % MFS's commands \newcommand{\editf}[1]{{\color[rgb]{1,0.4,0.4} #1}} \DeclareMathOperator*{\argmax}{argmax} % thin space, limits underneath in displays diff --git a/tex/Paper/sections/ADMM.tex b/tex/Paper/sections/ADMM.tex index ac55ec3..ea61520 100644 --- a/tex/Paper/sections/ADMM.tex +++ b/tex/Paper/sections/ADMM.tex @@ -1,161 +1,161 @@ \section{Linearized ADMM} Consider the program \begin{align} \begin{cases} \underset{x,z}{\min} \,\, f(x)+g(x)+h(z)+l(z)\\ A(x)+B(z) = 0, \end{cases} \label{eq:admm 2} \end{align} where $f,h:\RR^d\rightarrow\RR$ and $A,B:\RR^d\rightarrow\RR^m$ are smooth in the sense that \begin{align*} \| \nabla f(x) - \nabla f(x') \| \le \lambda_f \|x-x'\|, \,\, \| DA(x) - DA(x') \| \le \lambda_A \|x-x'\|, \end{align*} \begin{align} \| \nabla h(z) - \nabla h(z') \| \le \lambda_h \|z-z'\|, \,\, \| DB(z) - DB(z') \| \le \lambda_B \|z-z'\|, \label{eq:smoothness basics admm} \end{align} for every $x,x',z,z'\in \RR^d$. Above, $g,l:\RR^d\rightarrow\RR$ are convex functions. For example, when $C,D\subset \RR^d$ and $g=1_C, l=1_D$ are the indicator functions on $C,D$, respectively, Program~\eqref{eq:admm 2} becomes \begin{align} \begin{cases} \underset{x,z}{\min} \,\, f(x)+h(z)\\ A(x)+B(z) = 0\\ -x\in C\\ +x\in C,\qquad z\in D. \end{cases} \label{eq:admm indicator} \end{align} The augmented Lagrangian corresponding to Program~\eqref{eq:admm 2} is \begin{align} \L_{\b}(x,z,y) & = f(x)+g(x)+h(z)+l(z) \nonumber\\ & \qquad + \langle A(x)+B(z),y \rangle + \frac{\b}{2}\| A(x)+B(z) \|^2, \label{eq:lagrangian admm} \end{align} and Program \eqref{eq:admm 2} is therefore equivalent to the minimax program \begin{align} \underset{x,z}{\min} \underset{y}{\max}\,\, \L_\b(x,z,y). \label{eq:admm main} \end{align} To solve the equivalent formulation in Program \eqref{eq:admm main}, we propose the linearized ADMM, detailed in Algorithm~\ref{Algo:3}. To parse this algorithm, we need to slightly change the gradient map in Definition~\ref{def:grad map} and the line search procedure in Lemma~\ref{lem:eval Lipsc cte} to match $\L_\b$ in \eqref{eq:lagrangian admm}. More specifically, the corresponding gradient maps are defined as \begin{equation} G_{\b,\gamma}(x,z,y) = \frac{x-x^+}{\gamma}, \qquad H_{\b,\iota}(x,z,y) = \frac{z-z^+}{\iota}, \label{eq:grad map admm} \end{equation} where \begin{align} x^+=P_{g}(x-\gamma \nabla_x \mathcal{L}_ {\beta}(x,z,y)), \qquad z^+=P_{l}(z-\iota \nabla_z \mathcal{L}_ {\beta}(x,z,y)), \end{align} and $\g,\iota$ are the primal step sizes.
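For instance, when $g=1_C$ and $l=1_D$ as in the program above, the proximal operators $P_g$ and $P_l$ reduce to the orthogonal projections onto $C$ and $D$, which we denote by $P_C$ and $P_D$, and the updates above become the projected gradient steps \begin{align*} x^+=P_{C}(x-\gamma \nabla_x \mathcal{L}_{\beta}(x,z,y)), \qquad z^+=P_{D}(z-\iota \nabla_z \mathcal{L}_{\beta}(x,z,y)). \end{align*}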
The line search procedure is similar to Lemma~\ref{lem:eval Lipsc cte} and we set \begin{align*} x^+_{\gamma'} = P_g(x - \gamma' \nabla_x \mathcal{L}_\beta(x,z,y)), \end{align*} \begin{align} z^+_{\iota'} = P_l(z - \iota' \nabla_z \mathcal{L}_\beta(x,z,y)), \end{align} \begin{align} \gamma & := \max \Big\{ \gamma' ={\gamma}_0 \theta^i : \mathcal{L}_\beta (x^+_{\gamma'},z,y) \nonumber\\ & \qquad \qquad \le \mathcal{L}_\beta (x,z,y) + \left\langle x^+_{\gamma'} - x , \nabla_x \mathcal{L}_{\beta}(x,z,y) \right\rangle + \frac{1}{2\gamma'} \| x^+_{\gamma'} - x\|^2 \Big\}, \label{eq:line search x admm} \end{align} \begin{align} \iota & := \max \Big\{ \iota' ={\iota}_0 \theta^i : \mathcal{L}_\beta (x,z^+_{\iota'},y) \nonumber\\ & \qquad \qquad \le \mathcal{L}_\beta (x,z,y) + \left\langle z^+_{\iota'} - z , \nabla_z \mathcal{L}_{\beta}(x,z,y) \right\rangle + \frac{1}{2\iota'} \| z^+_{\iota'} - z\|^2 \Big\}. -\label{eq:defn of gamma line search} +\label{eq:defn of gamma line search admm} \end{align} \begin{algorithm} \label{Algo:3} Input: Parameters $\beta_1,\s_1,\rho,\rho',\rho'',\tau> 0$, initialization $x_{1},z_1\in \RR^d$ with $\|A(x_1)+B(z_1)\|\le \rho$ and $\|x_1\|,\|z_1\|\le \rho'$, dual initialization $y_0\in \RR^m$. \\ For $k=1,2,\ldots$, execute\\ \begin{enumerate} \item \textbf{(Update penalty weight)} Set \begin{align} \beta_k = \frac{\beta_{1}}{\log 2}\sqrt{k \log^2(k+1)} . \end{align} \item \textbf{(Line search in $x$)} Use the line search procedure in \eqref{eq:line search x admm} by replacing $x=x_k,z=z_k,y=y_k,\b=\b_k$ and let $\g_k = \g$ be the output.\\ \item \textbf{(Descent step in $x$)} Set $ x_{k+1} = P_{g }(x_{k} - \gamma_{k} \nabla_x \mathcal{L}_{\beta_{k}}(x_{k},z_k,y_{k}))$, where $\L_{\b_k}$ is the augmented Lagrangian and $P_g$ denotes the proximal operator, defined in (\ref{eq:lagrangian admm},\ref{eq:defn of prox}), respectively.\\ \item \textbf{(Line search in $z$)} Use the line search procedure in \eqref{eq:defn of gamma line search admm} by replacing $x=x_{k+1},z=z_k,y=y_k,\b=\b_k$ and let $\iota_k = \iota$ be the output.\\ \item \textbf{(Descent step in $z$)} Set $ z_{k+1} = P_{l }(z_{k} - \iota_{k} \nabla_z \mathcal{L}_{\beta_{k}}(x_{k+1},z_k,y_{k}))$, where $P_l$ denotes the proximal operator for $l$.\\ \item \textbf{(Control)} Properly modify $x_{k+1},z_{k+1}$ to ensure that $\|x_{k+1}\|,\|z_{k+1}\|\le \rho'$ or $\|x_{k+1}-x_k\|\le \rho''$.\\ \item \textbf{(Stopping criterion)} If $\g_k \|G_{\b_k,\g_k}(x_k,z_k,y_k)\|^2+ \iota_k \|H_{\b_k,\iota_k}(x_{k+1},z_k,y_k)\|^2 + \s_k \|A(x_k)+B(z_k)\|^2 \le \tau$, then quit and return $x_{k+1},z_{k+1}$ as approximate minimizers of Program \eqref{eq:admm 2}. See \eqref{eq:grad map admm} for the definition of the gradient mappings. \\ \item \textbf{(Update dual step size)} Set \begin{align} \s_{k+1} = \s_{1} \min\left( {\frac{1}{\sqrt{k+1}}}, \frac{\|A(x_1)+B(z_1)\|}{\|A(x_{k+1})+B(z_{k+1})\|}\cdot \frac{ \log^2 2 }{(k+1)\log^2(k+2)} \right). \end{align}\\ \item \textbf{(Dual ascent step)} Set $y_{k+1} = y_{k} + \sigma_{k+1}(A(x_{k+1})+B(z_{k+1}))$. \end{enumerate} \caption{Linearized ADMM for solving Program \eqref{eq:admm main}} \end{algorithm} \begin{theorem}[Linearized ADMM] \label{thm:main admm} For a sufficiently large $k_0$, consider an integer interval $K=[k_0:k_1]$, and suppose that \begin{align} \inf_{k\ge k_0} f(x_k) + g(x_k)+ h(z_k) + l(z_k) + \langle A(x_k)+B(z_k) ,y_{k_0}\rangle >- \infty, \end{align} and that $\rho$ is large enough such that (\ref{eq:suff cnd on rho final}) holds.
For $\rho'>0$, in addition to the strong smoothness of $f,h,A,B$ quantified in (\ref{eq:smoothness basics admm}), let us define \begin{align} \lambda'_f = \max_{\|x\|\le \rho'} \|\nabla f(x)\|, \qquad \lambda'_A = \max_{\|x\| \le \rho'} \|DA(x)\|, \qquad \lambda'_B = \max_{\|z\|\le \rho'} \|DB(z)\|, \end{align} to be the (restricted) Lipschitz constants of $f,A,B$, respectively. Moreover, suppose that the nonconvex Slater's condition of Lemma \ref{lem:bnd bnd Ak} is in force, and \begin{align} \nu_A \ge 2\lambda_A \rho''. \end{align} Then the output of linearized ADMM in Algorithm~\ref{Algo:3} satisfies \textbf{we need a better expression below to better show the dependence on various parameters.} \begin{align} & \min_{k\in K}\,\, \min(\overline{\g},\overline{\iota})( \|G_k\|^2 + \|H_k\|^2) \sqrt{\frac{k_0 \log^2(k_0+1)}{k_1\log^2(k_1+1)}} \nonumber\\ & \qquad + \|A(x_k)+B(z_k)\|^2 \lesssim \frac{1}{k_1-k_0}, \end{align} where $\overline{\gamma},\overline{\iota}$ are independent of $k_1$ and $\lesssim$ suppresses the dependence on all parameters except $k_1$. The exact expressions in terms of $\lambda_f,\lambda'_f,\lambda_A,\lambda'_A, \b_{k_0},\s_{k_0}, \nu_A,y_0$ are found in (\ref{eq:raw bnd on sum admm},\ref{eq:lower_band_iotas_admm}). \end{theorem} diff --git a/tex/Paper/sections/ADMM_theory.tex b/tex/Paper/sections/ADMM_theory.tex index 5203e5f..dd52d10 100644 --- a/tex/Paper/sections/ADMM_theory.tex +++ b/tex/Paper/sections/ADMM_theory.tex @@ -1,195 +1,195 @@ \section{Proof of Theorem \ref{thm:main admm} \label{sec:ADMM theory}} Let us repeat the technical lemmas and definitions in Section \ref{sec:preliminaries}, slightly adjusted for the augmented Lagrangian of Program~\eqref{eq:admm main}, see \eqref{eq:lagrangian admm}. These standard results are stated without proof. \begin{lemma}[Smoothness]\label{lem:smoothness admm} For $\rho,\rho'\ge 0$, it holds that \begin{align*} \| \nabla_x \mathcal{L}_{\beta}(x, z,y)- \nabla_x \mathcal{L}_{\beta}(x', z, y) \| \le \lambda_{\b,z} \|x-x'\|, \end{align*} \begin{align} \| \nabla_z \L_\b(x,z,y) - \nabla_z \L_\b(x,z',y) \| \le \lambda_{\b,x} \|z-z'\|, \end{align} for every $x,x',z,z' \in \{ x'': \|A(x'')+B(z)\| \le \rho, \|x''\|\le \rho'\}$ and $y\in\RR^m$, where \begin{align*} \lambda_{\beta,z} & \le \lambda_f + \sqrt{m}\lambda_A \left(\|y\| + \b\rho \right) + \b d \lambda'^2_A. \end{align*} \begin{align} \lambda_{\beta,x} & \le \lambda_h + \sqrt{m}\lambda_B \left(\|y\| + \b\rho \right) + \b d \lambda'^2_B. \label{eq:smoothness of Lagrangian admm} \end{align} Above, $\lambda_f,\lambda_A,\lambda_h,\lambda_B$ were defined in (\ref{eq:smoothness basics admm}) and \begin{align} \lambda'_A := \max_{\|x\|\le \rho'}\|DA(x)\|, \qquad \lambda'_B := \max_{\|z\|\le \rho'}\|DB(z)\|. \end{align} \end{lemma} \begin{definition}\label{def:grad map admm}\textbf{{(Gradient Mapping)}} Given $x,z\in \RR^d$ and $\gamma,\iota >0$, the gradient mappings $G_{\beta,\gamma}(\cdot,z, y), H_{\b,\iota}(x,\cdot,y): \RR^d\rightarrow\RR^d$ take, respectively, $x,z\in \RR^d$ to \begin{equation} G_{\b,\gamma}(x,z,y) = \frac{x-x^+}{\gamma}, \qquad H_{\b,\iota}(x,z,y) = \frac{z-z^+}{\iota}, \label{eq:grad map admm} \end{equation} where $x^+=P_{g}(x-\gamma \nabla_x \mathcal{L}_ {\beta}(x,z,y))$ and $z^+=P_{l}(z-\iota \nabla_z \mathcal{L}_ {\beta}(x,z,y))$. \end{definition} \begin{lemma}[Descent lemma]\label{lem:11 admm} For $x,z\in \RR^d$ and $y\in \RR^m$, let $x^+,z^+$ be as in Definition \ref{def:grad map admm} with $\g<1/\lambda_{\b,z}$ and $\iota<1/\lambda_{\b,x}$.
For $\rho,\rho'\ge 0$, suppose that \begin{align} x,x^+,z,z^+ \in \{ x': \|A(x')\| \le \rho, \|x'\| \le \rho' \}. \end{align} Then it holds that \begin{equation*} \label{e:deslem} \| G_{\beta,\gamma}(x,z,y)\|^{2}\leq \frac{2}{\gamma} (\mathcal{L}_\beta(x,z,y) + g (x)- \mathcal{L}_\beta(x^+,z,y)- g(x^+) ), \end{equation*} \begin{align} \| H_{\beta,\iota}(x,z,y)\|^{2}\leq \frac{2}{\iota} (\mathcal{L}_\beta(x,z,y) + l (z)- \mathcal{L}_\beta(x,z^+,y)- l(z^+) ). \label{eq:deslem admm} \end{align} \end{lemma} \begin{lemma}[Line search] \label{lem:eval Lipsc cte admm} Fix $\theta \in (0,1)$ and ${\gamma}_0,\iota_0>0$. For $\gamma',\iota'>0$, let \begin{align} x^+_{\gamma'} = P_g(x - \gamma' \nabla_x \mathcal{L}_\beta(x,z,y)), \qquad z^+_{\iota'} = P_l(z - \iota' \nabla_z \mathcal{L}_\beta(x,z,y)), \end{align} and define \begin{align*} \gamma & := \max \Big\{ \gamma' ={\gamma}_0 \theta^i : \mathcal{L}_\beta (x^+_{\gamma'},z,y) \nonumber\\ & \qquad \qquad \quad \le \mathcal{L}_\beta (x,z,y) + \left\langle x^+_{\gamma'} - x , \nabla_x \mathcal{L}_{\beta}(x,z,y) \right\rangle + \frac{1}{2\gamma'} \| x^+_{\gamma'} - x\|^2 \Big\}, \end{align*} \begin{align} \iota & := \max \Big\{ \iota' ={\iota}_0 \theta^i : \mathcal{L}_\beta (x,z^+_{\iota'},y) \nonumber\\ & \qquad \qquad \quad \le \mathcal{L}_\beta (x,z,y) + \left\langle z^+_{\iota'} - z , \nabla_z \mathcal{L}_{\beta}(x,z,y) \right\rangle + \frac{1}{2\iota'} \| z^+_{\iota'} - z\|^2 \Big\}. -\label{eq:defn of gamma line search} +\label{eq:defn of gamma line search admm} \end{align} Then, (\ref{eq:deslem admm}) holds and, moreover, we have that \begin{align} \gamma \ge \frac{\theta}{\lambda_{\beta,z}}, \qquad \iota \ge \frac{\theta}{\lambda_{\beta,x}}. - \label{eq:moreover} + \label{eq:moreover admm} \end{align} \end{lemma} For the reader's convenience, let us also recall the updates of the nonconvex ADMM algorithm in iteration $k$ as \begin{align} x_{k+1}& =P_g(x_k -\g_k \nabla_x \L_{\b_k} (x_k,z_k,y_k)),\nonumber\\ z_{k+1}& =P_l(z_k - \i_k \nabla_z \L_{\b_k} (x_{k+1},z_k,y_k)),\nonumber\\ y_{k+1}& = y_k+\s_{k+1} (A(x_{k+1})+B(z_{k+1})). \label{eq:ADMM updates recall} \end{align} For every $k\in K$, recall that the primal step sizes $\g_k,\i_k$ are determined by line search in Lemma \ref{lem:eval Lipsc cte admm} and the penalty weights and dual step sizes are set as \begin{align} \b_k & = \b_{k_0} \sqrt{\frac{k\log^2(k+1)}{k_0 \log^2(k_0+1)}}, \nonumber\\ \s_k & = \s_{k_0}\min\left( \sqrt{\frac{k_0}{k}} , \frac{\|A(x_{k_0})+B(z_{k_0})\|}{\|A(x_k)+B(z_k)\|} \cdot \frac{k_0 \log^2 (k_0+1)}{k \log^2 (k+1)} \right). \end{align} For every $k\in K$, let us set \begin{align*} G_k = G_{\b_k,\g_k}(x_k,z_k,y_k) = \frac{x_k-x_{k+1}}{\g_k}, \end{align*} \begin{align} H_k = H_{\b_k,\iota_k}(x_{k+1},z_k,y_k)= \frac{z_k-z_{k+1}}{\i_k}, \end{align} for short. The convergence analysis only slightly differs from the proof of Theorem \ref{thm:main} and we therefore focus only on the differences. Similar to the proof of Theorem~\ref{thm:main}, two applications of Lemma~\ref{lem:11 admm} yield that \begin{align} \frac{\g_k \|G_k\|^2}{2} \le \L_{\b_k}(x_k,z_k,y_k)- \L_{\b_k}(x_{k+1},z_k,y_k), \nonumber\\ \frac{\i_k \|H_k\|^2}{2} \le \L_{\b_k}(x_{k+1},z_k,y_k)- \L_{\b_k}(x_{k+1},z_{k+1},y_k), \label{eq:decrease twice} \end{align} for every $k$.
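In particular, combining \eqref{eq:decrease twice} with the step size lower bounds in \eqref{eq:moreover admm}, the $x$- and $z$-updates at iteration $k$ decrease the augmented Lagrangian by at least $\tfrac{\theta}{2\lambda_{\b_k,z}}\|G_k\|^2$ and $\tfrac{\theta}{2\lambda_{\b_k,x}}\|H_k\|^2$, respectively.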
By setting \begin{align*} u_k = [ \begin{array}{cc} x_k^\top & z_k^\top \end{array} ]^\top \in \RR^{2d}, \qquad Q_k = [ \begin{array}{cc} G_k^\top & H_k^\top \end{array} ]^\top \in \RR^{2d}, \end{align*} \begin{align} q(u) = f(x)+g(x)+h(z)+l(z), \qquad \kappa_k = \min(\g_k,\iota_k), \label{eq:defn of w} \end{align} for every $k\in K$ and after summing up the two inequalities in \eqref{eq:decrease twice}, we reach \begin{align} \frac{\kappa_k \|Q_k\|^2}{2} & \le \L_{\b_k}(x_k,z_k,y_k) - \L_{\b_k}(x_{k+1},z_{k+1},y_k), \qquad \forall k\in K. \end{align} By following the same steps as in the proof of Theorem \ref{thm:main}, we find that \begin{align} \sum_{k=k_0}^{k_1} \frac{\kappa_k \|Q_k\|^2}{2} & \le \mu+ 2\sum_{k=k_0}^{k_1} \| A(x_k) + B(z_k) \|^2 , \end{align} where \begin{align} \mu & := \sup_{k\ge k_0} \Big( q(u_{k_0}) - q(u_{k})+ \langle A(x_{k_0})+B(z_{k_0})-A(x_k)-B(z_k) ,y_{k_0}\rangle \nonumber\\ & \qquad \qquad + \frac{\beta_{k_0}}{2} \|A(x_{k_0})+B(z_{k_0})\|^2 \Big) < \infty. \end{align} On the other hand, the $x$ update in \eqref{eq:ADMM updates recall} implies that \begin{align} & G_k - \nabla f(x_k) - DA(x_k)^\top y_k \nonumber\\ & \qquad - \b_k DA(x_k)^\top (A(x_k) + B(z_k) ) \in \partial g(x_{k+1}). \end{align} For $\rho,\rho',\rho''>0$, suppose that \begin{align} \max_{k\in K} \|A(x_k)+B(z_k) \| \le \rho, \quad \max_{k\in K} \|x_k\| \le \rho', \quad \max_{k\in K} \|x_{k+1}-x_k\| \le \rho'', \label{eq:feas to be checked later} \end{align} \begin{align} \nu_A \ge 2\lambda_A \rho'', \end{align} where $\nu_A$ was defined in \eqref{eq:new slater defn}. Then, following the same line of argument, a similar version of Lemma~\ref{lem:bnd bnd Ak} holds and \begin{align} \|A(x_k)+B(z_k) \| \le \frac{2}{\nu_A\b_k} (\|G_k\|+ \lambda'_h+ \lambda'_A\|y_{k-1}\|). \end{align} If we assume that (\ref{eq:to be used later on},\ref{eq:suff cnd on rho}) hold, then the first bound in \eqref{eq:feas to be checked later} is in force and \begin{align} & \sum_{k=k_0}^{k_1} \kappa_k \|Q_k\|^2 + \|A(x_k)+B(z_k)\|^2 \nonumber\\ & \le \frac{24c \lambda'^2_f k_0 \log^2(k_0+1)}{\nu_A^2 \b^2_{k_0}} + \frac{24c\lambda'^2_A y_{\max}^2 k_0 \log^2(k_0+1)}{\nu_A^2 \b_{k_0}^2} + 5\mu. \label{eq:raw bnd on sum admm} \end{align} To better interpret the above bound, we next estimate the primal step sizes $\{\g_k,\i_k\}$, which are determined by line search. Let $\lambda_{\b,u}$ denote the Lipschitz constant of $\L_{\b_k}(u,v)$ with respect to $u$. Similar to the proof of Theorem \ref{thm:main}, we find that \begin{align*} \gamma_k & \ge \frac{\theta }{ 2 \b_{k_0} \left( \sqrt{m}\lambda_A \rho + d \lambda'^2_A \right)}\sqrt{\frac{k_0 \log^2(k_0+1)}{k \log^2(k+1)}} \nonumber\\ & =: \overline{\g} \sqrt{\frac{k_0 \log^2(k_0+1)}{k \log^2(k+1)}}, \end{align*} \begin{align} \iota_k & \ge \frac{\theta }{ 2 \b_{k_0} \left( \sqrt{m}\lambda_B \rho + d \lambda'^2_B \right)}\sqrt{\frac{k_0 \log^2(k_0+1)}{k \log^2(k+1)}} \nonumber\\ & =: \overline{\iota} \sqrt{\frac{k_0 \log^2(k_0+1)}{k \log^2(k+1)}}, \label{eq:lower_band_iotas_admm} \end{align} for sufficiently large $k_0$. Substituting the above bounds in \eqref{eq:raw bnd on sum admm} and following the steps in the proof of Theorem \ref{thm:main} completes the proof of Theorem \ref{thm:main admm}. 
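To make the last step explicit, note that the right-hand side of \eqref{eq:raw bnd on sum admm} does not depend on $k_1$, so \begin{align*} \min_{k\in K} \left( \kappa_k \|Q_k\|^2 + \|A(x_k)+B(z_k)\|^2 \right) \le \frac{1}{k_1-k_0+1} \sum_{k=k_0}^{k_1} \left( \kappa_k \|Q_k\|^2 + \|A(x_k)+B(z_k)\|^2 \right), \end{align*} while the lower bounds on $\g_k,\i_k$ above imply that $\kappa_k \ge \min(\overline{\g},\overline{\iota}) \sqrt{k_0\log^2(k_0+1)/(k_1\log^2(k_1+1))}$ for every $k\in K$; together these yield the rate claimed in Theorem \ref{thm:main admm}.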
diff --git a/tex/Paper/sections/LinAL.tex b/tex/Paper/sections/LinAL.tex index b362c80..c8b7d2a 100644 --- a/tex/Paper/sections/LinAL.tex +++ b/tex/Paper/sections/LinAL.tex @@ -1,149 +1,200 @@ \section{Linearized AL Algorithm} To solve the equivalent formulation presented in Program \eqref{eq:minmax}, we first propose a linearized augmented Lagrangian algorithm, detailed in Algorithm~\ref{Algo:2}. \begin{algorithm} \label{Algo:2} Input: Parameters $\beta_1,\s_1,\rho,\rho',\rho'',\tau> 0$, initialization $x_{1}\in \RR^d$ with $\|A(x_1)\|\le \rho$ and $\|x_1\|\le \rho'$, dual initialization $y_0\in \RR^m$. \\ For $k=1,2,\ldots$ and until convergence, execute\\ \begin{enumerate} \item \textbf{(Update penalty weight)} Set \begin{align} \beta_k = \frac{\beta_{1}}{\log 2}\sqrt{k \log^2(k+1)} . \end{align} \item \textbf{(Line search)} Use the line search procedure in \eqref{eq:defn of gamma line search} with $x=x_k,y=y_k,\b=\b_k$ and let $\g_k = \g$ be the output.\\ \item \textbf{(Primal descent step)} Set $ x_{k+1} = P_{g }(x_{k} - \gamma_{k} \nabla \mathcal{L}_{\beta_{k}}(x_{k},y_{k}))$, where $\L_{\b_k}$ is the augmented Lagrangian and $P_g$ denotes the proximal operator, defined in (\ref{eq:Lagrangian},\ref{eq:defn of prox}), respectively.\\ \item \textbf{(Control)} Properly modify $x_{k+1}$ to ensure that $\|x_{k+1}\|\le \rho'$ or $\|x_{k+1}-x_k\|\le \rho''$.\\ \item \textbf{(Stopping criterion)} If $\g_k \|G_{\b_k,\g_k}(x_k,y_k)\|^2 + \s_k \|A(x_k)\|^2 \le \tau$, then quit and return $x_{k+1}$ as an approximate minimizer of Program \eqref{prob:01}. See \eqref{eq:grad map} for the definition of the gradient mapping. \\ \item \textbf{(Update dual step size)} Set \begin{align} \s_{k+1} = \s_{1} \min\left( {\frac{1}{\sqrt{k+1}}},\frac{\|A(x_1)\|}{\|A(x_{k+1})\|}\cdot \frac{ \log^2 2 }{(k+1)\log^2(k+2)} \right). \end{align}\\ \item \textbf{(Dual ascent step)} Set $y_{k+1} = y_{k} + \sigma_{k+1}A(x_{k+1})$. \end{enumerate} \caption{Linearized AL for solving Program \eqref{prob:01}} \end{algorithm} Slater's condition plays a key role in convex optimization and we require a similar condition for the success of our algorithm. \begin{definition}\label{defn:nonconvex slater} \textbf{(Nonconvex Slater's condition)} Consider an interval $K=[k_0:k_1]$ and a sequence $\{x_k\}_{k=k_0}^{k_1}\subset \RR^d$. Let us set \begin{align} \cone(x_{k+1}) := \text{cone}(\partial g(x_{k+1}) )^* \subset \RR^d, \qquad \forall k\in K, \end{align} for short, and let $S_K\subset \RR^d$ be a subspace such that \begin{align} S _{K} \supseteq \bigcup_{k\in K} P_{\cone(x_{k+1})}\left( DA(x_{k+1})^\top A(x_{k+1}) \right), \label{eq:defn of S defn} \end{align} and, to simplify the notation, let $S_{K}$ also denote an orthonormal basis for this subspace. For $\rho,\rho'>0$, we set \begin{align} \nu_A := \begin{cases} \underset{v,x}{\min} \, \frac{\left\| S_{K}^\top P_{\cone(x)} ( DA(x)^\top v) \right\|}{\|v\|} \\ \|v\|\le \rho\\ \|x\|\le \rho', \end{cases} -\label{eq:new slater defn} +%\label{eq:new slater defn} \end{align} and say that Program (\ref{prob:01}) satisfies the nonconvex Slater's condition if $\nu_A >0$. \end{definition} To gain some insight about Definition \ref{defn:nonconvex slater}, suppose that $f:\RR^d\rightarrow\RR$ is convex. Consider a nonempty and bounded convex set $C\subseteq\RR^d$ and let us restrict ourselves to the choice of $g = 1_C:\RR^d\rightarrow\RR$, which takes $x\in \RR^d$ to \begin{align} g(x) = 1_C(x) = \begin{cases} 0 & x \in C\\ \infty & x\notin C. 
\end{cases} \label{eq:indicator} \end{align} We also let $T_C(x)$ denote the tangent cone to $C$ at $x$, and $P_{T_C(x)}:\RR^d\rightarrow\RR^d$ denote the orthogonal projection onto this tangent cone. To simplify the discussion, we also set $S_K = \RR^d$. Then, it is not difficult to verify that \eqref{eq:new slater defn} is equivalent to \begin{align} \nu_A := \begin{cases} \underset{v,x}{\min} \, \, \frac{\left\| P_{T_C(x)} A^\top v \right\|}{\|v\|} \\ \|v\|\le \rho\\ \|x\|\le \rho' \end{cases} = \begin{cases} \underset{x}{\min} \,\, \eta_{m}\left( P_{T_C(x)} A^\top \right) \\ \|x\|\le \rho', \end{cases} -\label{eq:nu for cvx} +%\label{eq:nu for cvx} \end{align} where $\eta_{m}$ returns the $m$th largest singular value of its input matrix. The nonconvex Slater condition here, namely, $\nu_A>0$, is closely related to the standard Slater's condition in convex optimization. -\begin{proposition}\label{prop:convex} -In Program (\ref{prob:01}), suppose that $f$ is a convex function, $g$ is as in (\ref{eq:indicator}), and $A:\RR^d\rightarrow\RR^m$ a linear operator, represented with the matrix $A\in \RR^{m\times d}$. Assume also that Program (\ref{prob:01}) is feasible, namely, there exists $x\in C$ such that $Ax=0$. In (\ref{eq:nu for cvx}), let us take $S_K =\RR^d$. Then the standard Slater's condition holds if the nonconvex Slater's condition ($\nu_A>0$) holds. +\begin{proposition}%\label{prop:convex} +In problem (\ref{prob:01}), suppose that $f$ is a convex function, $g$ is as in (\ref{eq:indicator}), and $A:\RR^d\rightarrow\RR^m$ a linear operator, represented with the matrix $A\in \RR^{m\times d}$. Assume also that Program (\ref{prob:01}) is feasible, namely, there exists $x\in C$ such that $Ax=0$. In (\ref{eq:nu for cvx}), let us take $S_K =\RR^d$. Then the standard Slater's condition holds if the nonconvex Slater's condition ($\nu_A>0$) holds. \end{proposition} \begin{proof} Suppose that the standard Slater's condition does not hold, namely, that \begin{equation} \relint(\Null(A) \cap C) = \Null(A)\cap \relint(C) = \emptyset, \label{eq:no feas} \end{equation} where $\Null(A)$ and $\relint(C)$ denote the null space of the matrix $A$ and the relative interior of $C$, respectively. By assumption, there exists $x_0\in C$ such that $Ax_0=0$. It follows from \eqref{eq:no feas} that $x_0\in \text{boundary}(C)$ and that $\Null(A)$ supports $C$ at $x_0$, namely, $Ax\ge 0$ for every $x\in C$. Consequently, $\Null(A) \cap T_C(x_0) \ne \{0\}$, where $T_C(x_0)$ is the tangent cone of the set $C$ at $x_0$. Equivalently, it holds that $\row(A)\cap N_C(x_0) \ne \{0\}$, where $\row(A)$ is the row space of the matrix $A$ and $N_C(x_0)$ is the normal cone to $C$ at $x_0$. That is, there exists a unit-norm vector $v$ such that $P_{T_C(x_0)}A^\top v=0$, and, consequently, $ \eta_{m}(P_{T_C(x_0)}A^\top) = 0 $. Let us take $\rho'=\|x_0\|$ in \eqref{eq:nu for cvx}. We then conclude that $\nu_A=0$, namely, the nonconvex Slater's condition also does not hold, thereby completing the proof.\hfill \qed \end{proof} -The nonconvex Slater's condition is necessary for the success of Algorithm~\ref{Algo:2}. \textbf{Please add and discuss the convex visualization discussed with Volkan.} +The nonconvex Slater's condition is necessary for the success of Algorithm~\ref{Algo:2}. +The following result, proved in Appendix \ref{sec:proof of bnd Ak}, uses the nonconvex Slater's condition to find a converse, thus bounding the feasibility gap with the gradient mapping.
+\begin{lemma}\label{lem:bnd bnd Ak} \textbf{\emph{(Nonconvex Slater's condition)}} +For an integer interval $K=[k_0:k_1]$, let us set +\begin{align} +\cone(x_{k+1}) := \text{cone}(\partial g(x_{k+1}) )^* \subset \RR^d, +\qquad \forall k\in K, +\end{align} +for short, and +consider a subspace $S_K\subseteq \mathbb{R}^d$ such that +\begin{align} +S _{K} \supseteq \bigcup_{k\in K} P_{\cone(x_{k+1})}\left( DA(x_{k+1})^\top A(x_{k+1}) \right), +\label{eq:defn of S lemma} +\end{align} +and, to simplify the notation, let $S_{K}$ also denote an orthonormal basis for this subspace. +For $\rho,\rho',\rho''>0$, assume that +\begin{align} +\max_{k\in K}\|A(x_k)\| \le \rho, +\qquad +\max_{k\in K}\|x_k\| \le \rho', +\qquad +\max_{k\in K} \|x_{k+1}-x_k\|\le \rho''. +\label{eq:good neighb} +\end{align} +Let us also set +\begin{align} +\nu_A := +\begin{cases} +\min_{v,x} \, \frac{\left\| S_{K}^\top P_{\cone(x)} ( DA(x)^\top v) \right\|}{\|v\|} \\ +\|v\|\le \rho\\ +\|x\|\le \rho'. +\end{cases} +\label{eq:new slater lemma} +\end{align} +and assume that $\nu_A\ge 2\lambda_A\rho''$. +Then it holds that +\begin{align} +\|A(x_k)\| +& \le \frac{2}{\nu_A\b_k} \left( \|G_k\| + \lambda'_f + \lambda'_A \| y_k\| \right), +\qquad \forall k\in K, +\label{eq:bnd on Ak final} +\end{align} +where +\begin{align} +\lambda'_f := \max_{\|x\| \le \rho'} \|\nabla f(x)\|, +\qquad +\lambda'_A := \max_{\|x\|\le \rho'} \| DA(x)\|. +\end{align} +\end{lemma} + + +Roughly speaking, \eqref{eq:bnd on Ak final} states that the feasibility gap is itself bounded by the gradient map. + -\textbf{We should add a discussion of the result.} \begin{theorem}[Linearized AL] \label{thm:main} For a sufficiently large $k_0$, consider an integer interval $K=[k_0:k_1]$, and suppose that \begin{align} \inf_{k\ge k_0} f(x_k) + g(x_k) + \langle A(x_k) ,y_{k_0}\rangle >- \infty, \end{align} and that $\rho$ is large enough such that (\ref{eq:suff cnd on rho final}) holds . For $\rho'>0$, in addition to the strong smoothness of $f$ and $A$ quantified in (\ref{eq:smoothness basic}), let us define \begin{align} \lambda'_f = \max_{\|x\|\le \rho'} \|\nabla f(x)\|, \qquad \lambda'_A = \max_{\|x\| \le \rho'} \|DA(x)\|, \end{align} to be the (restricted) Lipschitz constants of $f$ and $A$, respectively. Moreover, suppose that the nonconvex Slater's condition, namely, Lemma \ref{lem:bnd bnd Ak}, is in force, and \begin{align} \nu_A \ge 2\lambda_A \rho''. \end{align} Then the output of linearized AL in Algorithm~\ref{Algo:2} satisfies \textbf{we need a better expression below to better show the dependence on various parameters.} \begin{align} \min_{k\in K}\overline{\g} \|G_k\|^2 \sqrt{\frac{k_0 \log^2(k_0+1)}{k_1\log^2(k_1+1)}} + \|A(x_k)\|^2 \lesssim \frac{1}{k_1-k_0}, \end{align} where $\overline{\gamma}$ is independent of $k_1$ and $\lesssim$ suppresses the dependence on all parameters except $k_1$. The exact expressions in terms of $\lambda_f,\lambda'_f,\lambda_A,\lambda'_A, \b_{k_0},\s_{k_0}, \nu_A,y_0$ are found in (\ref{eq:low bnd on gammas},\ref{eq:exact dependences}). -\textbf{Please add a thorough discussion of the result. } +\notea{Please add a discussion of the result. 
} \end{theorem} diff --git a/tex/Paper/sections/LinAL_theory.tex b/tex/Paper/sections/LinAL_theory.tex index 2435849..a29d864 100644 --- a/tex/Paper/sections/LinAL_theory.tex +++ b/tex/Paper/sections/LinAL_theory.tex @@ -1,335 +1,291 @@ \section{Proof of Theorem \ref{thm:main}} For the reader's convenience, let us recall the updates of the algorithm in iteration $k$, namely, \begin{align} x_{k+1} & = P_g( x_k - \gamma_k \nabla_x \mathcal{L}_{\beta_k} (x_k,y_k) ) \nonumber\\ & = P_g\Big( x_k - \gamma_k \nabla f(x_k)\nonumber\\ & \qquad \qquad - \gamma_k DA(x_k) ^\top \left( y_k + \b_k A(x_k) \right) \Big), \qquad \text{(see \eqref{eq:Lagrangian})} \label{eq:update uk recall} \end{align} \begin{align} y_{k+1} =y_k + \s_{k+1} A(x_{k+1}), \label{eq:y update recall} \end{align} and throughout we also use the shorthand \begin{equation} G_k = G_{\beta_k,\gamma_k}(x_k,y_k) = \frac{x_k-x_{k+1}}{\gamma_k}. \qquad \text{(see \eqref{eq:grad map})} \label{eq:grad map recall} \end{equation} For integers $k_0\le k_1$, consider the interval \begin{equation} K = [k_0:k_1]=\{k_0,\cdots,k_1\}. \end{equation} Since $\gamma_k$ is determined by the line search subroutine in Lemma \ref{lem:eval Lipsc cte}, we may now apply Lemma \ref{lem:11} for every iteration in this interval to find that \begin{align} \frac{\gamma_k \|G_k\|^2}{2} & \le \mathcal{L}_{\beta_k}(x_k,y_k) + g(x_k) - \mathcal{L}_{\beta_k}(x_{k+1},y_k) - g(x_{k+1}) \qquad \text{(see Lemma \ref{lem:11})} \nonumber\\ & = f(x_k) + g(x_k) - f(x_{k+1})- g(x_{k+1}) + \langle A(x_k)-A(x_{k+1}) , y_k \rangle \nonumber\\ & \qquad + \frac{\b_k}{2}( \|A(x_k)\|^2 - \|A(x_{k+1})\|^2), \qquad \text{(see \eqref{eq:Lagrangian})} \label{eq:smoothness rep} \end{align} for every $k\in K$. On the other hand, note that \begin{align} y_k = y_{k_0-1} + \sum_{i=k_0}^k \sigma_i A(x_i), \qquad \text{(see \eqref{eq:y update recall})} \label{eq:y update unfolded} \end{align} which, after substituting in \eqref{eq:smoothness rep}, yields that \begin{align} \frac{\gamma_k \|G_k\|^2}{2} & \le f(x_k) + g(x_k) - f(x_{k+1})-g(x_{k+1}) \nonumber\\ & \qquad + \left\langle A(x_k) - A(x_{k+1}) , y_{k_0} + \sum_{i=k_0+1}^k \s_i A(x_i) \right\rangle \nonumber\\ & \qquad + \frac{\b_k}{2}( \|A(x_k)\|^2-\|A(x_{k+1})\|^2) . \label{eq:key ineq} \end{align} By summing up \eqref{eq:key ineq} over $k$ from $k_0$ to $k_1$, we argue that \begin{align} & \sum_{k=k_0}^{k_1} \frac{\gamma_k \|G_k\|^2}{2} \nonumber\\ & \le f(x_{k_0})+g(x_{k_0}) - f(x_{k_1+1})-g(x_{k_1+1}) + \langle A(x_{k_0}) - A(x_{k_1+1}) , y_{k_0}\rangle \nonumber\\ & \qquad + \sum_{k=k_0}^{k_1} \sum_{i=k_0+1}^k \sigma_i \left\langle A(x_k) - A(x_{k+1}) , A(x_i) \right\rangle \nonumber\\ & \qquad + \sum_{k=k_0}^{k_1} \frac{\beta_k}{2} \|A(x_k)\|^2 - \sum_{k=k_0}^{k_1} \frac{\beta_k}{2} \|A(x_{k+1})\|^2 \qquad \text{(see \eqref{eq:key ineq})} \nonumber\\ & = f(x_{k_0})+g(x_{k_0}) - f(x_{{k_1}+1}) - g(x_{k_1+1}) + \langle A(x_{k_0}) - A(x_{{k_1}+1}) , y_{k_0} \rangle \nonumber\\ & \qquad + \sum_{k=k_0}^{k_1} \sum_{i=k_0+1}^k \sigma_i \left\langle A(x_k) - A(x_{k+1}) , A(x_i) \right\rangle\nonumber\\ -& \qquad + \sum_{k=k_0}^{k_1} \frac{\beta_k}{2}\|A(x_k)\|^2- \sum_{k=k_0+1}^{{k_1}+1} \frac{\beta_{k-1}}{2} \|A(x_{k})\|^2 \nonumber\\ -& = f(x_{k_0})+g(x_{k_0}) - f(x_{{k_1}+1})-g(x_{k_1+1}) + \langle A(x_{k_0}) - A(x_{{k_1}+1}) , y_{k_0} \rangle \nonumber\\ +& \qquad + \sum_{k=k_0}^{k_1} \frac{\beta_k}{2}\|A(x_k)\|^2- \sum_{k=k_0+1}^{{k_1}+1} \frac{\beta_{k-1}}{2} \|A(x_{k})\|^2. 
+\end{align} +We continue by rewriting the last line above as +\begin{align} +& \sum_{k=k_0}^{k_1} \frac{\gamma_k \|G_k\|^2}{2} \nonumber\\ +& \le f(x_{k_0})+g(x_{k_0}) - f(x_{{k_1}+1})-g(x_{k_1+1}) + \langle A(x_{k_0}) - A(x_{{k_1}+1}) , y_{k_0} \rangle \nonumber\\ & \qquad + \frac{\beta_{k_0}}{2}\|A(x_{k_0})\|^2 + \sum_{i=k_0+1}^{k_1} \sum_{k=i}^{k_1} \sigma_i \left\langle A(x_k) - A(x_{k+1}) , A(x_i) \right\rangle \nonumber\\ & \qquad + \sum_{k=k_0+1}^{k_1} \frac{\beta_k - \beta_{k-1}}{2} \|A(x_k)\|^2 - \frac{\beta_{k_1}}{2} \| A(x_{k_1+1})\|^2 \nonumber\\ & \le \mu + \sum_{i=k_0+1}^{k_1} \sigma_i \left\langle A(x_i) - A(x_{{k_1}+1}), A(x_i)\right\rangle \nonumber\\ & \qquad + \sum_{k=k_0+1}^{k_1} \frac{\beta_k - \beta_{k-1}}{2} \|A(x_k)\|^2 - \frac{\beta_{k_1}}{2} \| A(x_{k_1+1})\|^2 \qquad \text{(see \eqref{eq:defn mu})} \nonumber\\ & = \mu + \sum_{k=k_0+1}^{k_1} \left( \sigma_k +\frac{\beta_k - \beta_{k-1}}{2} \right) \|A(x_k)\|^2 \nonumber\\ & \qquad - \sum_{k=k_0+1}^{k_1} \sigma_k \left \langle A(x_{{k_1}+1}), A(x_k)\right\rangle - \frac{\beta_{k_1}}{2} \| A(x_{k_1+1})\|^2, \label{eq:long chain broken} -\end{align} +\end{align} where we assumed that \begin{align} \mu & := \sup_{k\ge k_0} \Big( f(x_{k_0})+g(x_{k_0}) - f(x_k) - g(x_k) + \langle A(x_{k_0})-A(x_k) ,y_{k_0}\rangle \nonumber\\ & \qquad \qquad + \frac{\beta_{k_0}}{2} \|A(x_{k_0})\|^2 \Big) < \infty. \label{eq:defn mu} \end{align} For $\b_{k_0},\s_{k_0}>0$, recall the penalty weights and the dual step sizes as \begin{align*} \beta_k = \b_{k_0} \sqrt{\frac{k\log^2 (k+1)}{k_0 \log^2 (k_0+1)}}, \end{align*} \begin{align} \sigma_k = \s_{k_0} \min\left( \sqrt{\frac{k_0 }{k }}, \frac{ \|A(x_{k_0})\| k_0 \log^2(k_0+1)}{\|A(x_k)\| k \log^2(k+1)} \right), \qquad \forall k \in K. \label{eq:beta n sigma} \end{align} For the record, the above choices imply that \begin{align} \beta_k- \beta_{k-1} & = \beta_{k-1} \left( \sqrt{\frac{k\log^{{2}}(k+1)}{(k-1)\log^{{2}}k}} -1 \right) \nonumber\\ & \le \beta_{k-1} \cdot \frac{k\log^{{2}}(k+1) - (k-1) \log^{{2}} k}{(k-1)\log^{{2}} k} \nonumber\\ & \le \b_{k-1} \left( \frac{k\log^2(1+ \frac{1}{k})}{(k-1)\log^2k} + \frac{1}{k-1} \right) \nonumber\\ & \le \frac{2 \beta_{k-1}}{k-1} \qquad ( k_0 \gg 1) \nonumber\\ & \le \frac{{2}\beta_{k_0}}{k-1} \sqrt{\frac{(k-1)\log^{{2}}k}{k_0 \log^{{2}}(k_0+1)}} \nonumber\\ & = \frac{ 2\beta_{k_0} \log k}{\sqrt{(k-1)k_0} \log (k_0+1)}, \qquad \forall k \in K, \label{eq:consequences} \end{align} when $k_0$ is sufficiently large. for sufficiently large $k_0$. We can therefore further simplify the last line of \eqref{eq:long chain broken} as \begin{align} & \sum_{k=k_0}^{k_1} \frac{\g_k \|G_k\|^2}{2} \nonumber\\ & \le\mu + \sum_{k=k_0}^{k_1} \left(\sigma_k+ \frac{\beta_k-\beta_{k-1}}{2} \right) \|A(x_k)\|^2 \nonumber\\ & \qquad + \sum_{k=k_0}^{k_1} \sigma_k \|A(x_{{k_1}+1})\| \|A(x_k)\| - \frac{\beta_{k_1}}{2} \| A(x_{k_1+1})\|^2 \qquad \text{(see (\ref{eq:long chain broken}))} \nonumber\\ & \le \mu + \sum_{k=k_0}^{k_1} \left(\sigma_k+ \frac{\beta_k-\beta_{k-1}+1}{2} \right) \|A(x_k)\|^2 \nonumber\\ & \qquad + \frac{1}{2} \left( \sum_{k=k_0}^{k_1} \sigma_k^2 -\beta_{k_1} \right) \| A(x_{k_1+1})\|^2 \qquad (2ab \le a^2+b^2) \nonumber\\ & \le \mu+ 2 \sum_{k=k_0}^{k_1} \|A(x_k)\|^2, \qquad \text{(see (\ref{eq:beta n sigma},\ref{eq:consequences}))} \label{eq:long chain broken 10} \end{align} for sufficiently large $k_0$. 
Indeed, the coefficient of $\|A(x_{k_1+1})\|$ above is negative because \begin{align} & \sum_{k=k_0}^{k_1} \s_k^2 - \b_{k_1} \nonumber\\ & \le \sum_{k=k_0}^{k_1} \frac{\s_{k_0}^2 k_0}{k} - \b_{k_0}\sqrt{\frac{k_1\log^2(k_1+1)}{k_0 \log^2(k_0+1)}} \nonumber\\ & \le 2\s_{k_0}^2 k_0 \int_{k_0}^{k_1} \frac{da}{a} - \b_{k_0}\sqrt{\frac{k_1\log^2(k_1+1)}{k_0 \log^2(k_0+1)}} \nonumber\\ & \le 0, \end{align} -when $k_0$ is sufficiently large. Note that \eqref{eq:long chain broken} bounds the gradient mapping with the feasibility gap. The following result, proved in Appendix \ref{sec:proof of bnd Ak}, uses the nonconvex Slater's condition to find a converse, thus bounding the feasibility gap with the gradient mapping. -\begin{lemma}\label{lem:bnd bnd Ak} \textbf{\emph{(Nonconvex Slater's condition)}} -For an integer interval $K=[k_0:k_1]$, let us set -\begin{align} -\cone(x_{k+1}) := \text{cone}(\partial g(x_{k+1}) )^* \subset \RR^d, -\qquad \forall k\in K, -\end{align} -for short, and -consider a subspace $S_K\subseteq \mathbb{R}^d$ such that -\begin{align} -S _{K} \supseteq \bigcup_{k\in K} P_{\cone(x_{k+1})}\left( DA(x_{k+1})^\top A(x_{k+1}) \right), -\label{eq:defn of S lemma} -\end{align} -and, to simplify the notation, let $S_{K}$ also denote an orthonormal basis for this subspace. -For $\rho,\rho',\rho''>0$, assume that -\begin{align} -\max_{k\in K}\|A(x_k)\| \le \rho, -\qquad -\max_{k\in K}\|x_k\| \le \rho', -\qquad -\max_{k\in K} \|x_{k+1}-x_k\|\le \rho''. -\label{eq:good neighb} -\end{align} -Let us also set -\begin{align} -\nu_A := -\begin{cases} -\min_{v,x} \, \frac{\left\| S_{K}^\top P_{\cone(x)} ( DA(x)^\top v) \right\|}{\|v\|} \\ -\|v\|\le \rho\\ -\|x\|\le \rho'. -\end{cases} -\label{eq:new slater lemma} -\end{align} -and assume that $\nu_A\ge 2\lambda_A\rho''$. -Then it holds that -\begin{align} -\|A(x_k)\| -& \le \frac{2}{\nu_A\b_k} \left( \|G_k\| + \lambda'_f + \lambda'_A \| y_k\| \right), -\qquad \forall k\in K, -\label{eq:bnd on Ak final} -\end{align} -where -\begin{align} -\lambda'_f := \max_{\|x\| \le \rho'} \|\nabla f(x)\|, -\qquad -\lambda'_A := \max_{\|x\|\le \rho'} \| DA(x)\|. -\end{align} -\end{lemma} - - -Roughly speaking, \eqref{eq:bnd on Ak final} states that the feasibility gap is itself bounded by the gradient map. In order to apply Lemma \ref{lem:bnd bnd Ak}, let us assume that (\ref{eq:good neighb}) holds. Lemma \ref{lem:bnd bnd Ak} is then in force and we may now substitute \eqref{eq:bnd on Ak final} back into \eqref{eq:long chain broken 10} to find that +when $k_0$ is sufficiently large. Note that \eqref{eq:long chain broken} bounds the gradient mapping with the feasibility gap. +\notea{removed slater lemma to the body.} +In order to apply Lemma \ref{lem:bnd bnd Ak}, let us assume that (\ref{eq:good neighb}) holds. 
Lemma \ref{lem:bnd bnd Ak} is then in force and we may now substitute \eqref{eq:bnd on Ak final} back into \eqref{eq:long chain broken 10} to find that \begin{align} & \sum_{k=k_0}^{k_1} \gamma_k \|G_k\|^2 \nonumber\\ & \le 2\sum_{k=k_0}^{{k_1}}\|A(x_k)\|^2 +2 \mu \qquad \text{(see \eqref{eq:long chain broken 10})} \nonumber\\ & \le 2\sum_{k=k_0}^{k_1} \left(\frac{2}{\nu_A\b_k} \left( \|G_k\| + \lambda'_f + \lambda'_A \| y_k\| \right) \right)^2 + 2\mu \qquad \text{(see \eqref{eq:bnd on Ak final})} \nonumber\\ & \le\sum_{k=k_0}^{k_1} \frac{32\|G_k\|^2}{ \nu_A^2 \b_k^2} + \sum_{k=k_0}^{k_1} \frac{8 \lambda'^2_f}{ \nu_A^2 \b_k^2 } + \sum_{k=k_0}^{k_1} \frac{8\lambda_A'^2 \|y_k\|^2}{ \nu_A^2\b_k^2} + 2\mu, \label{eq:to be used for feas pre} \end{align} where $|K|:=k_1-k_0+1$ is the size of interval $K$ and the last line above uses the inequality \begin{align} \left(\sum_{i=1}^p a_i\right)^2 \le p \sum_{i=1}^p a_i^2, \end{align} for integer $p$ and scalars $\{a_i\}_i$. If we set \begin{align} B_K = \sum_{k=k_0}^{k_1} \frac{\|y_k\|^2}{k\log^2 (k+1)}, \qquad c \ge \sum_{k=1}^{\infty} \frac{1}{k\log^2 (k+1)}, \label{eq:defn of BK} \end{align} and, after recalling the choice of $\{\b_k\}_k$ in \eqref{eq:beta n sigma}, the last line of \eqref{eq:to be used for feas pre} can be simplified as \begin{align} \sum_{k=k_0}^{k_1} \g_k \|G_k\|^2 & \le 2\sum_{k=k_0}^{k_1} \|A(x_k)\|^2 + 2\mu \nonumber\\ & \le \sum_{k=k_0}^{k_1} \frac{32\|G_k\|^2 k_0 \log^2(k_0+1)}{ \nu_A^2 \b_{k_0}^2 k\log^2 (k+1)} + \sum_{k=k_0}^{k_1} \frac{8\lambda'^2_f k_0 \log^2 (k_0+1) }{\nu_A^2 \b_{k_0}^2 k\log^2 (k+1) } +2\mu \nonumber\\ & \qquad + \sum_{k=k_0}^{k_1} \frac{8\lambda'^2_A k_0 \log^2(k_0+1) \|y_k\|^2}{\nu_A^2 k \log^2(k+1)} +2\mu \qquad \text{(see \eqref{eq:beta n sigma})} \nonumber\\ & \le \sum_{k=k_0}^{k_1} \frac{32 \|G_k\|^2 k_0 \log^2(k_0+1)}{ \nu_A^2 \b_{k_0}^2 k\log^2 (k+1)} + \frac{8c \lambda'^2_f k_0 \log^2(k_0+1)}{\nu_A^2 \b^2_{k_0}} \nonumber\\ & \qquad + \frac{8\lambda'^2_A B_K k_0 \log^2(k_0+1)}{\nu_A^2 \b_{k_0}^2} +2\mu. \qquad \text{(see \eqref{eq:defn of BK})} \label{eq:to be used for feas} \end{align} To simplify the above bound, let us assume that \begin{equation} \frac{32 k_0 \log^2(k_0+1) }{\nu_A^2 \b_{k_0}^2 k\log^2(k+1)} \le \frac{\gamma_k}{2}, \qquad \forall k\in K. \label{eq:to be used later on} \end{equation} After rearranging \eqref{eq:to be used for feas} and applying \eqref{eq:to be used later on}, we arrive at \begin{align} & \sum_{k=k_0}^{k_1} \frac{\gamma_k}{2} \|G_k\|^2 \nonumber\\ & \le \sum_{k=k_0}^{k_1} \left( \gamma_k - \frac{32k_0 \log^2(k_0+1)}{\nu_A^2 \b_{k_0}^2 k\log^2(k+1)} \right) \|G_k\|^2 \qquad \text{(see \eqref{eq:to be used later on})} \nonumber\\ & \le \frac{8c \lambda'^2_f k_0 \log^2(k_0+1)}{\nu_A^2 \b^2_{k_0}} + \frac{8\lambda'^2_A B_K k_0 \log^2(k_0+1)}{\nu_A^2 \b_{k_0}^2} + 2\mu, \label{eq:final bnd on G} \end{align} where the last line above uses \eqref{eq:defn of BK}. In turn, the bound above on the gradient mapping controls the feasibility gap, namely, \begin{align} & \sum_{k=k_0}^{{k_1}} \|A(x_k)\|^2 \nonumber\\ & \le \sum_{k=k_0}^{k_1} \frac{\g_k \|G_k\|^2}{4 } + \frac{4c \lambda'^2_f k_0 \log^2(k_0+1)}{\nu_A^2 \b_{k_0}} \nonumber\\ & \qquad + \frac{4\lambda'^2_A B_K k_0 \log^2(k_0+1)}{\nu_A^2 \b_{k_0}^2} \qquad \text{(see (\ref{eq:to be used for feas},\ref{eq:to be used later on}))} \nonumber\\ & \le \frac{8c \lambda'^2_f k_0 \log^2(k_0+1)}{\nu_A^2 \b^2_{k_0}} \nonumber\\ & \qquad + \frac{8\lambda'^2_A B_K k_0 \log^2(k_0+1)}{\nu_A^2 \b_{k_0}^2} + \mu. 
\qquad \text{(see (\ref{eq:final bnd on G}))} \label{eq:final bnd on feas gap} \end{align} By adding (\ref{eq:final bnd on G},\ref{eq:final bnd on feas gap}), we find that \begin{align} \sum_{k=k_0}^{k_1} \g_k \|G_k\|^2 + \|A(x_k)\|^2 & \le \frac{24c \lambda'^2_f k_0 \log^2(k_0+1)}{\nu_A^2 \b^2_{k_0}} \nonumber\\ & \qquad + \frac{24\lambda'^2_A B_K k_0 \log^2(k_0+1)}{\nu_A^2 \b_{k_0}^2} + 5\mu. \label{eq:overall bnd raw} \end{align} When we applied Lemma \ref{lem:bnd bnd Ak} earlier, we assumed that (\ref{eq:good neighb}) holds. Let us revisit the first assumption in \eqref{eq:good neighb}; the other two assumptions in \eqref{eq:good neighb} are forced directly by the algorithm. We first derive a weaker but uniform bound on the feasibility gap. For every $k\in K$, it holds that \begin{align} \|A(x_k)\|^2 & \le \sum_{i=k_0}^{k_1} \|A(x_i)\|^2 \nonumber\\ & \le \frac{8c \lambda'^2_f k_0 \log^2(k_0+1)}{\nu_A^2 \b^2_{k_0}} + \frac{8\lambda'^2_A B_K k_0 \log^2(k_0+1)}{\nu_A^2 \b_{k_0}^2} + \mu. \label{eq:rate of feas gap} \end{align} Therefore, we may replace the first assumption in \eqref{eq:good neighb} with the assumption that \begin{align} \frac{8c \lambda'^2_f k_0 \log^2(k_0+1)}{\nu_A^2 \b^2_{k_0}} + \frac{8\lambda'^2_A B_K k_0 \log^2(k_0+1)}{\nu_A^2 \b_{k_0}^2} + \mu \le \rho^2. \label{eq:suff cnd on rho} \end{align} Note that, for \eqref{eq:suff cnd on rho} to hold, it is in particular necessary that $\|A(x_{k_0})\| \le \rho \sqrt{2/\b_{k_0}}$, as seen in \eqref{eq:defn mu}. In order to interpret (\ref{eq:overall bnd raw},\ref{eq:suff cnd on rho}), we next estimate $B_{K}$ in \eqref{eq:defn of BK}. To that end, let us first control the growth of the dual sequence $\{y_k\}_k$. Recalling \eqref{eq:y update recall} and for every $k\in K$, we write that \begin{align} \|y_k\| & \le \|y_{k_0}\|+ \sum_{i=k_0+1}^k \s_i \|A(x_i)\| \qquad \text{(see \eqref{eq:y update recall})} \nonumber\\ & \le \|y_{k_0}\|+ \sum_{i=k_0+1}^k \frac{\rho \s_{k_0} k_0 \log^2(k_0+1)}{ k\log^2(k+1)} \qquad \text{(see (\ref{eq:good neighb},\ref{eq:beta n sigma}))} \nonumber\\ & \le \|y_{k_0}\|+ c\rho \s_{k_0} k_0 \log^2(k_0+1) \nonumber\\ & =: y_{\max}. \label{eq:dual growth} \end{align} With the growth of the dual sequence uncovered above, we evaluate $B_{K}$ as \begin{align} B_{K} & = \sum_{k=k_0}^{k_1} \frac{\|y_k\|^2}{k\log^2 (k+1)} \qquad \text{(see \eqref{eq:defn of BK})} \nonumber\\ & \le \sum_{k=k_0}^{k_1} \frac{y_{\max}^2}{k\log^2 (k+1)} \qquad \text{(see \eqref{eq:dual growth})} \nonumber\\ & \le cy_{\max}^2. \qquad \text{(see \eqref{eq:defn of BK})} \label{eq:Bk evaluated} \end{align} In order to interpret (\ref{eq:overall bnd raw},\ref{eq:suff cnd on rho}), it still remains to estimate the primal step sizes $\{\g_k\}_k$. To invoke \eqref{eq:moreover}, we in turn need to gauge how smooth the augmented Lagrangian $\mathcal{L}_{\beta_k}(\cdot,y_k)$ is. For every $k\in K$, note that \begin{align} \lambda_{\beta_k} & \le \lambda_f+ \sqrt{m}\lambda_A \left( \|y_k\| + \b_k \rho \right)+ \b_k d \lambda'^2_A \qquad \text{(see \eqref{eq:smoothness of Lagrangian})} \nonumber\\ & \le (\lambda_f+ \sqrt{m}\lambda_A y_{\max} ) + \b_k \left( \sqrt{m}\lambda_A \rho + d \lambda'^2_A \right). 
\qquad \text{(see \eqref{eq:dual growth})} \label{eq:smoothness at k} \end{align} We are now in position to invoke \eqref{eq:moreover} by writing that \begin{align} \gamma_k & \ge \frac{\theta}{\lambda_{\beta_k}} \qquad \text{(see \eqref{eq:moreover})}\nonumber\\ & \ge \frac{\theta}{ (\lambda_h+ \sqrt{m}\lambda_A y_{\max} )+ \b_k \left( \sqrt{m}\lambda_A \rho + d \lambda'^2_A \right) } \qquad \text{(see \eqref{eq:smoothness at k})} \nonumber\\ & \ge \frac{\theta}{ 2 \b_k \left( \sqrt{m}\lambda_A \rho + d \lambda'^2_A \right)} \qquad ( \text{(\ref{eq:beta n sigma}) and } k_0 \gg 1 ) \nonumber\\ & \ge \frac{\theta }{ 2 \b_{k_0} \left( \sqrt{m}\lambda_A \rho + d \lambda'^2_A \right)}\sqrt{\frac{k_0 \log^2(k_0+1)}{k \log^2(k+1)}} \qquad \text{(see \eqref{eq:beta n sigma})} \nonumber\\ & =: \overline{\g} \sqrt{\frac{k_0 \log^2(k_0+1)}{k \log^2(k+1)}}, \label{eq:low bnd on gammas} \end{align} for every $k\in K$. The first consequence of \eqref{eq:low bnd on gammas} is that \eqref{eq:to be used later on} holds automatically when $k_0$ is sufficiently large. Having estimated $B_K$ and $\{\gamma_k\}_k$, we can also rewrite~(\ref{eq:overall bnd raw},\ref{eq:suff cnd on rho}). Indeed, (\ref{eq:overall bnd raw},\ref{eq:Bk evaluated},\ref{eq:low bnd on gammas}) together imply that \begin{align} & \sum_{k=k_0}^{k_1} \overline{\g} \|G_k\|^2 \sqrt{\frac{k_0 \log^2(k_0+1)}{k\log^2(k+1)}} + \|A(x_k)\|^2 \nonumber\\ & \le \frac{24c \lambda'^2_f k_0 \log^2(k_0+1)}{\nu_A^2 \b^2_{k_0}} + \frac{24c\lambda'^2_A y_{\max}^2 k_0 \log^2(k_0+1)}{\nu_A^2 \b_{k_0}^2} + 5\mu, \end{align} and, consequently, \begin{align} & \min_{k\in K}\overline{\g} \|G_k\|^2 \sqrt{\frac{k_0 \log^2(k_0+1)}{k_1\log^2(k_1+1)}} + \|A(x_k)\|^2\nonumber\\ & \le \frac{24c \lambda'^2_f k_0 \log^2(k_0+1)}{\nu_A^2 \b^2_{k_0}(k_1-k_0)} + \frac{24c\lambda'^2_A y_{\max}^2 k_0 \log^2(k_0+1)}{\nu_A^2 \b_{k_0}^2(k_1-k_0)} + \frac{5\mu}{k_1-k_0}. \label{eq:exact dependences} \end{align} Using (\ref{eq:Bk evaluated}), we also see that, for \eqref{eq:suff cnd on rho} to hold, it suffices to have that \begin{align} \frac{8c \lambda'^2_f k_0 \log^2(k_0+1)}{\nu_A^2 \b^2_{k_0}} + \frac{8c\lambda'^2_A y_{\max}^2 k_0 \log^2(k_0+1)}{\nu_A^2 \b_{k_0}^2} + \mu \le \rho^2. \label{eq:suff cnd on rho final} \end{align} This completes the proof of Theorem \ref{thm:main}. diff --git a/tex/Paper/sections/appendix.tex b/tex/Paper/sections/appendix.tex index 23f87e4..f1c50db 100644 --- a/tex/Paper/sections/appendix.tex +++ b/tex/Paper/sections/appendix.tex @@ -1,217 +1,265 @@ \section{Proof of Lemma \ref{lem:smoothness}\label{sec:proof of smoothness lemma}} \textbf{We assume Hessian exists. We shouldn't assume that for a strictly correct proof!} Note that \begin{align} \mathcal{L}_{\beta}(x,y) = f(x) + \sum_{i=1}^m y_i A_i (x) + \frac{\b}{2} \sum_{i=1}^m (A_i(x))^2, \end{align} which implies that \begin{align} \nabla_x \mathcal{L}_\beta(x,y) & = \nabla f(x) + \sum_{i=1}^m y_i \nabla A_i(x) + \frac{\b}{2} \sum_{i=1}^m A_i(x) \nabla A_i(x) \nonumber\\ & = \nabla f(x) + DA(x)^\top y + \b DA(x)^\top A(x), \end{align} where $DA(x)$ is the Jacobian of $A$ at $x$. By taking another derivative with respect to $x$, we reach \begin{align} \nabla^2_x \mathcal{L}_\beta(x,y) & = \nabla^2 f(x) + \sum_{i=1}^m \left( y_i + \b A_i(x) \right) \nabla^2 A_i(x) + \b \sum_{i=1}^m \nabla A_i(x) \nabla A_i(x)^\top. 
\end{align} It follows that \begin{align} \|\nabla_x^2 \mathcal{L}_\beta(x,y)\| & \le \| \nabla^2 f(x) \|+ \max_i \| \nabla^2 A_i(x)\| \left (\|y\|_1+\b \|A(x)\|_1 \right) + \beta\sum_{i=1}^m \|\nabla A_i(x)\|^2 \nonumber\\ & \le \lambda_h+ \sqrt{m} \lambda_A \left (\|y\|+\b \|A(x)\| \right) + \b \|DA(x)\|^2_F. \end{align} For every $x$ such that $\|A(x)\|\le \rho$ and $\|x\|\le \rho'$, we conclude that \begin{align} \|\nabla_x^2 \mathcal{L}_\beta(x,y)\| & \le \lambda_f + \sqrt{m}\lambda_A \left(\|y\| + \b\rho \right) + \b \max_{\|x\|\le \rho'}\|DA(x)\|_F^2, \end{align} which completes the proof of Lemma \ref{lem:smoothness}. \section{Proof of Lemma \ref{lem:11}\label{sec:proof of descent lemma}} Throughout, let \begin{align} G = G_{\b,\g}(x,y) = \frac{x-x^+}{\g}, \end{align} for short. Suppose that $\|A(x)\|\le \rho$, $\|x\|\le \rho$, and similarly $\|A(x^+)\|\le \rho$, $\|x^+\|\le \rho'$. An application of Lemma \ref{lem:smoothness} yields that \begin{align} \L_\b(x^+,y)+g(x^+) & \le \L_\b(x,y)+ \langle x^+-x,\nabla_x \L_\b(x,y) \rangle + \frac{\lambda_\b}{2} \|x^+ - x\|^2 + g(x^+) \nonumber\\ & = \L_\b(x,y)-\g \langle G ,\nabla_x \L_\b(x,y) \rangle + \frac{\g^2 \lambda_\b }{2} \|G\|^2 + g(x^+) \label{eq:descent pr 1} \end{align} Since $x^+ = P_g(x - \g \nabla_x \L_\b(x,y))$, we also have that \begin{align} G - \nabla_x \L_\b(x,y) = \xi \in \partial g(x^+). \label{eq:opt of prox} \end{align} By combining (\ref{eq:descent pr 1},\ref{eq:opt of prox}), we find that \begin{align} \L_\b(x^+,y)+g(x^+) & \le \L_\b(x,y) -\g \|G\|^2 + \g \langle G, \xi \rangle + \frac{\g^2 \lambda_\b}{2}\|G\|^2 + g(x^+) \nonumber\\ & = \L_\b(x,y) -\g \|G\|^2 + \langle x- x^+ , \xi \rangle + \frac{\g^2 \lambda_\b}{2}\|G\|^2 + g(x^+) \nonumber\\ & \le \L_\b(x,y) + g(x) - \g\left( 1-\frac{\g\lambda_\b}{2}\right) \|G\|^2, \end{align} where the last line above uses the convexity of $g$. Recalling that $\g\le 1/\lambda_\b$ completes the proof of Lemma \ref{lem:11}. \section{Proof of Lemma \ref{lem:eval Lipsc cte}\label{sec:proof of linesearch}} -Recalling $x^+_{\gamma}$ in \eqref{eq:defn of gamma line search}, we note that +By optimality of $x^+_{\gamma}$ in \eqref{eq:defn of x+gamma}, we note that \begin{equation} -x^+_{\gamma} - x +\gamma \nabla_x \mathcal{L}_\beta(x,y) = - \g\xi \in -\partial g (x^+_{\gamma}). +x^+_{\gamma} - x +\gamma \nabla_x \mathcal{L}_\beta(x,y) = - \g\xi \in -\g \partial g (x^+_{\gamma}). \label{eq:optimality of uplus} \end{equation} -Lastly, $\gamma$ by definition in \eqref{eq:defn of gamma line search} satisfies +By definition in \eqref{eq:defn of gamma line search}, $\gamma$ also satisfies \begin{align} & \mathcal{L}_{\beta}(x^+_{\gamma},y) + g(x_\g^+) \nonumber\\ & \le \mathcal{L}_\beta(x,y) + \left\langle x^+_{\gamma} - x , \nabla_x \mathcal{L}_\beta (x,y) \right\rangle + \frac{1}{2\gamma}\|x^+_{\gamma} - x\|^2 + g(x_\g^+) \nonumber\\ & = \mathcal{L}_\beta(x,y) + \left\langle x - x^+_{\gamma} ,\xi \right\rangle - \frac{1}{2\gamma}\|x^+_{\gamma} - x\|^2 + g(x_\g^+)\nonumber\\ & \le \mathcal{L}_\beta(x,y) - \frac{1}{2\gamma}\|x^+_{\gamma} - x\|^2 + g(x) - g(x^+_\g) \qquad (\text{convexity of } g) \nonumber\\ & = \mathcal{L}_\beta(x,y) - \frac{\gamma}{2} \|G_{\beta,\gamma}(x,y)\|^2 +g(x) - g(x^+_\g), \qquad \text{(see Definition \ref{def:grad map})} \end{align} -which completes the proof of Lemma \ref{lem:eval Lipsc cte}. +which completes the proof of Lemma \ref{lem:eval Lipsc cte} since \eqref{eq:moreover} follows directly from \eqref{eq:defn of gamma line search}. 
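For the reader's convenience, the following minimal sketch illustrates how one iteration of the linearized augmented Lagrangian update, namely the proximal-gradient step \eqref{eq:defn of x+gamma} with the backtracking line search \eqref{eq:defn of gamma line search}, followed by the dual ascent step, might be implemented. It is only a schematic illustration under simplifying assumptions: the oracles \texttt{f}, \texttt{A}, \texttt{grad\_f}, \texttt{jac\_A}, and \texttt{prox\_g} are user-supplied placeholders (NumPy callables) and are not part of the formal development above.
\begin{verbatim}
import numpy as np

def aug_lagrangian(x, y, beta, f, A):
    # L_beta(x, y) = f(x) + <A(x), y> + (beta/2) ||A(x)||^2
    r = A(x)
    return f(x) + y @ r + 0.5 * beta * (r @ r)

def grad_aug_lagrangian(x, y, beta, grad_f, A, jac_A):
    # gradient of L_beta(., y) at x
    return grad_f(x) + jac_A(x).T @ (y + beta * A(x))

def linearized_al_step(x, y, beta, sigma, f, A, grad_f, jac_A, prox_g,
                       gamma0=1.0, theta=0.5, max_backtrack=50):
    # one primal proximal-gradient step with backtracking line search,
    # followed by one dual ascent step
    grad = grad_aug_lagrangian(x, y, beta, grad_f, A, jac_A)
    base = aug_lagrangian(x, y, beta, f, A)
    gamma = gamma0
    for _ in range(max_backtrack):
        x_plus = prox_g(x - gamma * grad, gamma)
        d = x_plus - x
        # sufficient-decrease test from the line-search lemma
        if aug_lagrangian(x_plus, y, beta, f, A) <= base + grad @ d + (d @ d) / (2 * gamma):
            break
        gamma *= theta
    y_plus = y + sigma * A(x_plus)
    return x_plus, y_plus, gamma
\end{verbatim}
Here \texttt{prox\_g(v, gamma)} is assumed to return the proximal map of $\gamma g$ at $v$; with $g\equiv 0$ it is simply the identity, in which case the sketch reduces to a gradient step on the augmented Lagrangian followed by the dual update.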
+\section{Proof of Proposition \ref{prop:convex} \label{sec:proof of prop}} +To prove the first claim of the proposition, suppose that the Slater's condition does not hold, namely, that +\begin{equation} +\Null(A)\cap \relint(C) = \emptyset, +\label{eq:no feas} +\end{equation} +where $\Null(A)$ and $\relint(C)$ denote the null space of the matrix $A$ and the relative interior of $C$, respectively. +We have assumed that \eqref{prob:01} is feasible, namely, there exists $x\in C$ such that $Ax=0$. It follows from \eqref{eq:no feas} that $x\in \text{boundary}(C)$ and that $\Null(A)$ supports $C$ at $x$, namely, +$A x'\ge 0$ for every $x'\in C$. (The inequality applies to each entry of the vector $Ax'$.) +Consequently, $\Null(A) \cap T_C(x) \ne \{0\}$, where $T_C(x)$ is the tangent cone of the set $C$ at $x$. +Equivalently, it holds that +$\row(A)\cap N_C(x) \ne \{0\}$, where $\row(A)$ is the row space of the matrix $A$ and $N_C(x)$ is the normal cone to $C$ at $x$. That is, there exists a unit-norm vector $v$ such that +$P_{T_C(x)}A^\top v=0$ +and, consequently, $P_S P_{T_C(x)}A^\top v=0$. Let us take $\rho'=\|x\|$ in \eqref{eq:nu for cvx}. We then conclude that $\nu(g,A,S,1,\|x\|)=\nu(g,A,S,\infty,\|x\|)=0$, namely, the uniform regularity also does not hold for any $\rho\ge 0$ and $\rho'=\|x\|$. The last identity follows from the homogeneity of the objective in \eqref{eq:nu for cvx} with respect to $v$, which renders the choice of $\rho$ immaterial. +Because increasing $\rho'$ cannot increase the right-hand side of \eqref{eq:new slater defn}, we find that $\nu(g,A,S,\infty,\rho')=0$ for any $\rho'\ge \max_{x\in C} \|x\|$, which proves the first claim in Proposition \ref{prop:convex}. + +For the converse, we can verify that it suffices to take $\text{row}(A) \subseteq S$. +Next, suppose that the uniform regularity does not hold, namely, that there exists $x\in \RR^d$ such that +\begin{align*} +\eta_m ( (I_d - P_S P_{N_C(x)}) A^\top ) =0, +\quad A(x) = 0, \quad x\in C. +\end{align*} +The first identity above can be rewritten as +\begin{align*} +\eta_m ( (I_d - P_S P_{N_C(x)}) A^\top ) +& = +\eta_m ( P_S (\text{id} - P_{N_C(x)}) A^\top ) +\qquad (\text{row}(A) \subseteq S ) + \nonumber\\ + & += \eta_m(P_S P_{T_C(x)} A^\top) +\qquad \text{(Moreau decomposition)} +\nonumber\\ +& = \eta_m( P_{T_C(x)} A^\top) =0, +\qquad (S = \text{aff}(C)) +\end{align*} +where $\text{aff}(C)$ is the affine hull of $C$. +Then it must be the case that $x\in \text{boundary}(C)$. Indeed, if $x\in \text{relint}(C)$, we have that $T_C(x)=S$ and thus +\begin{align*} +\eta_m(P_{S}P_{T_C(x)} A^\top) & = \eta_m( P_S A^\top)\nonumber\\ +& = \eta_m( A^\top) +\qquad ( \text{row}(A) \subseteq S) \nonumber\\ +& >0, +\end{align*} +where the last line holds because, by assumption, $A$ is full-rank. +Therefore, $x\in\text{boundary}(C)$ and it follows that $\text{dim}(T_C(x)) \ge m-1$. That is, $\text{row}(A)$ contains a unique direction orthogonal to $T_C(x)$. In particular, it follows that $\text{null}(A) \subseteq T_C(x)$. That is, $\text{null}(A)$ supports $C$, which in turn implies that $\text{null}(A)\cap \text{int}(C) =\emptyset $, namely, the Slater's condition does not hold. This proves the second claim in Proposition \ref{prop:convex}. + + \section{Proof of Lemma \ref{lem:bnd bnd Ak} \label{sec:proof of bnd Ak}} Throughout, we assume that \begin{align} \max_{k\in K} \|A(x_k)\| \le \rho, \qquad \max_{k\in K} \| x_k\| \le \rho', \qquad \max_{k\in K} \| x_{k+1}-x_k \| \le \rho''.
\label{eq:bndness in slater proof} \end{align} From the $x$ update in \eqref{eq:update uk recall}, it follows that \begin{align} x_{k+1}-x_k + \g_k \nabla f(x_k) + \g_k DA(x_k)^\top (y_k + \b_k A(x_k) ) =- \xi \in - \partial g(x_{k+1}), \end{align} which, after recalling \eqref{eq:grad map recall}, can be written as \begin{align} -\frac{ G_k}{\b_k} + \frac{ \nabla f(x_k)}{\b_k} + \frac{ DA(x_k)^\top y_k }{\b_k} + DA(x_k)^\top A(x_k) \in - \frac{\partial g(x_{k+1})}{\b_k \g_k}. \label{eq:to be projected} \end{align} Let \begin{align} \cone(x_{k+1}) = \text{cone}(\partial g(x_{k+1}) )^*, \end{align} be the shorthand for the dual cone associated with \begin{equation} \cone(\partial g(x_{k+1})) = \bigcup_{\alpha \ge 0} \alpha \cdot \partial g(x_{k+1}) \subseteq \RR^d. \label{eq:defn of dual cone} \end{equation} By projecting both sides \eqref{eq:to be projected} onto $\cone_{k+1}$, we find that \begin{align} & P_{\cone(x_{k+1})}\left( - \frac{G_k}{\b_k} + \frac{\nabla f(x_k)}{\b_k} + \frac{DA(x_k)^\top y_k}{\b_k} + DA(x_k)^\top A(x_k) \right) \nonumber\\ & \in P_{\cone(x_{k+1})}\left( - \frac{\partial g(x_{k+1}) }{\b_k \g_k} \right) = \{ 0\}, \label{eq:before Sk} \end{align} where the equality follows from the duality of $\cone(x_{k+1})$ and $\cone(\partial g(x_{k+1}))$, see \eqref{eq:defn of dual cone}. Consider a subspace $S_K\subseteq \RR^d$ such that \begin{align} S _{K} \supseteq \bigcup_{k\in K} P_{\cone(x_{k+1})}\left( DA(x_{k+1})^\top A(x_{k+1}) \right), \label{eq:defn of S} \end{align} and project both sides of \eqref{eq:before Sk} onto $S_K$ to reach \begin{align} & P_{S_K}P_{\cone(x_{k+1})}\left( - \frac{G_k}{\b_k} + \frac{\nabla f(x_k)}{\b_k} + \frac{DA(x_k)^\top y_k}{\b_k} + DA(x_k)^\top A(x_k) \right) = 0. \label{eq:pre norm} \end{align} By taking the norm and then applying the triangle inequality, we argue that \begin{align} & \left\| P_{S_K} P_{\cone(x_{k+1})} ( DA(x_k)^\top A(x_k) ) \right\| \nonumber\\ & \le \left\| P_{S_K} P_{\cone(x_{k+1})} \left( - \frac{G_k}{\b_k} + \frac{\nabla f(x_k)}{\b_k} + \frac{DA(x_k)^\top y_k}{\b_k} \right) \right\| \qquad \text{(see \eqref{eq:pre norm})}. \label{eq:post norm} \end{align} Because proximal map is non-expansive and $P_{S_K}P_{\cone_{k+1}}(0) = 0$, we may upper bound the last line above as \begin{align} & \left\| P_{S_K} P_{\cone(x_{k+1})} ( DA(x_k)^\top A(x_k) ) \right\| \nonumber\\ & \le \left\| - \frac{G_k}{\b_k} + \frac{\nabla f(x_k)}{\b_k} + \frac{DA(x_k)^\top y_k}{\b_k} \right\| \nonumber\\ & \le \frac{1}{\b_k} \left( \|G_k\| + \|\nabla f(x_k) \| + \|DA(x_k)^\top y_k\| \right). \qquad \text{(triangle inequality)} \nonumber\\ & \le \frac{1}{\b_k} \left( \|G_k\| +\lambda'_f+ \lambda'_A \|y_k\| \right) , \label{eq:slater proof 0} \end{align} where \begin{align} \lambda'_f := \max_{\|x\| \le \rho'} \| \nabla f(x)\|, \qquad \lambda'_A := \max_{\|x\| \le \rho'} \| DA(x)\|. \end{align} To lower bound the first line of \eqref{eq:slater proof 0}, we must impose a restricted injectivity as follows. We let $S_{K}$ with orthonormal columns denote a basis for the subspace $S_K$ in \eqref{eq:defn of S}, to simplify the notation. We then assume that $ \nu_A>0$, where \begin{align} \nu_A := \begin{cases} \min_{v,x} \, \frac{\left\| S_{K}^\top P_{\cone(x_{k+1})} ( DA(x)^\top v) \right\|}{\|v\|} \\ \|v\|\le \rho\\ \|x\|\le \rho'. 
\end{cases} \label{eq:new slater proof} \end{align} Recalling the first bound in \eqref{eq:bndness in slater proof}, for every $k\in K $, we may therefore write that \begin{align} & \left\| P_{S_K} P_{\cone(x_{k+1})} ( DA(x_{k})^\top A(x_k)) \right\| \nonumber\\ & \ge \left\| P_{S_K} P_{\cone(x_{k+1})} ( DA(x_{k+k})^\top A(x_{k})) \right\| - \left\| (DA(x_{k+1}) - DA(x_{k})) ^\top A(x_{k}) \right\| \nonumber\\ & \ge \nu_A \|A(x_k)\| - \| DA(x_{k+1})-DA(x_k) \| \|A(x_k) \|, \qquad \text{(see \eqref{eq:new slater proof})} \label{eq:slater proof 1} \end{align} where the second line above again uses the non-expansiveness of $P_{S_K}$ and $P_{\cone_{k+1}}$. The remaining term in \eqref{eq:slater proof 1} is bounded as \begin{align} \| DA(x_{k+1})-DA(x_k) \| & \le \lambda_A \|x_{k+1}-x_k\| \le \lambda_A \rho''. \qquad \text{(see (\ref{eq:smoothness basic},\ref{eq:bndness in slater proof}))} \end{align} Assuming that \begin{align} \lambda_A \rho'' \le \nu_A/2, \end{align} allows us to simplify the last line of \eqref{eq:slater proof 1} as \begin{align} \left\| P_{S_K} P_{\cone(x_{k+1})} ( DA(x_{k})^\top A(x_k)) \right\| \ge \frac{\nu_A }{2} \|A(x_k)\|, \end{align} which, after substituting in \eqref{eq:slater proof 0}, yields that \begin{align} \|A(x_k)\| & \le \frac{2}{\b_k\nu_A} \left( \|G_k\| + \lambda'_f + \lambda'_A \|y_k\| \right), \end{align} and completes the proof of Lemma \ref{lem:bnd bnd Ak}. diff --git a/tex/Paper/sections/introduction.tex b/tex/Paper/sections/introduction.tex index b22c589..3946aad 100644 --- a/tex/Paper/sections/introduction.tex +++ b/tex/Paper/sections/introduction.tex @@ -1,73 +1,117 @@ \newpage \section{Introduction} \label{intro} -We study the non-convex optimization program +We study the nonconvex optimization program \begin{equation} \label{prob:01} \begin{cases} \underset{x}{\min}\,\, f(x)+g(x)\\ A(x) = 0, \end{cases} \end{equation} where (possibly nonconvex) $f:\mathbb{R}^d\rightarrow\mathbb{R}$ and (possibly nonlinear) $A:\mathbb{R}^d\rightarrow\mathbb{R}^m$ satisfy \begin{align} \| \nabla f(x) - \nabla f(x')\| \le \lambda_f \|x-x'\|, -\qquad -\| DA(x) - DA(x') \| \le \lambda_A \|x-x'\|, +\quad +\| \D A(x) -\D A(x') \| \le \lambda_A \|x-x'\|, \label{eq:smoothness basic} \end{align} -for every $x,x'\in \mathbb{R}^d$. Above, $\nabla f(x)\in \mathbb{R}^d$ is the gradient of $f$ and $DA(x)\in \mathbb{R}^{m\times d}$ is the Jacobian of $A$. Moreover, we assume that $g:\mathbb{R}^d\rightarrow\mathbb{R}$ is convex. For instance, for a convex set $C\subset\RR^d$ and letting $g=1_C$ be the convex indicator function on $C$, Program~\eqref{prob:01} is a noncnovex optimization problem with the convex constraint $x\in C$. +for every $x,x'\in \mathbb{R}^d$. Above, $\nabla f(x)\in \mathbb{R}^d$ is the gradient of $f$ at $x$ and $\D A(x)\in \mathbb{R}^{m\times d}$ is the Jacobian of $A$ at $x$. Moreover, we assume that $g:\mathbb{R}^d\rightarrow\mathbb{R}$ is a proximal-friendly (but possibly nonsmooth) convex function. +%For instance, for a convex set $C\subset\RR^d$ and letting $g=1_C$ be the convex indicator function on $C$, Program~\eqref{prob:01} is a noncnovex optimization problem with the convex constraint $x\in C$. 
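To fix ideas, here is a small, purely illustrative instance of \eqref{prob:01} (it is not one of the applications studied in this paper) in which every ingredient above is explicit: for a possibly indefinite symmetric matrix $Q\in\RR^{d\times d}$, take
\begin{align*}
f(x) = \tfrac{1}{2}x^\top Q x,
\qquad
g(x) = \|x\|_1,
\qquad
A(x) = \|x\|^2 - 1.
\end{align*}
Then $\nabla f(x) = Qx$ and $\D A(x) = 2x^\top$ satisfy \eqref{eq:smoothness basic} with $\lambda_f = \|Q\|$ and $\lambda_A = 2$, and the proximal operator of $g$ is the inexpensive soft-thresholding map.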
-A broad range of problems in computer science \cite{khot2011grothendieck, lovasz2003semidefinite}, machine learning \cite{mossel2015consistency, song2007dependence}, and signal processing \cite{singer2011angular, singer2011three} naturally fall under this template, including but not limited to maximum cut, clustering, generalized eigenvalue, as well as community detection. See Section ? for several examples. \textbf{We should explain this better and give more high level examples to motivate this program.} +A host of problems in computer science \cite{khot2011grothendieck, lovasz2003semidefinite}, machine learning \cite{mossel2015consistency, song2007dependence}, and signal processing \cite{singer2011angular, singer2011three} naturally fall under the template of~\eqref{prob:01}, including max-cut, clustering, generalized eigenvalue problems, and community detection. % %An example of our template in \eqref{prob:01} is semi-definite programming which provides powerful relaxations to above problems. Denote the space of $d'\times d'$ symmetric matrices by $\mathbb{S}^{d'\times d'}$ and consider % %\begin{equation} % \label{sdp} % \min_x \{h'(x): A'(x) = b', ~~x \in \mathcal{C'}~~\text{and}~~x\succeq0 \} %\end{equation} % %where $h': \mathbb{S}^{d'\times d'} \to \RR$, $A'\colon\mathbb{S}^{d'\times d'}\to\RR^m$, $b'\in\RR^m$, and $C' \subseteq \mathbb{R}^{d'\times d'}$. This template clearly can be put to the form of \eqref{prob:01} by using \emph{Burer-Monteiro factorization} \cite{burer2003nonlinear, burer2005local}. -The \emph{augmented Lagrangian method} \cite{luenberger1984linear} is a powerful approach to solve Program \eqref{prob:01}, see Section \ref{sec:related work} for a review of the related literature as well as other approaches to solve Program \eqref{prob:01}. Indeed, for positive $\beta$, it is easy to verify that Program \eqref{prob:01} is equivalent to +To address these applications, this paper builds on the classical ideas in the linearized augmented Lagrangian framework and proposes a simple, intuitive, and easy-to-implement algorithm to solve~\eqref{prob:01} with a provable convergence rate, under an interpretable geometric condition. In this context, we also develop and analyze the Alternating Direction Method of Multipliers (ADMM). Before we elaborate on the results, let us first motivate~\eqref{prob:01} with an important application to Semi-Definite Programming (SDP): + +\vspace{-3mm} +\paragraph{\textbf{Vignette: Burer-Monteiro splitting.}} +A powerful convex relaxation for max-cut, clustering, and several other problems mentioned above is provided by the SDP +\begin{equation} +\label{eq:sdp} +\begin{cases} +\underset{X\in\mathbb{S}^{d \times d}}{\min} \langle C, X \rangle \\ +B(X) = b, \,\, X \succeq 0, +\end{cases} +\end{equation} +% +where $C\in \RR^{d\times d}$ and $X$ is a positive semidefinite and symmetric $d\times d$ matrix, +%$\mathbb{S}^{d \times d}$ denotes the set of real symmetric matrices, +and ${B}: \mathbb{S}^{d\times d} \to \mathbb{R}^m$ is a linear operator. If the unique-games conjecture is true, SDPs achieve the best possible approximation ratio for the underlying discrete problem~\cite{raghavendra2008optimal}. + +Since $d$ is often large, many first- and second-order methods for solving such SDPs are immediately ruled out, not only due to their high computational complexity, but also due to their storage requirements, which are $\mathcal{O}(d^2)$. + +A contemporary challenge in optimization is therefore to solve SDPs in small space and in a scalable fashion.
A recent algorithm, namely, the homotopy conditional gradient method based on Linear Minimization Oracles (LMO), can address this template in small space via sketching \cite{yurtsever2018conditional}; however, such LMO-based methods are extremely slow in obtaining accurate solutions. + + +%In practice, $d$ is often very large which makes interior point methods, with their poor scaling in $d$, an unattractive option for solving ~\eqref{eq:sdp}. Attempts to resolve this issue prompted extensive research in computationally- and memory-efficient SDP solvers. The first such solvers relied on the so-called Linear Minimization Oracle (LMO), reviewed in Section~\ref{sec:related work}, alongside other scalabe SDP solvers. + +A key approach for solving \eqref{eq:sdp}, dating back to~\cite{burer2003nonlinear, burer2005local}, is the so-called Burer-Monteiro (BM) splitting $X=UU^\top$, where $U\in\mathbb{R}^{d\times r}$ and $r$ is selected according to the guidelines in~\cite{pataki1998rank, barvinok1995problems}. +%so as to remove spurious local minima of the nonconvex factorized problem. Evidently, BR splitting has the advantage of lower storage and faster iterations. +This splitting results in the following nonconvex problem +\begin{equation} +\label{prob:nc} +\begin{cases} +\underset{U\in\mathbb{R}^{d \times r}}{\min} \langle C, UU^\top \rangle \\ +B(UU^\top) = b, +\end{cases} +\end{equation} +which can be written in the form of~\eqref{prob:01} (with $f(U) = \langle C, UU^\top \rangle$, $A(U) = B(UU^\top) - b$, and $g\equiv 0$). When $r$ is sufficiently large and under some additional assumptions, \eqref{prob:nc} provably does not have any spurious local minima~\cite{boumal2016non,waldspurger2018rank}. + + + +The {augmented Lagrangian method} \cite{luenberger1984linear} provides a powerful framework to solve~\eqref{prob:01}, reviewed carefully in Section \ref{sec:related work}. Indeed, for positive $\beta$, it is easy to verify that \eqref{prob:01} is equivalent to \begin{align} \min_{x} \max_y \,\,\mathcal{L}_\beta(x,y) + g(x), \label{eq:minmax} \end{align} where \begin{align} \label{eq:Lagrangian} \mathcal{L}_\beta(x,y) := f(x) + \langle A(x), y \rangle + \frac{\beta}{2}\|A(x)\|^2, \end{align} -is the augmented Lagrangian corresponding to Program \eqref{prob:01}. The equivalent formulation in Program \eqref{eq:minmax} naturally suggests the following algorithm to solve Program \eqref{prob:01}: +is the augmented Lagrangian corresponding to \eqref{prob:01}. The equivalent formulation in \eqref{eq:minmax} naturally suggests the following iterative algorithm to solve \eqref{prob:01}: \begin{equation}\label{e:exac} x_{k+1} \in \underset{x}{\argmin} \, \mathcal{L}_{\beta}(x,y_k)+g(x), \end{equation} \begin{equation} y_{k+1} = y_k+\s_k A(x_{k+1}). +\label{eq:dual-update} \end{equation} -Updating $x$ in the augmented Lagrangian method requires solving the non-convex Program \eqref{e:exac} to global optimality, which is often intractable. The key contribution of this paper is to provably and efficiently address this challenge by proposing and analyzing a linearized augmented Lagrangian algorithm. Moreover, we extend our results to solve the program -\begin{align} -\begin{cases} -\underset{x,z}{\min}\,\, f(x)+g(x)+h(z)+l(z)\\ -A(x)+B(z) = 0, -\end{cases} -\label{eq:admm} -\end{align} -by developing and analyzing an Alternating Direction Method of Multipliers (ADMM). +Updating $x$ above requires solving the nonconvex problem \eqref{e:exac} to global optimality, which is often intractable.
The key contribution of this paper is to provably and efficiently address this challenge by proposing and analyzing a linearized augmented Lagrangian algorithm, as well as its ADMM variant. +%extend our results to solve the problem +%\begin{align} +%\begin{cases} +%\underset{x,z}{\min}\,\, f(x)+g(x)+h(z)+l(z)\\ +%A(x)+B(z) = 0, +%\end{cases} +%\label{eq:admm} +%\end{align} +%by developing and analyzing an Alternating Direction Method of Multipliers (ADMM). \paragraph{\emph{\textbf{Contributions.}} } -In order to solve Program \eqref{prob:01}, this paper proposes to replace the (intractable) Program \eqref{e:exac} with the update +In order to solve \eqref{prob:01}, this paper proposes to replace the (intractable) problem \eqref{e:exac} with the simple update \begin{equation} x_{k+1} = P_g (x_k - \gamma_k \nabla \mathcal{L}_{\beta_k} (x_k,y_k)), \label{eq:new update} \end{equation} -for carefully selected sequences $\{\beta_k,\gamma_k\}_k$. Here, $P_g$ is the proximal operator corresponding to $g$ which is often easy to compute in various applications and consequently the update in \eqref{eq:new update} is inexpensive and fast. +for carefully selected sequences $\{\beta_k,\gamma_k\}_k$. Here, $P_g$ is the proximal operator corresponding to $g$, which is often computationally inexpensive. + +Put differently, instead of fully solving~\eqref{e:exac}, this paper proposes to apply one iteration of the proximal gradient algorithm for every primal update, which is then followed by a dual update in \eqref{eq:dual-update} and an increase in the penalty weight $\b$ to gradually enforce the (nonlinear) constraints in \eqref{prob:01}. + +We prove that this fast and scalable Homotopy Linearized Augmented Lagrangian (HoLAL) converges to a first-order stationary point of \eqref{prob:01} at the rate of \notea{add rate...} Under standard additional conditions, we establish that this stationary point is also of second order, namely, HoLAL converges to a local minimum of \eqref{prob:01}. We also provide an ADMM variant of HoLAL, with the same convergence rate. \notea{How do the rates compare with competitors? Any high level advantage we might have over competitors? Like easy implementation guidelines?} -Put differently, instead of fully solving Program \eqref{e:exac}, this paper proposes to apply one iteration of the projected gradient algorithm for every update. We provide the convergence guarantees for this fast and scalable new algorithm. +As with several other nonconvex solvers, the success of of HoLAL relies on (a variant of) the \emph{uniform regularity} \notea{what to cite?}, a geometric condition akin to the well-established Slater's condition in convex optimization. In fact, we establish that uniform regularity, when limited to convex problems, is equivalent to the Slater's condition. We also verify the uniform regularity in several important examples. -\textbf{Incomplete... We also have to be very careful here...} diff --git a/tex/Paper/sections/preliminaries.tex b/tex/Paper/sections/preliminaries.tex index 983063d..ffae735 100644 --- a/tex/Paper/sections/preliminaries.tex +++ b/tex/Paper/sections/preliminaries.tex @@ -1,101 +1,106 @@ \section{Preliminaries \label{sec:preliminaries}} \paragraph{\textbf{\emph{Notation.}}} -We use the notations $\langle\cdot ,\cdot \rangle $ and $\|\cdot\|$ for the {standard inner} product and norm on $\RR^d${, respectively}. 
For a convex function $g:\RR^d\rightarrow\RR$, the subdifferential at $x\in \RR^d$ is denoted by $\partial g(x)$ and the proximal operator $P_g:\RR^d\rightarrow\RR^d$ takes $x$ to +We use the notations $\langle\cdot ,\cdot \rangle $ and $\|\cdot\|$ for the {standard inner} product and norm on $\RR^d${, respectively}. Gradient of differentiable $f:\RR^d\rightarrow\RR$ at $x$ is denoted by $\nabla f(x)$. For a differentiable map $A:\RR^d\rightarrow\RR^m$, $\D A(x$ denote its Jacobian at $x$. +For a convex function $g:\RR^d\rightarrow\RR$, the subdifferential at $x$ is denoted by $\partial g(x)$ and the proximal operator $P_g:\RR^d\rightarrow\RR^d$ takes $x$ to \begin{align} P_g(x) = \underset{y}{\argmin} \, g(y) + \frac{1}{2}\|x-y\|^2. \label{eq:defn of prox} \end{align} -In addition, if $g=1_C$ is the indicator function of a convex set or cone, we use the simpler notation $P_C$ instead of $P_{1_C}$ to denote the orthogonal projection onto $C$. -Throughout, $g^*$ and $\partial g^*$ will denote the Fenchel conjugate of $g$ and its subdifferential, respectively. For a cone $C$, we let $C^*$ denote its polar, namely, +In addition, if $g=1_C$ is the indicator function of a convex set or cone, we use the simpler notation $P_C$, instead of $P_{1_C}$, to denote the orthogonal projection onto $C$. +Throughout, $g^*$ and $\partial g^*$ will denote the Fenchel conjugate of $g$ and its subdifferential, respectively. For a cone $C$, we denote its polar by $C^*$, namely, \begin{align} -C^* = \{x: \langle x,x'\rangle \le 0,\,\, \forall x' \}. +C^* = \{x: \langle x,x'\rangle \le 0,\,\, \forall x' \in C \}. \end{align} An integer interval is denoted by $[k_0:k_1]=\{k_0,\cdots,k_1\}$ for integers $k_0 \le k_1$. For matrices, $\|\cdot\|$ and $\|\cdot\|_F$ denote the spectral and Frobenius norms, respectively. \paragraph{\textbf{\emph{{Necessary optimality conditions.}}} \label{sec:opt cnds}} -{First order necessary optimality conditions} for {Program} \eqref{prob:01} are well studied in the literature \cite{rockafellar2009variational}. {Indeed, $u$ is a (first-order) stationary point of Program \eqref{prob:01} if there exists $y\in \RR^m$ for which +{First-order necessary optimality conditions} for \eqref{prob:01} are well-studied~\cite{rockafellar2009variational}. {Indeed, $x$ is a (first-order) stationary point of \eqref{prob:01} if there exists $y\in \RR^m$ for which \begin{align} \begin{cases} --\nabla h(x) - DA(x)^\top y \in \partial g(x)\\ -A(x) = 0, +-\nabla f(x) - \D A(x)^\top y \in \partial g(x)\\ +A(x) = 0. \end{cases} \label{e:inclu1} \end{align} -where $DA(x)$ is the Jacobian of $A$ at $x$. Recalling \eqref{eq:Lagrangian}, we observe that \eqref{e:inclu1} is equivalent to +Recalling \eqref{eq:Lagrangian}, we observe that \eqref{e:inclu1} is equivalent to \begin{align} \begin{cases} -\nabla_x \mathcal{L}_\beta(x,y) \in \partial g(x)\\ A(x) = 0, \end{cases} \label{e:inclu2} \end{align} -which is in turn the necessary optimality condition for Program \eqref{eq:minmax}. } +which is in turn the first-order optimality condition for \eqref{eq:minmax}. } \notea{have to add 2nd optimality?} -\paragraph{\emph{\textbf{Technical lemmas.}}} The proof of the following result is trivial but included in Appendix for completeness. +\paragraph{\emph{\textbf{Technical lemmas.}}} The following standard results and notions are frequently used throughout this paper and proved in the appendix for completeness. 
The first result below shows that the augmented Lagrangian is smooth, see Appendix~\ref{sec:proof of smoothness lemma} for the proof. \begin{lemma}[Smoothness]\label{lem:smoothness} - For fixed $y\in \RR^m$ and $\rho,\rho'\ge 0$, it holds that + For fixed $y\in \RR^m$ and $\b,\rho,\rho'\ge 0$, it holds that \begin{align} \| \nabla_x \mathcal{L}_{\beta}(x, y)- \nabla_x \mathcal{L}_{\beta}(x', y) \| \le \lambda_\b \|x-x'\|, +\quad \forall x,x'\in \X_{\rho,\rho'}, \end{align} - for every $x,x' \in \{ x'': \|A(x'')\|\le \rho, \|x''\|\le \rho'\}$, where + where +\begin{align} +\X_{\rho,\rho'} := \{ x'': \|A(x'')\|\le \rho, \|x''\|\le \rho'\} \subset \RR^d, +\end{align} \begin{align} \lambda_\beta -& \le \lambda_f + \sqrt{m}\lambda_A \left(\|y\| + \b\rho \right) + \b d \lambda'^2_A. +& := \lambda_f + \sqrt{m}\lambda_A \left(\|y\| + \b\rho \right) + \b d \lambda'^2_A, \label{eq:smoothness of Lagrangian} \end{align} -Above, $\lambda_f,\lambda_A$ were defined in (\ref{eq:smoothness basic}) and \begin{align} -\lambda'_A := \max_{\|x\|\le \rho'}\|DA(x)\|. +\lambda'_A := \max_{\|x\|\le \rho'}\|\D A(x)\|, \end{align} +and $\lambda_f,\lambda_A$ were defined in (\ref{eq:smoothness basic}). \end{lemma} -Gradient mapping, defined below, plays an important role in our convergence analysis. -\begin{definition}\label{def:grad map}\textbf{{(Gradient Mapping)}} Given $x\in \RR^d$ and $\gamma >0$, the gradient mapping $G_{\beta,\gamma}(\cdot; y): \RR^d\rightarrow\RR^d$ takes $x\in \RR^d$ to +Gradient mapping~\notea{what to cite?}, defined below, plays an important role in our convergence analysis. +\begin{definition}\label{def:grad map}\textbf{{(Gradient mapping)}} Given $y\in \RR^d$ and $\gamma >0$, the gradient mapping $G_{\beta,\gamma}(\cdot; y): \RR^d\rightarrow\RR^d$ takes $x\in \RR^d$ to \begin{equation} G_{\b,\gamma}(x,y) = \frac{x-x^+}{\gamma}, \label{eq:grad map} \end{equation} -where $x^+=P_{g}(x-\gamma \nabla \mathcal{L}_ {\beta}(x,y))$. +where $x^+=P_{g}(x-\gamma \nabla_x \mathcal{L}_ {\beta}(x,y))$. \end{definition} -In particular, if we remove $g$ from Program \eqref{prob:01}, the gradient mapping reduces to $G_{\beta,\gamma}(x,y)=\nabla h(x) $. For a sufficiently small step size $\g$, the gradient mapping determines the corresponding descent in the objective function of Program \eqref{eq:minmax}. The following result is standard \cite[Lemma 3.2, Remark 3.2(i)]{bolte2014proximal}, but the proof is given in Appendix \ref{sec:proof of descent lemma} for completeness. +As the name suggests, if in particular we set $g\equiv 0$ in \eqref{prob:01}, the gradient mapping reduces to $G_{\beta,\gamma}(x,y)=\nabla f(x) $. For a sufficiently small step size $\g$, the gradient mapping controls the descent in the objective function of \eqref{eq:minmax}. The following result is standard \cite[Lemma 3.2, Remark 3.2(i)]{bolte2014proximal}, but the proof is given in Appendix \ref{sec:proof of descent lemma} for completeness. \begin{lemma}[Descent lemma]\label{lem:11} For $x\in \RR^d$ and $y\in \RR^m$, let $x^+ = P_g(x - \g \nabla_x \L_\b(x,y) )$, where $\g< 1/\lambda_\b$. For $\rho,\rho'\ge 0$, suppose that \begin{align} -x,x^+ \in \{ x': \|A(x')\| \le \rho, \|x'\| \le \rho' \}. +x,x^+ \in \X_{\rho,\rho'}= \{ x': \|A(x')\| \le \rho, \|x'\| \le \rho' \}. \end{align} Then it holds that \begin{equation} \label{e:deslem} \| G_{\beta,\gamma}(x,y)\|^{2}\leq \frac{2}{\gamma} (\mathcal{L}_\beta(x,y) + g (x)- \mathcal{L}_\beta(x^+,y)- g(x^+) ). 
\end{equation} \end{lemma} -In {practice}, determining the step size $\gamma$ by computing the right-hand side of \eqref{eq:smoothness of Lagrangian} might be intractable. Instead, we often resort to the classic line search technique, reviewed below and proved in Appendix \ref{sec:proof of linesearch} for completeness. +In {practice}, determining the step size $\gamma$ by computing the right-hand side of \eqref{eq:smoothness of Lagrangian} is infeasible, since $\lambda_f,\lambda_A,\lambda'_A$ are often unknown. Instead, we can resort to the line search technique, reviewed below and proved in Appendix~\ref{sec:proof of linesearch}. \begin{lemma}[Line search] \label{lem:eval Lipsc cte} Fix $\theta \in (0,1)$ and ${\gamma}_0>0$. For $\gamma'>0$, let \begin{align} x^+_{\gamma'} = P_g(x - \gamma' \nabla_x \mathcal{L}_\beta(x,y)), \label{eq:defn of x+gamma} \end{align} and define \begin{align} \gamma & := \max \Big\{ \gamma' ={\gamma}_0 \theta^i : \mathcal{L}_\beta (x^+_{\gamma'},y) \nonumber\\ & \qquad \qquad \quad \le \mathcal{L}_\beta (x,y) + \left\langle x^+_{\gamma'} - x , \nabla_x \mathcal{L}_{\beta}(x,y) \right\rangle + \frac{1}{2\gamma'} \| x^+_{\gamma'} - x\|^2 \Big\}. \label{eq:defn of gamma line search} \end{align} Then, (\ref{e:deslem}) holds and, moreover, we have that \begin{align} \gamma \ge \frac{\theta}{\lambda_\beta}. \label{eq:moreover} \end{align} \end{lemma} diff --git a/tex/Paper/sections/slater.tex b/tex/Paper/sections/slater.tex index 81da799..6464153 100644 --- a/tex/Paper/sections/slater.tex +++ b/tex/Paper/sections/slater.tex @@ -1,106 +1,142 @@ -\section{Nonconvex Slater's Condition \label{sec:slater}} +\section{Uniform Regularity \label{sec:slater}} -The (convex) Slater's condition (CSC) plays a key role in convex optimization as a sufficient condition for strong duality. As a result, CSC guarantees the success of a variety of primal-dual algorithms for convex and constrained programming. As a visual example, in Program~\eqref{prob:01}, when $f=0$, $g=1_C$ is the indicator function of a convex set $C$, and $A$ is an affine operator, CSC removes any pathological cases by ensuring that the affine subspace is not tangent to $C$, see Figure~\ref{fig:convex_slater}. \textbf{We should add a figure here with circle and line.} +The Slater's condition plays a key role in convex optimization as a sufficient condition for strong duality. As a result, SC guarantees the success of a variety of primal-dual algorithms for constrained convex programming. As a visual example, in problem~\eqref{prob:01}, when $f=0$, $g=1_C$ is the indicator function of a bounded convex set $C\subset\RR^d$, and $A$ is an affine operator, the Slater's condition removes any pathological cases, such as Figure~\ref{fig:convex_slater}, by ensuring that the affine subspace is not tangent to $C$. +\begin{figure}[H] +\begin{center} +\includegraphics[scale=.5]{figures/Slater.pdf} +\caption{Solving (1) can be particularly difficult, even when it is a convex program. As an example, this figure shows a pathological case, where the Slater’s condition does not apply. See Section~\ref{sec:slater} for more details.} +\label{fig:convex_slater} +\end{center} +\end{figure} -Likewise, to successfully solve Program~\eqref{prob:01} with nonlinear constraints, we require the following condition which, loosely speaking, extends CSC to the nonconvex setting, as clarified shortly afterwards. 
-\begin{definition}\label{defn:nonconvex slater} \textbf{\emph{(Nonconvex Slater's condition)}} -For $\rho,\rho'>0$ and subspace $S\subset \RR^{d}$, let + +\noindent Likewise, to successfully solve problem~\eqref{prob:01} in the presence of nonlinear constraints, we require the following condition which, loosely speaking, extends the Slater's condition to the nonconvex setting, as clarified shortly afterwards. This condition, in a sense, also extends the uniform regularity, introduced in \cite[Definition 2.3]{bolte2018nonconvex}, to the more general problem~\ref{prob:01}. +\begin{definition}\label{defn:nonconvex slater} \textbf{{(Uniform regularity)}} In problem~(\ref{prob:01}), for $\rho,\rho'>0$ and subspace $S\subset \RR^{d}$, let us define \begin{align} -\nu(g,A,S) := +\nu(g,A,S,\rho,\rho') := \begin{cases} -\underset{v,x}{\min} \, \frac{\left\| \left( \operatorname{id} - P_S P_{\partial g(x)/\b} \right) ( DA(x)^\top v) \right\|}{\|v\|} \\ +\underset{v,x}{\min} \, \frac{\left\| \left( \operatorname{id} - P_S P_{\partial g(x)/\b} \right) ( \D A(x)^\top v) \right\|}{\|v\|} \\ \|v\|\le \rho\\ \|x\|\le \rho', \end{cases} \label{eq:new slater defn} \end{align} -where $\operatorname{id}$ is the identity operator, $P_{\partial g(x)/\b}$ projects onto the set $\partial g(x)/\b$, and $DA(x)$ is the Jacobian of $A$. We say that Program (\ref{prob:01}) satisfies the nonconvex Slater's condition if $\nu(g,A,S) >0$. +where $\operatorname{id}$ is the identity operator, $P_{\partial g(x)/\b}$ projects onto the set $\partial g(x)/\b$, and $\D A(x)$ is the Jacobian of $A$. We say that (\ref{prob:01}) satisfies the $(\rho,\rho')$ uniform regularity if $\nu(g,A,S,\rho,\rho') >0$. \end{definition} -A few remarks about the nonconvex Slater's condition (NSC) is in order. +Throughout, we will occasionally suppress the dependence of $\nu$ on its parameters to unburden the notation. A few remarks about uniform regularity are in order. -\paragraph{Jacobian $DA$.} -As we will see later, $DA(x)^\top \overset{\operatorname{QR}}{=} Q(x) R(x)$ in NSC might be replaced with its orthonormal basis, namely, $Q(x)$. For simplicity, we will avoid this minor change and instead, whenever needed, assume that $DA(x)$ is nonsingular; otherwise a simple projection can remove any redundancy from $A(x)=0$ in Program~\eqref{prob:01}. +\paragraph{\textbf{\emph{Jacobian $\D A$.}}} Let $\D A(x)^\top \overset{\operatorname{QR}}{=} Q(x) R(x)$ be the QR decomposition of $\D A(x)^\top$. As we will see shortly, $\D A(x)^\top $ in \eqref{eq:new slater defn} might be replaced with its orthonormal basis, namely, $Q(x)$, to broaden the applicability of uniform regularity. For simplicity, we will avoid this minor change and instead, whenever needed, assume that $\D A(x)$ is nonsingular; otherwise a simple QR decomposition can remove any redundancy from $A(x)=0$ in~\eqref{prob:01}. -\paragraph{Subspace $S$.} The choice of subspace $S$ helps broaden the generality of NSC. In particular, when $S = \RR^{d}$, \eqref{eq:new slater defn} takes the simpler form of +\paragraph{\textbf{\emph{Subspace $S$.}}} The introduction of a subspace $S$ in \eqref{eq:new slater defn} broadens the applicability of uniform regularity, as we will see shortly. 
In particular, when $S = \RR^{d}$, then \eqref{eq:new slater defn} takes the simpler form of \begin{align} -\nu(g,A,S) := +\nu(g,A,S,\rho,\rho') := \begin{cases} -\underset{v,x}{\min} \, \frac{ \dist( DA(x)^\top v , \partial g(x)/\b) }{\|v\|} \\ +\underset{v,x}{\min} \, \frac{ \dist( \D A(x)^\top v , \partial g(x)/\b) }{\|v\|} \\ \|v\|\le \rho\\ \|x\|\le \rho', \end{cases} \label{eq:defn_nu_A} \end{align} where $\dist(\cdot,\partial g(x)/\b)$ returns the Euclidean distance to the set $\partial g(x)/\b$. -\paragraph{Convex case.} -To better parse Definition \ref{defn:nonconvex slater}, let us assume that $f:\RR^d\rightarrow\RR$ is convex, $g = 1_C:\RR^d\rightarrow\RR$ is the indicator function for a convex and bounded set $C$, and $A$ is a nonsingular linear operator represented with the full-rank matrix $A\in \RR^{m\times d}$. -%We also let $T_C(x)$ denote the tangent cone to $C$ at $x$, and reserve $P_{T_C(x)}:\RR^d\rightarrow\RR^d$ for the orthogonal projection onto this cone. + + + + +\paragraph{\emph{\textbf{Convex case.}}} +To better parse Definition \ref{defn:nonconvex slater}, let us consider the specific example where $f:\RR^d\rightarrow\RR$ is convex, $g = 1_C$ is the indicator function for a bounded convex set $C\subset \RR^d$, and $A$ is a nonsingular linear operator represented with the full-rank matrix $A\in \RR^{m\times d}$. We also let $T_C(x)$ denote the tangent cone to $C$ at $x$, and reserve $P_{T_C(x)}:\RR^d\rightarrow\RR^d$ for the orthogonal projection onto this cone. Assume also that $S=\RR^d$ for clarity. %We also set $S_K = \RR^d$ to simplify the discussion. -We can now study the geometric interpretation of $\nu()$. -Assuming that $S=\RR^d$ and using the well-known Moreau decomposition, it is not difficult to rewrite \eqref{eq:new slater defn} as + +We can now study the geometric interpretation of uniform regularity in this setting. +Using the well-known Moreau decomposition, it is not difficult to rewrite \eqref{eq:new slater defn} as \begin{align} -\nu(g,A,S) & := +\nu(g,A,S,\rho,\rho') & := \begin{cases} \underset{v,x}{\min} \, \, \frac{\left\| P_{T_C(x)} A^\top v \right\|}{\|v\|} \\ \|v\|\le \rho\\ \|x\|\le \rho' \end{cases} \nonumber\\ & = \begin{cases} \underset{x}{\min} \,\, \eta_{m}\left( P_{T_C(x)} A^\top \right) \\ \|x\|\le \rho', \end{cases} \label{eq:nu for cvx} \end{align} -where $P_{T_C(x)}$ is the projection onto the tangent cone of $C$ at $x$, and $\eta_{m}(\cdot)$ returns the $m$th largest singular value of its input matrix. Intuitively then, NSC ensures that the the row span of $A$ is not tangent to $C$, similar to CSC. This close relationship between NSC and CSC is formalized next and proved in Appendix~\ref{sec:proof of prop}. +where $\eta_{m}(\cdot)$ returns the $m^{\text{th}}$ largest singular value of its input matrix. Intuitively then, the uniform regularity ensures that the row span of $A$ is not tangent to $C$, similar to the Slater's condition. This close relationship between the uniform regularity and the Slater's condition is formalized next and proved in Appendix~\ref{sec:proof of prop}. 
\begin{proposition}\label{prop:convex} -In Program (\ref{prob:01}), suppose that +In (\ref{prob:01}), suppose that \begin{itemize} \item $f:\RR^d\rightarrow\RR$ is convex, -\item $g=1_C$ is the indicator on a convex and bounded set $C\subset\RR^d$, -\item $A:\RR^d\rightarrow\RR^m$ is a nonsingular linear operator, represented with the full-rank matrix $A\in \RR^{m\times d}$.\footnote{As mentioned earlier, it is easy to remove the full-rank assumption by replacing $DA(x)$ in \eqref{eq:new slater defn} with its orthonormal basis. We assume $A$ to be full-rank for clarity at the cost of a simple projection to remove any ``redundant measurements'' from Program~\eqref{prob:01}.} +\item $g=1_C$ is the indicator on a bounded convex set $C\subset\RR^d$, +\item $A:\RR^d\rightarrow\RR^m$ is a nonsingular linear operator, represented with the full-rank matrix $A\in \RR^{m\times d}$,\footnote{As mentioned earlier, it is easy to remove the full-rank assumption by replacing $\D A(x)$ in \eqref{eq:new slater defn} with its orthonormal basis. We assume $A$ to be full-rank for the sake of clarity at the cost of a simple QR decomposition to remove any ``redundant measurements'' from problem~\eqref{prob:01}.} +\item and, lastly, the problem is feasible, namely, there exists $x\in C$ such that $Ax=0$. +\end{itemize} + Then, +\begin{itemize} +\item problem (\ref{prob:01}) satisfies the Slater's condition if (\ref{prob:01}) satisfies ($\infty,\max_{x\in C}\|x\|$) uniform regularity. +\item Moreover, suppose that $S$ is the affine hull of $C$. + Then, (\ref{prob:01}) satisfies the Slater's condition if and only if the uniform regularity with $\rho=0$ holds. \end{itemize} -Assume also that Program (\ref{prob:01}) is feasible, namely, there exists $x\in C$ such that $Ax=0$. Then CSC holds if NSC holds. -Moreover, suppose that $S$ is the affine hull of $C$. Then CSC holds if and only if NSC holds. \end{proposition} -\paragraph{Boundedness of $C$.} -Let us add that the boundedness of $C$ in Proposition~\ref{prop:convex} is necessary. For example, suppose that $A\in \RR^{1\times d}$ is the vector of all ones and, for a small perturbation vector $\epsilon\in \RR^d$, let $C=\{x\in \RR^d: (A+\epsilon) x\ge 0\}$ be a half space. Then CSC holds but $\nu(g,A,S)$ (with $S=\RR^d$) can be made arbitrarily small by making $\epsilon$ small. - - -\section{Proof of Proposition \ref{prop:convex} \label{sec:proof of prop}} -To prove the first claim of the proposition, suppose that CSC does not hold, namely, that -\begin{equation} -\relint(\Null(A) \cap C) = \Null(A)\cap \relint(C) = \emptyset, -\label{eq:no feas} -\end{equation} -where $\Null(A)$ and $\relint(C)$ denote the null space of the matrix $A$ and the relative interior of $C$, respectively. -By assumption, there exists $x_0\in C$ such that $Ax_0=0$. It follows from \eqref{eq:no feas} that $x_0\in \text{boundary}(C)$ and that $\Null(A)$ supports $C$ at $x_0$, namely, -$A x\ge 0$, for every $x\in C$. (The inequality applies to each entry of the vector $Ax$.) -Consequently, $\Null(A) \cap T_C(x_0) \ne \{0\}$, where $T_C(x_0)$ is the tangent cone of the set $C$ at $x_0$. -Equivalently, it holds that -$\row(A)\cap N_C(x_0) \ne \{0\}$, where $\row(A)$ is the row space of the matrix $A$. That is, there exists a unit-norm vector $v$ such that -$P_{T_C(x_0)}A^\top v=0$ -and, consequently, $P_S P_{T_C(x_0)}A^\top v=0$. Let us take $\rho'=\|x_0\|$ in \eqref{eq:nu for cvx}. 
We then conclude that $\nu(g,A,S)=0$, namely, NSC also does not hold, which proves the first claim in Proposition \ref{prop:convex}.
-
-For the converse, suppose that NSC does not hold with $\rho'=0$, namely, there exists $x\in \RR^d$ such that
-\begin{align*}
-\eta_m(P_S P_{T_C(x)} A^\top) & = \eta_m( P_{T_C(x)} A^\top) =0,
-\end{align*}
-\begin{align*}
-\quad A(x) = 0, \quad x\in C,
-\end{align*}
-where the first equality above holds because $S$ is the affine hull of $C$.
- Then, thanks to the boundedness of $C$, it must be the case that $x\in \text{boundary}(C)$. Indeed, if $x\in \text{relint}(C)$, we have that $T_C(x)=S$ and thus
-\begin{equation*}
-\eta_m(P_{S}P_{T_C(x)} A^\top)= \eta_m( P_S A^\top)>0,
-\end{equation*}
-where the inequality holds because, by assumption, $A$ is full-rank.
-Since $x\in\text{boundary}(C)$, it follows that $\text{dim}(T_C(x)) \ge m-1$. That is, $\text{row}(A)$ contains a unique direction orthogonal to $T_C(x)$. In particular, it follows that $\text{null}(A) \subset T_C(x)$ and, consequently, $\text{int}(\text{null}(A)\cap C) =\emptyset $, namely, CSC does not hold. This proves the second (and last) claim in Proposition \ref{prop:convex}.
+\paragraph{\emph{\textbf{Boundedness of $C$.}}}
+The boundedness of $C$ in Proposition~\ref{prop:convex} is, in a practical sense, necessary. For example, suppose that $m=1$, so that $A$ is a $1\times d$ row-vector. For a small perturbation vector $\epsilon\in \RR^d$, let $C=\{x\in \RR^d: (A+\epsilon) x\ge 0\}$ be a half-space. Then Slater's condition holds regardless of $\|\epsilon\|$. However, even though it remains positive, $\nu(g,A,\RR^d)$ can be made arbitrarily small by making $\|\epsilon\|$ small, which can lead to arbitrarily slow convergence. Put differently, unlike Slater's condition, $\nu(g,A,\RR^d)$ contains information about the convergence rate, as we will see shortly.
+
+
+
+The following result, proved in Appendix \ref{sec:proof of bnd Ak}, uses the nonconvex Slater's condition to establish a converse, bounding the feasibility gap by the gradient mapping.
+\begin{lemma}\label{lem:bnd bnd Ak} \textbf{\emph{(Nonconvex Slater's condition)}}
+For an integer interval $K=[k_0:k_1]$, let us set
+\begin{align}
+\cone(x_{k+1}) := \cone(\partial g(x_{k+1}))^* \subset \RR^d,
+\qquad \forall k\in K,
+\end{align}
+for short, and
+consider a subspace $S_K\subseteq \mathbb{R}^d$ such that
+\begin{align}
+S_{K} \supseteq \bigcup_{k\in K} P_{\cone(x_{k+1})}\left( DA(x_{k+1})^\top A(x_{k+1}) \right),
+\label{eq:defn of S lemma}
+\end{align}
+and, to simplify the notation, let $S_{K}$ also denote an orthonormal basis for this subspace.
+For $\rho,\rho',\rho''>0$, assume that
+\begin{align}
+\max_{k\in K}\|A(x_k)\| \le \rho,
+\qquad
+\max_{k\in K}\|x_k\| \le \rho',
+\qquad
+\max_{k\in K} \|x_{k+1}-x_k\|\le \rho''.
+\label{eq:good neighb}
+\end{align}
+Let us also set
+\begin{align}
+\nu_A :=
+\begin{cases}
+\min_{v,x} \, \frac{\left\| S_{K}^\top P_{\cone(x)} ( DA(x)^\top v) \right\|}{\|v\|} \\
+\|v\|\le \rho\\
+\|x\|\le \rho',
+\end{cases}
+\label{eq:new slater lemma}
+\end{align}
+and assume that $\nu_A\ge 2\lambda_A\rho''$.
+Then it holds that
+\begin{align}
+\|A(x_k)\|
+& \le \frac{2}{\nu_A\b_k} \left( \|G_k\| + \lambda'_f + \lambda'_A \| y_k\| \right),
+\qquad \forall k\in K,
+\label{eq:bnd on Ak final}
+\end{align}
+where
+\begin{align}
+\lambda'_f := \max_{\|x\| \le \rho'} \|\nabla f(x)\|,
+\qquad
+\lambda'_A := \max_{\|x\|\le \rho'} \| DA(x)\|.
+\end{align}
+\end{lemma}
+Roughly speaking, \eqref{eq:bnd on Ak final} states that the feasibility gap is itself bounded by the gradient mapping $G_k$, provided that $\nu_A$ is bounded away from zero.
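+% [Editorial sketch, not part of the manuscript: a sanity check of Lemma \ref{lem:bnd bnd Ak} in the linear,
+% convex setting of \eqref{eq:nu for cvx}, under our own simplifying choices.] Suppose $A(x)=Ax$ for a full-rank
+% matrix $A\in\RR^{m\times d}$, $g=1_C$ for a bounded convex set $C$, and $S_K=\RR^d$. Then $DA(x)\equiv A$, so
+% $\lambda'_A=\|A\|$, and, for $x\in C$, $\partial g(x)=N_C(x)$ so that $\cone(x)$ reduces to the tangent cone
+% $T_C(x)$ (reading $^*$ as the polar cone, consistently with the Moreau decomposition step used for
+% \eqref{eq:nu for cvx}). Consequently, $\nu_A$ in \eqref{eq:new slater lemma} coincides with the quantity in
+% \eqref{eq:nu for cvx} (for the same $\rho,\rho'$), and if $\lambda_A$ denotes a Lipschitz constant for
+% $x\mapsto DA(x)$, as we read it from the surrounding sections, then $\lambda_A=0$ and the assumption
+% $\nu_A\ge 2\lambda_A\rho''$ holds automatically. The bound \eqref{eq:bnd on Ak final} then reads
+% $\|Ax_k\|\le \tfrac{2}{\nu_A\b_k}\big(\|G_k\|+\lambda'_f+\|A\|\,\|y_k\|\big)$ for every $k\in K$.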