diff --git a/ICML19/Drafts/bibliography.bib b/ICML19/Drafts/bibliography.bib index 4ef943b..3b7ef2f 100644 --- a/ICML19/Drafts/bibliography.bib +++ b/ICML19/Drafts/bibliography.bib @@ -1,310 +1,324 @@ %% This BibTeX bibliography file was created using BibDesk. %% http://bibdesk.sourceforge.net/ %% Saved with string encoding Unicode (UTF-8) % Created by Mehmet Fatih Sahin for nonconvex inexact augmented lagrangian paper % December 14, Friday. 01:52 am + +@article{cartis2012complexity, + title={Complexity bounds for second-order optimality in unconstrained optimization}, + author={Cartis, Coralia and Gould, Nicholas IM and Toint, Ph L}, + journal={Journal of Complexity}, + volume={28}, + number={1}, + pages={93--108}, + year={2012}, + publisher={Elsevier} +} + + + @article{ghadimi2016accelerated, title={Accelerated gradient methods for nonconvex nonlinear and stochastic programming}, author={Ghadimi, Saeed and Lan, Guanghui}, journal={Mathematical Programming}, volume={156}, number={1-2}, pages={59--99}, year={2016}, publisher={Springer} } @article{boumal2014manopt, Author = {Boumal, Nicolas and Mishra, Bamdev and Absil, P-A and Sepulchre, Rodolphe}, Journal = {The Journal of Machine Learning Research}, Number = {1}, Pages = {1455--1459}, Publisher = {JMLR. 
org}, Title = {Manopt, a Matlab toolbox for optimization on manifolds}, Volume = {15}, Year = {2014}} @article{boumal2016global, Author = {Boumal, Nicolas and Absil, P-A and Cartis, Coralia}, Journal = {arXiv preprint arXiv:1605.08101}, Title = {Global rates of convergence for nonconvex optimization on manifolds}, Year = {2016}} @inproceedings{ge2016efficient, Author = {Ge, Rong and Jin, Chi and Netrapalli, Praneeth and Sidford, Aaron and others}, Booktitle = {International Conference on Machine Learning}, Pages = {2741--2750}, Title = {Efficient algorithms for large-scale generalized eigenvector computation and canonical correlation analysis}, Year = {2016}} @inproceedings{boumal2016non, Author = {Boumal, Nicolas and Voroninski, Vlad and Bandeira, Afonso}, Booktitle = {Advances in Neural Information Processing Systems}, Pages = {2757--2765}, Title = {The non-convex Burer-Monteiro approach works on smooth semidefinite programs}, Year = {2016}} @article{davis2011university, Author = {Davis, Timothy A and Hu, Yifan}, Journal = {ACM Transactions on Mathematical Software (TOMS)}, Number = {1}, Pages = {1}, Publisher = {ACM}, Title = {The University of Florida sparse matrix collection}, Volume = {38}, Year = {2011}} @article{clason2018acceleration, Author = {Clason, Christian and Mazurenko, Stanislav and Valkonen, Tuomo}, Journal = {arXiv preprint arXiv:1802.03347}, Title = {Acceleration and global convergence of a first-order primal--dual method for nonconvex problems}, Year = {2018}} @article{chambolle2011first, Author = {Chambolle, Antonin and Pock, Thomas}, Journal = {Journal of mathematical imaging and vision}, Number = {1}, Pages = {120--145}, Publisher = {Springer}, Title = {A first-order primal-dual algorithm for convex problems with applications to imaging}, Volume = {40}, Year = {2011}} @article{tran2018smooth, Author = {Tran-Dinh, Quoc and Fercoq, Olivier and Cevher, Volkan}, Journal = {SIAM Journal on Optimization}, Number = {1}, Pages = {96--134}, Publisher 
= {SIAM}, Title = {A smooth primal-dual optimization framework for nonsmooth composite convex minimization}, Volume = {28}, Year = {2018}} @article{bhojanapalli2018smoothed, Author = {Bhojanapalli, Srinadh and Boumal, Nicolas and Jain, Prateek and Netrapalli, Praneeth}, Journal = {arXiv preprint arXiv:1803.00186}, Title = {Smoothed analysis for low-rank solutions to semidefinite programs in quadratic penalty form}, Year = {2018}} @article{park2016provable, Author = {Park, Dohyung and Kyrillidis, Anastasios and Bhojanapalli, Srinadh and Caramanis, Constantine and Sanghavi, Sujay}, Journal = {arXiv preprint arXiv:1606.01316}, Title = {Provable Burer-Monteiro factorization for a class of norm-constrained matrix problems}, Year = {2016}} @article{mixon2016clustering, Author = {Mixon, Dustin G and Villar, Soledad and Ward, Rachel}, Journal = {arXiv preprint arXiv:1602.06612}, Title = {Clustering subgaussian mixtures by semidefinite programming}, Year = {2016}} @article{yurtsever2018conditional, Author = {Yurtsever, Alp and Fercoq, Olivier and Locatello, Francesco and Cevher, Volkan}, Journal = {arXiv preprint arXiv:1804.08544}, Title = {A Conditional Gradient Framework for Composite Convex Minimization with Applications to Semidefinite Programming}, Year = {2018}} @inproceedings{jin2017escape, Author = {Jin, Chi and Ge, Rong and Netrapalli, Praneeth and Kakade, Sham M and Jordan, Michael I}, Booktitle = {International Conference on Machine Learning}, Pages = {1724--1732}, Title = {How to Escape Saddle Points Efficiently}, Year = {2017}} @article{doi:10.1137/15M1031631, Author = {Birgin, E. and Gardenghi, J. and Mart{\'\i}nez, J. and Santos, S. 
and Toint, P.}, Doi = {10.1137/15M1031631}, Eprint = {https://doi.org/10.1137/15M1031631}, Journal = {SIAM Journal on Optimization}, Number = {2}, Pages = {951-967}, Title = {Evaluation Complexity for Nonlinear Constrained Optimization Using Unscaled KKT Conditions and High-Order Models}, Url = {https://doi.org/10.1137/15M1031631}, Volume = {26}, Year = {2016}, Bdsk-Url-1 = {https://doi.org/10.1137/15M1031631}, Bdsk-Url-2 = {http://dx.doi.org/10.1137/15M1031631}} @article{liu2017linearized, Author = {Liu, Qinghua and Shen, Xinyue and Gu, Yuantao}, Journal = {arXiv preprint arXiv:1705.02502}, Title = {Linearized admm for non-convex non-smooth optimization with convergence analysis}, Year = {2017}} @article{wang2015global, Author = {Wang, Yu and Yin, Wotao and Zeng, Jinshan}, Journal = {arXiv preprint arXiv:1511.06324}, Title = {Global convergence of ADMM in nonconvex nonsmooth optimization}, Year = {2015}} @book{bertsekas2014constrained, title={Constrained optimization and Lagrange multiplier methods}, author={Bertsekas, Dimitri P}, year={2014}, publisher={Academic press} } @article{lan2016iteration, Author = {Lan, Guanghui and Monteiro, Renato DC}, Journal = {Mathematical Programming}, Number = {1-2}, Pages = {511--547}, Publisher = {Springer}, Title = {Iteration-complexity of first-order augmented Lagrangian methods for convex programming}, Volume = {155}, Year = {2016}} @article{nedelcu2014computational, title={Computational complexity of inexact gradient augmented Lagrangian methods: application to constrained MPC}, author={Nedelcu, Valentin and Necoara, Ion and Tran-Dinh, Quoc}, journal={SIAM Journal on Control and Optimization}, volume={52}, number={5}, pages={3109--3134}, year={2014}, publisher={SIAM} } @book{bertsekas1999nonlinear, title={Nonlinear programming}, author={Bertsekas, Dimitri P} } @article{hestenes1969multiplier, title={Multiplier and gradient methods}, author={Hestenes, Magnus R}, journal={Journal of optimization theory and applications}, 
volume={4}, number={5}, pages={303--320}, year={1969}, publisher={Springer} } @incollection{powell1978fast, title={A fast algorithm for nonlinearly constrained optimization calculations}, author={Powell, Michael JD}, booktitle={Numerical analysis}, pages={144--157}, year={1978}, publisher={Springer} } @article{burer2005local, title={Local minima and convergence in low-rank semidefinite programming}, author={Burer, Samuel and Monteiro, Renato DC}, journal={Mathematical Programming}, volume={103}, number={3}, pages={427--444}, year={2005}, publisher={Springer} } @article{burer2003nonlinear, title={A nonlinear programming algorithm for solving semidefinite programs via low-rank factorization}, author={Burer, Samuel and Monteiro, Renato DC}, journal={Mathematical Programming}, volume={95}, number={2}, pages={329--357}, year={2003}, publisher={Springer} } @article{singer2011angular, title={Angular synchronization by eigenvectors and semidefinite programming}, author={Singer, Amit}, journal={Applied and computational harmonic analysis}, volume={30}, number={1}, pages={20}, year={2011}, publisher={NIH Public Access} } @article{singer2011three, title={Three-dimensional structure determination from common lines in cryo-EM by eigenvectors and semidefinite programming}, author={Singer, Amit and Shkolnisky, Yoel}, journal={SIAM journal on imaging sciences}, volume={4}, number={2}, pages={543--572}, year={2011}, publisher={SIAM} } @inproceedings{song2007dependence, title={A dependence maximization view of clustering}, author={Song, Le and Smola, Alex and Gretton, Arthur and Borgwardt, Karsten M}, booktitle={Proceedings of the 24th international conference on Machine learning}, pages={815--822}, year={2007}, organization={ACM} } @inproceedings{mossel2015consistency, title={Consistency thresholds for the planted bisection model}, author={Mossel, Elchanan and Neeman, Joe and Sly, Allan}, booktitle={Proceedings of the forty-seventh annual ACM symposium on Theory of computing}, 
pages={69--75}, year={2015}, organization={ACM} } @incollection{lovasz2003semidefinite, title={Semidefinite programs and combinatorial optimization}, author={Lov{\'a}sz, L{\'a}szl{\'o}}, booktitle={Recent advances in algorithms and combinatorics}, pages={137--194}, year={2003}, publisher={Springer} } @article{khot2011grothendieck, title={Grothendieck-type inequalities in combinatorial optimization}, author={Khot, Subhash and Naor, Assaf}, journal={arXiv preprint arXiv:1108.2464}, year={2011} } @article{park2016provable, title={Provable Burer-Monteiro factorization for a class of norm-constrained matrix problems}, author={Park, Dohyung and Kyrillidis, Anastasios and Bhojanapalli, Srinadh and Caramanis, Constantine and Sanghavi, Sujay}, journal={arXiv preprint arXiv:1606.01316}, year={2016} } @article{mixon2016clustering, title={Clustering subgaussian mixtures by semidefinite programming}, author={Mixon, Dustin G and Villar, Soledad and Ward, Rachel}, journal={arXiv preprint arXiv:1602.06612}, year={2016} } @inproceedings{kulis2007fast, title={Fast low-rank semidefinite programming for embedding and clustering}, author={Kulis, Brian and Surendran, Arun C and Platt, John C}, booktitle={Artificial Intelligence and Statistics}, pages={235--242}, year={2007} } @article{Peng2007, Author = {Peng, J. and Wei, Y.}, Date-Added = {2017-10-26 15:22:37 +0000}, Date-Modified = {2017-10-26 15:30:53 +0000}, Journal = {SIAM J. 
Optim.}, Number = {1}, Pages = {186--205}, Title = {Approximating {K}--means--type clustering via semidefinite programming}, Volume = {18}, Year = {2007}} @book{fletcher2013practical, title={Practical methods of optimization}, author={Fletcher, Roger}, year={2013}, publisher={John Wiley \& Sons} } \ No newline at end of file diff --git a/ICML19/Drafts/sections/guarantees.tex b/ICML19/Drafts/sections/guarantees.tex index 7fb4a71..bdf4514 100644 --- a/ICML19/Drafts/sections/guarantees.tex +++ b/ICML19/Drafts/sections/guarantees.tex @@ -1,72 +1,103 @@ \section{Convergence Rate \label{sec:cvg rate}} Let us now quantify the global convergence rate of our algorithm below, with the proof deferred to Appendix~\ref{sec:theory}. \begin{theorem}[\textbf{Inexact AL}] \label{thm:main} In addition to the strong smoothness of $f$ and $A$ in (\ref{eq:smoothness basic}) and for $\rho'>0$, let us define \begin{align} \lambda'_f = \max_{\|x\|\le \rho'} \|\nabla f(x)\|, \qquad \lambda'_A = \max_{\|x\| \le \rho'} \|DA(x)\|, \end{align} to be the (restricted) Lipschitz constants of $f$ and $A$, respectively. For sufficiently large integers $k_0\le k_1$, consider the interval $K=[k_0:k_1]$. Let $\{x_k\}_{k\in K}$ be the output sequence of Algorithm~\ref{Algo:2} and, for $\nu>0$, assume that \begin{align} \nu \|A(x_k)\| & \le \dist\left( -DA(x_k)^\top A(x_k) , \frac{\partial g(x_k)}{ \b_{k-1}} \right), \label{eq:regularity} \end{align} for every $k\in K$. Then the output of the inexact AL in Algorithm~\ref{Algo:2} satisfies \begin{align} & \dist^2( -\nabla_x \L_{\b_{k-1}}(x_k,y_k),\partial g(x_k)) + \| A(x_k)\|^2 \nonumber\\ & = \frac{1}{\b_{k-1}^2}\left( \frac{8 \lambda'_A\s_k^2 + 4}{ \nu^2} ( \lambda'_f+\lambda'_A y_{\max})^2 + 2\right) \nonumber \\ &=: \frac{Q}{\beta_{k-1}^2}. \end{align} \end{theorem} A few remarks are in order about Theorem~\ref{thm:main}. 
\paragraph{Tradeoff.} Roughly speaking, Theorem~\ref{thm:main} states that the iterates of Algorithm~\ref{Algo:2} approach first-order stationarity for Program~\eqref{prob:01} at the rate of $1/{\b_k^2}$, where $\b_k$ is the penalty weight in AL in iteration $k$. Note the inherent tradeoff here: For a faster sequence $\{\b_k\}_k$, the iterates approach stationarity faster but the computational cost of the inexact primal subroutine (Step 2) likely increases. In words, there is a tradeoff between the number of outer and inner loop iterations of Algorithm~\ref{Algo:2}. We highlight this tradeoff for a number of subroutines below. First, we characterize the iteration complexity of Algorithm~\ref{Algo:2} for finding a first-order stationary point of~\eqref{prob:01}. We propose to use the standard accelerated proximal gradient method (APGM), the guarantees of which are analyzed in~\cite{ghadimi2016accelerated} for nonconvex problems of the form~\eqref{e:exac}. They show that APGM can find an $\epsilon$-first-order stationary point in \begin{equation} -\mathcal{O}\left ( \frac{L_\beta\left( \|x^\star \|^2 + M^2 \right)}{\epsilon} \right) +\mathcal{O}\left ( \frac{L_\beta^2\left( \|x^\star \|^2 + M^2 \right)}{\epsilon} \right) \end{equation} iterations, where $L_\beta$ denotes the Lipschitz constant of the gradient of ${\mathcal{L}_{\beta}(x, y)}$ and $M$ is defined by the following bound: \begin{equation} \min_{\text{dom}\{g\}}\|x\| \leq M. \end{equation} Using this result, we can get the following corollary describing the iteration complexity: \begin{corollary}\label{cor: first_order} Let us use the following form for $\beta$ \begin{equation*} \beta_k = k^\alpha, ~~ \alpha > 0. \end{equation*} If we use APGM from~\cite{ghadimi2016accelerated} for step 2 in Algorithm~\ref{Algo:2}, it finds an $\epsilon$-first-order stationary point in the sense that \begin{equation} \dist(-\nabla_x \mathcal{L}_\beta(x,y), \partial g(x)) \leq \epsilon, ~~~~ \|A(x)\| \leq \epsilon.
\end{equation} in $T_K$ total number of iterations, where \begin{equation} T_K \leq \left\lceil \frac{Q^{\frac{3}{2}+\frac{1}{2\alpha}}(\|x^\star\|^2 + M^2)}{\epsilon^{3+\frac{1}{\alpha}}} \right\rceil. \end{equation} \end{corollary} +{\color{red}{Ahmet (note to myself): not sure of the constants of trust-region, check again}} + +We continue by deriving total complexity bounds for finding second-order stationary points of~\eqref{prob:01} in the sense that + +\begin{equation} +\lambda_{\text{min}}\left(\nabla _{xx} \mathcal{L}_{\beta_k}(x_k, y_k) \right) \geq -\epsilon. +\end{equation} + +We consider the case of using the trust-region method developed in~\cite{cartis2012complexity} for Step 2 of Algorithm~\ref{Algo:2}. +In~\cite{cartis2012complexity}, the complexity of finding a second-order stationary point of an unconstrained problem is derived to be + +\begin{equation} +\mathcal{O}\left ( \frac{L_{\beta, H}^2}{\epsilon^3} \right), +\end{equation} + +which consequently gives the following result: + +\begin{corollary}\label{cor: second_order} +Let us use the following form for $\beta$ +\begin{equation*} +\beta_k = k^\alpha, ~~ \alpha > 0. +\end{equation*} + +If we use the trust-region method from~\cite{cartis2012complexity} for step 2 in Algorithm~\ref{Algo:2}, it finds an $\epsilon$-second-order stationary point of~\eqref{prob:01} in $T_K$ total number of iterations, where + +\begin{equation} +T_K \leq \left\lceil \frac{Q^{\frac{5}{2}+\frac{1}{2\alpha}}}{\epsilon^{5+\frac{1}{\alpha}}} \right\rceil. +\end{equation} +\end{corollary} + + \paragraph{Regularity.} The key condition in Theorem~\ref{thm:main} is \eqref{eq:regularity}. As the penalty weight $\b_k$ grows, the primal subroutine (Step 2) places an increasing emphasis on reducing the feasibility gap and \eqref{eq:regularity} formalizes this intuition. We verify this condition rigorously in several examples below.
We argue that such a condition is necessary for controlling the feasibility gap of Program~\eqref{prob:01} and ensuring the success of Algorithm~\ref{Algo:2}. To motivate this condition, recall that Slater's condition plays a key role in convex optimization as a sufficient condition for strong duality. As a result, Slater's condition guarantees the success of a variety of primal-dual algorithms for convex and constrained programming. As a visual example, in Program~\eqref{prob:01}, when $f=0$, $g=1_C$ is the indicator function of a convex set $C$, and $A$ is a linear operator, Slater's condition removes any pathological cases by ensuring that the affine subspace is not tangent to $C$, see Figure~\ref{fig:convex_slater}. In such pathological cases, finding a feasible point of Program~\eqref{prob:01} is particularly difficult. For example, the iterates of the alternating projection algorithm (which iteratively project onto $\null(A)$ and $C$) have arbitrarily slow convergence, as shown in Figure~\ref{fig:convex_slater}. \textbf{We should add a figure here with circle and line.}