diff --git a/ICML19/Drafts/main.tex b/ICML19/Drafts/main.tex index c1e795b..a86105e 100644 --- a/ICML19/Drafts/main.tex +++ b/ICML19/Drafts/main.tex @@ -1,163 +1,166 @@ %%%%%%%% ICML 2019 EXAMPLE LATEX SUBMISSION FILE %%%%%%%%%%%%%%%%% \documentclass{article} % Recommended, but optional, packages for figures and better typesetting: \usepackage{microtype} \usepackage{graphicx} \usepackage{subfigure} \usepackage{booktabs} % for professional tables % hyperref makes hyperlinks in the resulting PDF. % If your build breaks (sometimes temporarily if a hyperlink spans a page) % please comment out the following usepackage line and replace % \usepackage{icml2019} with \usepackage[nohyperref]{icml2019} above. \usepackage{hyperref} % Attempt to make hyperref and algorithmic work together better: \newcommand{\theHalgorithm}{\arabic{algorithm}} % Use the following line for the initial blind version submitted for review: \usepackage{icml2019} % If accepted, instead use the following line for the camera-ready submission: %\usepackage[accepted]{icml2019} % The \icmltitle you define below is probably too long as a header. % Therefore, a short form for the running title is supplied here: \icmltitlerunning{Submission and Formatting Instructions for ICML 2019} % Our own packages \usepackage{amsthm} \usepackage{amsmath} \usepackage{amssymb} \usepackage{graphicx} \usepackage{algorithm} \usepackage{color} %\usepackage{cite} +\usepackage{enumitem} + + % AE's commands \newcommand{\RR}{\mathbb{R}} \newcommand{\edita}[1]{{\color{blue} #1}} \newcommand{\notea}[1]{{\color{magenta} \textbf{Note:AE: #1}}} \newcommand{\ol}{\overline} \newcommand{\Null}{\operatorname{null}} \newcommand{\relint}{\operatorname{relint}} \newcommand{\row}{\operatorname{row}} \newcommand{\s}{\sigma} \newcommand{\editaa}[1]{{\color{red} #1}} \renewcommand{\b}{\beta} \renewcommand{\L}{\mathcal{L}} % Lagrangian \renewcommand{\i}{\iota} \newcommand{\g}{\gamma} \newcommand{\cone}{\operatorname{cone}} \newcommand{\argmin}{\operatorname{argmin}} \newcommand{\dist}{\operatorname{dist}} % MFS's commands \newcommand{\editf}[1]{{\color[rgb]{1,0.4,0.4} #1}} %\DeclareMathOperator*{\argmax}{argmax} % thin space, limits underneath in displays % Theorem style \newtheorem{theorem}{Theorem}[section] % reset theorem numbering for each section \newtheorem{question}[theorem]{Question} \newtheorem{lemma}[theorem]{Lemma} \newtheorem{definition}{Definition} \newtheorem{proposition}{Proposition} \begin{document} \twocolumn[ \icmltitle{Submission and Formatting Instructions for \\ International Conference on Machine Learning (ICML 2019)} % It is OKAY to include author information, even for blind % submissions: the style file will automatically remove it for you % unless you've provided the [accepted] option to the icml2019 % package. % List of affiliations: The first argument should be a (short) % identifier you will use later to specify author affiliations % Academic affiliations should list Department, University, City, Region, Country % Industry affiliations should list Company, City, Region, Country % You can specify symbols, otherwise they are numbered in order. % Ideally, you should not use this facility. Affiliations will be numbered % in order of appearance and this is the preferred way. 
\icmlsetsymbol{equal}{*} \begin{icmlauthorlist} \icmlauthor{Aeiau Zzzz}{equal,to} \icmlauthor{Bauiu C.~Yyyy}{equal,to,goo} \icmlauthor{Cieua Vvvvv}{goo} \icmlauthor{Iaesut Saoeu}{ed} \icmlauthor{Fiuea Rrrr}{to} \icmlauthor{Tateu H.~Yasehe}{ed,to,goo} \icmlauthor{Aaoeu Iasoh}{goo} \icmlauthor{Buiui Eueu}{ed} \icmlauthor{Aeuia Zzzz}{ed} \icmlauthor{Bieea C.~Yyyy}{to,goo} \icmlauthor{Teoau Xxxx}{ed} \icmlauthor{Eee Pppp}{ed} \end{icmlauthorlist} \icmlaffiliation{to}{Department of Computation, University of Torontoland, Torontoland, Canada} \icmlaffiliation{goo}{Googol ShallowMind, New London, Michigan, USA} \icmlaffiliation{ed}{School of Computation, University of Edenborrow, Edenborrow, United Kingdom} \icmlcorrespondingauthor{Cieua Vvvvv}{c.vvvvv@googol.com} \icmlcorrespondingauthor{Eee Pppp}{ep@eden.co.uk} % You may provide any keywords that you % find helpful for describing your paper; these are used to populate % the "keywords" metadata in the PDF but will not be shown in the document \icmlkeywords{Machine Learning, ICML} \vskip 0.3in ] % this must go after the closing bracket ] following \twocolumn[ ... % This command actually creates the footnote in the first column % listing the affiliations and the copyright notice. % The command takes one argument, which is text to display at the start of the footnote. % The \icmlEqualContribution command is standard text for equal contribution. % Remove it (just {}) if you do not need this facility. %\printAffiliationsAndNotice{} % leave blank if no need to mention equal contribution \printAffiliationsAndNotice{\icmlEqualContribution} % otherwise use the standard text. %\input{preamble.tex} \input{sections/abstract.tex} \input{sections/introduction.tex} \input{sections/preliminaries.tex} \input{sections/AL.tex} \input{sections/guarantees.tex} \input{sections/related_works.tex} \input{sections/experiments.tex} \input{sections/AL_theory.tex} \appendix \input{sections/appendix.tex} %\begin{acknowledgements} %If you'd like to thank anyone, place your comments here %and remove the percent signs. %\end{acknowledgements} % BibTeX users please use one of %\bibliographystyle{icml2019} % basic style, author-year citations %\bibliography{bibliography.bib} % name your BibTeX data base \end{document} % end of file template.tex diff --git a/ICML19/Drafts/sections/AL.tex b/ICML19/Drafts/sections/AL.tex index 5e501ef..2d64cd9 100644 --- a/ICML19/Drafts/sections/AL.tex +++ b/ICML19/Drafts/sections/AL.tex @@ -1,40 +1,40 @@ \section{Inexact AL Algorithm \label{sec:AL algorithm}} To solve the equivalent formulation presented in Program~\eqref{eq:minmax}, we propose the inexact augmented Lagrangian algorithm, detailed in Algorithm~\ref{Algo:2}. Iteration $k$ of this algorithm calls a subroutine that finds an approximate stationary point of the augmented Lagrangian $\L_{\b_k}(\cdot,y_k)$ with increasing accuracy of $\epsilon_{k+1}$ (Step 2). The increasing sequence $\{\b_k\}_k$ and the dual update (Steps 4 and 5) are responsible for gradually enforcing the constraints in Program~\eqref{prob:01}. As we will see in the convergence analysis, the particular choice of dual step size $\s_k$ in Algorithm~\ref{Algo:2} ensures that the dual variable $y_k$ remains bounded. 
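For concreteness, one illustrative choice of the parameters (by no means the only one) is
\begin{align*}
\b_k = \frac{\b_{1}}{\log 2}\sqrt{k}\log^2(k+1),
\qquad
\epsilon_{k+1} = \frac{1}{\b_k},
\end{align*}
so that the primal subroutine in Step 2 is solved to an increasingly tight tolerance as $k$ grows, while the choice of $\s_k$ in Step 4 keeps the sequence $\{\s_k \|A(x_k)\|\}_k$ summable, which is what keeps the dual sequence $\{y_k\}_k$ bounded in the analysis.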
\begin{algorithm} Input: Parameters $\rho,\rho',\rho''>0$, initialization $x_{1}\in \RR^d$ such that $\|A(x_1)\|\le \rho$ and $\|x_1\|\le \rho'$, dual initialization $y_0\in \RR^m$ and initial dual step size $\s_1$, a non-decreasing, positive, unbounded sequence $\{\b_k\}_{k\ge 1}$, stopping threshold $\tau$. \\ For $k=1,2,\ldots$, execute\\ -\begin{enumerate} +\begin{enumerate}[leftmargin=*] \item \textbf{(Update tolerance)} $\epsilon_{k+1} = 1/\b_k$. %\begin{align} %\beta_k = \frac{\beta_{1}}{\log 2}\sqrt{k}\log^2(k+1) . %\end{align} \item \textbf{(Inexact primal solution)} Obtain $x_{k+1}\in \RR^d$ such that $\dist(-\nabla_x \L_{\beta_k} (x_{k+1},y_k), \partial g(x_{k+1}) ) \le \epsilon_{k+1}$.\\ \item \textbf{(Control)} If necessary, project $x_{k+1}$ to ensure that $\|x_{k+1}\|\le \rho'$.\\ \item \textbf{(Update dual step size)} \begin{align*} \s_{k+1} & = \s_{1} \min\Big( \frac{\|A(x_1)\| \log^2 2 }{\|A(x_{k+1})\| (k+1)\log^2(k+2)} ,1 \Big). \end{align*} \item \textbf{(Dual ascent step)} $y_{k+1} = y_{k} + \sigma_{k+1}A(x_{k+1})$. \item \textbf{(Stopping criterion)} If \begin{align*} & \dist^2(-\nabla_x \L_{\b_k}(x_{k+1},y_{k+1}),\partial g(x_{k+1})) \nonumber\\ & \qquad + \s_{k+1} \|A(x_{k+1})\|^2 \le \tau, \end{align*} then quit and return $x_{k+1}$ as an (approximate) stationary point of Program~\eqref{prob:01}. \end{enumerate} \caption{Linearized AL for solving Program \eqref{prob:01}\label{Algo:2}} \end{algorithm}
diff --git a/ICML19/Drafts/sections/AL_theory.tex b/ICML19/Drafts/sections/AL_theory.tex index 0bce444..44c6c8e 100644 --- a/ICML19/Drafts/sections/AL_theory.tex +++ b/ICML19/Drafts/sections/AL_theory.tex @@ -1,138 +1,132 @@ \section{Proof of Theorem \ref{thm:main} \label{sec:theory}} For every $k\ge2$, recall from (\ref{eq:Lagrangian}) and Step 2 of Algorithm~\ref{Algo:2} that $x_{k}$ satisfies \begin{align} & \dist(-\nabla f(x_k) - DA(x_k)^\top y_{k-1} \nonumber\\ & \qquad - \b_{k-1} DA(x_{k})^\top A(x_k) ,\partial g(x_k) ) \nonumber\\ & = \dist(-\nabla_x \L_{\b_{k-1}} (x_k ,y_{k-1}) ,\partial g(x_k) ) \le \epsilon_{k}. \end{align} With an application of the triangle inequality, it follows that \begin{align} & \dist( -\b_{k-1} DA(x_k)^\top A(x_k) , \partial g(x_k) ) \nonumber\\ & \qquad \le \| \nabla f(x_k )\| + \| DA(x_k)^\top y_{k-1}\| + \epsilon_k, \end{align} which implies that \begin{align} & \dist( -DA(x_k)^\top A(x_k) , \partial g(x_k)/ \b_{k-1} ) \nonumber\\ & \le \frac{ \| \nabla f(x_k )\|}{\b_{k-1} } + \frac{\| DA(x_k)^\top y_{k-1}\|}{\b_{k-1} } + \frac{\epsilon_k}{\b_{k-1} } \nonumber\\ & \le \frac{\lambda'_f+\lambda'_A \|y_{k-1}\|+\epsilon_k}{\b_{k-1}} , \label{eq:before_restriction} \end{align} where \begin{align} \lambda'_f := \max_{\|x\| \le \rho'} \|\nabla f(x)\|, \quad \lambda'_A := \max_{\|x\|\le \rho'} \| DA(x)\|. \label{eq:defn_lambda'} \end{align} For example, when $g=1_C$ as in \eqref{eq:indicator}, then \eqref{eq:before_restriction} reads as \begin{align} & \| P_{T_C(x_k)} DA(x_k)^\top A(x_k) \| \nonumber\\ & \qquad \le \frac{ \lambda'_f+ \lambda'_A \|y_{k-1}\| + \epsilon_k }{\b_{k-1} } . \label{eq:before} \end{align} -We next translate \eqref{eq:before_restriction} into a bound on the feasibility gap $\|A(x_k)\|$, using the nonconvex Slater's condition in Definition~\ref{defn:nonconvex slater}. Recall that we use the notation $P_{\partial g(x)/\b}$ for the projection operator onto the set $\partial g(x)/\b$.
-Consider a subspace $S_K\subseteq \mathbb{R}^d$ such that -\begin{align} -S _{K} \supseteq \left\{ P_{\frac{\partial g(x_{k+1})}{\b_{k+1}} } \left( DA(x_{k+1})^\top A(x_{k+1}) \right) \right\}_{k\in K}, -\label{eq:defn of S lemma} -\end{align} -and, to simplify the notation, let $S_{K}$ also denote an orthonormal basis for this subspace. Then, the left-hand side of \eqref{eq:before_restriction} can be bounded below as +We next translate \eqref{eq:before_restriction} into a bound on the feasibility gap $\|A(x_k)\|$. Using the regularity condition \eqref{eq:regularity}, the left-hand side of \eqref{eq:before_restriction} can be bounded below as \begin{align} & \dist( -DA(x_k)^\top A(x_k) , \partial g(x_k)/ \b_{k-1} ) \nonumber\\ -& = \left\| \left(\text{id} - P_{S_K} P_{\frac{\partial g(x_k)}{\b_{k-1}}} \right) (DA(x_k)^\top A(x_k) ) \right\| \nonumber\\ -& \ge \nu_A \|A(x_k) \|, -\qquad \text{(see (\ref{eq:defn_nu_A}))} +& \ge \nu \|A(x_k) \|, +\qquad \text{(see (\ref{eq:regularity}))} \label{eq:restrited_pre} \end{align} where the last line above holds if $\rho,\rho'$ satisfy \begin{align} \max_{k\in K} \|A(x_k)\| \le \rho, \qquad \max_{k\in K} \|x_k\| \le \rho'. \label{eq:to_be_checked_later} \end{align} By substituting \eqref{eq:restrited_pre} back into \eqref{eq:before_restriction}, we find that \begin{align} -\|A(x_k)\| \le \frac{ \lambda'_f + \lambda'_A \|y_{k-1}\| + \epsilon_k}{\nu_A \b_{k-1} }. +\|A(x_k)\| \le \frac{ \lambda'_f + \lambda'_A \|y_{k-1}\| + \epsilon_k}{\nu \b_{k-1} }. \label{eq:before_dual_controlled} \end{align} In words, the feasibility gap is directly controlled by the dual sequence $y_k$. We next establish that the dual sequence is bounded. Indeed, for every $k\in K$, note that \begin{align} \|y_k\| & = \| y_0 + \sum_{i=1}^{k} \s_i A(x_i) \| \quad \text{(Step 5 of Algorithm \ref{Algo:2})} \nonumber\\ & \le \|y_0\|+ \sum_{i=1}^k \s_i \|A(x_i)\| \qquad \text{(triangle inequality)} \nonumber\\ & \le \|y_0\|+ \sum_{i=1}^k \frac{ \s_1 \|A(x_1)\| \log^2 2 }{ i \log^2(i+1)} \quad \text{(Step 4)} \nonumber\\ & \le \|y_0\|+ c\, \s_1 \|A(x_1) \| \log^2 2 =: y_{\max}, \label{eq:dual growth} \end{align} where $c$ is a finite constant satisfying \begin{align} c \ge \sum_{i=1}^{\infty} \frac{1}{i \log^2 (i+1)}, \end{align} which is possible because the series above converges. Substituting \eqref{eq:dual growth} back into \eqref{eq:before_dual_controlled}, we reach \begin{align} -\|A(x_k)\| \le \frac{ \lambda'_f + \lambda'_A y_{\max} + \epsilon_k}{\nu_A \b_{k-1} } . +\|A(x_k)\| \le \frac{ \lambda'_f + \lambda'_A y_{\max} + \epsilon_k}{\nu \b_{k-1} } . \label{eq:cvg metric part 2} \end{align} Let us now revisit and simplify \eqref{eq:to_be_checked_later}. Note that $\rho'$ automatically satisfies the second inequality there by Step 3 of Algorithm~\ref{Algo:2}. Also, $\rho$ satisfies the first inequality in \eqref{eq:to_be_checked_later} if \begin{align} \frac{\lambda'_f+\lambda'_A y_{\max}}{ \nu \b_1} \le \rho/2, \end{align} and $k_0$ is large enough. Indeed, \begin{align} -\|A(x_k)\| & \le \frac{ \lambda'_f + \lambda'_A y_{\max} + \epsilon_k}{\nu_A \b_{k-1} } +\|A(x_k)\| & \le \frac{ \lambda'_f + \lambda'_A y_{\max} + \epsilon_k}{\nu \b_{k-1} } \qquad \text{(see \eqref{eq:cvg metric part 2})} \nonumber\\ -& \le \frac{ 2( \lambda'_f + \lambda'_A y_{\max})}{\nu_A \b_1 } +& \le \frac{ 2( \lambda'_f + \lambda'_A y_{\max})}{\nu \b_1 } \qquad \text{(see \eqref{eq:dual growth})}, \end{align} where the last line above holds when $k_0$ is large enough, because $\b_k \ge \b_1$ and $\lim_{k\rightarrow\infty} \epsilon_k=0$. It remains to control the first term in \eqref{eq:cvg metric}.
To that end, recalling Step 2 of Algorithm~\ref{Algo:2} and applying the triangle inequality, we write that \begin{align} & \dist( -\nabla_x \L_{\b_{k-1}} (x_k,y_{k}), \partial g(x_{k}) ) \nonumber\\ & \le \dist( -\nabla_x \L_{\b_{k-1}} (x_k,y_{k-1}) , \partial g(x_{k}) ) \nonumber\\ -& \qquad + \| \nabla_x \L_{\b_{k-1}} (x_k,y_{k})-\nabla_x \L_{\b_{k-1}} (x_k,y_{k-1}) \|. +& + \| \nabla_x \L_{\b_{k-1}} (x_k,y_{k})-\nabla_x \L_{\b_{k-1}} (x_k,y_{k-1}) \|. \label{eq:cvg metric part 1 brk down} \end{align} The first term on the right-hand side above is bounded by $\epsilon_k$, by Step 2 of Algorithm~\ref{Algo:2}. For the second term on the right-hand side of \eqref{eq:cvg metric part 1 brk down}, we write that \begin{align} & \| \nabla_x \L_{\b_{k-1}} (x_k,y_{k})-\nabla_x \L_{\b_{k-1}} (x_k,y_{k-1}) \| \nonumber\\ & = \| DA(x_k)^\top (y_k - y_{k-1}) \| \qquad \text{(see \eqref{eq:Lagrangian})} \nonumber\\ & \le \lambda'_A \|y_k- y_{k-1}\| \qquad \text{(see \eqref{eq:defn_lambda'})} \nonumber\\ & = \lambda'_A \s_k \|A (x_k) \| \qquad \text{(see Step 5 of Algorithm \ref{Algo:2})} \nonumber\\ -& \le \frac{\lambda'_A \s_k }{\nu_A \b_{k-1} }( \lambda'_f+ \lambda'_Ay_{\max}+ \epsilon_k) . +& \le \frac{\lambda'_A \s_k }{\nu \b_{k-1} }( \lambda'_f+ \lambda'_Ay_{\max}+ \epsilon_k) . \label{eq:part_1_2} \end{align} By combining (\ref{eq:cvg metric part 1 brk down},\ref{eq:part_1_2}), we find that \begin{align} & \dist( -\nabla_x \L_{\b_{k-1}} (x_k,y_{k}), \partial g(x_{k}) ) \nonumber\\ -& \le \frac{\lambda'_A \s_k }{\nu_A \b_{k-1} }( \lambda'_f+ \lambda'_Ay_{\max}+ \epsilon_k) + \epsilon_k. +& \le \frac{\lambda'_A \s_k }{\nu \b_{k-1} }( \lambda'_f+ \lambda'_Ay_{\max}+ \epsilon_k) + \epsilon_k. \label{eq:cvg metric part 1} \end{align} By combining (\ref{eq:cvg metric part 1},\ref{eq:cvg metric part 2}), we find that \begin{align} & \dist^2( -\nabla_x \L_{\b_{k-1}}(x_k,y_k),\partial g(x_k)) + \| A(x_k)\|^2 \nonumber\\ -& \le \left( \frac{\lambda'_A \s_k }{\nu_A \b_{k-1} }( \lambda'_f+ \lambda'_Ay_{\max}+ \epsilon_k) + \epsilon_k \right)^2 \nonumber\\ +& \le \left( \frac{\lambda'_A \s_k }{\nu \b_{k-1} }( \lambda'_f+ \lambda'_Ay_{\max}+ \epsilon_k) + \epsilon_k \right)^2 \nonumber\\ & \qquad + \left( \frac{ \lambda'_f + \lambda'_A y_{\max} + \epsilon_k}{\nu \b_{k-1} } \right)^2 \nonumber\\ & \le \left( \frac{2\lambda'_A \s_k }{\nu \b_{k-1} }( \lambda'_f+ \lambda'_Ay_{\max}) + \epsilon_k \right)^2 \nonumber\\ & \qquad + 4\left( \frac{ \lambda'_f + \lambda'_A y_{\max}}{\nu \b_{k-1} } \right)^2, \end{align} where the last line above holds when $k_0$ is sufficiently large, because $\lim_{k\rightarrow \infty}\epsilon_k =0$. Applying the inequality $(a+b)^2 \le 2a^2+2b^2$, we find that \begin{align} & \dist^2( -\nabla_x \L_{\b_{k-1}}(x_k,y_k),\partial g(x_k)) + \| A(x_k)\|^2 \nonumber\\ -& \le \frac{8 \lambda'_A\s_k^2 + 4}{ \nu_A^2\b_{k-1}^2} ( \lambda'_f+\lambda'_A y_{\max})^2 + 2\epsilon_k^2 \nonumber\\ -& = \frac{1}{\b_{k-1}^2}\left( \frac{8 \lambda'_A\s_k^2 + 4}{ \nu_A^2} ( \lambda'_f+\lambda'_A y_{\max})^2 + 2\right). +& \le \frac{8 {\lambda'_A}^2\s_k^2 + 4}{ \nu^2\b_{k-1}^2} ( \lambda'_f+\lambda'_A y_{\max})^2 + 2\epsilon_k^2 \nonumber\\ +& = \frac{1}{\b_{k-1}^2}\left( \frac{8 {\lambda'_A}^2\s_k^2 + 4}{ \nu^2} ( \lambda'_f+\lambda'_A y_{\max})^2 + 2\right), \end{align} -This completes the proof of Theorem \ref{thm:main}. +which completes the proof of Theorem \ref{thm:main}.
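As an illustrative sanity check of the rate, under the example schedule $\b_k = \frac{\b_{1}}{\log 2}\sqrt{k}\log^2(k+1)$ mentioned in Section~\ref{sec:AL algorithm} (and recalling that $\s_k\le\s_1$ by Step 4 of Algorithm~\ref{Algo:2}), the right-hand side above is of order
\begin{align*}
\frac{1}{\b_{k-1}^2} = O\left( \frac{1}{k\log^4 (k+1)} \right),
\end{align*}
that is, the combined stationarity and feasibility measure of Theorem~\ref{thm:main} decays at roughly a $1/k$ rate, up to logarithmic factors.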
diff --git a/ICML19/Drafts/sections/experiments.tex b/ICML19/Drafts/sections/experiments.tex index a292603..57dbb56 100644 --- a/ICML19/Drafts/sections/experiments.tex +++ b/ICML19/Drafts/sections/experiments.tex @@ -1 +1,67 @@ -\section{Experiments \label{sec:experiments}} \ No newline at end of file +\section{Experiments \label{sec:experiments}} + +\subsection{Latent Group Lasso \label{sec:latent lasso}} + +For a collection $\Omega=\{\Omega_i\}_{i=1}^{I}$ of subsets of $[1:p]$, the latent group Lasso norm on $\RR^p$ takes $z\in \RR^p$ to +\begin{align} +\|z\|_{\Omega,1} = \sum_{i=1}^I \| z_{\Omega_i} \|. +\end{align} +Note that we do not require the subsets $\Omega_i$ to be disjoint. For $B\in \RR^{n\times p}$, $b\in \RR^n$, and $\lambda>0$, consider the latent group Lasso problem +\begin{align} +\min_{z\in \RR^p} \frac{1}{2}\| Bz - b\|_2^2+ \lambda \| z\|_{\Omega,1}. +\label{eq:group_lasso} +\end{align} +When $\Omega$ is not a partition of $[1:p]$, computing the proximal operator of $\|\cdot\|_{\Omega,1}$ is often intractable, ruling out proximal gradient descent and similar algorithms for solving Program~\eqref{eq:group_lasso}. Instead, Program~\eqref{eq:group_lasso} is often solved with the Alternating Direction Method of Multipliers (ADMM). More concretely, ??? + +We take a radically different approach here and cast Program~\eqref{eq:group_lasso} as an instance of Program~\eqref{prob:01}. More specifically, let $z^+,z^-\in \RR^p$ be the positive and negative parts of $z$, so that $z=z^+-z^-$. Let us also introduce a nonnegative $u\in \RR^p$ such that $z^++z^- = u^{\circ 2}$, where $\circ$ denotes the entrywise power. We may now write that +\begin{align} +\|z\|_{\Omega,1} = \| z^++z^- \|_{\Omega,1} +& = \| u^{\circ 2} \|_{\Omega,1 } =: \|u\|_{\Omega,2}^2. +\end{align} +Unlike $\|\cdot\|_{\Omega,1}$, note that $\|\cdot\|_{\Omega,2}^2$ is differentiable and, in fact, there exists a positive semidefinite matrix $Q_\Omega\in \RR^{p\times p}$ such that $\|u\|_{\Omega,2}^2=u^\top Q_\Omega u$. Let us form $x=[(z^+)^\top,(z^-)^\top, u^\top]^\top\in \RR^{3p}$ and set +\begin{align*} +f(x) = \frac{1}{2}\| Bz^+-Bz^- -b\|_2^2+ \lambda \|u\|_{\Omega,2}^2, +\end{align*} +\begin{align} +g(x) = 0, \qquad A(x) = z^++z^--u^{\circ 2}. +\end{align} +We can now apply Algorithm~\ref{Algo:2} to solve Program~\eqref{prob:01} with $f,g,A$ specified above. + + +\paragraph{Convergence rate.} +Clearly, $f,A$ are strongly smooth in the sense that \eqref{eq:smoothness basic} holds for appropriate $\lambda_f,\lambda_A$. Likewise, both $\nabla f$ and the Jacobian $DA$ are continuous, and the restricted Lipschitz constants $\lambda'_f,\lambda'_A$ in \eqref{eq:defn_lambda'} are consequently well-defined and finite for a fixed $\rho'>0$. We next verify the key regularity condition in Theorem~\ref{thm:main}, namely, \eqref{eq:regularity}. Note that +\begin{align*} +DA(x) & = +\left[ +\begin{array}{ccc} +I_p & I_p & -2\text{diag}(u) +\end{array} +\right]\in \RR^{p\times 3p}, +\end{align*} +\begin{align*} +-DA(x)^\top A(x) = +\left[ +\begin{array}{c} +-z^+-z^-+u^{\circ 2} \\ +-z^+-z^-+u^{\circ 2}\\ +2\text{diag}(u)( z^++z^--u^{\circ 2}) +\end{array} +\right], +\end{align*} +\begin{align*} +& \text{dist} \left( -DA(x)^\top A(x), \frac{\partial g(x)}{\b} \right) \nonumber\\ +& = \text{dist} \left( -DA(x)^\top A(x), \{0\} \right) \nonumber\\ +& = \left\| -DA(x)^\top A(x) \right\| \nonumber\\ +& = \sqrt{ 2\|A(x)\|^2 + 4\| \text{diag}(u) A(x)\|^2 } \nonumber\\ +& \ge \sqrt{2} \| A(x)\|, +\end{align*} +namely, \eqref{eq:regularity} holds with $\nu=\sqrt{2}$.
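Since $g=0$ in this example, Step 2 of Algorithm~\ref{Algo:2} simply asks for a point $x_{k+1}$ with $\|\nabla_x \L_{\b_k}(x_{k+1},y_k)\|\le \epsilon_{k+1}$, which can be obtained, for instance, by running gradient descent on the smooth function $\L_{\b_k}(\cdot,y_k)$. As a sketch, using only \eqref{eq:Lagrangian}, the gradient that such a subroutine drives to (near) zero is
\begin{align*}
\nabla_x \L_{\b_k}(x,y_k) = \nabla f(x) + DA(x)^\top \big( y_k + \b_k A(x) \big),
\end{align*}
with $f$, $A$, and $DA$ as specified above for the latent group Lasso.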
+ + + + + +\subsection{Clustering} + + + +\subsection{Learning with Generative Models} \ No newline at end of file
diff --git a/ICML19/Drafts/sections/guarantees.tex b/ICML19/Drafts/sections/guarantees.tex index 12cac53..89d7866 100644 --- a/ICML19/Drafts/sections/guarantees.tex +++ b/ICML19/Drafts/sections/guarantees.tex @@ -1,31 +1,39 @@ \section{Convergence Rate \label{sec:cvg rate}} Let us now quantify the global convergence rate of our algorithm; the proof is deferred to Appendix~\ref{sec:theory}. \begin{theorem}[\textbf{Inexact AL}] \label{thm:main} -For sufficiently large integers $k_0\le k_1$, consider the interval $K=[k_0:k_1]$. For $\rho'>0$, in addition to the strong smoothness of $f$ and $A$ in (\ref{eq:smoothness basic}), let us define +In addition to the strong smoothness of $f$ and $A$ in (\ref{eq:smoothness basic}) and for $\rho'>0$, let us define \begin{align} \lambda'_f = \max_{\|x\|\le \rho'} \|\nabla f(x)\|, \qquad \lambda'_A = \max_{\|x\| \le \rho'} \|DA(x)\|, \end{align} -to be the (restricted) Lipschitz constants of $f$ and $A$, respectively. Let $\{x_k\}_{k\in K}$ be the output sequence of Algorithm~\ref{Algo:2}, and suppose that NSC in Definition~\ref{defn:nonconvex slater} holds with the subspace -\begin{align*} -S =\operatorname{affhull} -\left\{ P_{\frac{\partial g(x_{k})}{\b_{k}} } \left( DA(x_{k})^\top A(x_{k}) \right) \right\}_{k\in K+1}, -\end{align*} -and with -\begin{align} -\nu(g,A,S) \ge \frac{2(\lambda'_f+\lambda'_A y_{\max})}{ \b_1 \rho}, +to be the (restricted) Lipschitz constants of $f$ and $A$, respectively. + For sufficiently large integers $k_0\le k_1$, consider the interval $K=[k_0:k_1]$. Let $\{x_k\}_{k\in K}$ be the output sequence of Algorithm~\ref{Algo:2} and, for $\nu>0$, assume that + \begin{align} +\nu \|A(x_k)\| +& \le \dist\left( -DA(x_k)^\top A(x_k) , \frac{\partial g(x_k)}{ \b_{k-1}} \right), +\label{eq:regularity} \end{align} -for $y_{\max}<\infty$ specified in (\ref{eq:dual growth}). Above, $\operatorname{affhull}$ returns the affine hull of a set. +for every $k\in K$. Then the output of the inexact AL in Algorithm~\ref{Algo:2} satisfies \begin{align} & \dist^2( -\nabla_x \L_{\b_{k-1}}(x_k,y_k),\partial g(x_k)) + \| A(x_k)\|^2 \nonumber\\ -& = \frac{1}{\b_{k-1}^2}\left( \frac{8 \lambda'_A\s_k^2 + 4}{ \nu_A^2} ( \lambda'_f+\lambda'_A y_{\max})^2 + 2\right). +& \le \frac{1}{\b_{k-1}^2}\left( \frac{8 {\lambda'_A}^2\s_k^2 + 4}{ \nu^2} ( \lambda'_f+\lambda'_A y_{\max})^2 + 2\right), \end{align} +for every $k\in K$, where $y_{\max}<\infty$ is the bound on the dual sequence $\{y_k\}_k$ specified in \eqref{eq:dual growth}. - \end{theorem} -\textbf{Please add a thorough discussion of the result. } +A few remarks are in order about Theorem~\ref{thm:main}. + +\paragraph{Tradeoff.} +Roughly speaking, Theorem~\ref{thm:main} states that the iterates of Algorithm~\ref{Algo:2} approach first-order stationarity for Program~\eqref{prob:01} at the rate of $1/\b_k^2$, where $\b_k$ is the penalty weight of the augmented Lagrangian at iteration $k$. Note the inherent tradeoff here: for a more rapidly increasing sequence $\{\b_k\}_k$, the iterates approach stationarity faster, but the computational cost of the inexact primal subroutine (Step 2) is likely to increase. That is, there is a tradeoff between the number of outer and inner loop iterations of Algorithm~\ref{Algo:2}. We highlight this tradeoff for a number of subroutines below. + +\paragraph{Regularity.} The key condition in Theorem~\ref{thm:main} is \eqref{eq:regularity}. As the penalty weight $\b_k$ grows, the primal subroutine (Step 2) places an increasing emphasis on reducing the feasibility gap, and \eqref{eq:regularity} formalizes this intuition.
We verify this condition rigorously in several examples below. + +We argue that such a condition is necessary for controlling the feasibility gap of Program~\eqref{prob:01} and ensuring the success of Algorithm~\ref{Algo:2}. To motivate this condition, recall that Slater's condition plays a key role in convex optimization as a sufficient condition for strong duality. As a result, Slater's condition guarantees the success of a variety of primal-dual algorithms for constrained convex programming. As a visual example, in Program~\eqref{prob:01}, when $f=0$, $g=1_C$ is the indicator function of a convex set $C$, and $A$ is a linear operator, Slater's condition rules out pathological cases by ensuring that the feasible affine subspace is not tangent to $C$; see Figure~\ref{fig:convex_slater}. In such pathological cases, finding a feasible point of Program~\eqref{prob:01} is particularly difficult. For example, the iterates of the alternating projection algorithm (which iteratively projects onto $\Null(A)$ and $C$) can converge arbitrarily slowly, as shown in Figure~\ref{fig:convex_slater}. + +\textbf{We should add a figure here with circle and line.} + +
diff --git a/ICML19/Drafts/sections/introduction.tex b/ICML19/Drafts/sections/introduction.tex index 99eb575..5018ac7 100644 --- a/ICML19/Drafts/sections/introduction.tex +++ b/ICML19/Drafts/sections/introduction.tex @@ -1,59 +1,59 @@ \newpage \section{Introduction} \label{intro} We study the non-convex optimization program \begin{equation} \label{prob:01} \begin{cases} \underset{x}{\min}\,\, f(x)+g(x)\\ A(x) = 0, \end{cases} \end{equation} where (possibly nonconvex) $f:\mathbb{R}^d\rightarrow\mathbb{R}$ and (possibly nonlinear) $A:\mathbb{R}^d\rightarrow\mathbb{R}^m$ satisfy \begin{align*} \| \nabla f(x) - \nabla f(x')\| \le \lambda_f \|x-x'\|, \end{align*} \begin{align} \| DA(x) - DA(x') \| \le \lambda_A \|x-x'\|, \label{eq:smoothness basic} \end{align} -for every $x,x'\in \mathbb{R}^d$. Above, $\nabla f(x)\in \mathbb{R}^d$ is the gradient of $f$ and $DA(x)\in \mathbb{R}^{m\times d}$ is the Jacobian of $A$. Moreover, we assume that $g:\mathbb{R}^d\rightarrow\mathbb{R}$ is convex. For instance, for a convex set $C\subset\RR^d$ and letting $g=1_C$ be the convex indicator function on $C$, Program~\eqref{prob:01} is a noncnovex optimization problem with the convex constraint $x\in C$. Throughout, we only assume that computing the proximal operator of $g$ is easy and inexpensive, see Section~\ref{sec:preliminaries}. +for every $x,x'\in \mathbb{R}^d$. Above, $\nabla f(x)\in \mathbb{R}^d$ is the gradient of $f$ and $DA(x)\in \mathbb{R}^{m\times d}$ is the Jacobian of $A$. Moreover, we assume that $g:\mathbb{R}^d\rightarrow\mathbb{R}$ is convex (but not necessarily smooth). For instance, for a convex set $C\subset\RR^d$ and letting $g=1_C$ be the convex indicator function on $C$, Program~\eqref{prob:01} is a nonconvex optimization problem with the convex constraint $x\in C$. Throughout, we only assume that computing the proximal operator of $g$ is easy and inexpensive; see Section~\ref{sec:preliminaries}. A broad range of problems in computer science \cite{khot2011grothendieck, lovasz2003semidefinite}, machine learning \cite{mossel2015consistency, song2007dependence}, and signal processing \cite{singer2011angular, singer2011three} naturally fall under this template, including but not limited to maximum cut, clustering, the generalized eigenvalue problem, and community detection. See Section ? for several examples.
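As one illustrative instance (a sketch only), consider the semidefinite relaxation of maximum cut after a \emph{Burer-Monteiro factorization} $X=VV^\top$: for a symmetric matrix $C\in\RR^{n\times n}$ determined by the graph at hand, one solves
\begin{align*}
\begin{cases}
\underset{V\in\RR^{n\times r}}{\min}\,\, \langle C, VV^\top \rangle\\
\|V_{i,:}\|^2 = 1, \qquad i=1,\ldots,n,
\end{cases}
\end{align*}
where $V_{i,:}$ denotes the $i$-th row of $V$. This matches Program~\eqref{prob:01} with $x=V$, $f(V)=\langle C, VV^\top\rangle$, $g=0$, and $A(V)=\big( \|V_{i,:}\|^2-1 \big)_{i=1}^n$; both $f$ and $A$ are smooth, so \eqref{eq:smoothness basic} holds.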
\textbf{We should explain this better and give more high level examples to motivate this program.} %An example of our template in \eqref{prob:01} is semi-definite programming which provides powerful relaxations to above problems. Denote the space of $d'\times d'$ symmetric matrices by $\mathbb{S}^{d'\times d'}$ and consider % %\begin{equation} % \label{sdp} % \min_x \{h'(x): A'(x) = b', ~~x \in \mathcal{C'}~~\text{and}~~x\succeq0 \} %\end{equation} % %where $h': \mathbb{S}^{d'\times d'} \to \RR$, $A'\colon\mathbb{S}^{d'\times d'}\to\RR^m$, $b'\in\RR^m$, and $C' \subseteq \mathbb{R}^{d'\times d'}$. This template clearly can be put to the form of \eqref{prob:01} by using \emph{Burer-Monteiro factorization} \cite{burer2003nonlinear, burer2005local}. The \emph{augmented Lagrangian method} \cite{luenberger1984linear} is a powerful approach to solve Program \eqref{prob:01}; see Section \ref{sec:related work} for a review of the related literature as well as other approaches to solving Program \eqref{prob:01}. Indeed, for positive $\beta$, it is easy to verify that Program \eqref{prob:01} is equivalent to \begin{align} \min_{x} \max_y \,\,\mathcal{L}_\beta(x,y) + g(x), \label{eq:minmax} \end{align} where \begin{align} \label{eq:Lagrangian} \mathcal{L}_\beta(x,y) := f(x) + \langle A(x), y \rangle + \frac{\beta}{2}\|A(x)\|^2 \end{align} is the augmented Lagrangian corresponding to Program~\eqref{prob:01}. The equivalent formulation in Program \eqref{eq:minmax} naturally suggests the following algorithm to solve Program \eqref{prob:01}: \begin{equation}\label{e:exac} x_{k+1} \in \underset{x}{\argmin} \, \mathcal{L}_{\beta}(x,y_k)+g(x), \end{equation} \begin{equation} y_{k+1} = y_k+\s_k A(x_{k+1}), \end{equation} for dual step sizes $\{\s_k\}_k$. However, updating $x$ in the augmented Lagrangian method requires solving the non-convex Program~\eqref{e:exac} to global optimality, which is typically intractable. It is instead often easier to find an approximate first-order stationary point of Program~\eqref{e:exac} using, for example, the proximal gradient descent algorithm. Therefore, we consider an algorithm that only requires approximate stationarity in every iteration. Moreover, by gradually decreasing the stationarity tolerance and increasing the penalty weight $\b$ above, we can reach a stationary point of Program~\eqref{prob:01}, as detailed in Section~\ref{sec:AL algorithm}. \paragraph{Contributions.} \textbf{To be written...}