diff --git a/ICML19/Drafts/sections/AL.tex b/ICML19/Drafts/sections/AL.tex index 85568e6..3953dd9 100644 --- a/ICML19/Drafts/sections/AL.tex +++ b/ICML19/Drafts/sections/AL.tex @@ -1,53 +1,53 @@ \section{Algorithm \label{sec:AL algorithm}} To solve the equivalent formulation presented in Program~\eqref{eq:minmax}, we propose the inexact ALM (iALM), detailed in Algorithm~\ref{Algo:2}. - Iteration $k$ of Algorithm~\ref{Algo:2} calls a subroutine that finds an approximate stationary point of the augmented Lagrangian $\L_{\b_k}(\cdot,y_k)$ with the accuracy of $\epsilon_{k+1}$ (Step 2), and this accuracy gradually inccreases. + Iteration $k$ of Algorithm~\ref{Algo:2} calls a solver that finds an approximate stationary point of the augmented Lagrangian $\L_{\b_k}(\cdot,y_k)$ with the accuracy of $\epsilon_{k+1}$ (Step 2), and this accuracy gradually increases. The increasing sequence $\{\b_k\}_k$ and the dual update (Steps 4 and 5) are responsible for gradually enforcing the constraints in Program~\eqref{prob:01}. As we will see in the convergence analysis, the particular choice of the dual step size $\s_k$ in Algorithm~\ref{Algo:2} ensures that the dual variable $y_k$ remains bounded, see \cite{bertsekas1976penalty} for a precedent in the ALM literature. Step 3 of Algorithm~\ref{Algo:2} removes pathological cases with divergent iterates. As an example, suppose that $g=1_\mathcal{X}$ in Program~\eqref{prob:01} is the indicator function on a bounded convex set $\mathcal{X}\subset \RR^d$ and take $\rho' > \max_{x\in \mathcal{X}} \|x\|$. Then, for sufficiently large $k$, it is not difficult to verify that all the iterates of Algorithm~\ref{Algo:2} automatically satisfy $\|x_k\|\le \rho'$ without the need to enforce Step 3. \begin{algorithm}[h!] \begin{algorithmic} \STATE \textbf{Input:} $\rho,\rho',\rho''>0$. A non-decreasing, positive, unbounded sequence $\{\b_k\}_{k\ge 1}$, stopping threshold $\tau$. 
\STATE \textbf{Initialization:} $x_{1}\in \RR^d$ such that $\|A(x_1)\|\le \rho$ and $\|x_1\|\le \rho'$, $y_0\in \RR^m$, $\s_1$. %For $k=1,2,\ldots$, execute\\ \FOR{$k=1,2,\dots$} \STATE \begin{enumerate}[leftmargin=*] \item \textbf{(Update tolerance)} $\epsilon_{k+1} = 1/\b_k$. %\begin{align} %\beta_k = \frac{\beta_{1}}{\log 2}\sqrt{k}\log^2(k+1) . %\end{align} \item \textbf{(Inexact primal solution)} Obtain $x_{k+1}\in \RR^d$ such that \begin{equation*} \dist(-\nabla_x \L_{\beta_k} (x_{k+1},y_k), \partial g(x_{k+1}) ) \le \epsilon_{k+1} \end{equation*} - for first order stationarity and + for first-order stationarity and, in addition, \begin{equation*} -\lambda_{\text{min}}(\nabla _{xx}\mathcal{L}_{\beta_k}(x_{k+1}, y_k)) > -\epsilon_{k+1} +\lambda_{\text{min}}(\nabla _{xx}\mathcal{L}_{\beta_k}(x_{k+1}, y_k)) \ge -\epsilon_{k+1} \end{equation*} for second-order stationarity. \item \textbf{(Control)} If necessary, project $x_{k+1}$ to ensure that $\|x_{k+1}\|\le \rho'$.\\ \item \textbf{(Update dual step size)} \begin{align*} \s_{k+1} & = \s_{1} \min\Big( \frac{\|A(x_1)\| \log^2 2 }{\|A(x_{k+1})\| (k+1)\log^2(k+2)} ,1 \Big). \end{align*} \item \textbf{(Dual ascent)} $y_{k+1} = y_{k} + \sigma_{k+1}A(x_{k+1})$. \item \textbf{(Stopping criterion)} If \begin{align*} & \dist^2(-\nabla_x \L_{\b_k}(x_{k+1},y_k),\partial g(x_{k+1})) \nonumber\\ & \qquad + \s_{k+1} \|A(x_{k+1})\|^2 \le \tau, \end{align*} then quit and return $x_{k+1}$ as an (approximate) stationary point of~\eqref{prob:01}. 
\end{enumerate} \ENDFOR \end{algorithmic} \caption{Inexact AL for solving~\eqref{prob:01}} \label{Algo:2} \end{algorithm} diff --git a/ICML19/Drafts/sections/guarantees.tex b/ICML19/Drafts/sections/guarantees.tex index cd56c80..917de20 100644 --- a/ICML19/Drafts/sections/guarantees.tex +++ b/ICML19/Drafts/sections/guarantees.tex @@ -1,158 +1,171 @@ \section{Convergence Rate \label{sec:cvg rate}} In this section, we investigate the convergence rate of Algorithm~\ref{Algo:2} for finding first-order and second-order stationary points, along with the iteration complexity results. All the proofs are deferred to Appendix~\ref{sec:theory} for the clarity of exposition. Theorem~\ref{thm:main} below characterizes the convergence rate of Algorithm~\ref{Algo:2} for finding first-order stationary points in terms of the number of outer iterations.\\ {\color{red}{Ahmet: I think that it is possible to remove sufficiently large k0 assumption here. }} \textbf{AE: You're likely right but it might not be worth your time right now.} -\begin{theorem} +\begin{theorem}\textbf{\emph{(convergence rate)}} \label{thm:main} Suppose that $f$ and $A$ are smooth in the sense defined in (\ref{eq:smoothness basic}). For $\rho'>0$, let \begin{align} \lambda'_f = \max_{\|x\|\le \rho'} \|\nabla f(x)\|, ~~ \lambda'_A = \max_{\|x\| \le \rho'} \|DA(x)\|, \label{eq:defn_restricted_lipsichtz} \end{align} be the (restricted) Lipschitz constants of $f$ and $A$, respectively. For sufficiently large integers $k_0\le k_1$, consider the interval $K=[k_0:k_1]$. Let $\{x_k\}_{k\in K}$ be the output sequence of Algorithm~\ref{Algo:2} on the interval $K$ and, for $\nu>0$, assume that \begin{align} \nu \|A(x_k)\| & \le \dist\left( -DA(x_k)^\top A(x_k) , \frac{\partial g(x_k)}{ \b_{k-1}} \right), \label{eq:regularity} \end{align} -for every $k\in K$. -Then, $x_k$ is an $\epsilon_{k,f}$ first-order stationary point of Program~\ref{prob:01} with +for every $k\in K$. 
We have two cases: +\begin{itemize}[leftmargin=*] +\item If a first-order solver is used in Step~2, then $x_k$ is an $(\epsilon_{k,f},\b_k)$ first-order stationary point of Program~(\ref{prob:01}) with %\begin{align} %& \dist^2( -\nabla_x \L_{\b_{k-1}}(x_k,y_k),\partial g(x_k)) + \| A(x_k)\|^2 \nonumber\\ %& = \frac{1}{\b_{k-1}^2}\left( \frac{8 \lambda'_A\s_1^2 + 4}{ \nu^2} ( \lambda'_f+\lambda'_A y_{\max})^2 + 2\right) \nonumber \\ %&=: \frac{Q(f,g,A)}{\beta_{k-1}^2}, %\end{align} \begin{align} \epsilon_{k,f} & = \frac{1}{\beta_{k-1}^2} \left( \frac{8 \lambda'_A\s_1^2 + 4}{ \nu^2} ( \lambda'_f+\lambda'_A y_{\max})^2 + 2 \right) \nonumber \\ &=: \frac{Q}{\beta_{k-1}^2}, +\label{eq:stat_prec_first} \end{align} for every $k\in K$, where the expression for $y_{\max}=y_{\max}(x_1,y_0,\s_1)$ is given in (\ref{eq:dual growth}), and the dependence of $Q$ on $f,g,A,\s_1$ is suppressed for brevity. +\item If a second-order solver is used in Step~2, then $x_k$ is an $(\epsilon_{k,f}, \epsilon_{k,s},\b_k)$ second-order stationary point of Program~(\ref{prob:01}) with $\epsilon_{k,f}$ specified above and with +\begin{align} +\epsilon_{k,s} & = \frac{C}{\beta_{k-1}}+ \epsilon_{k-1}. +\end{align} +\textbf{We should try to get the constant C since we get the constants elsewhere for consistency.} +\end{itemize} + + \end{theorem} %A few remarks are in order about Theorem~\ref{thm:main}. %\textbf{Tradeoff.} %Roughly speaking, Theorem~\ref{thm:main} states that the iterates of Algorithm~\ref{Algo:2} approach first-order stationarity for~\eqref{prob:01} at the rate of $1/{\b_k}$, where $\b_k$ is the penalty weight in AL in iteration $k$ as in~\eqref{eq:Lagrangian}. Note the inherent tradeoff here: For a faster sequence $\{\b_k\}_k$, the iterates approach stationarity faster but the computational cost of the inexact primal subroutine (Step 2) increases since a higher accuracy is required and Lipschitz constant of the unconstrained problem is also affected by $\beta_k$. 
In words, there is a tradeoff between the number of outer and inner loop iterations of Algorithm~\ref{Algo:2}. We highlight this tradeoff for a number of subroutines below. -Loosely speaking, Theorem~\ref{thm:main} states that Algorithm~\ref{Algo:2} converges to a first-order stationary point of Program~\eqref{prob:01} at the rate of $1/\b_k^2$. A few remarks about Theorem~\ref{thm:main} are in order. +Loosely speaking, Theorem~\ref{thm:main} states that Algorithm~\ref{Algo:2} converges to a (first- or second-) order stationary point of Program~\eqref{prob:01} at the rate of $1/\b_k^2$. A few remarks about Theorem~\ref{thm:main} are in order. \textbf{Regularity.} The key condition in Theorem~\ref{thm:main} is \eqref{eq:regularity} which, broadly speaking, controls the geometry of the problem. -As the penalty weight $\b_k$ grows, the primal subroutine (Step~2) places an increasing emphasis on reducing the feasibility gap and \eqref{eq:regularity} formalizes this intuition. +As the penalty weight $\b_k$ grows, the primal solver (Step~2) places an increasing emphasis on reducing the feasibility gap and \eqref{eq:regularity} formalizes this intuition. In contrast to most of the conditions in the nonconvex optimization literature~\cite{flores2012complete} \textbf{AE: the way the previous sentence is phrased suggests more than one reference. can we add more?}, condition~\eqref{eq:regularity} appears to be easier to check. Indeed, we verify this condition rigorously in several applications in Section~\ref{sec:experiments}. -By giving an example, we argue next that such a condition is necessary for controlling the feasibility gap of Program~\eqref{prob:01} and ensuring the success of Algorithm~\ref{Algo:2}. Consider the case where $f=0$, $g=\delta_\mathcal{X}$ where $\mathcal{X}$ is a convex set, and $A$ is a linear operator. 
In this case, solving Program~\eqref{prob:01} finds a point in $\mathcal{X}\cap \text{null}(A)$, where $\text{null}(A)=\{ x\in\mathbb{R}^d: A(x) = 0 \} \subset \RR^d$ is the null space of $A$. -Here, the Slater's condition that +Let us provide an example to argue that such a condition is necessary for controlling the feasibility gap of Program~\eqref{prob:01} and ensuring the success of Algorithm~\ref{Algo:2}. Consider the case where $f=0$, $g=\delta_\mathcal{X}$ where $\mathcal{X}$ is a convex set, and $A$ is a linear operator. In this case, solving Program~\eqref{prob:01} finds a point in $\mathcal{X}\cap \text{null}(A)$, where the subspace $\text{null}(A)=\{ x\in\mathbb{R}^d: A(x) = 0 \} \subset \RR^d$ is the null space of $A$. +Here, the Slater's condition requires that \begin{equation} \text{relint} (\mathcal{X}) \cap \text{null}(A)\neq \emptyset. \end{equation} -In general, the Slater's condition plays a key role in convex optimization as a sufficient condition for strong duality and, as a result, guarantees the success of a variety of primal-dual algorithms for linearly-constrained convex programs. Intuitively, the Slater's condition removes any pathological cases by ensuring that the subspace $\text{null}(A)$ is not tangent to $\mathcal{X}$, see Figure~\ref{fig:convex_slater}. In such pathological cases, solving Program~\eqref{prob:01}, namely, finding a point in $\text{null}(A)=\{ x\in\mathbb{R}^d: A(x) = 0 \} \subset \RR^d$, can be particularly difficult. For instance, the iterates of the alternating projection algorithm (which iteratively projects onto $\null(A)$ and $\mathcal{X}$) have arbitrarily slow convergence, as show in Figure~\ref{fig:conve_slater}. 
- -{\color{red}{We should add a figure here with circle and line.}} +In general, the Slater's condition plays a key role in convex optimization as a sufficient condition for strong duality and, as a result, guarantees the success of a variety of primal-dual algorithms for linearly-constrained convex programs. Intuitively, the Slater's condition here removes any pathological cases by ensuring that the subspace $\text{null}(A)$ is not tangent to $\mathcal{X}$, see Figure~\ref{fig:convex_slater}. In such pathological cases, solving Program~\eqref{prob:01}, namely, finding a point in $\mathcal{X}\cap \text{null}(A)$, can be particularly difficult. For instance, the alternating projection algorithm (which iteratively projects onto $\mathcal{X}$ and $\text{null}(A)$) has arbitrarily slow convergence, see Figure~\ref{fig:convex_slater}. +\begin{figure} +\begin{center} +\includegraphics[scale=.5]{Slater.pdf} +\end{center} +\caption{Some regularity is required for Algorithm~\ref{Algo:2} to succeed, as detailed after Theorem~\ref{thm:main}. \label{fig:convex_slater}} +\end{figure} +\textbf{Computational complexity.} Theorem~\ref{thm:main} allows us to specify the number of iterations of Algorithm~\ref{Algo:2} required to reach a near-stationary point of Program~\eqref{prob:01} with a prescribed precision and, in particular, can specify the number of calls made to the solver in Step~2. In this sense, Theorem~\ref{thm:main} does not fully capture the computational complexity of Algorithm~\ref{Algo:2}, as it does not take into account the computational cost of the solver in Step~2. -\textbf{Computational complexity.} Theorem~\ref{thm:main} allows us to specify the number of iterations of Algorithm~\ref{Algo:2} required to reach a near-stationary point of Program~\eqref{prob:01} with a prescribed precision and, in particular, can specify the number of calls made to the subroutine in Step~2. 
In this sense, Theorem~\ref{thm:main} does not fully capture the computational complexity of Algorithm~\ref{Algo:2}, as it does not take into account the computational cost of the subroutine. +To better understand Algorithm~\ref{Algo:2}, we consider two scenarios in what follows. In the first scenario, we take the solver in Step~2 to be the Accelerated Proximal Gradient Method (APGM), a well-known first-order algorithm~\cite{ghadimi2016accelerated}. In the second scenario, we will use the second-order trust region method developed in~\cite{cartis2012complexity}. -To better understand Algorithm~\ref{Algo:2}, we consider two scenarios in what follows. In the first scenario, we take the subroutine in Step~2 to be the Accelerated Proximal Gradient Method (APGM), a well-known first-order algorithm~\cite{ghadimi2016accelerated}. In the second scenario, we will use the second-order trust region method developed in~\cite{cartis2012complexity}. +\subsection{First-Order Solver} -\subsection{First-Order Subroutine} - -Let us first consider the case where the subroutine in Step~2 is the well-known Accelerated Proximal Gradient Method (APGM)~\cite{ghadimi2016accelerated}. \textbf{AE: Ahmet to give a brief description of APGM here. } +\textbf{AE: we go back and forth between "subroutine" and "solver". for consistency, I'm just using "solver" everywhere.} +Let us first consider the case where the solver in Step~2 is the well-known Accelerated Proximal Gradient Method (APGM)~\cite{ghadimi2016accelerated}. \textbf{AE: Ahmet to give a brief description of APGM here. } %First, we characterize the iteration complexity of Algorithm~\ref{Algo:2} for finding a first-order stationary point of~\eqref{prob:01}. %We propose to use the standard accelerated proximal method (APGM), guarantees of which are analyzed in~\cite{ghadimi2016accelerated} for nonconvex problems of the form~\eqref{e:exac}. 
Suppose that $g=1_\mathcal{X}$ is the indicator function on a bounded convex set $\mathcal{X}\subset \RR^d$ and let \begin{align} -x_{\max}= \max_{x\in \mathcal{X}}\|x\|. +x_{\max}= \max_{x\in \mathcal{X}}\|x\| \end{align} +be the radius of a ball centered at the origin that includes $\mathcal{X}$. Then, adapting the results in~\cite{ghadimi2016accelerated} to our setup, APGM reaches $x_{k}$ in Step 2 of Algorithm~\ref{Algo:2} after \begin{equation} \mathcal{O}\left ( \frac{\lambda_{\beta_{k-1}}^2 x_{\max}^2 }{\epsilon_k} \right) \label{eq:iter_1storder} \end{equation} -(inner) iterations, where $\lambda_{\beta_{k-1}}$ denotes the Lipschitz constant of the gradient of ${\mathcal{L}_{\beta_{k-1}}(\cdot, y)}$, bounded in~\eqref{eq:smoothness of Lagrangian}. +(inner) iterations, where $\lambda_{\beta_{k-1}}$ denotes the Lipschitz constant of $\nabla_x{\mathcal{L}_{\beta_{k-1}}(x, y)}$, bounded in~\eqref{eq:smoothness of Lagrangian}. \textbf{AE: I made the bounds worse a bit but simplified them.} -Using \eqref{eq:iter_1storder}, we derive the following corollary, describing the total computational complexity of Algorithm~\ref{Algo:2} in terms of the calls to the first-order oracle in APGM. +Using \eqref{eq:iter_1storder}, we derive the following corollary, describing the total computational complexity of Algorithm~\ref{Algo:2} in terms of the calls to the first-order oracle in APGM. \textbf{AE: we haven't talked about oracle before.} \begin{corollary}\label{cor:first} -For $b>1$, let $\beta_k =b^k $ for every $k$. If we use APGM from~\cite{ghadimi2016accelerated} for Step~2 of Algorithm~\ref{Algo:2}, the algorithm finds an $\epsilon_f$ first-order stationary point, see (\ref{eq:inclu3}), +For $b>1$, let $\beta_k =b^k $ for every $k$. 
If we use APGM from~\cite{ghadimi2016accelerated} for Step~2 of Algorithm~\ref{Algo:2}, the algorithm finds an $(\epsilon_f,\b_k)$ first-order stationary point, see (\ref{eq:inclu3}), in $T$ number of calls to the first-order oracle, where % \begin{equation} T = \mathcal{O}\left( \frac{Q^{\frac{3}{2}+\frac{1}{2b}} x_{\max}^2}{\epsilon^{\frac{3}{2}+\frac{1}{2b}}} \right) = \tilde{\mathcal{O}}\left( \frac{Q^{\frac{3}{2}} x_{\max}^2}{\epsilon^{\frac{3}{2}}} \right). \end{equation} \end{corollary} -For Algorithm~\ref{Algo:2}, to reach an accuracy of $\epsilon_f$ with Algorithm~\ref{Algo:2} and with the lowest computational cost, one needs to set $b$ in Corollary~\ref{cor:first} as a function of $\epsilon_f$, specified by Theorem~\ref{thm:main} and the corresponding subroutine, and then solve the augmented Lagrangian subproblem only once for the penalty weight $\b_1=b$. However, the constants in the bound are unknown and this approach is consequently intractable in general. Instead, the homotopy approach taken by Algorithm~\ref{Algo:2} ensures achieving the desired accuracy by gradually increasing the penalty weight. The number of outer iterations in Algorithm~\ref{Algo:2} depend only logarithmically on the desired accuracy, as detailed in the proof of Corollary~\ref{cor:first}. - +For Algorithm~\ref{Algo:2}, to reach a near-stationary point with an accuracy of $\epsilon_f$ in the sense of \eqref{eq:inclu3} and with the lowest computational cost, one therefore needs to perform one iteration of Algorithm~\ref{Algo:2}, with $\b_1=b$ specified as a function of $\epsilon_f$ by \eqref{eq:stat_prec_first} in Theorem~\ref{thm:main}. However, the constants in the bound are unknown and this approach is consequently intractable in general. Instead, the homotopy approach taken by Algorithm~\ref{Algo:2} ensures achieving the desired accuracy by gradually increasing the penalty weight. 
This homotopy approach increases the computational cost of Algorithm~\ref{Algo:2} only by a factor logarithmic in $\epsilon_f$, as detailed in the proof of Corollary~\ref{cor:first}. +\textbf{AE: if we find some time, i'll add a couple of lines for how 1st order opt implies 2nd order opt for smooth constraints.} -\subsection{Second-Order Subroutine} +\subsection{Second-Order Solver} {\color{red}{Ahmet (note to myself): not sure of the constants of trust-region, check again}} \\ -In this subsection, we focus on finding second-order stationary points of~\eqref{prob:01}. -For this purpose, we need to use a subroutine in Step 2 of Algorithm~\ref{Algo:2} to find approximate second-order stationary points of the subproblem~\eqref{e:exac}. -As shown in~\cite{nouiehed2018convergence}, finding approximate second-order stationary points for constrained problems is in general NP-hard. -For this reason, we focus on the special case of~\eqref{prob:01} with $g=0$ in this section. - -In this case, we focus on find second-order stationary points of~\eqref{prob:01} in the sense of~\eqref{eq:sec_opt}. - -We first give a theorem to show the convergence rate of the algorithm for second order stationarity: \\ -{\color{red}{Ahmet: I think that it is possible to remove sufficiently large k0 assumption here. }} -\begin{corollary} -\label{thm:main_second} -Under the assumptions of Theorem~\ref{thm:main}, the output of Algorithm~\ref{Algo:2} satisfies -\begin{align} -%& \dist^2( -\nabla_x \L_{\b_{k-1}}(x_k,y_k),\partial g(x_k)) + \| A(x_k)\|^2 \nonumber\\ -%& = \frac{1}{\b_{k-1}^2}\left( \frac{8 \lambda'_A\s_k^2 + 4}{ \nu^2} ( \lambda'_f+\lambda'_A y_{\max})^2 + 2\right) \nonumber \\ -%&=: \frac{Q}{\beta_{k-1}^2}. -\lambda_{\text{min}}(\nabla _{xx}\mathcal{L}_{\beta_{k-1}}(x_k, y_k)) \geq - \frac{C}{\beta_{k-1}} - \epsilon_{k-1}. 
-\end{align} -\end{corollary} +In this section, we use a second-order solver in Step~2 of Algorithm~\ref{Algo:2}, which in turn allows us to find approximate second-order stationary points of Program~\eqref{prob:01}. +That is, every iteration of Algorithm~\ref{Algo:2} uses a second-order solver to find approximate second-order stationary points of the Program~\eqref{e:exac}, in the sense specified in~\eqref{eq:sec_opt}. +As shown in~\cite{nouiehed2018convergence}, finding approximate second-order stationary points of convex-constrained problems is in general NP-hard. For this reason, we focus in this section on the special case of Program~\eqref{prob:01} with $g=0$. + +%We first give a theorem to show the convergence rate of the algorithm for second order stationarity: \\ +%{\color{red}{Ahmet: I think that it is possible to remove sufficiently large k0 assumption here. }} \textbf{AE: not worth it really} +%\begin{corollary} +%\label{thm:main_second} +%Under the assumptions of Theorem~\ref{thm:main}, the output of Algorithm~\ref{Algo:2} satisfies +%\begin{align} +%%& \dist^2( -\nabla_x \L_{\b_{k-1}}(x_k,y_k),\partial g(x_k)) + \| A(x_k)\|^2 \nonumber\\ +%%& = \frac{1}{\b_{k-1}^2}\left( \frac{8 \lambda'_A\s_k^2 + 4}{ \nu^2} ( \lambda'_f+\lambda'_A y_{\max})^2 + 2\right) \nonumber \\ +%%&=: \frac{Q}{\beta_{k-1}^2}. +%\lambda_{\text{min}}(\nabla _{xx}\mathcal{L}_{\beta_{k-1}}(x_k, y_k)) \geq - \frac{C}{\beta_{k-1}} - \epsilon_{k-1}. +%\end{align} +%\end{corollary} % -We now focus on total complexity of finding second-order stationary points of~\eqref{prob:01} in terms of calls to the second-order oracle. -We consider using trust region method for Step 2 of Algorithm~\ref{Algo:2} developed in~\cite{cartis2012complexity}. 
-In~\cite{cartis2012complexity}, the complexity for finding the second order stationary points of an unconstrained problem is derived to be +We consider the trust region method for Step~2 of Algorithm~\ref{Algo:2}, developed in~\cite{cartis2012complexity}. \textbf{AE: Ahmet to add a brief description of the algorithm.} + + + +Let us compute the total computational complexity of Algorithm~\ref{Algo:2} with the trust region method in Step~2, in terms of the number of calls made to the second-order oracle. By adapting the result in~\cite{cartis2012complexity} to our setup, we find that the number of (inner) iterations required in Step~2 of Algorithm~\ref{Algo:2} to produce $x_k$ is % \begin{equation} -\mathcal{O}\left ( \frac{\lambda_{\beta, H}^2 (\mathcal{L}_{\beta}(x_1, y) - \min_{x}\mathcal{L}_{\beta}(x, y))}{\epsilon^3} \right), +\mathcal{O}\left ( \frac{\lambda_{\beta_{k-1}, H}^2 (\mathcal{L}_{\beta_{k-1}}(x_1, y) - \min_{x}\mathcal{L}_{\beta_{k-1}}(x, y))}{\epsilon_k^3} \right), +\label{eq:sec_inn_comp} \end{equation} % -where $\lambda_{\beta, H}$ is the Lipschitz constant of the Hessian of the augmented Lagrangian, which is of the order of $\beta$. -In~\cite{cartis2012complexity}, the term $\mathcal{L}_{\beta}(x_1, y) - \min_{x}\mathcal{L}_{\beta}(x, y)$ is bounded by a constant independent of $\epsilon$. -We have a similar assumption for this quantity $\forall \beta_k$. - -Consequently, we get the following result: +where $\lambda_{\beta, H}$ is the Lipschitz constant of the Hessian of the augmented Lagrangian, which is of the order of $\beta$. \textbf{AE: This is somewhat speculative. Make precise?} +In~\cite{cartis2012complexity}, the term $\mathcal{L}_{\beta}(x_1, y) - \min_{x}\mathcal{L}_{\beta}(x, y)$ is bounded by a constant independent of $\epsilon$. \textbf{AE: If so, there is perhaps no need to mention it in the above bound. It'd simplify things.} +We have a similar assumption for this quantity $\forall \beta_k$. \textbf{AE: A bit vague. 
I couldn't follow...} Using \eqref{eq:sec_inn_comp} and Theorem~\ref{thm:main}, we arrive at the following result. % -\begin{corollary}\label{cor:second} +\begin{corollary}\label{cor:second} \textbf{AE: once you converge on the statement here, please rewrite to match the language of the previous corollary as much as possible for consistency.} Let us use the following form for $\beta$ \begin{equation*} \beta_k = \alpha^k, ~~ \alpha > 1. \end{equation*} % We use the trust region method from~\cite{cartis2012complexity} for step 2 in Algorithm~\ref{Algo:2} and assume that \begin{equation} \mathcal{L}_{\beta}(x_1, y) - \min_{x}\mathcal{L}_{\beta}(x, y) \end{equation} is upper bounded by a constant independent of $\epsilon$, for all $\beta_k$. Algorithm~\ref{Algo:2} then finds an $\epsilon$-second-order stationary point of~\eqref{prob:01} in $T_K$ total number of calls to the second order oracle where % \begin{equation} T_K \leq \tilde{\mathcal{O}}\left( \frac{Q^{\frac{5}{2}}}{\epsilon^{5}} \right). \end{equation} \end{corollary} % {\color{red} Ahmet: I'll check again the constants of the corollaries, eps dependence should be correct.}\\ % diff --git a/ICML19/Drafts/sections/introduction.tex b/ICML19/Drafts/sections/introduction.tex index 93951a4..2b6f0a8 100644 --- a/ICML19/Drafts/sections/introduction.tex +++ b/ICML19/Drafts/sections/introduction.tex @@ -1,75 +1,75 @@ \section{Introduction} \label{intro} We study the nonconvex optimization problem % \begin{equation} \label{prob:01} \begin{cases} \underset{x\in \mathbb{R}^d}{\min}\,\, f(x)+g(x)\\ A(x) = b, \end{cases} \end{equation} % where $f:\mathbb{R}^d\rightarrow\mathbb{R}$ is possibly nonconvex and $A:\mathbb{R}^d\rightarrow\mathbb{R}^m$ is a possibly nonlinear operator and $b\in\mathbb{R}^m$. We assume in addition that $g:\mathbb{R}^d\rightarrow\mathbb{R}$ is convex (but not necessarily smooth). 
For instance, for a convex set $\mathcal{X}\subset\RR^d$, letting $g=\delta_\mathcal{X}$ to be the indicator function on $\mathcal{X}$, Program~\eqref{prob:01} is a nonconvex optimization problem with the convex constraint $x\in \mathcal{X}$. Our other requirement for $g$ is that computing its proximal operator is inexpensive, see Section~\ref{sec:preliminaries}. A broad range of problems in computer science \cite{khot2011grothendieck, lovasz2003semidefinite}, machine learning \cite{mossel2015consistency, song2007dependence}, and signal processing \cite{singer2011angular, singer2011three} naturally fall under the template of Program~\eqref{prob:01}, including but not limited to maximum cut, clustering, generalized eigenvalue, as well as community detection. \textbf{AE: We need to add some more specifics to this paragraph to convince people that this is an important template. What are the names of the problems you're citing? Max-cut, sparse regression, statistical learning with deep networks as prior \cite{Bora2017}, what else we can think of?} \paragraph{Vignette: Burer-Monteiro splitting} A powerful relaxation for max-cut, clustering, and several other problems above are provided by the SemiDefinite Program (SDP) of the form % \begin{equation} \label{eq:sdp} \begin{cases} \underset{X\in\mathbb{S}^{d \times d}}{\min} \langle C, X \rangle \\ \mathcal{A}(X) = b, \,\, X \succeq 0, \end{cases} \end{equation} % where $X$ is a positive semidefinite and symmetric $d\times d$ matrix, %$\mathbb{S}^{d \times d}$ denotes the set of real symmetric matrices, and $\mathcal{A}: \mathbb{S}^{d\times d} \to \mathbb{R}^m$ is a linear operator. In practice, $d$ is often very large which makes interior point methods, with their poor scaling in $d$, an unattractive option for solving Program~\eqref{eq:sdp}. Attempts to resolve this issue prompted extensive research in computationally- and memory-efficient SDP solvers. 
The first such solvers relied on the so-called Linear Minimization Oracle (LMO), reviewed in Section~\ref{sec:related work}, alongside other scalable SDP solvers. The second approach for solving Program~\eqref{prob:01}, dating back to ~\cite{burer2003nonlinear, burer2005local}, is the so-called Burer-Monteiro (BM) splitting $X=UU^\top$, where $U\in\mathbb{R}^{d\times r}$ and $r$ is selected according to the guidelines in~\cite{pataki1998rank, barvinok1995problems} to remove spurious local minima of the (nonconvex) factorized problem. The obvious advantage of such a change of variables is its low storage cost and fast iterations. This splitting forms the nonlinear-constrained, nonconvex program \vspace{-.3cm} \begin{equation} \label{prob:nc} \begin{cases} \underset{U\in\mathbb{R}^{d \times r}}{\min} \langle C, UU^\top \rangle \\ \mathcal{A}(UU^\top) = b, \end{cases} \end{equation} \vspace{-.3cm} which can be written in the form of Program~\eqref{prob:01}. %\textbf{Vignette 2: Latent group lasso} %See Section ? for several examples. \textbf{We should explain this better and give more high level examples to motivate this program.} %An example of our template in \eqref{prob:01} is semi-definite programming which provides powerful relaxations to above problems. Denote the space of $d'\times d'$ symmetric matrices by $\mathbb{S}^{d'\times d'}$ and consider % %\begin{equation} % \label{sdp} % \min_x \{h'(x): A'(x) = b', ~~x \in \mathcal{C'}~~\text{and}~~x\succeq0 \} %\end{equation} % %where $h': \mathbb{S}^{d'\times d'} \to \RR$, $A'\colon\mathbb{S}^{d'\times d'}\to\RR^m$, $b'\in\RR^m$, and $C' \subseteq \mathbb{R}^{d'\times d'}$. This template clearly can be put to the form of \eqref{prob:01} by using \emph{Burer-Monteiro factorization} \cite{burer2003nonlinear, burer2005local}. -To solve Program~\eqref{prob:nc}, the inexact Augmented Lagrangian Method (iALM) is widely used~\cite{burer2003nonlinear, kulis2007fast}, due its cheap per iteration cost and empirical success. 
Every (outer) iteration of iALM calls a solver to inexactly solve an intermediate augmented Lagrangian subproblem to near stationarity, and the user has freedom in choosing this subroutine, which could be a first-order solver (say, proximal gradient descent) or a second-order solver (such as BFGS \cite{nocedal2006numerical}). +To solve Program~\eqref{prob:nc}, the inexact Augmented Lagrangian Method (iALM) is widely used~\cite{burer2003nonlinear, kulis2007fast}, due to its cheap per iteration cost and empirical success. Every (outer) iteration of iALM calls a solver to inexactly solve an intermediate augmented Lagrangian subproblem to near stationarity, and the user has freedom in choosing this solver, which could be first-order (say, proximal gradient descent) or second-order (such as BFGS \cite{nocedal2006numerical}). However, unlike its convex analogue~\cite{nedelcu2014computational,lan2016iteration,xu2017inexact}, the convergence rate and complexity of iALM for Program~\eqref{prob:nc} are not well-understood, see Section~\ref{sec:related work} for a review of the related literature. Indeed, addressing this important theoretical gap is one of the contributions of the present work. {\textbf{Contributions.} \\[1mm] $\bullet$ We characterize the convergence rate of iALM for Program~\eqref{prob:01} with an arbitrary solver for finding first-order and second-order stationary points. \\[2mm] $\bullet$ We investigate using different solvers for augmented Lagrangian subproblems and provide overall iteration complexity bounds for finding first-order and second-order stationary points of Program~\eqref{prob:01}. \\[2mm] $\bullet$ We provide numerical evidence for the flexibility and efficiency of the framework. 
diff --git a/ICML19/Drafts/sections/preliminaries.tex b/ICML19/Drafts/sections/preliminaries.tex index e5283b9..5fb1e27 100644 --- a/ICML19/Drafts/sections/preliminaries.tex +++ b/ICML19/Drafts/sections/preliminaries.tex @@ -1,174 +1,176 @@ \section{Preliminaries \label{sec:preliminaries}} \paragraph{\textbf{{Notation.}}} We use the notation $\langle\cdot ,\cdot \rangle $ and $\|\cdot\|$ for the {standard inner} product and norm on $\RR^d$. For matrices, $\|\cdot\|$ and $\|\cdot\|_F$ denote the spectral and Frobenius norms, respectively. %For a matrix $M$, its row and null spaces are denoted by $\row(A)$ and $\text{null}(A)$, respectively. For a convex function $g:\RR^d\rightarrow\RR$, the subdifferential set at $x\in \RR^d$ is denoted by $\partial g(x)$ and we will occasionally use the notation $\partial g(x)/\b = \{ z/\b : z\in \partial g(x)\}$. %The proximal operator $P_g:\RR^d\rightarrow\RR^d$ takes $x$ to %\begin{align} %P_g(x) = \underset{y}{\argmin} \, g(y) + \frac{1}{2}\|x-y\|^2, %\label{eq:defn of prox} %\end{align} %and, if $g=1_C$ is the indicator function of a convex set $C$, we will use the shorthand $P_C=P_{1_C}$ to show the orthogonal projection onto $C$. For a set $\mathcal{X}\subset\RR^d$, the indicator function $\delta_\mathcal{X}:\RR^d\rightarrow\RR$ takes $x$ to \begin{align} g(x) = \delta_\mathcal{X}(x) = \begin{cases} 0 & x \in \mathcal{X}\\ \infty & x\notin \mathcal{X}. \end{cases} \label{eq:indicator} \end{align} %The tangent cone of $C$ at $x\in C$ is denoted by $T_C(x)$ and The distance function from a point $x$ to $\mathcal{X}$ is denoted by $\dist(x,\mathcal{X}) = \min_{z\in \mathcal{X}} \|x-z\|$. For integers $k_0 \le k_1$, we set $[k_0:k_1]=\{k_0,\cdots,k_1\}$. For an operator $A:\RR^d\rightarrow\RR^m$ with components $\{A_i\}_{i=1}^m$, we let $DA(x) \in \mathbb{R}^{m\times d}$ denote the Jacobian of $A$, where the $i$th row of $DA(x)$ is the gradient vector $\nabla A_i(x)$. 
\textbf{AE: need to revisit this.} We denote the Hessian of the augmented Lagrangian with respect to $x$ as $\nabla _{xx} \mathcal{L}_{\beta}(x,y)$. \\ {\color{red} define $A_i$ rigorously.} \textbf{Smoothness.} We require $f:\RR^d\rightarrow\RR$ and $A:\RR^d\rightarrow \RR^m$ to be smooth in the sense that there exists $\lambda_f,\lambda_A\ge 0$ such that % %\vspace{-.3cm} \begin{align} \| \nabla f(x) - \nabla f(x')\| & \le \lambda_f \|x-x'\|, \nonumber\\ \| DA(x) - DA(x') \| & \le \lambda_A \|x-x'\|, \label{eq:smoothness basic} \end{align} for every $x,x'\in \mathbb{R}^d$. \textbf{Augmented Lagrangian method}. ALM is a classical algorithm, proposed in~\cite{hestenes1969multiplier, powell1969method}, and further studied in~\cite{bertsekas2014constrained}. For solving Program~\eqref{prob:01}, ALM suggests solving the equivalent problem % %\vspace{-.3cm} \begin{equation} \min_{x} \max_y \,\,\mathcal{L}_\beta(x,y) + g(x), \label{eq:minmax} \end{equation} %\vspace{-.5cm} % where $\mathcal{L}_\b$ is the augmented Lagrangian corresponding to Program~\eqref{prob:01}, defined as \begin{align} \label{eq:Lagrangian} \mathcal{L}_\beta(x,y) := f(x) + \langle A(x), y \rangle + \frac{\beta}{2}\|A(x)\|^2. \end{align} \textbf{AE: Adding the bias $b$ doesn't seem to add much except making the presentation more clunky.} %\vspace{-.5cm} The minimax formulation in Program~\eqref{eq:minmax} naturally suggests the following algorithm to solve Program~\eqref{prob:01}. For dual step sizes $\{\s_k\}_k$, consider the iterates \begin{equation}\label{e:exac} x_{k+1} \in \underset{x}{\argmin} \,\, \mathcal{L}_{\beta}(x,y_k)+g(x), \end{equation} \begin{equation*} y_{k+1} = y_k+\s_k A(x_{k+1}). \end{equation*} However, updating $x$ above requires solving the nonconvex Program~\eqref{e:exac} to global optimality, which is typically intractable. It is instead often easier to find an approximate first-order or second-order stationary point of Program~\eqref{e:exac}. 
%We therefore consider an algorithm that only requires approximate stationarity in every iteration. Our central claim is that, by gradually decreasing the stationarity precision and increasing the penalty weight $\b$ above, we can reach a stationary point of Program~\eqref{prob:01}, as detailed in Section~\ref{sec:AL algorithm}. {\textbf{{{Optimality conditions.}}} \label{sec:opt cnds}} {First-order necessary optimality conditions} for Program~\eqref{prob:01} are well-understood. {Indeed, $x\in \RR^d$ is a first-order stationary point of Program \eqref{prob:01} if there exists $y\in \RR^m$ such that \begin{align} \begin{cases} -\nabla f(x) - DA(x)^\top y \in \partial g(x)\\ A(x) = 0, \end{cases} \label{e:inclu1} \end{align} where $DA(x)$ is the Jacobian of $A$ at $x$. Recalling \eqref{eq:Lagrangian}, we observe that \eqref{e:inclu1} is equivalent to \begin{align} \begin{cases} -\nabla_x \mathcal{L}_\beta(x,y) \in \partial g(x)\\ A(x) = 0, \end{cases} \label{e:inclu2} \end{align} which is in turn the necessary optimality condition for Program \eqref{eq:minmax}. } %Note that \eqref{e:inclu2} is equivalent to %\begin{align} %\left[ %\begin{array}{c} %\nabla_x \L_\b(x,y) \\ %\nabla_y \L_\b(x,y) %\end{array} %\right] %\in %\left\{ %\left[ %\begin{array}{c} % v\\ %0 %\end{array} %\right] %: v\in \partial g(x) \right\} %, %\end{align} %which rephrases the stationarity condition in terms of the gradient of the duality gap of Program~\eqref{eq:Lagrangian}. -Inspired by this, we say that $x$ is an $\epsilon_f$ first-order stationary point if +Inspired by this, we say that $x$ is an $(\epsilon_f,\b)$ first-order stationary point if \begin{align} \begin{cases} \dist(-\nabla_x \mathcal{L}_\beta(x,y), \partial g(x)) \leq \epsilon_f \\ -\| A(x) \| \leq \epsilon_f. +\| A(x) \| \leq \epsilon_f, \end{cases} \label{eq:inclu3} \end{align} +for $\epsilon_f\ge 0$. 
In light of \eqref{eq:inclu3}, a suitable metric for evaluating the stationarity of a pair $(x,y)\in \RR^d\times \RR^m$ is \begin{align} \dist^2\left(-\nabla_x \mathcal{L}_\beta(x,y), \partial g(x) \right) + \|A(x)\|^2 , \label{eq:cvg metric} \end{align} -which we will also use as the stopping criterion later on. It is also not difficult to verify that the expression in \eqref{eq:cvg metric} is the norm of the gradient of the duality gap of Program~\eqref{prob:01}. +which we will also use as the stopping criterion later on. When $g=0$, it is also not difficult to verify that the expression in \eqref{eq:cvg metric} is the norm of the gradient of the duality gap in Program~\eqref{eq:minmax}. As an example, for a convex set $\mathcal{X}\subset\RR^d$, suppose that $g = \delta_\mathcal{X}$ is the indicator function on $\mathcal{X}$.} We also let $T_\mathcal{X}(x) \subseteq \RR^d$ denote the tangent cone to $\mathcal{X}$ at $x$, and use $P_{T_\mathcal{X}(x)}:\RR^d\rightarrow\RR^d$ to denote the orthogonal projection onto this tangent cone. Then, for $u\in\RR^d$, it is not difficult to verify that \begin{align}\label{eq:dist_subgrad} \dist\left(u, \partial g(x) \right) = \| P_{T_\mathcal{X}(x)}(u) \|. \end{align} % -Similarly, when $g(x) = 0$, we say that a first-order stationary point $x\in \RR^d$ of Program~\eqref{prob:01} if is also a second-order stationarity point, if +Similarly, when $g(x) = 0$, we say that a first-order stationary point $x\in \RR^d$ of Program~\eqref{prob:01} is also a second-order stationary point if % %\vspace{-.5cm} \begin{equation} \lambda_{\text{min}}(\nabla _{xx} \mathcal{L}_{\beta}(x,y)) > 0, \end{equation} %\vspace{-.5cm} % where $\lambda_{\text{min}}(\cdot)$ returns the smallest eigenvalue of its argument. 
-Analogously, $x$ is an $\epsilon_s$-second order stationary point if +Analogously, $x$ is an $(\epsilon_f, \epsilon_s,\b)$ second order stationary point if, in addition to \eqref{eq:inclu3}, it holds that % %\vspace{-.5cm} \begin{equation}\label{eq:sec_opt} -\lambda_{\text{min}}(\nabla _{xx} \mathcal{L}_{\beta}(x,y)) > -\epsilon_s. +\lambda_{\text{min}}(\nabla _{xx} \mathcal{L}_{\beta}(x,y)) \ge -\epsilon_s, \end{equation} +for $\epsilon_s>0$. %\vspace{-.5cm} % Naturally, for second-order stationarity, we will use $\lambda_{\text{min}}(\nabla _{xx} \mathcal{L}_{\beta}(x,y))$ as the stopping criterion. \paragraph{{\textbf{Smoothness lemma.}}} The following result controls the smoothness of $\L_\b(\cdot,y)$ for a fixed $y$. The proof is standard but is included in Appendix~\ref{sec:proof of smoothness lemma} for completeness. -\begin{lemma}[Smoothness]\label{lem:smoothness} +\begin{lemma}[\textbf{Smoothness}]\label{lem:smoothness} For fixed $y\in \RR^m$ and $\rho,\rho'\ge 0$, it holds that \begin{align} \| \nabla_x \mathcal{L}_{\beta}(x, y)- \nabla_x \mathcal{L}_{\beta}(x', y) \| \le \lambda_\b \|x-x'\|, \end{align} for every $x,x' \in \{ x'': \|A(x'') \|\le \rho, \|x''\|\le \rho'\}$, where \begin{align} \lambda_\beta & \le \lambda_f + \sqrt{m}\lambda_A \|y\| + (\sqrt{m}\lambda_A\rho + d \lambda'^2_A )\b \nonumber\\ & =: \lambda_f + \sqrt{m}\lambda_A \|y\| + \lambda''(A,x_1) \b, \label{eq:smoothness of Lagrangian} \end{align} Above, $\lambda_f,\lambda_A$ were defined in (\ref{eq:smoothness basic}) and \begin{align} \lambda'_A := \max_{\|x\|\le \rho'}\|DA(x)\|. \end{align} \end{lemma} diff --git a/ICML19/Drafts/sections/related_works.tex b/ICML19/Drafts/sections/related_works.tex index f82ee8a..e4a0f81 100644 --- a/ICML19/Drafts/sections/related_works.tex +++ b/ICML19/Drafts/sections/related_works.tex @@ -1,69 +1,69 @@ \section{Related Works \label{sec:related work}} -ALM is first proposed in~\cite{hestenes1969multiplier, powell1969method}. 
-For the specific case of~\eqref{prob:01} with a linear operator $A$ and convex $f$, standard, inexact and linearized versions of ALM are studied extensively~\cite{lan2016iteration,nedelcu2014computational,tran2018adaptive,xu2017inexact}. -Classical works on ALM focused on the general template of~\eqref{prob:01}, with relatively stronger assumptions and exact solution to subproblem in Step 2 of Algorithm~\ref{Algo:2}~\cite{bertsekas2014constrained}. -A similar analysis is conducted in~\cite{fernandez2012local}, where the authors considered inexact ALM under specific assumptions on the initialization of the dual variable with convergence rates. However, unlike our results, the authors did not provide total complexity results. +ALM has a long history in the optimization literature, dating back to~\cite{hestenes1969multiplier, powell1969method}. +In the special case of Program~\eqref{prob:01} with a convex $f$ and a linear operator $A$, standard, inexact and linearized versions of ALM have been extensively studied~\cite{lan2016iteration,nedelcu2014computational,tran2018adaptive,xu2017inexact}. +Classical works on ALM focused on the general template of Program~\eqref{prob:01}, with relatively stronger assumptions and required exact solutions to the subproblem in Step 2 of Algorithm~\ref{Algo:2}, namely, Program~\eqref{e:exac} \cite{bertsekas2014constrained}. +A similar analysis was conducted in~\cite{fernandez2012local}, where the authors considered inexact ALM under specific assumptions on the initialization of the dual variable with convergence rates. However, unlike our results, the authors did not provide total complexity results. \textbf{AE: Mention if this paper was convex or nonconvex?} -Problem~\eqref{prob:01} with similar assumptions is also studied in~\cite{birgin2016evaluation} and~\cite{cartis2018optimality} for first order and second order stationarity, respectively, with explicit iteration complexity analysis. 
+Program~\eqref{prob:01} with similar assumptions is also studied in~\cite{birgin2016evaluation} and~\cite{cartis2018optimality} for first-order and second-order stationarity, respectively, with explicit iteration complexity analysis. As we have mentioned in Section~\ref{sec:cvg rate}, our iteration complexity results match these works with a simpler algorithm and a simpler analysis. In addition, these algorithms require setting final accuracies since they utilize this information in the algorithm. On the contrary, our algorithm does not require setting accuracies a priori. \cite{bolte2018nonconvex} also considers the general template~\eqref{prob:01} with specific assumptions. Even though the authors point out that deriving convergence rate results is straightforward, they did not present iteration complexity results that we can compare. Another recent line of work~\cite{clason2018acceleration} focused on solving the problem template~\eqref{prob:01} by adapting the primal-dual method of Chambolle and Pock~\cite{chambolle2011first}. The authors proved the convergence of the method with rate guarantees by assuming error bound conditions on the objective function, which are not necessarily satisfied for general SDPs. %Some works also considered the application of ALM/ADMM to nonconvex problems~\cite{wang2015global, liu2017linearized}. %These works assume that the operator in~\eqref{prob:01} is linear, therefore, they do not apply to our setting. %since we have a nonlinear constraint in addition to a nonconvex objective function. \cite{burer2003nonlinear, burer2005local} were the first works that proposed the splitting $X=UU^\top$ for solving SDPs of the form~\eqref{eq:sdp}. Starting from these works, the literature on Burer-Monteiro has in large part focused on using ALM for solving the reformulated problem~\eqref{prob:nc}. 
However, this approach suffers from several drawbacks: First, these works focus on the case of exact solutions in Step 2 of Algorithm~\ref{Algo:2} in theory; however, they use inexact solutions in practice. In addition, their results are for convergence only, without any rate guarantees. Our work can be considered as providing a theoretical understanding of this practical approach of using inexact solutions to Step 2 of Algorithm~\ref{Algo:2} with explicit iteration complexities. %Their practical stopping condition is also not analyzed theoretically. %In addition, Burer and Monteiro in~\cite{burer2005local} needed to bound the primal iterates they have to put an artificial bound to the primal domain which will be ineffective in practice; which is impossible to do without knowing the norm of the solution. % and their results do not extend to the case where projection in primal domain are required in each iteration. One of the earliest efforts for showing global optimality with Burer-Monteiro splitting was the series of works~\cite{bhojanapalli2016dropping, park2016provable}. These works focused on the special case of SDPs without linear constraints. For these specific problems, they prove the global convergence of gradient descent with special initialization. Unfortunately, these results do not apply to general SDPs of the form~\eqref{eq:sdp} where the difficulty arises due to linear constraints. %In their followup work~, the authors considered projected gradient descent, but only for restricted strongly convex functions. %Their results are not able to extend to linear constraints and general convex functions. %, therefore not applicable to general SDPs. Another popular method for solving SDPs with some specific assumptions is due to~\cite{boumal2014manopt, boumal2016global, boumal2016non}. These works focus on the case where the constraints in~\eqref{prob:01} can be written as a smooth Riemannian manifold. 
In this case, the authors apply Riemannian gradient descent and Riemannian trust region methods for obtaining first-order and second-order stationary points, respectively. They obtain~$\mathcal{O}(1/\epsilon^2)$ complexity for finding first-order stationary points and~$\mathcal{O}(1/\epsilon^3)$ complexity for finding second-order stationary points. Even though these complexities are faster than what has been provided in Section~\ref{sec:cvg rate}, the smooth manifold assumption of these works seems to be restrictive. This requirement holds for Maximum Cut and generalized eigenvalue problems, but it is not satisfied for other important SDPs such as quadratic programming (QAP), optimal power flow and clustering. In addition, as noted in~\cite{boumal2016global}, the per-iteration cost of their method for the Max-Cut problem is $\mathcal{O}({n^6})$ for solving~\eqref{prob:nc}, which is astronomically larger than our cost of $\mathcal{O}(n^2r)$ where $r \ll n$. %They do not apply to the general semidefinite programming since $f(U)=\langle C, UU^\ast \rangle$. %, these conditions are not satisfied. \cite{bhojanapalli2018smoothed} focused on the quadratic penalty formulation of~\eqref{prob:01}: \begin{equation}\label{eq:pen_sdp} \min_{X\succeq 0} \langle C, X\rangle + \frac{\mu}{2} \| \mathcal{A}(X)-b\|^2. \end{equation} They study the optimality of the second-order stationary points of the formulation: \begin{equation}\label{eq:pen_nc} \min_{U\in\mathbb{R}^{d\times r}} \langle C, UU^\top \rangle + \frac{\mu}{2} \| \mathcal{A}(UU^\top)-b\|^2. \end{equation} These results are for establishing a connection between the stationary points of~\eqref{eq:pen_nc} and global optima of~\eqref{eq:pen_sdp}. In contrast, we focus on the relation to the original constrained problem~\eqref{eq:sdp}. %\cite{birgin2016evaluation} can handle the same problem but their algorithm is much more complicated than ours. 
There also exists a line of work for solving SDPs in their original convex formulation, in a storage-efficient way~\cite{nesterov2009primal, yurtsever2015universal, yurtsever2018conditional}. These works have global optimality guarantees due to solving convex formulations. Unfortunately, these works require the use of eigenvalue routines, which causes their convergence to be slow compared to nonconvex approaches.