\paragraph{\textbf{Notation.}}
We use $\langle\cdot ,\cdot\rangle$ and $\|\cdot\|$ for the standard inner product and norm on $\RR^d$. For matrices, $\|\cdot\|$ and $\|\cdot\|_F$ denote the spectral and Frobenius norms, respectively.
For a convex function $g:\RR^d\rightarrow\RR\cup\{\infty\}$, the subdifferential at $x\in\RR^d$ is denoted by $\partial g(x)$, and we occasionally use the shorthand $\partial g(x)/\b=\{ z/\b : z\in\partial g(x)\}$.
When presenting iteration complexity results, we use $\widetilde{O}(\cdot)$ to suppress logarithmic factors.
We let $\delta_\mathcal{X}:\RR^d\rightarrow\RR\cup\{\infty\}$ denote the indicator function of a set $\mathcal{X}\subseteq\RR^d$, which vanishes on $\mathcal{X}$ and equals $\infty$ elsewhere.
The distance function from a point $x$ to $\mathcal{X}$ is denoted by $\dist(x,\mathcal{X})=\min_{z\in\mathcal{X}} \|x-z\|$.
For integers $k_0\le k_1$, we use the notation $[k_0:k_1]=\{k_0,\ldots,k_1\}$. For an operator $A:\RR^d\rightarrow\RR^m$ with components $\{A_i\}_{i=1}^m$, $DA(x)\in\mathbb{R}^{m\times d}$ denotes the Jacobian of $A$, where the $i$th row of $DA(x)$ is the vector $\nabla A_i(x)\in\RR^d$.
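As a purely illustrative example, if $A$ has quadratic components $A_i(x) = \frac{1}{2}\langle x, Q_i x\rangle + \langle b_i, x\rangle$ with symmetric $Q_i\in\RR^{d\times d}$ and $b_i\in\RR^d$, then
\begin{align*}
\nabla A_i(x) = Q_i x + b_i,
\qquad
DA(x) =
\begin{bmatrix}
(Q_1 x + b_1)^\top \\ \vdots \\ (Q_m x + b_m)^\top
\end{bmatrix}
\in \RR^{m\times d}.
\end{align*}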
\paragraph{\textbf{Smoothness.}}
We assume that $f:\RR^d\rightarrow\RR$ and $A:\RR^d\rightarrow\RR^m$ are smooth; i.e., there exist $\lambda_f,\lambda_A\ge0$ such that
\begin{align}
\|\nabla f(x)-\nabla f(x')\| \le \lambda_f \|x-x'\|,
\qquad
\|DA(x)-DA(x')\| \le \lambda_A \|x-x'\|,
\end{align}
for every $x,x'\in\RR^d$.
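For instance, in the illustrative quadratic example above, one may take $\lambda_A = (\sum_{i=1}^m \|Q_i\|^2)^{1/2}$, since
\begin{align*}
\|DA(x)-DA(x')\| \le \|DA(x)-DA(x')\|_F
= \Big( \sum_{i=1}^m \|Q_i(x-x')\|^2 \Big)^{1/2}
\le \Big( \sum_{i=1}^m \|Q_i\|^2 \Big)^{1/2} \|x-x'\|.
\end{align*}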
\paragraph{\textbf{Augmented Lagrangian method (ALM).}}
ALM is a classical algorithm that first appeared in~\cite{hestenes1969multiplier, powell1969method} and has been extensively studied since; see, e.g.,~\cite{bertsekas1982constrained, birgin2014practical}.
For solving \eqref{prob:01}, ALM addresses the minimax problem
\begin{equation}
\min_{x}\max_y \,\,\mathcal{L}_\beta(x,y) + g(x),
\label{eq:minmax}
\end{equation}
where, for penalty weight $\b>0$, $\mathcal{L}_\b$ is the corresponding augmented Lagrangian, defined as
\begin{align}
\label{eq:Lagrangian}
\mathcal{L}_\beta(x,y) := f(x) + \langle A(x), y \rangle + \frac{\beta}{2}\|A(x)\|^2.
\end{align}
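For later reference, a direct computation from \eqref{eq:Lagrangian} gives
\begin{align*}
\nabla_x \mathcal{L}_\beta(x,y) = \nabla f(x) + DA(x)^\top \left( y + \beta A(x) \right),
\qquad
\nabla_y \mathcal{L}_\beta(x,y) = A(x).
\end{align*}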
The minimax formulation in \eqref{eq:minmax} naturally suggests the following iterations for solving \eqref{prob:01}:
\begin{align}
x_{k+1} &\in \operatorname*{argmin}_{x\in\RR^d} \, \mathcal{L}_\beta(x,y_k) + g(x),
\label{e:exac}\\
y_{k+1} &= y_k + \s_k A(x_{k+1}),
\end{align}
where $\{\s_k\}_k$ denotes the dual step sizes.
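In particular, since $\nabla_y \mathcal{L}_\beta(x,y) = A(x)$, the dual update above can be read as a gradient ascent step on $\mathcal{L}_\beta(x_{k+1},\cdot)$, which is affine in $y$:
\begin{align*}
y_{k+1} = y_k + \s_k \nabla_y \mathcal{L}_\beta(x_{k+1}, y_k).
\end{align*}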
However, computing $x_{k+1}$ above requires solving the nonconvex problem~\eqref{e:exac} to optimality, which is typically intractable. Instead, it is often easier to find an approximate first- or second-order stationary point of~\eqref{e:exac}.
Hence, we argue that, by gradually improving the stationarity precision and increasing the penalty weight $\b$ above, we can still reach a stationary point of~\eqref{eq:minmax}, as detailed in Section~\ref{sec:AL algorithm}.
\paragraph{\textbf{Optimality conditions.}}
First-order necessary optimality conditions for \eqref{prob:01} are well-studied. Indeed, $x\in\RR^d$ is a first-order stationary point of~\eqref{prob:01} if there exists $y\in\RR^m$ such that
\begin{align}
-\nabla f(x) - DA(x)^\top y \in \partial g(x),
\qquad
A(x) = 0,
\end{align}
which, since $A(x)=0$, is equivalent to $-\nabla_x \mathcal{L}_\b(x,y) \in \partial g(x)$ and $A(x)=0$ for any $\b>0$, where $\nabla_x \mathcal{L}_\b$ denotes the gradient of $\mathcal{L}_\b$ with respect to $x$. Relaxing these conditions, we say that $x$ is an $(\epsilon_f,\b)$ \textit{first-order stationary} point of \eqref{eq:minmax} if there exists $y \in \RR^m$ such that
\begin{align}
\dist\left(-\nabla_x \mathcal{L}_\b(x,y), \partial g(x)\right) \le \epsilon_f,
\qquad
\|A(x)\| \le \epsilon_f,
\label{eq:inclu3}
\end{align}
which we use as the first-order stopping criterion.
As an example, for a convex set $\mathcal{X}\subset\RR^d$, suppose that $g =\delta_\mathcal{X}$ is the indicator function of $\mathcal{X}$.
Let also $T_\mathcal{X}(x)\subseteq\RR^d$ denote the tangent cone to $\mathcal{X}$ at $x\in\mathcal{X}$, and let $P_{T_\mathcal{X}(x)}:\RR^d\rightarrow\RR^d$ denote the orthogonal projection onto this tangent cone. Then, for $u\in\RR^d$, it is not difficult to verify that
\begin{align}
\dist\left( u, \partial g(x) \right) = \left\| P_{T_\mathcal{X}(x)}(u) \right\|,
\end{align}
since $\partial g(x)$ is the normal cone to $\mathcal{X}$ at $x$, namely, the polar cone of $T_\mathcal{X}(x)$.
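As a purely illustrative instance, if $\mathcal{X} = \RR^d_+$ is the nonnegative orthant and $x\in\mathcal{X}$, then $T_\mathcal{X}(x) = \{v\in\RR^d : v_i \ge 0 \text{ whenever } x_i = 0\}$, and the projection acts entrywise as
\begin{align*}
\left( P_{T_\mathcal{X}(x)}(u) \right)_i =
\begin{cases}
u_i & x_i > 0,\\
\max(u_i, 0) & x_i = 0,
\end{cases}
\end{align*}
so that, at active coordinates ($x_i=0$), only components of $u$ pointing into $\mathcal{X}$ contribute to $\dist(u,\partial g(x))$.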
Analogously, $x$ is an $(\epsilon_f, \epsilon_s,\b)$ \textit{second-order stationary} point of \eqref{eq:minmax} if, in addition to \eqref{eq:inclu3}, it holds that
\begin{align}
\lambda_{\text{min}}\left( \nabla_{xx} \mathcal{L}_\b(x,y) \right) \ge -\epsilon_s,
\end{align}
where $\nabla_{xx}\mathcal{L}_\b$ is the Hessian of $\mathcal{L}_\b$ with respect to $x$, and $\lambda_{\text{min}}(\cdot)$ returns the smallest eigenvalue of its argument.
Naturally, for second-order stationarity, we use $\lambda_{\text{min}}(\nabla _{xx} \mathcal{L}_{\beta}(x,y))$ as the stopping criterion.
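When $f$ and $A$ are twice differentiable, differentiating $\nabla_x\mathcal{L}_\b$ once more gives
\begin{align*}
\nabla_{xx} \mathcal{L}_\beta(x,y) = \nabla^2 f(x) + \sum_{i=1}^m \left( y_i + \beta A_i(x) \right) \nabla^2 A_i(x) + \beta \, DA(x)^\top DA(x),
\end{align*}
which shows how the curvature of $\mathcal{L}_\beta(\cdot,y)$ grows with $\b$, $\|y\|$, and $\|A(x)\|$.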
\paragraph{\textbf{Smoothness lemma.}} The next result controls the smoothness of $\L_\b(\cdot,y)$ for a fixed $y$. The proof is standard but is nevertheless included in Appendix~\ref{sec:proof of smoothness lemma} for completeness.