
\vspace{-3mm}
\section{Preliminaries \label{sec:preliminaries}}
\paragraph{\textbf{Notation.}}
We use the notation $\langle\cdot ,\cdot \rangle $ and $\|\cdot\|$ for the standard inner product and the norm on $\RR^d$. For matrices, $\|\cdot\|$ and $\|\cdot\|_F$ denote the spectral and the Frobenius norms, respectively.
%For a matrix $M$, its row and null spaces are denoted by $\row(A)$ and $\text{null}(A)$, respectively.
For a convex function $g:\RR^d\rightarrow\RR$, the subdifferential set at $x\in \RR^d$ is denoted by $\partial g(x)$ and we will occasionally use the notation $\partial g(x)/\b = \{ z/\b : z\in \partial g(x)\}$.
When presenting iteration complexity results, we use $\tilde{O}(\cdot)$, which suppresses logarithmic dependencies.
%The proximal operator $P_g:\RR^d\rightarrow\RR^d$ takes $x$ to
%\begin{align}
%P_g(x) = \underset{y}{\argmin} \, g(y) + \frac{1}{2}\|x-y\|^2,
%\label{eq:defn of prox}
%\end{align}
%and, if $g=1_C$ is the indicator function of a convex set $C$, we will use the shorthand $P_C=P_{1_C}$ to show the orthogonal projection onto $C$.
The indicator function $\delta_\mathcal{X}:\RR^d\rightarrow\RR\cup\{\infty\}$ of a set $\mathcal{X}\subset\RR^d$ is defined as
\begin{align}
\delta_\mathcal{X}(x) =
\begin{cases}
0 & x \in \mathcal{X}\\
\infty & x\notin \mathcal{X}.
\end{cases}
\label{eq:indicator}
\end{align}
%The tangent cone of $C$ at $x\in C$ is denoted by $T_C(x)$ and
The distance function from a point $x$ to $\mathcal{X}$ is denoted by $\dist(x,\mathcal{X}) = \min_{z\in \mathcal{X}} \|x-z\|$.
For integers $k_0 \le k_1$, we denote $[k_0:k_1]=\{k_0,\ldots,k_1\}$.
For a map $A:\RR^d\rightarrow\RR^m$ with components $A_i:\RR^d\rightarrow\RR$ for $i\in[1:m]$, so that $A(x) = [A_1(x),\ldots,A_m(x)]^\top$, we let $DA(x) \in \mathbb{R}^{m\times d}$ denote the Jacobian of $A$ at $x$, whose $i$th row is the transposed gradient $\nabla A_i(x)^\top$.
We denote the Hessian of the augmented Lagrangian in \eqref{eq:Lagrangian} below with respect to $x$ by $\nabla _{xx} \mathcal{L}_{\beta}(x,y)$.
\paragraph{\textbf{Smoothness.}}
We require $f:\RR^d\rightarrow\RR$ and $A:\RR^d\rightarrow \RR^m$ to be smooth; i.e., there exist $\lambda_f,\lambda_A\ge 0$ such that
%
%\vspace{-.3cm}
\begin{align}
\| \nabla f(x) - \nabla f(x')\| & \le \lambda_f \|x-x'\|, \nonumber\\
\| DA(x) - DA(x') \| & \le \lambda_A \|x-x'\|,
\label{eq:smoothness basic}
\end{align}
for every $x,x'\in \mathbb{R}^d$.
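As a simple illustration (ours, not required in the sequel), suppose every component of $A$ is quadratic, say $A_i(x) = \frac{1}{2}x^\top Q_i x + \langle b_i, x \rangle + c_i$ with symmetric $Q_i\in\RR^{d\times d}$. Then $\nabla A_i(x) = Q_i x + b_i$, the $i$th row of $DA(x)-DA(x')$ is $(Q_i(x-x'))^\top$, and bounding the spectral norm by the Frobenius norm shows that the second inequality in \eqref{eq:smoothness basic} holds with
\begin{align*}
\lambda_A = \Big( \sum_{i=1}^m \|Q_i\|^2 \Big)^{1/2}.
\end{align*}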
\paragraph{\textbf{Augmented Lagrangian method.}}
The augmented Lagrangian method (ALM) is a classical algorithm, proposed in~\cite{hestenes1969multiplier, powell1969method} and further studied in~\cite{bertsekas2014constrained}.
For solving Program~\eqref{prob:01}, ALM suggests solving the equivalent problem
%
%\vspace{-.3cm}
\begin{equation}
\min_{x} \max_y \,\,\mathcal{L}_\beta(x,y) + g(x),
\label{eq:minmax}
\end{equation}
%\vspace{-.5cm}
%
where $\mathcal{L}_\b$ is the augmented Lagrangian corresponding to Program~\eqref{prob:01}, defined as
\begin{align}
\label{eq:Lagrangian}
\mathcal{L}_\beta(x,y) := f(x) + \langle A(x), y \rangle + \frac{\beta}{2}\|A(x)\|^2.
\end{align}
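For later use, differentiating \eqref{eq:Lagrangian} with respect to $x$ gives
\begin{align*}
\nabla_x \mathcal{L}_\beta(x,y) = \nabla f(x) + DA(x)^\top \big( y + \beta A(x) \big),
\end{align*}
which is the quantity appearing in the stationarity conditions below.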
%\textbf{AE: Adding the bias $b$ doesn't seem to add much except making the presentation more clunky.}
%\vspace{-.5cm}
The minimax formulation in \eqref{eq:minmax} naturally suggests the following algorithm for solving Program~\eqref{prob:01}: for dual step sizes $\{\s_k\}_k$, consider the iterations
\begin{equation}\label{e:exac}
x_{k+1} \in \underset{x}{\argmin} \,\, \mathcal{L}_{\beta}(x,y_k)+g(x),
\end{equation}
\begin{equation*}
y_{k+1} = y_k+\s_k A(x_{k+1}).
\end{equation*}
However, updating $x$ above requires solving the non-convex Program~\eqref{e:exac} to global optimality, which is typically intractable. Instead, it is often easier to find an approximate first- or second-order stationary point of Program~\eqref{e:exac}.
%We therefore consider an algorithm that only requires approximate stationarity in every iteration.
Hence, we argue that, by gradually tightening the stationarity tolerance and increasing the penalty weight $\b$ above, we can reach a stationary point of Program~\eqref{prob:01}, as detailed in Section~\ref{sec:AL algorithm}.
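To make this concrete, the following sketch spells out one such inexact variant of the iteration \eqref{e:exac} (illustrative only: it takes $g=0$, uses plain gradient descent for the approximate primal step, and the schedules for $\b_k$, $\epsilon_k$, $\s_k$ are placeholders rather than those analyzed in Section~\ref{sec:AL algorithm}):
\begin{verbatim}
import numpy as np

def grad_aug_lagrangian(x, y, beta, grad_f, A, jac_A):
    # nabla_x L_beta(x,y) = grad f(x) + DA(x)^T (y + beta A(x))
    return grad_f(x) + jac_A(x).T @ (y + beta * A(x))

def inexact_alm(x, y, grad_f, A, jac_A, n_outer=25, step=1e-3):
    # Inexact ALM sketch with g = 0: each primal update only finds
    # an eps_k-approximate stationary point of L_{beta_k}(., y_k),
    # while beta_k grows and eps_k shrinks as k increases.
    for k in range(1, n_outer + 1):
        beta, eps = float(k), 1.0 / k        # placeholder schedules
        g = grad_aug_lagrangian(x, y, beta, grad_f, A, jac_A)
        while np.linalg.norm(g) > eps:       # approximate x-update
            x = x - step * g
            g = grad_aug_lagrangian(x, y, beta, grad_f, A, jac_A)
        y = y + 1.0 * A(x)                   # dual step, sigma_k = 1
    return x, y
\end{verbatim}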
\paragraph{\textbf{Optimality conditions.}} \label{sec:opt cnds}
First-order necessary optimality conditions for Program~\eqref{prob:01} are well-understood. Indeed, $x\in \RR^d$ is a first-order stationary point of Program \eqref{prob:01} if there exists $y\in \RR^m$ such that
\begin{align}
\begin{cases}
-\nabla f(x) - DA(x)^\top y \in \partial g(x)\\
A(x) = 0,
\end{cases}
\label{e:inclu1}
\end{align}
where $DA(x)$ is the Jacobian of $A$ at $x$. Recalling \eqref{eq:Lagrangian}, we observe that \eqref{e:inclu1} is equivalent to
\begin{align}
\begin{cases}
-\nabla_x \mathcal{L}_\beta(x,y) \in \partial g(x)\\
A(x) = 0,
\end{cases}
\label{e:inclu2}
\end{align}
which is in turn the necessary optimality condition for Program \eqref{eq:minmax}.
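For instance, when $g=0$, the condition \eqref{e:inclu1} reduces to the classical first-order conditions $\nabla f(x) + DA(x)^\top y = 0$ and $A(x)=0$ for equality-constrained minimization.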
%Note that \eqref{e:inclu2} is equivalent to
%\begin{align}
%\left[
%\begin{array}{c}
%\nabla_x \L_\b(x,y) \\
%\nabla_y \L_\b(x,y)
%\end{array}
%\right]
%\in
%\left\{
%\left[
%\begin{array}{c}
% v\\
%0
%\end{array}
%\right]
%: v\in \partial g(x) \right\}
%,
%\end{align}
%which rephrases the stationarity condition in terms of the gradient of the duality gap of Program~\eqref{eq:Lagrangian}.
Inspired by \eqref{e:inclu2}, we say that $x$ is an $(\epsilon_f,\b)$ first-order stationary point of Program~\eqref{prob:01} if there exists $y\in\RR^m$ such that
\begin{align}
\begin{cases}
\dist(-\nabla_x \mathcal{L}_\beta(x,y), \partial g(x)) \leq \epsilon_f \\
\| A(x) \| \leq \epsilon_f,
\end{cases}
\label{eq:inclu3}
\end{align}
for $\epsilon_f\ge 0$.
In light of \eqref{eq:inclu3}, a suitable metric for evaluating the stationarity of a pair $(x,y)\in \RR^d\times \RR^m$ is
\begin{align}
\dist^2\left(-\nabla_x \mathcal{L}_\beta(x,y), \partial g(x) \right) + \|A(x)\|^2 ,
\label{eq:cvg metric}
\end{align}
which we use as the stopping criterion. When $g=0$, the expression in \eqref{eq:cvg metric} is simply $\|\nabla_x \mathcal{L}_\beta(x,y)\|^2 + \|A(x)\|^2$, namely the squared norm of the gradient of $\mathcal{L}_\beta$ with respect to the pair $(x,y)$ in the minimax problem \eqref{eq:minmax}.
As an example, for a convex set $\mathcal{X}\subset\RR^d$, suppose that $g = \delta_\mathcal{X}$ is the indicator function in \eqref{eq:indicator}.
We also let $T_\mathcal{X}(x) \subseteq \RR^d$ denote the tangent cone to $\mathcal{X}$ at $x$, and use $P_{T_\mathcal{X}(x)}:\RR^d\rightarrow\RR^d$ to denote the orthogonal projection onto this tangent cone. Then, for $u\in\RR^d$, it is not difficult to verify that
\begin{align}\label{eq:dist_subgrad}
\dist\left(u, \partial g(x) \right) = \| P_{T_\mathcal{X}(x)}(u) \|.
\end{align}
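As a simple illustration (ours, for intuition only), take $\mathcal{X} = \RR^d_+$, the nonnegative orthant. For $x\in\RR^d_+$, the tangent cone is $T_\mathcal{X}(x) = \{v\in\RR^d : v_i \ge 0 \text{ whenever } x_i = 0\}$, and the projection in \eqref{eq:dist_subgrad} acts coordinate-wise:
\begin{align*}
\left( P_{T_\mathcal{X}(x)}(u) \right)_i =
\begin{cases}
u_i & x_i > 0\\
\max(u_i,0) & x_i = 0.
\end{cases}
\end{align*}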
%
Moreover, when $g = 0$, a first-order stationary point $x\in \RR^d$ of Program~\eqref{prob:01} is also a second-order stationary point if
%
%\vspace{-.5cm}
\begin{equation}
\lambda_{\text{min}}(\nabla _{xx} \mathcal{L}_{\beta}(x,y)) > 0,
\end{equation}
%\vspace{-.5cm}
%
where $\lambda_{\text{min}}(\cdot)$ returns the smallest eigenvalue of its argument.
Analogously, $x$ is an $(\epsilon_f, \epsilon_s,\b)$ second-order stationary point if, in addition to \eqref{eq:inclu3}, it holds that
%
%\vspace{-.5cm}
\begin{equation}\label{eq:sec_opt}
\lambda_{\text{min}}(\nabla _{xx} \mathcal{L}_{\beta}(x,y)) \ge -\epsilon_s,
\end{equation}
for $\epsilon_s>0$.
%\vspace{-.5cm}
%
Naturally, for second-order stationarity, we additionally use $\lambda_{\text{min}}(\nabla _{xx} \mathcal{L}_{\beta}(x,y))$ as a stopping criterion.
\paragraph{\textbf{Smoothness lemma.}} The following result controls the smoothness of $\L_\b(\cdot,y)$ for a fixed $y$. The proof is standard but is included in Appendix~\ref{sec:proof of smoothness lemma} for completeness.
\begin{lemma}[\textbf{Smoothness}]\label{lem:smoothness}
For fixed $y\in \RR^m$ and $\rho,\rho'\ge 0$, it holds that
\begin{align}
\| \nabla_x \mathcal{L}_{\beta}(x, y)- \nabla_x \mathcal{L}_{\beta}(x', y) \| \le \lambda_\b \|x-x'\|,
\end{align}
for every $x,x' \in \{ x'': \|A(x'') \|\le \rho, \|x''\|\le \rho'\}$, where
\begin{align}
\lambda_\beta
& \le \lambda_f + \sqrt{m}\lambda_A \|y\| + (\sqrt{m}\lambda_A\rho + d \lambda'^2_A )\b \nonumber\\
& =: \lambda_f + \sqrt{m}\lambda_A \|y\| + \lambda''(A,\rho,\rho') \b.
\label{eq:smoothness of Lagrangian}
\end{align}
Above, $\lambda_f,\lambda_A$ were defined in \eqref{eq:smoothness basic} and
\begin{align}
\lambda'_A := \max_{\|x\|\le \rho'}\|DA(x)\|.
\end{align}
\end{lemma}
