Page MenuHomec4science

Escaping saddle points.tex
No OneTemporary

File Metadata

Created
Tue, May 21, 14:33

Escaping saddle points.tex

\documentclass[english]{article}
\usepackage[utf8]{inputenc}
\usepackage{amsthm}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{hyperref}
\hypersetup{
colorlinks=true,%
citecolor=black,%
filecolor=black,%
linkcolor=black,%
urlcolor=blue
}
\makeatletter
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Textclass uspecific LaTeX commands.
\theoremstyle{plain}
\newtheorem{thm}{\protect\theoremname}
\theoremstyle{plain}
\newtheorem{lem}[thm]{\protect\lemmaname}
\theoremstyle{plain}
\newtheorem{corr}[thm]{\protect\corrolaryname}
%\theoremstyle{definition}
\theoremstyle{plain}
\newtheorem{defn}[thm]{\protect\definitionname}
\theoremstyle{plain}
\newtheorem{prop}[thm]{\protect\propositionname}
\theoremstyle{definition}
\newtheorem{example}[thm]{\protect\examplename}
\newtheorem*{remark}{Remark}
\theoremstyle{plain}
\newtheorem{conditions}[thm]{\protect\conditionsname}
\makeatother
\usepackage{babel}
\providecommand{\definitionname}{Definition}
\providecommand{\examplename}{Example}
\providecommand{\lemmaname}{Lemma}
\providecommand{\corrolaryname}{Corollary}
\providecommand{\propositionname}{Proposition}
\providecommand{\conditionsname}{Conditions}
\providecommand{\theoremname}{Theorem}
\usepackage{geometry}
\geometry{
a4paper,
left=20mm,
%lmargin = 0mm,
%inner = 0mm,
right = 20mm,
%rmargin = 0mm,
%outer = 0mm,
top=15mm,
%tmargin = 0mm,
bottom = 15mm,
%bmargin = 0mm,
}
\def \l{\left}
\def \r{\right}
\def \R{\mathbb{R}}
\def \E{\mathbb{E}}
\def \wt{\widetilde}
\def \D{\mathcal{D}}
\def \wh{\widehat}
\def \ol{\overline}
\def \m{(m)}
\def \t {\theta}
\def \b {\beta}
\def \a {\alpha}
\def \g{\gamma}
\def \L{\mathcal{L}}
\def \d{\delta}
\def \S{\mathcal{S}}
\title{Escaping From Saddle Points}
\begin{document}
\maketitle
Consider the optimization program
\begin{align}
\begin{cases}
\min h(u)\\
L(u) = 0\\
u\in C,
\end{cases}
\label{eq:main}
\end{align}
where $h$ is a non-convex but differentiable function, $L$ is a differentiable map, and $C$ is a convex set. Program \eqref{eq:main} is equivalent to
\begin{align}
& \min_{u\in C} \max_y h(u)+\langle L(u),y \rangle+\frac{\|L(u)\|_2^2}{2\b^*} \nonumber\\
& =: \min_{u\in C} \max_y \L(u,y,\b^*),
\end{align}
where $1_C$ is the convex indicator function on $C$ and $\b^*>0$. Above, $\L$ is the augmented Lagrangian of Program \eqref{eq:main}. Therefore, $(u^*,y^*)$ is a stationary point of Program \eqref{eq:main} if
\begin{align}
\begin{cases}
0\in \nabla_u \L(u^*,y^*,\b^*) + N_C(u^*) \\
L(u^*) = 0,
\end{cases}
\label{eq:opt cnd}
\end{align}
where $DL(u^*)$ is the Jacobian of $L$ and $N_C(u^*)$ is the normal cone of $C$, both at $u^*$. Note that $u^*$ is a local minimum if
\begin{equation}
\delta_u^T \nabla_{uu} \L(u^*,y^*,\b^*) \d_u \ge 0,
\qquad \forall \delta_u \in T_C(u^*),
\end{equation}
where $T_C(u^*)$ is the tangent cone of $C$ at $u^*$. Alternatively, if the above second form takes both positive and negative values on $T_C(u^*)$, then $u^*$ is a strict saddle point.
%Note that $u$ is a stationary point of Program \eqref{eq:main} if
%\begin{align}
%\begin{cases}
%\delta\in T_C(u) \\
%\nabla h(u)^T \delta = 0\\
%L(u)=0
%\end{cases}
%\Longrightarrow
%DL(u)\delta = 0,
%\end{align}
%where $T_C(u)$ is the tangent cone to $C$ at $u$. Equivalently, $u$ is a stationary point of Program \eqref{eq:main} if
To solve this program, Bang et al. suggested the first-order algorithm
\begin{align*}
u_{k+1} = P_C(u_k - \g_k \nabla_u \L(u_k,y_k,\b_k) ),
\end{align*}
\begin{align*}
y_{k+1} = y_k + \frac{L(u_{k+1})}{2\b_k},
\end{align*}
\begin{equation}
\beta_{k+1} = b(u_k,y_k,\b_k,\g_k),
\qquad
\g_{k+1} = c(u_k,y_k,\b_k,\g_k),
\label{eq:main alg}
\end{equation}
for positive-valued $b,c$ specified in their write-up. {\color{blue} Is the last claim correct?}
It is easy to verify that a limit point $z^* = (u^*,y^*,\b^*,\g^*)$ of Algorithm \eqref{eq:main alg} satisfies \eqref{eq:opt cnd}.
Consequently, any limit point of Algorithm \eqref{eq:main alg} is a stationary point of Program \eqref{eq:main} and vice verse. However, without limiting the choice of $h$ (to convex for example), we cannot ensure that Algorithm \eqref{eq:main alg} finds a local minimum of Program \eqref{eq:main}. Our goal is to show that, almost surely, Algorithm \eqref{eq:main alg} avoids (strict) saddle points of Program \eqref{eq:main}. For example, suppose that Program \eqref{eq:main} does not have any non-strict saddle points. Under this assumption, if Algorithm \eqref{eq:main alg} converges, then the limit point is a local minimum, almost surely.
%Then we may write the above algorithm as a dynamical system that satisfies
%\begin{align}
%z_{k+1}:=
%\l[
%\begin{array}{c}
%u_{k+1}\\
%y_{k+1}\\
%\b_{k+1}\\
%\g_{k+1}
%\end{array}
%\r]
%& =
%\l[
%\begin{array}{c}
%P_C (u_k- \g_k\nabla_u \L_k(u_k,y_k)\\
%y_k+ \frac{L(u_{k+1})}{2\b_k}\\
%b(u_k,y_k,\b_k,\g_k)\\
%c(u_k,y_k,\b_k,\g_k)
%\end{array}
%\r].
%\end{align}
\section{Analysis}
Let us use the shorthand $z=(u,y,\b,\g)$ and consider the dynamical system specified by
\begin{align}
z_{k+1}
& :=
\l[
\begin{array}{c}
P_C (u_k- \g_k\nabla_u \L(z_k))\\
y_k+ \frac{L(u_{k})}{2\b_k}\\
b(z_k)\\
c(z_k)
\end{array}
\r]
=: O
\l(z_k
\r)
.
\label{eq:sys}
\end{align}
It is easy to verify that any fixed point of the System \eqref{eq:sys} is a limit point of Algorithm \eqref{eq:main alg}. Moreover, when close enough to a stationary point (equivalently, fixed point) $z^*$, the future iterates of Algorithm \ref{eq:main alg} are attracted (dispelled) from $z^*$ if and only if the future iterates of System \eqref{eq:sys} are attracted (dispelled) from $z^*$. {\color{blue} Haven't verified the last claim.}
%with the same starting point, the sequence produced by System \eqref{eq:sys} converges to a fixed point $z^*$ if and only if the sequence produced by Algorithm \eqref{eq:main alg} converges to $z^*$.
It therefore suffices to study System \eqref{eq:sys} in what follows.
\begin{lem}
$O$ is an isomorphism in a neighborhood of $z^*$. That is, $O$ is continuous and has a continuous inverse. {\color{blue} I don't know if this claim is correct... For it to be correct, it might require $C$ to have bounded \emph{reach}. }
\end{lem}
Consider a fixed point $z^*$ of System \eqref{eq:sys} and a point $z=z^*+\d_z$ in its vicinity, where $\d_z=(\d_u,\d_y,\d_\b,\d_\g)$ and $\d_u\in T_C(u^*)$. We may then write that
\begin{align}
O(z^*+\d_z) -O( z^*) & = O(z+\d_z) - z^* \nonumber\\
& =
\l[
\begin{array}{c}
P_C\l(u^*+\d_u- (\g^*+\d_\g) \nabla_u \L(z^*+\d_z ) \r)- u^*\\
\frac{D L(u^*) \d_u }{2\b^*} + \d_y - \frac{L(u^*) \d_\b}{2(\b^*)^2 } \\
\langle \nabla b(z^*), \d_z \rangle\\
\langle \nabla c(z^*) ,\d_z \rangle
\end{array}
\r].
\label{eq:what happens}
\end{align}
To simplify the first entry of the vector above, let $P_{T_C(u^*)}$ be the orthogonal projection onto the tangent cone of $C$ at $u^*$, and note that {\color{blue} Is the first identity below correct?}
\begin{align}
& P_C(u^*+\d_u- (\g^*+\d_\g) \nabla_u \L(z^*+\d_z )- u^*\nonumber\\
& = P_{T_C(u^*)} (\d_u- (\g^*+\d_\g) \nabla_u \L(z^*+\d_z) ) \nonumber\\
& = \delta_u - (\g^*+\d_\g) P_{T_C(u^*)} ( \nabla_u \L(z^*+\d_z) )
\qquad \l( \delta_u \in T_C(u^*) \r) \nonumber\\
& = \delta_u - \g^* P_{T_C(u^*)} \nabla_{u,u} \L(z^*) \d_u
- \g^* P_{T_C(u^*)} \nabla_{u,u} \L(z^*) \d_u
- \g^* P_{T_C(u^*)} \nabla_{u,y} \L(z^*) \d_y
- \g^* P_{T_C(u^*)} \nabla_{u,\b} \L(z^*) \d_\b \nonumber\\
& \qquad
- \delta_\g P_{T_C(u^*)} \nabla_u \L(z^*) \nonumber\\
& = \delta_u - \g^* P_{T_C(u^*)} \nabla_{u,u} \L(z^*) \d_u
- \g^* P_{T_C(u^*)} \nabla_{u,u} \L(z^*) \d_u
- \g^* P_{T_C(u^*)} \nabla_{u,y} \L(z^*) \d_y
- \g^* P_{T_C(u^*)} \nabla_{u,\b} \L(z^*) \d_\b
\qquad \text{(see \eqref{eq:opt cnd})} \nonumber\\
& = \l[
\begin{array}{cccc}
I_u - \g^* P_{T_C(u^*)} \nabla_{u,u} \L(z^*) & - \g^* P_{T_C(u^*)} \nabla_{u,y} \L(z^*) &
- \g^* P_{T_C(u^*)} \nabla_{u,\b} \L(z^*)
\end{array}
\r]
\cdot
\l[
\begin{array}{c}
\d_u \\
\d_y \\
\d_\b\\
\end{array}
\r].
\end{align}
Substituting the above identity back into \eqref{eq:what happens}, we find that
\begin{align}
O(z^*+\d_z) - O(z^*) &
=
\l[
\begin{array}{cccc}
I_u - \g^* P_{T_C(u^*)} \nabla_{u,u} \L(z^*) & - \g^* P_{T_C(u^*)} \nabla_{u,y} \L(z^*) &
- \g^* P_{T_C(u^*)} \nabla_{u,\b} \L(z^*) & 0 \\
\frac{DL(u^*)}{2\b^*} & I_y& -\frac{L(u^*)}{2(\b^*)^2} & 0\\
\nabla_u b(z^*)^T & \nabla_y b(z^*)^T & \nabla_\b b(z^*)^T & \nabla_\g b(z^*)^T\\
\nabla_u c(z^*)^T & \nabla_y c(z^*)^T & \nabla_\b c(z^*)^T & \nabla_\g c(z^*)^T\\
\end{array}
\r]
\cdot
\d_z \nonumber\\
& = DO(z^*) \cdot \d_z.
\end{align}
If $u^*$ is a strict saddle point of Program \eqref{eq:main}, then there exists $\delta_u,\delta_u'\in T_C(u^*)$ such that
\begin{equation}
\delta_u^T \nabla_{uu} \L(z^*) \delta_u >0,
\qquad
\delta'^T_u\nabla_{uu} \L(z^*) \delta_u' <0.
\end{equation}
Therefore, $DO(z^*)$ has eigenvalues outside of the unit circle as long as
\begin{equation}
\g^* \le \frac{1}{\|P_{T_C(u^*)} \nabla_{uu}\L(z^*) P_{T_C(u^*)}\|}.
\end{equation}
Let us now recall the stable manifold theorem, e.g., Theorem 3 in ``Behavior of accelerated gradient methods near critical points of non-convex problems''.
\begin{thm}
\textbf{\emph{(Stable manifold theorem)}} Suppose that $z^*$ is a fixed point of $O$. Suppose also that $O$ is itself a diffeomorphism in a neighborhood of $z^*$. Suppose lastly that $DO(z^*)$ has an eigenvalue outside of the unit circle. Then the local stable manifold $\S_{z^*} \ni z^*$ has measure zero. In particular, there exists a neighborhood $B\ni z^*$ such that, if $O^k(z)\in B$ for all $k$, then $z\in \S_{z^*}$. {\color{blue} I think we need a version of this theorem that holds for an isomorphism, namely, just continuous with a continuous inverse. Can we find one in the book ``Global stability of dynamical systems''? I have the book, if needed. }
\end{thm}
Consider the sequence $\{z_k\}$ generated by System \eqref{eq:sys}. Suppose that $\{z_k\}$ converges to $z^*$. Then, for sufficiently large $k$, $z_k$ belongs to the stable manifold at $z^*$. That is, $z_0\in O^{-k}(\S_{z^*})$. Since $\S_{z^*}$ has measure zero, so does $O^{-k}(\S_{z_k})$. That is, almost surely, System \eqref{eq:sys} (or equivalently Algorithm \eqref{eq:main alg}) does not converge to the strict saddle point $z^*$.
\end{document}

Event Timeline