Escaping saddle points.tex
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Tue, May 21, 14:33

Escaping saddle points.tex
View Options

	\documentclass[english]{article}
	\usepackage[utf8]{inputenc}
	\usepackage{amsthm}
	\usepackage{amsmath}
	\usepackage{amssymb}
	\usepackage{hyperref}

	\hypersetup{
	colorlinks=true,%
	citecolor=black,%
	filecolor=black,%
	linkcolor=black,%
	urlcolor=blue
	}



	\makeatletter
	%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Textclass uspecific LaTeX commands.
	\theoremstyle{plain}
	\newtheorem{thm}{\protect\theoremname}
	\theoremstyle{plain}
	\newtheorem{lem}[thm]{\protect\lemmaname}
	\theoremstyle{plain}
	\newtheorem{corr}[thm]{\protect\corrolaryname}
	%\theoremstyle{definition}
	\theoremstyle{plain}
	\newtheorem{defn}[thm]{\protect\definitionname}
	\theoremstyle{plain}
	\newtheorem{prop}[thm]{\protect\propositionname}
	\theoremstyle{definition}
	\newtheorem{example}[thm]{\protect\examplename}
	\newtheorem*{remark}{Remark}
	\theoremstyle{plain}
	\newtheorem{conditions}[thm]{\protect\conditionsname}

	\makeatother

	\usepackage{babel}
	\providecommand{\definitionname}{Definition}
	\providecommand{\examplename}{Example}
	\providecommand{\lemmaname}{Lemma}
	\providecommand{\corrolaryname}{Corollary}
	\providecommand{\propositionname}{Proposition}
	\providecommand{\conditionsname}{Conditions}
	\providecommand{\theoremname}{Theorem}


	\usepackage{geometry}
	\geometry{
	a4paper,
	left=20mm,
	%lmargin = 0mm,
	%inner = 0mm,
	right = 20mm,
	%rmargin = 0mm,
	%outer = 0mm,
	top=15mm,
	%tmargin = 0mm,
	bottom = 15mm,
	%bmargin = 0mm,
	}





	\def \l{\left}
	\def \r{\right}
	\def \R{\mathbb{R}}
	\def \E{\mathbb{E}}
	\def \wt{\widetilde}
	\def \D{\mathcal{D}}
	\def \wh{\widehat}
	\def \ol{\overline}
	\def \m{(m)}
	\def \t {\theta}
	\def \b {\beta}
	\def \a {\alpha}
	\def \g{\gamma}
	\def \L{\mathcal{L}}
	\def \d{\delta}
	\def \S{\mathcal{S}}

	\title{Escaping From Saddle Points}


	\begin{document}

	\maketitle

	Consider the optimization program
	\begin{align}
	\begin{cases}
	\min h(u)\\
	L(u) = 0\\
	u\in C,
	\end{cases}
	\label{eq:main}
	\end{align}
	where $h$ is a non-convex but differentiable function, $L$ is a differentiable map, and $C$ is a convex set. Program \eqref{eq:main} is equivalent to
	\begin{align}
	& \min_{u\in C} \max_y h(u)+\langle L(u),y \rangle+\frac{\\|L(u)\\|_2^2}{2\b^*} \nonumber\\
	& =: \min_{u\in C} \max_y \L(u,y,\b^*),
	\end{align}
	where $1_C$ is the convex indicator function on $C$ and $\b^>0$. Above, $\L$ is the augmented Lagrangian of Program \eqref{eq:main}. Therefore, $(u^,y^*)$ is a stationary point of Program \eqref{eq:main} if
	\begin{align}
	\begin{cases}
	0\in \nabla_u \L(u^,y^,\b^) + N_C(u^) \\
	L(u^*) = 0,
	\end{cases}
	\label{eq:opt cnd}
	\end{align}
	where $DL(u^)$ is the Jacobian of $L$ and $N_C(u^)$ is the normal cone of $C$, both at $u^$. Note that $u^$ is a local minimum if
	\begin{equation}
	\delta_u^T \nabla_{uu} \L(u^,y^,\b^*) \d_u \ge 0,
	\qquad \forall \delta_u \in T_C(u^*),
	\end{equation}
	where $T_C(u^)$ is the tangent cone of $C$ at $u^$. Alternatively, if the above second form takes both positive and negative values on $T_C(u^)$, then $u^$ is a strict saddle point.



	%Note that $u$ is a stationary point of Program \eqref{eq:main} if
	%\begin{align}
	%\begin{cases}
	%\delta\in T_C(u) \\
	%\nabla h(u)^T \delta = 0\\
	%L(u)=0
	%\end{cases}
	%\Longrightarrow
	%DL(u)\delta = 0,
	%\end{align}
	%where $T_C(u)$ is the tangent cone to $C$ at $u$. Equivalently, $u$ is a stationary point of Program \eqref{eq:main} if
	To solve this program, Bang et al. suggested the first-order algorithm
	\begin{align*}
	u_{k+1} = P_C(u_k - \g_k \nabla_u \L(u_k,y_k,\b_k) ),
	\end{align*}
	\begin{align*}
	y_{k+1} = y_k + \frac{L(u_{k+1})}{2\b_k},
	\end{align*}
	\begin{equation}
	\beta_{k+1} = b(u_k,y_k,\b_k,\g_k),
	\qquad
	\g_{k+1} = c(u_k,y_k,\b_k,\g_k),
	\label{eq:main alg}
	\end{equation}
	for positive-valued $b,c$ specified in their write-up. {\color{blue} Is the last claim correct?}
	It is easy to verify that a limit point $z^* = (u^,y^,\b^,\g^)$ of Algorithm \eqref{eq:main alg} satisfies \eqref{eq:opt cnd}.
	Consequently, any limit point of Algorithm \eqref{eq:main alg} is a stationary point of Program \eqref{eq:main} and vice verse. However, without limiting the choice of $h$ (to convex for example), we cannot ensure that Algorithm \eqref{eq:main alg} finds a local minimum of Program \eqref{eq:main}. Our goal is to show that, almost surely, Algorithm \eqref{eq:main alg} avoids (strict) saddle points of Program \eqref{eq:main}. For example, suppose that Program \eqref{eq:main} does not have any non-strict saddle points. Under this assumption, if Algorithm \eqref{eq:main alg} converges, then the limit point is a local minimum, almost surely.


	%Then we may write the above algorithm as a dynamical system that satisfies
	%\begin{align}
	%z_{k+1}:=
	%\l[
	%\begin{array}{c}
	%u_{k+1}\\
	%y_{k+1}\\
	%\b_{k+1}\\
	%\g_{k+1}
	%\end{array}
	%\r]
	%& =
	%\l[
	%\begin{array}{c}
	%P_C (u_k- \g_k\nabla_u \L_k(u_k,y_k)\\
	%y_k+ \frac{L(u_{k+1})}{2\b_k}\\
	%b(u_k,y_k,\b_k,\g_k)\\
	%c(u_k,y_k,\b_k,\g_k)
	%\end{array}
	%\r].
	%\end{align}

	\section{Analysis}
	Let us use the shorthand $z=(u,y,\b,\g)$ and consider the dynamical system specified by
	\begin{align}
	z_{k+1}
	& :=
	\l[
	\begin{array}{c}
	P_C (u_k- \g_k\nabla_u \L(z_k))\\
	y_k+ \frac{L(u_{k})}{2\b_k}\\
	b(z_k)\\
	c(z_k)
	\end{array}
	\r]
	=: O
	\l(z_k
	\r)
	.
	\label{eq:sys}
	\end{align}
	It is easy to verify that any fixed point of the System \eqref{eq:sys} is a limit point of Algorithm \eqref{eq:main alg}. Moreover, when close enough to a stationary point (equivalently, fixed point) $z^$, the future iterates of Algorithm \ref{eq:main alg} are attracted (dispelled) from $z^$ if and only if the future iterates of System \eqref{eq:sys} are attracted (dispelled) from $z^*$. {\color{blue} Haven't verified the last claim.}
	%with the same starting point, the sequence produced by System \eqref{eq:sys} converges to a fixed point $z^$ if and only if the sequence produced by Algorithm \eqref{eq:main alg} converges to $z^$.
	It therefore suffices to study System \eqref{eq:sys} in what follows.

	\begin{lem}
	$O$ is an isomorphism in a neighborhood of $z^*$. That is, $O$ is continuous and has a continuous inverse. {\color{blue} I don't know if this claim is correct... For it to be correct, it might require $C$ to have bounded \emph{reach}. }
	\end{lem}



	Consider a fixed point $z^$ of System \eqref{eq:sys} and a point $z=z^+\d_z$ in its vicinity, where $\d_z=(\d_u,\d_y,\d_\b,\d_\g)$ and $\d_u\in T_C(u^*)$. We may then write that
	\begin{align}
	O(z^+\d_z) -O( z^) & = O(z+\d_z) - z^* \nonumber\\
	& =
	\l[
	\begin{array}{c}
	P_C\l(u^+\d_u- (\g^+\d_\g) \nabla_u \L(z^+\d_z ) \r)- u^\\
	\frac{D L(u^) \d_u }{2\b^} + \d_y - \frac{L(u^) \d_\b}{2(\b^)^2 } \\
	\langle \nabla b(z^*), \d_z \rangle\\
	\langle \nabla c(z^*) ,\d_z \rangle
	\end{array}
	\r].
	\label{eq:what happens}
	\end{align}
	To simplify the first entry of the vector above, let $P_{T_C(u^)}$ be the orthogonal projection onto the tangent cone of $C$ at $u^$, and note that {\color{blue} Is the first identity below correct?}
	\begin{align}
	& P_C(u^+\d_u- (\g^+\d_\g) \nabla_u \L(z^+\d_z )- u^\nonumber\\
	& = P_{T_C(u^)} (\d_u- (\g^+\d_\g) \nabla_u \L(z^*+\d_z) ) \nonumber\\
	& = \delta_u - (\g^+\d_\g) P_{T_C(u^)} ( \nabla_u \L(z^*+\d_z) )
	\qquad \l( \delta_u \in T_C(u^*) \r) \nonumber\\
	& = \delta_u - \g^* P_{T_C(u^)} \nabla_{u,u} \L(z^) \d_u
	- \g^* P_{T_C(u^)} \nabla_{u,u} \L(z^) \d_u
	- \g^* P_{T_C(u^)} \nabla_{u,y} \L(z^) \d_y
	- \g^* P_{T_C(u^)} \nabla_{u,\b} \L(z^) \d_\b \nonumber\\
	& \qquad
	- \delta_\g P_{T_C(u^)} \nabla_u \L(z^) \nonumber\\
	& = \delta_u - \g^* P_{T_C(u^)} \nabla_{u,u} \L(z^) \d_u
	- \g^* P_{T_C(u^)} \nabla_{u,u} \L(z^) \d_u
	- \g^* P_{T_C(u^)} \nabla_{u,y} \L(z^) \d_y
	- \g^* P_{T_C(u^)} \nabla_{u,\b} \L(z^) \d_\b
	\qquad \text{(see \eqref{eq:opt cnd})} \nonumber\\
	& = \l[
	\begin{array}{cccc}
	I_u - \g^* P_{T_C(u^)} \nabla_{u,u} \L(z^) & - \g^* P_{T_C(u^)} \nabla_{u,y} \L(z^) &
	- \g^* P_{T_C(u^)} \nabla_{u,\b} \L(z^)
	\end{array}
	\r]
	\cdot
	\l[
	\begin{array}{c}
	\d_u \\
	\d_y \\
	\d_\b\\
	\end{array}
	\r].
	\end{align}
	Substituting the above identity back into \eqref{eq:what happens}, we find that
	\begin{align}
	O(z^+\d_z) - O(z^) &
	=
	\l[
	\begin{array}{cccc}
	I_u - \g^* P_{T_C(u^)} \nabla_{u,u} \L(z^) & - \g^* P_{T_C(u^)} \nabla_{u,y} \L(z^) &
	- \g^* P_{T_C(u^)} \nabla_{u,\b} \L(z^) & 0 \\
	\frac{DL(u^)}{2\b^} & I_y& -\frac{L(u^)}{2(\b^)^2} & 0\\
	\nabla_u b(z^)^T & \nabla_y b(z^)^T & \nabla_\b b(z^)^T & \nabla_\g b(z^)^T\\
	\nabla_u c(z^)^T & \nabla_y c(z^)^T & \nabla_\b c(z^)^T & \nabla_\g c(z^)^T\\
	\end{array}
	\r]
	\cdot
	\d_z \nonumber\\
	& = DO(z^*) \cdot \d_z.
	\end{align}
	If $u^$ is a strict saddle point of Program \eqref{eq:main}, then there exists $\delta_u,\delta_u'\in T_C(u^)$ such that
	\begin{equation}
	\delta_u^T \nabla_{uu} \L(z^*) \delta_u >0,
	\qquad
	\delta'^T_u\nabla_{uu} \L(z^*) \delta_u' <0.
	\end{equation}
	Therefore, $DO(z^*)$ has eigenvalues outside of the unit circle as long as
	\begin{equation}
	\g^* \le \frac{1}{\\|P_{T_C(u^)} \nabla_{uu}\L(z^) P_{T_C(u^*)}\\|}.
	\end{equation}
	Let us now recall the stable manifold theorem, e.g., Theorem 3 in ``Behavior of accelerated gradient methods near critical points of non-convex problems''.
	\begin{thm}
	\textbf{\emph{(Stable manifold theorem)}} Suppose that $z^$ is a fixed point of $O$. Suppose also that $O$ is itself a diffeomorphism in a neighborhood of $z^$. Suppose lastly that $DO(z^)$ has an eigenvalue outside of the unit circle. Then the local stable manifold $\S_{z^} \ni z^$ has measure zero. In particular, there exists a neighborhood $B\ni z^$ such that, if $O^k(z)\in B$ for all $k$, then $z\in \S_{z^*}$. {\color{blue} I think we need a version of this theorem that holds for an isomorphism, namely, just continuous with a continuous inverse. Can we find one in the book ``Global stability of dynamical systems''? I have the book, if needed. }
	\end{thm}
	Consider the sequence $\{z_k\}$ generated by System \eqref{eq:sys}. Suppose that $\{z_k\}$ converges to $z^$. Then, for sufficiently large $k$, $z_k$ belongs to the stable manifold at $z^$. That is, $z_0\in O^{-k}(\S_{z^})$. Since $\S_{z^}$ has measure zero, so does $O^{-k}(\S_{z_k})$. That is, almost surely, System \eqref{eq:sys} (or equivalently Algorithm \eqref{eq:main alg}) does not converge to the strict saddle point $z^*$.




	\end{document}

Escaping saddle points.texNo OneTemporaryActions

File Metadata

Escaping saddle points.texView Options

Event Timeline

Escaping saddle points.tex
No OneTemporary
Actions

Escaping saddle points.tex
View Options