Page MenuHomec4science

preliminaries.tex
No OneTemporary

File Metadata

Created
Wed, May 22, 00:33

preliminaries.tex

\section{Preliminaries}\label{sec:preliminaries}
\paragraph{\textbf{\emph{Notation.}}}
We use the notations $\langle\cdot ,\cdot \rangle $ and $\|\cdot\|$ for the standard inner product and norm on $\RR^d$, respectively. The gradient of a differentiable function $f:\RR^d\rightarrow\RR$ at $x$ is denoted by $\nabla f(x)$. For a differentiable map $A:\RR^d\rightarrow\RR^m$, $\D A(x)$ denotes its Jacobian at $x$.
For a convex function $g:\RR^d\rightarrow\RR$, the subdifferential at $x$ is denoted by $\partial g(x)$ and the proximal operator $P_g:\RR^d\rightarrow\RR^d$ takes $x$ to
\begin{align}
P_g(x) = \underset{y}{\argmin} \, g(y) + \frac{1}{2}\|x-y\|^2.
\label{eq:defn of prox}
\end{align}
In addition, if $g=1_C$ is the indicator function of a convex set or cone, we use the simpler notation $P_C$, instead of $P_{1_C}$, to denote the orthogonal projection onto $C$.
Throughout, $g^*$ and $\partial g^*$ will denote the Fenchel conjugate of $g$ and its subdifferential, respectively. For a cone $C$, we denote its polar by $C^*$, namely,
\begin{align}
C^* = \{x: \langle x,x'\rangle \le 0,\,\, \forall x' \in C \}.
\end{align}
An integer interval is denoted by $[k_0:k_1]=\{k_0,\ldots,k_1\}$ for integers $k_0 \le k_1$. For matrices, $\|\cdot\|$ and $\|\cdot\|_F$ denote the spectral and Frobenius norms, respectively.
\paragraph{\textbf{\emph{Necessary optimality conditions.}}}\label{sec:opt cnds}
{First-order necessary optimality conditions} for \eqref{prob:01} are well-studied~\cite{rockafellar2009variational}. {Indeed, $x$ is a (first-order) stationary point of \eqref{prob:01} if there exists $y\in \RR^m$ for which
\begin{align}
\begin{cases}
-\nabla f(x) - \D A(x)^\top y \in \partial g(x)\\
A(x) = 0.
\end{cases}
\label{e:inclu1}
\end{align}
Recalling \eqref{eq:Lagrangian}, we observe that \eqref{e:inclu1} is equivalent to
\begin{align}
\begin{cases}
-\nabla_x \mathcal{L}_\beta(x,y) \in \partial g(x)\\
A(x) = 0,
\end{cases}
\label{e:inclu2}
\end{align}
which is in turn the first-order optimality condition for \eqref{eq:minmax}. } \notea{have to add 2nd optimality?}
\paragraph{\emph{\textbf{Technical lemmas.}}} The following standard results and notions are frequently used throughout this paper and proved in the appendix for completeness. The first result below shows that the augmented Lagrangian is smooth, see Appendix~\ref{sec:proof of smoothness lemma} for the proof.
\begin{lemma}[Smoothness]\label{lem:smoothness}
For fixed $y\in \RR^m$ and $\b,\rho,\rho'\ge 0$, it holds that
\begin{align}
\| \nabla_x \mathcal{L}_{\beta}(x, y)- \nabla_x \mathcal{L}_{\beta}(x', y) \| \le \lambda_\b \|x-x'\|,
\quad \forall x,x'\in \X_{\rho,\rho'},
\end{align}
where
\begin{align}
\X_{\rho,\rho'} := \{ x'': \|A(x'')\|\le \rho, \|x''\|\le \rho'\} \subset \RR^d,
\end{align}
\begin{align}
\lambda_\beta
& := \lambda_f + \sqrt{m}\lambda_A \left(\|y\| + \b\rho \right) + \b d \lambda'^2_A,
\label{eq:smoothness of Lagrangian}
\end{align}
\begin{align}
\lambda'_A := \max_{\|x\|\le \rho'}\|\D A(x)\|,
\end{align}
and $\lambda_f,\lambda_A$ were defined in \eqref{eq:smoothness basic}.
\end{lemma}
Gradient mapping~\notea{what to cite?}, defined below, plays an important role in our convergence analysis.
\begin{definition}\label{def:grad map}\textbf{(Gradient mapping)} Given $y\in \RR^m$ and $\gamma >0$, the gradient mapping $G_{\beta,\gamma}(\cdot, y): \RR^d\rightarrow\RR^d$ takes $x\in \RR^d$ to
\begin{equation}
G_{\b,\gamma}(x,y) = \frac{x-x^+}{\gamma},
\label{eq:grad map}
\end{equation}
where $x^+=P_{g}(x-\gamma \nabla_x \mathcal{L}_{\beta}(x,y))$.
\end{definition}
As the name suggests, if in particular we set $g\equiv 0$ in \eqref{prob:01}, the proximal operator reduces to the identity and the gradient mapping reduces to $G_{\beta,\gamma}(x,y)=\nabla_x \mathcal{L}_\beta(x,y)$. For a sufficiently small step size $\g$, the gradient mapping controls the descent in the objective function of \eqref{eq:minmax}. The following result is standard \cite[Lemma 3.2, Remark 3.2(i)]{bolte2014proximal}, but the proof is given in Appendix \ref{sec:proof of descent lemma} for completeness.
\begin{lemma}[Descent lemma]\label{lem:11}
For $x\in \RR^d$ and $y\in \RR^m$, let $x^+ = P_g(x - \g \nabla_x \L_\b(x,y) )$, where $\g< 1/\lambda_\b$. For $\rho,\rho'\ge 0$, suppose that
\begin{align}
x,x^+ \in \X_{\rho,\rho'}= \{ x': \|A(x')\| \le \rho, \|x'\| \le \rho' \}.
\end{align}
Then it holds that
\begin{equation}
\label{e:deslem}
\| G_{\beta,\gamma}(x,y)\|^{2}\leq \frac{2}{\gamma} (\mathcal{L}_\beta(x,y) + g (x)- \mathcal{L}_\beta(x^+,y)- g(x^+) ).
\end{equation}
\end{lemma}
In {practice}, determining the step size $\gamma$ by computing the right-hand side of \eqref{eq:smoothness of Lagrangian} is infeasible, since $\lambda_f,\lambda_A,\lambda'_A$ are often unknown. Instead, we can resort to the line search technique, reviewed below and proved in Appendix~\ref{sec:proof of linesearch}.
\begin{lemma}[Line search] \label{lem:eval Lipsc cte} Fix $\theta \in (0,1)$ and ${\gamma}_0>0$. For $\gamma'>0$, let
\begin{align}
x^+_{\gamma'} = P_g(x - \gamma' \nabla_x \mathcal{L}_\beta(x,y)),
\label{eq:defn of x+gamma}
\end{align}
and define
\begin{align}
\gamma & :=
\max \Big\{
\gamma' ={\gamma}_0 \theta^i,\ i\in\{0,1,\ldots\} :
\mathcal{L}_\beta (x^+_{\gamma'},y) \nonumber\\
& \qquad \qquad \quad
\le \mathcal{L}_\beta (x,y) +
\left\langle x^+_{\gamma'} - x , \nabla_x \mathcal{L}_{\beta}(x,y) \right\rangle
+ \frac{1}{2\gamma'} \| x^+_{\gamma'} - x\|^2
\Big\}.
\label{eq:defn of gamma line search}
\end{align}
Then, \eqref{e:deslem} holds and, moreover, we have that
\begin{align}
\gamma \ge \frac{\theta}{\lambda_\beta}.
\label{eq:moreover}
\end{align}
\end{lemma}

Event Timeline