
\section{ADMM}
To solve the equivalent formulation presented in Program \eqref{eq:minmax}, we first propose LinAL, a linearized augmented Lagrangian algorithm, detailed in Algorithm~\ref{Algo:2}.
\begin{algorithm}
\label{Algo:2}
Input: Parameters $\beta_1,\s_1,\rho,\rho',\rho'',\tau> 0$, initialization $x_{1}\in \RR^d$ with $\|A(x_1)\|\le \rho$ and $\|x_1\|\le \rho'$, dual initialization $y_0\in \RR^m$.
\\
For $k=1,2,\ldots$ and until convergence, execute\\
\begin{enumerate}
\item \textbf{(Update penalty weight)} Set
\begin{align}
\beta_k = \frac{\beta_{1}}{\log 2}\sqrt{k \log^2(k+1)} .
\end{align}
\item \textbf{(Line search)} Use the line search procedure in \eqref{eq:defn of gamma line search} with $x=x_k,y=y_k,\b=\b_k$ and let $\g_k = \g$ be the output.\\
\item \textbf{(Primal descent step)} Set $ x_{k+1} = P_{g }(x_{k} - \gamma_{k} \nabla \mathcal{L}_{\beta_{k}}(x_{k},y_{k}))$, where $\L_{\b_k}$ is the augmented Lagrangian and $P_g$ denotes the proximal operator, defined in (\ref{eq:Lagrangian},\ref{eq:defn of prox}), respectively.\\
\item \textbf{(Control)} If necessary, modify $x_{k+1}$ to ensure that $\|x_{k+1}\|\le \rho'$ or $\|x_{k+1}-x_k\|\le \rho''$.\\
\item \textbf{(Stopping criterion)} If $\g_k \|G_{\b_k,\g_k}(x_k,y_k)\|^2 + \s_k \|A(x_k)\|^2 \le \tau$, then quit and return $x_{k+1}$ as an approximate minimizer of Program \eqref{prob:01}. See \eqref{eq:grad map} for the definition of the gradient mapping. \\
\item \textbf{(Update dual step size)} Set
\begin{align}
\s_{k+1} = \s_{1} \min\left( {\frac{1}{\sqrt{k}}}, \frac{\|A(x_1)\| \log^2 2 }{\|A(x_k)\|k\log^2(k+1)} \right).
\end{align}\\
\item \textbf{(Dual ascent step)} Set $y_{k+1} = y_{k} + \sigma_{k+1}A(x_{k+1})$.
\end{enumerate}
\caption{LinAL for solving Program \eqref{prob:01}}
\end{algorithm}
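For concreteness, the following Python snippet is a minimal sketch of the loop above, assuming NumPy and omitting the control step; \texttt{grad\_lagrangian}, \texttt{prox\_g}, and \texttt{line\_search} are hypothetical placeholders for the maps defined in (\ref{eq:Lagrangian},\ref{eq:defn of prox},\ref{eq:defn of gamma line search}), and the gradient mapping is instantiated as $(x_k-x_{k+1})/\g_k$, a common convention; the precise definition is in \eqref{eq:grad map}.
\begin{verbatim}
import numpy as np

def linal(x, y, A, grad_lagrangian, prox_g, line_search,
          beta1, sigma1, tau, max_iter=1000):
    # Minimal sketch of LinAL; the oracle arguments are hypothetical
    # placeholders for the quantities defined in the text.
    sigma = sigma1
    normAx1 = np.linalg.norm(A(x))  # ||A(x_1)||, reused in step 6
    for k in range(1, max_iter + 1):
        # Step 1: penalty weight beta_k = (beta_1/log 2) sqrt(k) log(k+1).
        beta = beta1 / np.log(2) * np.sqrt(k) * np.log(k + 1)
        # Step 2: line search for the primal step size gamma_k.
        gamma = line_search(x, y, beta)
        # Step 3: linearized primal step, x_{k+1} = P_g(x_k - gamma grad L).
        x_new = prox_g(x - gamma * grad_lagrangian(x, y, beta), gamma)
        # Step 5: stopping criterion via the gradient mapping.
        G = (x - x_new) / gamma
        if (gamma * np.linalg.norm(G) ** 2
                + sigma * np.linalg.norm(A(x)) ** 2) <= tau:
            return x_new
        # Step 6: dual step size sigma_{k+1}; guard against ||A(x_k)|| = 0.
        normAx = max(np.linalg.norm(A(x)), 1e-12)
        sigma = sigma1 * min(1 / np.sqrt(k),
                             normAx1 * np.log(2) ** 2
                             / (normAx * k * np.log(k + 1) ** 2))
        # Step 7: dual ascent, y_{k+1} = y_k + sigma_{k+1} A(x_{k+1}).
        y = y + sigma * A(x_new)
        x = x_new
    return x
\end{verbatim}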
Slater's condition plays a key role in convex optimization, and we require a similar condition for the success of our algorithm.
\begin{definition}\label{defn:nonconvex slater} \textbf{(Nonconvex Slater's condition)}
Consider an interval $K=[k_0:k_1]$ and a sequence $\{x_k\}_{k=k_0}^{k_1}\subset \RR^d$. Let $S_K\subset \RR^d$ be a subspace such that
\begin{align}
S _{K} \supseteq \bigcup_{k\in K} P_{\partial g^*(x_{k+1})}\left( DA(x_{k+1})^\top A(x_{k+1}) \right),
\label{eq:defn of S defn}
\end{align}
and, to simplify the notation, let $S_{K}$ also denote an orthonormal basis for this subspace.
For $\rho,\rho'>0$, we set
\begin{align}
\nu_A :=
\begin{cases}
\underset{v,x}{\min} \, \frac{\left\| S_{K}^\top P_{\partial g^*(x)} ( DA(x)^\top v) \right\|}{\|v\|} \\
\|v\|\le \rho\\
\|x\|\le \rho',
\end{cases}
\label{eq:new slater defn}
\end{align}
and say that Program (\ref{prob:01}) satisfies the nonconvex Slater's condition if $\nu_A >0$.
\end{definition}
To gain some insight into Definition \ref{defn:nonconvex slater}, suppose that $f:\RR^d\rightarrow\RR$ is convex. Consider a nonempty and bounded convex set $C\subseteq\RR^d$ and let us restrict ourselves to the choice $g = 1_C:\RR^d\rightarrow\RR$, which takes $x\in \RR^d$ to
\begin{align}
g(x) = 1_C(x) =
\begin{cases}
0 & x \in C\\
\infty & x\notin C.
\end{cases}
\label{eq:indicator}
\end{align}
We also let $T_C(x)$ denote the tangent cone to $C$ at $x$, and let $P_{T_C(x)}:\RR^d\rightarrow\RR^d$ denote the orthogonal projection onto this tangent cone; note that, for this choice of $g$, the proximal operator $P_g$ reduces to the orthogonal projection onto $C$.
To simplify the discussion, we also set $S_K = \RR^d$. Then, \eqref{eq:new slater defn} can be written as
\begin{align}
\nu_A :=
\begin{cases}
\underset{v,x}{\min} \, \frac{\left\| P_{T_C(x)} A^\top v \right\|}{\|v\|} \\
\|v\|\le \rho\\
\|x\|\le \rho'
\end{cases}
= \begin{cases}
\underset{x}{\min} \, \eta_{m}\left( P_{T_C(x)} A^\top \right) \\
\|x\|\le \rho',
\end{cases}
\label{eq:nu for cvx}
\end{align}
where $\eta_{m}$ returns the $m$th largest singular value of its input. The nonconvex Slater condition here, namely, $\nu_A>0$, is closely related to the standard Slater's condition in convex optimization.
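As a minimal numerical illustration of \eqref{eq:nu for cvx}, assuming NumPy, the hypothetical helper below evaluates $\eta_m(P_S A^\top)$ in the special case where the tangent cone is a subspace $S$, so that the projection is linear. At an interior point of $C$ we have $T_C(x)=\RR^d$ and the quantity reduces to the smallest of the $m$ singular values of $A^\top$, which is positive for a generic $A$; projecting onto a subspace of dimension less than $m$ instead drives it, and hence $\nu_A$, to zero.
\begin{verbatim}
import numpy as np

def eta_m(A, S=None):
    # m-th largest singular value of P_S A^T, for A of shape (m, d) and S
    # a (d, r) matrix with orthonormal columns spanning a subspace tangent
    # cone; S = None encodes T_C(x) = R^d, e.g. at an interior point of C.
    M = A.T if S is None else S @ (S.T @ A.T)
    return np.linalg.svd(M, compute_uv=False)[A.shape[0] - 1]

rng = np.random.default_rng(0)
A = rng.standard_normal((3, 10))                   # generic, m = 3, d = 10
print(eta_m(A))                                    # positive: nu_A > 0
S = np.linalg.qr(rng.standard_normal((10, 2)))[0]  # 2-dim subspace of R^10
print(eta_m(A, S))                                 # ~0: rank(P_S A^T) <= 2
\end{verbatim}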
\begin{proposition}\label{prop:convex}
In Program (\ref{prob:01}), suppose that $f$ is a convex function, $g$ is as in (\ref{eq:indicator}), and $A:\RR^d\rightarrow\RR^m$ is a linear operator, represented by the matrix $A\in \RR^{m\times d}$. Assume also that Program (\ref{prob:01}) is feasible, namely, that there exists $x\in C$ such that $Ax=0$. In (\ref{eq:nu for cvx}), let us take $S_K =\RR^d$. Then the standard Slater's condition holds if the nonconvex Slater's condition ($\nu_A>0$) holds.
\end{proposition}
\begin{proof}
Suppose that the standard Slater's condition does not hold, namely, that
\begin{equation}
\Null(A)\cap \relint(C) = \emptyset,
\label{eq:no feas}
\end{equation}
where $\Null(A)$ and $\relint(C)$ denote the null space of the matrix $A$ and the relative interior of $C$, respectively.
By assumption, there exists $x_0\in C$ such that $Ax_0=0$. It follows from \eqref{eq:no feas} that $x_0\in \text{boundary}(C)$ and that $\Null(A)$ supports $C$ at $x_0$, namely, there exists a nonzero vector $a\in\row(A)$ such that
$\langle a,x \rangle \ge \langle a,x_0\rangle = 0$, for every $x\in C$,
where $\row(A)$ denotes the row space of the matrix $A$.
Consequently, $a$ belongs to the normal cone $N_C(x_0)$ of the set $C$ at $x_0$, so that
$\row(A)\cap N_C(x_0) \ne \{0\}$. That is, there exists a unit-norm vector $v$ such that $A^\top v\in N_C(x_0)$ and, by the Moreau decomposition,
$P_{T_C(x_0)}A^\top v=0$,
and, consequently,
$
\eta_{m}(P_{T_C(x_0)}A^\top) = 0
$.
Let us take $\rho'=\|x_0\|$ in \eqref{eq:nu for cvx}. We then conclude that $\nu_A=0$, namely, the nonconvex Slater's condition also does not hold, thereby completing the proof of Proposition \ref{prop:convex}.\hfill \qed
\end{proof}
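As a concrete illustration of Proposition~\ref{prop:convex}, take $d=2$, $m=1$, $A=[1 \;\, 0]$, and let $C$ be the unit ball centered at $(1,0)$. The only feasible point is $x_0=(0,0)$, which lies on the boundary of $C$, so the standard Slater's condition fails. At $x_0$, the tangent cone is the halfspace $T_C(x_0)=\{v\in\RR^2: v_1\ge 0\}$ and, taking $v=-1$, we find $P_{T_C(x_0)}A^\top v = P_{T_C(x_0)}\big((-1,0)\big) = 0$, so that $\nu_A=0$ in \eqref{eq:nu for cvx}, in agreement with the contrapositive of Proposition~\ref{prop:convex}.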
The nonconvex Slater's condition is necessary for the success of Algorithm~\ref{Algo:2}.
\begin{theorem}[LinAL]
\label{thm:main}
For a sufficiently large $k_0$, consider an integer interval $K=[k_0:k_1]$, and suppose that the nonconvex Slater's condition (Definition~\ref{defn:nonconvex slater}; see also Lemma~\ref{lem:bnd bnd Ak}) is in force. Suppose also that
\begin{align}
\inf_{k\ge k_0} f(x_k) + g(x_k) + \langle A(x_k) ,y_{k_0-1}\rangle
>- \infty,
\end{align}
and that $\rho$ is large enough such that (\ref{eq:suff cnd on rho final}) holds. For $\rho'>0$, in addition to the strong smoothness of $f$ and $A$ quantified in (\ref{eq:smoothness basic}), let us define
\begin{align}
\lambda'_f = \max_{\|x\|\le \rho'} \|\nabla f(x)\|,
\qquad
\lambda'_A = \max_{\|x\| \le \rho'} \|DA(x)\|,
\end{align}
to be the (restricted) Lipschitz constants of $f$ and $A$, respectively.
Moreover, suppose that
\begin{align}
\rho'' \le \frac{2\nu_A}{\lambda_A}.
\end{align}
Then the output of LinAL satisfies
\begin{align}
\min_{k\in K}\overline{\g} \|G_k\|^2 \sqrt{\frac{k_0 \log^2(k_0+1)}{k_1\log^2(k_1+1)}} + \|A(x_k)\|^2
\lesssim \frac{1}{k_1-k_0},
\end{align}
where $\overline{\gamma}$ is independent of $k_1$ and $\lesssim$ suppresses the dependence on all parameters except $k_1$. The exact expressions in terms of $\lambda_f,\lambda'_f,\lambda_A,\lambda'_A, \b_{k_0},\s_{k_0}, \nu_A,y_0$ are found in (\ref{eq:low bnd on gammas},\ref{eq:exact dependences}).
\end{theorem}
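To interpret Theorem~\ref{thm:main}, note that for $k_1=2k_0$ the factor $\sqrt{k_0\log^2(k_0+1)/(k_1\log^2(k_1+1))}$ is bounded below by a universal constant, so the guarantee loosely reads
\begin{align*}
\min_{k\in K}\, \overline{\g} \|G_k\|^2 + \|A(x_k)\|^2
\lesssim \frac{1}{k_0}.
\end{align*}
That is, roughly $1/\epsilon$ iterations of LinAL suffice to reach an iterate at which both the gradient mapping and the feasibility gap are small, in the sense that $\overline{\g}\|G_k\|^2+\|A(x_k)\|^2\le \epsilon$.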
