
\section{Linearized Alternating Direction Method of Multipliers}
In convex optimization, whenever applicable, the Alternating Direction Method of Multipliers (ADMM)
\cite{Glowinski1975,Gabay1976,Boyd2011} often outperforms the augmented Lagrangian method and typically handles non-smooth terms more efficiently.
In light of the success of ADMM in convex optimization, in this section we develop and study a (linearized) ADMM variant of Algorithm~\ref{Algo:2}.
More specifically, consider the program
\begin{align}
\begin{cases}
\underset{x,z}{\min} \,\, f(x)+g(x)+h(z)+l(z)\\
A(x)+B(z) = 0,
\end{cases}
\label{eq:admm 2}
\end{align}
where $f,h:\RR^d\rightarrow\RR$ and $A,B:\RR^d\rightarrow\RR^m$ are smooth in the sense described later in this section.
%\begin{align*}
%\| \nabla f(x) - \nabla f(x') \| \le \lambda_f \|x-x'\|,
%\,\,
%\| \D A(x) - \D A(x') \| \le \lambda_A \|x-x'\|,
%\end{align*}
%\begin{align}
%\| \nabla h(z) - \nabla h(z') \| \le \lambda_h \|z-z'\|,
%\,\,
%\| \D B(z) - \D B(z') \| \le \lambda_B \|z-z'\|,
%\label{eq:smoothness basics admm}
%\end{align}
%for every $x,x',z,z'\in \RR^d$.
Above, $g,l:\RR^d\rightarrow\RR$ are proximal-friendly convex functions which might not be differentiable.
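As a rough illustration of proximal friendliness (purely indicative, not part of the formal development), two common choices of $g$ or $l$ admit closed-form proximal operators: a scaled $\ell_1$-norm, whose proximal operator is entrywise soft-thresholding, and the indicator of a Euclidean ball, whose proximal operator is the projection onto that ball. A minimal NumPy sketch with hypothetical helper names reads:
\begin{verbatim}
import numpy as np

def prox_l1(v, t, lam=1.0):
    # prox of t * lam * ||.||_1 at v: entrywise soft-thresholding
    return np.sign(v) * np.maximum(np.abs(v) - t * lam, 0.0)

def prox_ball(v, t, radius=1.0):
    # prox of the indicator of the Euclidean ball of the given radius,
    # i.e., projection onto the ball (independent of the step size t)
    nrm = np.linalg.norm(v)
    return v if nrm <= radius else (radius / nrm) * v
\end{verbatim}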
%For example, when $C,D\subset \RR^d$ and $g=1_C, l=1_D$ are the indicator functions on $C,D$, respectively, Program~\eqref{eq:admm 2} becomes
%\begin{align}
%\begin{cases}
%\underset{x,z}{\min} \,\, f(x)+h(z)\\
%A(x)+B(z) = 0\\
%x\in C,\qquad
%z\in D.
%\end{cases}
%\label{eq:admm 2}
%\end{align}
For penalty weight $\b\ge 0$, the augmented Lagrangian corresponding to problem~\eqref{eq:admm 2} is
\begin{align}
\L_{\b}(x,z,y) & = f(x)+h(z)
+ \langle A(x)+B(z),y \rangle + \frac{\b}{2}\| A(x)+B(z) \|^2,
\label{eq:lagrangian admm}
\end{align}
and problem \eqref{eq:admm 2} is therefore equivalent to the minimax program
\begin{align}
\underset{x,z}{\min}\, \underset{y}{\max}\,\, \L_\b(x,z,y)+g(x)+l(z).
\label{eq:admm main}
\end{align}
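For later reference, differentiating \eqref{eq:lagrangian admm} with respect to the primal variables gives
\begin{align*}
\nabla_x \L_\b(x,z,y) & = \nabla f(x) + \D A(x)^\top \left( y + \b \left( A(x)+B(z) \right) \right), \\
\nabla_z \L_\b(x,z,y) & = \nabla h(z) + \D B(z)^\top \left( y + \b \left( A(x)+B(z) \right) \right),
\end{align*}
where $\D A(x)$ and $\D B(z)$ denote the Jacobians of $A$ and $B$; the non-smooth terms $g$ and $l$ enter the updates below only through their proximal operators.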
To solve the equivalent formulation in \eqref{eq:admm main}, we propose the linearized ADMM in Algorithm~\ref{Algo:3}. Most of the remarks about Algorithm~\ref{Algo:2} apply to Algorithm~\ref{Algo:3} as well; in particular, note that Algorithm~\ref{Algo:3} performs two consecutive primal updates, first on $x$ and then on $z$.
To parse the details of Algorithm~\ref{Algo:3}, we need to slightly change the gradient map in Definition~\ref{def:grad map} and the line search procedure in Lemma~\ref{lem:eval Lipsc cte} to match the new augmented Lagrangian $\L_\b$ in \eqref{eq:lagrangian admm}. More specifically, the corresponding gradient maps are defined as
\begin{equation}
G_{\b,\gamma}(x,z,y) = \frac{x-x^+}{\gamma},
\qquad
H_{\b,\iota}(x,z,y) = \frac{z-z^+}{\iota},
\label{eq:grad map admm}
\end{equation}
where
\begin{align}
x^+=P_{g}(x-\gamma \nabla_x \mathcal{L}_ {\beta}(x,z,y)),
\qquad
z^+=P_{l}(z-\iota \nabla_z \mathcal{L}_ {\beta}(x,z,y)),
\end{align}
and $\g,\iota>0$ are the primal step sizes. Above, $P_g$ and $P_l$ are the corresponding proximal operators.
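A minimal sketch of the updates in \eqref{eq:grad map admm}, assuming NumPy arrays and hypothetical callables \texttt{grad\_x\_L}, \texttt{grad\_z\_L}, \texttt{prox\_g}, \texttt{prox\_l} standing for $\nabla_x\L_\b$, $\nabla_z\L_\b$, $P_g$ and $P_l$ (these names are not part of the paper), could read:
\begin{verbatim}
def gradient_maps(x, z, y, gamma, iota, grad_x_L, grad_z_L, prox_g, prox_l):
    # x^+ = P_g(x - gamma * grad_x L_beta(x,z,y)), and similarly for z^+
    x_plus = prox_g(x - gamma * grad_x_L(x, z, y), gamma)
    z_plus = prox_l(z - iota * grad_z_L(x, z, y), iota)
    # gradient maps G_{beta,gamma}(x,z,y) and H_{beta,iota}(x,z,y)
    G = (x - x_plus) / gamma
    H = (z - z_plus) / iota
    return x_plus, z_plus, G, H
\end{verbatim}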
The line search procedure is also similar to that in Lemma~\ref{lem:eval Lipsc cte}: we set
\begin{align*}
x^+_{\gamma'} = P_g(x - \gamma' \nabla_x \mathcal{L}_\beta(x,z,y)),
\qquad
z^+_{\iota'} = P_l(z - \iota' \nabla_z \mathcal{L}_\beta(x,z,y)),
\end{align*}
and define
\begin{align}
\gamma & :=
\max \Big\{
\gamma' ={\gamma}_0 \theta^i :
\mathcal{L}_\beta (x^+_{\gamma'},z,y) \nonumber\\
& \qquad \qquad
\le \mathcal{L}_\beta (x,z,y) +
\left\langle x^+_{\gamma'} - x , \nabla_x \mathcal{L}_{\beta}(x,z,y) \right\rangle
+ \frac{1}{2\gamma'} \| x^+_{\gamma'} - x\|^2
\Big\},
\label{eq:line search x admm}
\end{align}
\begin{align}
\iota & :=
\max \Big\{
\iota' ={\iota}_0 \theta^i :
\mathcal{L}_\beta (x,z^+_{\iota'},y) \nonumber\\
& \qquad \qquad
\le \mathcal{L}_\beta (x,z,y) +
\left\langle z^+_{\iota'} - z , \nabla_z \mathcal{L}_{\beta}(x,z,y) \right\rangle
+ \frac{1}{2\iota'} \| z^+_{\iota'} - z\|^2
\Big\}.
\label{eq:defn of gamma line search admm}
\end{align}
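A minimal backtracking sketch of the rule \eqref{eq:line search x admm} follows; the search in \eqref{eq:defn of gamma line search admm} is identical with the roles of $x$ and $z$ exchanged. The callables \texttt{L}, \texttt{grad\_x\_L} and \texttt{prox\_g}, standing for $\L_\b$, $\nabla_x\L_\b$ and $P_g$, are hypothetical, and the inputs are assumed to be NumPy arrays.
\begin{verbatim}
def line_search_x(x, z, y, L, grad_x_L, prox_g,
                  gamma0=1.0, theta=0.5, max_tries=50):
    # largest gamma' = gamma0 * theta^i with
    # L(x+,z,y) <= L(x,z,y) + <x+ - x, grad_x L(x,z,y)> + ||x+ - x||^2/(2 gamma')
    g = grad_x_L(x, z, y)
    base = L(x, z, y)
    gamma = gamma0
    for _ in range(max_tries):
        x_plus = prox_g(x - gamma * g, gamma)
        d = x_plus - x
        if L(x_plus, z, y) <= base + g @ d + (d @ d) / (2.0 * gamma):
            break
        gamma *= theta
    return gamma, x_plus
\end{verbatim}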
The analysis of Algorithm~\ref{Algo:3} is similar to that of Algorithm~\ref{Algo:2} and involves an analogous version of Lemma~\ref{lem:bnd bnd Ak}. The convergence rate of Algorithm~\ref{Algo:3} is detailed below and proved in Appendix~\ref{sec:ADMM theory}. To present the result, let us introduce some additional notation. We let
\begin{align}
u=\left[
\begin{array}{cc}
x^\top & z^\top
\end{array}
\right]^\top,
\qquad
p(u) = f(x)+h(z),
\nonumber\\
q(u) = g(x)+l(z),
\qquad
D(u) = A(x)+B(z),
\label{eq:new-notation-admm}
\end{align}
for short and assume that $p,D$ are smooth in the sense that
\begin{align}
\| \nabla p(u) - \nabla p(u') \| \le \lambda_p \|u-u'\|,
\nonumber\\
\| \D D(u) - \D D(u') \| \le \lambda_D \|u-u'\|,
\label{eq:smoothness basics admm}
\end{align}
for every $u,u'\in \RR^{2d}$.
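If, for instance, $f,h,A,B$ are themselves smooth with respective constants $\lambda_f,\lambda_h,\lambda_A,\lambda_B$, then one may take $\lambda_p=\max(\lambda_f,\lambda_h)$ and $\lambda_D=\max(\lambda_A,\lambda_B)$ in \eqref{eq:smoothness basics admm}. Indeed,
\begin{align*}
\| \nabla p(u) - \nabla p(u') \|^2 = \| \nabla f(x) - \nabla f(x') \|^2 + \| \nabla h(z) - \nabla h(z') \|^2
\le \max(\lambda_f,\lambda_h)^2 \, \|u-u'\|^2,
\end{align*}
and an analogous block-wise estimate bounds $\| \D D(u) - \D D(u') \|$.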
\begin{algorithm}
\label{Algo:3}
Input: Parameters $\beta_1,\s_1,\rho,\tau> 0$, primal initialization $x_{1},z_1\in \RR^d$ with $\|A(x_1)+B(z_1)\|\le \rho$, dual initialization $y_1\in \RR^m$.
\\
For $k=1,2,\ldots$, execute\\
\begin{enumerate}
\item \textbf{(Update penalty weight)}
$
\beta_k \leftarrow \beta_{1} \sqrt{k} \log(k+1)/\log 2.
$
\item \textbf{(Line search in $x$)} Use the line search procedure in \eqref{eq:line search x admm} by replacing $x=x_k,z=z_k,y=y_k,\b=\b_k$ and let $\g_k \leftarrow \g$ be the output.\\
\item \textbf{(Descent step in $x$)} $ x_{k+1} \leftarrow P_{g }(x_{k} - \gamma_{k} \nabla_x \mathcal{L}_{\beta_{k}}(x_{k},z_k,y_{k}))$, where $\L_{\b_k}$ is the augmented Lagrangian in \eqref{eq:lagrangian admm} and $P_g$ denotes the proximal operator defined in \eqref{eq:defn of prox}.\\
\item \textbf{(Line search in $z$)} Use the line search procedure in \eqref{eq:defn of gamma line search admm} by replacing $x=x_{k+1},z=z_k,y=y_k,\b=\b_k$ and let $\iota_k \leftarrow \iota$ be the output.\\
\item \textbf{(Descent step in $z$)} $ z_{k+1} \leftarrow P_{l }(z_{k} - \iota_{k} \nabla_z \mathcal{L}_{\beta_{k}}(x_{k+1},z_k,y_{k}))$, where $P_l$ denotes the proximal operator for $l$.\\
%
%\item \textbf{(Control)} Properly modify $x_{k+1},z_{k+1}$ to ensure that $\|x_{k+1}\|,\|z_{k+1}\|\le \rho'$ or $\|x_{k+1}-x_k\|\le \rho''$.\\
\item \textbf{(Stopping criterion)} If $\g_k \|G_{\b_k,\g_k}(x_k,z_k,y_k)\|^2+ \iota_k \|H_{\b_k,\iota_k}(x_{k+1},z_k,y_k)\|^2 + \s_k \|A(x_k)+B(z_k)\|^2 \le \tau$, then quit and return $x_{k+1},z_{k+1}$.
% as approximate minimizers of Program \eqref{prob:01}.
See \eqref{eq:grad map admm} for the definition of the gradient mappings. \\
\item \textbf{(Update dual step size)}
$\s_{k+1} \leftarrow \s_{1} \min\left( {\frac{1}{\sqrt{k+1}}}, \frac{\|A(x_1)+B(z_1)\|}{\|A(x_{k+1})+B(z_{k+1})\|}\cdot \frac{ \log^2 2 }{(k+1)\log^2(k+2)} \right).
$\\
\item \textbf{(Dual ascent step)} $y_{k+1} \leftarrow y_{k} + \sigma_{k+1}(A(x_{k+1})+B(z_{k+1}))$.
\end{enumerate}
\caption{Linearized ADMM for solving problem \eqref{eq:admm main}}
\end{algorithm}
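For concreteness, a compact and purely indicative NumPy sketch of the main loop of Algorithm~\ref{Algo:3} is given below. It assumes hypothetical callables \texttt{L}, \texttt{grad\_x\_L}, \texttt{grad\_z\_L} for the augmented Lagrangian \eqref{eq:lagrangian admm} and its partial gradients (each taking $\b$ as a final argument), proximal operators \texttt{prox\_g}, \texttt{prox\_l}, and the maps \texttt{A}, \texttt{B}; none of these names come from the paper.
\begin{verbatim}
import numpy as np

def backtrack(v, grad_v, L_of_v, prox, step0=1.0, theta=0.5, max_tries=50):
    # largest step = step0 * theta^i satisfying the descent condition
    # used in the line searches of Algorithm 3
    base, step = L_of_v(v), step0
    for _ in range(max_tries):
        v_plus = prox(v - step * grad_v, step)
        d = v_plus - v
        if L_of_v(v_plus) <= base + grad_v @ d + (d @ d) / (2.0 * step):
            break
        step *= theta
    return step, v_plus

def linearized_admm(x, z, y, L, grad_x_L, grad_z_L, prox_g, prox_l, A, B,
                    beta1=1.0, sigma1=1.0, tau=1e-6, max_iter=1000):
    feas0 = np.linalg.norm(A(x) + B(z))          # ||A(x_1) + B(z_1)||
    sigma = sigma1
    for k in range(1, max_iter + 1):
        beta = beta1 * np.sqrt(k) * np.log(k + 1) / np.log(2)          # step 1
        gamma, x_new = backtrack(x, grad_x_L(x, z, y, beta),           # steps 2-3
                                 lambda xx: L(xx, z, y, beta), prox_g)
        iota, z_new = backtrack(z, grad_z_L(x_new, z, y, beta),        # steps 4-5
                                lambda zz: L(x_new, zz, y, beta), prox_l)
        G, H = (x - x_new) / gamma, (z - z_new) / iota
        r = A(x) + B(z)
        if gamma * (G @ G) + iota * (H @ H) + sigma * (r @ r) <= tau:  # step 6
            return x_new, z_new, y
        feas = np.linalg.norm(A(x_new) + B(z_new))
        sigma = sigma1 * min(1.0 / np.sqrt(k + 1),                     # step 7
                             (feas0 / max(feas, 1e-12))
                             * np.log(2) ** 2 / ((k + 1) * np.log(k + 2) ** 2))
        y = y + sigma * (A(x_new) + B(z_new))                          # step 8
        x, z = x_new, z_new
    return x, z, y
\end{verbatim}
The formal guarantees below refer to the exact updates and step-size rules stated in Algorithm~\ref{Algo:3}, not to this sketch.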
%\begin{theorem}[Linearized ADMM]
%\label{thm:main admm}
%For a sufficiently large $k_0$, consider an integer interval $K=[k_0:k_1]$, and suppose that
%\begin{align}
% \inf_{k\ge k_0} f(x_k) + g(x_k)+ h(z_k) + l(z_k) + \langle A(x_k)+B(z_k) ,y_{k_0}\rangle
%>- \infty,
%\end{align}
%and that $\rho$ is large enough such that (\ref{eq:suff cnd on rho final}) holds . For $\rho'>0$, in addition to the strong smoothness of $f$ and $A$ quantified in (\ref{eq:smoothness basic}), let us define
%\begin{align}
%\lambda'_f = \max_{\|x\|\le \rho'} \|\nabla f(x)\|,
%\qquad
%\lambda'_A = \max_{\|x\| \le \rho'} \|DA(x)\|,
%\qquad
%\lambda'_B = \max_{\|x\|\le \rho'} \|DB(x)\|,
%\end{align}
%to be the (restricted) Lipschitz constants of $f,A,B$, respectively.
%Moreover, suppose that the nonconvex Slater's condition, namely, Lemma \ref{lem:bnd bnd Ak}, is in force, and
%\begin{align}
%\nu_A \ge 2\lambda_A \rho''.
%\end{align}
%Then the output of linearized ADMM in Algorithm~\ref{Algo:2} satisfies \textbf{we need a better expression below to better show the dependence on various parameters.}
%\begin{align}
%& \min_{k\in K}\,\, \min(\overline{\g},\overline{\iota})( \|G_k\|^2 + \|H_k\|^2) \sqrt{\frac{k_0 \log^2(k_0+1)}{k_1\log^2(k_1+1)}} \nonumber\\
%& \qquad + \|A(x_k)\|^2
%\lesssim \frac{1}{k_1-k_0},
%\end{align}
%where $\overline{\gamma},\overline{\iota}$ are independent of $k_1$ and $\lesssim$ suppresses the dependence on all parameters except $k_1$. The exact expressions in terms of $\lambda_f,\lambda'_f,\lambda_A,\lambda'_A, \b_{k_0},\s_{k_0}, \nu_A,y_0$ are found in (\ref{eq:raw bnd on sum admm},\ref{eq:lower_band_iotas_admm}).
%
%\end{theorem}
\begin{theorem}[Convergence rate of linearized ADMM]
\label{thm:main admm}
For sufficiently large integers $k_0< k_1$, consider the interval $K=[k_0:k_1]$,\footnote{The requirement for $k_0$ to be sufficiently large is merely to simplify the analysis. If desired, one can set $k_1=\infty$ in Theorem~\ref{thm:main admm}.} and let $\{x_k,z_k,y_k\}_{k\in K}$ be the output sequence of Algorithm~\ref{Algo:3}, with $\kappa_k:= \min(\g_k,\iota_k)$ for short. After recalling (\ref{eq:new-notation-admm}), suppose that
\begin{align*}
\mu := - \min(0, \inf_{k} p(u_k) + q(u_k) + \langle D(u_k) ,y_{k_0}\rangle )
< \infty.
\end{align*}
For $\rho'>0$, in addition to the strong smoothness of $p$ and $D$ quantified in (\ref{eq:smoothness basics admm}), let us define
\begin{align}
\lambda'_p = \max_{\|u\|\le \rho'} \|\nabla p(u)\|,
\qquad
\lambda'_D = \max_{\|u\| \le \rho'} \|\D D(u)\|,
\end{align}
to be the (restricted) Lipschitz constants of $p$ and $D$, respectively.
Suppose also that problem~(\ref{eq:admm 2}) satisfies the geometric regularity in Definition~\ref{defn:nonconvex slater} with
\begin{align}
\nu(q,D,S,\rho,\rho') \gtrsim
\max\left(\lambda_D \max_{k\in K} \sqrt{\kappa_k\mu} , \frac{{\lambda'_p+\lambda'_D}}{\sqrt{\mu}} \right),
\label{eq:low-bnd-on-regularity-admm}
\end{align}
where
\begin{itemize}
\item $\rho \gtrsim \sqrt{\mu}$,
%\item $\rho \gtrsim -\sqrt{\mu}$,
\item $\rho'\ge \max_{k\in K} \|u_k\|$,
\item $S \supseteq \bigcup_{k\in K} P_{\cone(\partial q (u_{k+1}) )^*}\left( \D D(u_{k+1})^\top D(u_{k+1}) \right).$
\end{itemize}
Then the output sequence of Algorithm~\ref{Algo:3} satisfies
\begin{align}
\label{eq:rates-thm-admm}
& \min_{k\in K} \frac{1}{\lambda_p+ \sqrt{m} \lambda_D \rho + d \lambda'^2_D} \cdot \frac{\|G_{\b_k,\g_k}(x_k,z_k,y_k)\|^2
+ \|H_{\b_k,\iota_k}(x_{k+1},z_k,y_k)\|^2
}{\sqrt{k_1}\log(k_1+1)} \nonumber\\
& + \|A(x_k)+B(z_k)\|^2
\lesssim \frac{1}{k_1-k_0}\left( \frac{\lambda_p'^2+\lambda_D'^2}{\nu(q,D,S,\rho,\rho')^2} + \mu\right),
\end{align}
where $\lesssim,\gtrsim$ above suppress the dependence on constants and less important parameters, for the sake of clarity.
%The exact expressions
%%in terms of $\lambda_f,\lambda'_f,\lambda_A,\lambda'_A, \b_{k_0},\s_{k_0},\rho, \rho', \nu(g,A,S,\rho,\rho'),y_0$
%are found in (\ref{eq:exact dependences},\ref{eq:suff cnd on nu-1},\ref{eq:cnd-on-nu-proof}).
\end{theorem}
\noindent Most of the remarks after Theorem \ref{thm:main} apply to Theorem~\ref{thm:main admm} too.
\section*{Acknowledgments}
AE is extremely grateful to Nadav Hallak for his thoughtful and constructive comments, which improved this manuscript.
