In convex optimization, whenever applicable, Alternating Direction Method of Multipliers (ADMM)
\cite{Glowinski1975,Gabay1976,Boyd2011} often outperforms the augmented Lagrangian method, as updating the primal blocks separately typically yields cheaper iterations. Moreover, ADMM naturally accommodates a separate proximal operator for each block of variables. \notea{this is very vague and hand-wavy. what to say? }
Inspired by the success of ADMM in convex optimization, in this section we develop and study a (linearized) ADMM variant of Algorithm~\ref{Algo:2}.
More specifically, consider the program
\begin{align}
\begin{cases}
\underset{x,z}{\min}\,\, f(x)+g(x)+h(z)+l(z)\\
A(x)+B(z) = 0,
\end{cases}
\label{eq:admm 2}
\end{align}
where $f,h:\RR^d\rightarrow\RR$ and $A,B:\RR^d\rightarrow\RR^d$ are smooth in the sense of \eqref{eq:smoothness basic}.
To solve the formulation in \eqref{eq:admm 2}, we propose the linearized ADMM, detailed in Algorithm~\ref{Algo:3}. Most remarks about Algorithm~\ref{Algo:2} apply to Algorithm~\ref{Algo:3} as well and, in particular, note that Algorithm~\ref{Algo:3} performs two consecutive primal updates, one on $x$ and then one on $z$.
To parse the details of Algorithm~\ref{Algo:3}, we need to slightly change the gradient map in Definition~\ref{def:grad map} and the line search procedure in Lemma~\ref{lem:eval Lipsc cte} to match $\L_\b$ in \eqref{eq:lagrangian admm}. More specifically, the corresponding gradient maps are defined as
\begin{align}
\gamma& :=
\max\Big\{
\gamma' ={\gamma}_0 \theta^i :
\mathcal{L}_\beta (x^+_{\gamma'},z,y) \nonumber\\
&\qquad\qquad
\le\mathcal{L}_\beta (x,z,y) +
\left\langle x^+_{\gamma'} - x , \nabla_x \mathcal{L}_{\beta}(x,z,y) \right\rangle
+ \frac{1}{2\gamma'}\| x^+_{\gamma'} - x\|^2
\Big\},
\label{eq:line search x admm}
\end{align}
\begin{align}
\iota& :=
\max\Big\{
\iota' ={\iota}_0 \theta^i :
\mathcal{L}_\beta (x,z^+_{\iota'},y) \nonumber\\
&\qquad\qquad
\le\mathcal{L}_\beta (x,z,y) +
\left\langle z^+_{\iota'} - z , \nabla_z \mathcal{L}_{\beta}(x,z,y) \right\rangle
+ \frac{1}{2\iota'}\| z^+_{\iota'} - z\|^2
\Big\}.
\label{eq:defn of gamma line search admm}
\end{align}
The analysis of Algorithm~\ref{Algo:3} is similar to that of Algorithm~\ref{Algo:2}, involving also a similar version of Lemma~\ref{lem:bnd bnd Ak}. The convergence rate of Algorithm~\ref{Algo:3} is detailed below and proved in Appendix~\ref{sec:ADMM theory}.
\item\textbf{(Line search in $x$)} Use the line search procedure in \eqref{eq:line search x admm} by replacing $x=x_k,z=z_k,y=y_k,\b=\b_k$ and let $\g_k \leftarrow\g$ be the output.\\
\item\textbf{(Descent step in $x$)}$ x_{k+1} \leftarrow P_{g }(x_{k} -\gamma_{k} \nabla_x \mathcal{L}_{\beta_{k}}(x_{k},z_k,y_{k}))$, where $\L_{\b_k}$ is the augmented Lagrangian and $P_g$ denotes the proximal operator, defined in \eqref{eq:lagrangian admm} and \eqref{eq:defn of prox}, respectively.\\
\item\textbf{(Line search in $z$)} Use the line search procedure in \eqref{eq:defn of gamma line search admm} by replacing $x=x_{k+1},z=z_k,y=y_k,\b=\b_k$ and let $\iota_k \leftarrow\iota$ be the output.\\
\item\textbf{(Descent step in $z$)}$ z_{k+1} \leftarrow P_{l }(z_{k} -\iota_{k} \nabla_z \mathcal{L}_{\beta_{k}}(x_{k+1},z_k,y_{k}))$, where $P_l$ denotes the proximal operator for $l$.\\
%
%\item \textbf{(Control)} Properly modify $x_{k+1},z_{k+1}$ to ensure that $\|x_{k+1}\|,\|z_{k+1}\|\le \rho'$ or $\|x_{k+1}-x_k\|\le \rho''$.\\
\item\textbf{(Stopping criterion)} If $\g_k \|G_{\b_k,\g_k}(x_k,z_k,y_k)\|^2+\iota_k \|G_{\b_k,\iota_k}(x_{k+1},z_k,y_k)\|^2+\s_k \|A(x_k)+B(z_k)\|^2\le\tau$, then quit and return $x_{k+1},z_{k+1}$.
% as approximate minimizers of Program \eqref{prob:01}.
See \eqref{eq:grad map} for the definition of the gradient mapping. \\
%and that $\rho$ is large enough such that (\ref{eq:suff cnd on rho final}) holds . For $\rho'>0$, in addition to the strong smoothness of $f$ and $A$ quantified in (\ref{eq:smoothness basic}), let us define
%to be the (restricted) Lipschitz constants of $f,A,B$, respectively.
%Moreover, suppose that the nonconvex Slater's condition, namely, Lemma \ref{lem:bnd bnd Ak}, is in force, and
%\begin{align}
%\nu_A \ge 2\lambda_A \rho''.
%\end{align}
%Then the output of linearized ADMM in Algorithm~\ref{Algo:2} satisfies \textbf{we need a better expression below to better show the dependence on various parameters.}
%where $\overline{\gamma},\overline{\iota}$ are independent of $k_1$ and $\lesssim$ suppresses the dependence on all parameters except $k_1$. The exact expressions in terms of $\lambda_f,\lambda'_f,\lambda_A,\lambda'_A, \b_{k_0},\s_{k_0}, \nu_A,y_0$ are found in (\ref{eq:raw bnd on sum admm},\ref{eq:lower_band_iotas_admm}).
%
%\end{theorem}
\begin{theorem}[Convergence rate of linearized ADMM]
\label{thm:main admm}
For sufficiently large integers $k_0< k_1$, consider the interval $K=[k_0:k_1]$, and consider the output sequence $\{x_k,z_k,y_k\}_{k\in K}$ of Algorithm~\ref{Algo:3}. Suppose that
\notea{note that mu and some other quantities are slightly changed from the proof to simplify the presentation which is inconsequential for the proof I think.}
For $\rho'\gtrsim\sqrt{\mu}$, in addition to the strong smoothness of $f$ and $A$ quantified in \eqref{eq:smoothness basic}, let us define