diff --git a/tex/Paper/sections/ADMM.tex b/tex/Paper/sections/ADMM.tex
index f33be09..ac55ec3 100644
--- a/tex/Paper/sections/ADMM.tex
+++ b/tex/Paper/sections/ADMM.tex
@@ -1,161 +1,161 @@
\section{Linearized ADMM}
Consider the program
\begin{align}
\begin{cases}
\underset{x,z}{\min} \,\, f(x)+g(x)+h(z)+l(z)\\
A(x)+B(z) = 0,
\end{cases}
\label{eq:admm 2}
\end{align}
where $f,h:\RR^d\rightarrow\RR$ and $A,B:\RR^d\rightarrow\RR^m$ are smooth in the sense that
\begin{align*}
\| \nabla f(x) - \nabla f(x') \| \le \lambda_f \|x-x'\|,
\,\,
\| DA(x) - DA(x') \| \le \lambda_A \|x-x'\|,
\end{align*}
\begin{align}
\| \nabla h(z) - \nabla h(z') \| \le \lambda_h \|z-z'\|,
\,\,
\| DB(z) - DB(z') \| \le \lambda_B \|z-z'\|,
\label{eq:smoothness basics admm}
\end{align}
for every $x,x',z,z'\in \RR^d$. Above, $g,l:\RR^d\rightarrow\RR$ are convex functions. For example, when $C,D\subset \RR^d$ and $g=1_C, l=1_D$ are the indicator functions of $C,D$, respectively, Program~\eqref{eq:admm 2} becomes
\begin{align}
\begin{cases}
\underset{x,z}{\min} \,\, f(x)+h(z)\\
A(x)+B(z) = 0\\
x\in C\\
z\in D.
\end{cases}
\label{eq:admm 2 indicator}
\end{align}
The augmented Lagrangian corresponding to Program~\eqref{eq:admm 2} is
\begin{align}
\L_{\b}(x,z,y) & = f(x)+g(x)+h(z)+l(z) \nonumber\\
& \qquad + \langle A(x)+B(z),y \rangle + \frac{\b}{2}\| A(x)+B(z) \|^2,
\label{eq:lagrangian admm}
\end{align}
and Program \eqref{eq:admm 2} is therefore equivalent to the minimax program
\begin{align}
\underset{x,z}{\min} \underset{y}{\max}\,\, \L_\b(x,z,y).
\label{eq:admm main}
\end{align}
To solve the equivalent formulation in Program \eqref{eq:admm main}, we propose the linearized ADMM, detailed in Algorithm~\ref{Algo:3}. To parse this algorithm, we need to slightly adjust the gradient map in Definition~\ref{def:grad map} and the line search procedure in Lemma~\ref{lem:eval Lipsc cte} to match $\L_\b$ in \eqref{eq:lagrangian admm}. More specifically, the corresponding gradient maps are defined as
\begin{equation}
G_{\b,\gamma}(x,z,y) = \frac{x-x^+}{\gamma},
\qquad
H_{\b,\iota}(x,z,y) = \frac{z-z^+}{\iota},
\label{eq:grad map admm}
\end{equation}
where
\begin{align}
x^+=P_{g}(x-\gamma \nabla_x \mathcal{L}_ {\beta}(x,z,y)),
\qquad
z^+=P_{l}(z-\iota \nabla_z \mathcal{L}_ {\beta}(x,z,y)),
\end{align}
and $\g,\iota$ are the primal step sizes. The line search procedure is similar to Lemma~\ref{lem:eval Lipsc cte}: we set
\begin{align*}
x^+_{\gamma'} = P_g(x - \gamma' \nabla_x \mathcal{L}_\beta(x,z,y)),
\end{align*}
\begin{align}
z^+_{\iota'} = P_l(z - \iota' \nabla_z \mathcal{L}_\beta(x,z,y)),
\end{align}
\begin{align}
\gamma & := \max \Big\{ \gamma' ={\gamma}_0 \theta^i : \mathcal{L}_\beta (x^+_{\gamma'},z,y) \nonumber\\
& \qquad \qquad \le \mathcal{L}_\beta (x,z,y)
+\left\langle x^+_{\gamma'} - x , \nabla_x \mathcal{L}_{\beta}(x,z,y) \right\rangle
+ \frac{1}{2\gamma'} \| x^+_{\gamma'} - x\|^2 \Big\},
\label{eq:line search x admm}
\end{align}
\begin{align}
\iota & := \max \Big\{ \iota' ={\iota}_0 \theta^i : \mathcal{L}_\beta (x,z^+_{\iota'},y) \nonumber\\
& \qquad \qquad \le \mathcal{L}_\beta (x,z,y)
+\left\langle z^+_{\iota'} - z , \nabla_z \mathcal{L}_{\beta}(x,z,y) \right\rangle
+ \frac{1}{2\iota'} \| z^+_{\iota'} - z\|^2 \Big\}.
\label{eq:line search z admm}
\end{align}

\begin{algorithm}
\label{Algo:3}
Input: Parameters $\beta_1,\s_1,\rho,\rho',\rho'',\tau> 0$, initialization $x_{1},z_1\in \RR^d$ with $\|A(x_1)+B(z_1)\|\le \rho$ and $\|x_1\|,\|z_1\|\le \rho'$, dual initialization $y_0\in \RR^m$. \\
For $k=1,2,\ldots$, execute\\
\begin{enumerate}
\item \textbf{(Update penalty weight)} Set
\begin{align}
\beta_k = \frac{\beta_{1}}{\log 2}\sqrt{k \log^2(k+1)}.
\end{align}
\item \textbf{(Line search in $x$)} Use the line search procedure in \eqref{eq:line search x admm} by replacing $x=x_k,z=z_k,y=y_k,\b=\b_k$ and let $\g_k = \g$ be the output.\\
\item \textbf{(Descent step in $x$)} Set $ x_{k+1} = P_{g }(x_{k} - \gamma_{k} \nabla_x \mathcal{L}_{\beta_{k}}(x_{k},z_k,y_{k}))$, where $\L_{\b_k}$ is the augmented Lagrangian and $P_g$ denotes the proximal operator, defined in (\ref{eq:lagrangian admm},\ref{eq:defn of prox}), respectively.\\
\item \textbf{(Line search in $z$)} Use the line search procedure in \eqref{eq:line search z admm} by replacing $x=x_{k+1},z=z_k,y=y_k,\b=\b_k$ and let $\iota_k = \iota$ be the output.\\
\item \textbf{(Descent step in $z$)} Set $ z_{k+1} = P_{l }(z_{k} - \iota_{k} \nabla_z \mathcal{L}_{\beta_{k}}(x_{k+1},z_k,y_{k}))$, where $P_l$ denotes the proximal operator for $l$.\\
\item \textbf{(Control)} Properly modify $x_{k+1},z_{k+1}$ to ensure that $\|x_{k+1}\|,\|z_{k+1}\|\le \rho'$ and $\|x_{k+1}-x_k\|\le \rho''$.\\
\item \textbf{(Stopping criterion)} If $\g_k \|G_{\b_k,\g_k}(x_k,z_k,y_k)\|^2+ \iota_k \|H_{\b_k,\iota_k}(x_{k+1},z_k,y_k)\|^2 + \s_k \|A(x_k)+B(z_k)\|^2 \le \tau$, then quit and return $x_{k+1},z_{k+1}$ as approximate minimizers of Program \eqref{eq:admm 2}. See \eqref{eq:grad map admm} for the definition of the gradient mappings. \\
\item \textbf{(Update dual step size)} Set
\begin{align}
\s_{k+1} = \s_{1} \min\left( {\frac{1}{\sqrt{k+1}}}, \frac{\|A(x_1)+B(z_1)\|}{\|A(x_{k+1})+B(z_{k+1})\|}\cdot \frac{ \log^2 2 }{(k+1)\log^2(k+2)} \right).
\end{align}\\
\item \textbf{(Dual ascent step)} Set $y_{k+1} = y_{k} + \sigma_{k+1}(A(x_{k+1})+B(z_{k+1}))$.
\end{enumerate}
\caption{Linearized ADMM for solving Program \eqref{eq:admm main}}
\end{algorithm}

\begin{theorem}[Linearized ADMM]
\label{thm:main admm}
For a sufficiently large $k_0$, consider an integer interval $K=[k_0:k_1]$, and suppose that
\begin{align}
\inf_{k\ge k_0} f(x_k) + g(x_k)+ h(z_k) + l(z_k) + \langle A(x_k)+B(z_k) ,y_{k_0}\rangle >- \infty,
\end{align}
and that $\rho$ is large enough such that (\ref{eq:suff cnd on rho final}) holds. For $\rho'>0$, in addition to the smoothness of $f,h,A,B$ quantified in \eqref{eq:smoothness basics admm}, let us define
\begin{align}
\lambda'_f = \max_{\|x\|\le \rho'} \|\nabla f(x)\|,
\qquad
\lambda'_A = \max_{\|x\| \le \rho'} \|DA(x)\|,
\qquad
\lambda'_B = \max_{\|z\|\le \rho'} \|DB(z)\|,
\end{align}
to be the (restricted) Lipschitz constants of $f,A,B$, respectively. Moreover, suppose that the nonconvex Slater's condition, namely, Lemma \ref{lem:bnd bnd Ak}, is in force, and
\begin{align}
\nu_A \ge 2\lambda_A \rho''.
\end{align}
Then the output of linearized ADMM in Algorithm~\ref{Algo:3} satisfies
\textbf{we need a better expression below to better show the dependence on various parameters.}
\begin{align}
& \min_{k\in K}\,\, \Bigg( \min(\overline{\g},\overline{\iota}) \sqrt{\frac{k_0 \log^2(k_0+1)}{k_1\log^2(k_1+1)}} \left( \|G_k\|^2 + \|H_k\|^2 \right) \nonumber\\
& \qquad \qquad + \|A(x_k)+B(z_k)\|^2 \Bigg) \lesssim \frac{1}{k_1-k_0},
\end{align}
where $\overline{\gamma},\overline{\iota}$ are independent of $k_1$ and $\lesssim$ suppresses the dependence on all parameters except $k_1$. The exact expressions in terms of $\lambda_f,\lambda'_f,\lambda_A,\lambda'_A, \b_{k_0},\s_{k_0}, \nu_A,y_0$ are found in (\ref{eq:raw bnd on sum admm},\ref{eq:lower_band_iotas_admm}).
\end{theorem}
diff --git a/tex/Paper/sections/ADMM_theory.tex b/tex/Paper/sections/ADMM_theory.tex
index c93e0e4..5203e5f 100644
--- a/tex/Paper/sections/ADMM_theory.tex
+++ b/tex/Paper/sections/ADMM_theory.tex
@@ -1,195 +1,195 @@
\section{Proof of Theorem \ref{thm:main admm} \label{sec:ADMM theory}}
Let us restate the technical lemmas and definitions of Section \ref{sec:preliminaries}, slightly adjusted for the augmented Lagrangian \eqref{eq:lagrangian admm} of Program~\eqref{eq:admm main}. These standard results are stated without proof.
\begin{lemma}[Smoothness]\label{lem:smoothness admm}
For $\rho,\rho'\ge 0$, it holds that
\begin{align*}
\| \nabla_x \mathcal{L}_{\beta}(x, z,y)- \nabla_x \mathcal{L}_{\beta}(x', z, y) \| \le \lambda_{\b,z} \|x-x'\|,
\end{align*}
\begin{align}
\| \nabla_z \L_\b(x,z,y) - \nabla_z \L_\b(x,z',y) \| \le \lambda_{\b,x} \|z-z'\|,
\end{align}
for every $y\in\RR^m$ and every $x,x',z,z'$ such that $\|A(x)+B(z)\| \le \rho$ and $\|x\|,\|x'\|,\|z\|,\|z'\|\le \rho'$, where
\begin{align*}
\lambda_{\beta,z} & \le \lambda_f + \sqrt{m}\lambda_A \left(\|y\| + \b\rho \right) + \b d \lambda'^2_A,
\end{align*}
\begin{align}
\lambda_{\beta,x} & \le \lambda_h + \sqrt{m}\lambda_B \left(\|y\| + \b\rho \right) + \b d \lambda'^2_B.
\label{eq:smoothness of Lagrangian admm}
\end{align}
Above, $\lambda_f,\lambda_A,\lambda_h,\lambda_B$ were defined in \eqref{eq:smoothness basics admm} and
\begin{align}
\lambda'_A := \max_{\|x\|\le \rho'}\|DA(x)\|,
\qquad
\lambda'_B := \max_{\|z\|\le \rho'}\|DB(z)\|.
\end{align}
\end{lemma}
\begin{definition}\label{def:grad map admm}\textbf{{(Gradient Mapping)}}
Given $x,z\in \RR^d$ and $\gamma,\iota >0$, the gradient mappings $G_{\beta,\gamma}(\cdot,z, y), H_{\b,\iota}(x,\cdot,y): \RR^d\rightarrow\RR^d$ take, respectively, $x,z\in \RR^d$ to
\begin{equation}
G_{\b,\gamma}(x,z,y) = \frac{x-x^+}{\gamma},
\qquad
H_{\b,\iota}(x,z,y) = \frac{z-z^+}{\iota},
\end{equation}
where $x^+=P_{g}(x-\gamma \nabla_x \mathcal{L}_ {\beta}(x,z,y))$ and $z^+=P_{l}(z-\iota \nabla_z \mathcal{L}_ {\beta}(x,z,y))$.
\end{definition}
\begin{lemma}[Descent lemma]\label{lem:11 admm}
For $x,z\in \RR^d$ and $y\in \RR^m$, let $x^+,z^+$ be as in Definition \ref{def:grad map admm} with $\g<1/\lambda_{\b,z}$ and $\iota<1/\lambda_{\b,x}$. For $\rho,\rho'\ge 0$, suppose that
\begin{align}
\|A(x)+B(z)\|, \|A(x^+)+B(z^+)\| \le \rho,
\qquad
\|x\|,\|x^+\|,\|z\|,\|z^+\| \le \rho'.
\end{align}
Then it holds that
\begin{equation*}
\| G_{\beta,\gamma}(x,z,y)\|^{2}\leq \frac{2}{\gamma} (\mathcal{L}_\beta(x,z,y) + g (x)- \mathcal{L}_\beta(x^+,z,y)- g(x^+) ),
\end{equation*}
\begin{align}
\| H_{\beta,\iota}(x,z,y)\|^{2}\leq \frac{2}{\iota} (\mathcal{L}_\beta(x,z,y) + l (z)- \mathcal{L}_\beta(x,z^+,y)- l(z^+) ).
\label{eq:deslem admm}
\end{align}
\end{lemma}
\begin{lemma}[Line search] \label{lem:eval Lipsc cte admm}
Fix $\theta \in (0,1)$ and ${\gamma}_0,\iota_0>0$. For $\gamma',\iota'>0$, let
\begin{align}
x^+_{\gamma'} = P_g(x - \gamma' \nabla_x \mathcal{L}_\beta(x,z,y)),
\qquad
z^+_{\iota'} = P_l(z - \iota' \nabla_z \mathcal{L}_\beta(x,z,y)),
\end{align}
and define
\begin{align*}
\gamma & := \max \Big\{ \gamma' ={\gamma}_0 \theta^i : \mathcal{L}_\beta (x^+_{\gamma'},z,y) \nonumber\\
& \qquad \qquad \quad \le \mathcal{L}_\beta (x,z,y)
+\left\langle x^+_{\gamma'} - x , \nabla_x \mathcal{L}_{\beta}(x,z,y) \right\rangle
+ \frac{1}{2\gamma'} \| x^+_{\gamma'} - x\|^2 \Big\},
\end{align*}
\begin{align}
\iota & := \max \Big\{ \iota' ={\iota}_0 \theta^i : \mathcal{L}_\beta (x,z^+_{\iota'},y) \nonumber\\
& \qquad \qquad \quad \le \mathcal{L}_\beta (x,z,y)
+\left\langle z^+_{\iota'} - z , \nabla_z \mathcal{L}_{\beta}(x,z,y) \right\rangle
+ \frac{1}{2\iota'} \| z^+_{\iota'} - z\|^2 \Big\}.
\label{eq:defn of gamma line search}
\end{align}
Then, (\ref{eq:deslem admm}) holds and, moreover, we have that
\begin{align}
\gamma \ge \frac{\theta}{\lambda_{\beta,z}},
\qquad
\iota \ge \frac{\theta}{\lambda_{\beta,x}}.
\label{eq:moreover}
\end{align}
\end{lemma}
For the reader's convenience, let us also recall the updates of the linearized ADMM algorithm at iteration $k$:
\begin{align}
x_{k+1}& =P_g(x_k -\g_k \nabla_x \L_{\b_k} (x_k,z_k,y_k)),\nonumber\\
z_{k+1}& =P_l(z_k - \i_k \nabla_z \L_{\b_k} (x_{k+1},z_k,y_k)),\nonumber\\
y_{k+1}& = y_k+\s_{k+1} (A(x_{k+1})+B(z_{k+1})).
\label{eq:ADMM updates recall}
\end{align}
For every $k\in K$, recall that the primal step sizes $\g_k,\i_k$ are determined by the line search in Lemma \ref{lem:eval Lipsc cte admm}, and the penalty weights and dual step sizes are set as
\begin{align}
\b_k & = \b_{k_0} \sqrt{\frac{k\log^2(k+1)}{k_0 \log^2(k_0+1)}}, \nonumber\\
\s_k & = \s_{k_0}\min\left( \sqrt{\frac{k_0}{k}} , \frac{\|A(x_{k_0})+B(z_{k_0})\|}{\|A(x_k)+B(z_k)\|} \cdot \frac{k_0 \log^2 (k_0+1)}{k \log^2 (k+1)} \right).
\end{align}
For every $k\in K$, let us set
\begin{align*}
G_k = G_{\b_k,\g_k}(x_k,z_k,y_k) = \frac{x_k-x_{k+1}}{\g_k},
\end{align*}
\begin{align}
H_k = H_{\b_k,\iota_k}(x_{k+1},z_k,y_k)= \frac{z_k-z_{k+1}}{\i_k},
\end{align}
for short. The convergence analysis differs only slightly from the proof of Theorem \ref{thm:main}, and we therefore focus on the differences. As in the proof of Theorem~\ref{thm:main}, two applications of Lemma~\ref{lem:11 admm} yield that
\begin{align}
\frac{\g_k \|G_k\|^2}{2} \le \L_{\b_k}(x_k,z_k,y_k)- \L_{\b_k}(x_{k+1},z_k,y_k), \nonumber\\
\frac{\i_k \|H_k\|^2}{2} \le \L_{\b_k}(x_{k+1},z_k,y_k)- \L_{\b_k}(x_{k+1},z_{k+1},y_k),
\label{eq:decrease twice}
\end{align}
for every $k$. By setting
\begin{align*}
u_k = [ \begin{array}{cc} x_k^\top & z_k^\top \end{array} ]^\top \in \RR^{2d},
\qquad
Q_k = [ \begin{array}{cc} G_k^\top & H_k^\top \end{array} ]^\top \in \RR^{2d},
\end{align*}
\begin{align}
q(u) = f(x)+g(x)+h(z)+l(z),
\qquad
\kappa_k = \min(\g_k,\iota_k),
\label{eq:defn of w}
\end{align}
for every $k\in K$, and summing up the two inequalities in \eqref{eq:decrease twice}, we reach
\begin{align}
\frac{\kappa_k \|Q_k\|^2}{2} & \le \L_{\b_k}(x_k,z_k,y_k) - \L_{\b_k}(x_{k+1},z_{k+1},y_k), \qquad \forall k\in K.
\end{align}
By following the same steps as in the proof of Theorem \ref{thm:main}, we find that
\begin{align}
\sum_{k=k_0}^{k_1} \frac{\kappa_k \|Q_k\|^2}{2} & \le \mu+ 2\sum_{k=k_0}^{k_1} \| A(x_k) + B(z_k) \|^2 ,
\end{align}
where
\begin{align}
\mu & := \sup_{k\ge k_0} \Big( q(u_{k_0}) - q(u_{k})+ \langle A(x_{k_0})+B(z_{k_0})-A(x_k)-B(z_k) ,y_{k_0}\rangle \nonumber\\
& \qquad \qquad + \frac{\beta_{k_0}}{2} \|A(x_{k_0})+B(z_{k_0})\|^2 \Big) < \infty.
\end{align}
On the other hand, the $x$ update in \eqref{eq:ADMM updates recall} implies that
\begin{align}
& G_k - \nabla f(x_k) - DA(x_k)^\top y_k \nonumber\\
& \qquad - \b_k DA(x_k)^\top (A(x_k) + B(z_k) ) \in \partial g(x_{k+1}).
\end{align}
For $\rho,\rho',\rho''>0$, suppose that
\begin{align}
\max_{k\in K} \|A(x_k)+B(z_k) \| \le \rho,
\quad
\max_{k\in K} \|x_k\| \le \rho',
\quad
\max_{k\in K} \|x_{k+1}-x_k\| \le \rho'',
\label{eq:feas to be checked later}
\end{align}
\begin{align}
\nu_A \ge 2\lambda_A \rho'',
\end{align}
where $\nu_A$ was defined in \eqref{eq:new slater defn}. Then, following the same line of argument, a similar version of Lemma~\ref{lem:bnd bnd Ak} holds and
\begin{align}
\|A(x_k)+B(z_k) \| \le \frac{2}{\nu_A\b_k} (\|G_k\|+ \lambda'_f+ \lambda'_A\|y_{k}\|).
\end{align}
If we assume that (\ref{eq:to be used later on},\ref{eq:suff cnd on rho}) hold, then the first bound in \eqref{eq:feas to be checked later} is in force and
\begin{align}
& \sum_{k=k_0}^{k_1} \left( \kappa_k \|Q_k\|^2 + \|A(x_k)+B(z_k)\|^2 \right) \nonumber\\
& \le \frac{24c \lambda'^2_f k_0 \log^2(k_0+1)}{\nu_A^2 \b^2_{k_0}} + \frac{24c\lambda'^2_A y_{\max}^2 k_0 \log^2(k_0+1)}{\nu_A^2 \b_{k_0}^2} + 5\mu.
\label{eq:raw bnd on sum admm}
\end{align}
To better interpret the above bound, we next estimate the primal step sizes $\{\g_k,\i_k\}$, which are determined by line search. By Lemma~\ref{lem:eval Lipsc cte admm}, $\g_k \ge \theta/\lambda_{\b_k,z}$ and $\i_k \ge \theta/\lambda_{\b_k,x}$, where the smoothness constants are bounded in Lemma~\ref{lem:smoothness admm}. Similar to the proof of Theorem \ref{thm:main}, we find that
\begin{align*}
\gamma_k & \ge \frac{\theta }{ 2 \b_{k_0} \left( \sqrt{m}\lambda_A \rho + d \lambda'^2_A \right)}\sqrt{\frac{k_0 \log^2(k_0+1)}{k \log^2(k+1)}} \nonumber\\
& =: \overline{\g} \sqrt{\frac{k_0 \log^2(k_0+1)}{k \log^2(k+1)}},
\end{align*}
\begin{align}
\iota_k & \ge \frac{\theta }{ 2 \b_{k_0} \left( \sqrt{m}\lambda_B \rho + d \lambda'^2_B \right)}\sqrt{\frac{k_0 \log^2(k_0+1)}{k \log^2(k+1)}} \nonumber\\
& =: \overline{\iota} \sqrt{\frac{k_0 \log^2(k_0+1)}{k \log^2(k+1)}},
\label{eq:lower_band_iotas_admm}
\end{align}
for sufficiently large $k_0$. Substituting the above bounds in \eqref{eq:raw bnd on sum admm} and following the steps in the proof of Theorem \ref{thm:main} completes the proof of Theorem \ref{thm:main admm}.
diff --git a/tex/Paper/sections/appendix.tex b/tex/Paper/sections/appendix.tex
index bca5034..23f87e4 100644
--- a/tex/Paper/sections/appendix.tex
+++ b/tex/Paper/sections/appendix.tex
@@ -1,217 +1,217 @@
\section{Proof of Lemma \ref{lem:smoothness}\label{sec:proof of smoothness lemma}}
\textbf{We assume Hessian exists. We shouldn't assume that for a strictly correct proof!}
Note that
\begin{align}
\mathcal{L}_{\beta}(x,y) = f(x) + \sum_{i=1}^m y_i A_i (x) + \frac{\b}{2} \sum_{i=1}^m (A_i(x))^2,
\end{align}
which implies that
\begin{align}
\nabla_x \mathcal{L}_\beta(x,y) & = \nabla f(x) + \sum_{i=1}^m y_i \nabla A_i(x) + \b \sum_{i=1}^m A_i(x) \nabla A_i(x) \nonumber\\
& = \nabla f(x) + DA(x)^\top y + \b DA(x)^\top A(x),
\end{align}
where $DA(x)$ is the Jacobian of $A$ at $x$.
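For illustration only (and not as part of the analysis), the following Python snippet checks the above expression for $\nabla_x \mathcal{L}_\beta$ against a central finite-difference approximation on a toy instance with an affine map $A$; every name in the snippet is a hypothetical placeholder introduced here, not something defined in the paper.
\begin{verbatim}
# Illustrative sketch only: a central finite-difference check of
#   grad_x L_beta(x, y) = grad f(x) + DA(x)^T y + beta * DA(x)^T A(x)
# on a toy instance with f(x) = 0.5*||x||^2 and affine A(x) = C x - b,
# so that DA(x) = C. All names here are hypothetical, not from the paper.
import numpy as np

d, m, beta = 5, 3, 10.0
rng = np.random.default_rng(0)
C, b = rng.standard_normal((m, d)), rng.standard_normal(m)
x, y = rng.standard_normal(d), rng.standard_normal(m)

f, grad_f = (lambda x: 0.5 * x @ x), (lambda x: x)
A, DA     = (lambda x: C @ x - b), (lambda x: C)

def aug_lagrangian(x, y):
    # L_beta(x, y) = f(x) + <A(x), y> + (beta/2) * ||A(x)||^2  (no g term)
    return f(x) + A(x) @ y + 0.5 * beta * A(x) @ A(x)

def grad_aug_lagrangian(x, y):
    return grad_f(x) + DA(x).T @ y + beta * DA(x).T @ A(x)

eps, g_fd = 1e-6, np.zeros(d)
for i in range(d):
    e = np.zeros(d); e[i] = eps
    g_fd[i] = (aug_lagrangian(x + e, y) - aug_lagrangian(x - e, y)) / (2 * eps)

assert np.allclose(g_fd, grad_aug_lagrangian(x, y), atol=1e-5)
\end{verbatim}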
By taking another derivative with respect to $x$, we reach
\begin{align}
\nabla^2_x \mathcal{L}_\beta(x,y) & = \nabla^2 f(x) + \sum_{i=1}^m \left( y_i + \b A_i(x) \right) \nabla^2 A_i(x) + \b \sum_{i=1}^m \nabla A_i(x) \nabla A_i(x)^\top.
\end{align}
It follows that
\begin{align}
\|\nabla_x^2 \mathcal{L}_\beta(x,y)\| & \le \| \nabla^2 f(x) \|+ \max_i \| \nabla^2 A_i(x)\| \left (\|y\|_1+\b \|A(x)\|_1 \right) + \beta\sum_{i=1}^m \|\nabla A_i(x)\|^2 \nonumber\\
& \le \lambda_f+ \sqrt{m} \lambda_A \left (\|y\|+\b \|A(x)\| \right) + \b \|DA(x)\|^2_F.
\end{align}
For every $x$ such that $\|A(x)\|\le \rho$ and $\|x\|\le \rho'$, we conclude that
\begin{align}
\|\nabla_x^2 \mathcal{L}_\beta(x,y)\| & \le \lambda_f + \sqrt{m}\lambda_A \left(\|y\| + \b\rho \right) + \b \max_{\|x\|\le \rho'}\|DA(x)\|_F^2,
\end{align}
which completes the proof of Lemma \ref{lem:smoothness}.

\section{Proof of Lemma \ref{lem:11}\label{sec:proof of descent lemma}}
Throughout, let
\begin{align}
G = G_{\b,\g}(x,y) = \frac{x-x^+}{\g},
\end{align}
for short. Suppose that $\|A(x)\|\le \rho$, $\|x\|\le \rho'$, and similarly $\|A(x^+)\|\le \rho$, $\|x^+\|\le \rho'$. An application of Lemma \ref{lem:smoothness} yields that
\begin{align}
\L_\b(x^+,y)+g(x^+) & \le \L_\b(x,y)+ \langle x^+-x,\nabla_x \L_\b(x,y) \rangle + \frac{\lambda_\b}{2} \|x^+ - x\|^2 + g(x^+) \nonumber\\
& = \L_\b(x,y)-\g \langle G ,\nabla_x \L_\b(x,y) \rangle + \frac{\g^2 \lambda_\b }{2} \|G\|^2 + g(x^+).
\label{eq:descent pr 1}
\end{align}
Since $x^+ = P_g(x - \g \nabla_x \L_\b(x,y))$, we also have that
\begin{align}
G - \nabla_x \L_\b(x,y) = \xi \in \partial g(x^+).
\label{eq:opt of prox}
\end{align}
By combining (\ref{eq:descent pr 1},\ref{eq:opt of prox}), we find that
\begin{align}
\L_\b(x^+,y)+g(x^+) & \le \L_\b(x,y) -\g \|G\|^2 + \g \langle G, \xi \rangle + \frac{\g^2 \lambda_\b}{2}\|G\|^2 + g(x^+) \nonumber\\
& = \L_\b(x,y) -\g \|G\|^2 + \langle x- x^+ , \xi \rangle + \frac{\g^2 \lambda_\b}{2}\|G\|^2 + g(x^+) \nonumber\\
& \le \L_\b(x,y) + g(x) - \g\left( 1-\frac{\g\lambda_\b}{2}\right) \|G\|^2,
\end{align}
where the last line above uses the convexity of $g$. Recalling that $\g\le 1/\lambda_\b$ completes the proof of Lemma \ref{lem:11}.

\section{Proof of Lemma \ref{lem:eval Lipsc cte}\label{sec:proof of linesearch}}
Recalling $x^+_{\gamma}$ in \eqref{eq:defn of gamma line search}, we note that
\begin{equation}
x^+_{\gamma} - x +\gamma \nabla_x \mathcal{L}_\beta(x,y) = - \g\xi, \qquad \text{where } \xi \in \partial g (x^+_{\gamma}).
\label{eq:optimality of uplus}
\end{equation}
Lastly, by its definition in \eqref{eq:defn of gamma line search}, $\gamma$ satisfies
\begin{align}
& \mathcal{L}_{\beta}(x^+_{\gamma},y) + g(x_\g^+) \nonumber\\
& \le \mathcal{L}_\beta(x,y) + \left\langle x^+_{\gamma} - x , \nabla_x \mathcal{L}_\beta (x,y) \right\rangle + \frac{1}{2\gamma}\|x^+_{\gamma} - x\|^2 + g(x_\g^+) \nonumber\\
& = \mathcal{L}_\beta(x,y) + \left\langle x - x^+_{\gamma} ,\xi \right\rangle - \frac{1}{2\gamma}\|x^+_{\gamma} - x\|^2 + g(x_\g^+)\nonumber\\
& \le \mathcal{L}_\beta(x,y) - \frac{1}{2\gamma}\|x^+_{\gamma} - x\|^2 + g(x) - g(x^+_\g) \qquad (\text{convexity of } g) \nonumber\\
& = \mathcal{L}_\beta(x,y) - \frac{\gamma}{2} \|G_{\beta,\gamma}(x,y)\|^2 +g(x) - g(x^+_\g), \qquad \text{(see Definition \ref{def:grad map})}
\end{align}
which completes the proof of Lemma \ref{lem:eval Lipsc cte}.

\section{Proof of Lemma \ref{lem:bnd bnd Ak} \label{sec:proof of bnd Ak}}
Throughout, we assume that
\begin{align}
\max_{k\in K} \|A(x_k)\| \le \rho,
\qquad
\max_{k\in K} \| x_k\| \le \rho',
\qquad
\max_{k\in K} \| x_{k+1}-x_k \| \le \rho''.
\label{eq:bndness in slater proof}
\end{align}
From the $x$ update in \eqref{eq:update uk recall}, it follows that
\begin{align}
x_{k+1}-x_k + \g_k \nabla f(x_k) + \g_k DA(x_k)^\top (y_k + \b_k A(x_k) ) =- \xi \in - \partial g(x_{k+1}),
\end{align}
which, after recalling \eqref{eq:grad map recall}, can be written as
\begin{align}
-\frac{ G_k}{\b_k} + \frac{ \nabla f(x_k)}{\b_k} + \frac{ DA(x_k)^\top y_k }{\b_k} + DA(x_k)^\top A(x_k) \in - \frac{\partial g(x_{k+1})}{\b_k \g_k}.
\label{eq:to be projected}
\end{align}
Let
\begin{align}
\cone(x_{k+1}) = \text{cone}(\partial g(x_{k+1}) )^*
\end{align}
be the shorthand for the dual cone associated with
\begin{equation}
\cone(\partial g(x_{k+1})) = \bigcup_{\alpha \ge 0} \alpha \cdot \partial g(x_{k+1}) \subseteq \RR^d.
\label{eq:defn of dual cone}
\end{equation}
By projecting both sides of \eqref{eq:to be projected} onto $\cone(x_{k+1})$, we find that
\begin{align}
& P_{\cone(x_{k+1})}\left( - \frac{G_k}{\b_k} + \frac{\nabla f(x_k)}{\b_k} + \frac{DA(x_k)^\top y_k}{\b_k} + DA(x_k)^\top A(x_k) \right) \nonumber\\
& \in P_{\cone(x_{k+1})}\left( - \frac{\partial g(x_{k+1}) }{\b_k \g_k} \right) = \{ 0\},
\label{eq:before Sk}
\end{align}
where the equality follows from the duality of $\cone(x_{k+1})$ and $\cone(\partial g(x_{k+1}))$, see \eqref{eq:defn of dual cone}. Consider a subspace $S_K\subseteq \RR^d$ such that
\begin{align}
S_{K} \supseteq \bigcup_{k\in K} P_{\cone(x_{k+1})}\left( DA(x_{k+1})^\top A(x_{k+1}) \right),
\label{eq:defn of S}
\end{align}
and project both sides of \eqref{eq:before Sk} onto $S_K$ to reach
\begin{align}
& P_{S_K}P_{\cone(x_{k+1})}\left( - \frac{G_k}{\b_k} + \frac{\nabla f(x_k)}{\b_k} + \frac{DA(x_k)^\top y_k}{\b_k} + DA(x_k)^\top A(x_k) \right) = 0.
\label{eq:pre norm}
\end{align}
By taking the norm and then applying the triangle inequality, we argue that
\begin{align}
& \left\| P_{S_K} P_{\cone(x_{k+1})} ( DA(x_k)^\top A(x_k) ) \right\| \nonumber\\
& \le \left\| P_{S_K} P_{\cone(x_{k+1})} \left( - \frac{G_k}{\b_k} + \frac{\nabla f(x_k)}{\b_k} + \frac{DA(x_k)^\top y_k}{\b_k} \right) \right\| \qquad \text{(see \eqref{eq:pre norm})}.
\label{eq:post norm}
\end{align}
Because the projections onto the cone $\cone(x_{k+1})$ and onto the subspace $S_K$ are non-expansive and $P_{S_K}P_{\cone(x_{k+1})}(0) = 0$, we may upper bound the last line above as
\begin{align}
& \left\| P_{S_K} P_{\cone(x_{k+1})} ( DA(x_k)^\top A(x_k) ) \right\| \nonumber\\
& \le \left\| - \frac{G_k}{\b_k} + \frac{\nabla f(x_k)}{\b_k} + \frac{DA(x_k)^\top y_k}{\b_k} \right\| \nonumber\\
& \le \frac{1}{\b_k} \left( \|G_k\| + \|\nabla f(x_k) \| + \|DA(x_k)^\top y_k\| \right) \qquad \text{(triangle inequality)} \nonumber\\
& \le \frac{1}{\b_k} \left( \|G_k\| +\lambda'_f+ \lambda'_A \|y_k\| \right) ,
\label{eq:slater proof 0}
\end{align}
where
\begin{align}
\lambda'_f := \max_{\|x\| \le \rho'} \| \nabla f(x)\|,
\qquad
\lambda'_A := \max_{\|x\| \le \rho'} \| DA(x)\|.
\end{align}
To lower bound the first line of \eqref{eq:slater proof 0}, we must impose a restricted injectivity condition, as follows. To simplify the notation, we also let $S_{K}$ denote a matrix whose orthonormal columns form a basis for the subspace $S_K$ in \eqref{eq:defn of S}. We then assume that $ \nu_A>0$, where
\begin{align}
\nu_A :=
\begin{cases}
\min_{v,x} \, \frac{\left\| S_{K}^\top P_{\cone(x_{k+1})} ( DA(x)^\top v) \right\|}{\|v\|} \\
\|v\|\le \rho\\
\|x\|\le \rho'.
\end{cases}
\label{eq:new slater proof}
\end{align}
Recalling the first bound in \eqref{eq:bndness in slater proof}, for every $k\in K $, we may therefore write that
\begin{align}
& \left\| P_{S_K} P_{\cone(x_{k+1})} ( DA(x_{k})^\top A(x_k)) \right\| \nonumber\\
& \ge \left\| P_{S_K} P_{\cone(x_{k+1})} ( DA(x_{k+1})^\top A(x_{k})) \right\| - \left\| (DA(x_{k+1}) - DA(x_{k})) ^\top A(x_{k}) \right\| \nonumber\\
& \ge \nu_A \|A(x_k)\| - \| DA(x_{k+1})-DA(x_k) \| \|A(x_k) \|, \qquad \text{(see \eqref{eq:new slater proof})}
\label{eq:slater proof 1}
\end{align}
where the second line above again uses the non-expansiveness of $P_{S_K}$ and $P_{\cone(x_{k+1})}$. The remaining term in \eqref{eq:slater proof 1} is bounded as
\begin{align}
\| DA(x_{k+1})-DA(x_k) \| & \le \lambda_A \|x_{k+1}-x_k\| \le \lambda_A \rho''. \qquad \text{(see (\ref{eq:smoothness basic},\ref{eq:bndness in slater proof}))}
\end{align}
Assuming that
\begin{align}
\lambda_A \rho'' \le \nu_A/2
\end{align}
allows us to simplify the last line of \eqref{eq:slater proof 1} as
\begin{align}
\left\| P_{S_K} P_{\cone(x_{k+1})} ( DA(x_{k})^\top A(x_k)) \right\| \ge \frac{\nu_A }{2} \|A(x_k)\|,
\end{align}
which, after substituting in \eqref{eq:slater proof 0}, yields that
\begin{align}
\|A(x_k)\| & \le \frac{2}{\b_k\nu_A} \left( \|G_k\| + \lambda'_f + \lambda'_A \|y_k\| \right),
\end{align}
and completes the proof of Lemma \ref{lem:bnd bnd Ak}.
diff --git a/tex/Paper/sections/preliminaries.tex b/tex/Paper/sections/preliminaries.tex
index 7faa17c..983063d 100644
--- a/tex/Paper/sections/preliminaries.tex
+++ b/tex/Paper/sections/preliminaries.tex
@@ -1,101 +1,101 @@
\section{Preliminaries \label{sec:preliminaries}}
\paragraph{\textbf{\emph{Notation.}}}
We use the notations $\langle\cdot ,\cdot \rangle $ and $\|\cdot\|$ for the standard inner product and norm on $\RR^d$, respectively. For a convex function $g:\RR^d\rightarrow\RR$, the subdifferential at $x\in \RR^d$ is denoted by $\partial g(x)$ and the proximal operator $P_g:\RR^d\rightarrow\RR^d$ takes $x$ to
\begin{align}
P_g(x) = \underset{y}{\argmin} \, g(y) + \frac{1}{2}\|x-y\|^2.
\label{eq:defn of prox}
\end{align}
In addition, if $g=1_C$ is the indicator function of a convex set or cone, we use the simpler notation $P_C$ instead of $P_{1_C}$ to denote the orthogonal projection onto $C$.
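For illustration only, the following Python snippet evaluates the proximal operator \eqref{eq:defn of prox} in closed form for two common choices of $g$: the indicator of a Euclidean ball, for which $P_g$ reduces to the projection $P_C$, and a scaled $\ell_1$ norm, for which $P_g$ is entrywise soft-thresholding. The function names are hypothetical placeholders and are not defined in the paper.
\begin{verbatim}
# Illustrative sketch only: the proximal operator of eq:defn of prox,
#   P_g(x) = argmin_y  g(y) + 0.5 * ||x - y||^2,
# in closed form for two common choices of g. The function names below
# are hypothetical and are not defined in the paper.
import numpy as np

def prox_indicator_ball(x, radius=1.0):
    # g = 1_C with C the Euclidean ball of the given radius:
    # the prox reduces to the orthogonal projection P_C.
    nrm = np.linalg.norm(x)
    return x if nrm <= radius else (radius / nrm) * x

def prox_l1(x, lam=1.0):
    # g = lam * ||.||_1: the prox is entrywise soft-thresholding at lam.
    return np.sign(x) * np.maximum(np.abs(x) - lam, 0.0)

x = np.array([1.5, -0.2, 0.7])
print(prox_indicator_ball(x, radius=1.0))   # projection onto the unit ball
print(prox_l1(x, lam=0.5))                  # [1.0, 0.0, 0.2]
\end{verbatim}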
Throughout, $g^*$ and $\partial g^*$ will denote the Fenchel conjugate of $g$ and its subdifferential, respectively. For a cone $C$, we let $C^*$ denote its polar, namely,
\begin{align}
C^* = \{x: \langle x,x'\rangle \le 0,\,\, \forall x'\in C \}.
\end{align}
An integer interval is denoted by $[k_0:k_1]=\{k_0,\cdots,k_1\}$ for integers $k_0 \le k_1$. For matrices, $\|\cdot\|$ and $\|\cdot\|_F$ denote the spectral and Frobenius norms, respectively.
\paragraph{\textbf{\emph{{Necessary optimality conditions.}}} \label{sec:opt cnds}}
First-order necessary optimality conditions for Program \eqref{prob:01} are well studied in the literature \cite{rockafellar2009variational}. Indeed, $x$ is a (first-order) stationary point of Program \eqref{prob:01} if there exists $y\in \RR^m$ for which
\begin{align}
\begin{cases}
-\nabla h(x) - DA(x)^\top y \in \partial g(x)\\
A(x) = 0,
\end{cases}
\label{e:inclu1}
\end{align}
where $DA(x)$ is the Jacobian of $A$ at $x$. Recalling \eqref{eq:Lagrangian}, we observe that \eqref{e:inclu1} is equivalent to
\begin{align}
\begin{cases}
-\nabla_x \mathcal{L}_\beta(x,y) \in \partial g(x)\\
A(x) = 0,
\end{cases}
\label{e:inclu2}
\end{align}
which is in turn the necessary optimality condition for Program \eqref{eq:minmax}.
\paragraph{\emph{\textbf{Technical lemmas.}}}
The proof of the following result is elementary but included in Appendix~\ref{sec:proof of smoothness lemma} for completeness.
\begin{lemma}[Smoothness]\label{lem:smoothness}
For fixed $y\in \RR^m$ and $\rho,\rho'\ge 0$, it holds that
\begin{align}
\| \nabla_x \mathcal{L}_{\beta}(x, y)- \nabla_x \mathcal{L}_{\beta}(x', y) \| \le \lambda_\b \|x-x'\|,
\end{align}
for every $x,x' \in \{ x'': \|A(x'')\|\le \rho, \|x''\|\le \rho'\}$, where
\begin{align}
\lambda_\beta & \le \lambda_f + \sqrt{m}\lambda_A \left(\|y\| + \b\rho \right) + \b d \lambda'^2_A.
\label{eq:smoothness of Lagrangian}
\end{align}
Above, $\lambda_f,\lambda_A$ were defined in (\ref{eq:smoothness basic}) and
\begin{align}
\lambda'_A := \max_{\|x\|\le \rho'}\|DA(x)\|.
\end{align}
\end{lemma}
The gradient mapping, defined below, plays an important role in our convergence analysis.
\begin{definition}\label{def:grad map}\textbf{{(Gradient Mapping)}}
Given $x\in \RR^d$ and $\gamma >0$, the gradient mapping $G_{\beta,\gamma}(\cdot; y): \RR^d\rightarrow\RR^d$ takes $x\in \RR^d$ to
\begin{equation}
G_{\b,\gamma}(x,y) = \frac{x-x^+}{\gamma},
\label{eq:grad map}
\end{equation}
where $x^+=P_{g}(x-\gamma \nabla_x \mathcal{L}_ {\beta}(x,y))$.
\end{definition}
In particular, if we remove $g$ from Program \eqref{prob:01}, the gradient mapping reduces to $G_{\beta,\gamma}(x,y)=\nabla_x \mathcal{L}_\beta(x,y)$. For a sufficiently small step size $\g$, the gradient mapping determines the corresponding descent in the objective function of Program \eqref{eq:minmax}. The following result is standard \cite[Lemma 3.2, Remark 3.2(i)]{bolte2014proximal}, but the proof is given in Appendix \ref{sec:proof of descent lemma} for completeness.
\begin{lemma}[Descent lemma]\label{lem:11}
For $x\in \RR^d$ and $y\in \RR^m$, let $x^+ = P_g(x - \g \nabla_x \L_\b(x,y) )$, where $\g< 1/\lambda_\b$. For $\rho,\rho'\ge 0$, suppose that
\begin{align}
x,x^+ \in \{ x': \|A(x')\| \le \rho, \|x'\| \le \rho' \}.
\end{align}
Then it holds that
\begin{equation}
\label{e:deslem}
\| G_{\beta,\gamma}(x,y)\|^{2}\leq \frac{2}{\gamma} (\mathcal{L}_\beta(x,y) + g (x)- \mathcal{L}_\beta(x^+,y)- g(x^+) ).
\end{equation}
\end{lemma}
In practice, determining the step size $\gamma$ by computing the right-hand side of \eqref{eq:smoothness of Lagrangian} might be intractable. Instead, we often resort to the classic line search technique, reviewed below and proved in Appendix \ref{sec:proof of linesearch} for completeness.
\begin{lemma}[Line search] \label{lem:eval Lipsc cte}
Fix $\theta \in (0,1)$ and ${\gamma}_0>0$. For $\gamma'>0$, let
\begin{align}
x^+_{\gamma'} = P_g(x - \gamma' \nabla_x \mathcal{L}_\beta(x,y)),
\label{eq:defn of x+gamma}
\end{align}
and define
\begin{align}
\gamma & := \max \Big\{ \gamma' ={\gamma}_0 \theta^i : \mathcal{L}_\beta (x^+_{\gamma'},y) \nonumber\\
& \qquad \qquad \quad \le \mathcal{L}_\beta (x,y)
+\left\langle x^+_{\gamma'} - x , \nabla_x \mathcal{L}_{\beta}(x,y) \right\rangle
+ \frac{1}{2\gamma'} \| x^+_{\gamma'} - x\|^2 \Big\}.
\label{eq:defn of gamma line search}
\end{align}
Then, (\ref{e:deslem}) holds and, moreover, we have that
\begin{align}
\gamma \ge \frac{\theta}{\lambda_\beta}.
\label{eq:moreover}
\end{align}
\end{lemma}
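For illustration only, the following Python sketch implements the backtracking line search of Lemma~\ref{lem:eval Lipsc cte} in the special case $g=1_C$, so that $P_g$ reduces to the projection onto $C$. The callables passed as arguments are hypothetical placeholders to be supplied by the user and are not defined in the paper; the sketch is not part of the paper's analysis.
\begin{verbatim}
# Illustrative sketch only (not the paper's implementation): backtracking
# line search of the Line search lemma for g = 1_C, so that P_g is the
# projection onto C. The callables lagrangian(x, y), grad_lagrangian(x, y)
# and project_C(v) are hypothetical placeholders supplied by the user.
import numpy as np

def line_search_step(x, y, lagrangian, grad_lagrangian, project_C,
                     gamma0=1.0, theta=0.5, max_backtracks=50):
    """Return (x_plus, gamma), where gamma = gamma0 * theta**i is the first
    tested step size that satisfies the sufficient-decrease condition."""
    grad_x = grad_lagrangian(x, y)
    L_x = lagrangian(x, y)
    gamma = gamma0
    for _ in range(max_backtracks):
        x_plus = project_C(x - gamma * grad_x)   # x^+_{gamma'} = P_C(x - gamma' grad)
        diff = x_plus - x
        # L(x^+, y) <= L(x, y) + <x^+ - x, grad> + ||x^+ - x||^2 / (2 gamma)
        if lagrangian(x_plus, y) <= L_x + diff @ grad_x + diff @ diff / (2.0 * gamma):
            return x_plus, gamma
        gamma *= theta                           # backtrack
    return x_plus, gamma

# Toy usage with f(x) = 0.5||x||^2, A(x) = x - 1, beta = 10, C = unit ball:
beta = 10.0
lag  = lambda x, y: 0.5 * x @ x + (x - 1) @ y + 0.5 * beta * (x - 1) @ (x - 1)
grad = lambda x, y: x + y + beta * (x - 1)
proj = lambda v: v if np.linalg.norm(v) <= 1.0 else v / np.linalg.norm(v)
x_plus, gamma = line_search_step(np.zeros(3), np.ones(3), lag, grad, proj)
\end{verbatim}
The returned step size $\g$ then defines the gradient mapping $G_{\b,\g}(x,y)=(x-x^+)/\g$ of Definition~\ref{def:grad map}.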