diff --git a/Neural_ODE.tex b/Neural_ODE.tex index debf20f..3edaead 100644 --- a/Neural_ODE.tex +++ b/Neural_ODE.tex @@ -1,1018 +1,1025 @@ \documentclass[usenames,dvipsnames,aspectratio=169,10pt]{beamer} \usepackage{multicol} \usetheme{metropolis} \usepackage{appendixnumberbeamer} \usepackage{autonum} \usepackage{booktabs} \usepackage[scale=2]{ccicons} \usepackage{bm} \usepackage{pgfplots} \usepackage[utf8]{inputenc} \usepackage{media9} \usepackage{subcaption} \usepackage[english]{babel} \usepackage{amsmath} \usepackage{mathtools} \usepackage{amsfonts} \usepackage{amssymb} \usepackage{graphicx} \usepackage{xmpmulti} \usepackage{animate} \newcommand{\notimplies}{\;\not\!\!\!\implies} \usepackage{fontspec} % optional \pgfplotsset{compat=newest} \usepgfplotslibrary{groupplots} \usepgfplotslibrary{dateplot} \usepgfplotslibrary{dateplot} \newcommand{\inputTikZ}[2]{% \scalebox{#1}{\input{#2}} } \newcommand\blfootnote[1]{% \begingroup \renewcommand\thefootnote{}\footnote{#1}% \addtocounter{footnote}{-1}% \endgroup } \usepgfplotslibrary{groupplots,dateplot} \usetikzlibrary{patterns,shapes.arrows} \pgfplotsset{compat=newest} \pgfplotsset{compat=1.13} \usepgfplotslibrary{fillbetween} \pgfmathdeclarefunction{gauss}{2} {\pgfmathparse{1/(#2*sqrt(2*pi))*exp(-((x-#1)^2)/(2*#2^2))}} \usepackage{xspace} \newcommand{\themename}{\textbf{\textsc{metropolis}}\xspace} \definecolor{burgundy}{RGB}{255,0,90} \usepackage{algorithm} \usepackage{algpseudocode} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % % Listings % % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \usepackage{listings,bera} \definecolor{keywords}{RGB}{255,0,90} \definecolor{comments}{RGB}{60,179,113} \lstset{language=Python, keywordstyle=\color{keywords}, commentstyle=\color{comments}\emph} \lstset{escapeinside={<@}{@>}} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % % Color stuff % % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \usepackage{xcolor,pifont} \newcommand*\colourcheck[1]{% \expandafter\newcommand\csname #1check\endcsname{\textcolor{#1}{\ding{52}}}% } \colourcheck{blue} \colourcheck{green} \colourcheck{red} \definecolor{fore}{RGB}{249,242,215} \definecolor{back}{RGB}{51,51,51} \definecolor{title}{RGB}{255,0,90} \definecolor{mDarkBrown}{HTML}{604c38} \definecolor{mDarkTeal}{HTML}{23373b} \definecolor{mLightBrown}{HTML}{EB811B} \definecolor{mLightGreen}{HTML}{14B03D} \definecolor{aqb}{HTML}{6FEBBE} \setbeamercolor{titlelike}{fg=} \setbeamercolor{normal text}{fg=fore,bg=back} \newcommand{\pink}[1]{{\color{magenta} #1}} \newcommand{\blue}[1]{{\color{aqb} #1}} \newcommand{\tinto}[1]{{\color{burgundy} #1}} %symbol definitions %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % % Variables % % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \newcommand{\yj}{{Y}_{j+1}} \newcommand{\aat}{\tinto{{\lambda}^T}} \newcommand{\dat}{\tinto{\dot{{\lambda}}^T}} \newcommand{\ym}{{Y}_{j}} \newcommand{\kj}{{K}_j} \newcommand{\bj}{b_j} \newcommand{\yy}{{Y}} \newcommand{\te}{{\theta}} \newcommand{\R}{\mathbb{R}} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % % delimiters % % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \newcommand{\lno}{\left \Vert} \newcommand{\rno}{\right \Vert} \newcommand{\lv}{\lvert} \newcommand{\rv}{ \rvert} \newcommand{\inner}[2]{\left\langle #1,#2 \right\rangle} \newcommand{\abs}[1]{\left \vert#1 \right \vert} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % % operators % % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \newcommand{\calc}{\mathcal{C}} \newcommand{\calku}{\mathcal{K}(u)} \newcommand{\ff}{\mathcal{F}} \newcommand{\diff}{\mathsf{d}} \newcommand{\pd}[2]{\frac{\partial 
#1}{\partial #2}} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % % check mark % % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \title{Continuous limits of DNN: Neural networks as ODEs} %\subtitle{An overview} \date{November 13, 2020} \author{Eva Vidli\v ckov\'a and Juan Pablo Madrigal Cianci,\\ CSQI.} \institute{Foundations of deep neural networks} \titlegraphic{\hfill\includegraphics[height=.5cm]{logo.png}} \begin{document} \maketitle %\begin{frame}{Table of contents} % \setbeamertemplate{section in toc}[sections numbered] % \tableofcontents[hideallsubsections] %\end{frame} %\section{Introduction and Motivation: ResNets} \begin{frame}{ResNets} Here we briefly describe what resnets are, maybe define some notation and make the point that they look like an ODE. \end{frame} \begin{frame}{Outline} The rest of the talk summarizes the following two articles: \nocite{*} \bibliography{bib_intro} \bibliographystyle{plain} \end{frame} \section{Stable architectures for deep neural networks} %\section{Introduction} \begin{frame}{Classification problem} \begin{itemize} \item training data: \begin{align} & y_1,\dots, y_\tinto{s} \in \R^\tinto{n} \quad \text{ feature vectors }\\ & c_1,\dots, c_\tinto{s} \in \R^\tinto{m} \quad \text{ label vectors } \\ & (c_l)_k \text{ - likelihood of $y_l$ belonging to class $k$} \end{align} \item objective: learn data-label relation function that \alert{generalizes} well\\[20pt]\pause \item \blue{deep architectures}\\[10pt] \begin{itemize} \item[+] successful for highly nonlinear data-label relationships\\[5pt] \item[--] dimensionality, non-convexity, \textbf{instability} of forward model \end{itemize} \end{itemize} \end{frame} \begin{frame}{ResNets: Forward propagation} %\textbf{Forward propagation} \begin{gather} Y_{j+1} = Y_j + \tinto{h}\sigma(Y_j \blue{K_j} + \blue{b_j}), \quad j= 0,\dots,N-1\\[10pt] Y_j \in\R^{s\times n},\, \blue{K_j}\in\R^{n\times n},\, \blue{b_j}\in\R \end{gather} \begin{itemize} \item $Y_0 = [y_1,\dots,y_s]^\intercal$ \item $Y_1,\dots, Y_{N-1}$ - hidden layers,\; $Y_N$ - output layer \item activation function \[\sigma_{ht}(Y) = \tanh(Y),\quad \sigma_{ReLU} = \max(0,Y)\] \end{itemize} \end{frame} \begin{frame}{ResNets: Classification} %\textbf{Classification} \[ h_{hyp}(Y_N \pink{W} + e_s\pink{\mu}^\intercal),\quad \pink{W}\in \R^{n\times m}, \pink{\mu}\in\R^m \] \begin{itemize} \item hypothesis function \[ h_{hyp}(x) = \exp(x)./(1+\exp(x)) \] \end{itemize} \end{frame} \begin{frame}{Learning process} \begin{columns} \begin{column}{0.5\textwidth} \begin{center} forward prop. parameters\\[10pt] $(\blue{K_j}, \blue{b_j},\;\; j=0,\dots, N-1)$ \end{center} \end{column} \begin{column}{0.5\textwidth} \begin{center} classification parameters \\[10pt] $(\pink{W}, \pink{\mu})$ \end{center} \end{column} \end{columns} \vspace{1cm} \centerline{\alert{\textbf{Optimization problem}}} \begin{gather} \min \frac{1}{s} \alert{S}\big(h_{hypo}(Y_N \pink{W} + e_s\pink{\mu}^\intercal), C\big) + \alpha \alert{R}(\pink{W},\pink{\mu},\blue{K_j}, \blue{b_j}) \\[5pt] \text{ s.t. } Y_{j+1} = Y_j + h\sigma(Y_j \blue{K_j} + \blue{b_j}), \qquad j= 0,\dots,N-1 \end{gather} \begin{itemize} \item $C = [c_1,c_2,\dots,c_s]^\intercal\in\mathbb{R}^{s\times m}$ \item e.g. 
$\alert{S}(C_{pred},C) = \frac{1}{2}\|C_{pred} - C\|_F^2$ \end{itemize} \end{frame} \begin{frame}{Learning process} \begin{itemize} \item block coordinate descent method \item \begin{align} \frac{1}{\pink{s}} S\big(h_{hypo}(Y_N W + e_s\mu^\intercal), C\big) &= \frac{1}{\pink{s}} \sum_{i=1}^{\pink{s}} S\Big(h_{hypo}\big((Y_N)_i^\intercal W + \mu^\intercal\big), c_i^\intercal\Big)\\ &\approx \frac{1}{|\blue{\mathcal{T}}|} \sum_{i\in\blue{\mathcal{T}}} S\Big(h\big((Y_N)_i^\intercal W + \mu^\intercal\big), c_i^\intercal\Big) \end{align} \item learning data \& validation data \end{itemize} \end{frame} %\section{ODE interpretation} \begin{frame}{ResNets as discretized ODEs} \textbf{ResNet} \[ Y_{j+1} = Y_j + h\sigma(Y_j K_j + b_j), \quad j= 0,\dots,N-1 \] \textbf{Continuous ODE} \begin{align} &\dot{y}(t) = \sigma\big( K^\intercal(t)y(t) + b(t) \big), \quad t\in [0,T]\\ &y(0) = Y_0 \end{align} \end{frame} \begin{frame}{Stability of continuous ODEs} \[ \dot{y}(t) = f(t, y(t)) \] \begin{enumerate} \item \alert{linear} with \alert{constant} coefficients:\qquad $\dot{y}(t) = Ay(t) + b$\\ \begin{itemize} \item asymptotically stable if $\text{Re}(\lambda_i(A)) < 0,\;\forall i$ \item stable if $\text{Re}(\lambda_i(A)) < 0$ or $\text{Re}(\lambda_i(A)) = 0$ and geometrical multiplicity = algebraic multiplicity $\forall i$ \item unstable otherwise \end{itemize} \item \alert{nonlinear} : \qquad $\dot{y}(t) = f(t, y(t))$\\ $\to$ variational equation \begin{gather} \dot{z} = J(t) z\\ J(t) = \frac{\partial f}{\partial y}(t, y(t)) \end{gather} \item \alert{linear} with \alert{non-constant} coefficients:\qquad $\dot{y}(t) = A(t)y(t) + b(t)$\\ complicated (kinematic eigenvalues) \end{enumerate} \end{frame} \begin{frame}{Stability for continuous forward propagation of NN } \textbf{Continuous problem} \[ \dot{y}(t) = \sigma\big( K(t)^\intercal y(t) + b(t) \big), \quad y(0) = y_0 \] stability conditions for ODEs inspires following conditions: \begin{enumerate} \item $\blue{K(t)}, \blue{b(t)}$ changes sufficiently \alert{slowly} \item \begin{align} \max_{i=1,\dots,n} &\text{Re}\Big(\pink{\lambda_i}\big(J(t)\big)\Big) \leq 0,\quad\forall t\in [0,T]\\[10pt] \text{where } J(t) &= \nabla_y\Big(\sigma\big( K(t)^\intercal y(t) + b(t) \big)\Big)^\intercal\\ &= \text{diag}\Big(\underbrace{\sigma'\big( K(t)^\intercal y(t) + b(t) \big)}_{\geq 0}\Big) K(t)^\intercal \end{align} $ \to \max_{i=1,\dots,n} \text{Re}\Big(\pink{\lambda_i}\big(K(t)\big)\Big) \leq 0,\quad\forall t\in [0,T]. 
$ \end{enumerate} \end{frame} \begin{frame}{Stability of discretized ODE and NNs} \begin{columns} \begin{column}{0.5\textwidth} \center{\alert{Discrete ODEs}} \end{column} \begin{column}{0.5\textwidth} \center{\alert{Neural networks}} \end{column} \end{columns} \bigskip \begin{columns} \begin{column}{0.5\textwidth} \[ \dot{y} = Ay \] \end{column} \begin{column}{0.5\textwidth} \[ \dot{y}(t) = \sigma\big( K(t)^\intercal y(t) + b(t) \big) \] \end{column} \end{columns} \begin{columns} \begin{column}{0.5\textwidth} \[ y_{j+1} = y_j + h A y_j \] \end{column} \begin{column}{0.5\textwidth} \[ Y_{j+1} = Y_j + h\sigma(Y_j K_j + b_j) \] \end{column} \end{columns} \bigskip \begin{columns} \begin{column}{0.5\textwidth} stability condition on \tinto{h}: \[ | 1 + h\lambda_i(A) | \leq 1\quad \forall i \] \end{column} \begin{column}{0.5\textwidth} stability condition on \tinto{h} \[ \max_{i=1,\dots,n} |1+\tinto{h}\lambda_i(J_j)| \leq 1,\quad \forall j=0,1,\dots,N-1 \] \end{column} \end{columns} \begin{columns} \begin{column}{0.5\textwidth} \end{column} \begin{column}{0.5\textwidth} \end{column} \end{columns} \begin{figure} \includegraphics[scale=0.05]{figures/Euler_stab.png} \end{figure} \end{frame} \begin{frame}{Example: Stability of ResNet} \begin{figure} \centering \includegraphics[width = 0.8\textwidth]{figures/ResNet_stab.png} \end{figure} \begin{align} K_{+}&= \begin{pmatrix} 2 & -2\\ 0& 2 \end{pmatrix} & K_{-} &= \begin{pmatrix} -2 & 0 \\ 2 & -1 \end{pmatrix} & K_0 &= \begin{pmatrix} 0 & -1 \\ 1 & 0 \end{pmatrix}\\ \lambda(K_+) &= 2 & \lambda(K_-) &= -2 & \lambda(K_0)&= i,-i \end{align} \begin{itemize} \item $s = 3,\, n=2,\, h = 0.1, \, b = 0,\, \sigma = \tanh,\, N = 10$ \end{itemize} \end{frame} \begin{frame}{Well-posed forward propagation} \begin{enumerate} \item $\max_i \text{Re}(\pink{\lambda_i}(K)) > 0$\\[10pt] \begin{itemize} \item neurons amplify signal with no upper bound \item unreliable generalization\\[20pt] \end{itemize} \item $\max_i \text{Re}(\pink{\lambda_i}(K)) << 0$\\[10pt] \begin{itemize} \item inverse problem highly ill-posed \item vanishing gradients problem \item lossy network\\[20pt] \end{itemize} \end{enumerate} $\implies \text{Re}(\pink{\lambda_i}(K(t))) \alert{\approx 0},\quad\forall i=1,2,\dots,n,\;\forall t\in [0,T]$ \end{frame} %\section{Stable architectures} \begin{frame}{Antisymmetric weight matrices} \[ \dot{y}(t) = \sigma \Big( \frac{1}{2}\big(\underbrace{ K(t) - K(t)^\intercal }_{ \mathclap{\text{antisymmetric $\to$ imaginary eigenvalues}} } - \alert{\gamma} I\big)y(t) + b(t)\Big),\quad t\in [0,T] \] \bigskip \begin{enumerate} \onslide<2->{\item $\alert{\gamma = 0}$} \onslide<3->{\begin{figure} \centering \includegraphics[scale = 0.25]{figures/RK_stab.png} \end{figure} } \onslide<4->{ \item $\alert{\gamma > 0}$ \quad $\to$ Forward Euler discretization $$Y_{j+1} = Y_j + \tinto{h}\sigma\Big(\frac{1}{2}Y_j (K_j - K_j^\intercal - \alert{\gamma} I) + b_j\Big)$$} \end{enumerate} \end{frame} \begin{frame}{Hamiltonian inspired NN} \vspace{-0.9cm} \[ \dot{y}(t) = -\nabla_z \blue{H}(y,z,t), \quad \dot{z}(t) = \nabla_y \blue{H}(y,z,t),\quad t\in [0,T] \] \begin{itemize} \item Hamiltonian $\blue{H}: \R^n\times \R^n\times [0,T]\to \R$ conserved\\[10pt] \item energy \alert{conserved}, not dissipated\\[20pt] \end{itemize} \end{frame} \begin{frame}{Hamiltonian inspired NN} Hamiltonian $\blue{H}(y,z) = \frac{1}{2}z^\intercal z - f(y)$\\[5pt] $$\dot{y}(t) = -z(t), \; \dot{z}(t) = -\nabla_y f(y(t))\quad\implies \ddot{y}(t) = \nabla_y f(y(t))$$ \pause \begin{itemize} \item 
$\alert{\ddot{y}(t) = \sigma\Big( K^\intercal (t) y(t) + b(t)\Big)},\; y(0) = Y_0,\; \dot{y}(0) = 0$\\[5pt]\pause \item stable for $K$ with non-positive real eigenvalues\\[5pt] \item $K(C) = -C^\intercal C,\quad C\in\R^{n\times n}$\\[5pt] \item nonlinear parametrization - complicated optimization\\[5pt] \item leapfrog discretization scheme (symplectic integrator) \end{itemize} \end{frame} \begin{frame}{Hamiltonian inspired NN} \[ \alert{\dot{y}(t) = \sigma\Big( K (t) z(t) + b(t)\Big) \qquad \dot{z}(t) = \sigma\Big( K^\intercal (t) y(t) + b(t)\Big)} \] Associated ODE: \begin{align} \frac{\partial}{\partial t} \begin{pmatrix} y\\ z \end{pmatrix}(t) &= \sigma \begin{pmatrix}\begin{pmatrix} 0 & K(t) \\ -K(t)^\intercal & 0 \end{pmatrix} \begin{pmatrix} y\\ z \end{pmatrix}(t) + b(t) \end{pmatrix}, \\ \begin{pmatrix} y\\ z \end{pmatrix}(0) &= \begin{pmatrix} y_0\\ 0 \end{pmatrix} \end{align} \pause \begin{itemize} \item antisymmetric matrix \item Verlet integration scheme (symplectic) $$ z_{j+1/2} = z_{j-1/2} - h\sigma(K_j^\intercal y_j + b_j),\quad y_{j+1} = y_j + h\sigma (K_j z_{j+1/2} + b_j)$$ \end{itemize} \end{frame} \begin{frame}{Regularization} \[ \min \frac{1}{s} {S}\big(h_{hypo}(Y_N {W} + e_s{\mu}^\intercal), C\big) + \alpha \alert{R}(\pink{W},\pink{\mu},\blue{K_j}, \blue{b_j}) \] \pause \begin{enumerate} \item Forward propagation\\[10pt] \begin{itemize} \item standard: weight decay (Tikhonov regularization) \[ R(K) = \frac{1}{2}\|K\|_F^2 \] \item $\blue{K,\,b}$ to be sufficiently smooth \[ \alert{R}(\blue{K}) = \frac{1}{2h}\sum \|K_j - K_{j-1}\|_F^2\quad \alert{R}(\blue{b}) = \frac{1}{2h}\sum \|b_j - b_{j-1}\|^2\] \end{itemize}\pause \item Classification\\[10pt] \begin{itemize} \item $h_{hypo}(y_j^\intercal w_k + \mu_k)\approx h_{hypo}\Big(\text{vol}(\Omega) \int_{\Omega} y(x)w(x)\mathrm{d}x + \mu_k \Big)$ \item $$\alert{R}(\pink{w_k}) = \frac{1}{2} \|L w_k\|^2\quad L - \text{discretized differential operator}$$ \end{itemize} \item Multi-level learning \end{enumerate} \end{frame} %\section{Numerical examples} \begin{frame}{Concentric ellipses} \begin{figure} \centering \includegraphics[width = \textwidth]{figures/Elipses.png} \end{figure} \begin{itemize} \item 1200 points: 1000 training + 200 validation \item multi-level: 4, 8, 16, \dots, 1024 layers \item T = 20, n = 2, $\alpha = 10^{-3}$, $\sigma = \tanh$ \item standard ResNet, antisymmetric ResNet, Hamiltonian - Verlet network \end{itemize} \end{frame} \begin{frame}{Convergence} \begin{figure} \centering \includegraphics[width = \textwidth]{figures/Convergence.png} \end{figure} \end{frame} \begin{frame}{Swiss roll} \begin{figure} \centering \includegraphics[width = \textwidth]{figures/Swiss_roll.png} \end{figure} \begin{itemize} \item 513 points: 257 training + 256 validation \item multi-level: 4, 8, 16, \dots, 1024 layers \item T = 20, n = 4,4,2, $\alpha = 5\cdot 10^{-3}$, $\sigma = \tanh$ \item standard ResNet, antisymmetric ResNet, Hamiltonian - Verlet network \end{itemize} \end{frame} \begin{frame}{Peaks} \begin{figure} \centering \includegraphics[width = \textwidth]{figures/Peaks.png} \end{figure} \begin{itemize} \item 5000 samples: 20\% for validation \item multi-level: 4, 8, 16, \dots, 1024 layers \item T = 5, n = 8,8,2, $\alpha = 5\cdot 10^{-6}$, $\sigma = \tanh$ \item standard ResNet, antisymmetric ResNet, Hamiltonian - Verlet network \end{itemize} \end{frame} \begin{frame}{MNIST} \begin{figure} \centering \includegraphics[scale = 0.8]{figures/MNIST.png} \end{figure} \end{frame} \begin{frame}{MNIST} \begin{itemize} \item 60 
000 labeled images: 50 000 training, 10 000 validation, \item 28 $\times$ 28, multi-level: 4,8,16 \item T = 6, n = 4704, $\alpha = 0.005$, $3 \times 3$ convolution operators \item standard ResNet, antisymmetric ResNet, Hamiltonian - Verlet network \bigskip \end{itemize} \begin{figure} \centering \includegraphics[width = 0.9\textwidth]{figures/MNIST_table.png} \end{figure} \end{frame} \section{Neural ODEs} %% %% %% \begin{frame}[fragile]{Motivation: ResNets and Euler's method} $$\yj=\ym+ \underbrace{h\sigma\left( \ym\kj+\bj\right)}_\text{$=f(\ym,\te_j)$}, \quad \text{Euler discretization of $\frac{\diff Y}{\diff t}=f(Y,\te(t))$}$$ \begin{columns} \begin{column}{0.5\linewidth} %\begin{lstlisting} %#D %-+*efines the architecture %def f(Y,t,θ): %return neural_net(z,θ[t]) % %#Defines the resnet %def resnet(Y): %for t in [1:T]: % Y=Y+f(Y,t,θ) %return Y %\end{lstlisting} \vspace{0.5cm} \texttt{ \blue{\#Defines the architecture}\\ \tinto{def} f(Y,t,θ):\\ \tinto{return} neural\_net(z,θ[t])\\ \vspace{1cm}} \texttt{ \blue{\#Defines the ResNet}\\ \tinto{def} ODE\_Net(Y0):\\ \pink{for} t in [1:T]:\\ \hspace{0.5 cm} Y=Y+f(Y,t,θ)\\ {\tinto{return} Y}} \vspace{1cm} \end{column} \begin{column}{0.5\linewidth} \begin{figure} \centering \uncover<2->{ \includegraphics[width=0.8\linewidth]{figures/eulers.png} } \end{figure} \uncover<3->{ Can we do better? } \end{column} \end{columns} \end{frame} %% \begin{frame}[fragile]{Improving on Euler's method} $$\yj=\ym+ \underbrace{h\sigma\left( \ym\kj+\bj\right)}_\text{$=f(\ym,\te_j)$}, \quad \text{Euler discretization of $\frac{\diff Y}{\diff t}=f(Y,\te(t))$}$$ \begin{columns}[t] \begin{column}{0.6\linewidth} %\begin{lstlisting} %#Defines the architecture %def f(Y,t,θ): %return neural_net([z,t],θ[t]) % %#Defines the ODE Net %def ODE_Net(Y0): %return ODE_Solver(f,Y0,\theta,t_0=0,t_f=1) %\end{lstlisting} \vspace{0.5cm} \texttt{ \blue{\#Defines the architecture}\\ \tinto{def} f(Y,t,θ):\\ \tinto{return} neural\_net(\pink{[z,t]},θ[t])\\ \vspace{0.5cm}} \texttt{ \blue{\#Defines the ODE Net}\\ \tinto{def} ODE\_Net(Y0):\\ {\tinto{return} \alert{ODE\_Solver}(f,Y0,θ,t\_0=0,t\_f=1)}} \vspace{0.5cm} Here \texttt{\alert{ODE\_Solver}} is a black-box ODE solver. \end{column} \begin{column}{0.5\linewidth} \centering \uncover<2->{ \includegraphics[width=0.7\linewidth]{figures/adaptive.png} } \end{column} \end{columns} \uncover<3->{ \begin{center} \pink{Main idea:} Continuous depth + good ODE solver. 
\end{center} } \end{frame} \begin{frame}[fragile]{Comparison} \begin{columns}[t] \begin{column}{0.5\linewidth} \textbf{ResNet:} \texttt{ \blue{\#Defines the architecture}\\ \tinto{def} f(Y,t,θ):\\ \tinto{return} neural\_net(Y,θ[t])\\ \vspace{0.25cm}} \texttt{ \blue{\#Defines the ResNet}\\ \tinto{def} ResNet(Y):\\ \pink{for} t in [1:T]:\\ \hspace{0.5 cm} Y=Y+f(Y,t,θ)\\ {\tinto{return} Y}} \end{column} \begin{column}{0.5\linewidth} \textbf{ODENet:} \texttt{ \blue{\#Defines the architecture}\\ \tinto{def} f(Y,t,θ):\\ \tinto{return} neural\_net(\pink{[Y,t]},θ[t])\\ \vspace{0.25cm}} \texttt{ \blue{\#Defines the ODENet}\\ \tinto{def} ODE\_Net(Y0):\\ {\tinto{return} \alert{ODE\_Solver}(f,Y0,θ,t\_0=0,t\_f=1)}} \end{column} \end{columns} \begin{center} \includegraphics[width=0.35\linewidth]{figures/ode_res} \end{center} \end{frame} %% %% %% %\begin{frame}{Some considerations} %\begin{center} % \includegraphics[width=0.2\linewidth]{figures/resnett}\hspace{2cm} % \includegraphics[width=0.2\linewidth]{figures/odenett} %\end{center} %\end{frame} + \begin{frame}{Training the Neural Network: Adjoint Method} We aim to minimize $J:\R^p\to \R$, $$J(\yy,t_f,\te)=J\left(\yy(t_0)+\int_{t_0}^{t_f}f(\yy,t,\te)\diff t \right)=J(\text{\texttt{\alert{ODE\_Solver}}}(f,\yy(t_0),\te,t_0=0,t_f=1)).$$ +$$\frac{\partial J}{\partial \te}=?$$ + + +\textbf{Backprop:} +\begin{align} +\frac{\partial J}{\partial Y_t}=\frac{\partial J}{\partial Y_{t+1}}\frac{\partial Y_{t+1}}{\partial Y_t}, \quad \pd{J}{\te_t}=\pd{J}{Y_{t+1}}\frac{\partial Y_{t+1}}{\partial \te_t}\end{align} \textbf{Difficulties: } \begin{enumerate} \item \alert{\texttt{ODE\_Solver}} is a black box. \item There is no notion of layers, since we are in the continuous limit. \end{enumerate} -$$\frac{\partial J}{\partial \te}=?$$ + How does $\yy(t)$, and hence $J$, depend on $\te$ at each instant $t$? Don't use back-prop, but rather the \tinto{adjoint-state method} (Pontryagin et al., 1962).
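% A minimal NumPy/SciPy sketch (not from either paper) of the adjoint-state recipe
% derived on the next frames: solve the state ODE forward, solve the adjoint ODE
% backward, and accumulate dJ/dtheta along the way, for the toy dynamics
% f(y) = tanh(K y + b) and loss J = 0.5*||y(T) - y_target||^2. The names K, b,
% y_target are illustrative. Kept as a commented listing, as elsewhere in this file.
%\begin{lstlisting}
%import numpy as np
%from scipy.integrate import solve_ivp
%
%rng = np.random.default_rng(0)
%n = 3
%K, b = rng.normal(size=(n, n)), rng.normal(size=n)
%y0, y_target, T = rng.normal(size=n), rng.normal(size=n), 1.0
%
%def f(y):                                   # toy right-hand side f(y; K, b)
%    return np.tanh(K @ y + b)
%
%# 1) forward pass: integrate dy/dt = f(y) from 0 to T
%yT = solve_ivp(lambda t, y: f(y), (0.0, T), y0, rtol=1e-9, atol=1e-9).y[:, -1]
%
%def backward(t, s):                         # augmented state [y, a, dJ/dK, dJ/db]
%    y, a = s[:n], s[n:2*n]
%    sp = 1.0 - np.tanh(K @ y + b)**2        # sigma'(K y + b)
%    da = -K.T @ (sp * a)                    # da/dt = -(df/dy)^T a
%    dgK = -np.outer(sp * a, y).ravel()      # accumulates int a^T df/dK dt
%    dgb = -(sp * a)                         # accumulates int a^T df/db dt
%    return np.concatenate([f(y), da, dgK, dgb])
%
%# 2) backward pass: adjoint starts from a(T) = dJ/dy(T)
%aT = yT - y_target
%s0 = np.concatenate([yT, aT, np.zeros(n*n + n)])
%s_end = solve_ivp(backward, (T, 0.0), s0, rtol=1e-9, atol=1e-9).y[:, -1]
%grad_K, grad_b = s_end[2*n:2*n+n*n].reshape(n, n), s_end[2*n+n*n:]
%
%# 3) finite-difference check of one entry of dJ/dK
%def J(K_):
%    yT_ = solve_ivp(lambda t, y: np.tanh(K_ @ y + b), (0.0, T), y0,
%                    rtol=1e-9, atol=1e-9).y[:, -1]
%    return 0.5 * np.sum((yT_ - y_target)**2)
%Kp = K.copy(); Kp[0, 1] += 1e-6
%print(grad_K[0, 1], (J(Kp) - J(K)) / 1e-6)  # the two numbers should agree
%\end{lstlisting}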
\end{frame} %% %% %% \begin{frame}{Training the Neural Network: Adjoint Method} Define first $$G(\yy,t_f,\te):=\int_{t_0}^{t_f} J(\yy,t,\te)\diff t, \quad \frac{\diff}{\diff t_f}G(\yy,t_f,\te)=J(\yy,t,\te)$$ and the Lagrangian $$L=G(\yy,t_f,\te)+\int_{t_0}^{t_f}\aat(t)\left( \dot{\yy}(t,\te)-f(\yy,t,\te) \right)\diff \te $$ Then, \begin{align} \frac{\partial L}{\partial \te}=\int_{t_0}^{t_f} \left(\frac{\partial J}{\partial \yy}\frac{\partial \yy}{\partial \te} +\frac{\partial J}{\partial \te}\right)\diff t +\int_{t_0}^{t_f}\aat(t)\left( \blue{\frac{\partial\dot{\yy}}{\partial \te}}-\frac{\partial f}{\partial \yy}\frac{\partial \yy}{\partial \te}- \frac{\partial f}{\partial \te} \right)\diff t \end{align} IBP: \begin{align} \int_{t_0}^{t_f}\aat(t)\blue{\frac{\partial\dot{\yy}}{\partial \te}}\diff t=\aat(t)\frac{\partial{\yy}}{\partial \te}\rvert_{t_0}^{t_f}-\int_{t_0}^{t_f}\dat(t)\blue{\frac{\partial {\yy}}{\partial \te}}\diff t \end{align} \end{frame} \begin{frame}{Adjoint method (cont'd)} \begin{align} \frac{\partial L}{\partial \te}&=\int_{t_0}^{t_f} \left(\frac{\partial J}{\partial \yy}\frac{\partial \yy}{\partial \te} +\frac{\partial J}{\partial \te}\right)\diff t +\int_{t_0}^{t_f}\aat(t)\left( \blue{\frac{\partial\dot{\yy}}{\partial \te}}-\frac{\partial f}{\partial \yy}\frac{\partial \yy}{\partial \te}- \frac{\partial f}{\partial \te} \right)\diff \te\\ %% %% &=\int_{t_0}^{t_f} \left(\frac{\partial \yy}{\partial \te}\right)\alert{\left(\frac{\partial \yy}{\partial \te} -\aat\pd{f}{\yy}-\dat\right)}\diff t+\int_{t_0}^{t_f}-\aat \pd{f}{\te}+\pink{\pd{J}{\te}}\diff t +\left(\aat \pd{\yy}{\te}\right)_{t_0}^{t_f}\\ \end{align} Setting $\alert{\left(\frac{\partial J}{\partial \yy} -\aat\pd{f}{\yy}-\dat\right)}=0$, $\aat(t_f)=0$, one gets \begin{align} \frac{\partial L}{\partial \te}&=\int_{t_0}^{t_f}-\aat \pd{f}{\te}+\pink{\pd{J}{\te}}\diff t +\left(\aat \pd{\yy}{\te}\right)_{\blue{t_0}}^{t_f}\\&=\int_{t_0}^{t_f}-\aat \pd{f}{\te}+\pink{\pd{J}{\te}}\diff t +\left(\aat(t_0) \pd{\yy}{\te}(t_0)\right) \end{align} \end{frame} %% %% %% \begin{frame}{Adjoint method (cont'd)} From $J(\yy,\te)=\frac{\diff}{\diff t_f} G(\yy,t_f,\te)$ then, \begin{align} \pd{J}{\te}&=\frac{\partial }{\partial t_f}\left(\int_{t_0}^{t_f}-\aat \pd{f}{\te}+\pink{\pd{J}{\te}}\diff t +\left(\aat(t_0) \pd{\yy}{\te}(t_0)\right)\right),\\ &\frac{\partial }{\partial t_f} \left(\frac{\partial J}{\partial \yy} -\aat\pd{f}{\yy}-\dat\right)=0,\quad \frac{\partial }{\partial t_f}\aat(t_f)=0. \end{align} Setting $\blue{\frac{\partial \aat}{\partial t_f}=a^\mathsf{T}(t)},$ one then has \begin{align} \blue{\dot{a}^\mathsf{T}(t)}&\blue{={a}^\mathsf{T}(t)\pd{f}{\te}},\quad \blue{{a}^\mathsf{T}(t_f)=\pd{J}{Y}(t_f)} \quad \text{(Adjoint equations)},\\ \pd{J}{\te}&=\int_{t_0}^{t_f}\left(-\blue{a^\mathsf{T}(t)}\pd{f}{\te} +\pink{\pd{J}{\te}}\right)\diff t+ \left(\blue{a^\mathsf{T}(t_0)} \pd{\yy}{\te}(t_0)\right) \end{align} \uncover<2->{ \begin{enumerate} \item Run forward dynamic for $Y$. \item Run backward dynamic for $\blue{a^\mathsf{T}(t)}$. \item Compute $\pd{J}{\te}$. \end{enumerate} } \uncover<3->{Can be done without storing values \alert{implies} big save in memory, but solves 2 ODEs.} \end{frame} \begin{frame}{Some considerations} \begin{enumerate} \item \alert{How deep are ODENets?} left to the ODE solver, complexity in terms of NFE \item \tinto{Accuracy-cost trade-off} Evaluate forward pass at a lower accuracy/cheaper cost \item \blue{Constant Memory Cost} Due to adjoint. 
\item In practice, 2-4X more expensive to train than corresponding ResNet \end{enumerate} \includegraphics[width=1\linewidth]{figures/four_plots} \end{frame} %% %% %% \begin{frame}{Application: Density transform} \alert{Normalizing flows} Given $\yy_0\sim p_0$ and \tinto{$f_\te$} s.t $ \yy_1=\tinto{f_\te}(\yy_0),$ can we sample from $p_1$, with $Y_1\sim p_1$? \uncover<2->{ if $\tinto{f_\te}$ is invertible, then one has that $$p_1(Y_1)=p_0(\tinto{f}^{-1}(Y_1))\left \lvert \blue{\det} \ \frac{\partial \tinto{f_\te}^{-1}}{\partial \yy_0}\right \rvert \text{\alert{$\implies$}} \log p_1(\yy_1)=\log p_0(\yy_0)- \log \left \lvert \blue{\det} \ \frac{\partial \tinto{f_\te}}{\partial \yy_0}\right \rvert$$ } \uncover<3->{ Thus, if one knows \tinto{$f_\te$} and can compute \blue{$\det$}, one can evaluate the transformed density $p_1$. } \uncover<3->{ This has applications in Bayesian inference, image generation, etc. } \uncover<3->{ \textbf{Issues } \begin{enumerate} \item Needs invertible \tinto{$f_\te$}. \item $\blue{\det}$ can be, at worst $\mathcal{O}(n^3)$, $Y\in \R^n$. \end{enumerate} } \uncover<4->{ One solution is to take \tinto{$f_\te$} triangular, but this reduces expressability of the transformation} \uncover<5->{\pink{Continuous normalizing flows might help}} \end{frame} %% %% %% \begin{frame}{Change of variable formula via continuous transformation} \textbf{Idea:} Don't consider a "one shot" transformation, but a continuous one. \textbf{Theorem:} Consider a \alert{continuous-in-time} transformation of $\yy(t,\te)=\yy_\te(t)$ given by $$\frac{\diff \yy_\te}{\diff t}(t,\te)=f\left(t, \yy_\te(t,\te),\te\right)=\tinto{f_\te}\left( \yy_\te(t),t\right)$$ Then, under the assumption that $f_\te$ is uniformly Lipschitz continuous in $t$, it follows that the change in log-probability is given by: $$\frac{\partial \log (p(\yy_\te(t)))}{\partial t}=-\text{Tr}\left(\frac{\partial\tinto{f_\te}}{\partial \yy_\te }(Y_\te(t),t)\right).$$ \uncover<2->{ Notice that: \begin{enumerate} \item It involves a \pink{trace} instead of a \blue{determinant} $\implies$ cheaper. \item $f_\te$ need not be bijective; if solution is unique, then, whole transf. is bijective. \end{enumerate} } \end{frame} %% %% %% \begin{frame}{Proof} Want to show: $$\frac{\partial \log (p(\yy_\te(t)))}{\partial t}=-\text{Tr}\left(\frac{\diff f}{\diff \yy_\te (t)}\right).$$ Take $\epsilon>0$ and let $\yy_\te(t+\epsilon)=T_\epsilon(\yy_\te(t))$. 
\begin{align} \frac{\partial \log (p(\yy_\te(t)))}{\partial t}&=\lim_{\epsilon \to 0+}\frac{\log p(\yy_\te(y))-\log\lvert \det \frac{\partial T_\epsilon(\yy_\te(t))}{\partial \yy_\te}\rvert-\log(p(\yy_\te(t)))}{\epsilon}\\ &=\lim_{\epsilon \to 0+}\frac{-\log\lvert \det \frac{\partial T_\epsilon(\yy_\te(t))}{\partial \yy_\te}\rvert}{\epsilon} =-\lim_{\epsilon \to 0+} \frac{\frac{\partial}{\partial \epsilon}\lvert \det \frac{\partial T_\epsilon(\yy_\te(t))}{\partial \yy_\te}\rvert}{\lvert \det \frac{\partial T_\epsilon(\yy_\te(t))}{\partial \yy_\te}\rvert}\quad \text{ (L'H\^opital)}\\ &=-\underbrace{\left(\lim_{\epsilon \to 0+} \frac{\partial}{\partial \epsilon}\lvert \det \frac{\partial T_\epsilon(\yy_\te(t))}{\partial \yy_\te}\rvert\right)}_\text{bounded}\underbrace{\left(\lim_{\epsilon \to 0+}\lvert \det \frac{\partial T_\epsilon(\yy_\te(t))}{\partial \yy_\te}\rvert\right)}_\text{=1}\\ &=-\left(\lim_{\epsilon \to 0+} \frac{\partial}{\partial \epsilon}\lvert \det \frac{\partial T_\epsilon(\yy_\te(t))}{\partial \yy_\te}\rvert\right) \end{align} \end{frame} %% %% %% \begin{frame}{proof} Recall \alert{Jacobi's formula} for an $n\times n$ matrix A: $\frac{d}{\diff t}\det{A(t)}=\text{Tr}\left( \text{Adj}(A(t))\frac{\diff A(t)}{\diff t}\right) .$ Then, \begin{align} =&-\lim_{\epsilon \to 0+} \frac{\partial}{\partial \epsilon}\lvert \det \frac{\partial T_\epsilon(\yy_\te(t))}{\partial \yy_\te}\rvert=-\lim_{\epsilon \to 0+}\text{Tr}\left( \text{Adj}\left(\frac{\partial}{\partial \yy_\te} T_\epsilon (\yy_\te(t))\right)\frac{\partial}{\partial \epsilon}\frac{\partial}{\partial \yy_\te}T_\epsilon(\yy_\te(t))\right)\\ &=\text{Tr}\left(\underbrace{\left( -\lim_{\epsilon \to 0+} \text{Adj} \left(\frac{\partial}{\partial \yy_\te}T_\epsilon (\yy_\te(y))\right) \right)}_\text{=I} \left(-\lim_{\epsilon \to 0+} \frac{\partial }{\partial \epsilon}\frac{\partial}{\partial \yy_\te}T_\epsilon(\yy_\te(t))\right) \right)\\ &=\text{Tr}\left(-\lim_{\epsilon \to 0+} \frac{\partial }{\partial \epsilon}\frac{\partial}{\partial \yy_\te}\left(\yy_\te+\epsilon \tinto{f_\te}(\yy_\te(t),t)+\mathcal{O}(\epsilon ^2)\right)\right)\\ &=\text{Tr}\left(-\lim_{\epsilon \to 0+} \frac{\partial }{\partial \epsilon}\left(I+\frac{\partial}{\partial \yy_\te}\epsilon \tinto{f_\te}(\yy_\te(t),t)+\mathcal{O}(\epsilon ^2)\right)\right)\\ &=\text{Tr}\left(-\lim_{\epsilon \to 0+} \left(\frac{\partial}{\partial \yy_\te} \tinto{f_\te}(\yy_\te(t),t)+\mathcal{O}(\epsilon) \right)\right) =-\text{Tr}\left( \pd{\tinto{f_\te}(\yy_\te(t),t)}{Y}\right) \end{align} \end{frame} %% %% %% \begin{frame}{Example: Density Matching} Given a \tinto{target} $p$ we construct a \alert{flow} $q$, minimizing $J=\text{KL}(q\lVert p):=\int \log\left(\frac{q(\te)}{p(\te)}\right)q(\te)\diff \te$ (assuming we can evaluate both $p$ and $q$.) 
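% A minimal sketch (not from the paper) of the instantaneous change-of-variables
% formula from the previous frames and item 2 below: augment the state with log q
% and integrate d log q / dt = -Tr(df/dy), here for frozen toy dynamics
% f(y) = tanh(W y) with base density p0 = N(0, I); W is illustrative. A discrete
% flow of many small Euler steps serves as a cross-check.
%\begin{lstlisting}
%import numpy as np
%from scipy.integrate import solve_ivp
%
%rng = np.random.default_rng(1)
%d = 2
%W = 0.5 * rng.normal(size=(d, d))            # frozen toy weights
%
%def f(y):                                    # toy dynamics f_theta(y)
%    return np.tanh(W @ y)
%
%def jac(y):                                  # df/dy = diag(1 - tanh(Wy)^2) W
%    return (1.0 - np.tanh(W @ y)**2)[:, None] * W
%
%def cnf(t, s):                               # augmented state [y, log q(y(t))]
%    y = s[:d]
%    return np.append(f(y), -np.trace(jac(y)))
%
%y0 = rng.normal(size=d)                      # y0 ~ p0 = N(0, I)
%logp0 = -0.5 * (y0 @ y0 + d * np.log(2 * np.pi))
%out = solve_ivp(cnf, (0.0, 1.0), np.append(y0, logp0),
%                rtol=1e-9, atol=1e-9).y[:, -1]
%y1, logq1 = out[:d], out[-1]
%
%# cross-check against a discrete NF made of many small Euler steps,
%# each contributing log|det(I + h df/dy)| to the change of log-density
%h, y, logq = 1e-4, y0.copy(), logp0
%for _ in range(10000):
%    logq -= np.log(abs(np.linalg.det(np.eye(d) + h * jac(y))))
%    y = y + h * f(y)
%print(logq1, logq)                           # should agree closely for small h
%\end{lstlisting}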
\uncover<2->{ \begin{enumerate} \item \pink{Normalizing flow (NF)} $q(\yy_1)=\log p_0(\yy_0)- \log \left \lvert \blue{\det} \ \frac{\partial \tinto{f_\te}}{\partial \yy_0}\right \rvert$ \item \blue{Continuous normalizing flow (CNF)} $q$ solves $\frac{\partial \log (q(\yy(t)))}{\partial t}=-\text{Tr}\left(\frac{\partial f}{\partial \yy (t)}\right).$ \end{enumerate} } \only<3>{ \begin{center} \includegraphics[width=0.7\linewidth]{figures/comparisson_final} \end{center} } \only<4>{ \begin{center} \includegraphics[width=0.9\linewidth]{figures/noise_to_data} \end{center} } \end{frame} %% %% %% \begin{frame}{Other applications: Time series} \begin{center} \includegraphics[width=1\linewidth]{figures/time_model} \end{center} \end{frame} \begin{frame}{Other applications: Time series} \begin{center} \includegraphics[width=0.6\linewidth]{figures/time_dyn} \end{center} \end{frame} %% %% %% \begin{frame}{Summary and conclusions} This paper can be seen more towards from a computational perspective than the previous one. Aim is to consider the time-continuous limit of the DNN and its interpretation as an ODE. Using this, one can use \alert{black-box} ODE solving routines. \begin{enumerate} \item There is no notion of layers. Use number of function evaluations as a measure of depth. \item Can speed up in terms of accuracy/cost. \item No control during training phase (due to black-box nature). More expensive than equivalent ResNet \item Constant memory cost \item Nice applications for density transport and continuous time models. \end{enumerate} \end{frame} %% %% %% \end{document} diff --git a/Neural_ODE.tex b/Neural_ODE_jp.tex similarity index 92% copy from Neural_ODE.tex copy to Neural_ODE_jp.tex index debf20f..b30fa74 100644 --- a/Neural_ODE.tex +++ b/Neural_ODE_jp.tex @@ -1,1018 +1,956 @@ \documentclass[usenames,dvipsnames,aspectratio=169,10pt]{beamer} \usepackage{multicol} \usetheme{metropolis} \usepackage{appendixnumberbeamer} \usepackage{autonum} \usepackage{booktabs} \usepackage[scale=2]{ccicons} \usepackage{bm} \usepackage{pgfplots} \usepackage[utf8]{inputenc} \usepackage{media9} \usepackage{subcaption} \usepackage[english]{babel} \usepackage{amsmath} \usepackage{mathtools} \usepackage{amsfonts} \usepackage{amssymb} \usepackage{graphicx} \usepackage{xmpmulti} \usepackage{animate} \newcommand{\notimplies}{\;\not\!\!\!\implies} \usepackage{fontspec} % optional \pgfplotsset{compat=newest} \usepgfplotslibrary{groupplots} \usepgfplotslibrary{dateplot} \usepgfplotslibrary{dateplot} \newcommand{\inputTikZ}[2]{% \scalebox{#1}{\input{#2}} } \newcommand\blfootnote[1]{% \begingroup \renewcommand\thefootnote{}\footnote{#1}% \addtocounter{footnote}{-1}% \endgroup } \usepgfplotslibrary{groupplots,dateplot} \usetikzlibrary{patterns,shapes.arrows} \pgfplotsset{compat=newest} \pgfplotsset{compat=1.13} \usepgfplotslibrary{fillbetween} \pgfmathdeclarefunction{gauss}{2} {\pgfmathparse{1/(#2*sqrt(2*pi))*exp(-((x-#1)^2)/(2*#2^2))}} \usepackage{xspace} \newcommand{\themename}{\textbf{\textsc{metropolis}}\xspace} \definecolor{burgundy}{RGB}{255,0,90} \usepackage{algorithm} \usepackage{algpseudocode} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % % Listings % % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \usepackage{listings,bera} \definecolor{keywords}{RGB}{255,0,90} \definecolor{comments}{RGB}{60,179,113} \lstset{language=Python, keywordstyle=\color{keywords}, commentstyle=\color{comments}\emph} \lstset{escapeinside={<@}{@>}} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % % Color stuff % % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 
\usepackage{xcolor,pifont} \newcommand*\colourcheck[1]{% \expandafter\newcommand\csname #1check\endcsname{\textcolor{#1}{\ding{52}}}% } \colourcheck{blue} \colourcheck{green} \colourcheck{red} \definecolor{fore}{RGB}{249,242,215} \definecolor{back}{RGB}{51,51,51} \definecolor{title}{RGB}{255,0,90} \definecolor{mDarkBrown}{HTML}{604c38} \definecolor{mDarkTeal}{HTML}{23373b} \definecolor{mLightBrown}{HTML}{EB811B} \definecolor{mLightGreen}{HTML}{14B03D} \definecolor{aqb}{HTML}{6FEBBE} \setbeamercolor{titlelike}{fg=} \setbeamercolor{normal text}{fg=fore,bg=back} \newcommand{\pink}[1]{{\color{magenta} #1}} \newcommand{\blue}[1]{{\color{aqb} #1}} \newcommand{\tinto}[1]{{\color{burgundy} #1}} %symbol definitions %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % % Variables % % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \newcommand{\yj}{{Y}_{j+1}} \newcommand{\aat}{\tinto{{\lambda}^T}} \newcommand{\dat}{\tinto{\dot{{\lambda}}^T}} \newcommand{\ym}{{Y}_{j}} \newcommand{\kj}{{K}_j} \newcommand{\bj}{b_j} \newcommand{\yy}{{Y}} \newcommand{\te}{{\theta}} \newcommand{\R}{\mathbb{R}} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % % delimiters % % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \newcommand{\lno}{\left \Vert} \newcommand{\rno}{\right \Vert} \newcommand{\lv}{\lvert} \newcommand{\rv}{ \rvert} \newcommand{\inner}[2]{\left\langle #1,#2 \right\rangle} \newcommand{\abs}[1]{\left \vert#1 \right \vert} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % % operators % % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \newcommand{\calc}{\mathcal{C}} \newcommand{\calku}{\mathcal{K}(u)} \newcommand{\ff}{\mathcal{F}} \newcommand{\diff}{\mathsf{d}} \newcommand{\pd}[2]{\frac{\partial #1}{\partial #2}} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % % check mark % % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \title{Continuous limits of DNN: Neural networks as ODEs} %\subtitle{An overview} \date{November 13, 2020} \author{Eva Vidli\v ckov\'a and Juan Pablo Madrigal Cianci,\\ CSQI.} \institute{Foundations of deep neural networks} \titlegraphic{\hfill\includegraphics[height=.5cm]{logo.png}} \begin{document} \maketitle %\begin{frame}{Table of contents} % \setbeamertemplate{section in toc}[sections numbered] % \tableofcontents[hideallsubsections] %\end{frame} %\section{Introduction and Motivation: ResNets} \begin{frame}{ResNets} Here we briefly describe what resnets are, maybe define some notation and make the point that they look like an ODE. 
\end{frame} \begin{frame}{Outline} The rest of the talk summarizes the following two articles: \nocite{*} \bibliography{bib_intro} \bibliographystyle{plain} \end{frame} \section{Stable architectures for deep neural networks} %\section{Introduction} \begin{frame}{Classification problem} \begin{itemize} \item training data: \begin{align} & y_1,\dots, y_\tinto{s} \in \R^\tinto{n} \quad \text{ feature vectors }\\ & c_1,\dots, c_\tinto{s} \in \R^\tinto{m} \quad \text{ label vectors } \\ & (c_l)_k \text{ - likelihood of $y_l$ belonging to class $k$} \end{align} \item objective: learn data-label relation function that \alert{generalizes} well\\[20pt]\pause \item \blue{deep architectures}\\[10pt] \begin{itemize} \item[+] successful for highly nonlinear data-label relationships\\[5pt] \item[--] dimensionality, non-convexity, \textbf{instability} of forward model \end{itemize} \end{itemize} \end{frame} \begin{frame}{ResNets: Forward propagation} %\textbf{Forward propagation} \begin{gather} Y_{j+1} = Y_j + \tinto{h}\sigma(Y_j \blue{K_j} + \blue{b_j}), \quad j= 0,\dots,N-1\\[10pt] Y_j \in\R^{s\times n},\, \blue{K_j}\in\R^{n\times n},\, \blue{b_j}\in\R \end{gather} \begin{itemize} \item $Y_0 = [y_1,\dots,y_s]^\intercal$ \item $Y_1,\dots, Y_{N-1}$ - hidden layers,\; $Y_N$ - output layer \item activation function \[\sigma_{ht}(Y) = \tanh(Y),\quad \sigma_{ReLU} = \max(0,Y)\] \end{itemize} \end{frame} \begin{frame}{ResNets: Classification} %\textbf{Classification} \[ h_{hyp}(Y_N \pink{W} + e_s\pink{\mu}^\intercal),\quad \pink{W}\in \R^{n\times m}, \pink{\mu}\in\R^m \] \begin{itemize} \item hypothesis function \[ h_{hyp}(x) = \exp(x)./(1+\exp(x)) \] \end{itemize} \end{frame} \begin{frame}{Learning process} \begin{columns} \begin{column}{0.5\textwidth} \begin{center} forward prop. parameters\\[10pt] $(\blue{K_j}, \blue{b_j},\;\; j=0,\dots, N-1)$ \end{center} \end{column} \begin{column}{0.5\textwidth} \begin{center} classification parameters \\[10pt] $(\pink{W}, \pink{\mu})$ \end{center} \end{column} \end{columns} \vspace{1cm} \centerline{\alert{\textbf{Optimization problem}}} \begin{gather} \min \frac{1}{s} \alert{S}\big(h_{hypo}(Y_N \pink{W} + e_s\pink{\mu}^\intercal), C\big) + \alpha \alert{R}(\pink{W},\pink{\mu},\blue{K_j}, \blue{b_j}) \\[5pt] \text{ s.t. } Y_{j+1} = Y_j + h\sigma(Y_j \blue{K_j} + \blue{b_j}), \qquad j= 0,\dots,N-1 \end{gather} \begin{itemize} \item $C = [c_1,c_2,\dots,c_s]^\intercal\in\mathbb{R}^{s\times m}$ \item e.g. 
$\alert{S}(C_{pred},C) = \frac{1}{2}\|C_{pred} - C\|_F^2$ \end{itemize} \end{frame} \begin{frame}{Learning process} \begin{itemize} \item block coordinate descent method \item \begin{align} \frac{1}{\pink{s}} S\big(h_{hypo}(Y_N W + e_s\mu^\intercal), C\big) &= \frac{1}{\pink{s}} \sum_{i=1}^{\pink{s}} S\Big(h_{hypo}\big((Y_N)_i^\intercal W + \mu^\intercal\big), c_i^\intercal\Big)\\ &\approx \frac{1}{|\blue{\mathcal{T}}|} \sum_{i\in\blue{\mathcal{T}}} S\Big(h\big((Y_N)_i^\intercal W + \mu^\intercal\big), c_i^\intercal\Big) \end{align} \item learning data \& validation data \end{itemize} \end{frame} %\section{ODE interpretation} \begin{frame}{ResNets as discretized ODEs} \textbf{ResNet} \[ Y_{j+1} = Y_j + h\sigma(Y_j K_j + b_j), \quad j= 0,\dots,N-1 \] \textbf{Continuous ODE} \begin{align} &\dot{y}(t) = \sigma\big( K^\intercal(t)y(t) + b(t) \big), \quad t\in [0,T]\\ &y(0) = Y_0 \end{align} \end{frame} \begin{frame}{Stability of continuous ODEs} -\[ -\dot{y}(t) = f(t, y(t)) -\] -\begin{enumerate} -\item \alert{linear} with \alert{constant} coefficients:\qquad $\dot{y}(t) = Ay(t) + b$\\ - \begin{itemize} - \item asymptotically stable if $\text{Re}(\lambda_i(A)) < 0,\;\forall i$ - \item stable if $\text{Re}(\lambda_i(A)) < 0$ or $\text{Re}(\lambda_i(A)) = 0$ and geometrical multiplicity = algebraic multiplicity $\forall i$ - \item unstable otherwise - \end{itemize} -\item \alert{nonlinear} : \qquad $\dot{y}(t) = f(t, y(t))$\\ - -$\to$ variational equation -\begin{gather} -\dot{z} = J(t) z\\ -J(t) = \frac{\partial f}{\partial y}(t, y(t)) -\end{gather} -\item \alert{linear} with \alert{non-constant} coefficients:\qquad $\dot{y}(t) = A(t)y(t) + b(t)$\\ -complicated (kinematic eigenvalues) - -\end{enumerate} \end{frame} -\begin{frame}{Stability for continuous forward propagation of NN } +\begin{frame}{Stability of NN forward propagation} \textbf{Continuous problem} \[ \dot{y}(t) = \sigma\big( K(t)^\intercal y(t) + b(t) \big), \quad y(0) = y_0 \] stability conditions for ODEs inspires following conditions: \begin{enumerate} -\item $\blue{K(t)}, \blue{b(t)}$ changes sufficiently \alert{slowly} +\item $\blue{K(t)}$ changes sufficiently \alert{slowly} \item \begin{align} \max_{i=1,\dots,n} &\text{Re}\Big(\pink{\lambda_i}\big(J(t)\big)\Big) \leq 0,\quad\forall t\in [0,T]\\[10pt] \text{where } J(t) &= \nabla_y\Big(\sigma\big( K(t)^\intercal y(t) + b(t) \big)\Big)^\intercal\\ &= \text{diag}\Big(\underbrace{\sigma'\big( K(t)^\intercal y(t) + b(t) \big)}_{\geq 0}\Big) K(t)^\intercal \end{align} $ \to \max_{i=1,\dots,n} \text{Re}\Big(\pink{\lambda_i}\big(K(t)\big)\Big) \leq 0,\quad\forall t\in [0,T]. 
$ \end{enumerate} -\end{frame} - -\begin{frame}{Stability of discretized ODE and NNs} -\begin{columns} -\begin{column}{0.5\textwidth} -\center{\alert{Discrete ODEs}} -\end{column} - -\begin{column}{0.5\textwidth} -\center{\alert{Neural networks}} -\end{column} -\end{columns} -\bigskip -\begin{columns} -\begin{column}{0.5\textwidth} -\[ -\dot{y} = Ay -\] -\end{column} -\begin{column}{0.5\textwidth} -\[ -\dot{y}(t) = \sigma\big( K(t)^\intercal y(t) + b(t) \big) -\] -\end{column} -\end{columns} +\end{frame} -\begin{columns} -\begin{column}{0.5\textwidth} +\begin{frame}{Stability of NN forward propagation} +\textbf{Forward Euler method} \[ -y_{j+1} = y_j + h A y_j -\] -\end{column} -\begin{column}{0.5\textwidth} -\[ -Y_{j+1} = Y_j + h\sigma(Y_j K_j + b_j) -\] -\end{column} -\end{columns} -\bigskip -\begin{columns} -\begin{column}{0.5\textwidth} -stability condition on \tinto{h}: -\[ -| 1 + h\lambda_i(A) | \leq 1\quad \forall i +Y_{j+1} = Y_j + h\sigma(Y_j K_j + b_j), \quad j= 0,\dots,N-1 \] -\end{column} -\begin{column}{0.5\textwidth} stability condition on \tinto{h} \[ -\max_{i=1,\dots,n} |1+\tinto{h}\lambda_i(J_j)| \leq 1,\quad \forall j=0,1,\dots,N-1 +\max_{i=1,\dots,n} |1+\tinto{h}\lambda_i(J_i)| \leq 1,\quad \forall j=0,1,\dots,N-1 \] -\end{column} -\end{columns} - -\begin{columns} -\begin{column}{0.5\textwidth} -\end{column} -\begin{column}{0.5\textwidth} -\end{column} -\end{columns} - -\begin{figure} -\includegraphics[scale=0.05]{figures/Euler_stab.png} -\end{figure} - \end{frame} \begin{frame}{Example: Stability of ResNet} \begin{figure} \centering \includegraphics[width = 0.8\textwidth]{figures/ResNet_stab.png} \end{figure} \begin{align} K_{+}&= \begin{pmatrix} 2 & -2\\ 0& 2 \end{pmatrix} & K_{-} &= \begin{pmatrix} -2 & 0 \\ 2 & -1 \end{pmatrix} & K_0 &= \begin{pmatrix} 0 & -1 \\ 1 & 0 \end{pmatrix}\\ \lambda(K_+) &= 2 & \lambda(K_-) &= -2 & \lambda(K_0)&= i,-i \end{align} \begin{itemize} \item $s = 3,\, n=2,\, h = 0.1, \, b = 0,\, \sigma = \tanh,\, N = 10$ \end{itemize} \end{frame} \begin{frame}{Well-posed forward propagation} \begin{enumerate} \item $\max_i \text{Re}(\pink{\lambda_i}(K)) > 0$\\[10pt] \begin{itemize} \item neurons amplify signal with no upper bound \item unreliable generalization\\[20pt] \end{itemize} \item $\max_i \text{Re}(\pink{\lambda_i}(K)) << 0$\\[10pt] \begin{itemize} \item inverse problem highly ill-posed \item vanishing gradients problem \item lossy network\\[20pt] \end{itemize} \end{enumerate} $\implies \text{Re}(\pink{\lambda_i}(K(t))) \alert{\approx 0},\quad\forall i=1,2,\dots,n,\;\forall t\in [0,T]$ \end{frame} %\section{Stable architectures} \begin{frame}{Antisymmetric weight matrices} \[ \dot{y}(t) = \sigma \Big( \frac{1}{2}\big(\underbrace{ K(t) - K(t)^\intercal }_{ \mathclap{\text{antisymmetric $\to$ imaginary eigenvalues}} } - \alert{\gamma} I\big)y(t) + b(t)\Big),\quad t\in [0,T] \] \bigskip \begin{enumerate} \onslide<2->{\item $\alert{\gamma = 0}$} \onslide<3->{\begin{figure} \centering \includegraphics[scale = 0.25]{figures/RK_stab.png} \end{figure} } \onslide<4->{ \item $\alert{\gamma > 0}$ \quad $\to$ Forward Euler discretization $$Y_{j+1} = Y_j + \tinto{h}\sigma\Big(\frac{1}{2}Y_j (K_j - K_j^\intercal - \alert{\gamma} I) + b_j\Big)$$} \end{enumerate} \end{frame} \begin{frame}{Hamiltonian inspired NN} \vspace{-0.9cm} \[ \dot{y}(t) = -\nabla_z \blue{H}(y,z,t), \quad \dot{z}(t) = \nabla_y \blue{H}(y,z,t),\quad t\in [0,T] \] \begin{itemize} \item Hamiltonian $\blue{H}: \R^n\times \R^n\times [0,T]\to \R$ conserved\\[10pt] \item energy 
\alert{conserved}, not dissipated\\[20pt] \end{itemize} \end{frame} \begin{frame}{Hamiltonian inspired NN} Hamiltonian $\blue{H}(y,z) = \frac{1}{2}z^\intercal z - f(y)$\\[5pt] $$\dot{y}(t) = -z(t), \; \dot{z}(t) = -\nabla_y f(y(t))\quad\implies \ddot{y}(t) = \nabla_y f(y(t))$$ \pause \begin{itemize} \item $\alert{\ddot{y}(t) = \sigma\Big( K^\intercal (t) y(t) + b(t)\Big)},\; y(0) = Y_0,\; \dot{y}(0) = 0$\\[5pt]\pause \item stable for $K$ with non-positive real eigenvalues\\[5pt] \item $K(C) = -C^\intercal C,\quad C\in\R^{n\times n}$\\[5pt] \item nonlinear parametrization - complicated optimization\\[5pt] \item leapfrog discretization scheme (symplectic integrator) \end{itemize} \end{frame} \begin{frame}{Hamiltonian inspired NN} \[ \alert{\dot{y}(t) = \sigma\Big( K (t) z(t) + b(t)\Big) \qquad \dot{z}(t) = \sigma\Big( K^\intercal (t) y(t) + b(t)\Big)} \] Associated ODE: \begin{align} \frac{\partial}{\partial t} \begin{pmatrix} y\\ z \end{pmatrix}(t) &= \sigma \begin{pmatrix}\begin{pmatrix} 0 & K(t) \\ -K(t)^\intercal & 0 \end{pmatrix} \begin{pmatrix} y\\ z \end{pmatrix}(t) + b(t) \end{pmatrix}, \\ \begin{pmatrix} y\\ z \end{pmatrix}(0) &= \begin{pmatrix} y_0\\ 0 \end{pmatrix} \end{align} \pause \begin{itemize} \item antisymmetric matrix \item Verlet integration scheme (symplectic) $$ z_{j+1/2} = z_{j-1/2} - h\sigma(K_j^\intercal y_j + b_j),\quad y_{j+1} = y_j + h\sigma (K_j z_{j+1/2} + b_j)$$ +\item $K_j$ non-square \end{itemize} \end{frame} \begin{frame}{Regularization} \[ \min \frac{1}{s} {S}\big(h_{hypo}(Y_N {W} + e_s{\mu}^\intercal), C\big) + \alpha \alert{R}(\pink{W},\pink{\mu},\blue{K_j}, \blue{b_j}) \] \pause \begin{enumerate} \item Forward propagation\\[10pt] \begin{itemize} \item standard: weight decay (Tikhonov regularization) \[ R(K) = \frac{1}{2}\|K\|_F^2 \] \item $\blue{K,\,b}$ to be sufficiently smooth \[ \alert{R}(\blue{K}) = \frac{1}{2h}\sum \|K_j - K_{j-1}\|_F^2\quad \alert{R}(\blue{b}) = \frac{1}{2h}\sum \|b_j - b_{j-1}\|^2\] \end{itemize}\pause \item Classification\\[10pt] \begin{itemize} \item $h_{hypo}(y_j^\intercal w_k + \mu_k)\approx h_{hypo}\Big(\text{vol}(\Omega) \int_{\Omega} y(x)w(x)\mathrm{d}x + \mu_k \Big)$ \item $$\alert{R}(\pink{w_k}) = \frac{1}{2} \|L w_k\|^2\quad L - \text{discretized differential operator}$$ \end{itemize} \item Multi-level learning \end{enumerate} \end{frame} %\section{Numerical examples} \begin{frame}{Concentric ellipses} \begin{figure} \centering \includegraphics[width = \textwidth]{figures/Elipses.png} \end{figure} \begin{itemize} \item 1200 points: 1000 training + 200 validation \item multi-level: 4, 8, 16, \dots, 1024 layers \item T = 20, n = 2, $\alpha = 10^{-3}$, $\sigma = \tanh$ \item standard ResNet, antisymmetric ResNet, Hamiltonian - Verlet network \end{itemize} \end{frame} \begin{frame}{Convergence} \begin{figure} \centering \includegraphics[width = \textwidth]{figures/Convergence.png} \end{figure} \end{frame} \begin{frame}{Swiss roll} \begin{figure} \centering \includegraphics[width = \textwidth]{figures/Swiss_roll.png} \end{figure} \begin{itemize} \item 513 points: 257 training + 256 validation \item multi-level: 4, 8, 16, \dots, 1024 layers \item T = 20, n = 4,4,2, $\alpha = 5\cdot 10^{-3}$, $\sigma = \tanh$ \item standard ResNet, antisymmetric ResNet, Hamiltonian - Verlet network \end{itemize} \end{frame} \begin{frame}{Peaks} \begin{figure} \centering \includegraphics[width = \textwidth]{figures/Peaks.png} \end{figure} \begin{itemize} \item 5000 samples: 20\% for validation \item multi-level: 4, 8, 16, \dots, 1024 layers 
\item T = 5, n = 8,8,2, $\alpha = 5\cdot 10^{-6}$, $\sigma = \tanh$ \item standard ResNet, antisymmetric ResNet, Hamiltonian - Verlet network \end{itemize} \end{frame} \begin{frame}{MNIST} \begin{figure} \centering \includegraphics[scale = 0.8]{figures/MNIST.png} \end{figure} \end{frame} \begin{frame}{MNIST} \begin{itemize} \item 60 000 labeled images: 50 000 training, 10 000 validation, \item 28 $\times$ 28, multi-level: 4,8,16 \item T = 6, n = 4704, $\alpha = 0.005$, $3 \times 3$ convolution operators \item standard ResNet, antisymmetric ResNet, Hamiltonian - Verlet network \bigskip \end{itemize} \begin{figure} \centering \includegraphics[width = 0.9\textwidth]{figures/MNIST_table.png} \end{figure} \end{frame} \section{Neural ODEs} %% %% %% \begin{frame}[fragile]{Motivation: ResNets and Euler's method} - $$\yj=\ym+ \underbrace{h\sigma\left( \ym\kj+\bj\right)}_\text{$=f(\ym,\te_j)$}, \quad \text{Euler discretization of $\frac{\diff Y}{\diff t}=f(Y,\te(t))$}$$ + $$\yj=\ym+ h\underbrace{\sigma\left( \ym\kj+\bj\right)}_\text{$=f(\ym,\te_j)$}, \quad \text{Euler discretization of $\frac{\diff Y}{\diff t}=f(Y,\te(t))$}$$ \begin{columns} \begin{column}{0.5\linewidth} %\begin{lstlisting} %#D %-+*efines the architecture %def f(Y,t,θ): %return neural_net(z,θ[t]) % %#Defines the resnet %def resnet(Y): %for t in [1:T]: % Y=Y+f(Y,t,θ) %return Y %\end{lstlisting} \vspace{0.5cm} \texttt{ \blue{\#Defines the architecture}\\ \tinto{def} f(Y,t,θ):\\ \tinto{return} neural\_net(z,θ[t])\\ \vspace{1cm}} \texttt{ \blue{\#Defines the ResNet}\\ \tinto{def} ODE\_Net(Y0):\\ \pink{for} t in [1:T]:\\ \hspace{0.5 cm} Y=Y+f(Y,t,θ)\\ {\tinto{return} Y}} \vspace{1cm} \end{column} \begin{column}{0.5\linewidth} \begin{figure} \centering \uncover<2->{ \includegraphics[width=0.8\linewidth]{figures/eulers.png} } \end{figure} \uncover<3->{ Can we do better? } \end{column} \end{columns} \end{frame} %% \begin{frame}[fragile]{Improving on Euler's method} - $$\yj=\ym+ \underbrace{h\sigma\left( \ym\kj+\bj\right)}_\text{$=f(\ym,\te_j)$}, \quad \text{Euler discretization of $\frac{\diff Y}{\diff t}=f(Y,\te(t))$}$$ + $$\yj=\ym+ h\underbrace{f\left( \ym\kj+\bj\right)}_\text{$=f(\ym,\te_j)$}, \quad \text{Euler discretization of $\frac{\diff Y}{\diff t}=f(Y,\te(t))$}$$ \begin{columns}[t] \begin{column}{0.6\linewidth} %\begin{lstlisting} %#Defines the architecture %def f(Y,t,θ): %return neural_net([z,t],θ[t]) % %#Defines the ODE Net %def ODE_Net(Y0): %return ODE_Solver(f,Y0,\theta,t_0=0,t_f=1) %\end{lstlisting} \vspace{0.5cm} \texttt{ \blue{\#Defines the architecture}\\ \tinto{def} f(Y,t,θ):\\ \tinto{return} neural\_net(\pink{[z,t]},θ[t])\\ \vspace{0.5cm}} \texttt{ \blue{\#Defines the ODE Net}\\ \tinto{def} ODE\_Net(Y0):\\ {\tinto{return} \alert{ODE\_Solver}(f,Y0,θ,t\_0=0,t\_f=1)}} \vspace{0.5cm} Here \texttt{\alert{ODE\_Solver}} is a black-box ODE solver. \end{column} \begin{column}{0.5\linewidth} \centering \uncover<2->{ \includegraphics[width=0.7\linewidth]{figures/adaptive.png} } \end{column} \end{columns} \uncover<3->{ \begin{center} \pink{Main idea:} Continuous depth + good ODE solver. 
\end{center} } \end{frame} \begin{frame}[fragile]{Comparison} \begin{columns}[t] \begin{column}{0.5\linewidth} \textbf{ResNet:} \texttt{ \blue{\#Defines the architecture}\\ \tinto{def} f(Y,t,θ):\\ \tinto{return} neural\_net(Y,θ[t])\\ \vspace{0.25cm}} \texttt{ \blue{\#Defines the ResNet}\\ \tinto{def} ResNet(Y):\\ \pink{for} t in [1:T]:\\ \hspace{0.5 cm} Y=Y+f(Y,t,θ)\\ {\tinto{return} Y}} \end{column} \begin{column}{0.5\linewidth} \textbf{ODENet:} \texttt{ \blue{\#Defines the architecture}\\ \tinto{def} f(Y,t,θ):\\ \tinto{return} neural\_net(\pink{[Y,t]},θ[t])\\ \vspace{0.25cm}} \texttt{ \blue{\#Defines the ODENet}\\ \tinto{def} ODE\_Net(Y0):\\ {\tinto{return} \alert{ODE\_Solver}(f,Y0,θ,t\_0=0,t\_f=1)}} \end{column} \end{columns} \begin{center} \includegraphics[width=0.35\linewidth]{figures/ode_res} \end{center} \end{frame} %% %% %% %\begin{frame}{Some considerations} %\begin{center} % \includegraphics[width=0.2\linewidth]{figures/resnett}\hspace{2cm} % \includegraphics[width=0.2\linewidth]{figures/odenett} %\end{center} %\end{frame} \begin{frame}{Training the Neural Network: Adjoint Method} We aim to minimize $J:\R^p\to \R$, $$J(\yy,t_f,\te)=J\left(\yy(t_0)+\int_{t_0}^{t_f}f(\yy,t,\te)\diff t \right)=J(\text{\texttt{\alert{ODE\_Solver}}}(f,\yy(t_0),\te,t_0=0,t_f=1)).$$ +$$\frac{\partial J}{\partial \te}=?$$ + + +\textbf{Backprop:} +\begin{align} +\frac{\partial J}{\partial Y_t}=\frac{\partial J}{\partial Y_{t+1}}\frac{\partial Y_{t+1}}{\partial Y_t}, \quad \pd{J}{\te_t}=\pd{J}{Y_{t+1}}\frac{\partial Y_{t+1}}{\partial \te_t}\end{align} \textbf{Difficulties: } \begin{enumerate} \item \alert{\texttt{ODE\_Solver}} is a black box. \item There is no notion of layers, since we are in the continuous limit. \end{enumerate} -$$\frac{\partial J}{\partial \te}=?$$ + How does $\yy(t)$, and hence $J$, depend on $\te$ at each instant $t$? Don't use back-prop, but rather the \tinto{adjoint-state method} (Pontryagin et al., 1962).
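% A minimal NumPy/SciPy sketch (not from either paper) of the adjoint-state recipe
% derived on the next frames: solve the state ODE forward, solve the adjoint ODE
% backward, and accumulate dJ/dtheta along the way, for the toy dynamics
% f(y) = tanh(K y + b) and loss J = 0.5*||y(T) - y_target||^2. The names K, b,
% y_target are illustrative. Kept as a commented listing, as elsewhere in this file.
%\begin{lstlisting}
%import numpy as np
%from scipy.integrate import solve_ivp
%
%rng = np.random.default_rng(0)
%n = 3
%K, b = rng.normal(size=(n, n)), rng.normal(size=n)
%y0, y_target, T = rng.normal(size=n), rng.normal(size=n), 1.0
%
%def f(y):                                   # toy right-hand side f(y; K, b)
%    return np.tanh(K @ y + b)
%
%# 1) forward pass: integrate dy/dt = f(y) from 0 to T
%yT = solve_ivp(lambda t, y: f(y), (0.0, T), y0, rtol=1e-9, atol=1e-9).y[:, -1]
%
%def backward(t, s):                         # augmented state [y, a, dJ/dK, dJ/db]
%    y, a = s[:n], s[n:2*n]
%    sp = 1.0 - np.tanh(K @ y + b)**2        # sigma'(K y + b)
%    da = -K.T @ (sp * a)                    # da/dt = -(df/dy)^T a
%    dgK = -np.outer(sp * a, y).ravel()      # accumulates int a^T df/dK dt
%    dgb = -(sp * a)                         # accumulates int a^T df/db dt
%    return np.concatenate([f(y), da, dgK, dgb])
%
%# 2) backward pass: adjoint starts from a(T) = dJ/dy(T)
%aT = yT - y_target
%s0 = np.concatenate([yT, aT, np.zeros(n*n + n)])
%s_end = solve_ivp(backward, (T, 0.0), s0, rtol=1e-9, atol=1e-9).y[:, -1]
%grad_K, grad_b = s_end[2*n:2*n+n*n].reshape(n, n), s_end[2*n+n*n:]
%
%# 3) finite-difference check of one entry of dJ/dK
%def J(K_):
%    yT_ = solve_ivp(lambda t, y: np.tanh(K_ @ y + b), (0.0, T), y0,
%                    rtol=1e-9, atol=1e-9).y[:, -1]
%    return 0.5 * np.sum((yT_ - y_target)**2)
%Kp = K.copy(); Kp[0, 1] += 1e-6
%print(grad_K[0, 1], (J(Kp) - J(K)) / 1e-6)  # the two numbers should agree
%\end{lstlisting}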
\end{frame} + +\begin{frame}{The adjoint method} + +\end{frame} + + %% %% %% \begin{frame}{Training the Neural Network: Adjoint Method} Define first $$G(\yy,t_f,\te):=\int_{t_0}^{t_f} J(\yy,t,\te)\diff t, \quad \frac{\diff}{\diff t_f}G(\yy,t_f,\te)=J(\yy,t,\te)$$ and the Lagrangian $$L=G(\yy,t_f,\te)+\int_{t_0}^{t_f}\aat(t)\left( \dot{\yy}(t,\te)-f(\yy,t,\te) \right)\diff \te $$ Then, \begin{align} \frac{\partial L}{\partial \te}=\int_{t_0}^{t_f} \left(\frac{\partial J}{\partial \yy}\frac{\partial \yy}{\partial \te} +\frac{\partial J}{\partial \te}\right)\diff t +\int_{t_0}^{t_f}\aat(t)\left( \blue{\frac{\partial\dot{\yy}}{\partial \te}}-\frac{\partial f}{\partial \yy}\frac{\partial \yy}{\partial \te}- \frac{\partial f}{\partial \te} \right)\diff t \end{align} IBP: \begin{align} \int_{t_0}^{t_f}\aat(t)\blue{\frac{\partial\dot{\yy}}{\partial \te}}\diff t=\aat(t)\frac{\partial{\yy}}{\partial \te}\rvert_{t_0}^{t_f}-\int_{t_0}^{t_f}\dat(t)\blue{\frac{\partial {\yy}}{\partial \te}}\diff t \end{align} \end{frame} \begin{frame}{Adjoint method (cont'd)} \begin{align} \frac{\partial L}{\partial \te}&=\int_{t_0}^{t_f} \left(\frac{\partial J}{\partial \yy}\frac{\partial \yy}{\partial \te} +\frac{\partial J}{\partial \te}\right)\diff t +\int_{t_0}^{t_f}\aat(t)\left( \blue{\frac{\partial\dot{\yy}}{\partial \te}}-\frac{\partial f}{\partial \yy}\frac{\partial \yy}{\partial \te}- \frac{\partial f}{\partial \te} \right)\diff \te\\ %% %% &=\int_{t_0}^{t_f} \left(\frac{\partial \yy}{\partial \te}\right)\alert{\left(\frac{\partial \yy}{\partial \te} -\aat\pd{f}{\yy}-\dat\right)}\diff t+\int_{t_0}^{t_f}-\aat \pd{f}{\te}+\pink{\pd{J}{\te}}\diff t +\left(\aat \pd{\yy}{\te}\right)_{t_0}^{t_f}\\ \end{align} Setting $\alert{\left(\frac{\partial J}{\partial \yy} -\aat\pd{f}{\yy}-\dat\right)}=0$, $\aat(t_f)=0$, one gets \begin{align} \frac{\partial L}{\partial \te}&=\int_{t_0}^{t_f}-\aat \pd{f}{\te}+\pink{\pd{J}{\te}}\diff t +\left(\aat \pd{\yy}{\te}\right)_{\blue{t_0}}^{t_f}\\&=\int_{t_0}^{t_f}-\aat \pd{f}{\te}+\pink{\pd{J}{\te}}\diff t +\left(\aat(t_0) \pd{\yy}{\te}(t_0)\right) \end{align} \end{frame} %% %% %% \begin{frame}{Adjoint method (cont'd)} From $J(\yy,\te)=\frac{\diff}{\diff t_f} G(\yy,t_f,\te)$ then, \begin{align} \pd{J}{\te}&=\frac{\partial }{\partial t_f}\left(\int_{t_0}^{t_f}-\aat \pd{f}{\te}+\pink{\pd{J}{\te}}\diff t +\left(\aat(t_0) \pd{\yy}{\te}(t_0)\right)\right),\\ &\frac{\partial }{\partial t_f} \left(\frac{\partial J}{\partial \yy} -\aat\pd{f}{\yy}-\dat\right)=0,\quad \frac{\partial }{\partial t_f}\aat(t_f)=0. \end{align} Setting $\blue{\frac{\partial \aat}{\partial t_f}=a^\mathsf{T}(t)},$ one then has \begin{align} \blue{\dot{a}^\mathsf{T}(t)}&\blue{={a}^\mathsf{T}(t)\pd{f}{\te}},\quad \blue{{a}^\mathsf{T}(t_f)=\pd{J}{Y}(t_f)} \quad \text{(Adjoint equations)},\\ \pd{J}{\te}&=\int_{t_0}^{t_f}\left(-\blue{a^\mathsf{T}(t)}\pd{f}{\te} +\pink{\pd{J}{\te}}\right)\diff t+ \left(\blue{a^\mathsf{T}(t_0)} \pd{\yy}{\te}(t_0)\right) \end{align} \uncover<2->{ \begin{enumerate} \item Run forward dynamic for $Y$. \item Run backward dynamic for $\blue{a^\mathsf{T}(t)}$. \item Compute $\pd{J}{\te}$. \end{enumerate} } \uncover<3->{Can be done without storing values \alert{implies} big save in memory, but solves 2 ODEs.} \end{frame} \begin{frame}{Some considerations} \begin{enumerate} \item \alert{How deep are ODENets?} left to the ODE solver, complexity in terms of NFE \item \tinto{Accuracy-cost trade-off} Evaluate forward pass at a lower accuracy/cheaper cost \item \blue{Constant Memory Cost} Due to adjoint. 
\item In practice, 2-4X more expensive to train than corresponding ResNet \end{enumerate} \includegraphics[width=1\linewidth]{figures/four_plots} \end{frame} %% %% %% \begin{frame}{Application: Density transform} \alert{Normalizing flows} Given $\yy_0\sim p_0$ and \tinto{$f_\te$} s.t $ \yy_1=\tinto{f_\te}(\yy_0),$ can we sample from $p_1$, with $Y_1\sim p_1$? \uncover<2->{ if $\tinto{f_\te}$ is invertible, then one has that $$p_1(Y_1)=p_0(\tinto{f}^{-1}(Y_1))\left \lvert \blue{\det} \ \frac{\partial \tinto{f_\te}^{-1}}{\partial \yy_0}\right \rvert \text{\alert{$\implies$}} \log p_1(\yy_1)=\log p_0(\yy_0)- \log \left \lvert \blue{\det} \ \frac{\partial \tinto{f_\te}}{\partial \yy_0}\right \rvert$$ } \uncover<3->{ Thus, if one knows \tinto{$f_\te$} and can compute \blue{$\det$}, one can evaluate the transformed density $p_1$. } \uncover<3->{ This has applications in Bayesian inference, image generation, etc. } \uncover<3->{ \textbf{Issues } \begin{enumerate} \item Needs invertible \tinto{$f_\te$}. \item $\blue{\det}$ can be, at worst $\mathcal{O}(n^3)$, $Y\in \R^n$. \end{enumerate} } \uncover<4->{ One solution is to take \tinto{$f_\te$} triangular, but this reduces expressability of the transformation} -\uncover<5->{\pink{Continuous normalizing flows might help}} +\uncover<5->{\pink{Continuous normalizing as an alternative}} \end{frame} %% %% %% \begin{frame}{Change of variable formula via continuous transformation} \textbf{Idea:} Don't consider a "one shot" transformation, but a continuous one. \textbf{Theorem:} Consider a \alert{continuous-in-time} transformation of $\yy(t,\te)=\yy_\te(t)$ given by $$\frac{\diff \yy_\te}{\diff t}(t,\te)=f\left(t, \yy_\te(t,\te),\te\right)=\tinto{f_\te}\left( \yy_\te(t),t\right)$$ Then, under the assumption that $f_\te$ is uniformly Lipschitz continuous in $t$, it follows that the change in log-probability is given by: $$\frac{\partial \log (p(\yy_\te(t)))}{\partial t}=-\text{Tr}\left(\frac{\partial\tinto{f_\te}}{\partial \yy_\te }(Y_\te(t),t)\right).$$ \uncover<2->{ Notice that: \begin{enumerate} \item It involves a \pink{trace} instead of a \blue{determinant} $\implies$ cheaper. \item $f_\te$ need not be bijective; if solution is unique, then, whole transf. is bijective. \end{enumerate} } \end{frame} %% %% %% \begin{frame}{Proof} Want to show: $$\frac{\partial \log (p(\yy_\te(t)))}{\partial t}=-\text{Tr}\left(\frac{\diff f}{\diff \yy_\te (t)}\right).$$ Take $\epsilon>0$ and let $\yy_\te(t+\epsilon)=T_\epsilon(\yy_\te(t))$. 
\begin{align} \frac{\partial \log (p(\yy_\te(t)))}{\partial t}&=\lim_{\epsilon \to 0+}\frac{\log p(\yy_\te(y))-\log\lvert \det \frac{\partial T_\epsilon(\yy_\te(t))}{\partial \yy_\te}\rvert-\log(p(\yy_\te(t)))}{\epsilon}\\ &=\lim_{\epsilon \to 0+}\frac{-\log\lvert \det \frac{\partial T_\epsilon(\yy_\te(t))}{\partial \yy_\te}\rvert}{\epsilon} =-\lim_{\epsilon \to 0+} \frac{\frac{\partial}{\partial \epsilon}\lvert \det \frac{\partial T_\epsilon(\yy_\te(t))}{\partial \yy_\te}\rvert}{\lvert \det \frac{\partial T_\epsilon(\yy_\te(t))}{\partial \yy_\te}\rvert}\quad \text{ (L'H\^opital)}\\ &=-\underbrace{\left(\lim_{\epsilon \to 0+} \frac{\partial}{\partial \epsilon}\lvert \det \frac{\partial T_\epsilon(\yy_\te(t))}{\partial \yy_\te}\rvert\right)}_\text{bounded}\underbrace{\left(\lim_{\epsilon \to 0+}\lvert \det \frac{\partial T_\epsilon(\yy_\te(t))}{\partial \yy_\te}\rvert\right)}_\text{=1}\\ &=-\left(\lim_{\epsilon \to 0+} \frac{\partial}{\partial \epsilon}\lvert \det \frac{\partial T_\epsilon(\yy_\te(t))}{\partial \yy_\te}\rvert\right) \end{align} \end{frame} %% %% %% \begin{frame}{proof} Recall \alert{Jacobi's formula} for an $n\times n$ matrix A: $\frac{d}{\diff t}\det{A(t)}=\text{Tr}\left( \text{Adj}(A(t))\frac{\diff A(t)}{\diff t}\right) .$ Then, \begin{align} =&-\lim_{\epsilon \to 0+} \frac{\partial}{\partial \epsilon}\lvert \det \frac{\partial T_\epsilon(\yy_\te(t))}{\partial \yy_\te}\rvert=-\lim_{\epsilon \to 0+}\text{Tr}\left( \text{Adj}\left(\frac{\partial}{\partial \yy_\te} T_\epsilon (\yy_\te(t))\right)\frac{\partial}{\partial \epsilon}\frac{\partial}{\partial \yy_\te}T_\epsilon(\yy_\te(t))\right)\\ &=\text{Tr}\left(\underbrace{\left( -\lim_{\epsilon \to 0+} \text{Adj} \left(\frac{\partial}{\partial \yy_\te}T_\epsilon (\yy_\te(y))\right) \right)}_\text{=I} \left(-\lim_{\epsilon \to 0+} \frac{\partial }{\partial \epsilon}\frac{\partial}{\partial \yy_\te}T_\epsilon(\yy_\te(t))\right) \right)\\ &=\text{Tr}\left(-\lim_{\epsilon \to 0+} \frac{\partial }{\partial \epsilon}\frac{\partial}{\partial \yy_\te}\left(\yy_\te+\epsilon \tinto{f_\te}(\yy_\te(t),t)+\mathcal{O}(\epsilon ^2)\right)\right)\\ &=\text{Tr}\left(-\lim_{\epsilon \to 0+} \frac{\partial }{\partial \epsilon}\left(I+\frac{\partial}{\partial \yy_\te}\epsilon \tinto{f_\te}(\yy_\te(t),t)+\mathcal{O}(\epsilon ^2)\right)\right)\\ &=\text{Tr}\left(-\lim_{\epsilon \to 0+} \left(\frac{\partial}{\partial \yy_\te} \tinto{f_\te}(\yy_\te(t),t)+\mathcal{O}(\epsilon) \right)\right) =-\text{Tr}\left( \pd{\tinto{f_\te}(\yy_\te(t),t)}{Y}\right) \end{align} \end{frame} %% %% %% \begin{frame}{Example: Density Matching} Given a \tinto{target} $p$ we construct a \alert{flow} $q$, minimizing $J=\text{KL}(q\lVert p):=\int \log\left(\frac{q(\te)}{p(\te)}\right)q(\te)\diff \te$ (assuming we can evaluate both $p$ and $q$.) 
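% A minimal sketch (not from the paper) of the instantaneous change-of-variables
% formula from the previous frames and item 2 below: augment the state with log q
% and integrate d log q / dt = -Tr(df/dy), here for frozen toy dynamics
% f(y) = tanh(W y) with base density p0 = N(0, I); W is illustrative. A discrete
% flow of many small Euler steps serves as a cross-check.
%\begin{lstlisting}
%import numpy as np
%from scipy.integrate import solve_ivp
%
%rng = np.random.default_rng(1)
%d = 2
%W = 0.5 * rng.normal(size=(d, d))            # frozen toy weights
%
%def f(y):                                    # toy dynamics f_theta(y)
%    return np.tanh(W @ y)
%
%def jac(y):                                  # df/dy = diag(1 - tanh(Wy)^2) W
%    return (1.0 - np.tanh(W @ y)**2)[:, None] * W
%
%def cnf(t, s):                               # augmented state [y, log q(y(t))]
%    y = s[:d]
%    return np.append(f(y), -np.trace(jac(y)))
%
%y0 = rng.normal(size=d)                      # y0 ~ p0 = N(0, I)
%logp0 = -0.5 * (y0 @ y0 + d * np.log(2 * np.pi))
%out = solve_ivp(cnf, (0.0, 1.0), np.append(y0, logp0),
%                rtol=1e-9, atol=1e-9).y[:, -1]
%y1, logq1 = out[:d], out[-1]
%
%# cross-check against a discrete NF made of many small Euler steps,
%# each contributing log|det(I + h df/dy)| to the change of log-density
%h, y, logq = 1e-4, y0.copy(), logp0
%for _ in range(10000):
%    logq -= np.log(abs(np.linalg.det(np.eye(d) + h * jac(y))))
%    y = y + h * f(y)
%print(logq1, logq)                           # should agree closely for small h
%\end{lstlisting}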
\uncover<2->{ \begin{enumerate} \item \pink{Normalizing flow (NF)} $\log q(\yy_1)=\log p_0(\yy_0)- \log \left \lvert \blue{\det} \ \frac{\partial \tinto{f_\te}}{\partial \yy_0}\right \rvert$ \item \blue{Continuous normalizing flow (CNF)} $q$ solves $\frac{\partial \log (q(\yy(t)))}{\partial t}=-\text{Tr}\left(\frac{\partial f}{\partial \yy (t)}\right).$ \end{enumerate} } \only<3>{ \begin{center} \includegraphics[width=0.7\linewidth]{figures/comparisson_final} \end{center} } \only<4>{ \begin{center} \includegraphics[width=0.9\linewidth]{figures/noise_to_data} \end{center} } \end{frame} %% %% %% \begin{frame}{Other applications: Time series} \begin{center} \includegraphics[width=1\linewidth]{figures/time_model} \end{center} \end{frame} \begin{frame}{Other applications: Time series} \begin{center} \includegraphics[width=0.6\linewidth]{figures/time_dyn} \end{center} \end{frame} %% %% %% \begin{frame}{Summary and conclusions} This paper takes a more computational perspective than the previous one. The aim is to consider the time-continuous limit of a DNN and its interpretation as an ODE, which allows the use of \alert{black-box} ODE solvers. \begin{enumerate} \item There is no notion of layers; the number of function evaluations serves as a measure of depth. \item Allows an explicit accuracy/cost trade-off. \item Little control during the training phase (due to the black-box solver); in practice more expensive to train than an equivalent ResNet. \item Constant memory cost. \item Nice applications to density transport and continuous-time models. \end{enumerate} \end{frame} %% %% %% \end{document}