\usepackage{fontspec} % optional
% Listings
% Color stuff
\expandafter\newcommand\csname #1check\endcsname{\textcolor{#1}{\ding{52}}}%
\setbeamercolor{normal text}{fg=fore,bg=back}
% Colour helpers: each takes one argument and typesets it inside a group
% after switching the current colour with \color. The space after
% \color{...} is part of each definition.
% \pink{X}: X in magenta.
\newcommand{\pink}[1]{{\color{magenta} #1}}
% \blue{X}: X in the colour named aqb (not defined in this part of the file -- TODO confirm it is defined in the preamble).
\newcommand{\blue}[1]{{\color{aqb} #1}}
% \tinto{X}: X in the colour named burgundy (not defined in this part of the file -- TODO confirm it is defined in the preamble).
\newcommand{\tinto}[1]{{\color{burgundy} #1}}
%symbol definitions
% Variables
% delimiters
% \lno / \rno: opening and closing double bar built on \left and \right.
% Because \left requires a matching \right, they must be used as a pair
% within the same math group.
\newcommand{\lno}{\left \Vert}
\newcommand{\rno}{\right \Vert}
% \rv: a plain \rvert with no \left/\right sizing.
\newcommand{\rv}{ \rvert}
% \inner{a}{b}: a,b enclosed in \left\langle ... \right\rangle.
\newcommand{\inner}[2]{\left\langle #1,#2 \right\rangle}
% \abs{x}: x enclosed in \left\vert ... \right\vert.
\newcommand{\abs}[1]{\left \vert#1 \right \vert}
% operators
% \pd{f}{x}: fraction with \partial f in the numerator and \partial x in the denominator (math mode only).
\newcommand{\pd}[2]{\frac{\partial #1}{\partial #2}}
% check mark
\title{Continuous limits of DNN: Neural networks as ODEs}
%\subtitle{An overview}
\date{November 13, 2020}
\author{Eva Vidli\v ckov\'a and Juan Pablo Madrigal Cianci,\\
\institute{Foundations of deep neural networks}
%\begin{frame}{Table of contents}
% \setbeamertemplate{section in toc}[sections numbered]
% \tableofcontents[hideallsubsections]
%\section{Introduction and Motivation: ResNets}
Here we briefly describe what resnets are, maybe define some notation and make the point that they look like an ODE.
The rest of the talk summarizes the following two articles:
\section{Stable architectures for deep neural networks}
\begin{frame}{Classification problem}
\item training data:
& y_1,\dots, y_\tinto{s} \in \R^\tinto{n} \quad \text{ feature vectors }\\
& c_1,\dots, c_\tinto{s} \in \R^\tinto{m} \quad \text{ label vectors } \\
& (c_l)_k \text{ - likelihood of $y_l$ belonging to class $k$}
\item objective: learn data-label relation function that \alert{generalizes} well\\[20pt]\pause
\item \blue{deep architectures}\\[10pt]
\item[+] successful for highly nonlinear data-label relationships\\[5pt]
\item[--] dimensionality, non-convexity, \textbf{instability} of forward model
\begin{frame}{ResNets: Forward propagation}
%\textbf{Forward propagation}
Y_{j+1} = Y_j + \onslide<2->{\tinto{h}}\sigma(Y_j \blue{K_j} + \blue{b_j}), \quad j= 0,\dots,N-1\\[10pt]
Y_j \in\R^{s\times n},\, \blue{K_j}\in\R^{n\times n},\, \blue{b_j}\in\R
\item $Y_0 = [y_1,\dots,y_s]^\intercal$
\item $Y_1,\dots, Y_{N-1}$ - hidden layers,\; $Y_N$ - output layer
\item activation function \[\sigma_{ht}(Y) = \tanh(Y),\quad \sigma_{ReLU}(Y) = \max(0,Y)\]
\begin{frame}{ResNets: Classification}
\[ h_{hypo}(Y_N \pink{W} + e_s\pink{\mu}^\intercal),\quad \pink{W}\in \R^{n\times m}, \pink{\mu}\in\R^m \]
\item $h_{hypo}$ - hypothesis function\\
e.g. for Bernoulli variables ($C\in \{0,1\}^{s\times m}$):
\[ h_{hypo}(x) = \exp(x)./(1+\exp(x)) \]
\begin{frame}{Learning process}
forward prop. parameters\\[10pt] $(\blue{K_j}, \blue{b_j},\;\; j=0,\dots, N-1)$
classification parameters \\[10pt] $(\pink{W}, \pink{\mu})$
\centerline{\alert{\textbf{Optimization problem}}}
\min \frac{1}{s} \alert{S}\big(h_{hypo}(Y_N \pink{W} + e_s\pink{\mu}^\intercal), C\big) + \alpha \alert{R}(\pink{W},\pink{\mu},\blue{K_j}, \blue{b_j}) \\[5pt]
\text{ s.t. } Y_{j+1} = Y_j + h\sigma(Y_j \blue{K_j} + \blue{b_j}), \qquad j= 0,\dots,N-1
\item $C = [c_1,c_2,\dots,c_s]^\intercal\in\mathbb{R}^{s\times m}$
\item e.g. $\alert{S}(C_{pred},C) = \frac{1}{2}\|C_{pred} - C\|_F^2$
\begin{frame}{Learning process}
\item block coordinate descent method
\item \begin{align}
\frac{1}{\pink{s}} S\big(h_{hypo}(Y_N W + e_s\mu^\intercal), C\big) &= \frac{1}{\pink{s}} \sum_{i=1}^{\pink{s}} S\Big(h_{hypo}\big((Y_N)_i^\intercal W + \mu^\intercal\big), c_i^\intercal\Big)\\
&\approx \frac{1}{|\blue{\mathcal{T}}|} \sum_{i\in\blue{\mathcal{T}}} S\Big(h_{hypo}\big((Y_N)_i^\intercal W + \mu^\intercal\big), c_i^\intercal\Big)
\item learning data \& validation data
%\section{ODE interpretation}
\begin{frame}{ResNets as discretized ODEs}
Y_{j+1} = Y_j + h\sigma(Y_j K_j + b_j), \quad j= 0,\dots,N-1
\textbf{Continuous ODE}
&\dot{y}(t) = \sigma\big( K^\intercal(t)y(t) + b(t) \big), \quad t\in [0,T]\\
&y(0) = Y_0
\begin{frame}{Stability of continuous ODEs}
\dot{y}(t) = f(t, y(t))
\item \alert{linear} with \alert{constant} coefficients:\qquad $\dot{y}(t) = Ay(t) + b$\\
\item asymptotically stable if $\text{Re}(\lambda_i(A)) < 0,\;\forall i$
\item stable if, $\forall i$, either $\text{Re}(\lambda_i(A)) < 0$, or $\text{Re}(\lambda_i(A)) = 0$ and geometric multiplicity = algebraic multiplicity
\item unstable otherwise
\item \alert{nonlinear} : \qquad $\dot{y}(t) = f(t, y(t))$\\
$\to$ variational equation
\dot{z} = J(t) z\\
J(t) = \frac{\partial f}{\partial y}(t, y(t))
\item \alert{linear} with \alert{non-constant} coefficients:\qquad $\dot{y}(t) = A(t)y(t) + b(t)$\\
complicated (kinematic eigenvalues)
\begin{frame}{Stability for continuous forward propagation of NN }
\textbf{Continuous problem}
\dot{y}(t) = \sigma\big( K(t)^\intercal y(t) + b(t) \big), \quad y(0) = y_0
Stability conditions for ODEs inspire the following conditions:
\item $\blue{K(t)}, \blue{b(t)}$ change sufficiently \alert{slowly}
\item \begin{align}
\max_{i=1,\dots,n} &\text{Re}\Big(\pink{\lambda_i}\big(J(t)\big)\Big) \leq 0,\quad\forall t\in [0,T]\\[10pt]
\text{where } J(t) &= \nabla_y\Big(\sigma\big( K(t)^\intercal y(t) + b(t) \big)\Big)^\intercal\\
&= \text{diag}\Big(\underbrace{\sigma'\big( K(t)^\intercal y(t) + b(t) \big)}_{\geq 0}\Big) K(t)^\intercal
$ \to \max_{i=1,\dots,n} \text{Re}\Big(\pink{\lambda_i}\big(K(t)\big)\Big) \leq 0,\quad\forall t\in [0,T]$.
\begin{frame}{Stability of discretized ODE and NNs}
\center{\alert{Discrete ODEs}}
\center{\alert{Neural networks}}
\dot{y} = Ay
\dot{y}(t) = \sigma\big( K(t)^\intercal y(t) + b(t) \big)
y_{j+1} = y_j + h A y_j
Y_{j+1} = Y_j + h\sigma(Y_j K_j + b_j)
stability condition on \tinto{h}:
| 1 + h\lambda_i(A) | \leq 1\quad \forall i
stability condition on \tinto{h}
\max_{i=1,\dots,n} |1+\tinto{h}\lambda_i(J_j)| \leq 1,\quad \forall j=0,1,\dots,N-1
\begin{frame}{Example: Stability of ResNet}
\includegraphics[width = 0.8\textwidth]{figures/ResNet_stab.png}
K_{+}&= \begin{pmatrix}
2 & -2\\ 0& 2
\end{pmatrix} & K_{-} &= \begin{pmatrix}
-2 & 0 \\ 2 & -2
\end{pmatrix} & K_0 &= \begin{pmatrix}
0 & -1 \\ 1 & 0
\end{pmatrix}\\
\lambda(K_+) &= 2 & \lambda(K_-) &= -2 & \lambda(K_0)&= i,-i
\item $s = 3,\, n=2,\, h = 0.1, \, b = 0,\, \sigma = \tanh,\, N = 10$
\begin{frame}{Well-posed forward propagation}
\item $\max_i \text{Re}(\pink{\lambda_i}(K)) > 0$\\[10pt]
\item neurons amplify signal with no upper bound
\item unreliable generalization\\[20pt]
\item $\max_i \text{Re}(\pink{\lambda_i}(K)) \ll 0$\\[10pt]
\item inverse problem highly ill-posed
\item vanishing gradients problem
\item lossy network\\[20pt]
$\implies \text{Re}(\pink{\lambda_i}(K(t))) \alert{\approx 0},\quad\forall i=1,2,\dots,n,\;\forall t\in [0,T]$
%\section{Stable architectures}
\begin{frame}{Antisymmetric weight matrices}
\dot{y}(t) = \sigma \Big( \frac{1}{2}\big(\underbrace{ K(t) - K(t)^\intercal }_{ \mathclap{\text{antisymmetric $\to$ imaginary eigenvalues}} } - \alert{\gamma} I\big)y(t) + b(t)\Big),\quad t\in [0,T]
\onslide<2->{\item $\alert{\gamma = 0}$} \onslide<3->{\begin{figure}
\includegraphics[scale = 0.25]{figures/RK_stab.png}
\end{figure} }
\item $\alert{\gamma > 0}$ \quad $\to$ Forward Euler discretization
$$Y_{j+1} = Y_j + \tinto{h}\sigma\Big(\frac{1}{2}Y_j (K_j - K_j^\intercal - \alert{\gamma} I) + b_j\Big)$$}
\begin{frame}{Hamiltonian inspired NN}
\dot{y}(t) = -\nabla_z \blue{H}(y,z,t), \quad \dot{z}(t) = \nabla_y \blue{H}(y,z,t),\quad t\in [0,T]
\item Hamiltonian $\blue{H}: \R^n\times \R^n\times [0,T]\to \R$ conserved\\[10pt]
\item energy \alert{conserved}, not dissipated\\[20pt]
\begin{frame}{Hamiltonian inspired NN}
Hamiltonian $\blue{H}(y,z) = \frac{1}{2}z^\intercal z - f(y)$\\[5pt]
$$\dot{y}(t) = -z(t), \; \dot{z}(t) = -\nabla_y f(y(t))\quad\implies \ddot{y}(t) = \nabla_y f(y(t))$$
\item $\alert{\ddot{y}(t) = \sigma\Big( K^\intercal (t) y(t) + b(t)\Big)},\; y(0) = Y_0,\; \dot{y}(0) = 0$\\[5pt]\pause
\item stable for $K$ with non-positive real eigenvalues\\[5pt]
\item $K(C) = -C^\intercal C,\quad C\in\R^{n\times n}$\\[5pt]
\item nonlinear parametrization - complicated optimization\\[5pt]
\item leapfrog discretization scheme (symplectic integrator)
\begin{frame}{Hamiltonian inspired NN}
\alert{\dot{y}(t) = \sigma\Big( K (t) z(t) + b(t)\Big) \qquad \dot{z}(t) = -\sigma\Big( K^\intercal (t) y(t) + b(t)\Big)}
Associated ODE:
\frac{\partial}{\partial t} \begin{pmatrix} y \\ z
\end{pmatrix}(t) &= \sigma \begin{pmatrix}\begin{pmatrix} 0 & K(t) \\ -K(t)^\intercal & 0 \end{pmatrix} \begin{pmatrix} y \\ z
\end{pmatrix}(t) + b(t) \end{pmatrix}, \\
\begin{pmatrix} y \\ z \end{pmatrix}(0) &= \begin{pmatrix} Y_0 \\ 0 \end{pmatrix}
\item antisymmetric matrix
\item Verlet integration scheme (symplectic)
$$ z_{j+1/2} = z_{j-1/2} - h\sigma(K_j^\intercal y_j + b_j),\quad y_{j+1} = y_j + h\sigma (K_j z_{j+1/2} + b_j)$$
\min \frac{1}{s} {S}\big(h_{hypo}(Y_N {W} + e_s{\mu}^\intercal), C\big) + \alpha \alert{R}(\pink{W},\pink{\mu},\blue{K_j}, \blue{b_j})
\item Forward propagation\\[10pt]
\item standard: weight decay (Tikhonov regularization) \[ R(K) = \frac{1}{2}\|K\|_F^2 \]
\item $\blue{K,\,b}$ to be sufficiently smooth
\[ \alert{R}(\blue{K}) = \frac{1}{2h}\sum \|K_j - K_{j-1}\|_F^2\quad \alert{R}(\blue{b}) = \frac{1}{2h}\sum \|b_j - b_{j-1}\|^2\]
\item Classification\\[10pt]
\item $h_{hypo}(y_j^\intercal w_k + \mu_k)\approx h_{hypo}\Big(\text{vol}(\Omega) \int_{\Omega} y(x)w(x)\mathrm{d}x + \mu_k \Big)$
\item $$\alert{R}(\pink{w_k}) = \frac{1}{2} \|L w_k\|^2\quad L - \text{discretized differential operator}$$
\item Multi-level learning
%\section{Numerical examples}
\begin{frame}{Concentric ellipses}
\includegraphics[width = \textwidth]{figures/Elipses.png}
\item 1200 points: 1000 training + 200 validation
\item multi-level: 4, 8, 16, \dots, 1024 layers
\item T = 20, n = 2, $\alpha = 10^{-3}$, $\sigma = \tanh$
\item standard ResNet, antisymmetric ResNet, Hamiltonian - Verlet network
\includegraphics[width = \textwidth]{figures/Convergence.png}
\begin{frame}{Swiss roll}
\includegraphics[width = \textwidth]{figures/Swiss_roll.png}
\item 513 points: 257 training + 256 validation
\item multi-level: 4, 8, 16, \dots, 1024 layers
\item T = 20, n = 4,4,2, $\alpha = 5\cdot 10^{-3}$, $\sigma = \tanh$
\item standard ResNet, antisymmetric ResNet, Hamiltonian - Verlet network
\includegraphics[width = \textwidth]{figures/Peaks.png}
\item 5000 samples: 20\% for validation
\item multi-level: 4, 8, 16, \dots, 1024 layers
\item T = 5, n = 8,8,2, $\alpha = 5\cdot 10^{-6}$, $\sigma = \tanh$
\item standard ResNet, antisymmetric ResNet, Hamiltonian - Verlet network
\includegraphics[scale = 0.8]{figures/MNIST.png}
\item 60 000 labeled images: 50 000 training, 10 000 validation,
\item 28 $\times$ 28, multi-level: 4,8,16
\item T = 6, width of network: 6 (n = 4704), $\alpha = 0.005$
\item $3 \times 3$ convolution operators, fully connected
\item standard ResNet, antisymmetric ResNet, Hamiltonian - Verlet network
\includegraphics[width = 0.9\textwidth]{figures/MNIST_table.png}
\section{Neural ODEs}
\begin{frame}[fragile]{Motivation: ResNets and Euler's method}
$$\yj=\ym+ h\underbrace{\sigma\left( \ym\kj+\bj\right)}_\text{$=f(\ym,\te_j)$}, \quad \text{Euler discretization of $\frac{\diff Y}{\diff t}=f(Y,\te(t))$}$$
%#Defines the architecture
%def f(Y,t,θ):
%return neural_net(z,θ[t])
%#Defines the resnet
%def resnet(Y):
%for t in [1:T]:
% Y=Y+f(Y,t,θ)
%return Y
\blue{\#Defines the architecture}\\
\tinto{def} f(Y,t,θ):\\
\tinto{return} neural\_net(Y,θ[t])\\ \vspace{1cm}}
\blue{\#Defines the ResNet}\\
\tinto{def} ResNet(Y):\\
\pink{for} t in [1:T]:\\
\hspace{0.5 cm} Y=Y+f(Y,t,θ)\\
{\tinto{return} Y}}
Can we do better?
\begin{frame}[fragile]{Improving on Euler's method}
$$\yj=\ym+ h\underbrace{\sigma\left( \ym\kj+\bj\right)}_\text{$=f(\ym,\te_j)$}, \quad \text{Euler discretization of $\frac{\diff Y}{\diff t}=f(Y,\te(t))$}$$
%#Defines the architecture
%def f(Y,t,θ):
%return neural_net([z,t],θ[t])
%#Defines the ODE Net
%def ODE_Net(Y0):
%return ODE_Solver(f,Y0,\theta,t_0=0,t_f=1)
\blue{\#Defines the architecture}\\
\tinto{def} f(Y,t,θ):\\
\tinto{return} neural\_net(\pink{[Y,t]},θ[t])\\ \vspace{0.5cm}}
\blue{\#Defines the ODE Net}\\
\tinto{def} ODE\_Net(Y0):\\
{\tinto{return} \alert{ODE\_Solver}(f,Y0,θ,t\_0=0,t\_f=1)}}
Here \texttt{\alert{ODE\_Solver}} is a black-box ODE solver.
\pink{Main idea:} Continuous depth + good ODE solver.
\blue{\#Defines the architecture}\\
\tinto{def} f(Y,t,θ):\\
\tinto{return} neural\_net(Y,θ[t])\\ \vspace{0.25cm}}
\blue{\#Defines the ResNet}\\
\tinto{def} ResNet(Y):\\
\pink{for} t in [1:T]:\\
\hspace{0.5 cm} Y=Y+f(Y,t,θ)\\
{\tinto{return} Y}}
\blue{\#Defines the architecture}\\
\tinto{def} f(Y,t,θ):\\
\tinto{return} neural\_net(\pink{[Y,t]},θ[t])\\ \vspace{0.25cm}}
\blue{\#Defines the ODENet}\\
\tinto{def} ODE\_Net(Y0):\\
{\tinto{return} \alert{ODE\_Solver}(f,Y0,θ,t\_0=0,t\_f=1)}}
%\begin{frame}{Some considerations}
% \includegraphics[width=0.2\linewidth]{figures/resnett}\hspace{2cm}
% \includegraphics[width=0.2\linewidth]{figures/odenett}
\begin{frame}{Training the Neural Network: Adjoint Method}
We aim at minimizing $J:\R^p\to \R$, $$J(\yy(t_f,\te))=J\left(\yy(t_0)+\int_{t_0}^{t_f}f(\yy,t,\te)\diff t \right)=J(\text{\texttt{\alert{ODE\_Solver}}}(f,\yy(t_0),\te,t_0=0,t_f=1)).$$
$$\frac{\partial J}{\partial \te}=?$$
\frac{\partial J}{\partial Y_t}=\frac{\partial J}{\partial Y_{t+1}}\left(I+\frac{\partial f(Y_t,\te)}{\partial Y_t}\right), \quad \pd{J}{\te_t}=\pd{J}{Y_{t+1}}\frac{\partial f(Y_t,\te)}{\partial \te_t}\end{align}
\textbf{Difficulties: }
\item \alert{\texttt{ODE\_Solver}} is a black-box.
\item There is no notion of layers, since we are on a continuous limit.
How does $J$ depend on $\yy(t)$ at each instant $t$?
Don't use back-prop, but rather the \tinto{adjoint-state method} (Pontryagin et al. 1962.).
\begin{frame}{The adjoint method}
Define the Lagrangian $L$
L=J(Y(t_f,\te))+\int_{t_0}^{t_f} \blue{a^\mathsf{T}(t)}\alert{\left(\dot Y(t,\te)-f(t,Y(t,\te),\te)\right)}\diff t.
Clearly, since \alert{$\dot Y(t,\te)-f(t,Y(t,\te),\te)=0$}, $\pd{L}{\te}=\pd{J}{\te}$
\pd{L}{\te}=\pd{J}{Y}\pd{Y}{\te}(t_f)+\int_{t_0}^{t_f}\blue{a^\mathsf{T}(t)}\left(\pd{\dot Y}{\te}-\pd{f}{Y}\pd{Y}{\te}-\pd{f}{\te}\right)\diff t.
\int_{t_0}^{t_f}\blue{a^\mathsf{T}(t)}\pd{\dot Y}{\te}\diff t=\blue{a^\mathsf{T}(t)}\pd{Y}{\te}\left.\right \rvert_{t_0}^{t_f}-\int_{t_0}^{t_f}\blue{\dot{a}^\mathsf{T}(t)}\pd{Y}{\te}\diff t.
\pd{L}{\te}=\pd{J}{Y}\pd{Y}{\te}(t_f)-\int_{t_0}^{t_f}\left(\blue{\dot a^\mathsf{T}(t)}\pd{ Y}{\te}+\blue{a^\mathsf{T}(t)}\pd{f}{Y}\pd{Y}{\te}+\blue{a^\mathsf{T}(t)}\pd{f}{\te}\right)\diff t+ \blue{a^\mathsf{T}(t_f)}\pd{Y}{\te}(t_f)-\blue{a^\mathsf{T}(t_0)}\pd{Y}{\te}(t_0)
\begin{frame}{The adjoint method (cont'd)}
\pd{L}{\te}=\pd{J}{Y}\pd{Y}{\te}(t_f)-\int_{t_0}^{t_f}\left(\blue{\dot a^\mathsf{T}(t)}+\blue{a^\mathsf{T}(t)}\pd{f}{Y}\right)\left(\pd{ Y}{\te}\right)\diff t-\int_{t_0}^{t_f}\blue{a^\mathsf{T}(t)}\pd{f}{\te}\diff t+ \blue{a^\mathsf{T}(t_f)}\pd{Y}{\te}(t_f)-\blue{a^\mathsf{T}(t_0)}\pd{Y}{\te}(t_0)
\pink{\dot a^\mathsf{T}(t)=-a^\mathsf{T}(t)\pd{f}{Y}, \quad \quad a^\mathsf{T}(t_f)=-\pd{J}{Y}(t_f)},
and, since $\pd{Y}{\te}(t_0)=0$, then}
\pd{L}{\te}=-\int_{t_0}^{t_f}\blue{a^\mathsf{T}(t)}\pd{f}{\te}\diff t,
where $\blue{a^\mathsf{T}(t)}$ solves the \pink{adjoint equation}.
\item Run forward dynamic for $Y$.
\item Run backward dynamic for $\blue{a^\mathsf{T}(t)}$.
\item Compute $\pd{J}{\te}$.
\uncover<5->{This can be done without storing values, which \alert{implies} large memory savings, but requires solving two ODEs.}
%\begin{frame}{Training the Neural Network: Adjoint Method}
%Define first $$G(\yy,t_f,\te):=\int_{t_0}^{t_f} J(\yy,t,\te)\diff t, \quad \frac{\diff}{\diff t_f}G(\yy,t_f,\te)=J(\yy,t,\te)$$ and the Lagrangian $$L=G(\yy,t_f,\te)+\int_{t_0}^{t_f}\aat(t)\left( \dot{\yy}(t,\te)-f(\yy,t,\te) \right)\diff \te $$
%\frac{\partial L}{\partial \te}=\int_{t_0}^{t_f} \left(\frac{\partial J}{\partial \yy}\frac{\partial \yy}{\partial \te} +\frac{\partial J}{\partial \te}\right)\diff t +\int_{t_0}^{t_f}\aat(t)\left( \blue{\frac{\partial\dot{\yy}}{\partial \te}}-\frac{\partial f}{\partial \yy}\frac{\partial \yy}{\partial \te}- \frac{\partial f}{\partial \te} \right)\diff t
%\int_{t_0}^{t_f}\aat(t)\blue{\frac{\partial\dot{\yy}}{\partial \te}}\diff t=\aat(t)\frac{\partial{\yy}}{\partial \te}\rvert_{t_0}^{t_f}-\int_{t_0}^{t_f}\dat(t)\blue{\frac{\partial {\yy}}{\partial \te}}\diff t
%\begin{frame}{Adjoint method (cont'd)}
%\frac{\partial L}{\partial \te}&=\int_{t_0}^{t_f} \left(\frac{\partial J}{\partial \yy}\frac{\partial \yy}{\partial \te} +\frac{\partial J}{\partial \te}\right)\diff t +\int_{t_0}^{t_f}\aat(t)\left( \blue{\frac{\partial\dot{\yy}}{\partial \te}}-\frac{\partial f}{\partial \yy}\frac{\partial \yy}{\partial \te}- \frac{\partial f}{\partial \te} \right)\diff \te\\
%&=\int_{t_0}^{t_f} \left(\frac{\partial \yy}{\partial \te}\right)\alert{\left(\frac{\partial \yy}{\partial \te} -\aat\pd{f}{\yy}-\dat\right)}\diff t+\int_{t_0}^{t_f}-\aat \pd{f}{\te}+\pink{\pd{J}{\te}}\diff t +\left(\aat \pd{\yy}{\te}\right)_{t_0}^{t_f}\\
%Setting $\alert{\left(\frac{\partial J}{\partial \yy} -\aat\pd{f}{\yy}-\dat\right)}=0$, $\aat(t_f)=0$, one gets
%\frac{\partial L}{\partial \te}&=\int_{t_0}^{t_f}-\aat \pd{f}{\te}+\pink{\pd{J}{\te}}\diff t +\left(\aat \pd{\yy}{\te}\right)_{\blue{t_0}}^{t_f}\\&=\int_{t_0}^{t_f}-\aat \pd{f}{\te}+\pink{\pd{J}{\te}}\diff t +\left(\aat(t_0) \pd{\yy}{\te}(t_0)\right)
%\begin{frame}{Adjoint method (cont'd)}
%From $J(\yy,\te)=\frac{\diff}{\diff t_f} G(\yy,t_f,\te)$ then,
%\pd{J}{\te}&=\frac{\partial }{\partial t_f}\left(\int_{t_0}^{t_f}-\aat \pd{f}{\te}+\pink{\pd{J}{\te}}\diff t +\left(\aat(t_0) \pd{\yy}{\te}(t_0)\right)\right),\\
%&\frac{\partial }{\partial t_f} \left(\frac{\partial J}{\partial \yy} -\aat\pd{f}{\yy}-\dat\right)=0,\quad \frac{\partial }{\partial t_f}\aat(t_f)=0.
%Setting $\blue{\frac{\partial \aat}{\partial t_f}=a^\mathsf{T}(t)},$ one then has
%\blue{\dot{a}^\mathsf{T}(t)}&\blue{={a}^\mathsf{T}(t)\pd{f}{\te}},\quad \blue{{a}^\mathsf{T}(t_f)=\pd{J}{Y}(t_f)} \quad \text{(Adjoint equations)},\\
%\pd{J}{\te}&=\int_{t_0}^{t_f}\left(-\blue{a^\mathsf{T}(t)}\pd{f}{\te} +\pink{\pd{J}{\te}}\right)\diff t+ \left(\blue{a^\mathsf{T}(t_0)} \pd{\yy}{\te}(t_0)\right)
%\item Run forward dynamic for $Y$.
%\item Run backward dynamic for $\blue{a^\mathsf{T}(t)}$.
%\item Compute $\pd{J}{\te}$.
%\uncover<3->{Can be done without storing values \alert{implies} big save in memory, but solves 2 ODEs.}
\begin{frame}{Some considerations}
\item \alert{How deep are ODENets?} left to the ODE solver, complexity in terms of NFE
\item \tinto{Accuracy-cost trade-off} Evaluate forward pass at a lower accuracy/cheaper cost
\item \blue{Constant Memory Cost} Due to adjoint.
\item In practice, 2-4X more expensive to train than corresponding ResNet
\begin{frame}{Application: Density transform}
\alert{Normalizing flows}
Given $\yy_0\sim p_0$ and \tinto{$f_\te$} s.t $ \yy_1=\tinto{f_\te}(\yy_0),$ can we sample from $p_1$, with $Y_1\sim p_1$?
if $\tinto{f_\te}$ is invertible, then one has that
$$p_1(Y_1)=p_0(\tinto{f_\te}^{-1}(Y_1))\left \lvert \blue{\det} \ \frac{\partial \tinto{f_\te}^{-1}}{\partial \yy_1}\right \rvert \text{\alert{$\implies$}} \log p_1(\yy_1)=\log p_0(\yy_0)- \log \left \lvert \blue{\det} \ \frac{\partial \tinto{f_\te}}{\partial \yy_0}\right \rvert$$
\uncover<3->{ Thus, if one knows \tinto{$f_\te$} and can compute \blue{$\det$}, one can evaluate the transformed density $p_1$. }
This has applications in Bayesian inference, image generation, etc.
\uncover<3->{ \textbf{Issues }
\item Needs invertible \tinto{$f_\te$}.
\item $\blue{\det}$ can be, at worst $\mathcal{O}(n^3)$, $Y\in \R^n$.
\uncover<4->{ One solution is to take \tinto{$f_\te$} triangular, but this reduces the expressiveness of the transformation.}
\uncover<5->{\pink{Continuous normalizing flows as an alternative}}
\begin{frame}{Change of variable formula via continuous transformation}
\textbf{Idea:} Don't consider a ``one-shot'' transformation, but a continuous one.
Consider a \alert{continuous-in-time} transformation of $\yy(t,\te)=\yy_\te(t)$ given by $$\frac{\diff \yy_\te}{\diff t}(t,\te)=f\left(t, \yy_\te(t,\te),\te\right)=\tinto{f_\te}\left( \yy_\te(t),t\right)$$
Then, under the assumption that $f_\te$ is uniformly Lipschitz continuous in $\yy_\te$ and continuous in $t$, it follows that the change in log-probability is given by: $$\frac{\partial \log (p(\yy_\te(t)))}{\partial t}=-\text{Tr}\left(\frac{\partial\tinto{f_\te}}{\partial \yy_\te }(Y_\te(t),t)\right).$$
Notice that:
\item It involves a \pink{trace} instead of a \blue{determinant} $\implies$ cheaper.
\item $f_\te$ need not be bijective; if the solution is unique, then the whole transformation is bijective.
Want to show:
$$\frac{\partial \log (p(\yy_\te(t)))}{\partial t}=-\text{Tr}\left(\frac{\diff f}{\diff \yy_\te (t)}\right).$$
Take $\epsilon>0$ and let $\yy_\te(t+\epsilon)=T_\epsilon(\yy_\te(t))$.
\frac{\partial \log (p(\yy_\te(t)))}{\partial t}&=\lim_{\epsilon \to 0+}\frac{\log p(\yy_\te(t))-\log\lvert \det \frac{\partial T_\epsilon(\yy_\te(t))}{\partial \yy_\te}\rvert-\log(p(\yy_\te(t)))}{\epsilon}\\
&=\lim_{\epsilon \to 0+}\frac{-\log\lvert \det \frac{\partial T_\epsilon(\yy_\te(t))}{\partial \yy_\te}\rvert}{\epsilon}
=-\lim_{\epsilon \to 0+} \frac{\frac{\partial}{\partial \epsilon}\lvert \det \frac{\partial T_\epsilon(\yy_\te(t))}{\partial \yy_\te}\rvert}{\lvert \det \frac{\partial T_\epsilon(\yy_\te(t))}{\partial \yy_\te}\rvert}\quad \text{ (L'H\^opital)}\\
&=-\underbrace{\left(\lim_{\epsilon \to 0+} \frac{\partial}{\partial \epsilon}\lvert \det \frac{\partial T_\epsilon(\yy_\te(t))}{\partial \yy_\te}\rvert\right)}_\text{bounded}\underbrace{\left(\lim_{\epsilon \to 0+}\lvert \det \frac{\partial T_\epsilon(\yy_\te(t))}{\partial \yy_\te}\rvert\right)}_\text{=1}\\
&=-\left(\lim_{\epsilon \to 0+} \frac{\partial}{\partial \epsilon}\lvert \det \frac{\partial T_\epsilon(\yy_\te(t))}{\partial \yy_\te}\rvert\right)
Recall \alert{Jacobi's formula} for an $n\times n$ matrix $A$: $\frac{\diff}{\diff t}\det{A(t)}=\text{Tr}\left( \text{Adj}(A(t))\frac{\diff A(t)}{\diff t}\right).$
=&-\lim_{\epsilon \to 0+} \frac{\partial}{\partial \epsilon}\lvert \det \frac{\partial T_\epsilon(\yy_\te(t))}{\partial \yy_\te}\rvert=-\lim_{\epsilon \to 0+}\text{Tr}\left( \text{Adj}\left(\frac{\partial}{\partial \yy_\te} T_\epsilon (\yy_\te(t))\right)\frac{\partial}{\partial \epsilon}\frac{\partial}{\partial \yy_\te}T_\epsilon(\yy_\te(t))\right)\\
&=\text{Tr}\left(\underbrace{\left( \lim_{\epsilon \to 0+} \text{Adj} \left(\frac{\partial}{\partial \yy_\te}T_\epsilon (\yy_\te(t))\right) \right)}_\text{=I} \left(-\lim_{\epsilon \to 0+} \frac{\partial }{\partial \epsilon}\frac{\partial}{\partial \yy_\te}T_\epsilon(\yy_\te(t))\right) \right)\\
&=\text{Tr}\left(-\lim_{\epsilon \to 0+} \frac{\partial }{\partial \epsilon}\frac{\partial}{\partial \yy_\te}\left(\yy_\te+\epsilon \tinto{f_\te}(\yy_\te(t),t)+\mathcal{O}(\epsilon ^2)\right)\right)\\
&=\text{Tr}\left(-\lim_{\epsilon \to 0+} \frac{\partial }{\partial \epsilon}\left(I+\frac{\partial}{\partial \yy_\te}\epsilon \tinto{f_\te}(\yy_\te(t),t)+\mathcal{O}(\epsilon ^2)\right)\right)\\
&=\text{Tr}\left(-\lim_{\epsilon \to 0+} \left(\frac{\partial}{\partial \yy_\te} \tinto{f_\te}(\yy_\te(t),t)+\mathcal{O}(\epsilon) \right)\right)
=-\text{Tr}\left( \pd{\tinto{f_\te}(\yy_\te(t),t)}{Y}\right)
\begin{frame}{Example: Density Matching}
Given a \tinto{target} $p$ we construct a \alert{flow} $q$, minimizing $J=\text{KL}(q\lVert p):=\int \log\left(\frac{q(\te)}{p(\te)}\right)q(\te)\diff \te$ (assuming we can evaluate both $p$ and $q$.)
\item \pink{Normalizing flow (NF)} $\log q(\yy_1)=\log p_0(\yy_0)- \log \left \lvert \blue{\det} \ \frac{\partial \tinto{f_\te}}{\partial \yy_0}\right \rvert$
\item \blue{Continuous normalizing flow (CNF)} $q$ solves $\frac{\partial \log (q(\yy(t)))}{\partial t}=-\text{Tr}\left(\frac{\partial f}{\partial \yy (t)}\right).$
\begin{frame}{Other applications: Time series}
\begin{frame}{Other applications: Time series}
\begin{frame}{Summary and conclusions}
This paper takes a more computational perspective than the previous one. The aim is to consider the time-continuous limit of the DNN and its interpretation as an ODE. Using this, one can use \alert{black-box} ODE solving routines.
\item There is no notion of layers. Use number of function evaluations as a measure of depth.
\item Can speed up in terms of accuracy/cost.
\item No control during training phase (due to black-box nature). More expensive than equivalent ResNet
\item Constant memory cost
\item Nice applications for density transport and continuous time models.

