\begin{frame}{Training the Neural Network: Adjoint Method}
We aim at minimizing $J:\R^p\to\R$, viewed as a function of the parameters $\te\in\R^p$, $$J(\yy(t_f,\te))=J\left(\yy(t_0)+\int_{t_0}^{t_f}f(\yy,t,\te)\diff t \right)=J(\text{\texttt{\alert{ODE\_Solver}}}(f,\yy(t_0),\te,t_0=0,t_f=1)).$$
\begin{enumerate}
\item There is no notion of layers, since we are in the continuous limit.
\end{enumerate}
How does $J$ depend on $\yy(t)$ at each instant $t$?
Don't back-propagate through the solver, but rather use the \tinto{adjoint-state method} (Pontryagin et al., 1962).
\end{frame}
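\begin{frame}[fragile]{Forward pass: a minimal sketch}
A minimal Python sketch of the forward pass (illustrative only, not the paper's code): \texttt{scipy.integrate.solve\_ivp} stands in for the black-box \texttt{ODE\_Solver}, and \texttt{f} and \texttt{J} below are toy placeholders.
\begin{verbatim}
import numpy as np
from scipy.integrate import solve_ivp

def f(t, y, theta):
    # toy parametrized vector field (a neural network in practice)
    W = theta.reshape(len(y), len(y))
    return np.tanh(W @ y)

def J(y_tf, target):
    # toy scalar loss on the final state
    return 0.5 * np.sum((y_tf - target) ** 2)

y0, target = np.array([1.0, -1.0]), np.array([0.0, 1.0])
theta = 0.1 * np.ones(4)

# y(t_f) = ODE_Solver(f, y(t_0), theta, t_0=0, t_f=1)
sol = solve_ivp(f, (0.0, 1.0), y0, args=(theta,), rtol=1e-8, atol=1e-8)
loss = J(sol.y[:, -1], target)
\end{verbatim}
\end{frame}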
\begin{frame}{The adjoint method}
Define the Lagrangian $L$
\begin{align}
L=J(Y(t_f,\te))+\int_{t_0}^{t_f}\blue{a^\mathsf{T}(t)}\alert{\left(\dot Y(t,\te)-f(t,Y(t,\te),\te)\right)}\diff t.
\end{align}
Clearly, since \alert{$\dot Y(t,\te)-f(t,Y(t,\te),\te)=0$} along trajectories, $\pd{L}{\te}=\pd{J}{\te}$.
\uncover<2->{
\begin{align}
\pd{L}{\te}=\pd{J}{Y}\pd{Y}{\te}(t_f)+\int_{t_0}^{t_f}\blue{a^\mathsf{T}(t)}\left(\pd{\dot Y}{\te}-\pd{f}{Y}\pd{Y}{\te}-\pd{f}{\te}\right)\diff t.
\end{align}
}
\uncover<4->{Integrating by parts (IBP),
\begin{align}
\int_{t_0}^{t_f}\blue{a^\mathsf{T}(t)}\pd{\dot Y}{\te}\diff t=\blue{a^\mathsf{T}(t)}\pd{Y}{\te}\left.\right\rvert_{t_0}^{t_f}-\int_{t_0}^{t_f}\blue{\dot{a}^\mathsf{T}(t)}\pd{Y}{\te}\diff t.
Choosing $\blue{a(t)}$ so that the $\pd{Y}{\te}$ terms cancel (note $\pd{Y}{\te}(t_0)=0$), i.e.
\begin{align}
\blue{\dot{a}^\mathsf{T}(t)}=-\blue{a^\mathsf{T}(t)}\pd{f}{Y},\qquad \blue{a^\mathsf{T}(t_f)}=-\pd{J}{Y}(t_f),
\end{align}
we are left with
\begin{align}
\pd{J}{\te}=\pd{L}{\te}=-\int_{t_0}^{t_f}\blue{a^\mathsf{T}(t)}\pd{f}{\te}\diff t.
\end{align}
}
\uncover<5->{In practice, $\pd{J}{Y}(t)=-\blue{a^\mathsf{T}(t)}$, the state and the accumulated gradient are recovered with a single \blue{backward-in-time} solve of an augmented system:
$$\left[ Y(t_0),\pd{J}{Y}(t_0),\pd{J}{\te}\right]=\text{\alert{\texttt{ODE\_Solver}}}\left(S_0,\text{\texttt{aug\_dynamics}},t_f,t_0,\te\right),\qquad S_0=\left[Y(t_f),\pd{J}{Y}(t_f),0\right].$$
}
\end{frame}
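\begin{frame}[fragile]{Adjoint backward pass: a minimal sketch}
A hand-rolled Python sketch of the backward-in-time augmented solve (illustrative, not the authors' implementation): \texttt{solve\_ivp} stands in for \texttt{ODE\_Solver}, and the toy field \texttt{f} and quadratic loss from the previous sketch are assumptions.
\begin{verbatim}
import numpy as np
from scipy.integrate import solve_ivp

def f(t, y, theta):                      # same toy field as before
    W = theta.reshape(len(y), len(y))
    return np.tanh(W @ y)

def aug_dynamics(t, s, theta):
    # s = [Y (2), dJ/dY (2), dJ/dtheta (4)], integrated backwards in time
    y, dJdy = s[:2], s[2:4]
    W = theta.reshape(2, 2)
    u = np.tanh(W @ y)
    g = dJdy * (1.0 - u**2)              # (dJ/dY) * tanh'
    return np.concatenate([u,                          # dY/dt = f
                           -(g @ W),                   # -(dJ/dY) df/dY
                           -np.outer(g, y).ravel()])   # -(dJ/dY) df/dtheta

y0, target, theta = np.array([1., -1.]), np.array([0., 1.]), 0.1 * np.ones(4)
y_tf = solve_ivp(f, (0., 1.), y0, args=(theta,), rtol=1e-10).y[:, -1]
s0 = np.concatenate([y_tf, y_tf - target, np.zeros(4)])  # [Y(t_f), dJ/dY(t_f), 0]
bwd = solve_ivp(aug_dynamics, (1., 0.), s0, args=(theta,), rtol=1e-10)
dJdtheta = bwd.y[4:, -1]                 # gradient w.r.t. theta at t_0
\end{verbatim}
\end{frame}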
%%
%%
%%%
%\begin{frame}{Training the Neural Network: Adjoint Method}
%Define first $$G(\yy,t_f,\te):=\int_{t_0}^{t_f} J(\yy,t,\te)\diff t, \quad \frac{\diff}{\diff t_f}G(\yy,t_f,\te)=J(\yy,t,\te)$$ and the Lagrangian $$L=G(\yy,t_f,\te)+\int_{t_0}^{t_f}\aat(t)\left( \dot{\yy}(t,\te)-f(\yy,t,\te) \right)\diff \te $$
\begin{frame}{Normalizing flows: change of variables}
If $\yy_1=\tinto{f_\te}(\yy_0)$ with $\yy_0\sim p_0$ and \tinto{$f_\te$} invertible, the transformed density is
$$p_1(\yy_1)=p_0(\yy_0)\left|\blue{\det}\pd{\tinto{f_\te}}{\yy_0}\right|^{-1}.$$
\uncover<3->{ Thus, if one knows \tinto{$f_\te$} and can compute \blue{$\det$}, one can evaluate the transformed density $p_1$. }
\uncover<3->{
This has applications in Bayesian inference, image generation, etc.
}
\uncover<3->{\textbf{Issues }
\begin{enumerate}
\item Needs invertible \tinto{$f_\te$}.
\item Computing $\blue{\det}$ costs, in the worst case, $\mathcal{O}(n^3)$ for $Y\in\R^n$.
\end{enumerate}
}
\uncover<4->{ One solution is to restrict \tinto{$f_\te$} to have a triangular Jacobian, but this reduces the expressiveness of the transformation.}
\uncover<5->{\pink{Continuous normalizing flows as an alternative}}
\end{frame}
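\begin{frame}[fragile]{Change of variables: a minimal sketch}
A small Python illustration (an assumption of this writeup, not from the slides): evaluating the density of a standard Gaussian pushed through an invertible affine map, paying the $\mathcal{O}(n^3)$ determinant.
\begin{verbatim}
import numpy as np
from scipy.stats import multivariate_normal

n = 3
rng = np.random.default_rng(0)
W = rng.normal(size=(n, n)) + 3.0 * np.eye(n)    # almost surely invertible
b = rng.normal(size=n)
p0 = multivariate_normal(np.zeros(n), np.eye(n))

def log_p1(y1):
    # log p1(y1) = log p0(f^{-1}(y1)) - log|det df/dy0|, with f(y0) = W y0 + b
    y0 = np.linalg.solve(W, y1 - b)
    _, logabsdet = np.linalg.slogdet(W)          # O(n^3) in general
    return p0.logpdf(y0) - logabsdet

print(log_p1(np.ones(n)))
\end{verbatim}
\end{frame}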
%%
%%
%%
\begin{frame}{Change of variable formula via continuous transformation}
\textbf{Idea:} Don't consider a ``one shot'' transformation, but a continuous one.
\textbf{Theorem:}
Consider a \alert{continuous-in-time} transformation $\yy_\te(t)=\yy(t,\te)$ given by $$\frac{\diff\yy_\te}{\diff t}(t)=f\left(t, \yy_\te(t),\te\right)=\tinto{f_\te}\left(\yy_\te(t),t\right).$$
Then, under the assumption that $f_\te$ is uniformly Lipschitz continuous in $\yy$ and continuous in $t$, the change in log-probability is given by: $$\frac{\partial\log(p(\yy_\te(t)))}{\partial t}=-\text{Tr}\left(\frac{\partial\tinto{f_\te}}{\partial\yy_\te }(\yy_\te(t),t)\right).$$
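For instance (illustrative example, not from the slides): for a constant linear field $\tinto{f_\te}(\yy)=A\yy$ with $A\in\R^{n\times n}$, the formula gives $\frac{\partial\log p(\yy_\te(t))}{\partial t}=-\operatorname{Tr}(A)$, so $\log p(\yy_\te(t))=\log p(\yy_\te(0))-t\,\operatorname{Tr}(A)$: a single trace instead of a determinant.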
\uncover<2->{
Notice that:
\begin{enumerate}
\item It involves a \pink{trace} instead of a \blue{determinant} $\implies$ cheaper.
\item $f_\te$ need not be bijective; if the ODE solution is unique, then the whole transformation is bijective.
\end{enumerate}
}
\end{frame}
%%
\begin{frame}{Neural ODEs: discussion}
This paper can be seen more from a computational perspective than the previous one. The aim is to consider the time-continuous limit of a DNN and its interpretation as an ODE. Using this, one can use \alert{black-box} ODE-solving routines.
\begin{enumerate}
\item There is no notion of layers; use the number of function evaluations as a measure of depth.
\item Adaptive solvers allow trading accuracy against computational cost.
\item No control during the training phase (due to the black-box nature); more expensive than an equivalent ResNet.
\item Constant memory cost.
\item Nice applications to density transport and continuous-time models.