
\documentclass[12pt]{iopart}
\usepackage{xcolor}
\usepackage{graphicx}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsthm}
\usepackage{subcaption}
\usepackage{siunitx}
\usepackage{dirtytalk}
\usepackage[version=3]{mhchem} % Formula subscripts using \ce{}
\usepackage{bm}
\usepackage{xr}
\usepackage[normalem]{ulem}
\usepackage[style=numeric,citestyle=nature,sorting=none,backend=biber]{biblatex}
\addbibresource{bibliography.bib}
\newcommand*\mycommand[1]{\texttt{\emph{#1}}}
\newcommand{\mc}[1]{{\color{blue}#1}}
\newcommand\R{\mathbb{R}}
\newcommand\XA{X^{(A)}}
\newcommand\Xa{X^{(\alpha)}}
\newcommand\yA{y^{(A)}}
\newcommand\ya{y^{(\alpha)}}
\newcommand\dA[1]{\frac{\partial #1}{\partial A}}
\newcommand\yh{\hat{y}}
\newcommand\kernel[2]{\frac{1}{\sqrt{2\pi} \sigma} e^{-\frac{d(x_{#1}, x_{#2})^2}{\sigma^2}}}
\newcommand\Hmo{H^{-1}}
\newtheorem*{remark}{Remark}
\newtheorem{lemma}{Lemma}
\newtheorem{corollary}{Corollary}
\makeatletter
\newcommand*{\addFileDependency}[1]{
\typeout{(#1)}
\@addtofilelist{#1}
\IfFileExists{#1}{}{\typeout{No file #1.}}
}
\makeatother
\newcommand*{\myexternaldocument}[1]{
\externaldocument{#1}
\addFileDependency{#1.tex}
\addFileDependency{#1.aux}
}
\renewcommand{\thefootnote}{\textit{\alph{footnote}}}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% Not sure how to include SI right now with this style but will do later
\DeclareUnicodeCharacter{2212}{-}
\DeclareUnicodeCharacter{394}{$\Delta$}
% put all the external documents here!
%\myexternaldocument{SI}
%\listfiles
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{document}
\section{Setup}
First training set, used to fit the ridge regression coefficients $\alpha$:
$\Xa = \begin{pmatrix} -\quad x_1^T \quad - \\ \vdots \\ -\quad x_{n_1}^T \quad - \end{pmatrix} \in \R^{n_1 \times n}$ and $\ya = \begin{pmatrix} y_1 \\ \vdots \\ y_{n_1} \end{pmatrix}$.
Second training set, used to learn the metric $A$:
$\XA = \begin{pmatrix} -\quad x_1^T \quad - \\ \vdots \\ -\quad x_{n_2}^T \quad - \end{pmatrix} \in \R^{n_2 \times n}$ and $\yA = \begin{pmatrix} y_1 \\ \vdots \\ y_{n_2} \end{pmatrix}$.
The row vectors of the matrices $\Xa$ and $\XA$ and the components of $\ya$ and $\yA$ are distinguished by the range of their index for ease of notation:
$i$ ranges over $\{1, \cdots, n_2 \}$ (second training set), while $j, a, b, l$ range over $\{1, \cdots, n_1\}$ (first training set).
\subsection{Ridge regression}
Consider the Mahalanobis distance $d(x,y)^2 = \Vert A(x-y) \Vert_2^2$.
We define the first kernel matrix $K \in \R^{n_1 \times n_1}$ such that $K_{j,j'} = \kernel{j}{j'}$.
Set the ridge regression coefficients $\alpha \in \R^{n_1}$ by $\alpha = (K + \lambda I)^{-1} \ya$.
\begin{remark}
If $K + \lambda I$ is singular (e.g.\ for $\lambda = 0$ and a rank-deficient $K$), $\alpha$ is not uniquely determined and this particular choice might not be optimal. However, the exact derivation of the gradient below requires a fixed closed-form expression for $\alpha$, so we do not resort to an SVD-based pseudo-inverse.
\end{remark}
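For reference, and assuming the standard kernel ridge regression formulation (not stated above, but consistent with this choice of $\alpha$), this $\alpha$ is a minimizer of
\begin{equation*}
\beta \mapsto \Vert \ya - K\beta \Vert_2^2 + \lambda\, \beta^T K \beta,
\end{equation*}
since the gradient $-2K(\ya - K\beta) + 2\lambda K \beta$ vanishes at $\beta = (K + \lambda I)^{-1} \ya$ and the objective is convex for positive semi-definite $K$.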
From $\alpha$, we define $\yh_i = \sum_j \alpha_j k_{ij}$, where $k_{ij} = \kernel{i}{j}$.
\subsection{Cost function}
Define the cost function $\mathcal{L} = \sum_i (\yh_i - y_i)^2$, which depends on $A$ implicitly through $\yh$.
We aim to compute $\dA{\mathcal{L}}$, the matrix with entries $(\dA{\mathcal{L}})_{mn} = \frac{\partial \mathcal{L}}{\partial A_{mn}}$.
\section{Computing the gradient}
Differentiating the cost function term by term,
\begin{equation}
\dA{\mathcal{L}} = 2 \sum_i (\yh_i - y_i) \dA{\yh_i}.
\end{equation}
From the expression of $\yh_i$,
\begin{equation}
\dA{\yh_i} = \sum_j \dA{}(\alpha_j k_{ij}) = \sum_j \dA{k_{ij}} \alpha_j + \sum_j \dA{\alpha_j} k_{ij}.
\end{equation}
The gradient boils down to computing $\dA{k_{ij}}$ and $\dA{\alpha_j}$.
The first term is the simpler of the two, so we compute it first.
\subsection{Computing $\dA{k_{ij}}$ }\label{ssec:1}
Let $x_{ij} := x_i - x_j$, such that $d(x_i, x_j)^2 = \Vert A x_{ij} \Vert_2^2$. Then
\begin{align*}
\dA{k_{ij}} &= \dA{}(\kernel{i}{j}), \\
&= -\frac{1}{\sigma^2} k_{ij} \dA{}(d(x_i,x_j)^2), \\
&= -\frac{1}{\sigma^2} k_{ij} \dA{}(\Vert A(x_{ij}) \Vert_2^2), \\
&= -\frac{2k_{ij}}{\sigma^2} A x_{ij}x_{ij}^T,
\end{align*}
where the last equation comes from Lemma \ref{lem:1} of the appendix.
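Entrywise, this reads
\begin{equation*}
\frac{\partial k_{ij}}{\partial A_{mn}} = -\frac{2 k_{ij}}{\sigma^2} (A x_{ij})_m (x_{ij})_n,
\end{equation*}
which is the form most convenient for a finite-difference check of the matrix expression.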
\subsection{Computing $\dA{\alpha_j}$}
This computation is longer, as it requires differentiating through the matrix inverse.
Let $H := K + \lambda I$ be the matrix used to define $\alpha = \Hmo \ya$. Since $\ya$ does not depend on $A$,
\begin{equation}
\dA{\alpha_j} = \sum_l \dA{}(\Hmo)_{jl} \, y_l.
\end{equation}
Differentiating the identity $H \Hmo = I$ with respect to any entry $A_{mn}$ gives $(\Hmo)' = -\Hmo H' \Hmo = - \Hmo K' \Hmo$, where primes denote $\partial / \partial A_{mn}$ and $H' = K'$ because $\lambda I$ is constant. Writing $e_{mn}$ for the matrix of the same shape as $A$ with a one in entry $(m,n)$ and zeros elsewhere, we thus obtain
\begin{align*}
\dA{}(\Hmo)_{jl} &= \sum_{m,n} \frac{\partial}{\partial A_{mn}} (\Hmo)_{jl} \, e_{mn}, \\
&= \sum_{m,n} \Big(\frac{\partial \Hmo}{\partial A_{mn}}\Big)_{jl} e_{mn}, \\
&= -\sum_{m,n} \Big(\Hmo \frac{\partial K}{\partial A_{mn}} \Hmo\Big)_{jl} e_{mn}, \\
&= - \sum_{m,n,a,b} (\Hmo)_{ja} \frac{\partial K_{ab}}{\partial A_{mn}} (\Hmo)_{bl} e_{mn}, \\
&= - \sum_{a,b} (\Hmo)_{ja} (\Hmo)_{bl} \sum_{m,n} \frac{\partial K_{ab}}{\partial A_{mn}} e_{mn}, \\
&= - \sum_{a,b} (\Hmo)_{ja} (\Hmo)_{bl} \dA{K_{ab}}.
\end{align*}
Using the result from Section \ref{ssec:1}, we find that $\dA{K_{ab}} = -\frac{2 k_{ab}}{\sigma^2} A x_{ab} x_{ab}^T$. Note that both $a$ and $b$ range over $\{ 1, \cdots, n_1\}$, as they index the first kernel matrix $K$. Hence
\begin{equation}
\dA{}(\Hmo)_{jl} = \frac{2}{\sigma^2} A \sum_{a,b} (\Hmo)_{ja} (\Hmo)_{bl} k_{ab} x_{ab}x_{ab}^T,
\end{equation}
and
\begin{align*}
\dA{\alpha_j} &= \sum_l \dA{}(\Hmo)_{jl} \, y_l, \\
&= \frac{2}{\sigma^2} A \sum_{l,a,b} (\Hmo)_{ja} (\Hmo)_{bl} k_{ab} x_{ab}x_{ab}^T y_l, \\
&= \frac{2}{\sigma^2} A \sum_{a,b} (\Hmo)_{ja} k_{ab} x_{ab}x_{ab}^T \sum_{l} (\Hmo)_{bl} y_l, \\
&= \frac{2}{\sigma^2} A \sum_{a,b} (\Hmo)_{ja} k_{ab} (\Hmo \ya)_b x_{ab}x_{ab}^T.
\end{align*}
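Taking the $(m,n)$ entry of the last expression gives the scalar form
\begin{equation*}
\frac{\partial \alpha_j}{\partial A_{mn}} = \frac{2}{\sigma^2} \sum_{a,b} (\Hmo)_{ja} \, k_{ab} \, (\Hmo \ya)_b \, (A x_{ab})_m (x_{ab})_n,
\end{equation*}
which can again be checked entry by entry against finite differences.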
\subsection{Final result}
Combining the two contributions, we obtain the full gradient.
\begin{align*}
\dA{\mathcal{L}} &= 2 \sum_i (\yh_i - y_i) \dA{\yh_i}, \\
&= 2 \sum_{i,j} (\yh_i - y_i) [\dA{k_{ij}} \alpha_j + \dA{\alpha_j} k_{ij}], \\
&= U + V.
\end{align*}
\begin{align*}
U &= 2 \sum_{i,j} (\yh_i - y_i) \dA{k_{ij}} \alpha_j, \\
&= -\frac{4}{\sigma^2} A \sum_{i,j} (\yh_i - y_i) \alpha_j k_{ij} x_{ij}x_{ij}^T, \\
&= -\frac{4}{\sigma^2} A (-\XA^T W \Xa - \Xa^T W^T \XA + \XA^T R \XA + \Xa^T S \Xa),
\end{align*}
where $W \in \R^{n_2 \times n_1}$ has entries $W_{ij} := (\yh_i - y_i) \alpha_j k_{ij}$, and $R$, $S$ are the diagonal matrices built from $W$ as in Lemma \ref{lem:2}, with $\XA$ and $\Xa$ playing the roles of $A$ and $B$.
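As a quick dimension check (assuming $A \in \R^{r \times n}$; the shape of the metric is not fixed above), $\XA^T W \Xa$, $\Xa^T W^T \XA$, $\XA^T R \XA$ and $\Xa^T S \Xa$ all lie in $\R^{n \times n}$, so that $U$ has the same shape as $A$, as a gradient with respect to $A$ must.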
\begin{align*}
V &= 2 \sum_{i,j} (\yh_i - y_i) \dA{\alpha_j} k_{ij}, \\
&= \frac{4}{\sigma^2} A \sum_{i,j,a,b} (\yh_i - y_i) k_{ij} (\Hmo)_{ja} k_{ab} (\Hmo \ya)_b x_{ab}x_{ab}^T, \\
&= \frac{4}{\sigma^2} A \sum_{a,b} k_{ab} (\Hmo \ya)_b x_{ab}x_{ab}^T \sum_{i,j} (\yh_i - y_i) k_{ij} (\Hmo)_{ja}, \\
&= \frac{4}{\sigma^2} A \sum_{a,b} k_{ab} (\Hmo \ya)_b x_{ab}x_{ab}^T [H^{-T} Q^T (\yh - \yA)]_{a},
\end{align*}
where $Q \in \R^{n_2 \times n_1}$ is the second kernel matrix, defined by $Q_{ij}=\kernel{i}{j}$; note that $H^{-T} = \Hmo$ since $H$ is symmetric.
Setting $\tilde{W}_{ab} := k_{ab} (\Hmo \ya)_b [H^{-T} Q^T (\yh - \yA)]_{a}$, Corollary \ref{cor:1}, applied with both data matrices equal to $\Xa$, yields
\begin{equation}
V = \frac{4}{\sigma^2} A \Xa^T (-\tilde{W} - \tilde{W}^T + \tilde{R} + \tilde{S})\Xa,
\end{equation}
with $\tilde{R}$ and $\tilde{S}$ the diagonal matrices built from $\tilde{W}$ as in Lemma \ref{lem:2}.
Finally,
\begin{align*}
\dA{\mathcal{L}} = &-\frac{4}{\sigma^2} A (-\XA^T W \Xa - \Xa^T W^T \XA + \XA^T R \XA + \Xa^T S \Xa) \\
&+ \frac{4}{\sigma^2} A \Xa^T (-\tilde{W} - \tilde{W}^T + \tilde{R} + \tilde{S})\Xa.
\end{align*}
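As a practical sanity check, this closed-form gradient can be compared against a finite-difference approximation of $\mathcal{L}$. The following is a minimal NumPy sketch of such a check (the function and variable names are illustrative and not part of the derivation; a metric $A \in \R^{r \times n}$ and the kernel defined above are assumed):
\begin{verbatim}
import numpy as np

def kernel_mat(A, X1, X2, sigma):
    # k(x, x') = exp(-||A(x - x')||^2 / sigma^2) / (sqrt(2*pi) * sigma)
    diff = X1[:, None, :] - X2[None, :, :]      # pairwise differences, (m1, m2, n)
    Ad = diff @ A.T                             # A (x - x') for every pair
    d2 = np.sum(Ad ** 2, axis=-1)               # squared Mahalanobis distances
    return np.exp(-d2 / sigma ** 2) / (np.sqrt(2 * np.pi) * sigma)

def loss(A, X_alpha, y_alpha, X_A, y_A, sigma, lam):
    # L(A) = sum_i (yhat_i - y_i)^2, with alpha = (K + lam I)^{-1} y_alpha
    K = kernel_mat(A, X_alpha, X_alpha, sigma)  # first kernel matrix (n1 x n1)
    alpha = np.linalg.solve(K + lam * np.eye(len(y_alpha)), y_alpha)
    Q = kernel_mat(A, X_A, X_alpha, sigma)      # second kernel matrix (n2 x n1)
    yhat = Q @ alpha
    return np.sum((yhat - y_A) ** 2)

def grad_loss_fd(A, *args, eps=1e-6):
    # central finite differences, entry by entry, to compare with dL/dA above
    G = np.zeros_like(A)
    for m in range(A.shape[0]):
        for n in range(A.shape[1]):
            Ap = A.copy(); Ap[m, n] += eps
            Am = A.copy(); Am[m, n] -= eps
            G[m, n] = (loss(Ap, *args) - loss(Am, *args)) / (2 * eps)
    return G
\end{verbatim}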
\section{Appendix}
\begin{lemma}\label{lem:1}
\begin{equation}
\dA{}(\Vert Az \Vert_2^2) = 2 A z z^T.
\end{equation}
\end{lemma}
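\begin{proof}
Component-wise, $\Vert Az \Vert_2^2 = \sum_{m'} \big( \sum_k A_{m'k} z_k \big)^2$, so
\begin{equation*}
\frac{\partial}{\partial A_{mn}} \Vert Az \Vert_2^2 = 2 \Big( \sum_k A_{mk} z_k \Big) z_n = 2 (Az)_m z_n = 2 (A z z^T)_{mn}.
\end{equation*}
\end{proof}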
\begin{lemma}\label{lem:2}
Consider two matrices $A$ and $B$ whose rows $a_i^T$ and $b_j^T$ lie in $\R^n$, as for $\XA$ and $\Xa$ of the introduction (in this lemma $A$ denotes a generic data matrix, not the metric), and let $x_{ij} = a_i - b_j$ ($i$-th row of $A$ minus $j$-th row of $B$). Define further the matrix
\begin{equation}
\Sigma = \sum_{i,j} W_{ij} x_{ij}x_{ij}^T,
\end{equation}
with some coefficients $W_{ij}$ making up a matrix $W$. Then
\begin{equation}
\Sigma = -A^T W B - B^T W^T A + A^T R A + B^T S B,
\end{equation}
where $R$ and $S$ are both diagonal matrices with $R_{ii} = \sum_j W_{ij}$, and $S_{jj} = \sum_i W_{ij}$.
\end{lemma}
\begin{proof}
Expand $x_{ij}x_{ij}^T = a_i a_i^T - a_i b_j^T - b_j a_i^T + b_j b_j^T$ and sum each term separately:
$\sum_{i,j} W_{ij} \, a_i a_i^T = \sum_i \big( \sum_j W_{ij} \big) a_i a_i^T = A^T R A$,
$\sum_{i,j} W_{ij} \, a_i b_j^T = A^T W B$,
$\sum_{i,j} W_{ij} \, b_j a_i^T = B^T W^T A$, and
$\sum_{i,j} W_{ij} \, b_j b_j^T = \sum_j \big( \sum_i W_{ij} \big) b_j b_j^T = B^T S B$.
\end{proof}
\begin{corollary}\label{cor:1}
In the case that $A=B$, we get that
\begin{equation}
\Sigma = A^T (-W-W^T+R+S) A.
\end{equation}
\end{corollary}
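\begin{proof}
Set $B = A$ in Lemma \ref{lem:2} and factor out $A^T$ on the left and $A$ on the right: $\Sigma = -A^T W A - A^T W^T A + A^T R A + A^T S A = A^T(-W - W^T + R + S)A$.
\end{proof}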
\clearpage
\printbibliography
\clearpage
\section*{Supporting Information}
\end{document}