% --- web-page scraping residue (hosting-site chrome, not part of the LaTeX source) ---
% Page MenuHomec4science
% No OneTemporary
% File Metadata
% Wed, Aug 10, 23:23


\usepackage[version=3]{mhchem} % Formula subscripts using \ce{}
\newcommand\dA[1]{\frac{\partial #1}{\partial A}}
\newcommand\kernel[2]{\frac{1}{\sqrt{2\pi} \sigma} e^{-\frac{d(x_{#1}, x_{#2})^2}{\sigma^2}}}
% NOTE(review): `#1` is only legal inside a macro definition; wrapping in a helper
% so the file compiles -- TODO confirm the originally intended macro name.
\newcommand\warnifmissing[1]{\IfFileExists{#1}{}{\typeout{No file #1.}}}
%%% Not sure how to include SI right now with this style but will do later
% put all the external documents here!
First training set for A:
$\Xa = \begin{pmatrix} -\quad x_1^T \quad - \\ \vdots \\ -\quad x_{n_1}^T \quad - \end{pmatrix} \in \R^{n_1 \times n}$ and $\ya = \begin{pmatrix} y_1 \\ \vdots \\ y_{n_1} \end{pmatrix}$.
Second training set for $\alpha$:
$\XA = \begin{pmatrix} -\quad x_1^T \quad - \\ \vdots \\ -\quad x_{n_2}^T \quad - \end{pmatrix} \in \R^{n_2 \times n}$ and $\yA = \begin{pmatrix} y_1 \\ \vdots \\ y_{n_2} \end{pmatrix}$.
The row vectors of the matrices $\XA$ and $\Xa$ and the components of $\yA$ and $\ya$ are distinguished by the range of their indices for ease of notation.
$i$ will range over $\{1, \cdots, n_2 \}$ and $j, a, b$ over $\{1, \cdots, n_1\}$ which makes no sense but I don't want to change it right now.
\subsection{Ridge regression}
Consider the Mahalanobis distance $d(x,y)^2 = \Vert A(x-y) \Vert_2^2$.
We define the first kernel matrix $K \in \R^{n_1 \times n_1}$ such that $K_{j,j'} = \kernel{j}{j'}$.
Set the ridge regression coefficients $\alpha \in \R^{n_1}$ by $\alpha = (K + \lambda I)^{-1} \ya$.
If $K$ is singular, there are many choices of $\alpha$ and this one might not be optimal. However the exact derivation of the gradient needs a fixed formula and I am scared of SVD.
From $\alpha$, we define $\yh_i = \sum_j \alpha_j k_{ij}$, where $k_{ij} = \kernel{i}{j}$.
\subsection{Cost function}
Let the cost function $\mathcal{L} = \sum_i (\yh_i - y_i)^2$ which implicitly depends on $A$.
We aim to compute $\dA{\mathcal{L}}$ which is a matrix verifying $(\dA{\mathcal{L}})_{ij} = \frac{\partial \mathcal{L}}{\partial A_{ij}}$.
\section{Computing the gradient}
\[
\dA{\mathcal{L}} = 2 \sum_i (\yh_i - y_i) \dA{\yh_i}.
\]
From the expression of $\yh_i$,
\[
\dA{\yh_i} = \sum_j \dA{}(\alpha_j k_{ij}) = \sum_j \dA{k_{ij}} \alpha_j + \sum_j \dA{\alpha_j} k_{ij}.
\]
The gradient boils down to computing $\dA{k_{ij}}$ and $\dA{\alpha_j}$.
The first part is easy to compute so I will do it first because by now I know it by heart.
\subsection{Computing $\dA{k_{ij}}$ }\label{ssec:1}
Let $x_{ij} := x_i - x_j$, such that $d(x_i, x_j)^2 = \Vert A x_{ij} \Vert_2^2$. Then
\begin{align*}
\dA{k_{ij}} &= \dA{}(\kernel{i}{j}), \\
&= -\frac{1}{\sigma^2} k_{ij} \dA{}(d(x_i,x_j)^2), \\
&= -\frac{1}{\sigma^2} k_{ij} \dA{}(\Vert A x_{ij} \Vert_2^2), \\
&= -\frac{2k_{ij}}{\sigma^2} A x_{ij}x_{ij}^T,
\end{align*}
where the last equation comes from Lemma \ref{lem:1} of the appendix.
\subsection{Computing $\dA{\alpha_j}$}
Brace yourself, for this journey is long.
Let $H = (K + \lambda I)$ be the matrix used to define $\alpha = \Hmo \ya$. Clearly,
\[
\dA{\alpha_j} = \sum_i \dA{}(\Hmo)_{ji} y_i.
\]
Differentiating $H \Hmo = I$, we get $(\Hmo)' = -\Hmo H' \Hmo = - \Hmo K' \Hmo$, and thus
\begin{align*}
\dA{}(\Hmo)_{ji} &= \sum_{m,n} \frac{\partial}{\partial A_{mn}} (\Hmo)_{ji} e_{mn}, \\
&= \sum_{m,n} \Bigl(\frac{\partial}{\partial A_{mn}} \Hmo\Bigr)_{ji} e_{mn}, \\
&= -\sum_{m,n} \Bigl(\Hmo \frac{\partial K}{\partial A_{mn}} \Hmo\Bigr)_{ji} e_{mn}, \\
&= - \sum_{m,n,a,b} (\Hmo)_{ja} \frac{\partial K_{ab}}{\partial A_{mn}} (\Hmo)_{bi} e_{mn}, \\
&= - \sum_{a,b} (\Hmo)_{ja} (\Hmo)_{bi} \sum_{m,n} \frac{\partial K_{ab}}{\partial A_{mn}} e_{mn}, \\
&= - \sum_{a,b} (\Hmo)_{ja} (\Hmo)_{bi} \dA{K_{ab}}.
\end{align*}
Using the result from section \ref{ssec:1}, we find that $\dA{K_{ab}} = -\frac{2 k_{ab}}{\sigma^2} A x_{ab} x_{ab}^T$. Note that both $a$ and $b$ range over $\{ 1, \cdots, n_1\}$, as they index the first kernel $K$. Hence
\[
\dA{}(\Hmo)_{ji} = \frac{2}{\sigma^2} A \sum_{a,b} (\Hmo)_{ja} (\Hmo)_{bi} k_{ab} x_{ab}x_{ab}^T,
\]
\begin{align*}
\dA{\alpha_j} &= \sum_i \dA{}(\Hmo)_{ji} y_i, \\
&= \frac{2}{\sigma^2} A \sum_{i,a,b} (\Hmo)_{ja} (\Hmo)_{bi} k_{ab} x_{ab}x_{ab}^T y_i, \\
&= \frac{2}{\sigma^2} A \sum_{a,b} (\Hmo)_{ja} k_{ab} x_{ab}x_{ab}^T \sum_{i} (\Hmo)_{bi} y_i, \\
&= \frac{2}{\sigma^2} A \sum_{a,b} (\Hmo)_{ja} k_{ab} (\Hmo \ya)_b x_{ab}x_{ab}^T.
\end{align*}
\subsection{Final result}
Combining everything we get hopefully the right answer.
\begin{align*}
\dA{\mathcal{L}} &= 2 \sum_i (\yh_i - y_i) \dA{\yh_i}, \\
&= 2 \sum_{i,j} (\yh_i - y_i) [\dA{k_{ij}} \alpha_j + \dA{\alpha_j} k_{ij}], \\
&= U + V.
\end{align*}
\begin{align*}
U &= 2 \sum_{i,j} (\yh_i - y_i) \dA{k_{ij}} \alpha_j, \\
&= -\frac{4}{\sigma^2} A \sum_{i,j} (\yh_i - y_i) \alpha_j k_{ij} x_{ij}x_{ij}^T, \\
&= -\frac{4}{\sigma^2} A (-\XA^T W \Xa - \Xa^T W^T \XA + \XA^T R \XA + \Xa^T S \Xa),
\end{align*}
where $W_{ij} := (\yh_i - y_i) \alpha_j k_{ij}$, and $R, S$ are defined as in lemma \ref{lem:2}.
\begin{align*}
V &= 2 \sum_{i,j} (\yh_i - y_i) \dA{\alpha_j} k_{ij}, \\
&= \frac{4}{\sigma^2} A \sum_{i,j,a,b} (\yh_i - y_i) k_{ij} (\Hmo)_{ja} k_{ab} (\Hmo \ya)_b x_{ab}x_{ab}^T, \\
&= \frac{4}{\sigma^2} A \sum_{a,b} k_{ab} (\Hmo \ya)_b x_{ab}x_{ab}^T \sum_{i,j} (\yh_i - y_i) k_{ij} (\Hmo)_{ja}, \\
&= \frac{4}{\sigma^2} A \sum_{a,b} k_{ab} (\Hmo \ya)_b x_{ab}x_{ab}^T [H^{-T} Q^T (\yh - \yA)]_{a},
\end{align*}
where $Q \in \R^{n_2 \times n_1}$ is the second kernel defined by $Q_{ij}=\kernel{i}{j}$.
In this case, $\tilde{W}_{ab} := k_{ab} (\Hmo \ya)_b [H^{-T} Q^T (\yh - \yA)]_{a}$, and corollary \ref{cor:1} yields
\[
V = \frac{4}{\sigma^2} A \Xa^T (-\tilde{W} - \tilde{W}^T + \tilde{R} + \tilde{S})\Xa,
\]
with matrices defined as in lemma \ref{lem:2}.
\begin{align*}
\dA{\mathcal{L}} = {}&-\frac{4}{\sigma^2} A (-\XA^T W \Xa - \Xa^T W^T \XA + \XA^T R \XA + \Xa^T S \Xa) \\
&+ \frac{4}{\sigma^2} A \Xa^T (-\tilde{W} - \tilde{W}^T + \tilde{R} + \tilde{S})\Xa.
\end{align*}
\begin{lemma}\label{lem:1}
For any matrix $A$ and vector $z$,
\[
\dA{}(\Vert Az \Vert_2^2) = 2 A z z^T.
\]
\end{lemma}
\begin{lemma}\label{lem:2}
Consider two matrices $A$ and $B$ having the same shapes as $\XA$ and $\Xa$ of the introduction, and let $x_{ij} = a_i - b_j$ (the $i$-th row of $A$ minus the $j$-th row of $B$). Set further the matrix
\[
\Sigma = \sum_{i,j} W_{ij} x_{ij}x_{ij}^T,
\]
with some coefficients $W_{ij}$ making up a matrix $W$. Then
\[
\Sigma = -A^T W B - B^T W^T A + A^T R A + B^T S B,
\]
where $R$ and $S$ are both diagonal matrices with $R_{ii} = \sum_j W_{ij}$ and $S_{jj} = \sum_i W_{ij}$.
\end{lemma}
\begin{corollary}\label{cor:1}
In the case that $A = B$, we get
\[
\Sigma = A^T (-W - W^T + R + S) A.
\]
\end{corollary}
\section*{Supporting Information}
% \usepackage{amsmath}
% \usepackage{amssymb}
% \usepackage{amsthm}
% \newcommand\R{\mathbb{R}}
% \newcommand\XA{X^{(A)}}
% \newcommand\Xa{X^{(\alpha)}}
% \newcommand\yA{y^{(A)}}
% \newcommand\ya{y^{(\alpha)}}
% \newcommand\dA[1]{\frac{\partial #1}{\partial A}}
% \newcommand\yh{\hat{y}}
% \newcommand\kernel[2]{\frac{1}{\sqrt{2\pi} \sigma} e^{-\frac{d(x_{#1}, x_{#2})^2}{\sigma^2}}}
% \newcommand\Hmo{H^{-1}}
% \newtheorem*{remark}{Remark}
% \newtheorem{lemma}{Lemma}
% \newtheorem{corollary}{Corollary}

% --- web-page scraping residue (hosting-site chrome, not part of the LaTeX source) ---
% Event Timeline