% mlkr_formulas.tex

\documentclass[12pt]{iopart}
\usepackage{graphicx}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsthm}
\usepackage{subcaption}
\usepackage{siunitx}
\usepackage{dirtytalk}
\usepackage[version=3]{mhchem} % Formula subscripts using \ce{}
\usepackage{bm}
\usepackage{xr}
\usepackage{xcolor}
\usepackage[normalem]{ulem}
\usepackage[style=numeric,citestyle=nature,sorting=none,backend=biber]{biblatex}
\addbibresource{bibliography.bib}
\newcommand*\mycommand[1]{\texttt{\emph{#1}}}
\newcommand{\mc}[1]{{\color{blue}#1}}
\newcommand\R{\mathbb{R}}
\newcommand\XA{X^{(A)}}
\newcommand\Xa{X^{(\alpha)}}
\newcommand\yA{y^{(A)}}
\newcommand\ya{y^{(\alpha)}}
\newcommand\dA[1]{\frac{\partial #1}{\partial A}}
\newcommand\yh{\hat{y}}
\newcommand\kernel[2]{\frac{1}{\sqrt{2\pi} \sigma} e^{-\frac{d(x_{#1}, x_{#2})^2}{\sigma^2}}}
\newcommand\Hmo{H^{-1}}
\newtheorem*{remark}{Remark}
\newtheorem{lemma}{Lemma}
\newtheorem{corollary}{Corollary}
\makeatletter
\newcommand*{\addFileDependency}[1]{
  \typeout{(#1)}
  \@addtofilelist{#1}
  \IfFileExists{#1}{}{\typeout{No file #1.}}
}
\makeatother
\newcommand*{\myexternaldocument}[1]{
  \externaldocument{#1}
  \addFileDependency{#1.tex}
  \addFileDependency{#1.aux}
}
\renewcommand{\thefootnote}{\textit{\alph{footnote}}}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% Not sure how to include SI right now with this style but will do later
\DeclareUnicodeCharacter{2212}{-}
\DeclareUnicodeCharacter{394}{$\Delta$}
% put all the external documents here!
%\myexternaldocument{SI}
%\listfiles
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{document}

First training set, used to fit the ridge coefficients $\alpha$:
\[
\Xa = \begin{pmatrix} -\quad x_1^T \quad - \\ \vdots \\ -\quad x_{n_1}^T \quad - \end{pmatrix} \in \R^{n_1 \times n}
\quad \text{and} \quad
\ya = \begin{pmatrix} y_1 \\ \vdots \\ y_{n_1} \end{pmatrix}.
\]
Second training set, used to fit the metric $A$:
\[
\XA = \begin{pmatrix} -\quad x_1^T \quad - \\ \vdots \\ -\quad x_{n_2}^T \quad - \end{pmatrix} \in \R^{n_2 \times n}
\quad \text{and} \quad
\yA = \begin{pmatrix} y_1 \\ \vdots \\ y_{n_2} \end{pmatrix}.
\]
For ease of notation, the row vectors of $\Xa$ and $\XA$ and the components of $\ya$ and $\yA$ are distinguished only by the range of their index: $i$ ranges over $\{1, \cdots, n_2 \}$ (second set), while $j, a, b$ range over $\{1, \cdots, n_1\}$ (first set). The convention is admittedly backwards, but it is used consistently below.

\subsection{Ridge regression}
Consider the Mahalanobis distance $d(x,y)^2 = \Vert A(x-y) \Vert_2^2$. We define the first kernel matrix $K \in \R^{n_1 \times n_1}$ by $K_{j,j'} = \kernel{j}{j'}$, and set the ridge regression coefficients $\alpha \in \R^{n_1}$ to
\[
\alpha = (K + \lambda I)^{-1} \ya.
\]
\begin{remark}
If $K$ is singular, there are many choices of $\alpha$ and this one might not be optimal. However, the exact derivation of the gradient needs a fixed formula, and I am scared of SVD.
\end{remark}
From $\alpha$, we define the predictions $\yh_i = \sum_j \alpha_j k_{ij}$, where $k_{ij} = \kernel{i}{j}$.

\subsection{Cost function}
Let the cost function be $\mathcal{L} = \sum_i (\yh_i - y_i)^2$, which depends on $A$ implicitly through the kernels. We aim to compute $\dA{\mathcal{L}}$, the matrix satisfying $(\dA{\mathcal{L}})_{ij} = \frac{\partial \mathcal{L}}{\partial A_{ij}}$.

\section{Computing the gradient}
\[
\dA{\mathcal{L}} = 2 \sum_i (\yh_i - y_i) \dA{\yh_i}.
\]
From the expression of $\yh_i$,
\[
\dA{\yh_i} = \sum_j \dA{}(\alpha_j k_{ij}) = \sum_j \dA{k_{ij}} \alpha_j + \sum_j \dA{\alpha_j} k_{ij}.
\]
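Before attacking the two derivative terms, it helps to pin down the forward pass in code. The following is a minimal NumPy sketch of the pipeline above (first kernel, ridge coefficients, predictions, loss); the function name \texttt{forward} and the default hyperparameters are illustrative only, not part of the derivation.
\begin{verbatim}
import numpy as np

def forward(A, X_a, y_a, X_A, y_A, sigma=1.0, lam=1e-3):
    # Pairwise squared Mahalanobis distances d(x, x')^2 = ||A(x - x')||^2.
    def sqdist(X, Y):
        XA, YA = X @ A.T, Y @ A.T
        return ((XA[:, None, :] - YA[None, :, :]) ** 2).sum(-1)

    c = 1.0 / (np.sqrt(2.0 * np.pi) * sigma)
    K = c * np.exp(-sqdist(X_a, X_a) / sigma**2)   # first kernel, n1 x n1
    alpha = np.linalg.solve(K + lam * np.eye(len(X_a)), y_a)
    Q = c * np.exp(-sqdist(X_A, X_a) / sigma**2)   # second kernel, n2 x n1
    y_hat = Q @ alpha                              # predictions on second set
    return alpha, y_hat, np.sum((y_hat - y_A) ** 2)
\end{verbatim}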
The gradient thus boils down to computing $\dA{k_{ij}}$ and $\dA{\alpha_j}$. The first part is easy to compute, so I will do it first; by now I know it by heart.

\subsection{Computing $\dA{k_{ij}}$}\label{ssec:1}
Let $x_{ij} := x_i - x_j$, so that $d(x_i, x_j)^2 = \Vert A x_{ij} \Vert_2^2$. Then
\begin{align*}
\dA{k_{ij}} &= \dA{}\left(\kernel{i}{j}\right), \\
&= -\frac{1}{\sigma^2} k_{ij} \dA{}\left(d(x_i,x_j)^2\right), \\
&= -\frac{1}{\sigma^2} k_{ij} \dA{}\left(\Vert A x_{ij} \Vert_2^2\right), \\
&= -\frac{2k_{ij}}{\sigma^2} A x_{ij}x_{ij}^T,
\end{align*}
where the last equality follows from Lemma \ref{lem:1} of the appendix.

\subsection{Computing $\dA{\alpha_j}$}
Brace yourself, for this journey is long. Let $H = K + \lambda I$ be the matrix used to define $\alpha = \Hmo \ya$, so that $\alpha_j = \sum_{j'} (\Hmo)_{jj'} \ya_{j'}$ and
\[
\dA{\alpha_j} = \sum_{j'} \dA{}(\Hmo)_{jj'} \, \ya_{j'}.
\]
Differentiating $H \Hmo = I$ with respect to any entry $A_{mn}$ gives $\frac{\partial \Hmo}{\partial A_{mn}} = -\Hmo \frac{\partial H}{\partial A_{mn}} \Hmo = -\Hmo \frac{\partial K}{\partial A_{mn}} \Hmo$ (since $H = K + \lambda I$), and thus, writing $e_{mn}$ for the matrix with a one in entry $(m,n)$ and zeros elsewhere,
\begin{align*}
\dA{}(\Hmo)_{jj'} &= \sum_{m,n} \frac{\partial}{\partial A_{mn}} (\Hmo)_{jj'} \, e_{mn}, \\
&= \sum_{m,n} \left(\frac{\partial \Hmo}{\partial A_{mn}}\right)_{jj'} e_{mn}, \\
&= -\sum_{m,n} \left(\Hmo \frac{\partial K}{\partial A_{mn}} \Hmo\right)_{jj'} e_{mn}, \\
&= -\sum_{m,n,a,b} (\Hmo)_{ja} \frac{\partial K_{ab}}{\partial A_{mn}} (\Hmo)_{bj'} \, e_{mn}, \\
&= -\sum_{a,b} (\Hmo)_{ja} (\Hmo)_{bj'} \sum_{m,n} \frac{\partial K_{ab}}{\partial A_{mn}} e_{mn}, \\
&= -\sum_{a,b} (\Hmo)_{ja} (\Hmo)_{bj'} \dA{K_{ab}}.
\end{align*}
Using the result of Section \ref{ssec:1}, $\dA{K_{ab}} = -\frac{2 k_{ab}}{\sigma^2} A x_{ab} x_{ab}^T$. Note that both $a$ and $b$ range over $\{ 1, \cdots, n_1\}$, as they index the first kernel $K$. Hence
\[
\dA{}(\Hmo)_{jj'} = \frac{2}{\sigma^2} A \sum_{a,b} (\Hmo)_{ja} (\Hmo)_{bj'} k_{ab} \, x_{ab}x_{ab}^T,
\]
and
\begin{align*}
\dA{\alpha_j} &= \sum_{j'} \dA{}(\Hmo)_{jj'} \, \ya_{j'}, \\
&= \frac{2}{\sigma^2} A \sum_{j',a,b} (\Hmo)_{ja} (\Hmo)_{bj'} k_{ab} \, x_{ab}x_{ab}^T \, \ya_{j'}, \\
&= \frac{2}{\sigma^2} A \sum_{a,b} (\Hmo)_{ja} k_{ab} \, x_{ab}x_{ab}^T \sum_{j'} (\Hmo)_{bj'} \ya_{j'}, \\
&= \frac{2}{\sigma^2} A \sum_{a,b} (\Hmo)_{ja} k_{ab} (\Hmo \ya)_b \, x_{ab}x_{ab}^T.
\end{align*}

\subsection{Final result}
Combining everything, we hopefully get the right answer:
\begin{align*}
\dA{\mathcal{L}} &= 2 \sum_i (\yh_i - y_i) \dA{\yh_i}, \\
&= 2 \sum_{i,j} (\yh_i - y_i) \left[ \dA{k_{ij}} \alpha_j + \dA{\alpha_j} k_{ij} \right], \\
&=: U + V.
\end{align*}
For the first term, Lemma \ref{lem:2} (applied with $C = \XA$, $D = \Xa$) gives
\begin{align*}
U &= 2 \sum_{i,j} (\yh_i - y_i) \dA{k_{ij}} \alpha_j, \\
&= -\frac{4}{\sigma^2} A \sum_{i,j} (\yh_i - y_i) \alpha_j k_{ij} \, x_{ij}x_{ij}^T, \\
&= -\frac{4}{\sigma^2} A \left( -\XA^T W \Xa - \Xa^T W^T \XA + \XA^T R \XA + \Xa^T S \Xa \right),
\end{align*}
where $W_{ij} := (\yh_i - y_i) \alpha_j k_{ij}$, and $R$, $S$ are defined as in Lemma \ref{lem:2}. For the second term,
\begin{align*}
V &= 2 \sum_{i,j} (\yh_i - y_i) \dA{\alpha_j} k_{ij}, \\
&= \frac{4}{\sigma^2} A \sum_{i,j,a,b} (\yh_i - y_i) k_{ij} (\Hmo)_{ja} k_{ab} (\Hmo \ya)_b \, x_{ab}x_{ab}^T, \\
&= \frac{4}{\sigma^2} A \sum_{a,b} k_{ab} (\Hmo \ya)_b \, x_{ab}x_{ab}^T \sum_{i,j} (\yh_i - y_i) k_{ij} (\Hmo)_{ja}, \\
&= \frac{4}{\sigma^2} A \sum_{a,b} k_{ab} (\Hmo \ya)_b \left[ H^{-T} Q^T (\yh - \yA) \right]_{a} x_{ab}x_{ab}^T,
\end{align*}
where $Q \in \R^{n_2 \times n_1}$ is the second kernel matrix, defined by $Q_{ij} = \kernel{i}{j}$ (so $Q_{ij} = k_{ij}$). Setting $\tilde{W}_{ab} := k_{ab} (\Hmo \ya)_b \left[ H^{-T} Q^T (\yh - \yA) \right]_{a}$, Corollary \ref{cor:1} yields
\[
V = \frac{4}{\sigma^2} A \Xa^T (-\tilde{W} - \tilde{W}^T + \tilde{R} + \tilde{S}) \Xa,
\]
with $\tilde{R}$ and $\tilde{S}$ defined from $\tilde{W}$ as in Lemma \ref{lem:2}.

Finally,
\begin{align*}
\dA{\mathcal{L}} = &-\frac{4}{\sigma^2} A \left( -\XA^T W \Xa - \Xa^T W^T \XA + \XA^T R \XA + \Xa^T S \Xa \right) \\
&+ \frac{4}{\sigma^2} A \Xa^T (-\tilde{W} - \tilde{W}^T + \tilde{R} + \tilde{S}) \Xa.
\end{align*}
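Before trusting this formula in an optimizer, it is worth validating it numerically. Below is a minimal finite-difference sketch built on the \texttt{forward} function above; \texttt{num\_grad} is an illustrative name, and its output should agree with any implementation of the closed-form gradient up to $O(\varepsilon^2)$.
\begin{verbatim}
def num_grad(A, X_a, y_a, X_A, y_A, eps=1e-6, **kw):
    # Central finite differences on L(A), one entry of A at a time.
    # Slow (two forward passes per entry) but a reliable sanity check.
    G = np.zeros_like(A)
    for idx in np.ndindex(*A.shape):
        E = np.zeros_like(A)
        E[idx] = eps
        lp = forward(A + E, X_a, y_a, X_A, y_A, **kw)[2]
        lm = forward(A - E, X_a, y_a, X_A, y_A, **kw)[2]
        G[idx] = (lp - lm) / (2.0 * eps)
    return G

# Example usage on random data:
# rng = np.random.default_rng(0)
# A = rng.normal(size=(3, 3))
# X_a, y_a = rng.normal(size=(5, 3)), rng.normal(size=5)
# X_A, y_A = rng.normal(size=(4, 3)), rng.normal(size=4)
# print(num_grad(A, X_a, y_a, X_A, y_A))
\end{verbatim}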
\section{Appendix}
\begin{lemma}\label{lem:1}
For any vector $z$,
\[
\dA{}\left(\Vert Az \Vert_2^2\right) = 2 A z z^T.
\]
\end{lemma}
\begin{proof}
$\Vert Az \Vert_2^2 = z^T A^T A z$, so $\frac{\partial}{\partial A_{mn}} \Vert Az \Vert_2^2 = 2 (Az)_m z_n = 2 (A z z^T)_{mn}$.
\end{proof}
\begin{lemma}\label{lem:2}
Consider two matrices $C \in \R^{p \times n}$ and $D \in \R^{q \times n}$ (in the main text they play the roles of $\XA$ and $\Xa$), and let $x_{ij} = c_i - d_j$ (the $i$-th row of $C$ minus the $j$-th row of $D$). For coefficients $W_{ij}$ making up a matrix $W \in \R^{p \times q}$, set
\[
\Sigma = \sum_{i,j} W_{ij} \, x_{ij}x_{ij}^T.
\]
Then
\[
\Sigma = -C^T W D - D^T W^T C + C^T R C + D^T S D,
\]
where $R$ and $S$ are both diagonal matrices, with $R_{ii} = \sum_j W_{ij}$ and $S_{jj} = \sum_i W_{ij}$.
\end{lemma}
\begin{proof}
Expand $x_{ij}x_{ij}^T = c_i c_i^T - c_i d_j^T - d_j c_i^T + d_j d_j^T$ and sum each term against $W_{ij}$: $\sum_{i,j} W_{ij} c_i d_j^T = C^T W D$, $\sum_{i,j} W_{ij} d_j c_i^T = D^T W^T C$, $\sum_{i,j} W_{ij} c_i c_i^T = \sum_i R_{ii} \, c_i c_i^T = C^T R C$, and $\sum_{i,j} W_{ij} d_j d_j^T = D^T S D$.
\end{proof}
\begin{corollary}\label{cor:1}
In the case $C = D$, we get
\[
\Sigma = C^T (-W - W^T + R + S) \, C.
\]
\end{corollary}
\clearpage
\printbibliography
\clearpage
\section*{Supporting Information}

\end{document}