\documentclass[12pt]{iopart}
\usepackage{utf8math}
\usepackage{todonotes}
\usepackage{xcolor}
\usepackage{graphicx}
%% the following two lines because the iopart class conflicts with amsmath
\expandafter\let\csname equation*\endcsname\relax
\expandafter\let\csname endequation*\endcsname\relax
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsthm}
\usepackage{subcaption}
\usepackage{siunitx}
\usepackage{dirtytalk}
\usepackage[version=3]{mhchem} % Formula subscripts using \ce{}
\usepackage{bm}
\usepackage{xr}
\usepackage[normalem]{ulem}
\usepackage[style=numeric,citestyle=nature,sorting=none,backend=biber]{biblatex}
\addbibresource{fix_bib.bib}
\newcommand*\mycommand[1]{\texttt{\emph{#1}}}
\newcommand{\mc}[1]{{\color{blue}#1}}
\newcommand\R{\mathbb{R}}
\newcommand\XA{{X^{(A)}}}
\newcommand\Xa{{X^{(\alpha)}}}
\newcommand\xa{{x^{(\alpha)}}}
\newcommand\xai[1]{{x_#1^{(\alpha)}}}
\newcommand\xA{{x^{(A)}}}
\newcommand\xAi[1]{{x_#1^{(A)}}}
\newcommand\yA{{y^{(A)}}}
\newcommand\yAi[1]{{y_#1^{(A)}}}
\newcommand\ya{{y^{(\alpha)}}}
\newcommand\yai[1]{{y_#1^{(\alpha)}}}
\newcommand\dA[1]{\frac{\partial #1}{\partial A}}
\newcommand\yh{\widehat{y}}
\newcommand\kernel[2]{\frac{1}{\sqrt{2\pi} \sigma} e^{-\frac{d(x_{#1}, x_{#2})^2}{\sigma^2}}}
\newcommand\Hmo{H^{-1}}
\newtheorem*{remark}{Remark}
\newtheorem{lemma}{Lemma}
\newtheorem{proposition}{Proposition}
\newtheorem{corollary}{Corollary}
\makeatletter
\newcommand*{\addFileDependency}[1]{
\typeout{(#1)}
\@addtofilelist{#1}
\IfFileExists{#1}{}{\typeout{No file #1.}}
}
\makeatother
\newcommand*{\myexternaldocument}[1]{
\externaldocument{#1}
\addFileDependency{#1.tex}
\addFileDependency{#1.aux}
}
\renewcommand{\thefootnote}{\textit{\alph{footnote}}}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% Not sure how to include SI right now with this style but will do later
\DeclareUnicodeCharacter{2212}{-}
\DeclareUnicodeCharacter{394}{$\Delta$}
% put all the external documents here!
%\myexternaldocument{SI}
%\listfiles
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{document}
\title{Metric Learning for Kernel Ridge Regression: Assessment of Molecular Similarity}
\author{Raimon Fabregat\textsuperscript{1,\ddag}, Puck van Gerwen\textsuperscript{1,2,\ddag}, Matthieu Haeberle\textsuperscript{1,3,\ddag}, Friedrich Eisenbrand\textsuperscript{3} and Cl\'emence Corminboeuf\textsuperscript{1,2}}
\address{\textsuperscript{1}Laboratory for Computational Molecular Design, Institute of Chemical Sciences and Engineering, \'Ecole Polytechnique F\'ed\'erale de Lausanne, 1015 Lausanne, Switzerland\\
\textsuperscript{2}National Center for Competence in Research-Catalysis (NCCR-Catalysis), Zurich, Switzerland \\
\textsuperscript{3}Chair of Discrete Optimization, \'Ecole Polytechnique F\'ed\'erale de Lausanne, 1015 Lausanne, Switzerland\\
\textsuperscript{\ddag}These authors contributed equally to this work.
}
\ead{clemence.corminboeuf@epfl.ch}
\vspace{10pt}
\begin{indented}
\item[]\today
\end{indented}
% ABSTRACT / CONTRIBUTION IN PROGRESS
\begin{abstract} \todo{\scriptsize FE: My try. Rewrite abstract at will}
Kernel regression methods are widely used to estimate an unknown
real-valued function from a high-dimensional feature space.
Typically, these methods are built on a distance-based kernel. The
standard Euclidean distance, however, often fails to reflect the
differences in the observed function values, because parts of the signal in the data may be
irrelevant to the target. An elegant approach to surmount this
shortcoming is \emph{metric learning}, which finds
an appropriate transformation of the training set, consisting
of a rotation and a scaling. The state of the art in metric learning is
based on the \emph{Nadaraya-Watson}
estimator and the approximation of an optimal corresponding
transformation. The Nadaraya-Watson estimator is often observed to be inferior to
\emph{Kernel Ridge Regression}, especially for prediction tasks in
computational chemistry. In this paper, we propose a
metric learning method that is based on Kernel Ridge Regression
instead. While the loss gradient is considerably more involved,
the method shows superior performance for important regression
tasks in computational chemistry, such as the prediction of
atomization energies of molecules in the QM9 set. The improvement is also
visible in standard 2D projections of the transformed feature
space.
\end{abstract}
%\section{Contribution}
%We propose a novel method for prediction based on \emph{regression} and \emph{metric learning}.
\section{Introduction}
Supervised and unsupervised machine learning methods have demonstrated tremendous capabilities in the natural sciences over the last decade\todo{citations}. On the one hand, supervised models aim to find a mathematical relationship between input data and target properties. On the other hand, unsupervised algorithms provide means to find underlying structures in unlabelled data. A concept central to algorithms of both kinds is \textit{similarity}. The underlying assumption is that points close in the feature space should be close in property space. This is the basis of prediction methods such as the widely used kernel methods, \textit{e.g.}, Kernel Ridge Regression (KRR) and Gaussian Process Regression (GPR), and of the large majority of unsupervised methods for clustering and dimensionality reduction. The performance of such approaches critically depends on the similarity measure used to compare data points. Similarity between two elements of a feature space is often constructed using a kernel, a function that outputs a scalar value between 0 (dissimilar) and 1 (identical). The most widely used kernel functions contain a decaying exponential of a metric, such as the Euclidean distance (Gaussian kernel) or the Manhattan distance (Laplacian kernel). Alternative measures such as the linear kernel (the dot product between the feature vectors) or the cosine similarity also exist.\\
There is a limited number of available similarity measures constructed from pre-defined metrics together with a given kernel. Pre-defined metrics are especially unsuitable
when a descriptive feature space is high dimensional and possibly polluted with irrelevant or redundant features. In such cases, the similarity measured in the feature space will not necessarily correlate with that of the target property. For instance, the Euclidean distance used in the Gaussian kernel ($d_E(\textbf{a},\textbf{b}) = \sqrt{\sum_i(a_i - b_i)^2} = ||\textbf{a} - \textbf{b}||_2$) is not invariant under monotonic transformations and is sensitive to the scaling of individual coordinates, so that features with high variance (which are not necessarily relevant) dominate across a dataset.\\
Similarity-based algorithms relying upon physics-inspired molecular representations, as typically used in chemistry, ideally illustrate these limitations. These representations are designed to encode all the necessary information to unequivocally describe a molecular structure and generally take the form of non-linear functions of atom types and positions that are directly injected into the ML architecture. Popular examples of such representations are SLATM \cite{huang2020quantum}, SOAP \cite{bartok_2013_soap} and FCHL \cite{faber_alchemical_2018, christensen2020fchl}, among many others \cite{rupp2012fast, huang2016communication, musil2021physics}. They are used to develop predictive models for a variety of properties \cite{rupp2012fast, von2018quantum,christensen2020fchl,li2015molecular, chmiela_machine_2017, chmiela_towards_2018,bereau_transferable_2015,grisafi2018symmetry, wilkins_accurate_nodate,grisafi2018transferable,bartok2010gaussian,fabrizio2019electron,westermayr_machine_2019} or to facilitate the visualisation and interpretation of the chemical space \cite{ceriotti_unsupervised_2019, glielmo2021unsupervised, cheng2020mapping} when combined with unsupervised algorithms based on dimensionality reduction (\textit{e.g.}, Principal Component Analysis (PCA), \textit{t}-distributed Stochastic Neighbour Embedding (\textit{t}-SNE) \cite{van2008visualizing}, sketch-map \cite{ceriotti_2013_sketchmap}). Yet, these representations do not provide a universally good measure of molecular similarity, resulting in sub-optimal performance when more challenging properties are targeted \cite{gallarati2021reaction}.
This deficiency is intrinsically connected to the use of pre-defined metrics that treat all the features on an equal footing regardless of their redundancy or relevance to a particular task. While redundancy in a feature space is easily eliminated by using unsupervised linear dimensionality reduction algorithms such as PCA and the CUR decomposition \cite{mahoney2009cur}, adapting the relevance of descriptive features in a metric for a particular target requires a supervised approach \cite{kuhn2013applied}.\\
A family of methods called metric or similarity learning exists specifically for this purpose. Metric learning (or distance metric learning, or similarity learning) \cite{kulis2012metric, yang2006distance} provides an elegant solution to adapt the notion of similarity (and distance) according to the target property. Thus, similarity can be transformed from a global concept (two molecules are similar based on their structure) to an application-specific concept (two molecules are similar in the context of their properties). This conceptual shift enables a more specific definition of similarity, which naturally circumvents many of the pitfalls of pre-defined similarity notions, as well as enabling more accurate machine learning models. Despite their enormous promise, metric learning algorithms have remained largely absent from the chemical domain, except for a few examples on graph-structured data \cite{jeon2019resimnet, zheng2019sense, nasser2020improved, coupry2022application}. This is likely because most frameworks are designed for classification \cite{Kilian2007, koch2015siamese, hoffer2015deep}, while chemistry is dominated by regression tasks.\\
One of the few examples of metric learning algorithms for regression is Metric Learning for Kernel Regression (MLKR), developed by Weinberger and Tesauro \cite{Kilian2007}. MLKR learns a linear transformation matrix that transforms the distance metric between points in order to optimize the prediction of a particular target in a kernel regression. However, the kernel regression used in MLKR employs the non-parametric Nadaraya–Watson (NW) estimator, which, as demonstrated in this work, is less suited than the KRR estimator for regression tasks. The NW estimator, which is essentially a nearest-neighbours estimator, evaluates similarity between points in a relative manner, using a notion that depends on the distribution of points within a particular dataset. In KRR, the notion of similarity is instead absolute, \textit{i.e.}, independent of the distribution of data. This is an important distinction if the aim is to accurately predict molecular properties. Within this context, identifying the training instances (molecules) that are \textit{close} (in the absolute sense) to the molecular system of interest is more relevant than identifying the \textit{closest} points, which potentially correspond to fairly dissimilar molecules. In order to enable metric learning in the KRR framework, we propose a new algorithm, Metric Learning for Kernel Ridge Regression (MLKRR), which modifies the MLKR formalism for the KRR estimator. MLKRR could enable the widespread adoption of metric learning for kernel-based machine learning tasks, thereby re-defining the fundamental notion of similarity upon which they depend. We demonstrate the performance of the MLKRR algorithm on the prototypical task of regressing atomization energies of small molecules in the QM9 dataset \cite{ramakrishnan_quantum_2014}.
\section{Metric learning}
\label{sec:theory}
% either same distance, new space or new distance, same space
Metric learning algorithms transform a feature space in order to construct a distance function that minimizes the prediction error of a specific property. They learn a linear transformation matrix $A$ that generates a \emph{Mahalanobis distance}~\cite{mahalanobis1936generalized} ($d_M$) on the original space
\begin{equation}
% d_E(\textbf{a}',\textbf{b}') = ||\textbf{a}' - \textbf{b}'||_2 =
d_M(\textbf{a},\textbf{b}) = ||\mathbf{A}\textbf{a} - \mathbf{A}\textbf{b}||_2 = ||\mathbf{A} (\textbf{a} - \textbf{b})||_2.
\end{equation}
%
% rotation and scaling
Since $A^TA$ is positive semidefinite, there exists an orthogonal
matrix $Q$ as well as a non-negative diagonal matrix $D$ such that
$A^TA = Q^T D Q$, see \cite{Kilian2007}. Thus, the transformation of
the feature space with $A$ corresponds to a rotation and a scaling of
the components. In this way, distances between points can be increased or
decreased, depending on their relevance for the target property.
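For illustration, the following minimal NumPy sketch (purely illustrative, not part of any published implementation) evaluates the Mahalanobis distance induced by a transformation matrix $A$ and checks that it reduces to the Euclidean distance when $A$ is the identity.
\begin{verbatim}
import numpy as np

def mahalanobis_distance(a, b, A):
    """Distance ||A(a - b)||_2 induced by the linear transformation A."""
    return np.linalg.norm(A @ (a - b))

rng = np.random.default_rng(0)
a, b = rng.normal(size=3), rng.normal(size=3)
A = rng.normal(size=(3, 3))   # a generic linear transformation

# With A = I, the Mahalanobis distance reduces to the Euclidean one.
assert np.isclose(mahalanobis_distance(a, b, np.eye(3)),
                  np.linalg.norm(a - b))
print(mahalanobis_distance(a, b, A))
\end{verbatim}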
\subsection{Metric Learning for Kernel Regression}\label{subsec:mlkr}
Weinberger and Tesauro \cite{Kilian2007} propose the following method to apply the transformation matrix $A$ in the context of prediction.
Suppose that the data points are $x_i \in \mathbb{R}^d$ and the corresponding labels are $y_i \in \mathbb{R}$, $i=1,\dots,n$. The transformation is used in the \emph{Gaussian kernel} as follows
\begin{equation}
k(x_i, x_j) = %\frac{1}{\sqrt{2\pi} \sigma} e^{-\frac{d_M(x_{i}, x_{j})^2}{\sigma^2}} =
\frac{1}{\sqrt{2\pi} \sigma} e^{-\frac{||\mathbf{A} (x_i - x_j)||_2^2}{\sigma^2}}
\label{eq:kernel}
\end{equation}
%
% move 1/Z to when it is needed
The authors rely on the \emph{Nadaraya–Watson estimator (NW)}
\begin{equation}
\widehat{y}_i = \frac{\sum_{j \neq i} y_j k_{ij}}{\sum_{j \neq i} k_{ij}}. %= \frac{1}{Z_i}\sum_{j \neq i} y_j k_{ij},
\label{eq:NW}
\end{equation}
%
The prediction of a new data point $x$ is then
\begin{equation}
f(x) = \widehat{y} = \frac{\sum_{j} y_j k(x,x_j)}{\sum_{j}k(x,x_j)}
\end{equation}
%
The learning of the matrix $A$ is expressed as an optimization problem. The goal is to minimize the residual sum of squares $\mathcal{L} := \sum_{i}(y_i - \widehat{y}_i)^2$. The optimization then relies on the gradient of the loss with respect to the transformation matrix $A$
\begin{align}\label{eq:grad_mlkr}
\frac{\partial \mathcal{L}}{\partial A} = \frac{4}{\sigma^2}A \sum_i (\widehat{y}_i - y_i) \frac{1}{Z_i} \sum_{j \neq i} (\widehat{y}_i - y_j) k_{ij} x_{ij}x_{ij}^T,
\end{align}
where $x_{ij} := x_i - x_j$ and $Z_i := \sum_{j \neq i} k_{ij}$.
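As an illustration of equations \eqref{eq:kernel} and \eqref{eq:NW}, the following NumPy sketch (a simplified, unoptimized rendition rather than the reference implementation) evaluates the leave-one-out NW predictions and the corresponding MLKR loss for a given transformation matrix $A$.
\begin{verbatim}
import numpy as np

def mlkr_loss(A, X, y, sigma=1.0):
    """Leave-one-out NW loss sum_i (y_i - yhat_i)^2 for the Gaussian kernel."""
    Z = X @ A.T                                  # transformed features
    sq = ((Z[:, None] - Z[None]) ** 2).sum(-1)   # ||A(x_i - x_j)||^2
    K = np.exp(-sq / sigma**2) / (np.sqrt(2 * np.pi) * sigma)
    np.fill_diagonal(K, 0.0)                     # exclude j = i
    y_hat = (K @ y) / K.sum(axis=1)              # Nadaraya-Watson estimate
    return np.sum((y - y_hat) ** 2)

rng = np.random.default_rng(1)
X = rng.normal(size=(50, 4))
y = X[:, 0] + 0.1 * rng.normal(size=50)          # toy target
print(mlkr_loss(np.eye(4), X, y))
\end{verbatim}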
\subsection{Metric Learning for Kernel Ridge Regression}
% standard reference
To overcome \todo{Here, we need some sentences explaining that MLKR does not work compared to kernel ridge regression for our application} the pitfalls of the NW estimator in metric learning, we introduce Metric Learning for Kernel Ridge Regression (MLKRR), which uses the standard parametric \emph{KRR estimator} instead
\begin{equation}
\widehat{y}_i = \sum_j \alpha_j k(x_i, x_j)
\end{equation}
The prediction of a new data point $x$ is then:
\begin{equation}\label{eq:pred-krr}
f(x) = \widehat{y} = \sum_j \alpha_j k(x, x_j)
\end{equation}
The coefficients $\alpha_j$ that minimize the (regularized) sum of squares $\mathcal{L}_\alpha := \sum_{i}(y_i - \widehat{y}_i)^2$ are obtained by solving the linear system \todo{MH: citation on KRR}
\begin{equation}
\boldsymbol{\alpha} = (K + \lambda I)^{-1} \boldsymbol{y}
\label{eq:krr}
\end{equation}
where $K \in \mathbb{R}^{n \times n}$ is the kernel matrix with entries $K_{ij} = k(x_i,x_j)$ given by the Gaussian kernel, $\lambda$ is a small regularization parameter and $I$ is the identity matrix. The coefficients $\boldsymbol{\alpha}$ defined above are one specific choice of coefficients that solve kernel ridge regression. \todo{FE: Meaning is not clear to me} Many other choices exist and may prove more robust. However, with the matrix optimization in mind, we chose a fixed expression in order to simplify the computation of the gradient \eqref{eq:grad} below.
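For reference, a minimal sketch of this building block (illustrative only, not the published module) solves the linear system \eqref{eq:krr} for $\boldsymbol{\alpha}$ and predicts new points as in \eqref{eq:pred-krr}.
\begin{verbatim}
import numpy as np

def gaussian_kernel(X1, X2, A, sigma):
    """Kernel matrix k(x_i, x_j) built on the transformed features."""
    sq = (((X1 @ A.T)[:, None] - (X2 @ A.T)[None]) ** 2).sum(-1)
    return np.exp(-sq / sigma**2) / (np.sqrt(2 * np.pi) * sigma)

def fit_krr(X, y, A, sigma=1.0, lam=1e-9):
    """Solve (K + lambda I) alpha = y for the KRR coefficients."""
    K = gaussian_kernel(X, X, A, sigma)
    return np.linalg.solve(K + lam * np.eye(len(y)), y)

def predict_krr(X_new, X_train, alpha, A, sigma=1.0):
    """f(x) = sum_j alpha_j k(x, x_j)."""
    return gaussian_kernel(X_new, X_train, A, sigma) @ alpha
\end{verbatim}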
In classical KRR, the $\boldsymbol{\alpha}$ coefficients are optimized on a fixed kernel. In the metric learning approach described above, on the other hand, only the matrix $A$ is optimized.
The novelty of our method lies in the \emph{combination} of both principles. We now explain the two-step procedure that optimizes first the estimator and then the metric.
We first sample $n_\alpha$ data points $\{ \xai{i} \}_{i=1}^{n_\alpha}$ and their corresponding labels $\{ \yai{i} \}_{i=1}^{n_\alpha}$ from our dataset in order to compute the estimator from the coefficients $\boldsymbol{\alpha}$. It follows from the expression \eqref{eq:krr} above that $\boldsymbol{\alpha}$ may be written in terms of the matrix $A$, hidden in the kernel matrix $K_{ij} := k(\xai{i}, \xai{j})$.
Once the estimator is found, we use its predictions \eqref{eq:pred-krr} in order to optimize the matrix $A$. For this, a new loss function is considered on new data points. Indeed, to increase the quality of our optimization, we do not predict data that was already used in the training set of the estimator. To this end, we sample $n_A$ new data points $\{ \xAi{i} \}_{i=1}^{n_A}$ with their corresponding labels $\{ \yAi{i} \}_{i=1}^{n_A}$. The predictions for these points are then given by
\begin{equation}\label{eq:pred-A}
\widehat{y}^{(A)}_i := \sum_j \alpha_j k(x^{(A)}_i, x^{(\alpha)}_j),
\end{equation}
which in turn yields the second loss function to minimize, $\mathcal{L}_A := \sum_{i}(y^{(A)}_i - \widehat{y}^{(A)}_i)^2$.
The matrix $A$ is then computed by gradient descent using the gradient $\nabla_A\mathcal{L}_A$ \todo{MH: since the function is not convex. Find an easy argument as to why.} of the loss function. To express the gradient in closed form, we introduce the following notation.
The predictions \eqref{eq:pred-A} call for the definition of the additional kernel matrix $Q_{ij}:=k(\xAi{i}, \xai{j})$. Further, we let $\Xa$ and $\XA$ denote the matrices whose rows are the data points with the corresponding superscript, and let $\ya$ and $\yA$ be the vectors whose entries are the property labels with the corresponding superscript.
The gradient $\nabla_A\mathcal{L}_A$ is now given by
\begin{align}\label{eq:grad}
\begin{split}
\dA{\mathcal{L}_A} = &-\frac{4}{\sigma^2} A \left(-\XA^T W \Xa - \Xa^T W^T \XA + \XA^T R \XA + \Xa^T S \Xa\right) \\
&+ \frac{4}{\sigma^2} A \Xa^T \left(-\tilde{W} - \tilde{W}^T + \tilde{R} + \tilde{S}\right)\Xa,
\end{split}
\end{align}
where $W_{ij} := (\yh^{(A)}_i - \yAi{i}) \alpha_j Q_{ij}$ and $\tilde{W}_{ab} := K_{ab} \alpha_b \left[(K+\lambda I)^{-T} Q^T (\yh^{(A)} - \yA) \right]_{a}$. The diagonal matrices $R, S, \tilde{R}, \tilde{S}$ are the row and column sums of $W$ and $\tilde{W}$, that is, $R_{ii} := \sum_j W_{ij}$ and $S_{jj} := \sum_i W_{ij}$, and analogously $\tilde{R}_{aa} := \sum_b \tilde{W}_{ab}$ and $\tilde{S}_{bb} := \sum_a \tilde{W}_{ab}$. More details on the definitions of these matrices, together with the proof of correctness of the expression, are given in the appendix.
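To make the two-step procedure concrete, the sketch below (illustrative only; in practice the closed-form gradient \eqref{eq:grad} or automatic differentiation is used rather than finite differences) evaluates $\mathcal{L}_A$ for a given $A$ by fitting $\boldsymbol{\alpha}$ on the $\alpha$-set and scoring the predictions on the $A$-set.
\begin{verbatim}
import numpy as np

def kernel(X1, X2, A, sigma):
    sq = (((X1 @ A.T)[:, None] - (X2 @ A.T)[None]) ** 2).sum(-1)
    return np.exp(-sq / sigma**2) / (np.sqrt(2 * np.pi) * sigma)

def mlkrr_loss(A, X_alpha, y_alpha, X_A, y_A, sigma=1.0, lam=1e-9):
    """Two-step MLKRR loss: fit alpha on the alpha-set, score on the A-set."""
    K = kernel(X_alpha, X_alpha, A, sigma)        # kernel K on the alpha-set
    alpha = np.linalg.solve(K + lam * np.eye(len(y_alpha)), y_alpha)
    Q = kernel(X_A, X_alpha, A, sigma)            # cross kernel Q
    y_hat = Q @ alpha                             # predictions on the A-set
    return np.sum((y_A - y_hat) ** 2)
\end{verbatim}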
Taking the Python library \texttt{metric-learn} \cite{de2020metric} as inspiration\todo{FE: Maybe better to write: We implemented ... based on the python library ...}, we implemented MLKR and MLKRR in a Python module available at \texttt{github.com/lcmd-epfl/MLKRR}.
% PCA now gives meaningful separation of the points
\begin{figure}
\centering
\includegraphics[width=1\textwidth]{Figures/mlkrr.pdf}
\caption{ This figure shows the result of the transformation of the points carried out by the MLKRR algorithm on a training set that is divided into two sets labeled with $\alpha$ and $A$. The colors of the points represent the corresponding function values. The projection of the points with standard PCA does not show a correlation of the value with the first component, while the PCA projection of the points ($PCA'$) that have been transformed with $A$ exhibits such a correlation. The transformed point set features less noise and the relevant signals are amplified. \todo{FE: I rewrote this. Please check.}
% Schematic depiction of the MLKRR algorithm. The training set contains $\alpha$ and $A$ pointsin order to optimize both parameters separately. MLKRR optimizes the transformation matrix $A$, transforming the Euclidean distance to a Mahalanobis distance in order to minimize the prediction errors of a KRR model constructed using the set $\alpha$. This results in a new set of features, which reduce noise and amplify the relevant signal for regression tasks.
}
\label{fig:mlkrr}
\end{figure}
%$\Xa = \begin{pmatrix} -\quad x^{(\alpha)T}_1 \quad - \\ \vdots \\ -\quad x^{(\alpha)T}_{n_\alpha} \quad - \end{pmatrix} \in \R^{n_1 \times n}$ and $\ya = \begin{pmatrix} y^{(\alpha)}_1 \\ \vdots \\ y^{(\alpha)}_{n_\alpha} \end{pmatrix}$,
%$\XA = \begin{pmatrix} -\quad x^{(A)T}_1 \quad - \\ \vdots \\ -\quad x^{(A)T}_{n_A} \quad - \end{pmatrix} \in \R^{n_A \times n}$ and $\yA = \begin{pmatrix} y^{(A)}_1 \\ \vdots \\ y^{(A)}_{n_A} \end{pmatrix}$,
\subsection{Conceptual comparison of MLKR and MLKRR}
The different nature of the similarity used by the KRR and NW estimators affects their respective regression performance. Given a data point $x$ in two different data distributions, the KRR definition of similarity between $x$ and a neighbour $a$ is independent of any other points in the distribution. On the other hand, the NW definition of similarity adapts to the distribution of data, so that a point $x$ is similar to $a$ if $a$ is the closest point to $x$ (see Figure \ref{fig:krr_vs_nw}a). Effectively, functions regressed with NW result in piece-wise surfaces resembling Voronoi diagrams, where each region of the feature space is dominated by the nearest data point. In contrast, KRR generates surfaces that transition smoothly, which generally results in much superior prediction performance. This is clearly observed in Figure \ref{fig:krr_vs_nw}b, which shows the result of a simple regression on a 2D feature space.
\begin{figure}
\centering
\includegraphics[width=1\textwidth]{Figures/krr_vs_nw.pdf}
\caption{a) Depiction of the different approaches to define similarity between KRR and NW. b) Target function depending on two features and the corresponding regressions based on the KRR and the NW estimators. The two models were trained on a limited sample of data points marked as crosses in the first plot. }
\label{fig:krr_vs_nw}
\end{figure}
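This qualitative difference can be reproduced on a one-dimensional toy problem; the short sketch below (purely illustrative, analogous in spirit to Figure \ref{fig:krr_vs_nw}b) evaluates both estimators on a dense grid after training on the same sparse sample. The constant kernel prefactor is omitted, as it cancels in NW and only rescales the KRR coefficients.
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(2)
x_train = np.sort(rng.uniform(0, 10, 15))[:, None]   # sparse training sample
y_train = np.sin(x_train[:, 0])
x_grid = np.linspace(0, 10, 200)[:, None]            # dense evaluation grid
sigma, lam = 0.5, 1e-9

K_new = np.exp(-((x_grid - x_train.T) ** 2) / sigma**2)    # k(x, x_j)
K_tr = np.exp(-((x_train - x_train.T) ** 2) / sigma**2)    # k(x_i, x_j)

# Nadaraya-Watson: locally weighted average of the training labels.
y_nw = (K_new @ y_train) / K_new.sum(axis=1)

# KRR: fixed coefficients alpha obtained once from the training kernel.
alpha = np.linalg.solve(K_tr + lam * np.eye(len(y_train)), y_train)
y_krr = K_new @ alpha
\end{verbatim}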
\section{Computational details}
\subsection{Dataset}
All models were built using a random subset of 24,000 molecules from the QM9 dataset \cite{ramakrishnan_quantum_2014} of 134,000 three-dimensional structures and corresponding atomization energies of small drug-like molecules (up to 9 heavy atoms). The initial set was then split into 20,000 for training, 2,000 for testing (fitting hyperparameters for the KRR, and tracking the accuracy of the models throughout optimization) and 2,000 for validation (used for the out-of-sample error in the learning curves).
\subsection{FCHL representation}
Molecules are represented using the Faber-Christensen-Huang-Lilienfeld representation \cite{christensen2020fchl} (the 2019 version, FCHL19) as implemented in the \texttt{qml} python package \cite{qml_code}, but we note that any other representation could have been used. The FCHL19 representation is effectively a computationally efficient, condensed variation of SLATM \cite{huang2020quantum}. The default settings were used for all of the FCHL19 parameters. Local representations were converted to global ones by summing all of the atomic contributions, such that the eventual representation of each molecule is a vector of 720 features.
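The conversion from local (per-atom) to global (per-molecule) representations is a plain summation over atoms; a schematic sketch, with random placeholders standing in for the per-atom FCHL19 matrices, is:
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(3)
# Placeholder per-atom representations: one (n_atoms, 720) array per molecule,
# standing in for the output of the FCHL19 routines of the qml package.
local_reps = [rng.random((12, 720)), rng.random((9, 720))]

# Global molecular representation: the sum of the atomic contributions.
global_reps = np.array([rep.sum(axis=0) for rep in local_reps])
print(global_reps.shape)   # (n_molecules, 720)
\end{verbatim}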
\subsection{Optimization details}
The MLKR and MLKRR algorithms were optimized until convergence, which required around 2,000 steps in both cases (see Figure \ref{fig:optimization}). The matrix $A$ was optimized with the function \texttt{minimize} of the SciPy\cite{jones2001scipy} library, using the L-BFGS-B\cite{boyd2004convex} algorithm. $A$ was initialized as the identity matrix. \\
For the MLKRR, we used $n_\alpha = n_A$, although some tests suggest that $n_\alpha < n_A$ could be superior. In order to reduce overfitting for the MLKRR, the training data was reshuffled into the $A$ and $\alpha$ sets every 30 optimization steps. The starting kernel width for the MLKRR was $\sigma = 55$, and the regularization parameter was fixed at $\lambda = 10^{-9}$. These parameters are the optimal values for the standard KRR, which were obtained using a grid search on the test set.
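Schematically, the optimization driver can be written as follows (a sketch with our own naming on toy data; the published module may be organized differently, and in practice the closed-form gradient \eqref{eq:grad} is supplied through the \texttt{jac} argument rather than relying on SciPy's finite differences):
\begin{verbatim}
import numpy as np
from scipy.optimize import minimize

def mlkrr_loss(a_flat, Xa, ya, XA, yA, sigma=1.0, lam=1e-9):
    """MLKRR loss for a flattened transformation matrix."""
    A = a_flat.reshape(Xa.shape[1], Xa.shape[1])
    def k(U, V):
        sq = (((U @ A.T)[:, None] - (V @ A.T)[None]) ** 2).sum(-1)
        return np.exp(-sq / sigma**2) / (np.sqrt(2 * np.pi) * sigma)
    alpha = np.linalg.solve(k(Xa, Xa) + lam * np.eye(len(ya)), ya)
    return np.sum((yA - k(XA, Xa) @ alpha) ** 2)

rng = np.random.default_rng(4)
d = 5                                   # 720 for the global FCHL19 vectors
Xa, XA = rng.normal(size=(40, d)), rng.normal(size=(40, d))
ya, yA = Xa[:, 0], XA[:, 0]             # toy targets

# A is initialized as the identity; without an explicit `jac`, SciPy falls
# back on finite differences, which is acceptable only at this toy size.
res = minimize(mlkrr_loss, np.eye(d).ravel(), args=(Xa, ya, XA, yA),
               method="L-BFGS-B", options={"maxiter": 50})
A_opt = res.x.reshape(d, d)
\end{verbatim}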
% (a comparison of the results with and without shuffling is shown in Figure \ref{fig:shuffle} of the SI).
\section{Results and Discussion}
\subsection{Learning transformation matrices}
Training the MLKR and MLKRR algorithms consists of optimizing the relevant transformation matrices $A$. Figure \ref{fig:optimization} illustrates the evolution of the optimization process for the two algorithms. Despite the noisier optimization of the MLKRR, which is a result of the periodic reshuffling of the datasets used to optimize $A$ and $\alpha$, both algorithms converge after approximately 2000 steps.
\begin{figure}[h!]
\centering
\includegraphics[width=1\textwidth]{Figures/optimisations.pdf}
\caption{Learning curves showing the evolution of the Mean Absolute Error (MAE) throughout the optimization process for our implementations of Metric Learning for Kernel Regression (MLKR) and Metric Learning for Kernel Ridge Regression (MLKRR). Both converge after around 2000 steps.}
\label{fig:optimization}
\end{figure}
After the optimization process, we obtain the optimized transformation matrices $A$, which are visualized in Figure \ref{fig:amats}. Both matrices $A_{\mathrm{MLKR}}$ and $A_{\mathrm{MLKRR}}$ contain many non-zero values, both along the diagonal and in off-diagonal terms. Modifications to diagonal terms are effectively a re-weighting of the original features, akin to the application-based re-weighting of terms by Ceriotti and co-workers \cite{willatt_feature_2018}. Here, however, the re-weighting is entirely learned by the algorithm. Diagonal terms $A_{ii}$ with absolute value larger than 1 indicate that a higher weight is applied to specific features, whereas terms smaller than 1 indicate that a lower weight is applied. $A_{ii}=0$ indicates that the feature is dropped entirely, reducing the dimensionality of the feature space. Off-diagonal terms $A_{ij}, i\neq j$, form linear combinations of the original features. The highly correlated nature of the FCHL features is illustrated by the fact that the transformation matrices are rather smooth (shown in the panels on the right of Figure \ref{fig:amats}). After convergence, the largest terms in the transformation matrices are the diagonal values. This is partially due to the fact that the initial guess for the optimization process is the identity matrix. It also indicates that the original features are useful to predict the target property, given an appropriate re-weighting (\textit{i.e.} a selection and amplification of relevant features for the target property). The selection of off-diagonal terms acts to combine features that are originally correlated, \textit{i.e.} generating new features. Thus, the metric learning process naturally applies a feature selection and extraction protocol for the target property.
\begin{figure}
\centering
\includegraphics[width=1\textwidth]{Figures/amats.eps}
\caption{Colormaps representing the learned transformation matrices $A$ for MLKR ($A_{\mathrm{MLKR}}$) and MLKRR ($A_{\mathrm{MLKRR}}$). The lefthand panels illustrate all matrix elements, whereas the righthand panels illustrate zoomed-in components. Both transformation matrices amplify the diagonal components in order to select relevant features for the target property, as well as choosing off-diagonal components to reduce the number of redundant features. The color scale is capped in the range [-2, 2] to facilitate the visualization of the off-diagonal elements (the highest absolute values of the matrices are below 5).}
\label{fig:amats}
\end{figure}
\subsection{Improving predictions of atomization energies of QM9 molecules}
With our trained transformation matrices in hand, we now proceed to evaluate the new feature space for the prediction of atomization energies of QM9 molecules. As we showed in our previous work \cite{gallarati2021reaction}, unless modified by feature selection or MLKR(R) protocols, the variance of features does not necessarily correlate with their predictive capabilities. Here, since we have modified the feature space and corresponding distance metric, we expect the variance to again correlate with the relevance of features for a target property. In Figure \ref{fig:variances}, the variances of the original FCHL features and of the modified FCHL\textsubscript{MLKR} and FCHL\textsubscript{MLKRR} features are shown. The original and transformed features do not represent exactly the same information, as the transformed features are constructed as linear combinations of the original ones. Nevertheless, the original features are still the dominant components, as seen in the diagonals of the $A$ matrices in Figure \ref{fig:amats}. As observed, FCHL\textsubscript{MLKRR} and, to a lesser extent, FCHL\textsubscript{MLKR} make use of almost all of the feature space. This suggests that the metric learning procedure effectively manipulates all of the features to eliminate irrelevance and redundancy. \\
\begin{figure}
\centering
\includegraphics[width=1\textwidth]{Figures/variances.pdf}
\caption{Feature variances of the original FCHL and the metric-learned alternatives (FCHL$_{\text{MLKR}}$ and FCHL$_{\text{MLKRR}}$) on the QM9 dataset (left), with a zoom on the [0,5] range of the $y$ axis
(right).}
\label{fig:variances}
\end{figure}
The essential goal of the metric learning procedure is to improve predictions on the target property. In Figure \ref{fig:learning}, we compare the relative capabilities of the distance metrics learned by MLKR and MLKRR to improve predictions in a KRR model. The learning curves illustrate the evolution of the MAE with increasing training data. In the left panel of Figure \ref{fig:learning}, we observe that the metric obtained with MLKRR offers a significant improvement over the original. The learning curve shows a faster decrease and the MAE of the final model is $\sim$30\% lower. On the other hand, the metric obtained with MLKR in fact performs worse than the original one. The learning curve is shifted upwards and the performance of the final model is $\sim$10\% worse. This suggests that the distance metric learned by the MLKR framework does not necessarily improve predictions made with KRR. Since the MLKR optimization procedure relies on the NW estimator, we suspect that the nature of this estimator, compared to that of KRR, results in a metric that is less suitable for regression tasks.
% make it more clear that FCHL blue is KRR no metric learning
\begin{figure}
\centering
\includegraphics[width=1\textwidth]{Figures/learning.pdf}
\caption{Learning curves with the evolution of the MAE on a validation set of 2k structures obtained using a KRR model (left) and distribution of errors in the same set obtained with a KRR constructed using 10000 data points from the training set (right), generated using the FCHL representation and the altered variations obtained with MLKR (FCHL\textsubscript{MLKR}) and MLKRR (FCHL\textsubscript{MLKRR}).}
\label{fig:learning}
\end{figure}
\subsection{Dimensionality reduction}
Similarity is not only used by supervised algorithms but also by a large number of unsupervised techniques, such as the dimensionality reduction methods t-Distributed Stochastic Neighbour Embedding (t-SNE) \cite{van2008visualizing}, PCA, multidimensional scaling and Isomap \cite{tenenbaum2000global}, sampling methods (\textit{e.g.}, Farthest Point Sampling) \cite{imbalzano_automatic_2018} and some clustering techniques. We illustrate the effects of our metric-learned similarity measures using t-SNE and PCA models by constructing 2D projections of the QM9 dataset using the feature spaces spanned by FCHL, FCHL$_{MLKR}$ (FCHL $\cdot \textbf{A}_{MLKR}^T$) and FCHL$_{MLKRR}$ (FCHL $\cdot \textbf{A}_{MLKRR}^T$). t-SNE projects data points into an $N$-dimensional space (typically $N=2$) by minimizing the difference of the pairwise affinities between points in the original and new spaces. The affinity between two points in t-SNE is defined as
\begin{equation}\label{eqn:tsne}
p_{ij}= e^{- \gamma d(x_i, x_j)^2} / \sum_{k \neq i} e^{- \gamma d(x_i, x_k)^2},
\end{equation}
akin to the weights of the kernel average for the predictions in the NW estimator of MLKR (see equation \ref{eq:NW}). PCA instead performs the projection such that the new dimensions are those with the highest variance in the original data. It does not rely on an explicit distance as in \textit{t}-SNE, but still selects the principal components by evaluating $\mathbf{X}^T \mathbf{X}$ (where $\mathbf{X}$ is the feature matrix), which plays a role analogous to that of a distance. \\
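In practice, such projections are obtained by applying the learned transformation to the features before running the standard routines. A minimal scikit-learn sketch (with placeholder data; the settings are not those used for Figure \ref{fig:tsnes}) is:
\begin{verbatim}
import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Placeholders: X would be the (n_molecules, 720) FCHL matrix and
# A the learned transformation (MLKR or MLKRR).
rng = np.random.default_rng(5)
X = rng.normal(size=(500, 20))
A = np.eye(20)

X_t = X @ A.T                       # metric-learned features FCHL . A^T

proj_pca = PCA(n_components=2).fit_transform(X_t)
proj_tsne = TSNE(n_components=2, init="pca").fit_transform(X_t)
\end{verbatim}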
The \textit{t}-SNE and PCA maps obtained with FCHL, FCHL$_{MLKR}$ and FCHL$_{MLKRR}$ are shown in Figure \ref{fig:tsnes}, where each point, representing a molecule, is colored by its excess atomization energy. Sharp differences between the maps are observed. For instance, the t-SNE and PCA maps obtained with the FCHL$_{\mathrm{MLKR}}$ features tend to organize into local clusters. Yet, there is no global coherence between the organization of the projected points and the colored target property. In contrast, the FCHL$_{\mathrm{MLKRR}}$ maps resemble more closely those of the original FCHL, albeit organized into larger regions offering a better coherence with respect to the target property. Overall, MLKRR improves the organization of the data to uncover the patterns (Fig. \ref{fig:tsnes}) relevant to optimizing the prediction of the targeted properties (Fig. \ref{fig:learning}). \\
\begin{figure}
\centering
\includegraphics[width=1\textwidth]{Figures/tsnes_pcas.pdf}
\caption{2D projections of FCHL, FCHL$_{\text{MLKR}}$ and FCHL$_{\text{MLKRR}}$ constructed using t-SNE and PCA algorithms (first and second row, respectively). The colormap shows the excess atomization energy for each compound.}
\label{fig:tsnes}
\end{figure}
The proposed algorithm includes additional functionalities that range beyond maximizing prediction accuracy in specific applications. Careful analysis of the learned transformation matrices provides valuable insights as to why some representations behave better than others \cite{faber_prediction_2017}. In addition, the algorithm offers a mathematical route to construct a hierarchy of molecular representations adaptable and tailored to specific targets, as an alternative to building representations that encode the relevant information in absolute terms \cite{musil2021physics}. Comparisons between the similarities and metrics learned for a wide range of applications might also uncover hidden trends useful for developing improved and more general molecular representations. Finally, MLKRR could also be exploited in transfer-learning or meta-learning approaches, a concept that has so far been limited to neural network applications \cite{brazdil2008metalearning}.
\section{Conclusions}
Molecular sciences use a large number of machine learning methods that rely upon the notion of similarity. This notion is inherently connected with the definition of a pre-defined distance metric, which is not always suited to characterize high-dimensional feature vectors containing irrelevant or redundant features. To address this shortcoming and circumvent the use of a pre-defined metric, we derive the Metric Learning for Kernel Ridge Regression (MLKRR) algorithm, which optimizes the prediction of a particular target by re-defining the Euclidean distance between points on the basis of the kernel ridge regression estimator. MLKRR was shown to offer improved performance ($\sim$30\%) on the prototypical task of regressing atomization energies of the QM9 dataset. In addition, we provide an insightful analysis of why the proposed algorithm is superior to possible alternatives based on the non-parametric Nadaraya–Watson (NW) estimator for regression tasks.
\section*{Acknowledgements}
R.F. and C.C. acknowledge funding from the European Research Council (ERC) under the European Union's Horizon 2020 research and innovation programme (Grant 817977). P.v.G. and C.C. acknowledge the National Centre of Competence in Research (NCCR) “Sustainable chemical process through catalysis (Catalysis)” of the Swiss National Science Foundation (SNSF, Grant No. 180544) for financial support. M.N.H., F.E. and C.C acknowledge the FSB Intrafaculty Seed Funding. We thank Alberto Fabrizio for helpful discussions and feedback on the manuscript.
\clearpage
\printbibliography
\newpage
\section{Appendix: Derivation of MLKR and MLKRR gradients}
\subsection{Derivation of the MLKR gradient in equation \eqref{eq:grad_mlkr}}
%Details on how to of the MLKR gradient are given hereafter.
Starting from the top, we find
\begin{align*}
\dA{\mathcal{L}} = 2 \sum_i (\yh_i - y_i) \dA{\yh_i},
\end{align*}
where $\yh_i = \frac{1}{Z_i} \sum_{j\neq i} k_{ij} y_j$ and $Z_i = \sum_{j \neq i} k_{ij}$.
This naturally leads us to compute $\dA{\yh_i}$, and hence $\dA{k_{ij}}$.
Firstly, the kernel derivative is
\begin{align}\label{eq:kerngrad}
\begin{split}
\dA{k_{ij}} &= -\frac{1}{\sigma^2} k_{ij} \dA{}\left(d(x_i,x_j)^2\right),\\
&= -\frac{2k_{ij}}{\sigma^2} A x_{ij}x_{ij}^T,
\end{split}
\end{align}
where $x_{ij} := x_i - x_j$.
Consequently, one has
\begin{align*}
\dA{\yh_i} &= \frac{1}{Z_i} \sum_{j\neq i} \dA{k_{ij}} y_j - \frac{1}{Z_i} \yh_i \sum_{j \neq i} \dA{k_{ij}}, \\
&= - \frac{2}{\sigma^2 Z_i} A \sum_{j\neq i} k_{ij} y_j x_{ij} x_{ij}^T + \frac{2}{\sigma^2 Z_i} A \yh_i \sum_{j\neq i} k_{ij} x_{ij} x_{ij}^T, \\
&= \frac{2}{\sigma^2 Z_i} A \sum_{j\neq i} k_{ij} (\yh_i - y_j) x_{ij} x_{ij}^T.
\end{align*}
from which the conclusion follows.
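As a sanity check, the analytic gradient \eqref{eq:grad_mlkr} can be compared against a numerical derivative of the loss; a small, self-contained finite-difference test (illustrative only) reads:
\begin{verbatim}
import numpy as np

def loss_and_parts(A, X, y, sigma):
    Z = X @ A.T
    sq = ((Z[:, None] - Z[None]) ** 2).sum(-1)
    K = np.exp(-sq / sigma**2) / (np.sqrt(2 * np.pi) * sigma)
    np.fill_diagonal(K, 0.0)
    y_hat = (K @ y) / K.sum(axis=1)
    return np.sum((y - y_hat) ** 2), K, y_hat

def mlkr_grad(A, X, y, sigma):
    _, K, y_hat = loss_and_parts(A, X, y, sigma)
    Zsum = K.sum(axis=1)
    # W_ij = (yhat_i - y_i)/Z_i * (yhat_i - y_j) * k_ij (zero on the diagonal)
    W = ((y_hat - y) / Zsum)[:, None] * (y_hat[:, None] - y[None, :]) * K
    D = X[:, None, :] - X[None, :, :]            # x_ij = x_i - x_j
    S = np.einsum('ij,ijk,ijl->kl', W, D, D)     # sum_ij W_ij x_ij x_ij^T
    return (4.0 / sigma**2) * A @ S

rng = np.random.default_rng(6)
X, y = rng.normal(size=(20, 3)), rng.normal(size=20)
A, sigma, eps = 0.5 * rng.normal(size=(3, 3)), 3.0, 1e-6

E = np.zeros_like(A); E[0, 1] = eps              # perturb a single entry
num = (loss_and_parts(A + E, X, y, sigma)[0]
       - loss_and_parts(A - E, X, y, sigma)[0]) / (2 * eps)
print(num, mlkr_grad(A, X, y, sigma)[0, 1])      # agree up to FD error
\end{verbatim}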
\subsection{Derivation of the MLKRR gradient of equation \eqref{eq:grad}}
In the following, the derivation of the gradient \eqref{eq:grad} and its precise definition are given.
The notations defined in section \ref{subsec:mlkr} will be reused along with the subsequent new ones.
We consider two different kernel matrices: one between $\Xa$ and itself and one between $\XA$ and $\Xa$ (seen as $K$ and $Q$ respectively in equation \eqref{eq:grad}). We also denote by $H$ the regularized kernel defining the coefficients $\alpha$ for kernel ridge regression.
To simplify notations, we let the index $i$ range over $\{1, \dots, n_A \}$, while the indices $j, a$, and $b$ range over $\{1, \dots, n_\alpha\}$. We further drop the superscripts on $x$ and $y$ when the indices suffice to determine them, and use the same letter $K$ for both kernels.
\begin{alignat*}{2}
k_{i,j} &:= \kernel{i}{j}, \qquad\qquad &&k_{j,j'} := \kernel{j}{j'}, \\
H &:= (K+\lambda I), &&\quad \ \alpha :=H^{-1} \ya.
\end{alignat*}
% To further simplify notations, we let the index $i$ range over $\{1, \dots, n_A \}$, while the indices $j, a$, and $b$ range over $\{1, \dots, n_\alpha\}$. We also lose the superscripts on $x$ and $y$ when the indices suffice to determine them.
As above, we start from the following expression
\begin{align*}
\dA{\mathcal{L}} = 2 \sum_i (\yh_i - y_i) \dA{\yh_i},
\end{align*}
where $\dA{\yh_i} = \sum_j \dA{}(\alpha_j k_{ij})$. Remembering that both $\alpha$ and $K$ depend on $A$, we therefore aim to compute $\dA{k_{ij}}$ and $\dA{\alpha_j}$.
Recall that the kernel derivative has already been computed in \eqref{eq:kerngrad}.
\begin{align*}
\dA{k_{ij}} = -\frac{2k_{ij}}{\sigma^2} A x_{ij}x_{ij}^T,
\end{align*}
where $x_{ij} := x_i - x_j$.
The second term requires treating the derivative of $\Hmo$, the details of which we omit. In fact, routine computation yields that
\begin{align*}
\dA{}(\Hmo)_{jj'} &= \frac{2}{\sigma^2} A \sum_{a,b} (\Hmo)_{ja} (\Hmo)_{bj'} k_{ab} x_{ab}x_{ab}^T,
\end{align*}
and hence that
\begin{align*}
\dA{\alpha_j} &= \frac{2}{\sigma^2} A \sum_{a,b} (\Hmo)_{ja} k_{ab} \alpha_b x_{ab}x_{ab}^T.
\end{align*}
Combining both derivatives gives the following expression for the gradient.
\begin{align}\label{eq:grad0}
\dA{\mathcal{L}} &= 2 \sum_{i,j} (\yh_i - y_i) \left[\dA{k_{ij}} \alpha_j + \dA{\alpha_j} k_{ij}\right], \nonumber \\
\begin{split}
&= -\frac{4}{\sigma^2} A \sum_{i,j} (\yh_i - y_i) \alpha_j k_{ij} x_{ij}x_{ij}^T \\
& \quad + \frac{4}{\sigma^2} A \sum_{a,b} k_{ab} \alpha_b \left[H^{-T} K^T (\yh - \yA)\right]_{a} x_{ab}x_{ab}^T.
\end{split}
\end{align}
%
We recall that $K$ has two different definitions depending on its indexing. To clarify, we let $K \in \R^{n_\alpha \times n_\alpha}$ be the kernel between $\Xa$ and itself, and $Q \in \R^{n_A \times n_\alpha}$ the kernel between $\XA$ and $\Xa$.
The lemma below allows us to turn this expression into matrix form. Indeed, note that both sums are of the form $\sum_{i,j} W_{ij} (x_i - x_j)(x_i-x_j)^T$, where $x_i$ and $x_j$ are rows of $\XA$ or $\Xa$ depending on their index.
\begin{lemma}\label{lem:2}
Consider two matrices $A$ and $B$ with rows $\{a_i\}$, $\{b_j\}$ of matching dimensions, and let $x_{ij} = a_i - b_j$. Set further the matrix
\begin{align}\label{eq:1}
\Sigma = \sum_{i,j} W_{ij} x_{ij}x_{ij}^T,
\end{align}
with some coefficients $W_{ij}$ making up a matrix $W$. Then
\begin{align*}
\Sigma = -A^T W B - B^T W^T A + A^T R A + B^T S B,
\end{align*}
where $R$ and $S$ are both diagonal matrices with $R_{ii} = \sum_j W_{ij}$, and $S_{jj} = \sum_i W_{ij}$.
\end{lemma}
Applying the lemma to both expressions in \eqref{eq:grad0} leads us to construct the matrices $W \in \R^{n_A \times n_\alpha}$ and $\tilde{W} \in \R^{n_\alpha \times n_\alpha}$ in the following way.
\begin{align*}
\begin{cases}
W_{ij} := (\yh_i - y_i) \alpha_j Q_{ij}, \\
\tilde{W}_{ab} := K_{ab} \alpha_b \left[H^{-T} Q^T (\yh - \yA)\right]_{a}.
\end{cases}
\end{align*}
Finally, the diagonal matrices $R, S, \tilde{R}, \tilde{S}$ given by the lemma conclude the derivation.
\begin{proof}[Proof of lemma \ref{lem:2}]
The right-hand side of \eqref{eq:1} splits into four sums which make up each term of the result.
\begin{align*}
\Sigma &= \sum_{i,j} W_{ij} (a_i - b_j)(a_i-b_j)^T, \\
&= \sum_{i} \left(\sum_j W_{ij}\right) a_i a_i^T + \sum_{j} \left(\sum_i W_{ij}\right) b_j b_j^T, \\
& \quad - \sum_{i,j} W_{ij} a_i b_j^T - \sum_{i,j} W_{ij} b_j a_i^T.
\end{align*}
Note that the last two terms are transposes of each other, so that only one has to be treated. Additionally, for a general matrix $M$ of appropriate dimensions, one can easily verify that
\begin{align*}
A^T M B = \sum_{i,j} M_{ij}a_i b_j^T.
\end{align*}
Taking $M=W, R$, and $S$ yields the desired result.
\end{proof}
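The identity in Lemma \ref{lem:2} can also be verified numerically; a short sketch (illustrative only) is:
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(7)
nA, na, d = 4, 6, 3
A = rng.normal(size=(nA, d))            # rows a_i
B = rng.normal(size=(na, d))            # rows b_j
W = rng.normal(size=(nA, na))

# Left-hand side: sum_ij W_ij (a_i - b_j)(a_i - b_j)^T
X = A[:, None, :] - B[None, :, :]
lhs = np.einsum('ij,ijk,ijl->kl', W, X, X)

# Right-hand side of the lemma
R = np.diag(W.sum(axis=1))
S = np.diag(W.sum(axis=0))
rhs = -A.T @ W @ B - B.T @ W.T @ A + A.T @ R @ A + B.T @ S @ B

assert np.allclose(lhs, rhs)
\end{verbatim}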
% \subsection*{Optimization of MLKRR}
% Figure \ref{fig:shuffle} illustrates the training of the MLKRR with and without shuffling the training data.
% \begin{figure}[h!]
% \centering
% \includegraphics[width=1\textwidth]{Figures/optimization_smooth.eps}
% \caption{Evolution of the MAE during the training of a MLKRR with shuffling (right) and without shuffling(left). As can be seen, without the shuffling the train MAE decreases monotonically while the test MAE eventually starts going up, indicating a possible overfitting. The shuffling prevents this, and at the same time generate a metric with a smaller gap between train and test MAE, which implies better generalization.}
% \label{fig:shuffle}
% \end{figure}
\end{document}
%%% Local Variables:
%%% mode: latex
%%% TeX-master: t
%%% End:
