{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from glob import glob\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from rdkit import Chem" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "def read_sdf(sdf):\n", " with open(sdf, \"r\") as f:\n", " txt = f.read().rstrip()\n", " return txt" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "def get_ncharges_coords(sdf):\n", " mol = Chem.MolFromMolBlock(sdf)\n", " #mol = Chem.AddHs(mol)\n", " # rdkit molobj\n", " ncharges = [atom.GetAtomicNum() for atom in mol.GetAtoms()]\n", " conf = mol.GetConformer()\n", " coords = np.asarray(conf.GetPositions())\n", " return ncharges, coords" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "def cutoff_func(R_ij, central_cutoff=4.8, central_decay=1):\n", " if R_ij <= (central_cutoff - central_decay):\n", " func = 1.\n", " elif ((central_cutoff - central_decay) < R_ij) and (R_ij <= (central_cutoff + central_decay)):\n", " func = 0.5 * (1. + np.cos((np.pi * R_ij - central_cutoff + central_decay)/central_decay))\n", " else:\n", " func = 0.\n", " return func" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [], "source": [ "def get_atomic_CM(ncharges, coords, max_natoms, central_cutoff=4.8, central_decay=1):\n", " size = int((max_natoms + 1)*max_natoms / 2)\n", " rep = np.zeros((len(ncharges), size))\n", " \n", " # central atom loop\n", " for k in range(len(ncharges)):\n", " M = np.zeros((len(ncharges), len(ncharges)))\n", " for i in range(len(ncharges)):\n", " R_ik = np.linalg.norm(coords[i]-coords[k])\n", " # print('R_ik', R_ik)\n", " f_ik = cutoff_func(R_ik, central_cutoff=central_cutoff,\n", " central_decay=central_decay)\n", " for j in range(len(ncharges)):\n", " if i <=j:\n", " if i == j:\n", " M[i,j] = 0.5 * ncharges[i]**2.4 * f_ik**2\n", " M[j,i] = M[i,j]\n", "\n", " else:\n", " R_jk = np.linalg.norm(coords[j]-coords[k])\n", " # print('R_jk', R_jk)\n", " f_jk = cutoff_func(R_jk, central_cutoff=central_cutoff,\n", " central_decay=central_decay)\n", " R_ij = np.linalg.norm(coords[i]-coords[j])\n", " # print('R_ij', R_ij)\n", " f_ij = cutoff_func(R_ij, central_cutoff=central_cutoff,\n", " central_decay=central_decay)\n", " M[i,j] = (ncharges[i]*ncharges[j]/R_ij)*f_ik*f_jk*f_ij\n", " M[j,i] = M[i,j]\n", "\n", "\n", " # concat upper triangular and diagonal\n", " upper_triang = M[np.triu_indices(len(M))]\n", " s_upper_triang = np.sort(upper_triang)[::-1]\n", " \n", " # pad to full size\n", " n_zeros = size - len(s_upper_triang)\n", " zeros = np.zeros(n_zeros)\n", " rep[k] = np.concatenate((s_upper_triang, zeros))\n", "\n", " return rep" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['targets/qm9.sdf', 'targets/vitc.sdf', 'targets/vitd.sdf']" ] }, "execution_count": 66, "metadata": {}, "output_type": "execute_result" } ], "source": [ "target_sdfs = sorted(glob(\"targets/*.sdf\"))\n", "target_sdfs" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [], "source": [ "qm9_amons_files = sorted(glob(\"amons-qm9/*.sdf\"))" ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [], "source": [ "qm9_amons_sdfs = [read_sdf(x) for x in qm9_amons_files]" ] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [], "source": [ "conf_data = [get_ncharges_coords(x) for x in qm9_amons_sdfs]" ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [], "source": [ "ncharges_list, coords_list = zip(*conf_data)" ] }, { "cell_type": "code", "execution_count": 71, "metadata": {}, "outputs": [], "source": [ "qm9_ncharges = ncharges_list" ] }, { "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "([8],\n", " [6, 6],\n", " [6, 7],\n", " [8, 6],\n", " [6, 8, 8],\n", " [6, 8, 7],\n", " [6, 6, 7, 7],\n", " [8, 6, 6, 7],\n", " [8, 6, 6, 6],\n", " [6, 7, 6, 8],\n", " [6, 7, 6, 8, 8],\n", " [6, 8, 8, 7, 6],\n", " [8, 6, 6, 7, 7, 6],\n", " [8, 6, 6, 7, 6, 8],\n", " [8, 6, 6, 7, 6, 8, 8],\n", " [6, 7, 6, 8, 8, 7, 6])" ] }, "execution_count": 72, "metadata": {}, "output_type": "execute_result" } ], "source": [ "qm9_ncharges" ] }, { "cell_type": "code", "execution_count": 73, "metadata": {}, "outputs": [], "source": [ "qm9_reps = [np.array(get_atomic_CM(np.array(ncharges_list[i]),\n", " np.array(coords_list[i]), \n", " max_natoms=9))\n", " for i in range(len(ncharges_list))]" ] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/puck/anaconda3/envs/rdkit/lib/python3.7/site-packages/ipykernel_launcher.py:1: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", " \"\"\"Entry point for launching an IPython kernel.\n" ] } ], "source": [ "qm9_reps = np.array(qm9_reps)" ] }, { "cell_type": "code", "execution_count": 75, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(1, 45)" ] }, "execution_count": 75, "metadata": {}, "output_type": "execute_result" } ], "source": [ "qm9_reps[0].shape" ] }, { "cell_type": "code", "execution_count": 76, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[73.51669472, 73.51669472, 53.3587074 , 53.3587074 , 39.59747102,\n", " 38.24346354, 36.8581052 , 36.8581052 , 36.8581052 , 35.01513065,\n", " 32.97763233, 32.88982286, 30.62177746, 28.63343371, 24.38002738,\n", " 24.32694184, 23.45775915, 21.04930695, 17.96633265, 17.46140502,\n", " 17.44163762, 17.36912369, 17.26721941, 15.88104653, 15.79556841,\n", " 14.00224752, 13.37087415, 2.00479248, 0. , 0. ,\n", " 0. , 0. , 0. , 0. , 0. ,\n", " 0. , 0. , 0. , 0. , 0. ,\n", " 0. , 0. , 0. , 0. , 0. ],\n", " [73.51669472, 73.51669472, 53.3587074 , 53.3587074 , 39.59747102,\n", " 38.24346354, 36.8581052 , 36.8581052 , 36.8581052 , 35.01513065,\n", " 32.97763233, 32.88982286, 30.62177746, 28.63343371, 24.38002738,\n", " 24.32694184, 23.45775915, 21.04930695, 17.96633265, 17.46140502,\n", " 17.44163762, 17.36912369, 17.26721941, 15.88104653, 15.79556841,\n", " 14.00224752, 13.37087415, 2.00479248, 0. , 0. ,\n", " 0. , 0. , 0. , 0. , 0. ,\n", " 0. , 0. , 0. , 0. , 0. ,\n", " 0. , 0. , 0. , 0. , 0. ],\n", " [73.51669472, 73.51669472, 53.3587074 , 53.3587074 , 39.59747102,\n", " 38.24346354, 36.8581052 , 36.8581052 , 36.8581052 , 35.01513065,\n", " 32.97763233, 32.88982286, 30.62177746, 28.63343371, 24.38002738,\n", " 24.32694184, 23.45775915, 21.04930695, 17.96633265, 17.46140502,\n", " 17.44163762, 17.36912369, 17.26721941, 15.88104653, 15.79556841,\n", " 14.00224752, 13.37087415, 2.00479248, 0. , 0. ,\n", " 0. , 0. , 0. , 0. , 0. ,\n", " 0. , 0. , 0. , 0. , 0. ,\n", " 0. , 0. , 0. , 0. , 0. ],\n", " [73.51669472, 73.51669472, 53.3587074 , 53.3587074 , 39.59747102,\n", " 38.24346354, 36.8581052 , 36.8581052 , 35.01513065, 32.88982286,\n", " 30.62177746, 28.63343371, 24.32694184, 23.45775915, 17.96633265,\n", " 17.46140502, 17.36912369, 17.26721941, 15.88104653, 15.79556841,\n", " 14.00224752, 5.43166876, 4.01557734, 3.46698216, 2.87277137,\n", " 2.20228544, 0.99990932, 0.33020468, 0. , 0. ,\n", " 0. , 0. , 0. , 0. , 0. ,\n", " 0. , 0. , 0. , 0. , 0. ,\n", " 0. , 0. , 0. , 0. , 0. ],\n", " [73.51669472, 73.51669472, 53.3587074 , 53.3587074 , 39.59747102,\n", " 38.24346354, 36.8581052 , 36.8581052 , 36.8581052 , 35.01513065,\n", " 32.97763233, 32.88982286, 30.62177746, 28.63343371, 24.38002738,\n", " 24.32694184, 23.45775915, 21.04930695, 17.96633265, 17.46140502,\n", " 17.44163762, 17.36912369, 17.26721941, 15.88104653, 15.79556841,\n", " 14.00224752, 13.37087415, 2.00479248, 0. , 0. ,\n", " 0. , 0. , 0. , 0. , 0. ,\n", " 0. , 0. , 0. , 0. , 0. ,\n", " 0. , 0. , 0. , 0. , 0. ],\n", " [73.51669472, 73.51669472, 53.3587074 , 53.3587074 , 39.59747102,\n", " 38.24346354, 36.8581052 , 36.8581052 , 36.8581052 , 35.01513065,\n", " 32.97763233, 32.88982286, 30.62177746, 28.63343371, 24.38002738,\n", " 24.32694184, 23.45775915, 21.04930695, 17.96633265, 17.46140502,\n", " 17.44163762, 17.36912369, 17.26721941, 15.88104653, 15.79556841,\n", " 14.00224752, 13.37087415, 2.00479248, 0. , 0. ,\n", " 0. , 0. , 0. , 0. , 0. ,\n", " 0. , 0. , 0. , 0. , 0. ,\n", " 0. , 0. , 0. , 0. , 0. ],\n", " [73.51669472, 53.3587074 , 53.3587074 , 39.59747102, 36.8581052 ,\n", " 36.8581052 , 36.8581052 , 35.01513065, 32.97763233, 32.88982286,\n", " 30.62177746, 24.38002738, 23.45775915, 21.04930695, 17.96633265,\n", " 17.46140502, 17.44163762, 17.36912369, 17.26721941, 15.88104653,\n", " 13.37087415, 6.29899151, 4.71614595, 4.00683374, 2.60165116,\n", " 2.30627747, 1.99440606, 0.33020468, 0. , 0. ,\n", " 0. , 0. , 0. , 0. , 0. ,\n", " 0. , 0. , 0. , 0. , 0. ,\n", " 0. , 0. , 0. , 0. , 0. ]])" ] }, "execution_count": 76, "metadata": {}, "output_type": "execute_result" } ], "source": [ "qm9_reps[-1]" ] }, { "cell_type": "code", "execution_count": 77, "metadata": {}, "outputs": [], "source": [ "qm9_amons_labels = [t.split(\"/\")[-1].split(\".sdf\")[0] for t in qm9_amons_files]" ] }, { "cell_type": "code", "execution_count": 78, "metadata": {}, "outputs": [], "source": [ "vitc_amons_files = sorted(glob(\"amons-vitc/*.sdf\"))" ] }, { "cell_type": "code", "execution_count": 79, "metadata": {}, "outputs": [], "source": [ "vitc_amons_sdfs = [read_sdf(x) for x in vitc_amons_files]" ] }, { "cell_type": "code", "execution_count": 80, "metadata": {}, "outputs": [], "source": [ "conf_data = [get_ncharges_coords(x) for x in vitc_amons_sdfs]" ] }, { "cell_type": "code", "execution_count": 81, "metadata": {}, "outputs": [], "source": [ "ncharges_list, coords_list = zip(*conf_data)" ] }, { "cell_type": "code", "execution_count": 82, "metadata": {}, "outputs": [], "source": [ "vitc_ncharges = ncharges_list" ] }, { "cell_type": "code", "execution_count": 83, "metadata": {}, "outputs": [], "source": [ "vitc_reps = [np.array(get_atomic_CM(np.array(ncharges_list[i]), np.array(coords_list[i]), \n", " max_natoms=12)) for i in \n", " range(len(ncharges_list))]" ] }, { "cell_type": "code", "execution_count": 84, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/puck/anaconda3/envs/rdkit/lib/python3.7/site-packages/ipykernel_launcher.py:1: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", " \"\"\"Entry point for launching an IPython kernel.\n" ] } ], "source": [ "vitc_reps = np.array(vitc_reps)" ] }, { "cell_type": "code", "execution_count": 85, "metadata": {}, "outputs": [], "source": [ "vitc_amons_labels = [t.split(\"/\")[-1].split(\".sdf\")[0] for t in vitc_amons_files]" ] }, { "cell_type": "code", "execution_count": 86, "metadata": {}, "outputs": [], "source": [ "vitd_amons_files = sorted(glob(\"amons-vitd/*.sdf\"))" ] }, { "cell_type": "code", "execution_count": 87, "metadata": {}, "outputs": [], "source": [ "vitd_amons_sdfs = [read_sdf(x) for x in vitd_amons_files]" ] }, { "cell_type": "code", "execution_count": 88, "metadata": {}, "outputs": [], "source": [ "conf_data = [get_ncharges_coords(x) for x in vitd_amons_sdfs]" ] }, { "cell_type": "code", "execution_count": 89, "metadata": {}, "outputs": [], "source": [ "ncharges_list, coords_list = zip(*conf_data)" ] }, { "cell_type": "code", "execution_count": 90, "metadata": {}, "outputs": [], "source": [ "vitd_ncharges = ncharges_list" ] }, { "cell_type": "code", "execution_count": 91, "metadata": {}, "outputs": [], "source": [ "vitd_reps = [np.array(get_atomic_CM(np.array(ncharges_list[i]), np.array(coords_list[i]),\n", " max_natoms=28))\n", " for i in range(len(ncharges_list))]" ] }, { "cell_type": "code", "execution_count": 92, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/puck/anaconda3/envs/rdkit/lib/python3.7/site-packages/ipykernel_launcher.py:1: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", " \"\"\"Entry point for launching an IPython kernel.\n" ] } ], "source": [ "vitd_reps = np.array(vitd_reps)" ] }, { "cell_type": "code", "execution_count": 93, "metadata": {}, "outputs": [], "source": [ "vitd_amons_labels = [t.split(\"/\")[-1].split(\".sdf\")[0] for t in vitd_amons_files]" ] }, { "cell_type": "code", "execution_count": 94, "metadata": {}, "outputs": [], "source": [ "# np save " ] }, { "cell_type": "code", "execution_count": 95, "metadata": {}, "outputs": [], "source": [ "np.savez(\"amons_aCM_data.npz\", \n", " vitd_amons_labels=vitd_amons_labels,\n", " vitc_amons_labels=vitc_amons_labels,\n", " qm9_amons_labels=qm9_amons_labels,\n", " vitd_amons_ncharges=vitd_ncharges,\n", " vitc_amons_ncharges=vitc_ncharges,\n", " qm9_amons_ncharges=qm9_ncharges,\n", " vitd_amons_reps=vitd_reps,\n", " vitc_amons_reps=vitc_reps,\n", " qm9_amons_reps=qm9_reps)" ] }, { "cell_type": "code", "execution_count": 96, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(1, 406)" ] }, "execution_count": 96, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vitd_reps[0].shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.9" } }, "nbformat": 4, "nbformat_minor": 4 }