diff --git a/GetCM.ipynb b/GetCM.ipynb index 8288f5a..a5cc204 100644 --- a/GetCM.ipynb +++ b/GetCM.ipynb @@ -1,310 +1,310 @@ { "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import qml " ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from glob import glob\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "target_xyzs = sorted(glob(\"targets/*.xyz\"))" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "database_xyzs = sorted(glob(\"qm7/*.xyz\"))" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "target_mols = [qml.Compound(x) for x in target_xyzs]" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "database_mols = [qml.Compound(x) for x in database_xyzs]" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "def get_CM(mol):\n", " ncharges = mol.nuclear_charges\n", " coords = mol.coordinates \n", " CM = np.zeros((len(coords), len(coords)))\n", " for i in range(len(coords)):\n", " for j in range(len(coords)):\n", " if i==j:\n", " CM[i,j] = 0.5 * ncharges[i]**2.4\n", " else:\n", " CM[i,j] = ncharges[i] * ncharges[j] / np.linalg.norm(coords[j] - coords[i])\n", " \n", " return ncharges, CM" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "mol = target_mols[0]" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "ncharges, CM = get_CM(mol)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "target_ncharges = []\n", "target_CMs = []\n", "for mol in target_mols: \n", " ncharge, CM = get_CM(mol)\n", " target_ncharges.append(ncharge)\n", " target_CMs.append(CM)" ] }, { "cell_type": "code", "execution_count": 13, 
"metadata": {}, "outputs": [], "source": [ "database_ncharges = []\n", "database_CMs = []\n", "for mol in database_mols:\n", " ncharge, CM = get_CM(mol)\n", " database_ncharges.append(ncharge)\n", " database_CMs.append(CM)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ ":1: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", " target_CMs = np.array(target_CMs)\n" ] } ], "source": [ "target_CMs = np.array(target_CMs)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ ":1: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. 
If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", " database_CMs = np.array(database_CMs)\n" ] } ], "source": [ "database_CMs = np.array(database_CMs)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "target_labels = [t.split(\"/\")[-1].split(\".xyz\")[0] for t in target_xyzs]" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "database_labels = [t.split(\"/\")[-1].split(\".xyz\")[0] for t in database_xyzs]" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "target_labels = np.array(target_labels)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "database_labels = np.array(database_labels)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ ":1: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", " target_ncharges = np.array(target_ncharges)\n" ] } ], "source": [ "target_ncharges = np.array(target_ncharges)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ ":1: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. 
If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", " database_ncharges = np.array(database_ncharges)\n" ] } ], "source": [ "database_ncharges = np.array(database_ncharges)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "np.savez(\"data.npz\", \n", " target_labels=target_labels, \n", " target_CMs=target_CMs, \n", " target_ncharges=target_ncharges,\n", " database_labels=database_labels, \n", " database_CMs=database_CMs,\n", " database_ncharges=database_ncharges)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "x = np.load(\"data.npz\", allow_pickle=True)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['target_labels',\n", " 'target_CMs',\n", " 'target_ncharges',\n", " 'database_labels',\n", " 'database_CMs',\n", " 'database_ncharges']" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x.files" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.3" + "version": "3.7.9" } }, "nbformat": 4, "nbformat_minor": 4 } diff --git a/GetCMAmons.ipynb b/GetCMAmons.ipynb index 0c789d2..3613466 100644 --- a/GetCMAmons.ipynb +++ b/GetCMAmons.ipynb @@ -1,312 +1,312 @@ { "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import qml " ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from glob import glob\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 3, "metadata": 
{}, "outputs": [], "source": [ "from rdkit import Chem" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "def read_sdf(sdf):\n", " with open(sdf, \"r\") as f:\n", " txt = f.read().rstrip()\n", " return txt" ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "def get_CM(sdf):\n", " mol = Chem.MolFromMolBlock(sdf)\n", " # rdkit molobj\n", " ncharges = [atom.GetAtomicNum() for atom in mol.GetAtoms()]\n", " conf = mol.GetConformer()\n", " coords = np.asarray(conf.GetPositions())\n", " CM = np.zeros((len(coords), len(coords)))\n", " for i in range(len(coords)):\n", " for j in range(len(coords)):\n", " if i==j:\n", " CM[i,j] = 0.5 * ncharges[i]**2.4\n", " else:\n", " CM[i,j] = ncharges[i] * ncharges[j] / np.linalg.norm(coords[j] - coords[i])\n", " \n", " return ncharges, CM" ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['targets/qm9.sdf', 'targets/vitc.sdf', 'targets/vitd.sdf']" ] }, - "execution_count": 29, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "target_sdfs = sorted(glob(\"targets/*.sdf\"))\n", "target_sdfs" ] }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "qm9_amons_files = sorted(glob(\"amons-qm9/*.sdf\"))" ] }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "qm9_amons_sdfs = [read_sdf(x) for x in qm9_amons_files]" ] }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "qm9_amons_ncharges = []\n", "qm9_amons_CMs = []\n", "for sdf in qm9_amons_sdfs:\n", " ncharge, CM = get_CM(sdf)\n", " qm9_amons_ncharges.append(ncharge)\n", " qm9_amons_CMs.append(CM)" ] }, { "cell_type": "code", - 
"execution_count": 34, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "qm9_amons_labels = [t.split(\"/\")[-1].split(\".sdf\")[0] for t in qm9_amons_files]" ] }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "vitc_amons_files = sorted(glob(\"amons-vitc/*.sdf\"))" ] }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "vitc_amons_sdfs = [read_sdf(x) for x in vitc_amons_files]" ] }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "vitc_amons_ncharges = []\n", "vitc_amons_CMs = []\n", "for sdf in vitc_amons_sdfs:\n", " ncharge, CM = get_CM(sdf)\n", " vitc_amons_ncharges.append(ncharge)\n", " vitc_amons_CMs.append(CM)" ] }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "vitc_amons_labels = [t.split(\"/\")[-1].split(\".sdf\")[0] for t in vitc_amons_files]" ] }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "vitd_amons_files = sorted(glob(\"amons-vitd/*.sdf\"))" ] }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "vitd_amons_sdfs = [read_sdf(x) for x in vitd_amons_files]" ] }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "vitd_amons_ncharges = []\n", "vitd_amons_CMs = []\n", "for sdf in vitd_amons_sdfs:\n", " ncharge, CM = get_CM(sdf)\n", " vitd_amons_ncharges.append(ncharge)\n", " vitd_amons_CMs.append(CM)" ] }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "vitd_amons_labels = [t.split(\"/\")[-1].split(\".sdf\")[0] for t in vitd_amons_files]" ] }, { "cell_type": "code", - "execution_count": null, + 
"execution_count": 19, "metadata": {}, "outputs": [], "source": [ "# np save " ] }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/puck/anaconda3/envs/rdkit/lib/python3.7/site-packages/numpy/core/_asarray.py:136: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", " return array(a, dtype, copy=False, order=order, subok=True)\n" ] } ], "source": [ "np.savez(\"amons_data.npz\", \n", " vitd_amons_labels=vitd_amons_labels,\n", " vitc_amons_labels=vitc_amons_labels,\n", " qm9_amons_labels=qm9_amons_labels,\n", " vitd_amons_ncharges=vitd_amons_ncharges,\n", " vitc_amons_ncharges=vitc_amons_ncharges,\n", " qm9_amons_ncharges=qm9_amons_ncharges,\n", " vitd_amons_CMs=vitd_amons_CMs,\n", " vitc_amons_CMs=vitc_amons_CMs,\n", " qm9_amons_CMs=qm9_amons_CMs)" ] }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "x = np.load(\"amons_data.npz\", allow_pickle=True)" ] }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['vitd_amons_labels',\n", " 'vitc_amons_labels',\n", " 'qm9_amons_labels',\n", " 'vitd_amons_ncharges',\n", " 'vitc_amons_ncharges',\n", " 'qm9_amons_ncharges',\n", " 'vitd_amons_CMs',\n", " 'vitc_amons_CMs',\n", " 'qm9_amons_CMs']" ] }, - "execution_count": 47, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x.files" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { 
"codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.3" + "version": "3.7.9" } }, "nbformat": 4, "nbformat_minor": 4 } diff --git a/GetSLATM.ipynb b/GetSLATM.ipynb new file mode 100644 index 0000000..36a3293 --- /dev/null +++ b/GetSLATM.ipynb @@ -0,0 +1,274 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import qml " + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from glob import glob\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "from rdkit import Chem" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "target_xyzs = sorted(glob(\"targets/*.xyz\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def read_sdf(sdf):\n", + " with open(sdf, \"r\") as f:\n", + " txt = f.read().rstrip()\n", + " return txt" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "def get_ncharges_coords(sdf):\n", + " mol = Chem.MolFromMolBlock(sdf)\n", + " #mol = Chem.AddHs(mol)\n", + " # rdkit molobj\n", + " ncharges = [atom.GetAtomicNum() for atom in mol.GetAtoms()]\n", + " conf = mol.GetConformer()\n", + " coords = np.asarray(conf.GetPositions())\n", + " return ncharges, coords" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['targets/qm9.sdf', 'targets/vitc.sdf', 'targets/vitd.sdf']" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "target_files = sorted(glob(\"targets/*.sdf\"))\n", + 
"target_files" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "target_sdfs = [read_sdf(x) for x in target_files]" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "conf_data = [get_ncharges_coords(x) for x in target_sdfs]" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "ncharges_list, coords_list = zip(*conf_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "# mbtypes separate to each target" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[8, 6, 6, 7, 6, 8, 8, 7, 6]" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ncharges_list[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/puck/anaconda3/envs/rdkit/lib/python3.7/site-packages/ipykernel_launcher.py:4: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. 
If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", + " after removing the cwd from sys.path.\n" + ] + } + ], + "source": [ + "target_reps = np.array(\n", + "[qml.representations.generate_slatm(coords_list[i], ncharges_list[i], \n", + " mbtypes=qml.representations.get_slatm_mbtypes([ncharges_list[i]]))\n", + "for i in range(len(ncharges_list))])" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(3121,)" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "target_reps[0].shape" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [], + "source": [ + "target_labels = [t.split(\"/\")[-1].split(\".sdf\")[0] for t in target_files]" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/puck/anaconda3/envs/rdkit/lib/python3.7/site-packages/numpy/core/_asarray.py:136: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. 
If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", + " return array(a, dtype, copy=False, order=order, subok=True)\n" + ] + } + ], + "source": [ + "np.savez(\"target_vector_data.npz\", \n", + " target_labels=target_labels, \n", + " target_reps=target_reps, \n", + " target_ncharges=ncharges_list,)" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [], + "source": [ + "x = np.load(\"target_vector_data.npz\", allow_pickle=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['target_labels', 'target_reps', 'target_ncharges']" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x.files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/GetCMAmons.ipynb b/GetSLATMAmons.ipynb similarity index 51% copy from GetCMAmons.ipynb copy to GetSLATMAmons.ipynb index 0c789d2..3e07e22 100644 --- a/GetCMAmons.ipynb +++ b/GetSLATMAmons.ipynb @@ -1,312 +1,430 @@ { "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import qml " ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from glob import glob\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "from rdkit import Chem" ] }, { "cell_type": "code", - "execution_count": 
20, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "def read_sdf(sdf):\n", " with open(sdf, \"r\") as f:\n", " txt = f.read().rstrip()\n", " return txt" ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ - "def get_CM(sdf):\n", + "def get_ncharges_coords(sdf):\n", " mol = Chem.MolFromMolBlock(sdf)\n", + " #mol = Chem.AddHs(mol)\n", " # rdkit molobj\n", " ncharges = [atom.GetAtomicNum() for atom in mol.GetAtoms()]\n", " conf = mol.GetConformer()\n", " coords = np.asarray(conf.GetPositions())\n", - " CM = np.zeros((len(coords), len(coords)))\n", - " for i in range(len(coords)):\n", - " for j in range(len(coords)):\n", - " if i==j:\n", - " CM[i,j] = 0.5 * ncharges[i]**2.4\n", - " else:\n", - " CM[i,j] = ncharges[i] * ncharges[j] / np.linalg.norm(coords[j] - coords[i])\n", - " \n", - " return ncharges, CM" + " return ncharges, coords" ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['targets/qm9.sdf', 'targets/vitc.sdf', 'targets/vitd.sdf']" ] }, - "execution_count": 29, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "target_sdfs = sorted(glob(\"targets/*.sdf\"))\n", "target_sdfs" ] }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "qm9_amons_files = sorted(glob(\"amons-qm9/*.sdf\"))" ] }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "qm9_amons_sdfs = [read_sdf(x) for x in qm9_amons_files]" ] }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 56, + "metadata": {}, + "outputs": [], + "source": [ + "conf_data = [get_ncharges_coords(x) for x in qm9_amons_sdfs]" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [], + "source": [ + "ncharges_list, 
coords_list = zip(*conf_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 58, "metadata": {}, "outputs": [], "source": [ - "qm9_amons_ncharges = []\n", - "qm9_amons_CMs = []\n", - "for sdf in qm9_amons_sdfs:\n", - " ncharge, CM = get_CM(sdf)\n", - " qm9_amons_ncharges.append(ncharge)\n", - " qm9_amons_CMs.append(CM)" + "qm9_ncharges = ncharges_list" ] }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 59, + "metadata": {}, + "outputs": [], + "source": [ + "mbtypes = qml.representations.get_slatm_mbtypes(ncharges_list)" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [], + "source": [ + "qm9_reps = [qml.representations.generate_slatm(coords_list[i], ncharges_list[i], mbtypes) for i in \n", + " range(len(ncharges_list))]" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [], + "source": [ + "qm9_reps = np.array(qm9_reps)" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(16, 3121)" + ] + }, + "execution_count": 85, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "qm9_reps.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 62, "metadata": {}, "outputs": [], "source": [ "qm9_amons_labels = [t.split(\"/\")[-1].split(\".sdf\")[0] for t in qm9_amons_files]" ] }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 63, "metadata": {}, "outputs": [], "source": [ "vitc_amons_files = sorted(glob(\"amons-vitc/*.sdf\"))" ] }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 64, "metadata": {}, "outputs": [], "source": [ "vitc_amons_sdfs = [read_sdf(x) for x in vitc_amons_files]" ] }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 65, "metadata": {}, "outputs": [], "source": [ - "vitc_amons_ncharges = []\n", - "vitc_amons_CMs = []\n", - "for sdf in vitc_amons_sdfs:\n", - " ncharge, 
CM = get_CM(sdf)\n", - " vitc_amons_ncharges.append(ncharge)\n", - " vitc_amons_CMs.append(CM)" + "conf_data = [get_ncharges_coords(x) for x in vitc_amons_sdfs]" ] }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 66, + "metadata": {}, + "outputs": [], + "source": [ + "ncharges_list, coords_list = zip(*conf_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [], + "source": [ + "vitc_ncharges = ncharges_list" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [], + "source": [ + "mbtypes = qml.representations.get_slatm_mbtypes(ncharges_list)" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [], + "source": [ + "vitc_reps = [qml.representations.generate_slatm(coords_list[i], ncharges_list[i], mbtypes) for i in \n", + " range(len(ncharges_list))]" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [], + "source": [ + "vitc_reps = np.array(vitc_reps)" + ] + }, + { + "cell_type": "code", + "execution_count": 71, "metadata": {}, "outputs": [], "source": [ "vitc_amons_labels = [t.split(\"/\")[-1].split(\".sdf\")[0] for t in vitc_amons_files]" ] }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 72, "metadata": {}, "outputs": [], "source": [ "vitd_amons_files = sorted(glob(\"amons-vitd/*.sdf\"))" ] }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 73, "metadata": {}, "outputs": [], "source": [ "vitd_amons_sdfs = [read_sdf(x) for x in vitd_amons_files]" ] }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 74, + "metadata": {}, + "outputs": [], + "source": [ + "conf_data = [get_ncharges_coords(x) for x in vitd_amons_sdfs]" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [], + "source": [ + "ncharges_list, coords_list = zip(*conf_data)" + ] + }, + { + "cell_type": "code", + 
"execution_count": 76, + "metadata": {}, + "outputs": [], + "source": [ + "vitd_ncharges = ncharges_list" + ] + }, + { + "cell_type": "code", + "execution_count": 77, "metadata": {}, "outputs": [], "source": [ - "vitd_amons_ncharges = []\n", - "vitd_amons_CMs = []\n", - "for sdf in vitd_amons_sdfs:\n", - " ncharge, CM = get_CM(sdf)\n", - " vitd_amons_ncharges.append(ncharge)\n", - " vitd_amons_CMs.append(CM)" + "mbtypes = qml.representations.get_slatm_mbtypes(ncharges_list)\n", + "vitd_reps = [qml.representations.generate_slatm(coords_list[i], ncharges_list[i], mbtypes) for i in range(len(ncharges_list))]" ] }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 80, + "metadata": {}, + "outputs": [], + "source": [ + "vitd_reps = np.array(vitd_reps)" + ] + }, + { + "cell_type": "code", + "execution_count": 78, "metadata": {}, "outputs": [], "source": [ "vitd_amons_labels = [t.split(\"/\")[-1].split(\".sdf\")[0] for t in vitd_amons_files]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# np save " ] }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 81, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/puck/anaconda3/envs/rdkit/lib/python3.7/site-packages/numpy/core/_asarray.py:136: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. 
If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", - " return array(a, dtype, copy=False, order=order, subok=True)\n" - ] - } - ], + "outputs": [], "source": [ - "np.savez(\"amons_data.npz\", \n", + "np.savez(\"amons_vector_data.npz\", \n", " vitd_amons_labels=vitd_amons_labels,\n", " vitc_amons_labels=vitc_amons_labels,\n", " qm9_amons_labels=qm9_amons_labels,\n", - " vitd_amons_ncharges=vitd_amons_ncharges,\n", - " vitc_amons_ncharges=vitc_amons_ncharges,\n", - " qm9_amons_ncharges=qm9_amons_ncharges,\n", - " vitd_amons_CMs=vitd_amons_CMs,\n", - " vitc_amons_CMs=vitc_amons_CMs,\n", - " qm9_amons_CMs=qm9_amons_CMs)" + " vitd_amons_ncharges=vitd_ncharges,\n", + " vitc_amons_ncharges=vitc_ncharges,\n", + " qm9_amons_ncharges=qm9_ncharges,\n", + " vitd_amons_slatms=vitd_reps,\n", + " vitc_amons_slatms=vitc_reps,\n", + " qm9_amons_slatms=qm9_reps)" ] }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 82, "metadata": {}, "outputs": [], "source": [ - "x = np.load(\"amons_data.npz\", allow_pickle=True)" + "x = np.load(\"amons_vector_data.npz\", allow_pickle=True)" ] }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 84, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['vitd_amons_labels',\n", " 'vitc_amons_labels',\n", " 'qm9_amons_labels',\n", " 'vitd_amons_ncharges',\n", " 'vitc_amons_ncharges',\n", " 'qm9_amons_ncharges',\n", - " 'vitd_amons_CMs',\n", - " 'vitc_amons_CMs',\n", - " 'qm9_amons_CMs']" + " 'vitd_amons_slatms',\n", + " 'vitc_amons_slatms',\n", + " 'qm9_amons_slatms']" ] }, - "execution_count": 47, + "execution_count": 84, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "x.files" + "list(x)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", 
"version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.3" + "version": "3.7.9" } }, "nbformat": 4, "nbformat_minor": 4 } diff --git a/amons_vector_data.npz b/amons_vector_data.npz new file mode 100644 index 0000000..3957ed2 Binary files /dev/null and b/amons_vector_data.npz differ diff --git a/target_vector_data.npz b/target_vector_data.npz new file mode 100644 index 0000000..7f76fb9 Binary files /dev/null and b/target_vector_data.npz differ