diff --git a/GetCM.ipynb b/GetCM.ipynb index a5cc204..3d127fd 100644 --- a/GetCM.ipynb +++ b/GetCM.ipynb @@ -1,310 +1,310 @@ { "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import qml " ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from glob import glob\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "target_xyzs = sorted(glob(\"targets/*.xyz\"))" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "database_xyzs = sorted(glob(\"qm7/*.xyz\"))" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "target_mols = [qml.Compound(x) for x in target_xyzs]" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "database_mols = [qml.Compound(x) for x in database_xyzs]" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "def get_CM(mol):\n", " ncharges = mol.nuclear_charges\n", " coords = mol.coordinates \n", " CM = np.zeros((len(coords), len(coords)))\n", " for i in range(len(coords)):\n", " for j in range(len(coords)):\n", " if i==j:\n", " CM[i,j] = 0.5 * ncharges[i]**2.4\n", " else:\n", " CM[i,j] = ncharges[i] * ncharges[j] / np.linalg.norm(coords[j] - coords[i])\n", " \n", " return ncharges, CM" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "mol = target_mols[0]" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "ncharges, CM = get_CM(mol)" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "target_ncharges = []\n", "target_CMs = []\n", "for mol in target_mols: \n", " ncharge, CM = get_CM(mol)\n", " target_ncharges.append(ncharge)\n", " target_CMs.append(CM)" ] }, { "cell_type": "code", 
- "execution_count": 13, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "database_ncharges = []\n", "database_CMs = []\n", "for mol in database_mols:\n", " ncharge, CM = get_CM(mol)\n", " database_ncharges.append(ncharge)\n", " database_CMs.append(CM)" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - ":1: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", - " target_CMs = np.array(target_CMs)\n" + "/home/puck/anaconda3/envs/aqml/lib/python3.7/site-packages/ipykernel_launcher.py:1: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", + " \"\"\"Entry point for launching an IPython kernel.\n" ] } ], "source": [ "target_CMs = np.array(target_CMs)" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - ":1: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", - " database_CMs = np.array(database_CMs)\n" + "/home/puck/anaconda3/envs/aqml/lib/python3.7/site-packages/ipykernel_launcher.py:1: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. 
If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", + " \"\"\"Entry point for launching an IPython kernel.\n" ] } ], "source": [ "database_CMs = np.array(database_CMs)" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "target_labels = [t.split(\"/\")[-1].split(\".xyz\")[0] for t in target_xyzs]" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "database_labels = [t.split(\"/\")[-1].split(\".xyz\")[0] for t in database_xyzs]" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "target_labels = np.array(target_labels)" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "database_labels = np.array(database_labels)" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - ":1: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", - " target_ncharges = np.array(target_ncharges)\n" + "/home/puck/anaconda3/envs/aqml/lib/python3.7/site-packages/ipykernel_launcher.py:1: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. 
If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", + " \"\"\"Entry point for launching an IPython kernel.\n" ] } ], "source": [ "target_ncharges = np.array(target_ncharges)" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - ":1: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", - " database_ncharges = np.array(database_ncharges)\n" + "/home/puck/anaconda3/envs/aqml/lib/python3.7/site-packages/ipykernel_launcher.py:1: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", + " \"\"\"Entry point for launching an IPython kernel.\n" ] } ], "source": [ "database_ncharges = np.array(database_ncharges)" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "np.savez(\"data.npz\", \n", " target_labels=target_labels, \n", " target_CMs=target_CMs, \n", " target_ncharges=target_ncharges,\n", " database_labels=database_labels, \n", " database_CMs=database_CMs,\n", " database_ncharges=database_ncharges)" ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "x = np.load(\"data.npz\", allow_pickle=True)" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['target_labels',\n", " 'target_CMs',\n", " 'target_ncharges',\n", " 'database_labels',\n", " 'database_CMs',\n", " 'database_ncharges']" ] 
}, - "execution_count": 24, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x.files" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.9" } }, "nbformat": 4, "nbformat_minor": 4 } diff --git a/GetSLATM.ipynb b/GetFCHL.ipynb similarity index 65% copy from GetSLATM.ipynb copy to GetFCHL.ipynb index a9af520..adcdd95 100644 --- a/GetSLATM.ipynb +++ b/GetFCHL.ipynb @@ -1,226 +1,257 @@ { "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "import qml " ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "from glob import glob\n", "import numpy as np" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "from rdkit import Chem" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 39, "metadata": {}, "outputs": [], "source": [ "target_xyzs = sorted(glob(\"targets/*.xyz\"))" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 40, "metadata": {}, "outputs": [], "source": [ "def read_sdf(sdf):\n", " with open(sdf, \"r\") as f:\n", " txt = f.read().rstrip()\n", " return txt" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 41, "metadata": {}, "outputs": [], "source": [ "def get_ncharges_coords(sdf):\n", " mol = Chem.MolFromMolBlock(sdf)\n", " #mol = Chem.AddHs(mol)\n", " # rdkit molobj\n", " ncharges = [atom.GetAtomicNum() for atom in mol.GetAtoms()]\n", " conf = mol.GetConformer()\n", " coords 
= np.asarray(conf.GetPositions())\n", " return ncharges, coords" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['targets/qm9.sdf', 'targets/vitc.sdf', 'targets/vitd.sdf']" ] }, - "execution_count": 7, + "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "target_files = sorted(glob(\"targets/*.sdf\"))\n", "target_files" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 43, "metadata": {}, "outputs": [], "source": [ "target_sdfs = [read_sdf(x) for x in target_files]" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 44, "metadata": {}, "outputs": [], "source": [ "conf_data = [get_ncharges_coords(x) for x in target_sdfs]" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 45, "metadata": {}, "outputs": [], "source": [ "ncharges_list, coords_list = zip(*conf_data)" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 46, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[9, 12, 28]" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# mbtypes separate to each target" + "sizes = [len(x) for x in ncharges_list]\n", + "sizes" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[array([6, 7, 8]), array([6, 8]), array([6, 8])]" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "elements_list = [np.unique(x) for x in ncharges_list]\n", + "elements_list" + ] + }, + { + "cell_type": "code", + "execution_count": 48, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/home/puck/anaconda3/envs/rdkit/lib/python3.7/site-packages/ipykernel_launcher.py:5: VisibleDeprecationWarning: Creating 
an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", + "/home/puck/anaconda3/envs/aqml/lib/python3.7/site-packages/ipykernel_launcher.py:5: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", " \"\"\"\n" ] } ], "source": [ "target_reps = np.array(\n", - "[np.array(qml.representations.generate_slatm(coords_list[i], ncharges_list[i], \n", - " mbtypes=qml.representations.get_slatm_mbtypes([ncharges_list[i]]),\n", - " local=True))\n", + "[np.array(qml.representations.generate_fchl_acsf(ncharges_list[i],\n", + " coords_list[i],\n", + " elements=elements_list[i]))\n", "for i in range(len(ncharges_list))])" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 49, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(28, 857)" + "(28, 168)" ] }, - "execution_count": 13, + "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "target_reps[2].shape" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 50, "metadata": {}, "outputs": [], "source": [ "target_labels = [t.split(\"/\")[-1].split(\".xyz\")[0] for t in target_sdfs]" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 51, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/puck/anaconda3/envs/rdkit/lib/python3.7/site-packages/numpy/core/_asarray.py:136: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. 
If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", - " return array(a, dtype, copy=False, order=order, subok=True)\n" - ] - } - ], + "outputs": [], "source": [ - "np.savez(\"target_vector_data.npz\", \n", + "np.savez(\"target_FCHL_data.npz\", \n", " target_labels=target_labels, \n", " target_reps=target_reps, \n", " target_ncharges=ncharges_list,)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.9" } }, "nbformat": 4, "nbformat_minor": 4 } diff --git a/GetSLATMAmons.ipynb b/GetFCHLAmons.ipynb similarity index 63% copy from GetSLATMAmons.ipynb copy to GetFCHLAmons.ipynb index 386fbcf..197a1c9 100644 --- a/GetSLATMAmons.ipynb +++ b/GetFCHLAmons.ipynb @@ -1,461 +1,431 @@ { "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 69, "metadata": {}, "outputs": [], "source": [ "import qml " ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 70, "metadata": {}, "outputs": [], "source": [ "from glob import glob\n", "import numpy as np" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 71, "metadata": {}, "outputs": [], "source": [ "from rdkit import Chem" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 72, "metadata": {}, "outputs": [], "source": [ "def read_sdf(sdf):\n", " with open(sdf, \"r\") as f:\n", " txt = f.read().rstrip()\n", " return txt" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 73, "metadata": {}, "outputs": [], "source": [ 
"def get_ncharges_coords(sdf):\n", " mol = Chem.MolFromMolBlock(sdf)\n", " #mol = Chem.AddHs(mol)\n", " # rdkit molobj\n", " ncharges = [atom.GetAtomicNum() for atom in mol.GetAtoms()]\n", " conf = mol.GetConformer()\n", " coords = np.asarray(conf.GetPositions())\n", " return ncharges, coords" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 74, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['targets/qm9.sdf', 'targets/vitc.sdf', 'targets/vitd.sdf']" ] }, - "execution_count": 6, + "execution_count": 74, "metadata": {}, "output_type": "execute_result" } ], "source": [ "target_sdfs = sorted(glob(\"targets/*.sdf\"))\n", "target_sdfs" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 75, "metadata": {}, "outputs": [], "source": [ "qm9_amons_files = sorted(glob(\"amons-qm9/*.sdf\"))" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 76, "metadata": {}, "outputs": [], "source": [ "qm9_amons_sdfs = [read_sdf(x) for x in qm9_amons_files]" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 77, "metadata": {}, "outputs": [], "source": [ "conf_data = [get_ncharges_coords(x) for x in qm9_amons_sdfs]" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 78, "metadata": {}, "outputs": [], "source": [ "ncharges_list, coords_list = zip(*conf_data)" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 79, "metadata": {}, "outputs": [], "source": [ "qm9_ncharges = ncharges_list" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 80, "metadata": {}, "outputs": [], "source": [ - "mbtypes = qml.representations.get_slatm_mbtypes(ncharges_list)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "qm9_reps = [np.array(qml.representations.generate_slatm(coords_list[i], ncharges_list[i], mbtypes,\n", - " local=True)) for i in \n", + "qm9_reps = 
[np.array(qml.representations.generate_fchl_acsf(\n", + " ncharges_list[i],\n", + " coords_list[i],\n", + " elements=[6,7,8])) for i in \n", " range(len(ncharges_list))]" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 81, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/home/puck/anaconda3/envs/rdkit/lib/python3.7/site-packages/ipykernel_launcher.py:1: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", + "/home/puck/anaconda3/envs/aqml/lib/python3.7/site-packages/ipykernel_launcher.py:1: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", " \"\"\"Entry point for launching an IPython kernel.\n" ] } ], "source": [ "qm9_reps = np.array(qm9_reps)" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 82, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(1, 3121)" + "(1, 312)" ] }, - "execution_count": 15, + "execution_count": 82, "metadata": {}, "output_type": "execute_result" } ], "source": [ "qm9_reps[0].shape" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 83, "metadata": {}, "outputs": [], "source": [ "qm9_amons_labels = [t.split(\"/\")[-1].split(\".sdf\")[0] for t in qm9_amons_files]" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 84, "metadata": {}, "outputs": [], "source": [ "vitc_amons_files = sorted(glob(\"amons-vitc/*.sdf\"))" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 85, "metadata": {}, "outputs": [], "source": [ "vitc_amons_sdfs = [read_sdf(x) for x 
in vitc_amons_files]" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 86, "metadata": {}, "outputs": [], "source": [ "conf_data = [get_ncharges_coords(x) for x in vitc_amons_sdfs]" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 87, "metadata": {}, "outputs": [], "source": [ "ncharges_list, coords_list = zip(*conf_data)" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 88, "metadata": {}, "outputs": [], "source": [ "vitc_ncharges = ncharges_list" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 89, "metadata": {}, "outputs": [], "source": [ - "mbtypes = qml.representations.get_slatm_mbtypes(ncharges_list)" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [], - "source": [ - "vitc_reps = [np.array(qml.representations.generate_slatm(coords_list[i], ncharges_list[i], \n", - " mbtypes, local=True)) for i in \n", + "vitc_reps = [np.array(qml.representations.generate_fchl_acsf(\n", + " ncharges_list[i],\n", + " coords_list[i],\n", + " elements=[6,8])) for i in \n", " range(len(ncharges_list))]" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 90, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/home/puck/anaconda3/envs/rdkit/lib/python3.7/site-packages/ipykernel_launcher.py:1: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", + "/home/puck/anaconda3/envs/aqml/lib/python3.7/site-packages/ipykernel_launcher.py:1: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. 
If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", " \"\"\"Entry point for launching an IPython kernel.\n" ] } ], "source": [ "vitc_reps = np.array(vitc_reps)" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 91, "metadata": {}, "outputs": [], "source": [ "vitc_amons_labels = [t.split(\"/\")[-1].split(\".sdf\")[0] for t in vitc_amons_files]" ] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 92, "metadata": {}, "outputs": [], "source": [ "vitd_amons_files = sorted(glob(\"amons-vitd/*.sdf\"))" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 93, "metadata": {}, "outputs": [], "source": [ "vitd_amons_sdfs = [read_sdf(x) for x in vitd_amons_files]" ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 94, "metadata": {}, "outputs": [], "source": [ "conf_data = [get_ncharges_coords(x) for x in vitd_amons_sdfs]" ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 95, "metadata": {}, "outputs": [], "source": [ "ncharges_list, coords_list = zip(*conf_data)" ] }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 96, "metadata": {}, "outputs": [], "source": [ "vitd_ncharges = ncharges_list" ] }, { "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [], - "source": [ - "mbtypes = qml.representations.get_slatm_mbtypes(ncharges_list)" - ] - }, - { - "cell_type": "code", - "execution_count": 32, + "execution_count": 97, "metadata": {}, "outputs": [], "source": [ - "vitd_reps = [np.array(qml.representations.generate_slatm(coords_list[i], ncharges_list[i], \n", - " mbtypes, local=True)) for i \n", + "vitd_reps = [np.array(qml.representations.generate_fchl_acsf(\n", + " ncharges_list[i],\n", + " coords_list[i],\n", + " elements=[6,8])) for i \n", " in range(len(ncharges_list))]" ] }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 98, "metadata": {}, "outputs": [ { 
"name": "stderr", "output_type": "stream", "text": [ - "/home/puck/anaconda3/envs/rdkit/lib/python3.7/site-packages/ipykernel_launcher.py:1: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", + "/home/puck/anaconda3/envs/aqml/lib/python3.7/site-packages/ipykernel_launcher.py:1: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", " \"\"\"Entry point for launching an IPython kernel.\n" ] } ], "source": [ "vitd_reps = np.array(vitd_reps)" ] }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 99, "metadata": {}, "outputs": [], "source": [ "vitd_amons_labels = [t.split(\"/\")[-1].split(\".sdf\")[0] for t in vitd_amons_files]" ] }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 66, "metadata": {}, "outputs": [], "source": [ "# np save " ] }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 100, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/puck/anaconda3/envs/rdkit/lib/python3.7/site-packages/numpy/core/_asarray.py:136: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. 
If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", - " return array(a, dtype, copy=False, order=order, subok=True)\n" - ] - } - ], + "outputs": [], "source": [ - "np.savez(\"amons_vector_data.npz\", \n", + "np.savez(\"amons_FCHL_data.npz\", \n", " vitd_amons_labels=vitd_amons_labels,\n", " vitc_amons_labels=vitc_amons_labels,\n", " qm9_amons_labels=qm9_amons_labels,\n", " vitd_amons_ncharges=vitd_ncharges,\n", " vitc_amons_ncharges=vitc_ncharges,\n", " qm9_amons_ncharges=qm9_ncharges,\n", - " vitd_amons_slatms=vitd_reps,\n", - " vitc_amons_slatms=vitc_reps,\n", - " qm9_amons_slatms=qm9_reps)" + " vitd_amons_reps=vitd_reps,\n", + " vitc_amons_reps=vitc_reps,\n", + " qm9_amons_reps=qm9_reps)" ] }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 101, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(1, 857)" + "(1, 168)" ] }, - "execution_count": 37, + "execution_count": 101, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vitd_reps[0].shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.9" } }, "nbformat": 4, "nbformat_minor": 4 } diff --git a/GetSLATM.ipynb b/GetSLATM.ipynb index a9af520..02d68db 100644 --- a/GetSLATM.ipynb +++ b/GetSLATM.ipynb @@ -1,226 +1,226 @@ { "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import qml " ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from glob import glob\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ 
"from rdkit import Chem" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "target_xyzs = sorted(glob(\"targets/*.xyz\"))" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "def read_sdf(sdf):\n", " with open(sdf, \"r\") as f:\n", " txt = f.read().rstrip()\n", " return txt" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "def get_ncharges_coords(sdf):\n", " mol = Chem.MolFromMolBlock(sdf)\n", " #mol = Chem.AddHs(mol)\n", " # rdkit molobj\n", " ncharges = [atom.GetAtomicNum() for atom in mol.GetAtoms()]\n", " conf = mol.GetConformer()\n", " coords = np.asarray(conf.GetPositions())\n", " return ncharges, coords" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['targets/qm9.sdf', 'targets/vitc.sdf', 'targets/vitd.sdf']" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "target_files = sorted(glob(\"targets/*.sdf\"))\n", "target_files" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "target_sdfs = [read_sdf(x) for x in target_files]" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "conf_data = [get_ncharges_coords(x) for x in target_sdfs]" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "ncharges_list, coords_list = zip(*conf_data)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# mbtypes separate to each target" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/home/puck/anaconda3/envs/rdkit/lib/python3.7/site-packages/ipykernel_launcher.py:5: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or 
ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", + "/home/puck/anaconda3/envs/aqml/lib/python3.7/site-packages/ipykernel_launcher.py:5: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", " \"\"\"\n" ] } ], "source": [ "target_reps = np.array(\n", "[np.array(qml.representations.generate_slatm(coords_list[i], ncharges_list[i], \n", " mbtypes=qml.representations.get_slatm_mbtypes([ncharges_list[i]]),\n", " local=True))\n", "for i in range(len(ncharges_list))])" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(28, 857)" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "target_reps[2].shape" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "target_labels = [t.split(\"/\")[-1].split(\".xyz\")[0] for t in target_sdfs]" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/home/puck/anaconda3/envs/rdkit/lib/python3.7/site-packages/numpy/core/_asarray.py:136: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", + "/home/puck/anaconda3/envs/aqml/lib/python3.7/site-packages/numpy/core/_asarray.py:136: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. 
If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", " return array(a, dtype, copy=False, order=order, subok=True)\n" ] } ], "source": [ - "np.savez(\"target_vector_data.npz\", \n", + "np.savez(\"target_SLATM_data.npz\", \n", " target_labels=target_labels, \n", " target_reps=target_reps, \n", " target_ncharges=ncharges_list,)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.9" } }, "nbformat": 4, "nbformat_minor": 4 } diff --git a/GetSLATMAmons.ipynb b/GetSLATMAmons.ipynb index 386fbcf..198c63a 100644 --- a/GetSLATMAmons.ipynb +++ b/GetSLATMAmons.ipynb @@ -1,461 +1,452 @@ { "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "import qml " ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "from glob import glob\n", "import numpy as np" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 39, "metadata": {}, "outputs": [], "source": [ "from rdkit import Chem" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 40, "metadata": {}, "outputs": [], "source": [ "def read_sdf(sdf):\n", " with open(sdf, \"r\") as f:\n", " txt = f.read().rstrip()\n", " return txt" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 41, "metadata": {}, "outputs": [], "source": [ "def get_ncharges_coords(sdf):\n", " mol = Chem.MolFromMolBlock(sdf)\n", " #mol = Chem.AddHs(mol)\n", " # rdkit molobj\n", " ncharges = [atom.GetAtomicNum() for atom in mol.GetAtoms()]\n", " conf = 
mol.GetConformer()\n", " coords = np.asarray(conf.GetPositions())\n", " return ncharges, coords" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['targets/qm9.sdf', 'targets/vitc.sdf', 'targets/vitd.sdf']" ] }, - "execution_count": 6, + "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "target_sdfs = sorted(glob(\"targets/*.sdf\"))\n", "target_sdfs" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 43, "metadata": {}, "outputs": [], "source": [ "qm9_amons_files = sorted(glob(\"amons-qm9/*.sdf\"))" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 44, "metadata": {}, "outputs": [], "source": [ "qm9_amons_sdfs = [read_sdf(x) for x in qm9_amons_files]" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 45, "metadata": {}, "outputs": [], "source": [ "conf_data = [get_ncharges_coords(x) for x in qm9_amons_sdfs]" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 46, "metadata": {}, "outputs": [], "source": [ "ncharges_list, coords_list = zip(*conf_data)" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 47, "metadata": {}, "outputs": [], "source": [ "qm9_ncharges = ncharges_list" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 48, "metadata": {}, "outputs": [], "source": [ "mbtypes = qml.representations.get_slatm_mbtypes(ncharges_list)" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 49, "metadata": {}, "outputs": [], "source": [ "qm9_reps = [np.array(qml.representations.generate_slatm(coords_list[i], ncharges_list[i], mbtypes,\n", " local=True)) for i in \n", " range(len(ncharges_list))]" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 50, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - 
"/home/puck/anaconda3/envs/rdkit/lib/python3.7/site-packages/ipykernel_launcher.py:1: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", + "/home/puck/anaconda3/envs/aqml/lib/python3.7/site-packages/ipykernel_launcher.py:1: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", " \"\"\"Entry point for launching an IPython kernel.\n" ] } ], "source": [ "qm9_reps = np.array(qm9_reps)" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 51, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(1, 3121)" ] }, - "execution_count": 15, + "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "qm9_reps[0].shape" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 52, "metadata": {}, "outputs": [], "source": [ "qm9_amons_labels = [t.split(\"/\")[-1].split(\".sdf\")[0] for t in qm9_amons_files]" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 53, "metadata": {}, "outputs": [], "source": [ "vitc_amons_files = sorted(glob(\"amons-vitc/*.sdf\"))" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 54, "metadata": {}, "outputs": [], "source": [ "vitc_amons_sdfs = [read_sdf(x) for x in vitc_amons_files]" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 55, "metadata": {}, "outputs": [], "source": [ "conf_data = [get_ncharges_coords(x) for x in vitc_amons_sdfs]" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 56, "metadata": {}, "outputs": [], "source": [ "ncharges_list, 
coords_list = zip(*conf_data)" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 57, "metadata": {}, "outputs": [], "source": [ "vitc_ncharges = ncharges_list" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 58, "metadata": {}, "outputs": [], "source": [ "mbtypes = qml.representations.get_slatm_mbtypes(ncharges_list)" ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 59, "metadata": {}, "outputs": [], "source": [ "vitc_reps = [np.array(qml.representations.generate_slatm(coords_list[i], ncharges_list[i], \n", " mbtypes, local=True)) for i in \n", " range(len(ncharges_list))]" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 60, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/home/puck/anaconda3/envs/rdkit/lib/python3.7/site-packages/ipykernel_launcher.py:1: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", + "/home/puck/anaconda3/envs/aqml/lib/python3.7/site-packages/ipykernel_launcher.py:1: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. 
If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", " \"\"\"Entry point for launching an IPython kernel.\n" ] } ], "source": [ "vitc_reps = np.array(vitc_reps)" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 61, "metadata": {}, "outputs": [], "source": [ "vitc_amons_labels = [t.split(\"/\")[-1].split(\".sdf\")[0] for t in vitc_amons_files]" ] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 62, "metadata": {}, "outputs": [], "source": [ "vitd_amons_files = sorted(glob(\"amons-vitd/*.sdf\"))" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 63, "metadata": {}, "outputs": [], "source": [ "vitd_amons_sdfs = [read_sdf(x) for x in vitd_amons_files]" ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 64, "metadata": {}, "outputs": [], "source": [ "conf_data = [get_ncharges_coords(x) for x in vitd_amons_sdfs]" ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 65, "metadata": {}, "outputs": [], "source": [ "ncharges_list, coords_list = zip(*conf_data)" ] }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 66, "metadata": {}, "outputs": [], "source": [ "vitd_ncharges = ncharges_list" ] }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 67, "metadata": {}, "outputs": [], "source": [ "mbtypes = qml.representations.get_slatm_mbtypes(ncharges_list)" ] }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 68, "metadata": {}, "outputs": [], "source": [ "vitd_reps = [np.array(qml.representations.generate_slatm(coords_list[i], ncharges_list[i], \n", " mbtypes, local=True)) for i \n", " in range(len(ncharges_list))]" ] }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 69, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/home/puck/anaconda3/envs/rdkit/lib/python3.7/site-packages/ipykernel_launcher.py:1: 
VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", + "/home/puck/anaconda3/envs/aqml/lib/python3.7/site-packages/ipykernel_launcher.py:1: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", " \"\"\"Entry point for launching an IPython kernel.\n" ] } ], "source": [ "vitd_reps = np.array(vitd_reps)" ] }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 70, "metadata": {}, "outputs": [], "source": [ "vitd_amons_labels = [t.split(\"/\")[-1].split(\".sdf\")[0] for t in vitd_amons_files]" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "# np save " ] }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 71, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/puck/anaconda3/envs/rdkit/lib/python3.7/site-packages/numpy/core/_asarray.py:136: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. 
If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", - " return array(a, dtype, copy=False, order=order, subok=True)\n" - ] - } - ], + "outputs": [], "source": [ - "np.savez(\"amons_vector_data.npz\", \n", + "np.savez(\"amons_SLATM_data.npz\", \n", " vitd_amons_labels=vitd_amons_labels,\n", " vitc_amons_labels=vitc_amons_labels,\n", " qm9_amons_labels=qm9_amons_labels,\n", " vitd_amons_ncharges=vitd_ncharges,\n", " vitc_amons_ncharges=vitc_ncharges,\n", " qm9_amons_ncharges=qm9_ncharges,\n", - " vitd_amons_slatms=vitd_reps,\n", - " vitc_amons_slatms=vitc_reps,\n", - " qm9_amons_slatms=qm9_reps)" + " vitd_amons_reps=vitd_reps,\n", + " vitc_amons_reps=vitc_reps,\n", + " qm9_amons_reps=qm9_reps)" ] }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 72, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(1, 857)" ] }, - "execution_count": 37, + "execution_count": 72, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vitd_reps[0].shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.9" } }, "nbformat": 4, "nbformat_minor": 4 } diff --git a/GetSLATM.ipynb b/GetSOAP.ipynb similarity index 57% copy from GetSLATM.ipynb copy to GetSOAP.ipynb index a9af520..cf97d8e 100644 --- a/GetSLATM.ipynb +++ b/GetSOAP.ipynb @@ -1,226 +1,285 @@ { "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ - "import qml " + "from dscribe.descriptors import SOAP" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 6, "metadata": {}, "outputs": [], 
"source": [ "from glob import glob\n", "import numpy as np" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "from rdkit import Chem" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "target_xyzs = sorted(glob(\"targets/*.xyz\"))" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "def read_sdf(sdf):\n", " with open(sdf, \"r\") as f:\n", " txt = f.read().rstrip()\n", " return txt" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "def get_ncharges_coords(sdf):\n", " mol = Chem.MolFromMolBlock(sdf)\n", " #mol = Chem.AddHs(mol)\n", " # rdkit molobj\n", " ncharges = [atom.GetAtomicNum() for atom in mol.GetAtoms()]\n", + " elements = [atom.GetSymbol() for atom in mol.GetAtoms()]\n", " conf = mol.GetConformer()\n", " coords = np.asarray(conf.GetPositions())\n", - " return ncharges, coords" + " return ncharges, elements, coords" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "import ase" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "def get_soap(sdf, elements=[6,7,8]):\n", + " ncharges, atomtypes, coords = get_ncharges_coords(sdf)\n", + " atomsobj = ase.Atoms(symbols=atomtypes, positions=coords)\n", + " soap = SOAP(\n", + " species=elements,\n", + " rcut=5.0,\n", + " nmax=8,\n", + " lmax=8,\n", + " sigma=0.2,\n", + " periodic=False,\n", + " crossover=True,\n", + " sparse=False,\n", + " )\n", + " return soap.create(atomsobj)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['targets/qm9.sdf', 'targets/vitc.sdf', 'targets/vitd.sdf']" ] }, - "execution_count": 7, + "execution_count": 
19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "target_files = sorted(glob(\"targets/*.sdf\"))\n", "target_files" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "target_sdfs = [read_sdf(x) for x in target_files]" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "conf_data = [get_ncharges_coords(x) for x in target_sdfs]" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ - "ncharges_list, coords_list = zip(*conf_data)" + "ncharges_list, _, _ = zip(*conf_data)" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 23, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[array([6, 7, 8]), array([6, 8]), array([6, 8])]" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# mbtypes separate to each target" + "elements_list = [np.unique(x) for x in ncharges_list]\n", + "elements_list" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 30, "metadata": {}, "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/puck/anaconda3/envs/rdkit/lib/python3.7/site-packages/ipykernel_launcher.py:5: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. 
If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", - " \"\"\"\n" - ] + "data": { + "text/plain": [ + "[9, 12, 28]" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "target_reps = np.array(\n", - "[np.array(qml.representations.generate_slatm(coords_list[i], ncharges_list[i], \n", - " mbtypes=qml.representations.get_slatm_mbtypes([ncharges_list[i]]),\n", - " local=True))\n", - "for i in range(len(ncharges_list))])" + "sizes = [len(x) for x in ncharges_list]\n", + "sizes" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "target_reps = [get_soap(target_sdfs[i], elements=elements_list[i]) for i in range(len(target_sdfs))]" + ] + }, + { + "cell_type": "code", + "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(28, 857)" + "(9, 2700)" ] }, - "execution_count": 13, + "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "target_reps[2].shape" + "target_reps[0].shape" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "target_labels = [t.split(\"/\")[-1].split(\".xyz\")[0] for t in target_sdfs]" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 34, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/home/puck/anaconda3/envs/rdkit/lib/python3.7/site-packages/numpy/core/_asarray.py:136: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. 
If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", + "/home/puck/anaconda3/envs/aqml/lib/python3.7/site-packages/numpy/core/_asarray.py:136: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", " return array(a, dtype, copy=False, order=order, subok=True)\n" ] } ], "source": [ - "np.savez(\"target_vector_data.npz\", \n", + "np.savez(\"target_SOAP_data.npz\", \n", " target_labels=target_labels, \n", " target_reps=target_reps, \n", " target_ncharges=ncharges_list,)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.9" } }, "nbformat": 4, "nbformat_minor": 4 } diff --git a/GetSLATMAmons.ipynb b/GetSOAPAmons.ipynb similarity index 70% copy from GetSLATMAmons.ipynb copy to GetSOAPAmons.ipynb index 386fbcf..54fae95 100644 --- a/GetSLATMAmons.ipynb +++ b/GetSOAPAmons.ipynb @@ -1,461 +1,451 @@ { "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ - "import qml " + "from dscribe.descriptors import SOAP" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from glob import glob\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "from rdkit import Chem" ] }, { "cell_type": "code", "execution_count": 
4, "metadata": {}, "outputs": [], "source": [ "def read_sdf(sdf):\n", " with open(sdf, \"r\") as f:\n", " txt = f.read().rstrip()\n", " return txt" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "def get_ncharges_coords(sdf):\n", " mol = Chem.MolFromMolBlock(sdf)\n", " #mol = Chem.AddHs(mol)\n", " # rdkit molobj\n", " ncharges = [atom.GetAtomicNum() for atom in mol.GetAtoms()]\n", + " elements = [atom.GetSymbol() for atom in mol.GetAtoms()]\n", " conf = mol.GetConformer()\n", " coords = np.asarray(conf.GetPositions())\n", - " return ncharges, coords" + " return ncharges, elements, coords" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, + "outputs": [], + "source": [ + "import ase" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "def get_soap(sdf, elements=[6,7,8]):\n", + " ncharges, atomtypes, coords = get_ncharges_coords(sdf)\n", + " atomsobj = ase.Atoms(symbols=atomtypes, positions=coords)\n", + " soap = SOAP(\n", + " species=elements,\n", + " rcut=5.0,\n", + " nmax=8,\n", + " lmax=8,\n", + " sigma=0.2,\n", + " periodic=False,\n", + " crossover=True,\n", + " sparse=False,\n", + " )\n", + " return soap.create(atomsobj)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['targets/qm9.sdf', 'targets/vitc.sdf', 'targets/vitd.sdf']" ] }, - "execution_count": 6, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "target_sdfs = sorted(glob(\"targets/*.sdf\"))\n", "target_sdfs" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "qm9_amons_files = sorted(glob(\"amons-qm9/*.sdf\"))" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "qm9_amons_sdfs = [read_sdf(x) for x in qm9_amons_files]" ] }, { 
"cell_type": "code", - "execution_count": 9, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "conf_data = [get_ncharges_coords(x) for x in qm9_amons_sdfs]" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ - "ncharges_list, coords_list = zip(*conf_data)" + "ncharges_list, _, _ = zip(*conf_data)" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "qm9_ncharges = ncharges_list" ] }, { "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "mbtypes = qml.representations.get_slatm_mbtypes(ncharges_list)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ - "qm9_reps = [np.array(qml.representations.generate_slatm(coords_list[i], ncharges_list[i], mbtypes,\n", - " local=True)) for i in \n", - " range(len(ncharges_list))]" + "qm9_reps = [get_soap(x, elements=[6,7,8]) for x in qm9_amons_sdfs]" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/home/puck/anaconda3/envs/rdkit/lib/python3.7/site-packages/ipykernel_launcher.py:1: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", + "/home/puck/anaconda3/envs/aqml/lib/python3.7/site-packages/ipykernel_launcher.py:1: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. 
If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", " \"\"\"Entry point for launching an IPython kernel.\n" ] } ], "source": [ "qm9_reps = np.array(qm9_reps)" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(1, 3121)" + "(1, 2700)" ] }, - "execution_count": 15, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "qm9_reps[0].shape" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "qm9_amons_labels = [t.split(\"/\")[-1].split(\".sdf\")[0] for t in qm9_amons_files]" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "vitc_amons_files = sorted(glob(\"amons-vitc/*.sdf\"))" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "vitc_amons_sdfs = [read_sdf(x) for x in vitc_amons_files]" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "conf_data = [get_ncharges_coords(x) for x in vitc_amons_sdfs]" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ - "ncharges_list, coords_list = zip(*conf_data)" + "ncharges_list, _, _ = zip(*conf_data)" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "vitc_ncharges = ncharges_list" ] }, { "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [], - "source": [ - "mbtypes = qml.representations.get_slatm_mbtypes(ncharges_list)" - ] - }, - { - "cell_type": "code", - "execution_count": 23, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ - "vitc_reps = [np.array(qml.representations.generate_slatm(coords_list[i], 
ncharges_list[i], \n", - " mbtypes, local=True)) for i in \n", - " range(len(ncharges_list))]" + "vitc_reps = [get_soap(x, elements=[6,8]) for x in vitc_amons_sdfs]" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/home/puck/anaconda3/envs/rdkit/lib/python3.7/site-packages/ipykernel_launcher.py:1: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", + "/home/puck/anaconda3/envs/aqml/lib/python3.7/site-packages/ipykernel_launcher.py:1: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", " \"\"\"Entry point for launching an IPython kernel.\n" ] } ], "source": [ "vitc_reps = np.array(vitc_reps)" ] }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [], - "source": [ - "vitc_amons_labels = [t.split(\"/\")[-1].split(\".sdf\")[0] for t in vitc_amons_files]" - ] - }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ - "vitd_amons_files = sorted(glob(\"amons-vitd/*.sdf\"))" + "vitc_amons_labels = [t.split(\"/\")[-1].split(\".sdf\")[0] for t in vitc_amons_files]" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ - "vitd_amons_sdfs = [read_sdf(x) for x in vitd_amons_files]" + "vitd_amons_files = sorted(glob(\"amons-vitd/*.sdf\"))" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ - "conf_data = [get_ncharges_coords(x) for x in vitd_amons_sdfs]" + "vitd_amons_sdfs = [read_sdf(x) 
for x in vitd_amons_files]" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ - "ncharges_list, coords_list = zip(*conf_data)" + "conf_data = [get_ncharges_coords(x) for x in vitd_amons_sdfs]" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ - "vitd_ncharges = ncharges_list" + "ncharges_list, _, _ = zip(*conf_data)" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ - "mbtypes = qml.representations.get_slatm_mbtypes(ncharges_list)" + "vitd_ncharges = ncharges_list" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ - "vitd_reps = [np.array(qml.representations.generate_slatm(coords_list[i], ncharges_list[i], \n", - " mbtypes, local=True)) for i \n", - " in range(len(ncharges_list))]" + "vitd_reps = [get_soap(x, elements=[6,8]) for x in vitd_amons_sdfs]" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/home/puck/anaconda3/envs/rdkit/lib/python3.7/site-packages/ipykernel_launcher.py:1: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", + "/home/puck/anaconda3/envs/aqml/lib/python3.7/site-packages/ipykernel_launcher.py:1: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. 
If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", " \"\"\"Entry point for launching an IPython kernel.\n" ] } ], "source": [ "vitd_reps = np.array(vitd_reps)" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "vitd_amons_labels = [t.split(\"/\")[-1].split(\".sdf\")[0] for t in vitd_amons_files]" ] }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 66, "metadata": {}, "outputs": [], "source": [ "# np save " ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/puck/anaconda3/envs/rdkit/lib/python3.7/site-packages/numpy/core/_asarray.py:136: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", - " return array(a, dtype, copy=False, order=order, subok=True)\n" - ] - } - ], + "outputs": [], "source": [ - "np.savez(\"amons_vector_data.npz\", \n", + "np.savez(\"amons_SOAP_data.npz\", \n", " vitd_amons_labels=vitd_amons_labels,\n", " vitc_amons_labels=vitc_amons_labels,\n", " qm9_amons_labels=qm9_amons_labels,\n", " vitd_amons_ncharges=vitd_ncharges,\n", " vitc_amons_ncharges=vitc_ncharges,\n", " qm9_amons_ncharges=qm9_ncharges,\n", - " vitd_amons_slatms=vitd_reps,\n", - " vitc_amons_slatms=vitc_reps,\n", - " qm9_amons_slatms=qm9_reps)" + " vitd_amons_reps=vitd_reps,\n", + " vitc_amons_reps=vitc_reps,\n", + " qm9_amons_reps=qm9_reps)" ] }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 68, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(1, 857)" + "(1, 168)" ] }, - "execution_count": 37, + "execution_count": 68, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vitd_reps[0].shape" ] }, { "cell_type": "code", 
"execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.9" } }, "nbformat": 4, "nbformat_minor": 4 } diff --git a/GetSLATM.ipynb b/GetaCM.ipynb similarity index 67% copy from GetSLATM.ipynb copy to GetaCM.ipynb index a9af520..b5b822b 100644 --- a/GetSLATM.ipynb +++ b/GetaCM.ipynb @@ -1,226 +1,229 @@ { "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "import qml " ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "from glob import glob\n", "import numpy as np" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "from rdkit import Chem" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "target_xyzs = sorted(glob(\"targets/*.xyz\"))" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "def read_sdf(sdf):\n", " with open(sdf, \"r\") as f:\n", " txt = f.read().rstrip()\n", " return txt" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "def get_ncharges_coords(sdf):\n", " mol = Chem.MolFromMolBlock(sdf)\n", " #mol = Chem.AddHs(mol)\n", " # rdkit molobj\n", " ncharges = [atom.GetAtomicNum() for atom in mol.GetAtoms()]\n", " conf = mol.GetConformer()\n", " coords = np.asarray(conf.GetPositions())\n", " return ncharges, coords" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 32, "metadata": {}, 
"outputs": [ { "data": { "text/plain": [ "['targets/qm9.sdf', 'targets/vitc.sdf', 'targets/vitd.sdf']" ] }, - "execution_count": 7, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "target_files = sorted(glob(\"targets/*.sdf\"))\n", "target_files" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "target_sdfs = [read_sdf(x) for x in target_files]" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "conf_data = [get_ncharges_coords(x) for x in target_sdfs]" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "ncharges_list, coords_list = zip(*conf_data)" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 36, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[9, 12, 28]" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# mbtypes separate to each target" + "sizes = [len(x) for x in ncharges_list]\n", + "sizes" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 37, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/home/puck/anaconda3/envs/rdkit/lib/python3.7/site-packages/ipykernel_launcher.py:5: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", + "/home/puck/anaconda3/envs/aqml/lib/python3.7/site-packages/ipykernel_launcher.py:5: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. 
If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", " \"\"\"\n" ] } ], "source": [ "target_reps = np.array(\n", - "[np.array(qml.representations.generate_slatm(coords_list[i], ncharges_list[i], \n", - " mbtypes=qml.representations.get_slatm_mbtypes([ncharges_list[i]]),\n", - " local=True))\n", + "[np.array(qml.representations.generate_atomic_coulomb_matrix(np.array(ncharges_list[i]), \n", + " np.array(coords_list[i]),\n", + " size=sizes[i]))\n", "for i in range(len(ncharges_list))])" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(28, 857)" + "(9, 45)" ] }, - "execution_count": 13, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "target_reps[2].shape" + "target_reps[0].shape" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 39, "metadata": {}, "outputs": [], "source": [ "target_labels = [t.split(\"/\")[-1].split(\".xyz\")[0] for t in target_sdfs]" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 40, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/puck/anaconda3/envs/rdkit/lib/python3.7/site-packages/numpy/core/_asarray.py:136: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. 
If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", - " return array(a, dtype, copy=False, order=order, subok=True)\n" - ] - } - ], + "outputs": [], "source": [ - "np.savez(\"target_vector_data.npz\", \n", + "np.savez(\"target_aCM_data.npz\", \n", " target_labels=target_labels, \n", " target_reps=target_reps, \n", " target_ncharges=ncharges_list,)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.9" } }, "nbformat": 4, "nbformat_minor": 4 } diff --git a/GetSLATMAmons.ipynb b/GetaCMAmons.ipynb similarity index 63% copy from GetSLATMAmons.ipynb copy to GetaCMAmons.ipynb index 386fbcf..9cbd83b 100644 --- a/GetSLATMAmons.ipynb +++ b/GetaCMAmons.ipynb @@ -1,461 +1,448 @@ { "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 68, "metadata": {}, "outputs": [], "source": [ "import qml " ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 69, "metadata": {}, "outputs": [], "source": [ "from glob import glob\n", "import numpy as np" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 70, "metadata": {}, "outputs": [], "source": [ "from rdkit import Chem" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 71, "metadata": {}, "outputs": [], "source": [ "def read_sdf(sdf):\n", " with open(sdf, \"r\") as f:\n", " txt = f.read().rstrip()\n", " return txt" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 72, "metadata": {}, "outputs": [], "source": [ "def get_ncharges_coords(sdf):\n", " mol = Chem.MolFromMolBlock(sdf)\n", " #mol = Chem.AddHs(mol)\n", " # rdkit 
molobj\n", " ncharges = [atom.GetAtomicNum() for atom in mol.GetAtoms()]\n", " conf = mol.GetConformer()\n", " coords = np.asarray(conf.GetPositions())\n", " return ncharges, coords" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 73, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['targets/qm9.sdf', 'targets/vitc.sdf', 'targets/vitd.sdf']" ] }, - "execution_count": 6, + "execution_count": 73, "metadata": {}, "output_type": "execute_result" } ], "source": [ "target_sdfs = sorted(glob(\"targets/*.sdf\"))\n", "target_sdfs" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 74, "metadata": {}, "outputs": [], "source": [ "qm9_amons_files = sorted(glob(\"amons-qm9/*.sdf\"))" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 75, "metadata": {}, "outputs": [], "source": [ "qm9_amons_sdfs = [read_sdf(x) for x in qm9_amons_files]" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 76, "metadata": {}, "outputs": [], "source": [ "conf_data = [get_ncharges_coords(x) for x in qm9_amons_sdfs]" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 77, "metadata": {}, "outputs": [], "source": [ "ncharges_list, coords_list = zip(*conf_data)" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 78, "metadata": {}, "outputs": [], "source": [ "qm9_ncharges = ncharges_list" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 79, "metadata": {}, "outputs": [], "source": [ - "mbtypes = qml.representations.get_slatm_mbtypes(ncharges_list)" + "qm9_reps = [np.array(qml.representations.generate_atomic_coulomb_matrix(np.array(ncharges_list[i]),\n", + " np.array(coords_list[i]), \n", + " size=9))\n", + " for i in range(len(ncharges_list))]" ] }, { "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "qm9_reps = [np.array(qml.representations.generate_slatm(coords_list[i], 
ncharges_list[i], mbtypes,\n", - " local=True)) for i in \n", - " range(len(ncharges_list))]" - ] - }, - { - "cell_type": "code", - "execution_count": 14, + "execution_count": 80, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/home/puck/anaconda3/envs/rdkit/lib/python3.7/site-packages/ipykernel_launcher.py:1: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", + "/home/puck/anaconda3/envs/aqml/lib/python3.7/site-packages/ipykernel_launcher.py:1: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", " \"\"\"Entry point for launching an IPython kernel.\n" ] } ], "source": [ "qm9_reps = np.array(qm9_reps)" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 81, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(1, 3121)" + "(1, 45)" ] }, - "execution_count": 15, + "execution_count": 81, "metadata": {}, "output_type": "execute_result" } ], "source": [ "qm9_reps[0].shape" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 82, "metadata": {}, "outputs": [], "source": [ "qm9_amons_labels = [t.split(\"/\")[-1].split(\".sdf\")[0] for t in qm9_amons_files]" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 83, "metadata": {}, "outputs": [], "source": [ "vitc_amons_files = sorted(glob(\"amons-vitc/*.sdf\"))" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 84, "metadata": {}, "outputs": [], "source": [ "vitc_amons_sdfs = [read_sdf(x) for x in vitc_amons_files]" ] }, { "cell_type": "code", - "execution_count": 
19, + "execution_count": 85, "metadata": {}, "outputs": [], "source": [ "conf_data = [get_ncharges_coords(x) for x in vitc_amons_sdfs]" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 86, "metadata": {}, "outputs": [], "source": [ "ncharges_list, coords_list = zip(*conf_data)" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 87, "metadata": {}, "outputs": [], "source": [ "vitc_ncharges = ncharges_list" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 88, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "8" + ] + }, + "execution_count": 88, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "mbtypes = qml.representations.get_slatm_mbtypes(ncharges_list)" + "max([len(x) for x in ncharges_list])" ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 89, "metadata": {}, "outputs": [], "source": [ - "vitc_reps = [np.array(qml.representations.generate_slatm(coords_list[i], ncharges_list[i], \n", - " mbtypes, local=True)) for i in \n", + "vitc_reps = [np.array(qml.representations.generate_atomic_coulomb_matrix(np.array(ncharges_list[i]), \n", + " np.array(coords_list[i]), \n", + " size=12)) for i in \n", " range(len(ncharges_list))]" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 90, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/home/puck/anaconda3/envs/rdkit/lib/python3.7/site-packages/ipykernel_launcher.py:1: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. 
If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", + "/home/puck/anaconda3/envs/aqml/lib/python3.7/site-packages/ipykernel_launcher.py:1: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", " \"\"\"Entry point for launching an IPython kernel.\n" ] } ], "source": [ "vitc_reps = np.array(vitc_reps)" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 91, "metadata": {}, "outputs": [], "source": [ "vitc_amons_labels = [t.split(\"/\")[-1].split(\".sdf\")[0] for t in vitc_amons_files]" ] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 92, "metadata": {}, "outputs": [], "source": [ "vitd_amons_files = sorted(glob(\"amons-vitd/*.sdf\"))" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 93, "metadata": {}, "outputs": [], "source": [ "vitd_amons_sdfs = [read_sdf(x) for x in vitd_amons_files]" ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 94, "metadata": {}, "outputs": [], "source": [ "conf_data = [get_ncharges_coords(x) for x in vitd_amons_sdfs]" ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 95, "metadata": {}, "outputs": [], "source": [ "ncharges_list, coords_list = zip(*conf_data)" ] }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 96, "metadata": {}, "outputs": [], "source": [ "vitd_ncharges = ncharges_list" ] }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 97, "metadata": {}, "outputs": [], "source": [ - "mbtypes = qml.representations.get_slatm_mbtypes(ncharges_list)" + "vitd_reps = [np.array(qml.representations.generate_atomic_coulomb_matrix(np.array(ncharges_list[i]),\n", + " np.array(coords_list[i]),\n", + " size=28))\n", + " for i in 
range(len(ncharges_list))]" ] }, { "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [], - "source": [ - "vitd_reps = [np.array(qml.representations.generate_slatm(coords_list[i], ncharges_list[i], \n", - " mbtypes, local=True)) for i \n", - " in range(len(ncharges_list))]" - ] - }, - { - "cell_type": "code", - "execution_count": 33, + "execution_count": 98, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/home/puck/anaconda3/envs/rdkit/lib/python3.7/site-packages/ipykernel_launcher.py:1: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", + "/home/puck/anaconda3/envs/aqml/lib/python3.7/site-packages/ipykernel_launcher.py:1: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. 
If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", " \"\"\"Entry point for launching an IPython kernel.\n" ] } ], "source": [ "vitd_reps = np.array(vitd_reps)" ] }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 99, "metadata": {}, "outputs": [], "source": [ "vitd_amons_labels = [t.split(\"/\")[-1].split(\".sdf\")[0] for t in vitd_amons_files]" ] }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 66, "metadata": {}, "outputs": [], "source": [ "# np save " ] }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 100, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/puck/anaconda3/envs/rdkit/lib/python3.7/site-packages/numpy/core/_asarray.py:136: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", - " return array(a, dtype, copy=False, order=order, subok=True)\n" - ] - } - ], + "outputs": [], "source": [ - "np.savez(\"amons_vector_data.npz\", \n", + "np.savez(\"amons_aCM_data.npz\", \n", " vitd_amons_labels=vitd_amons_labels,\n", " vitc_amons_labels=vitc_amons_labels,\n", " qm9_amons_labels=qm9_amons_labels,\n", " vitd_amons_ncharges=vitd_ncharges,\n", " vitc_amons_ncharges=vitc_ncharges,\n", " qm9_amons_ncharges=qm9_ncharges,\n", - " vitd_amons_slatms=vitd_reps,\n", - " vitc_amons_slatms=vitc_reps,\n", - " qm9_amons_slatms=qm9_reps)" + " vitd_amons_reps=vitd_reps,\n", + " vitc_amons_reps=vitc_reps,\n", + " qm9_amons_reps=qm9_reps)" ] }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 101, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(1, 857)" + "(1, 406)" ] }, - "execution_count": 37, + "execution_count": 101, "metadata": {}, "output_type": "execute_result" } ], 
"source": [ "vitd_reps[0].shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.9" } }, "nbformat": 4, "nbformat_minor": 4 } diff --git a/README.md b/README.md index 458f1bf..bdbd7d3 100644 --- a/README.md +++ b/README.md @@ -1,56 +1,56 @@ # Data ## Structures The matrices for 3 target structures (to synthesize) and a database of 7165 query structures (to combine to build the target) are compressed in `data.npz` Within python, it can be read like: ``` data = np.load("data.npz", allow_pickle=True) ``` where `data.files` will return the names of the numpy arrays (should be `target_labels, target_CMs, target_ncharges, database_labels, database_CMs, database_ncharges`) where CMs are the matrices (of target and database respectively) and the corresponding arrays can be accessed like: ``` data["target_labels"] ``` For more details see the documentation: https://het.as.utexas.edu/HET/Software/Numpy/reference/generated/numpy.savez.html ## Connectivity / functional group information Adjacency matrices and functional group information derived from the connectivity are compressed in `connectivity_data.npz`. Within python, it can be read like: ``` connectivity_data = np.load("connectivity_data.npz") ``` the corresponding keys are `fg_counts_targets` for the functional group counts of each of the 3 target molecules,`fg_counts_frags` for the functional group counts of each of the fragment molecules, `frag_adj_matrices` for the adjacency matrices of the fragments and `target_adj_matrices` for the adjacency matrices of the target molecules. The order is the same as those in `data` containing the structures. 
## Optimal databases Dedicated databases of small molecules are saved for each target, all compressed in the file `amons_data.npz`. ``` data = np.load("amons_data.npz") ``` contains the same information as in the original `data`, but now specific to each target. Target 0 (qm9) has the data: ``` qm9_amons_labels qm9_amons_ncharges qm9_amons_CMs ``` where the CMs are the representation matrices. Similarly, target 1 (vitc) has the same data with the prefix `vitc_`. Same for vitd. These databases are much smaller, making the search faster. ### Optimal databases and vector data Rather than using symmetric matrices to represent our molecules where each row/column index represents an atom index, we can directly use a vector of the same length for each atom index. In other words, we have an asymmetric matrix of dimensions N_atoms x V_dim where V_dim is the length of the vector. We can access the representation for each atom as the appropriate index of the asymmetric matrix. V_dim will vary based on the atoms present in the target system, but will be consistent between the target and database candidates. -These representations (named SLATM, though there are others we could try) are provided in `.npz` files `target_vector_data.npz` and `amons_vector_data.npz`. +Datasets are now provided for 4 different asymmetric representations: aCM, SLATM, SOAP and FCHL. They are stored in files named `target_{repname}_data.npz` for the targets and `amons_{repname}_data.npz` for the fragments, where `{repname}` is the name of the representation (for example `target_aCM_data.npz` and `amons_aCM_data.npz`).
diff --git a/amons_FCHL_data.npz b/amons_FCHL_data.npz new file mode 100644 index 0000000..913821a Binary files /dev/null and b/amons_FCHL_data.npz differ diff --git a/amons_SLATM_data.npz b/amons_SLATM_data.npz new file mode 100644 index 0000000..807fecd Binary files /dev/null and b/amons_SLATM_data.npz differ diff --git a/amons_SOAP_data.npz b/amons_SOAP_data.npz new file mode 100644 index 0000000..fce6c8c Binary files /dev/null and b/amons_SOAP_data.npz differ diff --git a/amons_aCM_data.npz b/amons_aCM_data.npz new file mode 100644 index 0000000..4077659 Binary files /dev/null and b/amons_aCM_data.npz differ diff --git a/data.npz b/data.npz index 30acbd9..d830ef5 100644 Binary files a/data.npz and b/data.npz differ diff --git a/target_FCHL_data.npz b/target_FCHL_data.npz new file mode 100644 index 0000000..7e55224 Binary files /dev/null and b/target_FCHL_data.npz differ diff --git a/target_SLATM_data.npz b/target_SLATM_data.npz new file mode 100644 index 0000000..1cc42b3 Binary files /dev/null and b/target_SLATM_data.npz differ diff --git a/target_SOAP_data.npz b/target_SOAP_data.npz new file mode 100644 index 0000000..5b36bb3 Binary files /dev/null and b/target_SOAP_data.npz differ diff --git a/target_aCM_data.npz b/target_aCM_data.npz new file mode 100644 index 0000000..90a1dc4 Binary files /dev/null and b/target_aCM_data.npz differ