{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Connectivity data for fragment and target molecules\n",
    "\n",
    "This notebook converts the XYZ files matched by `qm7/*` (fragments) and `targets/*` (targets) into RDKit molecules with `xyz2mol`, counts functional-group SMARTS matches and extracts the adjacency matrix of every molecule, and saves the results to `connectivity_data.npz`.\n",
    "\n",
    "Run it top to bottom on a fresh kernel (Restart Kernel, then Run All)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from functools import lru_cache\n",
    "from glob import glob\n",
    "\n",
    "import numpy as np\n",
    "from rdkit import Chem\n",
    "from xyz2mol import xyz2mol"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Glob patterns selecting one XYZ file per molecule.\n",
    "FRAGMENT_GLOB = \"qm7/*\"\n",
    "TARGET_GLOB = \"targets/*\"\n",
    "\n",
    "# Files written by this notebook.\n",
    "FUNCTIONAL_GROUPS_FILE = \"functional_groups.txt\"\n",
    "CONNECTIVITY_FILE = \"connectivity_data.npz\"\n",
    "\n",
    "# Element symbol -> nuclear charge; only these elements can be read.\n",
    "NUCLEAR_CHARGE = {\n",
    "    \"H\": 1,\n",
    "    \"C\": 6,\n",
    "    \"O\": 8,\n",
    "    \"N\": 7,\n",
    "    \"F\": 9,\n",
    "    \"S\": 16,\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Reading XYZ files\n",
    "\n",
    "`xyz2mol` needs the nuclear charges and the coordinates of every atom."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_xyz(filename):\n",
    "    \"\"\"Read the atoms of a single-structure XYZ file.\n",
    "\n",
    "    Line 1 must hold the atom count, line 2 is skipped, and each of the\n",
    "    following lines is parsed as ``symbol x y z``.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    filename : str\n",
    "        Path of the XYZ file.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    nuclear_charges : list of int\n",
    "        Nuclear charge of every atom, looked up in ``NUCLEAR_CHARGE``.\n",
    "    coordinates : list of list of float\n",
    "        ``[x, y, z]`` of every atom, in file order.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError\n",
    "        If the file is empty, the atom count or a coordinate is not a\n",
    "        number, or fewer complete atom records are present than the\n",
    "        header announces.\n",
    "    KeyError\n",
    "        If an element symbol is missing from ``NUCLEAR_CHARGE``.\n",
    "    \"\"\"\n",
    "    with open(filename, \"r\") as f:\n",
    "        lines = f.readlines()\n",
    "\n",
    "    if not lines:\n",
    "        raise ValueError(f\"{filename}: file is empty\")\n",
    "\n",
    "    natoms = int(lines[0])\n",
    "    nuclear_charges = []\n",
    "    coordinates = []\n",
    "\n",
    "    # Atom records start on the third line of the file.\n",
    "    for lineno, line in enumerate(lines[2:natoms + 2], start=3):\n",
    "        tokens = line.split()\n",
    "\n",
    "        # A short record used to end parsing silently and hand a truncated\n",
    "        # molecule to the caller; fail loudly instead.\n",
    "        if len(tokens) < 4:\n",
    "            raise ValueError(f\"{filename}:{lineno}: expected 'symbol x y z', got {line!r}\")\n",
    "\n",
    "        symbol = tokens[0]\n",
    "        if symbol not in NUCLEAR_CHARGE:\n",
    "            raise KeyError(f\"{filename}:{lineno}: unsupported element {symbol!r}\")\n",
    "\n",
    "        nuclear_charges.append(NUCLEAR_CHARGE[symbol])\n",
    "        coordinates.append([float(token) for token in tokens[1:4]])\n",
    "\n",
    "    if len(nuclear_charges) != natoms:\n",
    "        raise ValueError(\n",
    "            f\"{filename}: header announces {natoms} atoms but {len(nuclear_charges)} were found\"\n",
    "        )\n",
    "\n",
    "    return nuclear_charges, coordinates"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def xyzfile_to_mol(filename):\n",
    "    \"\"\"Build an RDKit molecule from an XYZ file.\n",
    "\n",
    "    The nuclear charges and coordinates read by ``read_xyz`` are passed to\n",
    "    ``xyz2mol`` and the first molecule of its result is returned.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError\n",
    "        If ``xyz2mol`` returns no molecule for the file.\n",
    "    \"\"\"\n",
    "    ncharges, coords = read_xyz(filename)\n",
    "    mols = xyz2mol(ncharges, coords)\n",
    "    if len(mols) == 0:\n",
    "        raise ValueError(f\"{filename}: xyz2mol returned no molecule\")\n",
    "    return mols[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load molecules\n",
    "\n",
    "Files are sorted so that the order of every saved array is reproducible."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "frag_files = sorted(glob(FRAGMENT_GLOB))\n",
    "target_files = sorted(glob(TARGET_GLOB))\n",
    "\n",
    "assert frag_files, f\"no fragment files match {FRAGMENT_GLOB!r}\"\n",
    "assert target_files, f\"no target files match {TARGET_GLOB!r}\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "frag_mols = [xyzfile_to_mol(x) for x in frag_files]\n",
    "target_mols = [xyzfile_to_mol(x) for x in target_files]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Functional-group counts\n",
    "\n",
    "Each functional group is described by a SMARTS pattern. A pattern is parsed with `Chem.MolFromSmarts`, and `mol.GetSubstructMatches(pattern)` returns one tuple of atom indices per match, so the number of tuples is the count.\n",
    "\n",
    "Notes:\n",
    "\n",
    "- `'arene'` (`c`) matches every aromatic carbon atom and `'azo general'` (`[#7]`) matches every nitrogen atom, so these two entries are atom counts.\n",
    "- The labels and patterns are also written to `functional_groups.txt`, one `label SMARTS` pair per line. Some labels contain spaces; none of the patterns do, so split each line on its last space when reading the file back."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "functional_groups = {\n",
    "    'arene'          : 'c',\n",
    "    'allenic C'      : '[$([CX2](=C)=C)]',\n",
    "    'vinylic C'      : '[$([CX3]=[CX3])]',\n",
    "    'acetylenic C'   : '[$([CX2]#C)]',\n",
    "    'carbonyl'       : '[$([CX3]=[OX1]),$([CX3+]-[OX1-])]',\n",
    "    'aldehyde'       : '[CX3H1](=O)[#6]',\n",
    "    'amide'          : '[NX3][CX3](=[OX1])[#6]',\n",
    "    'carboxylic acid': '[CX3](=O)[OX2H1]',\n",
    "    'ester'          : '[#6][CX3](=O)[OX2H0][#6]',\n",
    "    'ketone'         : '[#6][CX3](=O)[#6]',\n",
    "    'ether'          : '[OD2]([#6])[#6]',\n",
    "    'azo general'    : '[#7]',\n",
    "    'amine'          : '[NX3;H2,H1;!$(NC=O)]',\n",
    "    'enamine'        : '[NX3][CX3]=[CX3]',\n",
    "    'imine'          : '[$([CX3]([#6])[#6]),$([CX3H][#6])]=[$([NX2][#6]),$([NX2H])]',\n",
    "    'nitrate'        : '[$([NX3](=[OX1])(=[OX1])O),$([NX3+]([OX1-])(=[OX1])O)]',\n",
    "    'nitrile'        : '[NX1]#[CX2]',\n",
    "    'nitro'          : '[$([NX3](=O)=O),$([NX3+](=O)[O-])][!#8]',\n",
    "    'alcohol'        : '[#6][OX2H]',\n",
    "    'enol'           : '[OX2H][#6X3]=[#6]',\n",
    "    'phenol'         : '[OX2H][cX3]:[c]',\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(FUNCTIONAL_GROUPS_FILE, \"w\") as f:\n",
    "    for label, fg in functional_groups.items():\n",
    "        f.write(label + \" \" + fg + \"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "@lru_cache(maxsize=None)\n",
    "def _compile_smarts(smarts):\n",
    "    \"\"\"Parse a SMARTS string once and reuse the query molecule afterwards.\"\"\"\n",
    "    pattern = Chem.MolFromSmarts(smarts)\n",
    "    if pattern is None:\n",
    "        raise ValueError(f\"invalid SMARTS pattern: {smarts!r}\")\n",
    "    return pattern\n",
    "\n",
    "\n",
    "def get_fg_count(mol, functional_groups):\n",
    "    \"\"\"Count the substructure matches of every functional group in a molecule.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    mol : rdkit.Chem.Mol\n",
    "        Molecule to search.\n",
    "    functional_groups : dict\n",
    "        Maps a label to a SMARTS pattern.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    list of int\n",
    "        Number of matches per pattern, in the iteration order of\n",
    "        ``functional_groups``.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError\n",
    "        If a SMARTS pattern cannot be parsed.\n",
    "    \"\"\"\n",
    "    return [\n",
    "        len(mol.GetSubstructMatches(_compile_smarts(smarts)))\n",
    "        for smarts in functional_groups.values()\n",
    "    ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fg_counts_targets = [get_fg_count(x, functional_groups) for x in target_mols]\n",
    "fg_counts_frags = [get_fg_count(x, functional_groups) for x in frag_mols]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Adjacency matrices"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "frag_adj_matrices = [Chem.rdmolops.GetAdjacencyMatrix(x) for x in frag_mols]\n",
    "target_adj_matrices = [Chem.rdmolops.GetAdjacencyMatrix(x) for x in target_mols]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Save everything\n",
    "\n",
    "The adjacency matrices have a different size for every molecule, so they cannot form a regular array. They are stored in a 1-D object array with one matrix per element. Letting NumPy convert the ragged list implicitly is deprecated and raises an error in current NumPy releases."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def as_object_array(items):\n",
    "    \"\"\"Pack a sequence into a 1-D object array with one item per element.\n",
    "\n",
    "    Filling the slots one by one keeps the result 1-D even when all items\n",
    "    happen to share the same shape.\n",
    "    \"\"\"\n",
    "    packed = np.empty(len(items), dtype=object)\n",
    "    for i, item in enumerate(items):\n",
    "        packed[i] = item\n",
    "    return packed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = [\n",
    "    np.array(fg_counts_targets),\n",
    "    np.array(fg_counts_frags),\n",
    "    as_object_array(frag_adj_matrices),\n",
    "    as_object_array(target_adj_matrices),\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fg_targets_arr, fg_frags_arr, frag_adj_arr, target_adj_arr = data\n",
    "\n",
    "np.savez(\n",
    "    CONNECTIVITY_FILE,\n",
    "    fg_counts_targets=fg_targets_arr,\n",
    "    fg_counts_frags=fg_frags_arr,\n",
    "    frag_adj_matrices=frag_adj_arr,\n",
    "    target_adj_matrices=target_adj_arr,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# allow_pickle is required to read the object arrays back; that is acceptable\n",
    "# here because the file was written by the cell above.\n",
    "with np.load(CONNECTIVITY_FILE, allow_pickle=True) as container:\n",
    "    saved_keys = list(container.keys())\n",
    "    assert len(container[\"frag_adj_matrices\"]) == len(frag_mols)\n",
    "    assert len(container[\"target_adj_matrices\"]) == len(target_mols)\n",
    "\n",
    "saved_keys"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}