{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from rdkit import Chem\n",
    "from xyz2mol import xyz2mol"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# need ncharges, coords"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "NUCLEAR_CHARGE = {\n",
    "    \"H\":1,\n",
    "    \"C\":6,\n",
    "    \"O\":8,\n",
    "    \"N\":7,\n",
    "    \"F\":9,\n",
    "    \"S\":16\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_xyz(filename):\n",
    "    with open(filename, \"r\") as f:\n",
    "        lines = f.readlines()\n",
    "\n",
    "    natoms = int(lines[0])\n",
    "    nuclear_charges = []\n",
    "    coordinates = []\n",
    "\n",
    "    for i, line in enumerate(lines[2:natoms+2]):\n",
    "        tokens = line.split()\n",
    "\n",
    "        if len(tokens) < 4:\n",
    "            break\n",
    "\n",
    "        nuclear_charges.append(NUCLEAR_CHARGE[tokens[0]])\n",
    "        coordinates.append([float(token) for token in tokens[1:4]])\n",
    "   \n",
    "    return nuclear_charges, coordinates"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [],
   "source": [
    "def xyzfile_to_mol(filename):\n",
    "    ncharges, coords = read_xyz(filename)\n",
    "    mols = xyz2mol(ncharges, coords)\n",
    "    return mols[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [],
   "source": [
    "from glob import glob"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [],
   "source": [
    "frag_files = [x for x in sorted(glob(\"qm7/*\"))]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [],
   "source": [
    "target_files = [x for x in sorted(glob(\"targets/*\"))]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [],
   "source": [
    "frag_mols = [xyzfile_to_mol(x) for x in frag_files]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [],
   "source": [
    "target_mols = [xyzfile_to_mol(x) for x in target_files]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [],
   "source": [
    "# for func group matches need to use SMARTS and \n",
    "# func_group = Chem.MolFromSmarts(\"smarts\")\n",
    "# mol.GetSubStructMatches(func_group)\n",
    "# this returns tuples of matches (then count)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 150,
   "metadata": {},
   "outputs": [],
   "source": [
    "# need to identify list of relevant functional groups"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 208,
   "metadata": {},
   "outputs": [],
   "source": [
    "functional_groups = {\n",
    "    'arene' : 'c',\n",
    "    'allenic C' : '[$([CX2](=C)=C)]',\n",
    "    'vinylic C' : '[$([CX3]=[CX3])]',\n",
    "    'acetylenic C' : '[$([CX2]#C)]',\n",
    "    \"carbonyl\" : '[$([CX3]=[OX1]),$([CX3+]-[OX1-])]',\n",
    "    'aldeyhde' : '[CX3H1](=O)[#6]',\n",
    "    'amide' : '[NX3][CX3](=[OX1])[#6]',\n",
    "    'carboxylic acid': '[CX3](=O)[OX2H1]',\n",
    "    'ester' : '[#6][CX3](=O)[OX2H0][#6]',\n",
    "    'ketone' : '[#6][CX3](=O)[#6]',\n",
    "    'ether' : '[OD2]([#6])[#6]',\n",
    "    'azo general' : '[#7]',\n",
    "    'amine' : '[NX3;H2,H1;!$(NC=O)]',\n",
    "    'enamine' : '[NX3][CX3]=[CX3]',\n",
    "    'imine' : '[$([CX3]([#6])[#6]),$([CX3H][#6])]=[$([NX2][#6]),$([NX2H])]',\n",
    "    'nitrate' :  '[$([NX3](=[OX1])(=[OX1])O),$([NX3+]([OX1-])(=[OX1])O)]',\n",
    "    'nitrile' : '[NX1]#[CX2]',\n",
    "    'nitro' : '[$([NX3](=O)=O),$([NX3+](=O)[O-])][!#8]',\n",
    "    'alcohol' : '[#6][OX2H]',\n",
    "    'enol' : '[OX2H][#6X3]=[#6]',\n",
    "    'phenol' : '[OX2H][cX3]:[c]'\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 216,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('functional_groups.txt', 'w') as f:\n",
    "    for label, fg in functional_groups.items():\n",
    "        f.write(label+' '+fg+'\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 200,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_fg_count(mol, functional_groups):\n",
    "    fg_count = []\n",
    "    for label, fg in functional_groups.items():\n",
    "        fg_mol = Chem.MolFromSmarts(fg)\n",
    "        match = mol.GetSubstructMatches(fg_mol)\n",
    "        fg_count.append(len(match))\n",
    "    return fg_count  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 201,
   "metadata": {},
   "outputs": [],
   "source": [
    "fg_counts_targets = [get_fg_count(x, functional_groups) for x in target_mols]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 203,
   "metadata": {},
   "outputs": [],
   "source": [
    "fg_counts_frags = [get_fg_count(x, functional_groups) for x in frag_mols]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [],
   "source": [
    "# get adj matrices"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [],
   "source": [
    "frag_adj_matrices = [Chem.rdmolops.GetAdjacencyMatrix(x) for x in frag_mols]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [],
   "source": [
    "target_adj_matrices = [Chem.rdmolops.GetAdjacencyMatrix(x) for x in target_mols]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 205,
   "metadata": {},
   "outputs": [],
   "source": [
    "# save everything"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 217,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/puck/anaconda3/envs/rdkit/lib/python3.7/site-packages/ipykernel_launcher.py:1: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n",
      "  \"\"\"Entry point for launching an IPython kernel.\n",
      "/home/puck/anaconda3/envs/rdkit/lib/python3.7/site-packages/ipykernel_launcher.py:2: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n",
      "  \n"
     ]
    }
   ],
   "source": [
    "data = [np.array(fg_counts_targets), np.array(fg_counts_frags), np.array(frag_adj_matrices),\n",
    "       np.array(target_adj_matrices)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 222,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/puck/anaconda3/envs/rdkit/lib/python3.7/site-packages/numpy/core/_asarray.py:136: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n",
      "  return array(a, dtype, copy=False, order=order, subok=True)\n"
     ]
    }
   ],
   "source": [
    "np.savez('connectivity_data.npz',fg_counts_targets=fg_counts_targets,\n",
    "        fg_counts_frags=fg_counts_frags,\n",
    "        frag_adj_matrices=frag_adj_matrices,\n",
    "        target_adj_matrices=target_adj_matrices)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 223,
   "metadata": {},
   "outputs": [],
   "source": [
    "container = np.load('connectivity_data.npz')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 224,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['fg_counts_targets',\n",
       " 'fg_counts_frags',\n",
       " 'frag_adj_matrices',\n",
       " 'target_adj_matrices']"
      ]
     },
     "execution_count": 224,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "list(container.keys())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}