{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "import qml "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "from glob import glob\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "from rdkit import Chem"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "target_xyzs = sorted(glob(\"targets/*.xyz\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_sdf(sdf):\n",
    "    with open(sdf, \"r\") as f:\n",
    "        txt = f.read().rstrip()\n",
    "    return txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_ncharges_coords(sdf):\n",
    "    mol = Chem.MolFromMolBlock(sdf)\n",
    "   #mol = Chem.AddHs(mol)\n",
    "    # rdkit molobj\n",
    "    ncharges = [atom.GetAtomicNum() for atom in mol.GetAtoms()]\n",
    "    conf = mol.GetConformer()\n",
    "    coords = np.asarray(conf.GetPositions())\n",
    "    return ncharges, coords"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['targets/qm9.sdf', 'targets/vitc.sdf', 'targets/vitd.sdf']"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "target_files = sorted(glob(\"targets/*.sdf\"))\n",
    "target_files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "target_sdfs = [read_sdf(x) for x in target_files]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "conf_data = [get_ncharges_coords(x) for x in target_sdfs]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "ncharges_list, coords_list = zip(*conf_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[9, 12, 28]"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sizes = [len(x) for x in ncharges_list]\n",
    "sizes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/puck/anaconda3/envs/aqml/lib/python3.7/site-packages/ipykernel_launcher.py:5: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n",
      "  \"\"\"\n"
     ]
    }
   ],
   "source": [
    "target_reps = np.array(\n",
    "[np.array(qml.representations.generate_atomic_coulomb_matrix(np.array(ncharges_list[i]), \n",
    "                                                             np.array(coords_list[i]),\n",
    "                                                            size=sizes[i]))\n",
    "for i in range(len(ncharges_list))])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(9, 45)"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "target_reps[0].shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "target_labels = [t.split(\"/\")[-1].split(\".xyz\")[0] for t in target_sdfs]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.savez(\"target_aCM_data.npz\", \n",
    "         target_labels=target_labels, \n",
    "         target_reps=target_reps, \n",
    "         target_ncharges=ncharges_list,)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}