{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import qml "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from glob import glob\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "target_xyzs = sorted(glob(\"targets/*.xyz\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "database_xyzs = sorted(glob(\"qm7/*.xyz\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "target_mols = [qml.Compound(x) for x in target_xyzs]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "database_mols = [qml.Compound(x) for x in database_xyzs]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_CM(mol):\n",
    "    ncharges = mol.nuclear_charges\n",
    "    coords = mol.coordinates \n",
    "    CM = np.zeros((len(coords), len(coords)))\n",
    "    for i in range(len(coords)):\n",
    "        for j in range(len(coords)):\n",
    "            if i==j:\n",
    "                CM[i,j] = 0.5 * ncharges[i]**2.4\n",
    "            else:\n",
    "                CM[i,j] = ncharges[i] * ncharges[j] / np.linalg.norm(coords[j] - coords[i])\n",
    "                \n",
    "    return ncharges, CM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "mol = target_mols[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "ncharges, CM = get_CM(mol)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "target_ncharges = []\n",
    "target_CMs = []\n",
    "for mol in target_mols: \n",
    "    ncharge, CM = get_CM(mol)\n",
    "    target_ncharges.append(ncharge)\n",
    "    target_CMs.append(CM)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "database_ncharges = []\n",
    "database_CMs = []\n",
    "for mol in database_mols:\n",
    "    ncharge, CM = get_CM(mol)\n",
    "    database_ncharges.append(ncharge)\n",
    "    database_CMs.append(CM)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-14-8bb0455dc3ca>:1: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n",
      "  target_CMs = np.array(target_CMs)\n"
     ]
    }
   ],
   "source": [
    "target_CMs = np.array(target_CMs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-15-b1da13099042>:1: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n",
      "  database_CMs = np.array(database_CMs)\n"
     ]
    }
   ],
   "source": [
    "database_CMs = np.array(database_CMs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "target_labels = [t.split(\"/\")[-1].split(\".xyz\")[0] for t in target_xyzs]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "database_labels = [t.split(\"/\")[-1].split(\".xyz\")[0] for t in database_xyzs]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "target_labels = np.array(target_labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "database_labels = np.array(database_labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-20-d16e71a4aa93>:1: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n",
      "  target_ncharges = np.array(target_ncharges)\n"
     ]
    }
   ],
   "source": [
    "target_ncharges = np.array(target_ncharges)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-21-defeabfdee20>:1: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n",
      "  database_ncharges = np.array(database_ncharges)\n"
     ]
    }
   ],
   "source": [
    "database_ncharges = np.array(database_ncharges)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.savez(\"data.npz\", \n",
    "         target_labels=target_labels, \n",
    "         target_CMs=target_CMs, \n",
    "         target_ncharges=target_ncharges,\n",
    "         database_labels=database_labels, \n",
    "         database_CMs=database_CMs,\n",
    "        database_ncharges=database_ncharges)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "x = np.load(\"data.npz\", allow_pickle=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['target_labels',\n",
       " 'target_CMs',\n",
       " 'target_ncharges',\n",
       " 'database_labels',\n",
       " 'database_CMs',\n",
       " 'database_ncharges']"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x.files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}