diff --git a/GetFunctionalGroups.ipynb b/GetFunctionalGroups.ipynb
new file mode 100644
index 0000000..9651451
--- /dev/null
+++ b/GetFunctionalGroups.ipynb
@@ -0,0 +1,365 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from rdkit import Chem\n",
+    "from xyz2mol import xyz2mol"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# need ncharges, coords"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "NUCLEAR_CHARGE = {\n",
+    "    \"H\":1,\n",
+    "    \"C\":6,\n",
+    "    \"O\":8,\n",
+    "    \"N\":7,\n",
+    "    \"F\":9,\n",
+    "    \"S\":16\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 60,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def read_xyz(filename):\n",
+    "    with open(filename, \"r\") as f:\n",
+    "        lines = f.readlines()\n",
+    "\n",
+    "    natoms = int(lines[0])\n",
+    "    nuclear_charges = []\n",
+    "    coordinates = []\n",
+    "\n",
+    "    for i, line in enumerate(lines[2:natoms+2]):\n",
+    "        tokens = line.split()\n",
+    "\n",
+    "        if len(tokens) < 4:\n",
+    "            break\n",
+    "\n",
+    "        nuclear_charges.append(NUCLEAR_CHARGE[tokens[0]])\n",
+    "        coordinates.append([float(token) for token in tokens[1:4]])\n",
+    "   \n",
+    "    return nuclear_charges, coordinates"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 74,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def xyzfile_to_mol(filename):\n",
+    "    ncharges, coords = read_xyz(filename)\n",
+    "    mols = xyz2mol(ncharges, coords)\n",
+    "    return mols[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 75,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from glob import glob"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 76,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "frag_files = [x for x in sorted(glob(\"qm7/*\"))]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 77,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "target_files = [x for x in sorted(glob(\"targets/*\"))]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 78,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "frag_mols = [xyzfile_to_mol(x) for x in frag_files]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 79,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "target_mols = [xyzfile_to_mol(x) for x in target_files]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 87,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# for functional group matches, use SMARTS patterns:\n",
+    "# func_group = Chem.MolFromSmarts(\"smarts\")\n",
+    "# mol.GetSubstructMatches(func_group)\n",
+    "# this returns a tuple of matches (then count them)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 150,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# need to identify list of relevant functional groups"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 208,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "functional_groups = {\n",
+    "    'arene' : 'c',\n",
+    "    'allenic C' : '[$([CX2](=C)=C)]',\n",
+    "    'vinylic C' : '[$([CX3]=[CX3])]',\n",
+    "    'acetylenic C' : '[$([CX2]#C)]',\n",
+    "    \"carbonyl\" : '[$([CX3]=[OX1]),$([CX3+]-[OX1-])]',\n",
+    "    'aldeyhde' : '[CX3H1](=O)[#6]',\n",
+    "    'amide' : '[NX3][CX3](=[OX1])[#6]',\n",
+    "    'carboxylic acid': '[CX3](=O)[OX2H1]',\n",
+    "    'ester' : '[#6][CX3](=O)[OX2H0][#6]',\n",
+    "    'ketone' : '[#6][CX3](=O)[#6]',\n",
+    "    'ether' : '[OD2]([#6])[#6]',\n",
+    "    'azo general' : '[#7]',\n",
+    "    'amine' : '[NX3;H2,H1;!$(NC=O)]',\n",
+    "    'enamine' : '[NX3][CX3]=[CX3]',\n",
+    "    'imine' : '[$([CX3]([#6])[#6]),$([CX3H][#6])]=[$([NX2][#6]),$([NX2H])]',\n",
+    "    'nitrate' :  '[$([NX3](=[OX1])(=[OX1])O),$([NX3+]([OX1-])(=[OX1])O)]',\n",
+    "    'nitrile' : '[NX1]#[CX2]',\n",
+    "    'nitro' : '[$([NX3](=O)=O),$([NX3+](=O)[O-])][!#8]',\n",
+    "    'alcohol' : '[#6][OX2H]',\n",
+    "    'enol' : '[OX2H][#6X3]=[#6]',\n",
+    "    'phenol' : '[OX2H][cX3]:[c]'\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 216,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open('functional_groups.txt', 'w') as f:\n",
+    "    for label, fg in functional_groups.items():\n",
+    "        f.write(label+' '+fg+'\\n')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 200,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_fg_count(mol, functional_groups):\n",
+    "    fg_count = []\n",
+    "    for label, fg in functional_groups.items():\n",
+    "        fg_mol = Chem.MolFromSmarts(fg)\n",
+    "        match = mol.GetSubstructMatches(fg_mol)\n",
+    "        fg_count.append(len(match))\n",
+    "    return fg_count  "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 201,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fg_counts_targets = [get_fg_count(x, functional_groups) for x in target_mols]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 203,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fg_counts_frags = [get_fg_count(x, functional_groups) for x in frag_mols]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 80,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# get adj matrices"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 85,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "frag_adj_matrices = [Chem.rdmolops.GetAdjacencyMatrix(x) for x in frag_mols]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 86,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "target_adj_matrices = [Chem.rdmolops.GetAdjacencyMatrix(x) for x in target_mols]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 205,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# save everything"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 217,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/puck/anaconda3/envs/rdkit/lib/python3.7/site-packages/ipykernel_launcher.py:1: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n",
+      "  \"\"\"Entry point for launching an IPython kernel.\n",
+      "/home/puck/anaconda3/envs/rdkit/lib/python3.7/site-packages/ipykernel_launcher.py:2: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n",
+      "  \n"
+     ]
+    }
+   ],
+   "source": [
+    "data = [np.array(fg_counts_targets), np.array(fg_counts_frags), np.array(frag_adj_matrices),\n",
+    "       np.array(target_adj_matrices)]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 222,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/puck/anaconda3/envs/rdkit/lib/python3.7/site-packages/numpy/core/_asarray.py:136: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n",
+      "  return array(a, dtype, copy=False, order=order, subok=True)\n"
+     ]
+    }
+   ],
+   "source": [
+    "np.savez('connectivity_data.npz',fg_counts_targets=fg_counts_targets,\n",
+    "        fg_counts_frags=fg_counts_frags,\n",
+    "        frag_adj_matrices=frag_adj_matrices,\n",
+    "        target_adj_matrices=target_adj_matrices)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 223,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "container = np.load('connectivity_data.npz')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 224,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['fg_counts_targets',\n",
+       " 'fg_counts_frags',\n",
+       " 'frag_adj_matrices',\n",
+       " 'target_adj_matrices']"
+      ]
+     },
+     "execution_count": 224,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "list(container.keys())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/README.md b/README.md
index 303fc8b..039c1ea 100644
--- a/README.md
+++ b/README.md
@@ -1,18 +1,31 @@
 # Data
+## Structures
 The matrices for 3 target structures (to synthesize) and a database of 7165 query structures (to combine to build the target)
 are compressed in `data.npz` 
 
 Within python, it can be read like: 
 ```
 data = np.load("data.npz", allow_pickle=True)
 ```
 
 where `data.files` will return the names of the numpy arrays (should be `target_labels, target_CMs, target_ncharges, database_labels, database_CMs, database_ncharges`) 
 where CMs are the matrices (of target and database respectively) and the corresponding arrays can be accessed like: 
 
 ```
 data["target_labels"]
 ```
 
 For more details see the documentation: 
 https://het.as.utexas.edu/HET/Software/Numpy/reference/generated/numpy.savez.html
+
+## Connectivity / functional group information
+Adjacency matrices and functional group information derived from the connectivity are compressed in `connectivity_data.npz`. 
+
+Within python, it can be read like:
+```
+connectivity_data = np.load("connectivity_data.npz", allow_pickle=True)
+```
+
+The corresponding keys are `fg_counts_targets` for the functional group counts of each of the 3 target molecules, `fg_counts_frags` for the functional group counts of
+each of the fragment molecules, `frag_adj_matrices` for the adjacency matrices of the fragments, and `target_adj_matrices` for the adjacency matrices of the target molecules.
+Each vector of counts lists the functional groups in the order given in `functional_groups.txt`.
+The molecules are stored in the same order as in `data`, which contains the structures.  
diff --git a/__pycache__/xyz2mol.cpython-37.pyc b/__pycache__/xyz2mol.cpython-37.pyc
new file mode 100644
index 0000000..59d7561
Binary files /dev/null and b/__pycache__/xyz2mol.cpython-37.pyc differ
diff --git a/connectivity_data.npz b/connectivity_data.npz
new file mode 100644
index 0000000..b68f636
Binary files /dev/null and b/connectivity_data.npz differ
diff --git a/functional_groups.txt b/functional_groups.txt
new file mode 100644
index 0000000..9d72a0b
--- /dev/null
+++ b/functional_groups.txt
@@ -0,0 +1,21 @@
+arene c
+allenic C [$([CX2](=C)=C)]
+vinylic C [$([CX3]=[CX3])]
+acetylenic C [$([CX2]#C)]
+carbonyl [$([CX3]=[OX1]),$([CX3+]-[OX1-])]
+aldeyhde [CX3H1](=O)[#6]
+amide [NX3][CX3](=[OX1])[#6]
+carboxylic acid [CX3](=O)[OX2H1]
+ester [#6][CX3](=O)[OX2H0][#6]
+ketone [#6][CX3](=O)[#6]
+ether [OD2]([#6])[#6]
+azo general [#7]
+amine [NX3;H2,H1;!$(NC=O)]
+enamine [NX3][CX3]=[CX3]
+imine [$([CX3]([#6])[#6]),$([CX3H][#6])]=[$([NX2][#6]),$([NX2H])]
+nitrate [$([NX3](=[OX1])(=[OX1])O),$([NX3+]([OX1-])(=[OX1])O)]
+nitrile [NX1]#[CX2]
+nitro [$([NX3](=O)=O),$([NX3+](=O)[O-])][!#8]
+alcohol [#6][OX2H]
+enol [OX2H][#6X3]=[#6]
+phenol [OX2H][cX3]:[c]
diff --git a/onepass.py b/onepass.py
index 71f9e5b..4de86e1 100644
--- a/onepass.py
+++ b/onepass.py
@@ -1,163 +1,163 @@
 import numpy as np 
 import timeit
 import gurobipy as gp
 from gurobipy import GRB
 
 def addvariables(Z):
     """Add the decision variables of the matching problem to model Z.
 
     For every database molecule M (with m entries in its charge list) one
     binary variable x[i,j,M] is created for each pair of an index i of M and
     an index j of the target, and one integer variable y[M] with bounds
     0 <= y[M] <= n//m counts how many times M is picked.
 
     Returns the tuple (x, y, I) where I is the list of all (i,j,M) triples.
     """
     # Indexing an NpzFile re-reads the array from disk on every access,
     # so fetch the charges once instead of once per database molecule.
     database_ncharges=data['database_ncharges']
     upperbounds=[]
     I=[]
     for M in database_indices:
         m=len(database_ncharges[M])
         # extend in place: I=I+[...] copies the whole list on every iteration
         I.extend((i,j,M) for i in range(m) for j in range(n))
         upperbounds.append(n//m)
 
     x=Z.addVars(I, vtype=GRB.BINARY)
     # dummy variables y associated to number of times molecule M is picked
     y=Z.addVars(database_indices, vtype="I", lb=0, ub=upperbounds)
     print("Variables added.")
     return x,y,I
 
 def addconstraints(Z,x,y):
     """Add the constraints of the matching problem to model Z.
 
     Z is the Gurobi model; x and y are the variables returned by
     addvariables.  Always returns 0.
     """
     # Indexing an NpzFile re-reads the array from disk on every access,
     # so fetch the charges once instead of once per database molecule.
     database_ncharges=data['database_ncharges']
 
     # injection into [n]
     Z.addConstrs(x.sum('*',j,'*') <= 1 for j in range(n))
     # use at least 80% of target
     Z.addConstr(x.sum() >= 0.8*n)
 
     # additional constraints: take whole molecule or leave it out, and matching charges
     for M in database_indices:
         CM=database_ncharges[M]
         m=len(CM)
         # the number of indices of M used (counting multiple use) is at least 55% of its size
         Z.addConstr(x.sum('*','*',M) >= 0.55*m*y[M])
         # each index in M is used at most y[M] times
         Z.addConstrs(x.sum(i,'*',M) <= y[M] for i in range(m))
 
         # forbid mapping index i of M onto a target index of different charge
         # (variant that also excludes atoms of charge 1:
         #  CM[i] != CT[j] or CM[i]==1)
         for i in range(m):
             for j in range(n):
                 if CM[i] != CT[j]:
                     Z.addConstr(x[i,j,M]==0)
 
     print("Constraints added.")
     return 0
 
 def setobjective(Z,x,y):
     """Build the quadratic objective and set it on model Z (minimisation).
 
     Every database molecule M contributes the linear term -m*y[M], where m is
     the size of its matrix, and for all i, j in range(m) and k <= l in
     range(n) the quadratic term |T[k,l]-Mol[i,j]|**2 * x[i,k,M]*x[j,l,M].
     Progress is printed after each molecule.  Always returns 0.
     """
     # Indexing an NpzFile re-reads the array from disk on every access,
     # so fetch the matrices once instead of once per database molecule.
     database_CMs=data['database_CMs']
     expr=gp.QuadExpr()
     print("Constructing objective function... ")
     for key, M in enumerate(database_indices, start=1):
         Mol=database_CMs[M]
         m=len(Mol)
         expr.addTerms(-m, y[M])
         for i in range(m):
             for j in range(m):
                 for k in range(n):
                     for l in range(k,n):
                         expr.add(x[i,k,M] * x[j,l,M], np.abs(T[k,l]-Mol[i,j])**2)
         print(key, "  /  ", size_database)
 
     Z.setObjective(expr, GRB.MINIMIZE)
     print("Objective function set.")
     return 0
 
 # prints mappings of positions (indices+1) of each molecule (before preprocess) to positions inside target (before preprocess, but the hydrogens are at the end anyway)
 def print_sols(Z, x, y):
     """Print every solution stored in the solution pool of model Z.
 
     For each solution and each database molecule picked at least once, the
     assignments are grouped into one column of U per picked copy and printed
     as "old index -> target position" pairs (both 1-based).
     """
     # Indexing an NpzFile re-reads the array from disk on every access,
     # so fetch the arrays once instead of inside the loops.
     database_ncharges=data['database_ncharges']
     database_labels=data['database_labels']
 
     SolCount=Z.SolCount
     for solnb in range(SolCount):
         print()
         print("--------------------------------")
         Z.setParam("SolutionNumber",solnb)
         print("Solution number", solnb+1, ", objective value", Z.PoolObjVal)
 
         for M in database_indices:
             amount_picked=int(np.rint(y[M].Xn))
             if amount_picked == 0:
                 continue
 
             m=len(database_ncharges[M])
             # U[i,k] = 1-based target position assigned to index i in copy k (0 = unused)
             U=np.zeros((m,amount_picked))
 
             # constructing U
             for i in range(m):
                 k=0
                 for j in range(n):
                     # solver values are floats: a binary set to one may be
                     # reported as 0.999..., so threshold instead of testing ==1
                     if x[i,j,M].Xn > 0.5 and sum(U[:,k]!=0) < 0.55*m:
                         U[i,k]=j+1
                         k=k+1
 
             # reading U
             for k in range(amount_picked):
                 if np.any(U[:,k] != 0):
                     if k==0:
                         print("Molecule", database_labels[M], "has been picked", amount_picked, "time(s) ( size", len(database_ncharges[M]), ", used", sum([x[i,j,M].Xn for i in range(m) for j in range(n)]), ")")
                     print(k+1, end=": ")
                     for i in range(m):
                         if U[i,k]!=0:
                             print(oldindex(i,M)+1, "->", U[i,k], end=", ")
                     print("used", sum(U[:,k]!=0))
 
 # converts new index (with hydrogens removed) to old index.
 # i is a 0-based position counted over the entries of
 # olddata['database_ncharges'][M] that are different from 1; the returned k is
 # the position of that same entry in the full list (entries equal to 1 included).
-def oldindex(i, M, target):
+def oldindex(i, M):
     k=0
     notones=0
     CM=olddata['database_ncharges'][M]
     # advance until i entries different from 1 have been passed and CM[k] itself is not 1
     while notones<i or CM[k]==1:
         if CM[k]!=1:
             notones=notones+1
         k=k+1
     return k
 
 def main():
     """Build the model, optimise it and print the solutions found.
 
     Returns 1 when the model is proven infeasible, 0 otherwise.
     """
     # construction of the model
     t_begin=timeit.default_timer()
     Z = gp.Model()
     Z.setParam('OutputFlag',1)
     x,y,I=addvariables(Z)
     addconstraints(Z,x,y)
     setobjective(Z,x,y)
     t_end=timeit.default_timer()
     print("Model setup: ", t_end-t_begin, "s")
 
     solver_params=[
         # trying parameters from Z.tune() and others.
         ("DegenMoves", 2),
         ("ZeroHalfCuts", 2),
         ("BQPCuts", 2),
         ("RLTCuts", 2),
         # time (seconds) and gap (%) limit, (max) number of solutions kept in memory
         ("TimeLimit", 180),
         ("MIPGap", 0.33), # 1.5-approx. algorithm (not really)
         ("PoolSolutions", 10),
     ]
     for name, value in solver_params:
         Z.setParam(name, value)
 
     for banner in ("------------", "Optimization", "------------"):
         print(banner)
     Z.optimize()
 
     print("Optimization runtime: ", Z.RunTime, "s")
 
     # GRB.INFEASIBLE is the named constant for status code 3
     if Z.status == GRB.INFEASIBLE:
         print("Model was proven to be infeasible.")
         return 1
 
     print_sols(Z,x,y)
     return 0
 
 # global constants (read by the functions above as module-level names)
 
 # data: arrays used to build the model; olddata: arrays used by oldindex().
 # NOTE(review): presumably datafilter.npz is data.npz with hydrogens removed -- confirm.
 data=np.load("datafilter.npz", allow_pickle=True)
 olddata=np.load("data.npz", allow_pickle=True)
 # which target to process
 target_index=1
 print("Target", data['target_labels'][target_index])
 # CT: charges of the target, T: matrix of the target, n: number of target entries
 CT=data['target_ncharges'][target_index]
 T=data['target_CMs'][target_index]
 n=len(data['target_ncharges'][target_index])
 
 # database molecules taken into account (alternative: indices read from a file)
 #database_indices=np.loadtxt("filtered"+str(target_index), dtype=int)
 #database_indices=database_indices[:100]
 database_indices=range(500)
 size_database=len(database_indices)
 
 # run at import time: this file is a script
 main()
diff --git a/xyz2mol.py b/xyz2mol.py
new file mode 100644
index 0000000..e50cee7
--- /dev/null
+++ b/xyz2mol.py
@@ -0,0 +1,817 @@
+"""
+Module for generating rdkit molobj/smiles/molecular graph from free atoms
+
+Implementation by Jan H. Jensen, based on the paper
+
+    Yeonjoon Kim and Woo Youn Kim
+    "Universal Structure Conversion Method for Organic Molecules: From Atomic Connectivity
+    to Three-Dimensional Geometry"
+    Bull. Korean Chem. Soc. 2015, Vol. 36, 1769-1777
+    DOI: 10.1002/bkcs.10334
+
+"""
+
+import copy
+import itertools
+
+from rdkit.Chem import rdmolops
+from rdkit.Chem import rdchem
+try:
+    from rdkit.Chem import rdEHTTools #requires RDKit 2019.9.1 or later
+except ImportError:
+    rdEHTTools = None
+
+from collections import defaultdict
+
+import numpy as np
+import networkx as nx
+
+from rdkit import Chem
+from rdkit.Chem import AllChem, rdmolops
+import sys
+
+# Lower-case element symbols ordered by atomic number (index = Z - 1).
+# Entries must not contain padding: int_atom() looks symbols up with
+# list.index(), so 'v ' (with a trailing space) made vanadium unreachable.
+global __ATOM_LIST__
+__ATOM_LIST__ = \
+    ['h',  'he',
+     'li', 'be', 'b',  'c',  'n',  'o',  'f',  'ne',
+     'na', 'mg', 'al', 'si', 'p',  's',  'cl', 'ar',
+     'k',  'ca', 'sc', 'ti', 'v',  'cr', 'mn', 'fe', 'co', 'ni', 'cu',
+     'zn', 'ga', 'ge', 'as', 'se', 'br', 'kr',
+     'rb', 'sr', 'y',  'zr', 'nb', 'mo', 'tc', 'ru', 'rh', 'pd', 'ag',
+     'cd', 'in', 'sn', 'sb', 'te', 'i',  'xe',
+     'cs', 'ba', 'la', 'ce', 'pr', 'nd', 'pm', 'sm', 'eu', 'gd', 'tb', 'dy',
+     'ho', 'er', 'tm', 'yb', 'lu', 'hf', 'ta', 'w',  're', 'os', 'ir', 'pt',
+     'au', 'hg', 'tl', 'pb', 'bi', 'po', 'at', 'rn',
+     'fr', 'ra', 'ac', 'th', 'pa', 'u',  'np', 'pu']
+
+
+# Allowed valences per atomic number, in the order in which they are tried.
+# Elements without an entry yield an empty list (defaultdict).
+atomic_valence = defaultdict(list, {
+    1: [1],
+    5: [3, 4],
+    6: [4],
+    7: [3, 4],
+    8: [2, 1, 3],
+    9: [1],
+    14: [4],
+    15: [5, 3],  # [5,4,3]
+    16: [6, 3, 2],  # [6,4,2]
+    17: [1],
+    32: [4],
+    35: [1],
+    53: [1],
+})
+
+# Number of valence electrons per atomic number.
+atomic_valence_electrons = {
+    1: 1,
+    5: 3,
+    6: 4,
+    7: 5,
+    8: 6,
+    9: 7,
+    14: 4,
+    15: 5,
+    16: 6,
+    17: 7,
+    32: 4,
+    35: 7,
+    53: 7,
+}
+
+
+def str_atom(atom):
+    """
+    Return the lower-case element symbol stored for atomic number `atom`
+    (the entry at position atom - 1 of __ATOM_LIST__)
+    """
+    return __ATOM_LIST__[atom - 1]
+
+
+def int_atom(atom):
+    """
+    Return the atomic number of the element symbol `atom`
+
+    The symbol is lower-cased before it is looked up in __ATOM_LIST__;
+    list.index raises ValueError when the symbol is not in the list.
+    """
+    symbol = atom.lower()
+    position = __ATOM_LIST__.index(symbol)
+    return position + 1
+
+
+def get_UA(maxValence_list, valence_list):
+    """
+    Find the unsaturated atoms
+
+    args:
+        maxValence_list - valence assumed for each atom
+        valence_list - valence currently used by each atom
+
+    returns:
+        UA - indices of the atoms whose assumed valence exceeds the used one
+        DU - the corresponding differences (degree of unsaturation)
+    """
+    unsaturated = []
+    deficits = []
+    for index, pair in enumerate(zip(maxValence_list, valence_list)):
+        missing = pair[0] - pair[1]
+        if missing > 0:
+            unsaturated.append(index)
+            deficits.append(missing)
+    return unsaturated, deficits
+
+
+def get_BO(AC, UA, DU, valences, UA_pairs, use_graph=True):
+    """
+    Build a bond order matrix from the adjacency matrix
+
+    Starting from a copy of AC, the bond order of every pair in UA_pairs is
+    raised by one; the unsaturated atoms and their pairs are then recomputed
+    and the step is repeated until the list DU no longer changes.
+
+    args:
+        AC - adjacency matrix (not modified)
+        UA - overwritten before it is read; kept for the interface
+        DU - degrees of unsaturation matching the initial UA_pairs
+        valences - valence assumed for each atom
+        UA_pairs - initial pairs of atoms whose bond order is raised
+
+    optional:
+        use_graph - forwarded to get_UA_pairs
+
+    returns:
+        BO - bond order matrix
+    """
+    bond_orders = AC.copy()
+    previous_DU = []
+
+    while previous_DU != DU:
+        for first, second in UA_pairs:
+            bond_orders[first, second] += 1
+            bond_orders[second, first] += 1
+
+        used_valences = list(bond_orders.sum(axis=1))
+        previous_DU = copy.copy(DU)
+        UA, DU = get_UA(valences, used_valences)
+        UA_pairs = get_UA_pairs(UA, AC, use_graph=use_graph)[0]
+
+    return bond_orders
+
+
+def valences_not_too_large(BO, valences):
+    """
+    Return True when no row sum of BO exceeds the valence given for that atom
+    """
+    bonds_per_atom = BO.sum(axis=1)
+    return not any(bonds > limit for limit, bonds in zip(valences, bonds_per_atom))
+
+def charge_is_OK(BO, AC, charge, DU, atomic_valence_electrons, atoms, valences,
+                 allow_charged_fragments=True):
+    """
+    Check that the charges implied by BO add up to the requested charge
+
+    args:
+        BO - bond order matrix
+        AC - not used here; kept for the interface
+        charge - requested total charge
+        DU - not used here; kept for the interface
+        atomic_valence_electrons - valence electrons per atomic number
+        atoms - list of atomic numbers
+        valences - not used here; kept for the interface
+
+    optional:
+        allow_charged_fragments - when false no atomic charges are summed,
+            so the result is simply (charge == 0)
+
+    returns:
+        truth value of (charge == summed charge)
+    """
+    # total charge
+    Q = 0
+
+    if allow_charged_fragments:
+
+        BO_valences = list(BO.sum(axis=1))
+        for i, atom in enumerate(atoms):
+            Q += get_atomic_charge(atom, atomic_valence_electrons[atom], BO_valences[i])
+            if atom == 6:
+                # corrections applied to carbons with two or three single bonds
+                number_of_single_bonds_to_C = list(BO[i, :]).count(1)
+                if number_of_single_bonds_to_C == 2 and BO_valences[i] == 2:
+                    Q += 1
+                if number_of_single_bonds_to_C == 3 and Q + 1 < charge:
+                    Q += 2
+
+    return (charge == Q)
+
+def BO_is_OK(BO, AC, charge, DU, atomic_valence_electrons, atoms, valences,
+    allow_charged_fragments=True):
+    """
+    Sanity of bond-orders
+
+    A bond order matrix is accepted when no atom exceeds its valence, the
+    bond order added on top of AC sums to sum(DU), and charge_is_OK agrees
+    with the requested charge.
+
+    args:
+        BO - bond order matrix
+        AC - adjacency matrix
+        charge - requested total charge
+        DU - degrees of unsaturation
+        atomic_valence_electrons - valence electrons per atomic number
+        atoms - list of atomic numbers
+        valences - valence assumed for each atom
+
+    optional
+        allow_charged_fragments - forwarded to charge_is_OK
+
+    returns:
+        boolean - true if molecule is OK, false if not
+    """
+
+    if not valences_not_too_large(BO, valences):
+        return False
+
+    added_bond_order_matches = (BO - AC).sum() == sum(DU)
+    charge_matches = charge_is_OK(BO, AC, charge, DU, atomic_valence_electrons, atoms, valences,
+                                  allow_charged_fragments)
+
+    return bool(charge_matches and added_bond_order_matches)
+
+
+def get_atomic_charge(atom, atomic_valence_electrons, BO_valence):
+    """
+    Return the charge assigned to one atom
+
+    args:
+        atom - atomic number
+        atomic_valence_electrons - number of valence electrons of this atom
+        BO_valence - sum of the bond orders of this atom
+
+    returns:
+        1 - BO_valence for atomic number 1, 3 - BO_valence for atomic number 5,
+        0 for atomic number 15 with BO_valence 5 and atomic number 16 with
+        BO_valence 6, otherwise atomic_valence_electrons - 8 + BO_valence
+    """
+
+    if atom == 1:
+        return 1 - BO_valence
+    if atom == 5:
+        return 3 - BO_valence
+    if (atom == 15 and BO_valence == 5) or (atom == 16 and BO_valence == 6):
+        return 0
+    return atomic_valence_electrons - 8 + BO_valence
+
+
+def clean_charges(mol):
+    """
+    This hack should not be needed anymore, but is kept just in case
+
+    Sanitizes mol, splits it into fragments and, for each fragment, applies
+    the two reaction SMARTS below for as long as their reactant pattern
+    matches. The fragments are then combined into one molecule again.
+
+    args:
+        mol - rdkit molecule
+
+    returns:
+        mol - molecule rebuilt from the processed fragments
+
+    """
+
+    Chem.SanitizeMol(mol)
+    # earlier set of rules, kept for reference
+    #rxn_smarts = ['[N+:1]=[*:2]-[C-:3]>>[N+0:1]-[*:2]=[C-0:3]',
+    #              '[N+:1]=[*:2]-[O-:3]>>[N+0:1]-[*:2]=[O-0:3]',
+    #              '[N+:1]=[*:2]-[*:3]=[*:4]-[O-:5]>>[N+0:1]-[*:2]=[*:3]-[*:4]=[O-0:5]',
+    #              '[#8:1]=[#6:2]([!-:6])[*:3]=[*:4][#6-:5]>>[*-:1][*:2]([*:6])=[*:3][*:4]=[*+0:5]',
+    #              '[O:1]=[c:2][c-:3]>>[*-:1][*:2][*+0:3]',
+    #              '[O:1]=[C:2][C-:3]>>[*-:1][*:2]=[*+0:3]']
+
+    # each entry is one "reactant>>product" rule (adjacent string literals are concatenated)
+    rxn_smarts = ['[#6,#7:1]1=[#6,#7:2][#6,#7:3]=[#6,#7:4][CX3-,NX3-:5][#6,#7:6]1=[#6,#7:7]>>'
+                  '[#6,#7:1]1=[#6,#7:2][#6,#7:3]=[#6,#7:4][-0,-0:5]=[#6,#7:6]1[#6-,#7-:7]',
+                  '[#6,#7:1]1=[#6,#7:2][#6,#7:3](=[#6,#7:4])[#6,#7:5]=[#6,#7:6][CX3-,NX3-:7]1>>'
+                  '[#6,#7:1]1=[#6,#7:2][#6,#7:3]([#6-,#7-:4])=[#6,#7:5][#6,#7:6]=[-0,-0:7]1']
+
+    fragments = Chem.GetMolFrags(mol,asMols=True,sanitizeFrags=False)
+
+    for i, fragment in enumerate(fragments):
+        for smarts in rxn_smarts:
+            # the reactant side of the rule is the pattern to look for
+            patt = Chem.MolFromSmarts(smarts.split(">>")[0])
+            while fragment.HasSubstructMatch(patt):
+                rxn = AllChem.ReactionFromSmarts(smarts)
+                ps = rxn.RunReactants((fragment,))
+                # continue with the first product of the first product set
+                fragment = ps[0][0]
+                Chem.SanitizeMol(fragment)
+        # rebuild the molecule: start from the first fragment, append the others
+        if i == 0:
+            mol = fragment
+        else:
+            mol = Chem.CombineMols(mol, fragment)
+
+    return mol
+
+
+def BO2mol(mol, BO_matrix, atoms, atomic_valence_electrons,
+           mol_charge, allow_charged_fragments=True):
+    """
+    based on code written by Paolo Toscani
+
+    Add the bonds described by a bond order matrix to an rdkit molecule and
+    set formal charges (or radical electrons) on its atoms.
+
+    args:
+        mol - rdkit molecule
+        BO_matrix - bond order matrix of molecule
+        atoms - list of integer atomic symbols
+        atomic_valence_electrons - valence electrons per atomic number
+        mol_charge - total charge of molecule
+
+    optional:
+        allow_charged_fragments - bool - when true formal charges are set,
+            otherwise radical electrons are set
+
+    returns
+        mol - updated rdkit molecule with bond connectivity
+
+    raises
+        RuntimeError - when BO_matrix and atoms differ in length
+
+    """
+
+    n_rows = len(BO_matrix)
+    n_atoms = len(atoms)
+    BO_valences = list(BO_matrix.sum(axis=1))
+
+    if n_rows != n_atoms:
+        raise RuntimeError('sizes of adjMat ({0:d}) and Atoms {1:d} differ'.format(n_rows, n_atoms))
+
+    editable = Chem.RWMol(mol)
+
+    # bond orders other than 1, 2 or 3 fall back to a single bond
+    bond_types = {
+        1: Chem.BondType.SINGLE,
+        2: Chem.BondType.DOUBLE,
+        3: Chem.BondType.TRIPLE
+    }
+
+    # visit every pair i < j once
+    for i, j in itertools.combinations(range(n_rows), 2):
+        order = int(round(BO_matrix[i, j]))
+        if order != 0:
+            editable.AddBond(i, j, bond_types.get(order, Chem.BondType.SINGLE))
+
+    mol = editable.GetMol()
+
+    if not allow_charged_fragments:
+        return set_atomic_radicals(mol, atoms, atomic_valence_electrons, BO_valences)
+
+    return set_atomic_charges(
+        mol,
+        atoms,
+        atomic_valence_electrons,
+        BO_valences,
+        BO_matrix,
+        mol_charge)
+
+
+def set_atomic_charges(mol, atoms, atomic_valence_electrons,
+                       BO_valences, BO_matrix, mol_charge):
+    """
+    Set the formal charge of every atom of mol whose computed charge is not zero
+
+    args:
+        mol - rdkit molecule, modified in place and returned
+        atoms - list of atomic numbers
+        atomic_valence_electrons - valence electrons per atomic number
+        BO_valences - sum of bond orders of each atom
+        BO_matrix - bond order matrix
+        mol_charge - total charge of molecule
+
+    returns:
+        mol - the same molecule
+    """
+    running_charge = 0
+    for index, atomic_number in enumerate(atoms):
+        rd_atom = mol.GetAtomWithIdx(index)
+        formal_charge = get_atomic_charge(
+            atomic_number,
+            atomic_valence_electrons[atomic_number],
+            BO_valences[index])
+        running_charge += formal_charge
+        if atomic_number == 6:
+            # corrections applied to carbons with two or three single bonds
+            single_bonds = list(BO_matrix[index, :]).count(1)
+            if single_bonds == 2 and BO_valences[index] == 2:
+                running_charge += 1
+                formal_charge = 0
+            if single_bonds == 3 and running_charge + 1 < mol_charge:
+                running_charge += 2
+                formal_charge = 1
+
+        if abs(formal_charge) > 0:
+            rd_atom.SetFormalCharge(int(formal_charge))
+
+    #mol = clean_charges(mol)
+
+    return mol
+
+
+def set_atomic_radicals(mol, atoms, atomic_valence_electrons, BO_valences):
+    """
+
+    The number of radical electrons = absolute atomic charge
+
+    args:
+        mol - rdkit molecule, modified in place and returned
+        atoms - list of atomic numbers
+        atomic_valence_electrons - valence electrons per atomic number
+        BO_valences - sum of bond orders of each atom
+
+    """
+    for index, atomic_number in enumerate(atoms):
+        rd_atom = mol.GetAtomWithIdx(index)
+        electrons = atomic_valence_electrons[atomic_number]
+        charge = get_atomic_charge(atomic_number, electrons, BO_valences[index])
+
+        if abs(charge) > 0:
+            rd_atom.SetNumRadicalElectrons(abs(int(charge)))
+
+    return mol
+
+
+def get_bonds(UA, AC):
+    """
+    List the bonds between unsaturated atoms
+
+    args:
+        UA - indices of the unsaturated atoms
+        AC - adjacency matrix
+
+    returns:
+        list of sorted index tuples, one for each pair of atoms of UA
+        whose entry in AC equals 1
+    """
+    return [tuple(sorted([first, second]))
+            for first, second in itertools.combinations(UA, 2)
+            if AC[first, second] == 1]
+
+
+def get_UA_pairs(UA, AC, use_graph=True):
+    """
+    Propose sets of bonded unsaturated atom pairs
+
+    args:
+        UA - indices of the unsaturated atoms
+        AC - adjacency matrix
+
+    optional:
+        use_graph - when true a single matching computed by networkx is
+            returned; otherwise all combinations of len(UA)//2 bonds are
+            scanned and those covering the most atoms are returned
+
+    returns:
+        list of pair collections; [()] when no two atoms of UA are bonded
+    """
+
+    bonds = get_bonds(UA, AC)
+
+    if len(bonds) == 0:
+        return [()]
+
+    if use_graph:
+        G = nx.Graph()
+        G.add_edges_from(bonds)
+        UA_pairs = [list(nx.max_weight_matching(G))]
+        return UA_pairs
+
+    max_atoms_in_combo = 0
+    UA_pairs = [()]
+    # iterate lazily: the number of combinations grows very quickly, so the
+    # whole sequence must not be copied into a list first
+    for combo in itertools.combinations(bonds, len(UA) // 2):
+        atoms_in_combo = len({atom for bond in combo for atom in bond})
+        if atoms_in_combo > max_atoms_in_combo:
+            max_atoms_in_combo = atoms_in_combo
+            UA_pairs = [combo]
+
+        elif atoms_in_combo == max_atoms_in_combo:
+            UA_pairs.append(combo)
+
+    return UA_pairs
+
+
+def AC2BO(AC, atoms, charge, allow_charged_fragments=True, use_graph=True):
+    """
+
+    implementation of algorithm shown in Figure 2
+
+    UA: unsaturated atoms
+
+    DU: degree of unsaturation (u matrix in Figure)
+
+    best_BO: Bcurr in Figure
+
+    args:
+        AC - adjacency matrix
+        atoms - list of atomic numbers
+        charge - requested total charge
+
+    optional:
+        allow_charged_fragments - forwarded to the charge checks
+        use_graph - forwarded to get_UA_pairs / get_BO
+
+    returns:
+        (BO, atomic_valence_electrons) - the first bond order matrix that
+        passes BO_is_OK, otherwise the best candidate found (AC itself when
+        no candidate qualified)
+
+    raises:
+        ValueError - when an atom's element has no entry in atomic_valence
+
+    """
+
+    global atomic_valence
+    global atomic_valence_electrons
+
+    # make a list of valences, e.g. for CO: [[4],[2,1]]
+    valences_list_of_lists = []
+    AC_valence = list(AC.sum(axis=1))
+
+    for i,(atomicNum,valence) in enumerate(zip(atoms,AC_valence)):
+        allowed_valences = atomic_valence[atomicNum]
+        if not allowed_valences:
+            # Without this check the message below fails on max([]) with the
+            # unrelated "max() arg is an empty sequence" (also a ValueError).
+            raise ValueError('No valences are defined for atom {} with atomic number {}'.format(i, atomicNum))
+        # valence can't be smaller than number of neighbours
+        possible_valence = [x for x in allowed_valences if x >= valence]
+        if not possible_valence:
+            # NOTE(review): exits the interpreter from library code; kept as is
+            # because callers may rely on it
+            print('Valence of atom',i,'is',valence,'which bigger than allowed max',max(allowed_valences),'. Stopping')
+            sys.exit()
+        valences_list_of_lists.append(possible_valence)
+
+    # convert [[4],[2,1]] to [[4,2],[4,1]]
+    valences_list = itertools.product(*valences_list_of_lists)
+
+    best_BO = AC.copy()
+
+    for valences in valences_list:
+
+        UA, DU_from_AC = get_UA(valences, AC_valence)
+
+        # no unsaturated atoms: AC itself may already be the answer
+        if len(UA) == 0 and BO_is_OK(AC, AC, charge, DU_from_AC,
+                atomic_valence_electrons, atoms, valences,
+                allow_charged_fragments=allow_charged_fragments):
+            return AC, atomic_valence_electrons
+
+        UA_pairs_list = get_UA_pairs(UA, AC, use_graph=use_graph)
+        for UA_pairs in UA_pairs_list:
+            BO = get_BO(AC, UA, DU_from_AC, valences, UA_pairs, use_graph=use_graph)
+            status = BO_is_OK(BO, AC, charge, DU_from_AC,
+                        atomic_valence_electrons, atoms, valences,
+                        allow_charged_fragments=allow_charged_fragments)
+
+            if status:
+                return BO, atomic_valence_electrons
+
+            # the charge check is only needed for the fallback candidate,
+            # so it is evaluated last and only when the cheaper tests pass
+            if (BO.sum() >= best_BO.sum()
+                    and valences_not_too_large(BO, valences)
+                    and charge_is_OK(BO, AC, charge, DU_from_AC, atomic_valence_electrons, atoms, valences,
+                                     allow_charged_fragments=allow_charged_fragments)):
+                best_BO = BO.copy()
+
+    return best_BO, atomic_valence_electrons
+
+
+def AC2mol(mol, AC, atoms, charge, allow_charged_fragments=True, use_graph=True):
+    """
+
+    atom connectivity (AC) matrix to rdkit molecules, one per resonance form
+
+    args:
+        mol - rdkit molobj holding the atoms (handed on to BO2mol)
+        AC - atom connectivity matrix
+        atoms - int atom types
+        charge - total molecule charge
+
+    optional:
+        allow_charged_fragments - forwarded to AC2BO and BO2mol
+        use_graph - forwarded to AC2BO
+
+    returns
+        mols - list of rdkit molobjs (resonance structures); an empty list
+               if the formal charge of the built molecule differs from
+               the requested charge
+
+    """
+
+    # convert AC matrix to bond order (BO) matrix
+    BO, atomic_valence_electrons = AC2BO(
+        AC,
+        atoms,
+        charge,
+        allow_charged_fragments=allow_charged_fragments,
+        use_graph=use_graph)
+
+    # add BO connectivity and charge info to mol object
+    mol = BO2mol(
+        mol,
+        BO,
+        atoms,
+        atomic_valence_electrons,
+        charge,
+        allow_charged_fragments=allow_charged_fragments)
+
+    # If charge is not correct don't return mol
+    if Chem.GetFormalCharge(mol) != charge:
+        return []
+
+    # BO2mol returns an arbitrary resonance form. Let's make the rest.
+    # ResonanceMolSupplier takes (mol, flags, maxStructs): the flags have to
+    # be OR-ed into ONE argument. Passing the second flag as a separate
+    # positional argument silently used it as maxStructs instead.
+    resonance_flags = Chem.UNCONSTRAINED_CATIONS | Chem.UNCONSTRAINED_ANIONS
+    supplier = rdchem.ResonanceMolSupplier(mol, resonance_flags)
+
+    # distinct loop name so the `mol` argument is not shadowed
+    mols = [resonance_mol for resonance_mol in supplier]
+
+    return mols
+
+
+def get_proto_mol(atoms):
+    """
+
+    Build a mol object that contains one atom per entry of atoms and no bonds
+
+    args:
+        atoms - int atom types; must hold at least one entry
+
+    returns
+        mol - rdkit molecule
+
+    """
+    # the first atom seeds the molecule through a one-atom SMARTS pattern
+    seed = Chem.MolFromSmarts("[#{}]".format(atoms[0]))
+
+    # every remaining atom is appended through an editable copy
+    editable = Chem.RWMol(seed)
+    for atomic_number in atoms[1:]:
+        editable.AddAtom(Chem.Atom(atomic_number))
+
+    return editable.GetMol()
+
+
+def read_xyz_file(filename, look_for_charge=True):
+    """
+
+    Read atoms, charge and coordinates from an xyz file.
+
+    Line 1 holds the number of atoms, line 2 is a title line that may
+    contain "charge=<int>", and each following line holds "symbol x y z".
+
+    args:
+        filename - path of the xyz file
+
+    optional:
+        look_for_charge - read the charge from a "charge=" entry in the
+                          title line (default: True)
+
+    returns
+        atoms - list of int atom types
+        charge - charge found in the title line, otherwise 0
+        xyz_coordinates - list of [x, y, z] floats, one entry per atom
+
+    raises
+        ValueError - if the atom count, the charge or a coordinate cannot be
+                     parsed, or an atom line has fewer than four columns
+
+    """
+
+    atomic_symbols = []
+    xyz_coordinates = []
+    charge = 0
+    num_atoms = 0
+
+    with open(filename, "r") as file:
+        for line_number, line in enumerate(file):
+            if line_number == 0:
+                num_atoms = int(line)
+            elif line_number == 1:
+                # the flag used to be ignored; honour it now
+                if look_for_charge and "charge=" in line:
+                    charge = int(line.split("=")[1])
+            elif len(atomic_symbols) >= num_atoms:
+                # all announced atoms are read: trailing blank lines or
+                # further frames must not reach the unpacking below
+                break
+            else:
+                # only the first four columns are used, extra ones are ignored
+                atomic_symbol, x, y, z = line.split()[:4]
+                atomic_symbols.append(atomic_symbol)
+                xyz_coordinates.append([float(x), float(y), float(z)])
+
+    atoms = [int_atom(atom) for atom in atomic_symbols]
+
+    return atoms, charge, xyz_coordinates
+
+
+def xyz2AC(atoms, xyz, charge, use_huckel=False):
+    """
+
+    Dispatch to one of the two atom connectivity (AC) builders
+
+    args:
+        atoms - int atom types
+        xyz - coordinates
+        charge - molecule charge (only handed to the Huckel builder)
+
+    optional:
+        use_huckel - use xyz2AC_huckel instead of xyz2AC_vdW
+
+    returns
+        ac - atom connectivity matrix
+        mol - rdkit molecule
+
+    """
+
+    # default path: the distance based builder does not need the charge
+    if not use_huckel:
+        return xyz2AC_vdW(atoms, xyz)
+
+    return xyz2AC_huckel(atoms, xyz, charge)
+
+
+def xyz2AC_vdW(atoms, xyz):
+    """
+
+    Atom connectivity from interatomic distances (see get_AC)
+
+    args:
+        atoms - int atom types
+        xyz - coordinates, indexed as xyz[i][0], xyz[i][1], xyz[i][2]
+
+    returns
+        ac - atom connectivity matrix
+        mol - rdkit molecule carrying the coordinates as a conformer
+
+    """
+
+    # bond-less template with one atom per entry of atoms
+    mol = get_proto_mol(atoms)
+    atom_count = mol.GetNumAtoms()
+
+    # attach the coordinates as a conformer
+    conformer = Chem.Conformer(atom_count)
+    for idx in range(atom_count):
+        point = xyz[idx]
+        conformer.SetAtomPosition(idx, (point[0], point[1], point[2]))
+    mol.AddConformer(conformer)
+
+    return get_AC(mol), mol
+
+
+def get_AC(mol, covalent_factor=1.3):
+    """
+
+    Generate the adjacency matrix of a molecule from its 3D distances.
+
+    Two atoms count as bonded when their distance is not larger than the
+    sum of their covalent radii, each radius scaled by covalent_factor.
+    AC is a (num_atoms, num_atoms) int matrix, 1 for a bond and 0 otherwise.
+
+    args:
+        mol - rdkit molobj with 3D conformer
+
+    optional
+        covalent_factor - scaling of the covalent radii (1.3 is an
+                          arbitrary factor)
+
+    returns:
+        AC - adjacency matrix
+
+    """
+
+    distances = Chem.Get3DDistanceMatrix(mol)
+    periodic_table = Chem.GetPeriodicTable()
+    num_atoms = mol.GetNumAtoms()
+
+    # scaled covalent radius of every atom, looked up once per atom
+    scaled_radii = [
+        periodic_table.GetRcovalent(mol.GetAtomWithIdx(idx).GetAtomicNum()) * covalent_factor
+        for idx in range(num_atoms)
+    ]
+
+    AC = np.zeros((num_atoms, num_atoms), dtype=int)
+
+    # visit every unordered pair once and mirror the entry
+    for first in range(num_atoms):
+        for second in range(first + 1, num_atoms):
+            if distances[first, second] <= scaled_radii[first] + scaled_radii[second]:
+                AC[first, second] = AC[second, first] = 1
+
+    return AC
+
+
+def xyz2AC_huckel(atomicNumList,xyz,charge):
+    """
+
+    Atom connectivity from the reduced overlap population matrix returned
+    by rdEHTTools.RunMol
+
+    args
+        atomicNumList - atom type list
+        xyz - coordinates
+        charge - molecule charge
+
+    returns
+        ac - atom connectivity
+        mol - rdkit molecule
+
+    """
+    mol = get_proto_mol(atomicNumList)
+    mol_atom_count = mol.GetNumAtoms()
+
+    # attach the coordinates as a conformer
+    conformer = Chem.Conformer(mol_atom_count)
+    for idx in range(mol_atom_count):
+        point = xyz[idx]
+        conformer.SetAtomPosition(idx, (point[0], point[1], point[2]))
+    mol.AddConformer(conformer)
+
+    num_atoms = len(atomicNumList)
+    AC = np.zeros((num_atoms, num_atoms), dtype=int)
+
+    # the calculation runs on a copy; the mol charge is arbitrarily added
+    # to the 1st atom
+    mol_huckel = Chem.Mol(mol)
+    mol_huckel.GetAtomWithIdx(0).SetFormalCharge(charge)
+
+    # NOTE(review): `passed` is never inspected - confirm what `result`
+    # holds when the calculation fails
+    passed, result = rdEHTTools.RunMol(mol_huckel)
+    opop = result.GetReducedOverlapPopulationMatrix()
+
+    # lower triangular to square matrix
+    lower_mask = np.tril(np.ones((num_atoms, num_atoms), dtype=bool))
+    tri = np.zeros((num_atoms, num_atoms))
+    tri[lower_mask] = opop
+
+    for col in range(num_atoms):
+        for row in range(col + 1, num_atoms):
+            # arbitrary cutoff for bond. May need adjustment
+            if abs(tri[row, col]) >= 0.15:
+                AC[col, row] = AC[row, col] = 1
+
+    return AC, mol
+
+
+def chiral_stereo_check(mol):
+    """
+    Sanitize the molecule, then detect and assign its stereo information.
+    The molecule is modified in place and nothing is returned.
+
+    args:
+        mol - rdkit molecule, with embedded conformer
+
+    """
+    # second positional argument of both structure based calls
+    # (presumably the conformer id - confirm against the RDKit docs)
+    conformer_id = -1
+
+    Chem.SanitizeMol(mol)
+    Chem.DetectBondStereochemistry(mol, conformer_id)
+    Chem.AssignStereochemistry(mol, force=True, flagPossibleStereoCenters=True)
+    Chem.AssignAtomChiralTagsFromStructure(mol, conformer_id)
+
+
+def xyz2mol(atoms, coordinates,
+    charge=0,
+    allow_charged_fragments=True,
+    use_graph=True,
+    use_huckel=False,
+    embed_chiral=True):
+    """
+    Turn atoms, coordinates and a total charge into rdkit molobjs.
+
+    args:
+        atoms - list of atom types (int)
+        coordinates - Cartesian coordinates, one [x, y, z] entry per atom
+        charge - total charge of the system (default: 0)
+
+    optional:
+        allow_charged_fragments - alternatively radicals are made
+        use_graph - use graph (networkx)
+        use_huckel - Use Huckel method for atom connectivity prediction
+        embed_chiral - embed chiral information to the molecule
+
+    returns:
+        mols - list of rdkit molobjects
+
+    """
+
+    # Step 1: connectivity matrix plus a mol object without any bonds
+    AC, mol = xyz2AC(atoms, coordinates, charge, use_huckel=use_huckel)
+
+    # Step 2: bond orders and charges are added, giving the candidate mols
+    candidates = AC2mol(
+        mol, AC, atoms, charge,
+        allow_charged_fragments=allow_charged_fragments,
+        use_graph=use_graph)
+
+    if not embed_chiral:
+        return candidates
+
+    # Step 3: stereocenters and chiral centers, set on each mol in place
+    for candidate in candidates:
+        chiral_stereo_check(candidate)
+
+    return candidates
+
+
+def main():
+    """
+    Empty placeholder; does nothing and returns None.
+    """
+    return None
+
+
+if __name__ == "__main__":
+
+    # Command line entry point: read an xyz file, build the molobjs and
+    # print each of them as SMILES (default) or as an SDF mol block.
+
+    import argparse
+
+    parser = argparse.ArgumentParser(usage='%(prog)s [options] molecule.xyz')
+    parser.add_argument('structure', metavar='structure', type=str)
+    parser.add_argument('-s', '--sdf',
+        action="store_true",
+        help="Dump sdf file")
+    parser.add_argument('--ignore-chiral',
+        action="store_true",
+        help="Ignore chiral centers")
+    parser.add_argument('--no-charged-fragments',
+        action="store_true",
+        help="Allow radicals to be made")
+    parser.add_argument('--no-graph',
+        action="store_true",
+        help="Run xyz2mol without networkx dependencies")
+
+    # huckel uses extended Huckel bond orders to locate bonds (requires RDKit 2019.9.1 or later)
+    # otherwise van der Waals radii are used
+    parser.add_argument('--use-huckel',
+        action="store_true",
+        help="Use Huckel method for atom connectivity")
+    # the help used to claim default=sdf although SMILES is printed unless
+    # sdf is requested explicitly
+    parser.add_argument('-o', '--output-format',
+        action="store",
+        type=str,
+        help="Output format [smiles,sdf] (default=smiles)")
+    parser.add_argument('-c', '--charge',
+        action="store",
+        metavar="int",
+        type=int,
+        help="Total charge of the system")
+
+    args = parser.parse_args()
+
+    # read xyz file
+    filename = args.structure
+
+    # allow for charged fragments, alternatively radicals are made
+    charged_fragments = not args.no_charged_fragments
+
+    # the graph based search is used unless --no-graph is given
+    quick = not args.no_graph
+
+    # chiral information is embedded unless --ignore-chiral is given
+    embed_chiral = not args.ignore_chiral
+
+    # read atoms and coordinates. Try to find the charge
+    atoms, charge, xyz_coordinates = read_xyz_file(filename)
+
+    # huckel uses extended Huckel bond orders to locate bonds (requires RDKit 2019.9.1 or later)
+    # otherwise van der Waals radii are used
+    use_huckel = args.use_huckel
+
+    # an explicit charge from the command line overrides the one found in
+    # the file (argparse already converted it to int)
+    if args.charge is not None:
+        charge = args.charge
+
+    # -s/--sdf used to be parsed but never consulted; it now selects the
+    # same output as "-o sdf"
+    dump_sdf = args.sdf or args.output_format == "sdf"
+
+    # Get the molobjs
+    mols = xyz2mol(atoms, xyz_coordinates,
+        charge=charge,
+        use_graph=quick,
+        allow_charged_fragments=charged_fragments,
+        embed_chiral=embed_chiral,
+        use_huckel=use_huckel)
+
+    # Print output
+    for mol in mols:
+        if dump_sdf:
+            txt = Chem.MolToMolBlock(mol)
+            print(txt)
+
+        else:
+            # Canonical hack: write, re-read and write again
+            isomeric_smiles = embed_chiral
+            smiles = Chem.MolToSmiles(mol, isomericSmiles=isomeric_smiles)
+            m = Chem.MolFromSmiles(smiles)
+            smiles = Chem.MolToSmiles(m, isomericSmiles=isomeric_smiles)
+            print(smiles)