diff --git a/GetFunctionalGroups.ipynb b/GetFunctionalGroups.ipynb new file mode 100644 index 0000000..9651451 --- /dev/null +++ b/GetFunctionalGroups.ipynb @@ -0,0 +1,365 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from rdkit import Chem\n", + "from xyz2mol import xyz2mol" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# need ncharges, coords" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "NUCLEAR_CHARGE = {\n", + " \"H\":1,\n", + " \"C\":6,\n", + " \"O\":8,\n", + " \"N\":7,\n", + " \"F\":9,\n", + " \"S\":16\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [], + "source": [ + "def read_xyz(filename):\n", + " with open(filename, \"r\") as f:\n", + " lines = f.readlines()\n", + "\n", + " natoms = int(lines[0])\n", + " nuclear_charges = []\n", + " coordinates = []\n", + "\n", + " for i, line in enumerate(lines[2:natoms+2]):\n", + " tokens = line.split()\n", + "\n", + " if len(tokens) < 4:\n", + " break\n", + "\n", + " nuclear_charges.append(NUCLEAR_CHARGE[tokens[0]])\n", + " coordinates.append([float(token) for token in tokens[1:4]])\n", + " \n", + " return nuclear_charges, coordinates" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [], + "source": [ + "def xyzfile_to_mol(filename):\n", + " ncharges, coords = read_xyz(filename)\n", + " mols = xyz2mol(ncharges, coords)\n", + " return mols[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [], + "source": [ + "from glob import glob" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": {}, + "outputs": [], + "source": [ + 
"frag_files = [x for x in sorted(glob(\"qm7/*\"))]" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [], + "source": [ + "target_files = [x for x in sorted(glob(\"targets/*\"))]" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [], + "source": [ + "frag_mols = [xyzfile_to_mol(x) for x in frag_files]" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [], + "source": [ + "target_mols = [xyzfile_to_mol(x) for x in target_files]" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": {}, + "outputs": [], + "source": [ + "# for func group matches need to use SMARTS and \n", + "# func_group = Chem.MolFromSmarts(\"smarts\")\n", + "# mol.GetSubStructMatches(func_group)\n", + "# this returns tuples of matches (then count)" + ] + }, + { + "cell_type": "code", + "execution_count": 150, + "metadata": {}, + "outputs": [], + "source": [ + "# need to identify list of relevant functional groups" + ] + }, + { + "cell_type": "code", + "execution_count": 208, + "metadata": {}, + "outputs": [], + "source": [ + "functional_groups = {\n", + " 'arene' : 'c',\n", + " 'allenic C' : '[$([CX2](=C)=C)]',\n", + " 'vinylic C' : '[$([CX3]=[CX3])]',\n", + " 'acetylenic C' : '[$([CX2]#C)]',\n", + " \"carbonyl\" : '[$([CX3]=[OX1]),$([CX3+]-[OX1-])]',\n", + " 'aldeyhde' : '[CX3H1](=O)[#6]',\n", + " 'amide' : '[NX3][CX3](=[OX1])[#6]',\n", + " 'carboxylic acid': '[CX3](=O)[OX2H1]',\n", + " 'ester' : '[#6][CX3](=O)[OX2H0][#6]',\n", + " 'ketone' : '[#6][CX3](=O)[#6]',\n", + " 'ether' : '[OD2]([#6])[#6]',\n", + " 'azo general' : '[#7]',\n", + " 'amine' : '[NX3;H2,H1;!$(NC=O)]',\n", + " 'enamine' : '[NX3][CX3]=[CX3]',\n", + " 'imine' : '[$([CX3]([#6])[#6]),$([CX3H][#6])]=[$([NX2][#6]),$([NX2H])]',\n", + " 'nitrate' : '[$([NX3](=[OX1])(=[OX1])O),$([NX3+]([OX1-])(=[OX1])O)]',\n", + " 'nitrile' : '[NX1]#[CX2]',\n", + " 'nitro' : 
'[$([NX3](=O)=O),$([NX3+](=O)[O-])][!#8]',\n", + " 'alcohol' : '[#6][OX2H]',\n", + " 'enol' : '[OX2H][#6X3]=[#6]',\n", + " 'phenol' : '[OX2H][cX3]:[c]'\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 216, + "metadata": {}, + "outputs": [], + "source": [ + "with open('functional_groups.txt', 'w') as f:\n", + " for label, fg in functional_groups.items():\n", + " f.write(label+' '+fg+'\\n')" + ] + }, + { + "cell_type": "code", + "execution_count": 200, + "metadata": {}, + "outputs": [], + "source": [ + "def get_fg_count(mol, functional_groups):\n", + " fg_count = []\n", + " for label, fg in functional_groups.items():\n", + " fg_mol = Chem.MolFromSmarts(fg)\n", + " match = mol.GetSubstructMatches(fg_mol)\n", + " fg_count.append(len(match))\n", + " return fg_count " + ] + }, + { + "cell_type": "code", + "execution_count": 201, + "metadata": {}, + "outputs": [], + "source": [ + "fg_counts_targets = [get_fg_count(x, functional_groups) for x in target_mols]" + ] + }, + { + "cell_type": "code", + "execution_count": 203, + "metadata": {}, + "outputs": [], + "source": [ + "fg_counts_frags = [get_fg_count(x, functional_groups) for x in frag_mols]" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [], + "source": [ + "# get adj matrices" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": {}, + "outputs": [], + "source": [ + "frag_adj_matrices = [Chem.rdmolops.GetAdjacencyMatrix(x) for x in frag_mols]" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": {}, + "outputs": [], + "source": [ + "target_adj_matrices = [Chem.rdmolops.GetAdjacencyMatrix(x) for x in target_mols]" + ] + }, + { + "cell_type": "code", + "execution_count": 205, + "metadata": {}, + "outputs": [], + "source": [ + "# save everything" + ] + }, + { + "cell_type": "code", + "execution_count": 217, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"/home/puck/anaconda3/envs/rdkit/lib/python3.7/site-packages/ipykernel_launcher.py:1: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", + " \"\"\"Entry point for launching an IPython kernel.\n", + "/home/puck/anaconda3/envs/rdkit/lib/python3.7/site-packages/ipykernel_launcher.py:2: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", + " \n" + ] + } + ], + "source": [ + "data = [np.array(fg_counts_targets), np.array(fg_counts_frags), np.array(frag_adj_matrices),\n", + " np.array(target_adj_matrices)]" + ] + }, + { + "cell_type": "code", + "execution_count": 222, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/puck/anaconda3/envs/rdkit/lib/python3.7/site-packages/numpy/core/_asarray.py:136: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. 
If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", + " return array(a, dtype, copy=False, order=order, subok=True)\n" + ] + } + ], + "source": [ + "np.savez('connectivity_data.npz',fg_counts_targets=fg_counts_targets,\n", + " fg_counts_frags=fg_counts_frags,\n", + " frag_adj_matrices=frag_adj_matrices,\n", + " target_adj_matrices=target_adj_matrices)" + ] + }, + { + "cell_type": "code", + "execution_count": 223, + "metadata": {}, + "outputs": [], + "source": [ + "container = np.load('connectivity_data.npz')" + ] + }, + { + "cell_type": "code", + "execution_count": 224, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['fg_counts_targets',\n", + " 'fg_counts_frags',\n", + " 'frag_adj_matrices',\n", + " 'target_adj_matrices']" + ] + }, + "execution_count": 224, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(container.keys())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/README.md b/README.md index 303fc8b..039c1ea 100644 --- a/README.md +++ b/README.md @@ -1,18 +1,31 @@ # Data +## Structures The matrices for 3 target structures (to synthesize) and a database of 7165 query structures (to combine to build the target) are compressed in `data.npz` Within python, it can be read like: ``` data = np.load("data.npz", allow_pickle=True) ``` where `data.files` will return the names of the numpy arrays (should be `target_labels, target_CMs, target_ncharges, database_labels, database_CMs, 
database_ncharges`) where CMs are the matrices (of target and database respectively) and the corresponding arrays can be accessed like: ``` data["target_labels"] ``` For more details see the documentation: https://het.as.utexas.edu/HET/Software/Numpy/reference/generated/numpy.savez.html + +## Connectivity / functional group information +Adjacency matrices and functional group information derived from the connectivity are compressed in `connectivity_data.npz`. + +Within python, it can be read like: +``` +connectivity_data = np.load("connectivity_data.npz", allow_pickle=True) +``` + +The corresponding keys are `fg_counts_targets` for the functional group counts of each of the 3 target molecules, `fg_counts_frags` for the functional group counts of +each of the fragment molecules, `frag_adj_matrices` for the adjacency matrices of the fragments and `target_adj_matrices` for the adjacency matrices of the target molecules. +The order of the molecules is the same as in `data`, which contains the structures; each vector of functional group counts follows the order of the groups listed in `functional_groups.txt`. diff --git a/__pycache__/xyz2mol.cpython-37.pyc b/__pycache__/xyz2mol.cpython-37.pyc new file mode 100644 index 0000000..59d7561 Binary files /dev/null and b/__pycache__/xyz2mol.cpython-37.pyc differ diff --git a/connectivity_data.npz b/connectivity_data.npz new file mode 100644 index 0000000..b68f636 Binary files /dev/null and b/connectivity_data.npz differ diff --git a/functional_groups.txt b/functional_groups.txt new file mode 100644 index 0000000..9d72a0b --- /dev/null +++ b/functional_groups.txt @@ -0,0 +1,21 @@ +arene c +allenic C [$([CX2](=C)=C)] +vinylic C [$([CX3]=[CX3])] +acetylenic C [$([CX2]#C)] +carbonyl [$([CX3]=[OX1]),$([CX3+]-[OX1-])] +aldeyhde [CX3H1](=O)[#6] +amide [NX3][CX3](=[OX1])[#6] +carboxylic acid [CX3](=O)[OX2H1] +ester [#6][CX3](=O)[OX2H0][#6] +ketone [#6][CX3](=O)[#6] +ether [OD2]([#6])[#6] +azo general [#7] +amine [NX3;H2,H1;!$(NC=O)] +enamine [NX3][CX3]=[CX3] +imine [$([CX3]([#6])[#6]),$([CX3H][#6])]=[$([NX2][#6]),$([NX2H])] +nitrate 
[$([NX3](=[OX1])(=[OX1])O),$([NX3+]([OX1-])(=[OX1])O)] +nitrile [NX1]#[CX2] +nitro [$([NX3](=O)=O),$([NX3+](=O)[O-])][!#8] +alcohol [#6][OX2H] +enol [OX2H][#6X3]=[#6] +phenol [OX2H][cX3]:[c] diff --git a/onepass.py b/onepass.py index 71f9e5b..4de86e1 100644 --- a/onepass.py +++ b/onepass.py @@ -1,163 +1,163 @@ import numpy as np import timeit import gurobipy as gp from gurobipy import GRB def addvariables(Z): upperbounds=[] I=[] for M in database_indices: m=len(data['database_ncharges'][M]) I=I+[(i,j,M) for i in range(m) for j in range(n)] upperbounds.append(int(n/m)) x=Z.addVars(I, vtype=GRB.BINARY) # dummy variables y associated to number of times molecule M is picked y=Z.addVars(database_indices, vtype="I", lb=0, ub=upperbounds) print("Variables added.") return x,y,I def addconstraints(Z,x,y): # injection into [n] Z.addConstrs(x.sum('*',j,'*') <= 1 for j in range(n)) # use at least 80% of target Z.addConstr(x.sum() >= 0.8*n) # additional constraints: take whole molecule or leave it out, and matching charges for M in database_indices: CM=data['database_ncharges'][M] m=len(CM) # the number of indices of M used (counting multiple use) is at least 55% of its size Z.addConstr(x.sum('*','*',M) >= 0.55*m*y[M]) # each index in M is used at most y[M] times Z.addConstrs(x.sum(i,'*',M) <= y[M] for i in range(m)) # ignore incompatible charges and atoms of charge 1 for i in range(m): for j in range(n): if(CM[i] != CT[j]): #if(CM[i] != CT[j] or CM[i]==1): Z.addConstr(x[i,j,M]==0) print("Constraints added.") return 0 def setobjective(Z,x,y): expr=gp.QuadExpr() print("Constructing objective function... 
") key=0 for M in database_indices: key=key+1 Mol=data['database_CMs'][M] m=len(Mol) expr.addTerms(-m, y[M]) for i in range(m): for j in range(m): for k in range(n): for l in range(k,n): expr.add(x[i,k,M] * x[j,l,M], np.abs(T[k,l]-Mol[i,j])**2) print(key, " / ", size_database) Z.setObjective(expr, GRB.MINIMIZE) print("Objective function set.") return 0 # prints mappings of positions (indices+1) of each molecule (before preprocess) to positions inside target (before preprocess, but the hydrogens are at the end anyway) def print_sols(Z, x, y): SolCount=Z.SolCount for solnb in range(SolCount): print() print("--------------------------------") Z.setParam("SolutionNumber",solnb) print("Solution number", solnb+1, ", objective value", Z.PoolObjVal) for M in database_indices: amount_picked=int(np.rint(y[M].Xn)) if amount_picked != 0: m=len(data['database_ncharges'][M]) U=np.zeros((m,amount_picked)) # constructing U for i in range(m): k=0 for j in range(n): if x[i,j,M].Xn==1 and sum(U[:,k]!=0) < 0.55*m: U[i,k]=j+1 k=k+1 # reading U for k in range(amount_picked): if np.any(U[:,k] != 0): if k==0: print("Molecule", data['database_labels'][M], "has been picked", amount_picked, "time(s) ( size", len(data['database_ncharges'][M]), ", used", sum([x[i,j,M].Xn for i in range(m) for j in range(n)]), ")") print(k+1, end=": ") for i in range(m): if U[i,k]!=0: print(oldindex(i,M)+1, "->", U[i,k], end=", ") print("used", sum(U[:,k]!=0)) # converts new index (with hydrogens removed) to old index. 
-def oldindex(i, M, target): +def oldindex(i, M): k=0 notones=0 CM=olddata['database_ncharges'][M] while notones 0: + continue + UA.append(i) + DU.append(maxValence - valence) + return UA, DU + + +def get_BO(AC, UA, DU, valences, UA_pairs, use_graph=True): + """ + """ + BO = AC.copy() + DU_save = [] + + while DU_save != DU: + for i, j in UA_pairs: + BO[i, j] += 1 + BO[j, i] += 1 + + BO_valence = list(BO.sum(axis=1)) + DU_save = copy.copy(DU) + UA, DU = get_UA(valences, BO_valence) + UA_pairs = get_UA_pairs(UA, AC, use_graph=use_graph)[0] + + return BO + + +def valences_not_too_large(BO, valences): + """ + """ + number_of_bonds_list = BO.sum(axis=1) + for valence, number_of_bonds in zip(valences, number_of_bonds_list): + if number_of_bonds > valence: + return False + + return True + +def charge_is_OK(BO, AC, charge, DU, atomic_valence_electrons, atoms, valences, + allow_charged_fragments=True): + # total charge + Q = 0 + + # charge fragment list + q_list = [] + + if allow_charged_fragments: + + BO_valences = list(BO.sum(axis=1)) + for i, atom in enumerate(atoms): + q = get_atomic_charge(atom, atomic_valence_electrons[atom], BO_valences[i]) + Q += q + if atom == 6: + number_of_single_bonds_to_C = list(BO[i, :]).count(1) + if number_of_single_bonds_to_C == 2 and BO_valences[i] == 2: + Q += 1 + q = 2 + if number_of_single_bonds_to_C == 3 and Q + 1 < charge: + Q += 2 + q = 1 + + if q != 0: + q_list.append(q) + + return (charge == Q) + +def BO_is_OK(BO, AC, charge, DU, atomic_valence_electrons, atoms, valences, + allow_charged_fragments=True): + """ + Sanity of bond-orders + + args: + BO - + AC - + charge - + DU - + + + optional + allow_charges_fragments - + + + returns: + boolean - true of molecule is OK, false if not + """ + + if not valences_not_too_large(BO, valences): + return False + + check_sum = (BO - AC).sum() == sum(DU) + check_charge = charge_is_OK(BO, AC, charge, DU, atomic_valence_electrons, atoms, valences, + allow_charged_fragments) + + if check_charge and 
check_sum: + return True + + return False + + +def get_atomic_charge(atom, atomic_valence_electrons, BO_valence): + """ + """ + + if atom == 1: + charge = 1 - BO_valence + elif atom == 5: + charge = 3 - BO_valence + elif atom == 15 and BO_valence == 5: + charge = 0 + elif atom == 16 and BO_valence == 6: + charge = 0 + else: + charge = atomic_valence_electrons - 8 + BO_valence + + return charge + + +def clean_charges(mol): + """ + This hack should not be needed anymore, but is kept just in case + + """ + + Chem.SanitizeMol(mol) + #rxn_smarts = ['[N+:1]=[*:2]-[C-:3]>>[N+0:1]-[*:2]=[C-0:3]', + # '[N+:1]=[*:2]-[O-:3]>>[N+0:1]-[*:2]=[O-0:3]', + # '[N+:1]=[*:2]-[*:3]=[*:4]-[O-:5]>>[N+0:1]-[*:2]=[*:3]-[*:4]=[O-0:5]', + # '[#8:1]=[#6:2]([!-:6])[*:3]=[*:4][#6-:5]>>[*-:1][*:2]([*:6])=[*:3][*:4]=[*+0:5]', + # '[O:1]=[c:2][c-:3]>>[*-:1][*:2][*+0:3]', + # '[O:1]=[C:2][C-:3]>>[*-:1][*:2]=[*+0:3]'] + + rxn_smarts = ['[#6,#7:1]1=[#6,#7:2][#6,#7:3]=[#6,#7:4][CX3-,NX3-:5][#6,#7:6]1=[#6,#7:7]>>' + '[#6,#7:1]1=[#6,#7:2][#6,#7:3]=[#6,#7:4][-0,-0:5]=[#6,#7:6]1[#6-,#7-:7]', + '[#6,#7:1]1=[#6,#7:2][#6,#7:3](=[#6,#7:4])[#6,#7:5]=[#6,#7:6][CX3-,NX3-:7]1>>' + '[#6,#7:1]1=[#6,#7:2][#6,#7:3]([#6-,#7-:4])=[#6,#7:5][#6,#7:6]=[-0,-0:7]1'] + + fragments = Chem.GetMolFrags(mol,asMols=True,sanitizeFrags=False) + + for i, fragment in enumerate(fragments): + for smarts in rxn_smarts: + patt = Chem.MolFromSmarts(smarts.split(">>")[0]) + while fragment.HasSubstructMatch(patt): + rxn = AllChem.ReactionFromSmarts(smarts) + ps = rxn.RunReactants((fragment,)) + fragment = ps[0][0] + Chem.SanitizeMol(fragment) + if i == 0: + mol = fragment + else: + mol = Chem.CombineMols(mol, fragment) + + return mol + + +def BO2mol(mol, BO_matrix, atoms, atomic_valence_electrons, + mol_charge, allow_charged_fragments=True): + """ + based on code written by Paolo Toscani + + From bond order, atoms, valence structure and total charge, generate an + rdkit molecule. 
+ + args: + mol - rdkit molecule + BO_matrix - bond order matrix of molecule + atoms - list of integer atomic symbols + atomic_valence_electrons - + mol_charge - total charge of molecule + + optional: + allow_charged_fragments - bool - allow charged fragments + + returns + mol - updated rdkit molecule with bond connectivity + + """ + + l = len(BO_matrix) + l2 = len(atoms) + BO_valences = list(BO_matrix.sum(axis=1)) + + if (l != l2): + raise RuntimeError('sizes of adjMat ({0:d}) and Atoms {1:d} differ'.format(l, l2)) + + rwMol = Chem.RWMol(mol) + + bondTypeDict = { + 1: Chem.BondType.SINGLE, + 2: Chem.BondType.DOUBLE, + 3: Chem.BondType.TRIPLE + } + + for i in range(l): + for j in range(i + 1, l): + bo = int(round(BO_matrix[i, j])) + if (bo == 0): + continue + bt = bondTypeDict.get(bo, Chem.BondType.SINGLE) + rwMol.AddBond(i, j, bt) + + mol = rwMol.GetMol() + + if allow_charged_fragments: + mol = set_atomic_charges( + mol, + atoms, + atomic_valence_electrons, + BO_valences, + BO_matrix, + mol_charge) + else: + mol = set_atomic_radicals(mol, atoms, atomic_valence_electrons, BO_valences) + + return mol + + +def set_atomic_charges(mol, atoms, atomic_valence_electrons, + BO_valences, BO_matrix, mol_charge): + """ + """ + q = 0 + for i, atom in enumerate(atoms): + a = mol.GetAtomWithIdx(i) + charge = get_atomic_charge(atom, atomic_valence_electrons[atom], BO_valences[i]) + q += charge + if atom == 6: + number_of_single_bonds_to_C = list(BO_matrix[i, :]).count(1) + if number_of_single_bonds_to_C == 2 and BO_valences[i] == 2: + q += 1 + charge = 0 + if number_of_single_bonds_to_C == 3 and q + 1 < mol_charge: + q += 2 + charge = 1 + + if (abs(charge) > 0): + a.SetFormalCharge(int(charge)) + + #mol = clean_charges(mol) + + return mol + + +def set_atomic_radicals(mol, atoms, atomic_valence_electrons, BO_valences): + """ + + The number of radical electrons = absolute atomic charge + + """ + for i, atom in enumerate(atoms): + a = mol.GetAtomWithIdx(i) + charge = 
get_atomic_charge( + atom, + atomic_valence_electrons[atom], + BO_valences[i]) + + if (abs(charge) > 0): + a.SetNumRadicalElectrons(abs(int(charge))) + + return mol + + +def get_bonds(UA, AC): + """ + + """ + bonds = [] + + for k, i in enumerate(UA): + for j in UA[k + 1:]: + if AC[i, j] == 1: + bonds.append(tuple(sorted([i, j]))) + + return bonds + + +def get_UA_pairs(UA, AC, use_graph=True): + """ + + """ + + bonds = get_bonds(UA, AC) + + if len(bonds) == 0: + return [()] + + if use_graph: + G = nx.Graph() + G.add_edges_from(bonds) + UA_pairs = [list(nx.max_weight_matching(G))] + return UA_pairs + + max_atoms_in_combo = 0 + UA_pairs = [()] + for combo in list(itertools.combinations(bonds, int(len(UA) / 2))): + flat_list = [item for sublist in combo for item in sublist] + atoms_in_combo = len(set(flat_list)) + if atoms_in_combo > max_atoms_in_combo: + max_atoms_in_combo = atoms_in_combo + UA_pairs = [combo] + + elif atoms_in_combo == max_atoms_in_combo: + UA_pairs.append(combo) + + return UA_pairs + + +def AC2BO(AC, atoms, charge, allow_charged_fragments=True, use_graph=True): + """ + + implemenation of algorithm shown in Figure 2 + + UA: unsaturated atoms + + DU: degree of unsaturation (u matrix in Figure) + + best_BO: Bcurr in Figure + + """ + + global atomic_valence + global atomic_valence_electrons + + # make a list of valences, e.g. for CO: [[4],[2,1]] + valences_list_of_lists = [] + AC_valence = list(AC.sum(axis=1)) + + for i,(atomicNum,valence) in enumerate(zip(atoms,AC_valence)): + # valence can't be smaller than number of neighbourgs + possible_valence = [x for x in atomic_valence[atomicNum] if x >= valence] + if not possible_valence: + print('Valence of atom',i,'is',valence,'which bigger than allowed max',max(atomic_valence[atomicNum]),'. 
Stopping') + sys.exit() + valences_list_of_lists.append(possible_valence) + + # convert [[4],[2,1]] to [[4,2],[4,1]] + valences_list = itertools.product(*valences_list_of_lists) + + best_BO = AC.copy() + + for valences in valences_list: + + UA, DU_from_AC = get_UA(valences, AC_valence) + + check_len = (len(UA) == 0) + if check_len: + check_bo = BO_is_OK(AC, AC, charge, DU_from_AC, + atomic_valence_electrons, atoms, valences, + allow_charged_fragments=allow_charged_fragments) + else: + check_bo = None + + if check_len and check_bo: + return AC, atomic_valence_electrons + + UA_pairs_list = get_UA_pairs(UA, AC, use_graph=use_graph) + for UA_pairs in UA_pairs_list: + BO = get_BO(AC, UA, DU_from_AC, valences, UA_pairs, use_graph=use_graph) + status = BO_is_OK(BO, AC, charge, DU_from_AC, + atomic_valence_electrons, atoms, valences, + allow_charged_fragments=allow_charged_fragments) + charge_OK = charge_is_OK(BO, AC, charge, DU_from_AC, atomic_valence_electrons, atoms, valences, + allow_charged_fragments=allow_charged_fragments) + + if status: + return BO, atomic_valence_electrons + elif BO.sum() >= best_BO.sum() and valences_not_too_large(BO, valences) and charge_OK: + best_BO = BO.copy() + + return best_BO, atomic_valence_electrons + + +def AC2mol(mol, AC, atoms, charge, allow_charged_fragments=True, use_graph=True): + """ + """ + + # convert AC matrix to bond order (BO) matrix + BO, atomic_valence_electrons = AC2BO( + AC, + atoms, + charge, + allow_charged_fragments=allow_charged_fragments, + use_graph=use_graph) + + # add BO connectivity and charge info to mol object + mol = BO2mol( + mol, + BO, + atoms, + atomic_valence_electrons, + charge, + allow_charged_fragments=allow_charged_fragments) + + # If charge is not correct don't return mol + if Chem.GetFormalCharge(mol) != charge: + return [] + + # BO2mol returns an arbitrary resonance form. 
Let's make the rest + mols = rdchem.ResonanceMolSupplier(mol, Chem.UNCONSTRAINED_CATIONS, Chem.UNCONSTRAINED_ANIONS) + mols = [mol for mol in mols] + + return mols + + +def get_proto_mol(atoms): + """ + """ + mol = Chem.MolFromSmarts("[#" + str(atoms[0]) + "]") + rwMol = Chem.RWMol(mol) + for i in range(1, len(atoms)): + a = Chem.Atom(atoms[i]) + rwMol.AddAtom(a) + + mol = rwMol.GetMol() + + return mol + + +def read_xyz_file(filename, look_for_charge=True): + """ + """ + + atomic_symbols = [] + xyz_coordinates = [] + charge = 0 + title = "" + + with open(filename, "r") as file: + for line_number, line in enumerate(file): + if line_number == 0: + num_atoms = int(line) + elif line_number == 1: + title = line + if "charge=" in line: + charge = int(line.split("=")[1]) + else: + atomic_symbol, x, y, z = line.split() + atomic_symbols.append(atomic_symbol) + xyz_coordinates.append([float(x), float(y), float(z)]) + + atoms = [int_atom(atom) for atom in atomic_symbols] + + return atoms, charge, xyz_coordinates + + +def xyz2AC(atoms, xyz, charge, use_huckel=False): + """ + + atoms and coordinates to atom connectivity (AC) + + args: + atoms - int atom types + xyz - coordinates + charge - molecule charge + + optional: + use_huckel - Use Huckel method for atom connecitivty + + returns + ac - atom connectivity matrix + mol - rdkit molecule + + """ + + if use_huckel: + return xyz2AC_huckel(atoms, xyz, charge) + else: + return xyz2AC_vdW(atoms, xyz) + + +def xyz2AC_vdW(atoms, xyz): + + # Get mol template + mol = get_proto_mol(atoms) + + # Set coordinates + conf = Chem.Conformer(mol.GetNumAtoms()) + for i in range(mol.GetNumAtoms()): + conf.SetAtomPosition(i, (xyz[i][0], xyz[i][1], xyz[i][2])) + mol.AddConformer(conf) + + AC = get_AC(mol) + + return AC, mol + + +def get_AC(mol, covalent_factor=1.3): + """ + + Generate adjacent matrix from atoms and coordinates. 
+ + AC is a (num_atoms, num_atoms) matrix with 1 being covalent bond and 0 is not + + + covalent_factor - 1.3 is an arbitrary factor + + args: + mol - rdkit molobj with 3D conformer + + optional + covalent_factor - increase covalent bond length threshold with facto + + returns: + AC - adjacent matrix + + """ + + # Calculate distance matrix + dMat = Chem.Get3DDistanceMatrix(mol) + + pt = Chem.GetPeriodicTable() + num_atoms = mol.GetNumAtoms() + AC = np.zeros((num_atoms, num_atoms), dtype=int) + + for i in range(num_atoms): + a_i = mol.GetAtomWithIdx(i) + Rcov_i = pt.GetRcovalent(a_i.GetAtomicNum()) * covalent_factor + for j in range(i + 1, num_atoms): + a_j = mol.GetAtomWithIdx(j) + Rcov_j = pt.GetRcovalent(a_j.GetAtomicNum()) * covalent_factor + if dMat[i, j] <= Rcov_i + Rcov_j: + AC[i, j] = 1 + AC[j, i] = 1 + + return AC + + +def xyz2AC_huckel(atomicNumList,xyz,charge): + """ + + args + atomicNumList - atom type list + xyz - coordinates + charge - molecule charge + + returns + ac - atom connectivity + mol - rdkit molecule + + """ + mol = get_proto_mol(atomicNumList) + + conf = Chem.Conformer(mol.GetNumAtoms()) + for i in range(mol.GetNumAtoms()): + conf.SetAtomPosition(i,(xyz[i][0],xyz[i][1],xyz[i][2])) + mol.AddConformer(conf) + + num_atoms = len(atomicNumList) + AC = np.zeros((num_atoms,num_atoms)).astype(int) + + mol_huckel = Chem.Mol(mol) + mol_huckel.GetAtomWithIdx(0).SetFormalCharge(charge) #mol charge arbitrarily added to 1st atom + + passed,result = rdEHTTools.RunMol(mol_huckel) + opop = result.GetReducedOverlapPopulationMatrix() + tri = np.zeros((num_atoms, num_atoms)) + tri[np.tril(np.ones((num_atoms, num_atoms), dtype=bool))] = opop #lower triangular to square matrix + for i in range(num_atoms): + for j in range(i+1,num_atoms): + pair_pop = abs(tri[j,i]) + if pair_pop >= 0.15: #arbitry cutoff for bond. 
May need adjustment + AC[i,j] = 1 + AC[j,i] = 1 + + return AC, mol + + +def chiral_stereo_check(mol): + """ + Find and embed chiral information into the model based on the coordinates + + args: + mol - rdkit molecule, with embeded conformer + + """ + Chem.SanitizeMol(mol) + Chem.DetectBondStereochemistry(mol, -1) + Chem.AssignStereochemistry(mol, flagPossibleStereoCenters=True, force=True) + Chem.AssignAtomChiralTagsFromStructure(mol, -1) + + return + + +def xyz2mol(atoms, coordinates, + charge=0, + allow_charged_fragments=True, + use_graph=True, + use_huckel=False, + embed_chiral=True): + """ + Generate a rdkit molobj from atoms, coordinates and a total_charge. + + args: + atoms - list of atom types (int) + coordinates - 3xN Cartesian coordinates + charge - total charge of the system (default: 0) + + optional: + allow_charged_fragments - alternatively radicals are made + use_graph - use graph (networkx) + use_huckel - Use Huckel method for atom connectivity prediction + embed_chiral - embed chiral information to the molecule + + returns: + mols - list of rdkit molobjects + + """ + + # Get atom connectivity (AC) matrix, list of atomic numbers, molecular charge, + # and mol object with no connectivity information + AC, mol = xyz2AC(atoms, coordinates, charge, use_huckel=use_huckel) + + # Convert AC to bond order matrix and add connectivity and charge info to + # mol object + new_mols = AC2mol(mol, AC, atoms, charge, + allow_charged_fragments=allow_charged_fragments, + use_graph=use_graph) + + # Check for stereocenters and chiral centers + if embed_chiral: + for new_mol in new_mols: + chiral_stereo_check(new_mol) + + return new_mols + + +def main(): + + + return + + +if __name__ == "__main__": + + import argparse + + parser = argparse.ArgumentParser(usage='%(prog)s [options] molecule.xyz') + parser.add_argument('structure', metavar='structure', type=str) + parser.add_argument('-s', '--sdf', + action="store_true", + help="Dump sdf file") + 
parser.add_argument('--ignore-chiral', + action="store_true", + help="Ignore chiral centers") + parser.add_argument('--no-charged-fragments', + action="store_true", + help="Allow radicals to be made") + parser.add_argument('--no-graph', + action="store_true", + help="Run xyz2mol without networkx dependencies") + + # huckel uses extended Huckel bond orders to locate bonds (requires RDKit 2019.9.1 or later) + # otherwise van der Waals radii are used + parser.add_argument('--use-huckel', + action="store_true", + help="Use Huckel method for atom connectivity") + parser.add_argument('-o', '--output-format', + action="store", + type=str, + help="Output format [smiles,sdf] (default=sdf)") + parser.add_argument('-c', '--charge', + action="store", + metavar="int", + type=int, + help="Total charge of the system") + + args = parser.parse_args() + + # read xyz file + filename = args.structure + + # allow for charged fragments, alternatively radicals are made + charged_fragments = not args.no_charged_fragments + + # quick is faster for large systems but requires networkx + # if you don't want to install networkx set quick=False and + # uncomment 'import networkx as nx' at the top of the file + quick = not args.no_graph + + # chiral comment + embed_chiral = not args.ignore_chiral + + # read atoms and coordinates. 
Try to find the charge + atoms, charge, xyz_coordinates = read_xyz_file(filename) + + # huckel uses extended Huckel bond orders to locate bonds (requires RDKit 2019.9.1 or later) + # otherwise van der Waals radii are used + use_huckel = args.use_huckel + + # if explicit charge from args, set it + if args.charge is not None: + charge = int(args.charge) + + # Get the molobjs + mols = xyz2mol(atoms, xyz_coordinates, + charge=charge, + use_graph=quick, + allow_charged_fragments=charged_fragments, + embed_chiral=embed_chiral, + use_huckel=use_huckel) + + # Print output + for mol in mols: + if args.output_format == "sdf": + txt = Chem.MolToMolBlock(mol) + print(txt) + + else: + # Canonical hack + isomeric_smiles = not args.ignore_chiral + smiles = Chem.MolToSmiles(mol, isomericSmiles=isomeric_smiles) + m = Chem.MolFromSmiles(smiles) + smiles = Chem.MolToSmiles(m, isomericSmiles=isomeric_smiles) + print(smiles)