diff --git a/namingconvention b/namingconvention new file mode 100644 index 0000000..8d2b5ec --- /dev/null +++ b/namingconvention @@ -0,0 +1,11 @@ +database=prefix_rep_data.npz + +prefix=amons or qm7, +rep=CM, SLATM, aCM, SOAP, FCHL. + + +structure: mol_prefix_labels, mol_prefix_ncharges, mol_prefix_rep + +mol=vitc, vitd, qm9, +prefix=amons or qm7, # remove this? +rep=CMs, reps # combine these two? diff --git a/onepass.py b/onepass.py index a3cc4b5..5d83a09 100644 --- a/onepass.py +++ b/onepass.py @@ -1,166 +1,166 @@ import numpy as np import timeit import gurobipy as gp from gurobipy import GRB def addvariables(Z): upperbounds=[] I=[] J=[] for M in database_indices: - CM=data[targetname+"_amons_ncharges"][M] + CM=data[targetname+"_"+prefix+"_ncharges"][M] m=len(CM) I=I+[(i,j,M,G) for G in range(maxduplicates) for i in range(m) for j in range(n) if CM[i] == CT[j]] # if condition excludes j; i always takes all m values J=J+[(M,G) for G in range(maxduplicates)] x=Z.addVars(I, vtype=GRB.BINARY) y=Z.addVars(J, vtype=GRB.BINARY) print("Variables added.") return x,I,y def addconstraints(Z,x,I,y): # bijection into [n] Z.addConstrs(x.sum('*',j,'*', '*') == 1 for j in range(n)) for M in database_indices: - CM=data[targetname+"_amons_ncharges"][M] + CM=data[targetname+"_"+prefix+"_ncharges"][M] m=len(CM) # each i of each group is used at most once Z.addConstrs(x.sum(i,'*',M,G) <= 1 for i in range(m) for G in range(maxduplicates)) # y[M,G] = OR gate of the x[i,j,M,G] for each (M,G) Z.addConstrs(y[M,G] >= x[v] for G in range(maxduplicates) for v in I if v[2:]==(M,G)) Z.addConstrs(y[M,G] <= x.sum('*','*',M,G) for G in range(maxduplicates)) print("Constraints added.") return 0 # objective value should then be square rooted in the end (doesn't change optimality) def setobjective(Z,x,I,y): print("Constructing objective function... ") key=0 - if(representation==0 or representation==5): # Coulomb case + if(representation==0): # Coulomb case expr=gp.QuadExpr() T=targetdata['target_CMs'][target_index] for k in range(n): for l in range(n): expr += T[k,l]**2 for M in database_indices: key=key+1 - Mol=data[targetname+"_amons_CMs"][M] + Mol=data[targetname+"_"+prefix+"_CMs"][M] m=len(Mol) for G in range(maxduplicates): for (i,k) in [v[:2] for v in I if v[2:]==(M,G)]: for (j,l) in [v[:2] for v in I if v[2:]==(M,G)]: expr += (Mol[i,j]**2 - 2*T[k,l]*Mol[i,j])*x[i,k,M,G]*x[j,l,M,G] expr += y[M,G]*m*penaltyconst print(key, " / ", size_database) expr=expr-n*penaltyconst else: #SLATM case expr=gp.LinExpr() T=targetdata["target_reps"][target_index] for M in database_indices: key=key+1 - Mol=data[targetname+"_amons_reps"][M] + Mol=data[targetname+"_"+prefix+"_reps"][M] m=len(Mol) for G in range(maxduplicates): for (i,j) in [v[:2] for v in I if v[2:]==(M,G)]: C=np.linalg.norm(Mol[i]-T[j])**2 expr += C*x[i,j,M,G] expr += y[M,G]*m*penaltyconst print(key, " / ", size_database) expr=expr-n*penaltyconst Z.setObjective(expr, GRB.MINIMIZE) print("Objective function set.") return 0 # prints mappings of positions (indices+1) of each molecule to positions inside target def print_sols(Z, x, I, y): SolCount=Z.SolCount print("Target has size", n) print("Using representation", repname) for solnb in range(SolCount): print() print("--------------------------------") Z.setParam("SolutionNumber",solnb) print("Solution number", solnb+1, ", objective value with size penalty", (Z.PoolObjVal)) for M in database_indices: groups=[] for G in range(maxduplicates): if np.rint(y[M,G].Xn) == 1: groups.append(G) amount_picked=len(groups) for k in range(amount_picked): G=groups[k] - m=len(data[targetname+"_amons_ncharges"][M]) - label=data[targetname+"_amons_labels"][M] + m=len(data[targetname+"_"+prefix+"_ncharges"][M]) + label=data[targetname+"_"+prefix+"_labels"][M] if k==0: print("Molecule", label, "has been picked", amount_picked, "time(s) ( size", m, ", used", sum([x[v].Xn for v in I if v[2]==M]), ")") print(k+1, end=": ") for (i,j) in [v[:2] for v in I if v[2:]==(M,G) and np.rint(x[v].Xn)==1]: print(i+1, "->", j+1, end=", ") print() def main(): # construction of the model start=timeit.default_timer() Z = gp.Model() Z.setParam('OutputFlag',1) x,I,y=addvariables(Z) addconstraints(Z,x,I,y) setobjective(Z,x,I,y) stop=timeit.default_timer() print("Model setup: ", stop-start, "s") # model parameters # PoolSearchMode 1/2 forces to fill the solution pool. 2 finds the best solutions. # Set to 1 because of duplicating solutions which differ by 1e-9 and are seen as different. Z.setParam("PoolSearchMode", 1) # these prevent non integral values although some solutions are still duplicating -- to fix? Z.setParam("IntFeasTol", 1e-9) Z.setParam("IntegralityFocus", 1) Z.setParam("TimeLimit", timelimit) Z.setParam("PoolSolutions", numbersolutions) # optimization print("------------") print("Optimization") print("------------") Z.optimize() print("------------") print() print("Optimization runtime: ", Z.RunTime, "s") if(Z.status == 3): print("Model was proven to be infeasible.") return 1 print_sols(Z,x,I,y) return 0 # modifiable global settings target_index=0 # 0, 1, or 2 for qm9, vitc, or vitd. maxduplicates=2 # number of possible copies of each molecule of the database timelimit=360 # in seconds (not counting setup) -numbersolutions=50 # size of solution pool -representation=5 # 0 for Coulomb Matrix (CM), 1 for SLATM, 2 for aCM, 3 for SOAP, 4 for FCHL, 5 for qm7 -#penaltyconst=[1,1,1,1,1][representation] # constant in front of size penalty -penaltyconst=1 +numbersolutions=5 # size of solution pool +representation=2 # 0 for Coulomb Matrix (CM), 1 for SLATM, 2 for aCM, 3 for SOAP, 4 for FCHL +penaltyconst=[1,1,10000,1,1][representation] # constant in front of size penalty +prefix="amons" # "amons" or "qm7" # global constants repname=["CM", "SLATM", "aCM", "SOAP", "FCHL", "qm7"][representation] -dataname="amons_"+repname+"_data.npz" +dataname=prefix+"_"+repname+"_data.npz" data=np.load(dataname, allow_pickle=True) targetdataname="target_"+repname+"_data.npz" targetdata=np.load(targetdataname, allow_pickle=True) CT=targetdata['target_ncharges'][target_index] n=len(CT) targetname=["qm9", "vitc", "vitd"][target_index] -size_database=len(data[targetname+"_amons_labels"]) # set this to a fixed number if the setup is too slow in case of qm7 database +size_database=len(data[targetname+"_"+prefix+"_labels"]) # set this to a fixed number if the setup is too slow in case of qm7 database database_indices=range(size_database) main() diff --git a/preprocess.py b/preprocess.py index d2bf21e..cfe6455 100644 --- a/preprocess.py +++ b/preprocess.py @@ -1,40 +1,40 @@ import numpy as np import copy data=np.load("data.npz", allow_pickle=True) data=dict(data) size_database=len(data['database_labels']) for target_index in range(len(data['target_labels'])): CT=data['target_ncharges'][target_index] T=data['target_CMs'][target_index] I=[] for i in range(len(CT)): if CT[i] == 1: I.append(i) CT=np.delete(CT, I) T=np.delete(T, I, axis=0) T=np.delete(T,I,axis=1) data['target_ncharges'][target_index]=CT data['target_CMs'][target_index]=T for i in range(size_database): print(" ", i) M=data['database_CMs'][i] CM=data['database_ncharges'][i] m=len(CM) J=[] for j in range(m): if CM[j]==1: J.append(j) CM=np.delete(CM,J) M=np.delete(M,J,axis=0) M=np.delete(M,J,axis=1) data['database_CMs'][i]=M data['database_ncharges'][i]=CM labels='database_labels' ncharges='database_ncharges' CMs='database_CMs' -np.savez("amons_qm7_data.npz", vitc_amons_labels=data[labels], vitc_amons_ncharges=data[ncharges], vitc_amons_CMs=data[CMs], vitd_amons_labels=data[labels], vitd_amons_ncharges=data[ncharges], vitd_amons_CMs=data[CMs], qm9_amons_labels=data[labels], qm9_amons_ncharges=data[ncharges], qm9_amons_CMs=data[CMs]) +np.savez("qm7_CM_data.npz", vitc_qm7_labels=data[labels], vitc_qm7_ncharges=data[ncharges], vitc_qm7_CMs=data[CMs], vitd_qm7_labels=data[labels], vitd_qm7_ncharges=data[ncharges], vitd_qm7_CMs=data[CMs], qm9_qm7_labels=data[labels], qm9_qm7_ncharges=data[ncharges], qm9_qm7_CMs=data[CMs]) diff --git a/amons_qm7_data.npz b/qm7_CM_data.npz similarity index 99% rename from amons_qm7_data.npz rename to qm7_CM_data.npz index 54fda05..e7a8629 100644 Binary files a/amons_qm7_data.npz and b/qm7_CM_data.npz differ