diff --git a/Additional_analyses/Compare_methods/compare_method_pps.py b/Additional_analyses/Compare_methods/compare_method_pps.py
index 848fc49..2f6a303 100644
--- a/Additional_analyses/Compare_methods/compare_method_pps.py
+++ b/Additional_analyses/Compare_methods/compare_method_pps.py
@@ -1,399 +1,399 @@
# -*- coding: utf-8 -*-
"""
Created on Tue Sep 13 11:51:15 2022

@author: trostele
"""
# Python version 3.6.13
########################################################################################################################
# import all necessary packages
import pandas as pd
import rdkit # this RDKit version is only supported on Python < 3.7
from rdkit import Chem
from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit.Chem import Descriptors
from rdkit.Chem.rdMolDescriptors import CalcMolFormula
import pickle
########################################################################################################################
# script to compare predicted TPs with found TPs from different methods
########################################################################################################################
# INPUT
found_tp_smiles_input = "./input/found_TP_SMILES.txt" # contains SMILES of all TPs that were found in samples
pickle_file_data_dict = "./input/data_dict_com_with_CAS.pickle" # pickle file location of combined dictionary with all predictions made by different methods

# enter used package (= source name in the combined data dict); same input as in "get_mass_list_from_prediction.py"
package_method_1 = "EAWAG_BBD-PPS_round_2"
package_method_2 = "EAWAG_BBD-PPS_round_2b"
package_method_5 = "enviPath-BBD_1"
package_method_6 = "enviPath-BBD+SOIL_2"
package_method_7 = "enviPath-BBD+SLUDGE_3"
package_method_8 = "enviPath-BBD+SOIL+SLUDGE_4"
#!!!
# I used two rounds of predictions with the EAWAG/BBD-PPS, so the script combines them.
# The script only works with 7 methods and needs to be updated for a different number!
#!!!
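# Editor's note: a minimal sketch of the layout this script expects from the combined
# pickle (field names taken from the accesses below; all values here are hypothetical):
#
# data_dict = {
#     "<canonical parent SMILES>": {
#         "code_parent": ["Par"], "ID_parent": "<CAS or ''>", "name": ["Parent name"],
#         "mass_parent": [123.045], "Formula_parent": "C6H5NO2", "inchi_parent": "<InChIKey>",
#         "TP_dict": {
#             "<canonical TP SMILES>": {
#                 "code": "TP_Par_1", "CAS": "", "mass": 139.040, "Formula": "C6H5NO3",
#                 "InchiKey": "<InChIKey>", "source_list": ["EAWAG_BBD-PPS_round_2"],
#                 "rule_list": ["bt0063"], "combined_prob": ["0.25"],
#                 "alternative_parent": [],
#             },
#         },
#     },
# }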
########################################################################################################################
# FUNCTIONS

def canonicalize_smiles(smiles):
    mol = Chem.MolFromSmiles(smiles) # creates mol object from SMILES
    uncharger = rdMolStandardize.Uncharger() # easier to access
    uncharged = uncharger.uncharge(mol) # neutralizes charges by protonating or deprotonating the mol object
    new_smiles = rdkit.Chem.rdmolfiles.MolToSmiles(uncharged) # converts mol object to canonical SMILES
    can_smiles = Chem.CanonSmiles(new_smiles)
    return can_smiles

def do_pickle(d, pickle_file):
    with open(pickle_file, 'wb') as handle:
        pickle.dump(d, handle, protocol=pickle.HIGHEST_PROTOCOL)

def get_pickle(pickle_file):
    with open(pickle_file, 'rb') as handle:
        d = pickle.load(handle)
    return d

def export(data_dict, smi_list):
    # smi_list is the list of TPs that were both predicted and found (see below)
    df_name_list_com = []
    df_smi_list_com = []
    df_ID_list_com = []
    df_Formula_list_com = []
    df_MolWeight_list_com = []
    df_name_parent_list_com = []
    df_inchikey_list_com = []
    df_source_list_com = []
    df_alt_parent_list_com = []
    # add all the data to the lists
    for parent in data_dict:
        df_name_list_com.append(data_dict[parent]["code_parent"][0])
        df_smi_list_com.append(parent)
        df_ID_list_com.append(data_dict[parent]["ID_parent"])
        df_MolWeight_list_com.append(data_dict[parent]["mass_parent"][0])
        df_Formula_list_com.append(data_dict[parent]["Formula_parent"])
        df_name_parent_list_com.append(data_dict[parent]["name"][0])
        df_inchikey_list_com.append(data_dict[parent]["inchi_parent"])
        df_source_list_com.append("")
        df_alt_parent_list_com.append([])
        for tp in data_dict[parent]["TP_dict"]:
            if tp in smi_list: # use the parameter instead of the global predicted_and_found list
                df_name_list_com.append(data_dict[parent]["TP_dict"][tp]["code"])
                df_smi_list_com.append(tp)
                df_ID_list_com.append(data_dict[parent]["TP_dict"][tp]["CAS"])
                df_MolWeight_list_com.append(data_dict[parent]["TP_dict"][tp]["mass"])
                df_Formula_list_com.append(data_dict[parent]["TP_dict"][tp]["Formula"])
                df_name_parent_list_com.append(data_dict[parent]["name"][0])
                df_inchikey_list_com.append(data_dict[parent]["TP_dict"][tp]["InchiKey"])
                df_source_list_com.append(data_dict[parent]["TP_dict"][tp]["source_list"])
                df_alt_parent_list_com.append(data_dict[parent]["TP_dict"][tp]["alternative_parent"])
    df_complete_dict = {"Name of parent": df_name_parent_list_com, "SMILES": df_smi_list_com, "Name": df_name_list_com, "Source": df_source_list_com, "CAS": df_ID_list_com, "Formula": df_Formula_list_com, "MolWeight": df_MolWeight_list_com, "InchiKey": df_inchikey_list_com, "Alternative parent": df_alt_parent_list_com}
    df_complete = pd.DataFrame.from_dict(df_complete_dict)
    df_complete.to_csv("./output/predicted_and_found_TPs.csv", index = False, sep = ",")

########################################################################################################################
# export an overview of the predicted TPs that were also found (listed in the txt file) as a csv file

found_tp_smiles = []
with open(found_tp_smiles_input) as SMILES_comp_file:
    for line in SMILES_comp_file:
        found_tp_smiles.append(line.rstrip())

found_tp_smiles_canon = []
for tp in found_tp_smiles:
    found_tp_smiles_canon.append(canonicalize_smiles(tp))

data_dict = get_pickle(pickle_file_data_dict)

Bar = 'CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)cn2)C1'
Abe = 'CCN1CCN(Cc2ccc(Nc3ncc(F)c(-c4cc(F)c5nc(C)n(C(C)C)c5c4)n3)nc2)CC1'
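# Editor's sketch (illustrative, not part of the original script): canonicalize_smiles()
# maps different spellings of the same structure, including charged forms, onto one
# canonical neutral SMILES, so predicted and found TPs can be compared as plain strings.
example_charged = canonicalize_smiles("[O-]C(=O)c1ccccc1") # deprotonated benzoic acid
example_neutral = canonicalize_smiles("OC(=O)c1ccccc1")    # neutral benzoic acid
assert example_charged == example_neutral # both should give the same canonical SMILES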
# remove Abe and Bar from dict because they were not spiked
data_dict.pop(Abe)
data_dict.pop(Bar)

predicted_tp_smiles = []
for parent in data_dict:
    for tp in data_dict[parent]["TP_dict"]:
        predicted_tp_smiles.append(tp)

predicted_and_found = []
for smi in found_tp_smiles_canon:
    if smi in predicted_tp_smiles:
        predicted_and_found.append(smi)

export(data_dict, predicted_and_found)
########################################################################################################################
# export precision (= found/predicted) for each method as tsv file

source_list_nest = []
for parent in data_dict:
    for tp in data_dict[parent]["TP_dict"]:
        if tp in predicted_and_found:
            source_list_nest.append(data_dict[parent]["TP_dict"][tp]["source_list"])

source_list = []
for sublist in source_list_nest:
    for item in sublist:
        source_list.append(item)

pps = []
envi_updated = []
envi_soil_updated = []
envi_sludge = []
envi_soil_sludge = []
for source in source_list:
    if "EAWAG_BBD-PPS" in source:
        pps.append(source)
    if package_method_5 in source:
        envi_updated.append(source)
    if package_method_6 in source:
        envi_soil_updated.append(source)
    if package_method_7 in source:
        envi_sludge.append(source)
    if package_method_8 in source:
        envi_soil_sludge.append(source)

pps_predicted = []
envi_updated_predicted = []
envi_soil_updated_predicted = []
envi_sludge_predicted = []
envi_soil_sludge_predicted = []
for parent in data_dict:
    for tp in data_dict[parent]["TP_dict"]:
        if package_method_1 in data_dict[parent]["TP_dict"][tp]["source_list"] or package_method_2 in data_dict[parent]["TP_dict"][tp]["source_list"]:
            pps_predicted.append(tp)
        if package_method_5 in data_dict[parent]["TP_dict"][tp]["source_list"]:
            envi_updated_predicted.append(tp)
        if package_method_6 in data_dict[parent]["TP_dict"][tp]["source_list"]:
            envi_soil_updated_predicted.append(tp)
        if package_method_7 in data_dict[parent]["TP_dict"][tp]["source_list"]:
            envi_sludge_predicted.append(tp)
        if package_method_8 in data_dict[parent]["TP_dict"][tp]["source_list"]:
            envi_soil_sludge_predicted.append(tp)

# build the tsv row by row: found, predicted and precision (= found/predicted) per method
string_precision = "Precision of methods:" + "\n" + "Prediction Method" + "\t" + "Found" + "\t" + "Predicted" + "\t" + "Precision" + "\n"
string_precision += package_method_1 + "\t" + str(len(pps)) + "\t" + str(len(pps_predicted)) + "\t" + str(round(len(pps)/len(pps_predicted)*100, 2)) + "%\n"
string_precision += package_method_5 + "\t" + str(len(envi_updated)) + "\t" + str(len(envi_updated_predicted)) + "\t" + str(round(len(envi_updated)/len(envi_updated_predicted)*100, 2)) + "%\n"
string_precision += package_method_6 + "\t" + str(len(envi_soil_updated)) + "\t" + str(len(envi_soil_updated_predicted)) + "\t" + str(round(len(envi_soil_updated)/len(envi_soil_updated_predicted)*100, 2)) + "%\n"
string_precision += package_method_7 + "\t" + str(len(envi_sludge)) + "\t" + str(len(envi_sludge_predicted)) + "\t" + str(round(len(envi_sludge)/len(envi_sludge_predicted)*100, 2)) + "%\n"
string_precision += package_method_8 + "\t" + str(len(envi_soil_sludge)) + "\t" + str(len(envi_soil_sludge_predicted)) + "\t" + str(round(len(envi_soil_sludge)/len(envi_soil_sludge_predicted)*100, 2)) + "%\n"

with open("./output/precision_of_methods.tsv", 'w') as t:
    t.write(string_precision)
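# Editor's sketch (hypothetical helper, not part of the original script): the precision
# rows above divide found by predicted directly, which would raise ZeroDivisionError for
# a method without any predictions; a guarded version of the same calculation:
def precision_percent(n_found, n_predicted):
    if n_predicted == 0:
        return "n/a" # avoid division by zero for methods without predictions
    return str(round(n_found / n_predicted * 100, 2)) + "%"
# e.g. precision_percent(len(pps), len(pps_predicted)) reproduces the first row's value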
########################################################################################################################
# export how many times each subset of methods was used to predict found TPs as tsv

source_dict = {}
for source in source_list_nest:
    key = " ".join(source)
    source_dict[key] = 0
for source in source_list_nest:
    key = " ".join(source)
    source_dict[key] += 1

tp_num = 0
for key in source_dict:
    tp_num += source_dict[key]

string_sources = "Sources of TPs\t" + "Times used\n"
for key in source_dict:
    string_sources += key + "\t"
    string_sources += str(source_dict[key]) + "\n"

with open("./output/sources_of_TPs.tsv", 'w') as s:
    s.write(string_sources)
########################################################################################################################
# check how many TPs were predicted for a given parent by each method and export as tsv

string_tp_tsv = "Number of predicted TPs per parent (without considering overlap): \n" + "Parent\t" + package_method_1 + "\t" + package_method_5 + "\t" + package_method_6 + "\t" + package_method_7 + "\t" + package_method_8 + "\t" + "Total predicted TPs" + "\t" + "Found" + "\t" + "Overall Precision" + "\n"
for parent in data_dict:
    list_tp = []
    list_pps = []
    list_envi_updated = []
    list_envi_soil_updated = []
    list_envi_sludge = []
    list_envi_soil_sludge = []
    found_tp_per_parent = []
    for tp in data_dict[parent]["TP_dict"]:
        list_tp.append(tp)
        if package_method_1 in data_dict[parent]["TP_dict"][tp]["source_list"] or package_method_2 in data_dict[parent]["TP_dict"][tp]["source_list"]:
            list_pps.append(tp)
        if package_method_5 in data_dict[parent]["TP_dict"][tp]["source_list"]:
            list_envi_updated.append(tp)
        if package_method_6 in data_dict[parent]["TP_dict"][tp]["source_list"]:
            list_envi_soil_updated.append(tp)
        if package_method_7 in data_dict[parent]["TP_dict"][tp]["source_list"]:
            list_envi_sludge.append(tp)
        if package_method_8 in data_dict[parent]["TP_dict"][tp]["source_list"]:
            list_envi_soil_sludge.append(tp)
        if tp in found_tp_smiles_canon:
            found_tp_per_parent.append(tp)
    string_tp_tsv += data_dict[parent]["name"][0] + "\t" + str(len(list_pps)) + "\t" + str(len(list_envi_updated)) + "\t" + str(len(list_envi_soil_updated)) + "\t" + str(len(list_envi_sludge)) + "\t" + str(len(list_envi_soil_sludge)) + "\t" + str(len(list_tp)) + "\t" + str(len(found_tp_per_parent)) + "\t" + str(round(len(found_tp_per_parent)/len(list_tp)*100, 2)) + "%" + "\n"
    # the following clear() calls are redundant, since the lists are re-created at the top of each loop iteration
    list_tp.clear()
    list_pps.clear()
    list_envi_updated.clear()
    list_envi_soil_updated.clear()
    list_envi_sludge.clear()
    list_envi_soil_sludge.clear()

with open("./output/number_of_predicted_TPs_per_parent.tsv", 'w') as h:
    h.write(string_tp_tsv)
########################################################################################################################
# get combined probabilities of TPs that were predicted and found vs. only predicted but not found

predicted_and_found_combined_probability = []
only_predicted_combined_probability = []
for parent in data_dict:
    for tp in data_dict[parent]["TP_dict"]:
        if data_dict.get(parent, {}).get("TP_dict", {}).get(tp, {}).get("combined_prob") is not None:
            if tp in predicted_and_found:
                predicted_and_found_combined_probability += data_dict[parent]["TP_dict"][tp]["combined_prob"]
            else:
                only_predicted_combined_probability += data_dict[parent]["TP_dict"][tp]["combined_prob"]

predicted_and_found_combined_probability_float = []
only_predicted_combined_probability_float = [] for prob in predicted_and_found_combined_probability: predicted_and_found_combined_probability_float.append(float(prob)) for prob in only_predicted_combined_probability: only_predicted_combined_probability_float.append(float(prob)) do_pickle(predicted_and_found_combined_probability_float, "predicted_and_found_combined_probability.pickle") do_pickle(only_predicted_combined_probability_float, "only_predicted_combined_probability.pickle") ################################################################################################################################################################################################################################################################ list_all = [] for parent in data_dict: for tp in data_dict[parent]["TP_dict"]: if package_method_1 in data_dict[parent]["TP_dict"][tp]["source_list"] or package_method_2 in data_dict[parent]["TP_dict"][tp]["source_list"]: if package_method_5 in data_dict[parent]["TP_dict"][tp]["source_list"]: if package_method_6 in data_dict[parent]["TP_dict"][tp]["source_list"]: if package_method_7 in data_dict[parent]["TP_dict"][tp]["source_list"]: if package_method_8 in data_dict[parent]["TP_dict"][tp]["source_list"]: list_all.append(tp) # print(list_all) ################################################################################################################################################################################################################################################################ # check which rules were used to predict: rule_list = [] for parent in data_dict: for tp in data_dict[parent]["TP_dict"]: rule = data_dict[parent]["TP_dict"][tp]["rule_list"][0] rule_split = rule.split(",") rule_list += rule_split # the bt rule classification is not final, those are only the most important rules add_O_rules = ["bt0063", "bt0023", "bt0003", "bt0242", "bt0243", "bt0193", "bt0014", "bt0259", "bt0374", "bt0005", "bt0332"] add_H2O_rules = ["bt0067", "bt0350", "bt0430", "bt0024", "bt0021", "bt0020", "bt0389", "bt0373", "bt0391"] desat_rules = ["bt0002", "bt0001"] add_O_list = [] add_H2O_list = [] desat_list = [] for rule in rule_list: if rule in add_O_rules: add_O_list.append(rule) if rule in add_H2O_rules: add_H2O_list.append(rule) if rule in desat_rules: desat_list.append(rule) print("Oxygen addition: ", len(add_O_list), ", water addition: ", len(add_H2O_list), ", desat: ", len(desat_list)) -string_rule = "Rules that were used in the predictions: need to remove all spaces " " and '"' signs manually \n" +string_rule = "Rules that were used in the predictions: need to remove all spaces \" \" and '' signs manually \n" for rule in rule_list: string_rule += rule string_rule += "\n" with open("./output/used_rules.txt", 'w') as p: p.write(string_rule) ################################################################################################################################################################################################################################################################ t.close() h.close() s.close() print("Script finished successfully") ################################################################################################################################################################################################################################################################ # ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⣀⣀⣠⣤⣤⣤⣤⣤⣤⣤⣤⣤⣤⣤⣤⣴⣶⣶⣶⣶⣶⣶⣶⣶⣶⣶⣶⣶⣶⣦⣤⣤⣤⣤⣤⣤⣄⣀⡀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀ # ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢀⣾⡿⠟⠛⠛⠛⠛⠋⠉⠉⠉⠉⠉⠁⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠈⠉⠉⠉⠉⠛⣿⣦⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀ # 
⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢸⣿⠁⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢠⣄⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢸⣿⡆⠀⠀⠀⠀⠀⠀⠀⠀⠀
# ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⣿⣿⠀⠀⣴⡄⠀⠀⠀⠀⣠⡄⠀⠀⠀⠀⠀⠀⠶⠀⠀⠀⠀⠀⠈⠁⠀⠀⠀⠀⠀⠀⠀⠀⠀⢰⣿⠀⠀⠀⠀⣿⡇⠀⠀⠀⠀⠀⠀⠀⠀⠀
# ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⣿⣿⠀⠀⠉⠁⠀⠀⠀⠘⠟⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠈⠁⠀⠀⠀⢸⣿⡇⠀⠀⠀⠀⠀⠀⠀⠀⠀
# ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⣿⣿⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⣤⠀⢀⣶⣿⣷⣦⣄⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢸⣿⡇⠀⠀⠀⣀⣤⣄⠀⠀⠀
# ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⣿⣿⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢀⣤⠀⠀⠀⠀⠀⠀⣸⡇⠀⣸⡿⠀⠀⠉⠻⣿⣦⡀⠀⢰⡿⠀⠀⠀⠀⠀⣸⣿⣁⣴⣾⡿⠟⠛⣿⡄⠀⠀
# ⣴⣿⠿⠿⣿⣶⣦⣄⡀⠀⠀⠀⠀⣿⣿⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠘⠛⠀⠀⠀⠀⠀⠀⠉⠁⠀⣿⡇⠀⠀⠀⠀⠈⠻⣿⣆⠀⠀⠀⠀⠀⠀⠀⣿⣿⣿⠟⠁⠀⠀⠀⣿⡇⠀⠀
# ⢿⣧⠀⠀⠀⠀⠉⠛⢿⣶⣄⠀⠀⣿⣿⠀⠀⠀⠀⠀⠙⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⣀⡀⠀⠀⠀⣿⠇⠀⠀⠀⠀⠀⠀⠈⢻⣷⣤⣤⣤⣤⣤⣼⣿⠟⠁⠀⠀⠀⠀⠀⣿⡇⠀⠀
# ⠈⢿⣧⡀⠀⠀⠀⠀⠀⠈⢻⣷⡄⣿⣿⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠛⠃⠀⠀⢰⣿⡆⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⣿⡇⠀⠀
# ⠀⠈⠻⣷⣄⠀⠀⠀⠀⠀⠀⠙⣿⣿⣿⠀⠀⠀⠀⠀⠀⠀⠀⡿⠇⠀⠀⠀⠀⠀⠀⠀⠀⠀⣴⣿⠟⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠹⣿⣦⡀
# ⠀⠀⠀⠘⢿⣷⣄⠀⠀⠀⠀⠀⠘⣿⣿⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠠⣼⡇⠀⠀⣾⡿⠁⠀⠀⠀⠀⢠⣾⠋⠉⢳⡄⠀⠀⠀⠀⠀⠀⠀⠀⢠⣾⠋⠙⢳⡄⠀⠀⠈⢿⣷
# ⠀⠀⠀⠀⠀⠙⢿⣷⣄⡀⠀⠀⠀⢹⣿⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠿⠁⠀⠀⣿⡇⠀⠀⠀⠀⠀⠸⣿⣶⣤⣾⡇⠀⠀⠀⠀⠀⠀⠀⠀⠸⣿⣧⣤⣾⣿⠀⠀⠀⠘⣿
# ⠀⠀⠀⠀⠀⠀⠀⠈⠻⢿⣦⣄⠀⢸⣿⠀⠀⠀⠀⠀⠀⣾⡇⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⣿⡇⠀⠀⠀⠀⠀⠀⠈⠛⠛⠛⠁⠀⠀⠀⢿⣉⣩⠿⠀⠀⠉⠛⠿⠛⠃⠀⠀⠀⠀⣿
# ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠙⠿⣿⣾⣿⠀⠀⠀⠀⠀⠀⠉⠁⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢿⣧⠀⠀⠀⠀⠀⠀⠀⠀⠀⡄⠀⠀⠀⠀⠀⢸⡇⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢰⣿
# ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠈⢻⣿⠀⠀⠀⠀⠀⠀⠀⠀⣀⠀⠀⠀⠀⠀⣠⡄⠀⠀⠘⣿⣧⡀⠀⠀⠀⠀⠀⠀⠀⢷⣤⣀⣀⣀⣴⠟⢿⣤⣀⣀⣀⣴⠇⠀⠀⠀⠀⢠⣿⡟
# ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⣿⣟⠀⠀⠀⠀⠀⠀⠀⠀⠛⠀⠀⠀⠀⠀⠻⠃⠀⠀⠀⠈⢿⣷⣄⡀⠀⠀⠀⠀⠀⠀⠈⠉⠉⠉⠁⠀⠀⠈⠉⠉⠉⠁⠀⠀⠀⢀⣴⣿⠟⠀
# ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⣽⣿⣶⣦⣤⣤⣤⣤⣤⣤⣤⣤⣤⣤⣤⣤⣤⣤⣤⣤⣤⣤⣤⣽⣿⣿⣷⣶⣦⣤⣤⣤⣤⣤⣤⣤⣤⣤⣤⣤⣤⣤⣤⣴⣶⣶⣾⠿⠛⠁⠀⠀
# ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⣼⡟⠀⠈⠉⠉⣻⡟⠛⣻⡿⠛⠛⠛⠛⢿⣿⠿⠛⠛⠛⠛⠛⠛⠛⢻⣿⠏⠉⠉⠉⠉⢻⡟⠛⠛⣻⣿⠋⠉⠉⠙⣿⠉⠉⠉⠀⠀⠀⠀⠀⠀⠀
# ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⣿⣅⠀⠀⣠⣾⠟⠁⠀⣿⠀⠀⠀⢀⣠⣿⠏⠀⠀⠀⠀⠀⠀⠀⠀⢸⣿⡀⠀⠀⠀⣠⣿⠃⠀⠀⣿⡇⠀⠀⠀⢸⣿⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀
# ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠘⠻⠿⠿⠛⠁⠀⠀⠀⠻⢿⣶⣾⠿⠛⠁⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠛⢿⣷⣶⡿⠟⠁⠀⠀⠀⠻⣷⣄⣠⣴⣿⠃⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀
# ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀ ⠀⠈⠛⠛⠉⠀
diff --git a/File_conversion/Prediction_output_to_mass_list/README.md b/File_conversion/Prediction_output_to_mass_list/README.md
index 94a80bb..89c7171 100644
--- a/File_conversion/Prediction_output_to_mass_list/README.md
+++ b/File_conversion/Prediction_output_to_mass_list/README.md
@@ -1,49 +1,50 @@
# Generate Mass and Inclusion Lists from Prediction Output

## Purpose
The purpose of the script is to generate mass lists from multiple `find_best_TP.py` outputs, which can be imported into `Compound Discoverer`. Further, the script combines the data of all methods and can select transformation products (TP) based on a scoring system. It also creates inclusion lists for positive and negative mode with predicted NCE for use on a QExactivePlus. The script queries the `PubChem` database with one request per compound, so the code can run for some time.

## Input
The script needs at least 1 input file and supports up to 8 files. The input file(s) consist of the tsv results from `find_best_TP.py`. The filepaths need to be changed at the start of the script. The prediction method (enter `EAWAG-PPS` or `envipath`) and package (do not use `.` or `/`) for each file need to be specified in the script. Additionally, each optional file can be included in the processing (enter "yes") or not. Moreover, 3 txt files for mapping are required:
-`SMILES_selected_comp.txt` contains the SMILES of the selected parent compounds
-`name_subst.txt` contains the full names of the parents
-`code_subst.txt` contains the short code of the parents (exp. first 3 letters of name)
-Each txt files contains one string per line and the order needs to be matched over the three txt files, so each line of the txt files corresponds to one parent.
+* `SMILES_selected_comp.txt` contains the SMILES of the selected parent compounds
+* `name_subst.txt` contains the full names of the parents
+* `code_subst.txt` contains the short code of the parents (e.g. the first 3 letters of the name)
+
+Each txt file contains one string per line, and the order needs to match across the three txt files, so that each line corresponds to one parent.
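For illustration only (this helper is not part of the script), the three line-aligned files can be thought of as columns of one table; zipping them together gives SMILES-keyed lookups like the `code_dict` and `name_dict` the script works with:

```python
# hypothetical sketch: combine the three line-aligned mapping files into lookups
with open("SMILES_selected_comp.txt") as f_smi, open("code_subst.txt") as f_code, open("name_subst.txt") as f_name:
    smiles = [line.rstrip() for line in f_smi]
    codes = [line.rstrip() for line in f_code]
    names = [line.rstrip() for line in f_name]

code_dict = dict(zip(smiles, codes))  # parent SMILES -> short code
name_dict = dict(zip(smiles, names))  # parent SMILES -> full name
```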
These txt files can be easily created by copying from an Excel worksheet and pasting into a new txt file.

## Scoring System
-Can be turned on (`True`) or off (`False`). A maximal number of allowed TPs per parent can be set as a variable.
+Can be turned on (`True`) or off (`False`, default). A maximal number of allowed TPs per parent can be set as a variable.
The scoring system removes all TPs with a mass below 100 u. The score is lowered if the TP has a CAS number, has a low probability or is not predicted by all methods. If a parent has more than the specified maximal allowed TPs per parent, TPs are removed starting with the lowest score.

## Output
The script generates a multitude of output files. Most of them are pickle files, which contain all the data from a given dictionary. These can be copied and moved to a different location to run just one part of the code or to save on running time. Each of the input files gets an individual csv file that can be imported into `Compound Discoverer`. If more than two input files are processed, 3 csv files are created from the combination of all methods. Two contain all the data about the parents and TPs before and after the scoring and are meant as an overview. The last one contains only the most important combined data with SMILES, Name, CAS, Formula, monoisotopic mass and InchiKey, which can be imported into `Compound Discoverer` as a mass list. Additionally, two inclusion lists are created, as well as a txt file containing the maximal element count of C, H, O, N, S, P, Cl, Br and I (useful for the `Compound Discoverer` workflow). The output paths default to an output folder located in the same folder as the script.
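The NCE values in the inclusion lists are predicted from the monoisotopic mass with the linear rule implemented in the script (masses above 350 u get a fixed NCE of 15). As a quick illustration:

```python
# NCE prediction rule as implemented in get_mass_list_from_prediction.py
def predict_nce(mass):
    if mass > 350:
        return 15  # heavy compounds get a fixed low NCE
    return 5 * round(((mass * -0.41) + 160) / 5)  # linear rule, rounded to steps of 5

predict_nce(200.0)  # 5 * round(78 / 5) = 80
predict_nce(300.0)  # 5 * round(37 / 5) = 35
```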
## Run script in Spyder
1. open the script: `get_mass_list_from_prediction.py`
2. change the paths of the input files according to their location on your hard drive and turn the Scoring System on or off.
3. click somewhere in the left window and press `F5` or click `Run File`; if a Run settings window appears, click `Run`
4. wait for the script to finish running (may take a while; `Script finished successfully` marks the end of the script)

Author: Leo Trostel, 2022
\ No newline at end of file
diff --git a/File_conversion/Prediction_output_to_mass_list/get_mass_list_from_prediction.py b/File_conversion/Prediction_output_to_mass_list/get_mass_list_from_prediction.py
index 9116a1f..1ee9d8a 100644
--- a/File_conversion/Prediction_output_to_mass_list/get_mass_list_from_prediction.py
+++ b/File_conversion/Prediction_output_to_mass_list/get_mass_list_from_prediction.py
@@ -1,1425 +1,1425 @@
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 22 16:17:39 2021
Edited on Tue Aug 17 13:55:32 2022

@author: trostele
"""
# start of script
# Python version 3.6.13
########################################################################################################################
# import all necessary packages
import pandas as pd
import rdkit # this RDKit version is only supported on Python < 3.7
from rdkit import Chem
from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit.Chem import Descriptors
from rdkit.Chem.rdMolDescriptors import CalcMolFormula
import pickle
import pubchempy as pcp
import re
import copy
########################################################################################################################
######################################## see README file for instructions! ############################################
########################################################################################################################
# INPUT FILES:

# first file (mandatory)
file_location_1 = "./input/round 2b/Eawag_PPS_BBD_results.tsv"
prediction_method_1 = "EAWAG-PPS" # "EAWAG-PPS" or "envipath" allowed
package_method_1 = "EAWAG_BBD-PPS_round_2b" # enter used package (used for naming the source in the combined data dicts and output)

# second file (optional)
file_location_2 = "./input/updated predictions 2/TP_prediction_BBD+SOIL_top_50.tsv"
prediction_method_2 = "envipath"
package_method_2 = "enviPath-BBD+SOIL_2"
consider_file_2 = "yes"

# third file (optional)
file_location_3 = "./input/updated predictions 2/TP_prediction_BBD+SLUDGE_top_50.tsv"
prediction_method_3 = "envipath"
package_method_3 = "enviPath-BBD+SLUDGE_3"
consider_file_3 = "yes"

# fourth file (optional)
file_location_4 = "./input/updated predictions 2/TP_prediction_BBD+SOIL+SLUDGE_top_50.tsv"
prediction_method_4 = "envipath"
package_method_4 = "enviPath-BBD+SOIL+SLUDGE_4"
consider_file_4 = "yes"

# fifth file (optional)
file_location_5 = "./input/round 2/Eawag_PPS_BBD_results_edited.tsv"
prediction_method_5 = "EAWAG-PPS"
package_method_5 = "EAWAG_BBD-PPS_round_2"
consider_file_5 = "yes"

# sixth file (optional)
file_location_6 = "./input/updated predictions 2/TP_prediction_BBD_top_50.tsv"
prediction_method_6 = "envipath"
package_method_6 = "enviPath-BBD_1"
consider_file_6 = "yes"

# seventh file (optional)
file_location_7 = ""
prediction_method_7 = ""
package_method_7 = ""
consider_file_7 = "no"

# eighth file
(optional) file_location_8 = "" prediction_method_8 = "" package_method_8 = "" consider_file_8 = "no" # mapping files: # code_location (mandatory) code_location = "./input/code_subst.txt" # SMILES_location (mandatory) smi_location = "./input/SMILES_selected_comp.txt" # name_location (mandatory) name_location = "./input/name_subst.txt" # turn search for CAS numbers for all compounds on (True) or off (False) CAS_search = False # SCORING SYSTEM: -scoring_system = False # True = active, False = inactive +scoring_system = False # True = active, False = inactive (default) max_TP_per_parent = 50 # add number of maximal allowed TPs per parent (must be an integer!) # OUTPUT FILES: # output could be changed output_location_1 = "./output/CD_masslist_1_" + package_method_1 + ".csv" output_location_2 = "./output/CD_masslist_2_" + package_method_2 + ".csv" output_location_3 = "./output/CD_masslist_3_" + package_method_3 + ".csv" output_location_4 = "./output/CD_masslist_4_" + package_method_4 + ".csv" output_location_5 = "./output/CD_masslist_5_" + package_method_5 + ".csv" output_location_6 = "./output/CD_masslist_6_" + package_method_6 + ".csv" output_location_7 = "./output/CD_masslist_7_" + package_method_7 + ".csv" output_location_8 = "./output/CD_masslist_8_" + package_method_8 + ".csv" # output: combined mass list of all methods output_file_CD_masslist = "./output/CD_masslist_combined.csv" # output: combined data of all methods output_file_all_data = "./output/combined_overview.csv" # output: inclusion list for QExactivePlus for positive mode output_inclusion_pos = "./output/inclusion_list_pos.csv" # output: inclusion list for QExactivePlus for negative mode output_inclusion_neg = "./output/inclusion_list_neg.csv" # output: max element count file output_max_element = "./output/max_element_count.txt" # output: removed TPs above 100 u output_removed_tps = "./output/removed_TPs_above_100_u.csv" ################################################################################################################################################################################################################################################################ ################################################################################################################################################################################################################################################################ # FUNCTIONS def file_to_csv(input_file, pickle_file, csv_file, file_type): """ :param input_file: :param pickle_file: :param csv_file: :param file_type: 'envipath' or 'EAWAG-PPS' """ print("Converting {} file".format(input_file)) if file_type == 'envipath': D1 = read_enviPath_file_to_dict(input_file) elif file_type == 'EAWAG-PPS': D1 = read_PPS_file_to_dict(input_file) else: raise ValueError("Possible values for file_type: 'envipath' or 'EAWAG-PPS'") D2 = canonicalize_dict(D1) D3 = annotate_dict(D2, file_type) do_pickle(D3, pickle_file) dict_to_csv(D3, csv_file) return D3 def read_enviPath_file_to_dict(input_file): envipath_file = open(input_file) sep = '\t' slash = "///" smiles = "SMILES" data_dict_envi = {} for line in envipath_file: linelist_envi = line.rstrip().split(sep) if line.startswith(slash): #skip pathway line substance = "" continue else: if line.startswith(smiles): #skip header line continue else: if len(linelist_envi) == 6:#skip parent line because TP_1 is always first generation (except Atv) substance = linelist_envi[0] continue else: if data_dict_envi.get(substance): #if parent SMILES 
exists as key then append list data_dict_envi[substance]['TP_list'].append(linelist_envi[0]) data_dict_envi[substance]['bt_list'].append(linelist_envi[3]) data_dict_envi[substance]['code_TP'].append(linelist_envi[1]) data_dict_envi[substance]['combined_prob'].append(linelist_envi[2]) else: #otherwise create new entry into dict data_dict_envi[substance] = {'TP_list': [linelist_envi[0]], "TP_list_canon_2":[],"TP_list_canon":[],'combined_prob': [linelist_envi[2]], 'bt_list': [linelist_envi[3]], 'code_TP': [linelist_envi[1]], "code_parent":[], "name" : [], "ID_TP": [], "ID_parent":None, "mass_TP" : [], "mass_parent" : [], "Structure_TP": [], "Structure_parent": None, "Formula_TP":[], "Formula_parent":None, "inchi_TP":[], "inchi_parent": None} for parent in code_dict: if parent not in data_dict_envi: data_dict_envi[parent] = {'TP_list': [], "TP_list_canon": [], "TP_list_canon_2": [], 'bt_list': [], 'code_TP': [], "code_parent": [], "name": [], "ID_TP": [], "ID_parent": None, "mass_TP": [], "mass_parent": [], "Structure_TP": [], "Structure_parent": None, "Formula_TP": [], "Formula_parent": None, "inchi_TP": [], "inchi_parent": None} for key in data_dict_envi: if key in code_dict.keys(): data_dict_envi[key]['code_parent'].append(code_dict.get(key)) # add the code to the data dict if key in name_dict.keys(): data_dict_envi[key]["name"].append(name_dict.get(key)) # add name to data_dict return data_dict_envi def canonicalize_smiles(smiles): mol = Chem.MolFromSmiles(smiles) # creates mol object from SMILES uncharger = rdMolStandardize.Uncharger() # easier to access uncharged = uncharger.uncharge(mol) # protonates or deprotonates the mol object new_smiles = rdkit.Chem.rdmolfiles.MolToSmiles(uncharged) # converts mol object to canonical SMILES can_smiles = Chem.CanonSmiles(new_smiles) return can_smiles def max_element_count(smi_list): max_C = 0 max_N = 0 max_F = 0 max_O = 0 max_S = 0 max_P = 0 max_Cl = 0 max_Br = 0 max_I = 0 max_H = 0 for smi in smi_list: if smi.count("C") > max_C: max_C = smi.count("C") if smi.count("N") > max_N: max_N = smi.count("N") if smi.count("F") > max_F: max_F = smi.count("F") if smi.count("O") > max_O: max_O = smi.count("O") if smi.count("S") > max_S: max_S = smi.count("S") if smi.count("P") > max_P: max_P = smi.count("P") if smi.count("Cl") > max_Cl: max_Cl = smi.count("Cl") if smi.count("Br") > max_Br: max_Br = smi.count("Br") if smi.count("I") > max_I: max_I = smi.count("I") mol = Chem.MolFromSmiles(smi) mol = Chem.AddHs(mol) mol = Chem.MolToSmiles(mol, allHsExplicit=True) hcount = mol.count("H") if hcount > max_H: max_H = hcount with open(output_max_element, 'w') as f: f.write("max. element count: " + "max C = " + str(max_C) + ", max H = " + str(max_H) + ", max O = " + str(max_O) + ", max N = " + str(max_N) + ", max S = " + str(max_S) + ", max P = " + str(max_P) + ", max Cl = " + str(max_Cl) + ", max Br = " + str(max_Br) + ", max I = " + str(max_I)) print("max. 
element count: " + "max C = " + str(max_C) + ", max H = " + str(max_H) + ", max O = " + str(max_O) + ", max N = " + str(max_N) + ", max S = " + str(max_S) + ", max P = " + str(max_P) + ", max Cl = " + str(max_Cl) + ", max Br = " + str(max_Br) + ", max I = " + str(max_I)) return def suggest_stepped_nce(smi_list): MolWeight_list = [] for compound in smi_list: MolWeight_list.append(Chem.Descriptors.ExactMolWt(Chem.MolFromSmiles(compound))) nce_list = [] for mass in MolWeight_list: if mass > 350: nce_list.append(15) else: nce_calc = 5 * round(((mass * -0.41) + 160)/5) nce_list.append(nce_calc) max_nce = max(nce_list) if max_nce > 120: high_nce = 100 else: high_nce = max_nce - 20 avg_nce = sum(nce_list)/len(nce_list) middle_nce = (5 * round(avg_nce/5)) - 5 min_nce = min(nce_list) if min_nce == 15: low_nce = min_nce else: low_nce = min_nce - 5 if high_nce < 0 or middle_nce - low_nce < 10 or high_nce - middle_nce < 10: print("Stepped NCE approach not recommended") else: print("Suggested Stepped NCE: " + "Low NCE = " + str(low_nce) + ", Middle NCE = " + str(middle_nce) + ", High NCE = " + str(high_nce)) return def do_pickle(d, pickle_file): with open("./output/" + pickle_file, 'wb') as handle: pickle.dump(d, handle, protocol=pickle.HIGHEST_PROTOCOL) def get_pickle(pickle_file): with open("./output/" + pickle_file, 'rb') as handle: d = pickle.load(handle) return d def canonicalize_dict(D): new_D = {} for parent in D: tp_dict = D[parent] tp_dict["TP_list_canon"] = [] for tp in D[parent]["TP_list"]: tp_dict["TP_list_canon"].append(canonicalize_smiles(tp)) new_D[canonicalize_smiles(parent)] = tp_dict return new_D def annotate_dict(data_dict, data_type): for parent in data_dict: counter = 1 for tp in data_dict[parent]["TP_list_canon"]: data_dict[parent]['mass_TP'].append(Chem.Descriptors.ExactMolWt(Chem.MolFromSmiles(tp))) # add structure of TP to dict data_dict[parent]["Structure_TP"].append(Chem.MolFromSmiles(tp)) # add molecular formula of TP data_dict[parent]["Formula_TP"].append(CalcMolFormula(Chem.MolFromSmiles(tp))) # add inchikey of TP data_dict[parent]["inchi_TP"].append(Chem.inchi.MolToInchiKey(Chem.MolFromSmiles(tp))) if data_type == 'EAWAG-PPS': # used for PPS where TPs are not named automatically data_dict[parent]['code_TP'].append("TP_" + data_dict[parent]['code_parent'][0] + "_" + str(counter)) counter += 1 # add parent mass data_dict[parent]['mass_parent'].append(Chem.Descriptors.ExactMolWt(Chem.MolFromSmiles(parent))) # add structure of parent to dict data_dict[parent]["Structure_parent"] = Chem.MolFromSmiles(parent) # add molecular formula of parent data_dict[parent]["Formula_parent"] = CalcMolFormula(Chem.MolFromSmiles(parent)) # add inchikey of parent data_dict[parent]["inchi_parent"] = Chem.inchi.MolToInchiKey(Chem.MolFromSmiles(parent)) # !!! # add CAS of parent from inchi key, if there is no CAS number for TP then there still needs to be an entry so that later list has same length! 
        if CAS_search == True:
            parent_cas = get_cas_inchi(data_dict[parent]["inchi_parent"])
        if CAS_search == False:
            parent_cas = "CAS search disabled"
        if len(parent_cas) > 0: # only add CAS if it was found
            data_dict[parent]["ID_parent"] = parent_cas
        else: # otherwise add empty string
            data_dict[parent]["ID_parent"] = ""
        # add CAS of TPs from inchi key
        for tp in data_dict[parent]["inchi_TP"]:
            if CAS_search == True:
                tp_cas = get_cas_inchi(tp)
            if CAS_search == False:
                tp_cas = "CAS search disabled"
            if len(tp_cas) > 0:
                data_dict[parent]["ID_TP"].append(tp_cas)
            else:
                data_dict[parent]["ID_TP"].append("")
        del data_dict[parent]["TP_list"]
    return data_dict

def dict_to_csv(data_dict, output_file):
    # create lists for each column and then create a dict with the correct layout, which can be converted to a dataframe using pandas
    df_name_list = []
    df_ID_list = []
    df_Formula_list = []
    df_MolWeight_list = []
    df_Structure_list = []
    # add all the data to the lists
    for parent in data_dict:
        df_name_list.append(data_dict[parent]["code_parent"][0]) # append code of parent first
        for tp in data_dict[parent]["code_TP"]: # then add all the codes of the TPs
            df_name_list.append(tp)
        # add CAS number
        df_ID_list.append(data_dict[parent]["ID_parent"])
        for tp in data_dict[parent]["ID_TP"]:
            df_ID_list.append(tp)
        # add monoisotopic mass
        df_MolWeight_list.append(data_dict[parent]["mass_parent"][0])
        for tp in data_dict[parent]["mass_TP"]:
            df_MolWeight_list.append(tp)
        # add chemical formula
        df_Formula_list.append(data_dict[parent]["Formula_parent"])
        for tp in data_dict[parent]["Formula_TP"]:
            df_Formula_list.append(tp)
        # add mol file
        mol_rep_1 = (Chem.MolToMolBlock(data_dict[parent]["Structure_parent"])).replace("\n", ";") # replace newline character with semicolon
        mol_rep_2 = mol_rep_1[6:] # skip first few spaces
        mol_rep_3 = mol_rep_2[:-1] # remove the last semicolon
        df_Structure_list.append(mol_rep_3) # add string to the list
        for tp in data_dict[parent]["Structure_TP"]:
            tp_mol_rep_1 = (Chem.MolToMolBlock(tp)).replace("\n", ";")
            tp_mol_rep_2 = tp_mol_rep_1[6:]
            tp_mol_rep_3 = tp_mol_rep_2[:-1]
            df_Structure_list.append(tp_mol_rep_3)
    # all lists must be the same length to convert them to a dataframe
    assert len(df_name_list) == len(df_Formula_list) == len(df_MolWeight_list) == len(df_Structure_list) == len(df_ID_list), "Error: all lists must be the same length to convert it to dataframe"
    # create dict and convert to dataframe
    df_dict = {"Name": df_name_list, "ID": df_ID_list, "Formula": df_Formula_list, "MolWeight": df_MolWeight_list, "Structure": df_Structure_list}
    df = pd.DataFrame.from_dict(df_dict)
    # export dataframe as csv
    df.to_csv(output_file, index = False, sep = "\t")

def combined_dict_to_csv(data_dict, output_file_CD, output_file_complete):
    # create lists for each column and then create a dict with the correct layout, which can be converted to a dataframe using pandas
    df_name_list_com = []
    df_smi_list_com = []
    df_ID_list_com = []
    df_Formula_list_com = []
    df_MolWeight_list_com = []
    df_Structure_list_com = []
    df_name_parent_list_com = []
    df_inchikey_list_com = []
    df_score_list_com = []
    df_rules_list_com = []
    df_source_list_com = []
    df_alt_parent_list_com = []
    # change the codes of the TPs, so each TP has its own name
    for parent in data_dict:
        counter = 1
        for tp in data_dict[parent]["TP_dict"]:
            data_dict[parent]["TP_dict"][tp]["code"] = "TP_" + data_dict[parent]['code_parent'][0] + "_" + str(counter)
            counter = counter + 1
    # add all the data to the lists
    for parent in data_dict:
        df_name_list_com.append(data_dict[parent]["code_parent"][0])
        df_smi_list_com.append(parent)
df_ID_list_com.append(data_dict[parent]["ID_parent"]) df_MolWeight_list_com.append(data_dict[parent]["mass_parent"][0]) df_Formula_list_com.append(data_dict[parent]["Formula_parent"]) mol_rep_1 = (Chem.MolToMolBlock(data_dict[parent]["Structure_parent"])).replace("\n", ";") #replace newline character with semicolon mol_rep_2 = mol_rep_1[6:] #skip first few spaces mol_rep_3 = mol_rep_2[:-1] #remove the last semicolon df_Structure_list_com.append(mol_rep_3) #add string to the list df_name_parent_list_com.append(data_dict[parent]["name"][0]) df_inchikey_list_com.append(data_dict[parent]["inchi_parent"]) df_score_list_com.append("100") df_rules_list_com.append("") df_source_list_com.append("") df_alt_parent_list_com.append([]) for tp in data_dict[parent]["TP_dict"]: df_name_list_com.append(data_dict[parent]["TP_dict"][tp]["code"]) df_smi_list_com.append(tp) df_ID_list_com.append(data_dict[parent]["TP_dict"][tp]["CAS"]) df_MolWeight_list_com.append(data_dict[parent]["TP_dict"][tp]["mass"]) df_Formula_list_com.append(data_dict[parent]["TP_dict"][tp]["Formula"]) tp_mol_rep_1 = (Chem.MolToMolBlock(data_dict[parent]["TP_dict"][tp]["Structure"])).replace("\n",";") tp_mol_rep_2 = tp_mol_rep_1[6:] tp_mol_rep_3 = tp_mol_rep_2[:-1] df_Structure_list_com.append(tp_mol_rep_3) df_name_parent_list_com.append(data_dict[parent]["name"][0]) df_inchikey_list_com.append(data_dict[parent]["TP_dict"][tp]["InchiKey"]) df_score_list_com.append(data_dict[parent]["TP_dict"][tp]["score"]) df_rules_list_com.append(data_dict[parent]["TP_dict"][tp]["rule_list"]) df_source_list_com.append(data_dict[parent]["TP_dict"][tp]["source_list"]) df_alt_parent_list_com.append(data_dict[parent]["TP_dict"][tp]["alternative_parent"]) max_element_count(df_smi_list_com) # all lists must be the same length to convert it to dataframe assert len(df_name_list_com) == len(df_Formula_list_com) == len(df_MolWeight_list_com) == len(df_Structure_list_com) == len(df_ID_list_com), "Error: all lists must be the same length to convert it to dataframe" #create dict and convert to dataframe df_com_dict = {"Name": df_name_list_com, "ID": df_ID_list_com,"Formula": df_Formula_list_com,"MolWeight": df_MolWeight_list_com, "Structure": df_Structure_list_com} df_com = pd.DataFrame.from_dict(df_com_dict) # export dataframe as csv df_com.to_csv(output_file_CD, index = False, sep = "\t") df_complete_dict = {"Name of parent":df_name_parent_list_com,"SMILES": df_smi_list_com, "Name": df_name_list_com, "Score": df_score_list_com, "Source": df_source_list_com, "CAS": df_ID_list_com,"Formula": df_Formula_list_com, "MolWeight": df_MolWeight_list_com, "InchiKey":df_inchikey_list_com, "Alternative parent": df_alt_parent_list_com, "bt rules": df_rules_list_com} df_complete = pd.DataFrame.from_dict(df_complete_dict) # export dataframe as csv df_complete.to_csv(output_file_complete, index = False, sep = ",") # create inclusion list for QExactivePlus m_proton = 1.0072756 df_M_plus_H = [] df_M_minus_H = [] df_polarity_pos = [] df_polarity_neg = [] df_empty = [] df_nce_type = [] df_nce = [] for mass in df_MolWeight_list_com: df_M_plus_H.append(mass + m_proton) df_M_minus_H.append(mass - m_proton) df_polarity_pos.append("Positive") df_polarity_neg.append("Negative") df_empty.append(" ") df_nce_type.append("NCE") if mass > 350: df_nce.append(15) else: nce_calc = 5 * round(((mass * -0.41) + 160)/5) df_nce.append(nce_calc) d_inclusion_pos = {"Mass [m/z]": df_M_plus_H ,"Formula [M]": df_empty, "Formula type": df_empty, "Species": df_empty, "CS [z]": df_empty, "Polarity": 
df_polarity_pos, "Start [min]": df_empty, "End [min]": df_empty, "(N)CE": df_nce, "(N)CE type": df_nce_type, "MSX ID": df_empty, "Comment": df_name_list_com}
    df_inclusion_pos = pd.DataFrame.from_dict(d_inclusion_pos)
    # export dataframe as csv
    df_inclusion_pos.to_csv(output_inclusion_pos, index = False, sep = ",")
    d_inclusion_neg = {"Mass [m/z]": df_M_minus_H, "Formula [M]": df_empty, "Formula type": df_empty, "Species": df_empty, "CS [z]": df_empty, "Polarity": df_polarity_neg, "Start [min]": df_empty, "End [min]": df_empty, "(N)CE": df_nce, "(N)CE type": df_nce_type, "MSX ID": df_empty, "Comment": df_name_list_com}
    df_inclusion_neg = pd.DataFrame.from_dict(d_inclusion_neg)
    # export dataframe as csv
    df_inclusion_neg.to_csv(output_inclusion_neg, index = False, sep = ",")
    suggest_stepped_nce(df_smi_list_com)
    print("Export complete")

def get_cas_inchi(inchi): # look up CAS numbers on PubChem via the first block of the InChIKey
    cas_rns = []
    inchi_split = inchi.split("-")[0]
    results = pcp.get_synonyms(inchi_split, 'inchikey')
    for result in results:
        for syn in result.get('Synonym', []):
            match = re.match(r'(\d{2,7}-\d\d-\d)', syn)
            if match:
                cas_rns.append(match.group(1))
    return cas_rns

def read_PPS_file_to_dict(PPS_file_location):
    PPS_file = open(PPS_file_location)
    sep = '\t'
    line_1 = PPS_file.readline()
    line_list_1 = line_1.rstrip().split(sep) # rstrip() removes the newline character '\n' at the end of the line
    Settings = {}
    Settings[line_list_1[0]] = line_list_1[1]
    line_2 = PPS_file.readline()
    line_list_2 = line_2.rstrip().split(sep)
    Settings[line_list_2[0]] = line_list_2[1]
    line_3 = PPS_file.readline()
    line_4 = PPS_file.readline()
    PPS_file.readline()
    header_line = PPS_file.readline()
    compound_list = header_line.rstrip().split(sep)
    data = {}
    data_dict = {}
    for line in PPS_file:
        linelist = line.rstrip().split(sep) # we get the list
        # the first item of the list is the TP, the following items are biotransformation rules producing the TP from a given compound
        for index, substance in enumerate(compound_list):
            # the first item in the compound list is empty, so skip it
            if index == 0:
                continue
            # empty fields at the end of the line are not imported as empty strings, add them manually
            while len(linelist) < len(compound_list):
                linelist.append('')
            set_of_rules = linelist[index]
            if set_of_rules != '': # only process if the set of rules is not empty
                if data_dict.get(substance): # if key exists then append list
                    data_dict[substance]['TP_list'].append(linelist[0])
                    data_dict[substance]['bt_list'].append(set_of_rules)
                else: # otherwise create new entry in the dict
                    data_dict[substance] = {'TP_list': [linelist[0]], "TP_list_canon": [], "TP_list_canon_2": [], 'bt_list': [set_of_rules], 'code_TP': [], "code_parent": [], "name": [], "ID_TP": [], "ID_parent": None, "mass_TP": [], "mass_parent": [], "Structure_TP": [], "Structure_parent": None, "Formula_TP": [], "Formula_parent": None, "inchi_TP": [], "inchi_parent": None}
    for key in data_dict:
        if key in code_dict.keys():
            data_dict[key]['code_parent'].append(code_dict.get(key)) # add the code to the data dict
        if key in name_dict.keys():
            data_dict[key]["name"].append(name_dict.get(key)) # add name to data_dict like the code before
    for parent in code_dict: # code_dict and name_dict are built from line-aligned files, so they share the same keys
        if parent not in data_dict.keys():
            data_dict[parent] = {'TP_list': [], "TP_list_canon": [], "TP_list_canon_2": [], 'bt_list': [], 'code_TP': [], "code_parent": [code_dict.get(parent)], "name": [name_dict.get(parent)], "ID_TP": [], "ID_parent": None, "mass_TP": [], "mass_parent": [], "Structure_TP": [], "Structure_parent":
None, "Formula_TP": [], "Formula_parent": None, "inchi_TP": [], "inchi_parent": None} return data_dict def combine_dict (d_1, method_1_package, d_2, method_2_package, d_3, method_3_package, d_4, method_4_package, d_5, method_5_package, d_6, method_6_package, d_7, method_7_package, d_8, method_8_package): print("Combining "+ method_1_package + ", " + method_2_package + ", " + method_3_package + " , " + method_4_package + " , " + method_5_package+ " , " + method_6_package + " , " + method_7_package+ " and " + method_8_package) # create new dict from copy of envi dict and then delete TP info data_dict_com = copy.deepcopy(d_1) # need deepcopy otherwise it will still change original dict for parent in data_dict_com: if data_dict_com.get(parent, {}).get("combined_prob") is None: if data_dict_com.get(parent, {}).get("bt_list") is not None: del data_dict_com[parent]["bt_list"] del data_dict_com[parent]["code_TP"] del data_dict_com[parent]["inchi_TP"] del data_dict_com[parent]["mass_TP"] del data_dict_com[parent]["Formula_TP"] del data_dict_com[parent]["Structure_TP"] del data_dict_com[parent]["ID_TP"] del data_dict_com[parent]["TP_list_canon"] del data_dict_com[parent]["TP_list_canon_2"] else: if data_dict_com.get(parent, {}).get("bt_list") is not None: del data_dict_com[parent]["bt_list"] del data_dict_com[parent]["code_TP"] del data_dict_com[parent]["inchi_TP"] del data_dict_com[parent]["mass_TP"] del data_dict_com[parent]["Formula_TP"] del data_dict_com[parent]["Structure_TP"] del data_dict_com[parent]["ID_TP"] del data_dict_com[parent]["TP_list_canon"] del data_dict_com[parent]["combined_prob"] del data_dict_com[parent]["TP_list_canon_2"] # add TP data from first data dict for parent in d_1: if d_1.get(parent, {}).get("combined_prob") is not None: for index, tp in enumerate(d_1[parent]["TP_list_canon"]): if data_dict_com.get(parent, {}).get("TP_dict") is None: data_dict_com[parent]["TP_dict"] = {} data_dict_com[parent]["TP_dict"][tp] = {"CAS": d_1[parent]["ID_TP"][index], "rule_list":[d_1[parent]["bt_list"][index]], "mass": d_1[parent]["mass_TP"][index], "Formula": d_1[parent]["Formula_TP"][index], "source_list" :[method_1_package], "code": d_1[parent]["code_TP"][index], "Structure" : d_1[parent]["Structure_TP"][index], "combined_prob": [d_1[parent]["combined_prob"][index]], "score": 100, "InchiKey": d_1[parent]["inchi_TP"][index], "alternative_parent" : []} else: if data_dict_com[parent]["TP_dict"].get(tp): # if TP already there then append list if method_1_package not in data_dict_com[parent]["TP_dict"][tp]["source_list"]: data_dict_com[parent]["TP_dict"][tp]["source_list"].append(method_1_package) data_dict_com[parent]["TP_dict"][tp]["combined_prob"].append(d_1[parent]["combined_prob"][index]) else: data_dict_com[parent]["TP_dict"][tp] = {"CAS": d_1[parent]["ID_TP"][index], "rule_list":[d_1[parent]["bt_list"][index]], "mass": d_1[parent]["mass_TP"][index], "Formula": d_1[parent]["Formula_TP"][index], "source_list" :[method_1_package], "code": d_1[parent]["code_TP"][index], "Structure" : d_1[parent]["Structure_TP"][index], "combined_prob": [d_1[parent]["combined_prob"][index]], "score": 100, "InchiKey": d_1[parent]["inchi_TP"][index], "alternative_parent" : []} else: for parent in d_1: for index, tp in enumerate(d_1[parent]["TP_list_canon"]): if data_dict_com.get(parent, {}).get("TP_dict") is None: data_dict_com[parent]["TP_dict"] = {} data_dict_com[parent]["TP_dict"][tp] = {"CAS": d_1[parent]["ID_TP"][index], "rule_list":[d_1[parent]["bt_list"][index]], "mass": 
d_1[parent]["mass_TP"][index], "Formula": d_1[parent]["Formula_TP"][index], "source_list" :[method_1_package], "code": d_1[parent]["code_TP"][index], "Structure" : d_1[parent]["Structure_TP"][index], "combined_prob": [], "score": 100, "InchiKey": d_1[parent]["inchi_TP"][index], "alternative_parent" : []} else: if data_dict_com[parent]["TP_dict"].get(tp): # if TP already there then append list if method_1_package not in data_dict_com[parent]["TP_dict"][tp]["source_list"]: data_dict_com[parent]["TP_dict"][tp]["source_list"].append(method_1_package) else: data_dict_com[parent]["TP_dict"][tp] = {"CAS": d_1[parent]["ID_TP"][index], "rule_list":[d_1[parent]["bt_list"][index]], "mass": d_1[parent]["mass_TP"][index], "Formula": d_1[parent]["Formula_TP"][index], "source_list" :[method_1_package], "code": d_1[parent]["code_TP"][index], "Structure" : d_1[parent]["Structure_TP"][index], "combined_prob": [], "score": 100, "InchiKey": d_1[parent]["inchi_TP"][index], "alternative_parent" : []} # add TP data from second data dict for parent in d_2: if d_2.get(parent, {}).get("combined_prob") is not None: for index, tp in enumerate(d_2[parent]["TP_list_canon"]): if data_dict_com.get(parent, {}).get("TP_dict") is None: data_dict_com[parent]["TP_dict"] = {} data_dict_com[parent]["TP_dict"][tp] = {"CAS": d_2[parent]["ID_TP"][index], "rule_list":[d_2[parent]["bt_list"][index]], "mass": d_2[parent]["mass_TP"][index], "Formula": d_2[parent]["Formula_TP"][index], "source_list" :[method_2_package], "code": d_2[parent]["code_TP"][index], "Structure" : d_2[parent]["Structure_TP"][index], "combined_prob": [d_2[parent]["combined_prob"][index]], "score": 100, "InchiKey": d_2[parent]["inchi_TP"][index], "alternative_parent" : []} else: if data_dict_com[parent]["TP_dict"].get(tp): # if TP already there then append list data_dict_com[parent]["TP_dict"][tp]["combined_prob"].append(d_2[parent]["combined_prob"][index]) if d_2[parent]["bt_list"][index] not in data_dict_com[parent]["TP_dict"][tp]["rule_list"]: data_dict_com[parent]["TP_dict"][tp]["rule_list"].append(d_2[parent]["bt_list"][index]) if method_2_package not in data_dict_com[parent]["TP_dict"][tp]["source_list"]: data_dict_com[parent]["TP_dict"][tp]["source_list"].append(method_2_package) else: data_dict_com[parent]["TP_dict"][tp] = {"CAS": d_2[parent]["ID_TP"][index], "rule_list":[d_2[parent]["bt_list"][index]], "mass": d_2[parent]["mass_TP"][index], "Formula": d_2[parent]["Formula_TP"][index], "source_list" :[method_2_package], "code": d_2[parent]["code_TP"][index], "Structure" : d_2[parent]["Structure_TP"][index], "combined_prob": [d_2[parent]["combined_prob"][index]], "score": 100, "InchiKey": d_2[parent]["inchi_TP"][index], "alternative_parent" : []} else: for parent in d_2: for index, tp in enumerate(d_2[parent]["TP_list_canon"]): if data_dict_com.get(parent, {}).get("TP_dict") is None: data_dict_com[parent]["TP_dict"] = {} data_dict_com[parent]["TP_dict"][tp] = {"CAS": d_2[parent]["ID_TP"][index], "rule_list":[d_2[parent]["bt_list"][index]], "mass": d_2[parent]["mass_TP"][index], "Formula": d_2[parent]["Formula_TP"][index], "source_list" :[method_2_package], "code": d_2[parent]["code_TP"][index], "Structure" : d_2[parent]["Structure_TP"][index], "combined_prob": [], "score": 100, "InchiKey": d_2[parent]["inchi_TP"][index], "alternative_parent" : []} else: if data_dict_com[parent]["TP_dict"].get(tp): # if TP already there then append list if d_2[parent]["bt_list"][index] not in data_dict_com[parent]["TP_dict"][tp]["rule_list"]: 
data_dict_com[parent]["TP_dict"][tp]["rule_list"].append(d_2[parent]["bt_list"][index]) if method_2_package not in data_dict_com[parent]["TP_dict"][tp]["source_list"]: data_dict_com[parent]["TP_dict"][tp]["source_list"].append(method_2_package) else: data_dict_com[parent]["TP_dict"][tp] = {"CAS": d_2[parent]["ID_TP"][index], "rule_list":[d_2[parent]["bt_list"][index]], "mass": d_2[parent]["mass_TP"][index], "Formula": d_2[parent]["Formula_TP"][index], "source_list" :[method_2_package], "code": d_2[parent]["code_TP"][index], "Structure" : d_2[parent]["Structure_TP"][index], "combined_prob": [], "score": 100, "InchiKey": d_2[parent]["inchi_TP"][index], "alternative_parent" : []} # add TP data from third data dict (if specified) if d_3 != "none": for parent in d_3: if d_3.get(parent, {}).get("combined_prob") is not None: for index, tp in enumerate(d_3[parent]["TP_list_canon"]): if data_dict_com.get(parent, {}).get("TP_dict") is None: data_dict_com[parent]["TP_dict"] = {} data_dict_com[parent]["TP_dict"][tp] = {"CAS": d_3[parent]["ID_TP"][index], "rule_list":[d_3[parent]["bt_list"][index]], "mass": d_3[parent]["mass_TP"][index], "Formula": d_3[parent]["Formula_TP"][index], "source_list" :[method_3_package], "code": d_3[parent]["code_TP"][index], "Structure" : d_3[parent]["Structure_TP"][index], "combined_prob": [d_3[parent]["combined_prob"][index]], "score": 100, "InchiKey": d_3[parent]["inchi_TP"][index], "alternative_parent" : []} else: if data_dict_com[parent]["TP_dict"].get(tp): # if TP already there then append list data_dict_com[parent]["TP_dict"][tp]["combined_prob"].append(d_3[parent]["combined_prob"][index]) if d_3[parent]["bt_list"][index] not in data_dict_com[parent]["TP_dict"][tp]["rule_list"]: data_dict_com[parent]["TP_dict"][tp]["rule_list"].append(d_3[parent]["bt_list"][index]) if method_3_package not in data_dict_com[parent]["TP_dict"][tp]["source_list"]: data_dict_com[parent]["TP_dict"][tp]["source_list"].append(method_3_package) else: data_dict_com[parent]["TP_dict"][tp] = {"CAS": d_3[parent]["ID_TP"][index], "rule_list":[d_3[parent]["bt_list"][index]], "mass": d_3[parent]["mass_TP"][index], "Formula": d_3[parent]["Formula_TP"][index], "source_list" :[method_3_package], "code": d_3[parent]["code_TP"][index], "Structure" : d_3[parent]["Structure_TP"][index], "combined_prob": [d_3[parent]["combined_prob"][index]], "score": 100, "InchiKey": d_3[parent]["inchi_TP"][index], "alternative_parent" : []} else: for parent in d_3: for index, tp in enumerate(d_3[parent]["TP_list_canon"]): if data_dict_com.get(parent, {}).get("TP_dict") is None: data_dict_com[parent]["TP_dict"] = {} data_dict_com[parent]["TP_dict"][tp] = {"CAS": d_3[parent]["ID_TP"][index], "rule_list":[d_3[parent]["bt_list"][index]], "mass": d_3[parent]["mass_TP"][index], "Formula": d_3[parent]["Formula_TP"][index], "source_list" :[method_3_package], "code": d_3[parent]["code_TP"][index], "Structure" : d_3[parent]["Structure_TP"][index], "combined_prob": [], "score": 100, "InchiKey": d_3[parent]["inchi_TP"][index], "alternative_parent" : []} else: if data_dict_com[parent]["TP_dict"].get(tp): # if TP already there then append list if d_3[parent]["bt_list"][index] not in data_dict_com[parent]["TP_dict"][tp]["rule_list"]: data_dict_com[parent]["TP_dict"][tp]["rule_list"].append(d_3[parent]["bt_list"][index]) if method_3_package not in data_dict_com[parent]["TP_dict"][tp]["source_list"]: data_dict_com[parent]["TP_dict"][tp]["source_list"].append(method_3_package) else: data_dict_com[parent]["TP_dict"][tp] = {"CAS": 
d_3[parent]["ID_TP"][index], "rule_list":[d_3[parent]["bt_list"][index]], "mass": d_3[parent]["mass_TP"][index], "Formula": d_3[parent]["Formula_TP"][index], "source_list" :[method_3_package], "code": d_3[parent]["code_TP"][index], "Structure" : d_3[parent]["Structure_TP"][index], "combined_prob": [], "score": 100, "InchiKey": d_3[parent]["inchi_TP"][index], "alternative_parent" : []} # add TP data from fourth data dict (if specified) if d_4 != "none": for parent in d_4: if d_4.get(parent, {}).get("combined_prob") is not None: for index, tp in enumerate(d_4[parent]["TP_list_canon"]): if data_dict_com.get(parent, {}).get("TP_dict") is None: data_dict_com[parent]["TP_dict"] = {} data_dict_com[parent]["TP_dict"][tp] = {"CAS": d_4[parent]["ID_TP"][index], "rule_list":[d_4[parent]["bt_list"][index]], "mass": d_4[parent]["mass_TP"][index], "Formula": d_4[parent]["Formula_TP"][index], "source_list" :[method_4_package], "code": d_4[parent]["code_TP"][index], "Structure" : d_4[parent]["Structure_TP"][index], "combined_prob": [d_4[parent]["combined_prob"][index]], "score": 100, "InchiKey": d_4[parent]["inchi_TP"][index], "alternative_parent" : []} else: if data_dict_com[parent]["TP_dict"].get(tp): # if TP already there then append list data_dict_com[parent]["TP_dict"][tp]["combined_prob"].append(d_4[parent]["combined_prob"][index]) if d_4[parent]["bt_list"][index] not in data_dict_com[parent]["TP_dict"][tp]["rule_list"]: data_dict_com[parent]["TP_dict"][tp]["rule_list"].append(d_4[parent]["bt_list"][index]) if method_4_package not in data_dict_com[parent]["TP_dict"][tp]["source_list"]: data_dict_com[parent]["TP_dict"][tp]["source_list"].append(method_4_package) else: data_dict_com[parent]["TP_dict"][tp] = {"CAS": d_4[parent]["ID_TP"][index], "rule_list":[d_4[parent]["bt_list"][index]], "mass": d_4[parent]["mass_TP"][index], "Formula": d_4[parent]["Formula_TP"][index], "source_list" :[method_4_package], "code": d_4[parent]["code_TP"][index], "Structure" : d_4[parent]["Structure_TP"][index], "combined_prob": [d_4[parent]["combined_prob"][index]], "score": 100, "InchiKey": d_4[parent]["inchi_TP"][index], "alternative_parent" : []} else: for parent in d_4: for index, tp in enumerate(d_4[parent]["TP_list_canon"]): if data_dict_com.get(parent, {}).get("TP_dict") is None: data_dict_com[parent]["TP_dict"] = {} data_dict_com[parent]["TP_dict"][tp] = {"CAS": d_4[parent]["ID_TP"][index], "rule_list":[d_4[parent]["bt_list"][index]], "mass": d_4[parent]["mass_TP"][index], "Formula": d_4[parent]["Formula_TP"][index], "source_list" :[method_4_package], "code": d_4[parent]["code_TP"][index], "Structure" : d_4[parent]["Structure_TP"][index], "combined_prob": [], "score": 100, "InchiKey": d_4[parent]["inchi_TP"][index], "alternative_parent" : []} else: if data_dict_com[parent]["TP_dict"].get(tp): # if TP already there then append list if d_4[parent]["bt_list"][index] not in data_dict_com[parent]["TP_dict"][tp]["rule_list"]: data_dict_com[parent]["TP_dict"][tp]["rule_list"].append(d_4[parent]["bt_list"][index]) if method_4_package not in data_dict_com[parent]["TP_dict"][tp]["source_list"]: data_dict_com[parent]["TP_dict"][tp]["source_list"].append(method_4_package) else: data_dict_com[parent]["TP_dict"][tp] = {"CAS": d_4[parent]["ID_TP"][index], "rule_list":[d_4[parent]["bt_list"][index]], "mass": d_4[parent]["mass_TP"][index], "Formula": d_4[parent]["Formula_TP"][index], "source_list" :[method_4_package], "code": d_4[parent]["code_TP"][index], "Structure" : d_4[parent]["Structure_TP"][index], "combined_prob": [], 
"score": 100, "InchiKey": d_4[parent]["inchi_TP"][index], "alternative_parent" : []} # add TP data from fifth data dict (if specified) if d_5 != "none": for parent in d_5: if d_5.get(parent, {}).get("combined_prob") is not None: for index, tp in enumerate(d_5[parent]["TP_list_canon"]): if data_dict_com.get(parent, {}).get("TP_dict") is None: data_dict_com[parent]["TP_dict"] = {} data_dict_com[parent]["TP_dict"][tp] = {"CAS": d_5[parent]["ID_TP"][index], "rule_list":[d_5[parent]["bt_list"][index]], "mass": d_5[parent]["mass_TP"][index], "Formula": d_5[parent]["Formula_TP"][index], "source_list" :[method_5_package], "code": d_5[parent]["code_TP"][index], "Structure" : d_5[parent]["Structure_TP"][index], "combined_prob": [d_5[parent]["combined_prob"][index]], "score": 100, "InchiKey": d_5[parent]["inchi_TP"][index], "alternative_parent" : []} else: if data_dict_com[parent]["TP_dict"].get(tp): # if TP already there then append list data_dict_com[parent]["TP_dict"][tp]["combined_prob"].append(d_5[parent]["combined_prob"][index]) if d_5[parent]["bt_list"][index] not in data_dict_com[parent]["TP_dict"][tp]["rule_list"]: data_dict_com[parent]["TP_dict"][tp]["rule_list"].append(d_5[parent]["bt_list"][index]) if method_5_package not in data_dict_com[parent]["TP_dict"][tp]["source_list"]: data_dict_com[parent]["TP_dict"][tp]["source_list"].append(method_5_package) else: data_dict_com[parent]["TP_dict"][tp] = {"CAS": d_5[parent]["ID_TP"][index], "rule_list":[d_5[parent]["bt_list"][index]], "mass": d_5[parent]["mass_TP"][index], "Formula": d_5[parent]["Formula_TP"][index], "source_list" :[method_5_package], "code": d_5[parent]["code_TP"][index], "Structure" : d_5[parent]["Structure_TP"][index], "combined_prob": [d_5[parent]["combined_prob"][index]], "score": 100, "InchiKey": d_5[parent]["inchi_TP"][index], "alternative_parent" : []} else: for parent in d_5: for index, tp in enumerate(d_5[parent]["TP_list_canon"]): if data_dict_com.get(parent, {}).get("TP_dict") is None: data_dict_com[parent]["TP_dict"] = {} data_dict_com[parent]["TP_dict"][tp] = {"CAS": d_5[parent]["ID_TP"][index], "rule_list":[d_5[parent]["bt_list"][index]], "mass": d_5[parent]["mass_TP"][index], "Formula": d_5[parent]["Formula_TP"][index], "source_list" :[method_5_package], "code": d_5[parent]["code_TP"][index], "Structure" : d_5[parent]["Structure_TP"][index], "combined_prob": [], "score": 100, "InchiKey": d_5[parent]["inchi_TP"][index], "alternative_parent" : []} else: if data_dict_com[parent]["TP_dict"].get(tp): # if TP already there then append list if d_5[parent]["bt_list"][index] not in data_dict_com[parent]["TP_dict"][tp]["rule_list"]: data_dict_com[parent]["TP_dict"][tp]["rule_list"].append(d_5[parent]["bt_list"][index]) if method_5_package not in data_dict_com[parent]["TP_dict"][tp]["source_list"]: data_dict_com[parent]["TP_dict"][tp]["source_list"].append(method_5_package) else: data_dict_com[parent]["TP_dict"][tp] = {"CAS": d_5[parent]["ID_TP"][index], "rule_list":[d_5[parent]["bt_list"][index]], "mass": d_5[parent]["mass_TP"][index], "Formula": d_5[parent]["Formula_TP"][index], "source_list" :[method_5_package], "code": d_5[parent]["code_TP"][index], "Structure" : d_5[parent]["Structure_TP"][index], "combined_prob": [], "score": 100, "InchiKey": d_5[parent]["inchi_TP"][index], "alternative_parent" : []} # add TP data from sixth data dict (if specified) if d_6 != "none": for parent in d_6: if d_6.get(parent, {}).get("combined_prob") is not None: for index, tp in enumerate(d_6[parent]["TP_list_canon"]): if 
data_dict_com.get(parent, {}).get("TP_dict") is None: data_dict_com[parent]["TP_dict"] = {} data_dict_com[parent]["TP_dict"][tp] = {"CAS": d_6[parent]["ID_TP"][index], "rule_list":[d_6[parent]["bt_list"][index]], "mass": d_6[parent]["mass_TP"][index], "Formula": d_6[parent]["Formula_TP"][index], "source_list" :[method_6_package], "code": d_6[parent]["code_TP"][index], "Structure" : d_6[parent]["Structure_TP"][index], "combined_prob": [d_6[parent]["combined_prob"][index]], "score": 100, "InchiKey": d_6[parent]["inchi_TP"][index], "alternative_parent" : []} else: if data_dict_com[parent]["TP_dict"].get(tp): # if TP already there then append list data_dict_com[parent]["TP_dict"][tp]["combined_prob"].append(d_6[parent]["combined_prob"][index]) if d_6[parent]["bt_list"][index] not in data_dict_com[parent]["TP_dict"][tp]["rule_list"]: data_dict_com[parent]["TP_dict"][tp]["rule_list"].append(d_6[parent]["bt_list"][index]) if method_6_package not in data_dict_com[parent]["TP_dict"][tp]["source_list"]: data_dict_com[parent]["TP_dict"][tp]["source_list"].append(method_6_package) else: data_dict_com[parent]["TP_dict"][tp] = {"CAS": d_6[parent]["ID_TP"][index], "rule_list":[d_6[parent]["bt_list"][index]], "mass": d_6[parent]["mass_TP"][index], "Formula": d_6[parent]["Formula_TP"][index], "source_list" :[method_6_package], "code": d_6[parent]["code_TP"][index], "Structure" : d_6[parent]["Structure_TP"][index], "combined_prob": [d_6[parent]["combined_prob"][index]], "score": 100, "InchiKey": d_6[parent]["inchi_TP"][index], "alternative_parent" : []} else: for parent in d_6: for index, tp in enumerate(d_6[parent]["TP_list_canon"]): if data_dict_com.get(parent, {}).get("TP_dict") is None: data_dict_com[parent]["TP_dict"] = {} data_dict_com[parent]["TP_dict"][tp] = {"CAS": d_6[parent]["ID_TP"][index], "rule_list":[d_6[parent]["bt_list"][index]], "mass": d_6[parent]["mass_TP"][index], "Formula": d_6[parent]["Formula_TP"][index], "source_list" :[method_6_package], "code": d_6[parent]["code_TP"][index], "Structure" : d_6[parent]["Structure_TP"][index], "combined_prob": [], "score": 100, "InchiKey": d_6[parent]["inchi_TP"][index], "alternative_parent" : []} else: if data_dict_com[parent]["TP_dict"].get(tp): # if TP already there then append list if d_6[parent]["bt_list"][index] not in data_dict_com[parent]["TP_dict"][tp]["rule_list"]: data_dict_com[parent]["TP_dict"][tp]["rule_list"].append(d_6[parent]["bt_list"][index]) if method_6_package not in data_dict_com[parent]["TP_dict"][tp]["source_list"]: data_dict_com[parent]["TP_dict"][tp]["source_list"].append(method_6_package) else: data_dict_com[parent]["TP_dict"][tp] = {"CAS": d_6[parent]["ID_TP"][index], "rule_list":[d_6[parent]["bt_list"][index]], "mass": d_6[parent]["mass_TP"][index], "Formula": d_6[parent]["Formula_TP"][index], "source_list" :[method_6_package], "code": d_6[parent]["code_TP"][index], "Structure" : d_6[parent]["Structure_TP"][index], "combined_prob": [], "score": 100, "InchiKey": d_6[parent]["inchi_TP"][index], "alternative_parent" : []} # add TP data from seventh data dict (if specified) if d_7 != "none": for parent in d_7: if d_7.get(parent, {}).get("combined_prob") is not None: for index, tp in enumerate(d_7[parent]["TP_list_canon"]): if data_dict_com.get(parent, {}).get("TP_dict") is None: data_dict_com[parent]["TP_dict"] = {} data_dict_com[parent]["TP_dict"][tp] = {"CAS": d_7[parent]["ID_TP"][index], "rule_list":[d_7[parent]["bt_list"][index]], "mass": d_7[parent]["mass_TP"][index], "Formula": d_7[parent]["Formula_TP"][index], 
"source_list" :[method_7_package], "code": d_7[parent]["code_TP"][index], "Structure" : d_7[parent]["Structure_TP"][index], "combined_prob": [d_7[parent]["combined_prob"][index]], "score": 100, "InchiKey": d_7[parent]["inchi_TP"][index], "alternative_parent" : []} else: if data_dict_com[parent]["TP_dict"].get(tp): # if TP already there then append list data_dict_com[parent]["TP_dict"][tp]["combined_prob"].append(d_7[parent]["combined_prob"][index]) if d_7[parent]["bt_list"][index] not in data_dict_com[parent]["TP_dict"][tp]["rule_list"]: data_dict_com[parent]["TP_dict"][tp]["rule_list"].append(d_7[parent]["bt_list"][index]) if method_7_package not in data_dict_com[parent]["TP_dict"][tp]["source_list"]: data_dict_com[parent]["TP_dict"][tp]["source_list"].append(method_7_package) else: data_dict_com[parent]["TP_dict"][tp] = {"CAS": d_7[parent]["ID_TP"][index], "rule_list":[d_7[parent]["bt_list"][index]], "mass": d_7[parent]["mass_TP"][index], "Formula": d_7[parent]["Formula_TP"][index], "source_list" :[method_7_package], "code": d_7[parent]["code_TP"][index], "Structure" : d_7[parent]["Structure_TP"][index], "combined_prob": [d_7[parent]["combined_prob"][index]], "score": 100, "InchiKey": d_7[parent]["inchi_TP"][index], "alternative_parent" : []} else: for parent in d_7: for index, tp in enumerate(d_7[parent]["TP_list_canon"]): if data_dict_com.get(parent, {}).get("TP_dict") is None: data_dict_com[parent]["TP_dict"] = {} data_dict_com[parent]["TP_dict"][tp] = {"CAS": d_7[parent]["ID_TP"][index], "rule_list":[d_7[parent]["bt_list"][index]], "mass": d_7[parent]["mass_TP"][index], "Formula": d_7[parent]["Formula_TP"][index], "source_list" :[method_7_package], "code": d_7[parent]["code_TP"][index], "Structure" : d_7[parent]["Structure_TP"][index], "combined_prob": [], "score": 100, "InchiKey": d_7[parent]["inchi_TP"][index], "alternative_parent" : []} else: if data_dict_com[parent]["TP_dict"].get(tp): # if TP already there then append list if d_7[parent]["bt_list"][index] not in data_dict_com[parent]["TP_dict"][tp]["rule_list"]: data_dict_com[parent]["TP_dict"][tp]["rule_list"].append(d_7[parent]["bt_list"][index]) if method_7_package not in data_dict_com[parent]["TP_dict"][tp]["source_list"]: data_dict_com[parent]["TP_dict"][tp]["source_list"].append(method_7_package) else: data_dict_com[parent]["TP_dict"][tp] = {"CAS": d_7[parent]["ID_TP"][index], "rule_list":[d_7[parent]["bt_list"][index]], "mass": d_7[parent]["mass_TP"][index], "Formula": d_7[parent]["Formula_TP"][index], "source_list" :[method_7_package], "code": d_7[parent]["code_TP"][index], "Structure" : d_7[parent]["Structure_TP"][index], "combined_prob": [], "score": 100, "InchiKey": d_7[parent]["inchi_TP"][index], "alternative_parent" : []} # add TP data from eighth data dict (if specified) if d_8 != "none": for parent in d_8: if d_8.get(parent, {}).get("combined_prob") is not None: for index, tp in enumerate(d_8[parent]["TP_list_canon"]): if data_dict_com.get(parent, {}).get("TP_dict") is None: data_dict_com[parent]["TP_dict"] = {} data_dict_com[parent]["TP_dict"][tp] = {"CAS": d_8[parent]["ID_TP"][index], "rule_list":[d_8[parent]["bt_list"][index]], "mass": d_8[parent]["mass_TP"][index], "Formula": d_8[parent]["Formula_TP"][index], "source_list" :[method_8_package], "code": d_8[parent]["code_TP"][index], "Structure" : d_8[parent]["Structure_TP"][index], "combined_prob": [d_8[parent]["combined_prob"][index]], "score": 100, "InchiKey": d_8[parent]["inchi_TP"][index], "alternative_parent" : []} else: if 
data_dict_com[parent]["TP_dict"].get(tp): # if TP already there then append list data_dict_com[parent]["TP_dict"][tp]["combined_prob"].append(d_8[parent]["combined_prob"][index]) if d_8[parent]["bt_list"][index] not in data_dict_com[parent]["TP_dict"][tp]["rule_list"]: data_dict_com[parent]["TP_dict"][tp]["rule_list"].append(d_8[parent]["bt_list"][index]) if method_8_package not in data_dict_com[parent]["TP_dict"][tp]["source_list"]: data_dict_com[parent]["TP_dict"][tp]["source_list"].append(method_8_package) else: data_dict_com[parent]["TP_dict"][tp] = {"CAS": d_8[parent]["ID_TP"][index], "rule_list":[d_8[parent]["bt_list"][index]], "mass": d_8[parent]["mass_TP"][index], "Formula": d_8[parent]["Formula_TP"][index], "source_list" :[method_8_package], "code": d_8[parent]["code_TP"][index], "Structure" : d_8[parent]["Structure_TP"][index], "combined_prob": [d_8[parent]["combined_prob"][index]], "score": 100, "InchiKey": d_8[parent]["inchi_TP"][index], "alternative_parent" : []} else: for parent in d_8: for index, tp in enumerate(d_8[parent]["TP_list_canon"]): if data_dict_com.get(parent, {}).get("TP_dict") is None: data_dict_com[parent]["TP_dict"] = {} data_dict_com[parent]["TP_dict"][tp] = {"CAS": d_8[parent]["ID_TP"][index], "rule_list":[d_8[parent]["bt_list"][index]], "mass": d_8[parent]["mass_TP"][index], "Formula": d_8[parent]["Formula_TP"][index], "source_list" :[method_8_package], "code": d_8[parent]["code_TP"][index], "Structure" : d_8[parent]["Structure_TP"][index], "combined_prob": [], "score": 100, "InchiKey": d_8[parent]["inchi_TP"][index], "alternative_parent" : []} else: if data_dict_com[parent]["TP_dict"].get(tp): # if TP already there then append list if d_8[parent]["bt_list"][index] not in data_dict_com[parent]["TP_dict"][tp]["rule_list"]: data_dict_com[parent]["TP_dict"][tp]["rule_list"].append(d_8[parent]["bt_list"][index]) if method_8_package not in data_dict_com[parent]["TP_dict"][tp]["source_list"]: data_dict_com[parent]["TP_dict"][tp]["source_list"].append(method_8_package) else: data_dict_com[parent]["TP_dict"][tp] = {"CAS": d_8[parent]["ID_TP"][index], "rule_list":[d_8[parent]["bt_list"][index]], "mass": d_8[parent]["mass_TP"][index], "Formula": d_8[parent]["Formula_TP"][index], "source_list" :[method_8_package], "code": d_8[parent]["code_TP"][index], "Structure" : d_8[parent]["Structure_TP"][index], "combined_prob": [], "score": 100, "InchiKey": d_8[parent]["inchi_TP"][index], "alternative_parent" : []} do_pickle(data_dict_com, "data_dict_com.pickle") return data_dict_com def score_dict(data_dict_com): # Scoring system, each TP starts with a score of 100 made up points print("Scoring dictionary") # check the mass for parent in data_dict_com: for tp in data_dict_com[parent]["TP_dict"]: if data_dict_com[parent]["TP_dict"][tp]["mass"] < 100: # if the mass of the TPs is below 100 u then set the score to 0 data_dict_com[parent]["TP_dict"][tp]["score"] = data_dict_com[parent]["TP_dict"][tp]["score"] - 100 # check if TP has CAS number # if TP has CAS number then it was studied before and we want to look for new TPs, however if it has no CAS number then the chance is higher that it is only a wanky prediction and not an actual TP that is observed in the environment for parent in data_dict_com: for tp in data_dict_com[parent]["TP_dict"]: if len(data_dict_com[parent]["TP_dict"][tp]["CAS"]) > 1: # if TP has (at least one) CAS number, then reduce score (empty string "" has len 0) data_dict_com[parent]["TP_dict"][tp]["score"] = data_dict_com[parent]["TP_dict"][tp]["score"] - 10 # 
def score_dict(data_dict_com):
    # scoring system: every TP starts with a score of 100 (arbitrary) points and collects penalties below
    print("Scoring dictionary")
    import copy  # needed for deepcopy below; copy is not imported in the header of this script

    # check the mass
    for parent in data_dict_com:
        for tp in data_dict_com[parent]["TP_dict"]:
            if data_dict_com[parent]["TP_dict"][tp]["mass"] < 100:  # if the mass of the TP is below 100 u, subtract the full 100 points (effectively discarding it)
                data_dict_com[parent]["TP_dict"][tp]["score"] = data_dict_com[parent]["TP_dict"][tp]["score"] - 100

    # check if the TP has a CAS number
    # if a TP has a CAS number it has been studied before, and we want to look for new TPs; if it has
    # no CAS number, however, the chance is higher that it is merely a spurious prediction and not a TP
    # that is actually observed in the environment
    for parent in data_dict_com:
        for tp in data_dict_com[parent]["TP_dict"]:
            if len(data_dict_com[parent]["TP_dict"][tp]["CAS"]) > 1:  # the TP has (at least one) CAS number, so reduce the score (an empty string "" has len 0)
                data_dict_com[parent]["TP_dict"][tp]["score"] = data_dict_com[parent]["TP_dict"][tp]["score"] - 10

    # check if the TP was predicted by other methods: the fewer methods agree, the larger the penalty
    # (a TP predicted by all 8 methods keeps its score unchanged)
    source_penalty = {1: 50, 2: 40, 3: 30, 4: 20, 5: 15, 6: 10, 7: 5}
    for parent in data_dict_com:
        for tp in data_dict_com[parent]["TP_dict"]:
            n_sources = len(data_dict_com[parent]["TP_dict"][tp]["source_list"])
            data_dict_com[parent]["TP_dict"][tp]["score"] -= source_penalty.get(n_sources, 0)

    # check the combined probability (saved as strings, so they must be converted to float; only available for enviPath predictions)
    # note: the original guard looked the TP up one dictionary level too high
    # (data_dict_com[parent].get(tp) instead of ...["TP_dict"].get(tp)), so these penalties never fired; fixed here
    for parent in data_dict_com:
        for tp in data_dict_com[parent]["TP_dict"]:
            probs = data_dict_com[parent]["TP_dict"][tp]["combined_prob"]
            if probs:  # an empty list means no probability is available for this TP
                # the list holds one value per predicting enviPath method; the most favourable (highest) one is used here
                prob = max(float(p) for p in probs)
                # each time a condition is fulfilled the score is reduced (careful, the penalties add up!)
                if prob < 0.3:
                    data_dict_com[parent]["TP_dict"][tp]["score"] -= 5
                if prob < 0.1:
                    data_dict_com[parent]["TP_dict"][tp]["score"] -= 5
                if prob < 0.01:
                    data_dict_com[parent]["TP_dict"][tp]["score"] -= 5
                if prob < 0.005:
                    data_dict_com[parent]["TP_dict"][tp]["score"] -= 10
                if prob < 0.001:
                    data_dict_com[parent]["TP_dict"][tp]["score"] -= 10

    # now delete the unwanted TPs (entries with a very low score)
    # iterate over a deep copy, otherwise: RuntimeError: dictionary changed size during iteration
    data_dict_com_copy = copy.deepcopy(data_dict_com)
    for parent in data_dict_com_copy:
        for tp in data_dict_com_copy[parent]["TP_dict"]:
            if data_dict_com_copy[parent]["TP_dict"][tp]["score"] <= 0:
                del data_dict_com[parent]["TP_dict"][tp]

    # check for duplicates: if a TP is also found for another parent, record it in that TP's alternative_parent field
    for parent in data_dict_com:
        for tp in data_dict_com[parent]["TP_dict"]:
            for parent2 in data_dict_com:
                if tp in data_dict_com[parent2]["TP_dict"]:
                    if data_dict_com[parent]["code_parent"][0] not in data_dict_com[parent2]["TP_dict"][tp]["code"]:
                        # data_dict_com[parent2]["TP_dict"][tp]["alternative_parent"].append(data_dict_com[parent]["code_parent"])
                        data_dict_com[parent2]["TP_dict"][tp]["alternative_parent"] = data_dict_com[parent]["code_parent"]  # overwrites; the commented line above would accumulate instead

    # create a new dict to which the removed TPs are added
    removed_tps_dict = {}
    # if a parent has more than the maximum allowed number of TPs, remove TPs starting with the lowest score
    # again iterate over a deep copy to avoid changing the dictionary size during iteration
    data_dict_com_copy_3 = copy.deepcopy(data_dict_com)
    for parent in data_dict_com_copy_3:
        temp_tp_list = []    # temporary lists with the TP SMILES and corresponding scores of one parent
        temp_score_list = []
        removed_tps_dict[parent] = {"TP_list": [], "parent_name": [], "tp_name": []}  # create entries for every parent
        for tp in data_dict_com_copy_3[parent]["TP_dict"]:
            temp_tp_list.append(tp)
            temp_score_list.append(data_dict_com_copy_3[parent]["TP_dict"][tp]["score"])
        while len(temp_tp_list) > max_TP_per_parent:  # proceed only if the parent has more than max_TP_per_parent TPs
            index = temp_score_list.index(min(temp_score_list))  # find the first index of the lowest score
            # record the TP that is deleted
            removed_tps_dict[parent]["TP_list"].append(temp_tp_list[index])
            removed_tps_dict[parent]["parent_name"].append(data_dict_com_copy_3[parent]["code_parent"])
            # bug fix: the original recorded the code of the last TP visited by the loop above (stale
            # loop variable tp); record the code of the TP that is actually removed instead
            removed_tps_dict[parent]["tp_name"].append(data_dict_com_copy_3[parent]["TP_dict"][temp_tp_list[index]]["code"])
            del data_dict_com[parent]["TP_dict"][temp_tp_list[index]]  # the index of the score matches the corresponding TP
            temp_score_list.pop(index)  # remove the score and the TP SMILES from the temporary lists
            temp_tp_list.pop(index)

    do_pickle(data_dict_com, "data_dict_com_scored.pickle")

    # export an overview of the removed TPs
    parent_removed_list = []
    tp_removed_list = []
    tp_code_removed_list = []
    parent_code_removed_list = []
    for parent in removed_tps_dict:
        for tp in removed_tps_dict[parent]["TP_list"]:
            parent_removed_list.append(parent)
            tp_removed_list.append(tp)
        for name in removed_tps_dict[parent]["parent_name"]:
            parent_code_removed_list.append(name)
        for code in removed_tps_dict[parent]["tp_name"]:
            tp_code_removed_list.append(code)
    df_removed_dict = {"Parent Code": parent_code_removed_list, "Parent SMILES": parent_removed_list,
                       "TP Code": tp_code_removed_list, "TP SMILES": tp_removed_list}
    df_removed = pd.DataFrame.from_dict(df_removed_dict)
    df_removed.to_csv(output_removed_tps, index=False, sep="\t")
    return data_dict_com
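# Worked example of the scoring above (a sketch with hypothetical numbers, for illustration only):
def _example_score():
    score = 100
    score -= 10   # the TP has a CAS number, i.e. it was described before
    score -= 40   # only two of the eight methods predicted it
    score -= 5    # best pathway probability 0.05 is below 0.3 ...
    score -= 5    # ... and also below 0.1, so both penalties apply
    return score  # 40 -> this TP is kept, since only TPs with a score <= 0 are deleted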
################################################################################################################################################################################################################################################################
# START SCRIPT

# all user inputs must have the expected type; the original listed one assert per variable,
# collapsed into a loop here (note that output_location_7/8 are used below but were never checked)
string_input_names = (["file_location_%d" % i for i in range(1, 9)]
                      + ["prediction_method_%d" % i for i in range(1, 9)]
                      + ["package_method_%d" % i for i in range(1, 9)]
                      + ["code_location", "smi_location", "name_location"]
                      + ["output_location_%d" % i for i in range(1, 7)]
                      + ["output_file_CD_masslist", "output_file_all_data",
                         "output_inclusion_pos", "output_inclusion_neg"])
for input_name in string_input_names:
    assert type(globals()[input_name]) == str, input_name + " must be a string!"
assert type(scoring_system) == bool, "scoring_system must be either 'True' or 'False'!"
assert type(max_TP_per_parent) == int, "max_TP_per_parent must be an integer (e.g. 15 or 50)!"

def load_mapping_files(code_location, smi_location, name_location):
    # read the files with code, name and SMILES of the selected parents; the line order must match across the three files!
    def read_lines(path):
        with open(path) as handle:
            return [line.rstrip() for line in handle]
    code_list = read_lines(code_location)
    SMILES_list = read_lines(smi_location)
    name_list = read_lines(name_location)
    code_dict = dict(zip(SMILES_list, code_list))  # code of each selected compound, keyed by its SMILES
    name_dict = dict(zip(SMILES_list, name_list))  # name of each selected compound, keyed by its SMILES
    return code_dict, name_dict

################################################################################################################################################################################################################################################################
# MAIN
################################################################################################################################################################################################################################################################

code_dict, name_dict = load_mapping_files(code_location, smi_location, name_location)

# read the first file
data_dict_1 = file_to_csv(file_location_1, "data_dict_" + package_method_1 + ".pickle", output_location_1, prediction_method_1)
# read the remaining files (if available)
if consider_file_2 == "yes":
    data_dict_2 = file_to_csv(file_location_2, "data_dict_" + package_method_2 + ".pickle", output_location_2, prediction_method_2)
if consider_file_3 == "yes":
    data_dict_3 = file_to_csv(file_location_3, "data_dict_" + package_method_3 + ".pickle", output_location_3, prediction_method_3)
if consider_file_4 == "yes":
    data_dict_4 = file_to_csv(file_location_4, "data_dict_" + package_method_4 + ".pickle", output_location_4, prediction_method_4)
if consider_file_5 == "yes":
    data_dict_5 = file_to_csv(file_location_5, "data_dict_" + package_method_5 + ".pickle", output_location_5, prediction_method_5)
if consider_file_6 == "yes":
    data_dict_6 = file_to_csv(file_location_6, "data_dict_" + package_method_6 + ".pickle", output_location_6, prediction_method_6)
if consider_file_7 == "yes":
    data_dict_7 = file_to_csv(file_location_7, "data_dict_" + package_method_7 + ".pickle", output_location_7, prediction_method_7)
if consider_file_8 == "yes":
    data_dict_8 = file_to_csv(file_location_8, "data_dict_" + package_method_8 + ".pickle", output_location_8, prediction_method_8)

# import previously saved dictionaries instead (uncomment if needed)
# data_dict_1 = get_pickle("data_dict_1.pickle")
# data_dict_2 = get_pickle("data_dict_2.pickle")
# data_dict_3 = get_pickle("data_dict_3.pickle")
# data_dict_4 = get_pickle("data_dict_4.pickle")
# data_dict_5 = get_pickle("data_dict_5.pickle")
# data_dict_6 = get_pickle("data_dict_6.pickle")
# data_dict_7 = get_pickle("data_dict_7.pickle")
# data_dict_8 = get_pickle("data_dict_8.pickle")
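# Illustration (a sketch, not part of the original workflow): after a run, the combined and
# scored dictionary can be reloaded from the pickle written by score_dict to inspect, for
# example, the best-scoring TPs of one parent; the parent SMILES passed in is hypothetical.
def _inspect_scored_pickle(parent_smiles):
    d = get_pickle("data_dict_com_scored.pickle")
    tps = d[parent_smiles]["TP_dict"]
    # return the SMILES of the ten highest-scoring TPs of this parent
    return sorted(tps, key=lambda t: tps[t]["score"], reverse=True)[:10]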
get_pickle("data_dict_7.pickle") # data_dict_8 = get_pickle("data_dict_8.pickle") # combine all data dicts into one and apply scoring system # also, generate mass list for CD, csv file with all the data and iclusion lists for QExactivePlus if scoring_system == True: if consider_file_2 == "yes": if consider_file_3 == "yes": if consider_file_4 == "yes": if consider_file_5 == "yes": if consider_file_6 == "yes": if consider_file_7 == "yes": if consider_file_8 == "yes": data_dict_com = combine_dict(data_dict_1, package_method_1, data_dict_2, package_method_2, data_dict_3, package_method_3, data_dict_4, package_method_4, data_dict_5, package_method_5, data_dict_6, package_method_6, data_dict_7, package_method_7, data_dict_8, package_method_8) data_dict_com_scored = score_dict(data_dict_com) combined_dict_to_csv(data_dict_com_scored, output_file_CD_masslist, output_file_all_data) else: data_dict_com = combine_dict(data_dict_1, package_method_1, data_dict_2, package_method_2, data_dict_3, package_method_3, data_dict_4, package_method_4, data_dict_5, package_method_5, data_dict_6, package_method_6, data_dict_7, package_method_7, "none", "none") data_dict_com_scored = score_dict(data_dict_com) combined_dict_to_csv(data_dict_com_scored, output_file_CD_masslist, output_file_all_data) else: data_dict_com = combine_dict(data_dict_1, package_method_1, data_dict_2, package_method_2, data_dict_3, package_method_3, data_dict_4, package_method_4, data_dict_5, package_method_5, data_dict_6, package_method_6, "none", "none", "none", "none") data_dict_com_scored = score_dict(data_dict_com) combined_dict_to_csv(data_dict_com_scored, output_file_CD_masslist, output_file_all_data) else: data_dict_com = combine_dict(data_dict_1, package_method_1, data_dict_2, package_method_2, data_dict_3, package_method_3, data_dict_4, package_method_4, data_dict_5, package_method_5, "none", "none", "none", "none", "none", "none") data_dict_com_scored = score_dict(data_dict_com) combined_dict_to_csv(data_dict_com_scored, output_file_CD_masslist, output_file_all_data) else: data_dict_com = combine_dict(data_dict_1, package_method_1, data_dict_2, package_method_2, data_dict_3, package_method_3, data_dict_4, package_method_4, "none", "none", "none", "none", "none", "none", "none", "none") data_dict_com_scored = score_dict(data_dict_com) combined_dict_to_csv(data_dict_com_scored, output_file_CD_masslist, output_file_all_data) else: data_dict_com = combine_dict(data_dict_1, package_method_1, data_dict_2, package_method_2, data_dict_3, package_method_3, "none", "none", "none", "none", "none", "none", "none", "none", "none", "none") data_dict_com_scored = score_dict(data_dict_com) combined_dict_to_csv(data_dict_com_scored, output_file_CD_masslist, output_file_all_data) else: data_dict_com = combine_dict(data_dict_1, package_method_1, data_dict_2, package_method_2, "none", "none", "none", "none", "none", "none", "none", "none", "none", "none", "none", "none") data_dict_com_scored = score_dict(data_dict_com) combined_dict_to_csv(data_dict_com_scored, output_file_CD_masslist, output_file_all_data) if scoring_system == False: if consider_file_2 == "yes": if consider_file_3 == "yes": if consider_file_4 == "yes": if consider_file_5 == "yes": if consider_file_6 == "yes": if consider_file_7 == "yes": if consider_file_8 == "yes": data_dict_com = combine_dict(data_dict_1, package_method_1, data_dict_2, package_method_2, data_dict_3, package_method_3, data_dict_4, package_method_4, data_dict_5, package_method_5, data_dict_6, package_method_6, 
data_dict_7, package_method_7, data_dict_8, package_method_8) combined_dict_to_csv(data_dict_com, output_file_CD_masslist, output_file_all_data) else: data_dict_com = combine_dict(data_dict_1, package_method_1, data_dict_2, package_method_2, data_dict_3, package_method_3, data_dict_4, package_method_4, data_dict_5, package_method_5, data_dict_6, package_method_6, data_dict_7, package_method_7, "none", "none") combined_dict_to_csv(data_dict_com, output_file_CD_masslist, output_file_all_data) else: data_dict_com = combine_dict(data_dict_1, package_method_1, data_dict_2, package_method_2, data_dict_3, package_method_3, data_dict_4, package_method_4, data_dict_5, package_method_5, data_dict_6, package_method_6, "none", "none", "none", "none") combined_dict_to_csv(data_dict_com, output_file_CD_masslist, output_file_all_data) else: data_dict_com = combine_dict(data_dict_1, package_method_1, data_dict_2, package_method_2, data_dict_3, package_method_3, data_dict_4, package_method_4, data_dict_5, package_method_5, "none", "none", "none", "none", "none", "none") combined_dict_to_csv(data_dict_com, output_file_CD_masslist, output_file_all_data) else: data_dict_com = combine_dict(data_dict_1, package_method_1, data_dict_2, package_method_2, data_dict_3, package_method_3, data_dict_4, package_method_4, "none", "none", "none", "none", "none", "none", "none", "none") combined_dict_to_csv(data_dict_com, output_file_CD_masslist, output_file_all_data) else: data_dict_com = combine_dict(data_dict_1, package_method_1, data_dict_2, package_method_2, data_dict_3, package_method_3, "none", "none", "none", "none", "none", "none", "none", "none", "none", "none") combined_dict_to_csv(data_dict_com, output_file_CD_masslist, output_file_all_data) else: data_dict_com = combine_dict(data_dict_1, package_method_1, data_dict_2, package_method_2, "none", "none", "none", "none", "none", "none", "none", "none", "none", "none", "none", "none") combined_dict_to_csv(data_dict_com, output_file_CD_masslist, output_file_all_data) ############################################################################################################################################################################################################################################################### print("Script finished successfully") # end of script ############################################################################################################################################################################################################################################################### # ░░░░░░░░███████████████░░░░░░░░ # ░░░░░█████████████████████░░░░░ # ░░░░████████████████████████░░░ # ░░░██████████████████████████░░ # ░░█████████████████████████████ # ░░███████████▀░░░░░░░░░████████ # ░░███████████░░░░░░░░░░░░░░░███ # ░████████████░░░░░░░░░░░░░░░░██ # ░█░░███████░░░░░░░░░░░▄▄░░░░░██ # █░░░░█████░░░░░░▄███████░░██░░█ # █░░█░░░███░░░░░██▀▀░░░░░░░░██░█ # █░░░█░░░░░░░░░░░░▄██▄░░░░░░░███ # █░░▄█░░░░░░░░░░░░░░░░░░█▀▀█▄░██ # █░░░░░░░░░░░░░░░░░░░░░░█░░░░██░ # ░███░░░░░░░░░░░░░░░░░░░█░░░░█░░ # ░░█░█░░░░░░░█░░░░░██▀▄░▄██░░░█░ # ░░█░█░░░░░░█░░░░░░░░░░░░░░░░░█░ # ░░░██░░░░░░█░░░░▄▄▄▄▄▄░░░░░░█░░ # ░░░██░░░░░░░█░░█▄▄▄▄░▀▀██░░█░░░ # ░░░██░░░░░░░█░░▀████████░░█░░░░ # ░░█░░█░░░░░░░█░░▀▄▄▄▄██░░█░░░░░ # ░░█░░░█░░░░░░░█░░░░░░░░░█░░░░░░ # ░█░░░░░█░░░░░░░░░░░░░░░░█░░░░░░ # ░░░░░░░░█░░░░░░█░░░░░░░░█░░░░░░ # ░░░░░░░░░░░░░░░░████████░░░░░░░ diff --git a/readme.md b/readme.md index c849028..07bf684 100644 --- a/readme.md +++ b/readme.md @@ -1,46 +1,46 
# TP_predict - Predict TPs and create suspect lists

This collection of scripts allows the user to reproduce the TP prediction and data analyses presented in the following publication:

Trostel, L. & Coll, C., Fenner, K., Hafner, J. Synergy of predictive and analytical methods advances elucidating biotransformation processes in activated sludge, 2023. [insert DOI]

The tools can further be used to perform the same predictions and analyses on a different set of compounds.

## Content

* **TP_prediction**: Script to predict TPs and corresponding biodegradation pathways
* **File_conversion**: Conversion of prediction output to input for suspect screening tools
  * Prediction_output_to_mass_list
  * SMILES_to_mass_and_inclusion_list
* **Additional_analyses**
  * Compare_methods
  * Analyse_cutoff_thresholds

Specific user guidance can be found in the README.md files of the content folders.

## How to

To fetch the code from the git repository, open a terminal and run:
```
$ git clone [insert link]
```
To install the dependencies, go to the TP_predict directory and run:
```
$ cd TP_predict
$ make
```

## Installation and requirements

The scripts require RDKit for Python, which is most easily installed in a conda environment. All scripts have been developed and tested in Python 3.6 and higher.

-### Anaconda: Step by step guide for non-python users:
+### Anaconda step by step guide for non-python users:
1. [Download Anaconda](https://docs.anaconda.com/anaconda/install/index.html) and install it, then run `Anaconda Navigator`
2. Create a new environment under the `Environments` tab and select Python version 3.6.13
3. Go to `Environments`, click the `play button` on the newly created environment and open a terminal
4. Run the following lines individually (confirm when prompted by typing `y` and pressing `enter`; this might take a while): `conda install -c rdkit rdkit` and `pip install pubchempy`
5. Check that pandas is installed and active according to [this tutorial](https://docs.anaconda.com/anaconda/navigator/tutorials/pandas/)
6. Open `Anaconda Navigator`, go to the `Home` tab and check that `Applications on` is set to the new environment
7. Click the `gear icon` on `Spyder` > install specific version > 5.0.5 and wait for the installation to finish
8. Click the `launch button` below `Spyder`
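
To quickly verify that the environment is set up correctly, you can run a short snippet like the following in Spyder (a minimal sketch; the exact version numbers on your system may differ):
```
import sys
import pandas
import pubchempy
from rdkit import rdBase

# print the versions of the interpreter and the key dependencies
print("Python:", sys.version.split()[0])   # e.g. 3.6.13
print("RDKit:", rdBase.rdkitVersion)
print("pandas:", pandas.__version__)
print("PubChemPy:", pubchempy.__version__)
```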