diff --git a/extract_all_compounds.py b/extract_all_compounds.py index 4ffd3e9..a1e8e6e 100644 --- a/extract_all_compounds.py +++ b/extract_all_compounds.py @@ -1,669 +1,669 @@ import sys import numpy as np import pandas as pd # pd.set_option('display.max_columns', None) sys.path.insert(0, "C:\\Users\\leetseng\\enviPath-python\\enviPath_python\\") #previous address C:\envipath_code\Users\leetseng\enviPath-python C:\\Users\\leetseng\\enviPath-python\\enviPath_python\\ sys.path.insert(1, "C:\\Users\\leetseng\\enviPath-python\\") from enviPath import * from objects import * import rdkit from rdkit import Chem from rdkit.Chem.MolStandardize import rdMolStandardize from rdkit.Chem import Descriptors from rdkit.Chem.rdMolDescriptors import CalcMolFormula file_location = "C:\\Users\\leetseng\\TWtest" # input_file_path = file_location+'input/sludge_compounds_final.txt' output_file_path_full = file_location+'\\output\\sludge_Rich_raw.tsv' # Define the instance to use INSTANCE_HOST = 'https://envipath.org' USERNAME = 'leetseng' -# PASSWORD = getpass.getpass(prompt = "Enter your password:") -PASSWORD = 'Joan1388' +PASSWORD = getpass.getpass(prompt = "Enter your password:") + eP = enviPath(INSTANCE_HOST) eP.login(USERNAME, PASSWORD) #getpass.getpass() # eP.logout() print(eP.who_am_i().get_name()) -package_id = 'https://envipath.org/package/8d3d7ca2-ae4e-4779-a6a2-d3539237c439' +package_id = 'https://envipath.org/packag3e/8dd7ca2-ae4e-4779-a6a2-d3539237c439' #Leo Sludge 'https://envipath.org/package/195bc500-f0c6-4bcb-b2fe-f1602b5f20a2' #Rich Sludge 'https://envipath.org/package/8d3d7ca2-ae4e-4779-a6a2-d3539237c439' #Original Sludge 'https://envipath.org/package/4a3cd0f4-4d2b-4f00-b3e6-a29e721f7038' package = Package(eP.requester, id=package_id) all_scenarios = package.get_scenarios() all_pathways = package.get_pathways() print('Number of scenarios found:', len(all_scenarios)) print('Number of pathways found:', len(all_pathways)) def __main__(): D = { "scenario_id": [], "compound_id": [], "compound_name": [], "smiles": [], "reduced_smiles": [], "halflife_raw": [], "halflife_unit": [], "halflife_model_TF": [], "halflife_comment": [], "rateconstant": [], "rateconstant_unit": [], "rateconstant_comment": [], "halflife_model": [], "acidity": [], "acidity_unit": [], "temperature": [], "temperature_unit": [], "original_sludge_amount": [], "original_sludge_amount_unit": [], "sludge_retention_time": [], "sludge_retention_time_unit": [], "sludge_retention_time_type": [], "total_suspended_solids_concentration_start": [], "total_suspended_solids_concentration_end": [], "total_suspended_solids_concentration_unit": [], # "volatile_suspended_solids_concentration_start": [], "volatile_suspended_solids_concentration_end": [], "volatile_suspended_solids_concentration_unit": [], "addition_of_nutrients": [], "biological_treatment_technology": [], "bioreactor_type": [], "bioreactor_value": [], "bioreactor_value_unit": [], "nitrogen_content_type": [], "nitrogen_content_influent": [], "oxygen_demand_type": [], "oxygen_demand_value": [], "oxygen_uptake_rate": [], "oxygen_uptake_rate_unit": [], "phosphorus_content": [], "redox": [], "source_of_liquid_matrix": [], "type_of_addition": [], "type_of_aeration": [], "inoculum_source": [], "location": [], "purpose_of_wwtp": [], } #testing # scenario_id = '' # test_scenario = Scenario(eP.requester, id=scenario_id) # add_info = test_scenario.get_additional_information() # rateconstant_test = add_info.get_rateconstant() # # print(rateconstant_test) i = 0 for pathway in all_pathways: for node in pathway.get_nodes(): print(node.get_id()) #this is not compound_id, it should be the Node id. i += 1 print("checking node # ", i) try: scenarios = node.get_scenarios() except: continue else: for scenario in scenarios: D = add_scenario_information(D, scenario, node) sludge_df = pd.DataFrame.from_dict(D) sludge_df.index = np.arange(1, len(sludge_df) + 1) sludge_df.index.name = 'index' print(sludge_df.describe()) sludge_df.to_csv(output_file_path_full, mode='w', sep="\t") #a, w, r def add_scenario_information(D, scenario, node): full_scenario = Scenario(eP.requester, id=scenario.get_id()) add_info = full_scenario.get_additional_information() try: halflife_object = add_info.get_halflife() has_hf = True except AttributeError: has_hf = False try: rateconstant_object = add_info.get_rateconstant() has_rateconstant = True except AttributeError: has_rateconstant = False if has_hf or has_rateconstant: ### D['compound_id'].append(arcessere_compound_id(node)) D['compound_name'].append(arcessere_compound_name(node)) D['smiles'].append(arcessere_smiles(node)) D['reduced_smiles'].append(canonicalize_smiles(arcessere_smiles(node))) # D['reduced_smiles'].append(reduced_smiles) # cropped_canonical_smiles_no_stereo print(full_scenario.get_id()) D['acidity'].append(arcessere_acidity(add_info)) D['acidity_unit'].append(arcessere_acidity_unit(add_info)) D['addition_of_nutrients'].append(arcessere_addition_of_nutrients(add_info)) D['biological_treatment_technology'].append(arcessere_biological_treatment_technology(add_info)) D['bioreactor_type'].append(arcessere_bioreactor_type(add_info)) D['bioreactor_value'].append(arcessere_bioreactor_value(add_info)) D['bioreactor_value_unit'].append(arcessere_bioreactor_value_unit(add_info)) # D['confidencelevel'].append(arcessere_confidencelevel(add_info)) D['halflife_raw'].append(arcessere_halflife(add_info)) D['halflife_unit'].append(arcessere_halflife_unit(add_info)) D['halflife_model_TF'].append(arcessere_halflife_model(add_info)) D['halflife_comment'].append(arcessere_halflife_comment(add_info)) D['inoculum_source'].append(arcessere_inoculum_source(add_info)) D['location'].append(arcessere_location(add_info)) D['nitrogen_content_type'].append(arcessere_nitrogen_content_type(add_info)) D['nitrogen_content_influent'].append(arcessere_nitrogen_content_influent(add_info)) D['original_sludge_amount'].append(arcessere_original_sludge_amount(add_info)) D['original_sludge_amount_unit'].append(arcessere_original_sludge_amount_unit(add_info)) D['oxygen_demand_type'].append(arcessere_oxygen_demand_type(add_info)) D['oxygen_demand_value'].append(arcessere_oxygen_demand_value(add_info)) D['oxygen_uptake_rate_unit'].append(arcessere_oxygen_uptake_rate_unit(add_info)) D['oxygen_uptake_rate'].append(arcessere_oxygen_uptake_rate(add_info)) D['phosphorus_content'].append(arcessere_phosphorus_content(add_info)) D['purpose_of_wwtp'].append(arcessere_purpose_of_wwtp(add_info)) D['rateconstant'].append(arcessere_rate_constant(add_info)) D['rateconstant_unit'].append(arcessere_rate_constant_unit(add_info)) D['halflife_model'].append(arcessere_reaction_order(add_info)) D['rateconstant_comment'].append(arcessere_rate_constant_comment(add_info)) D['redox'].append(arcessere_redox(add_info)) D['scenario_id'].append(scenario.get_id()) D['sludge_retention_time_type'].append(arcessere_sludge_retention_time_type(add_info)) D['sludge_retention_time'].append(arcessere_sludge_retention_time(add_info)) D['sludge_retention_time_unit'].append(arcessere_sludge_retention_time_unit(add_info)) D['source_of_liquid_matrix'].append(arcessere_source_of_liquid_matrix(add_info)) D['temperature'].append(arcessere_temperature(add_info)) D['temperature_unit'].append(arcessere_temperature_unit(add_info)) D['total_suspended_solids_concentration_start'].append(arcessere_tss_start(add_info)) D['total_suspended_solids_concentration_end'].append(arcessere_tss_end(add_info)) D['total_suspended_solids_concentration_unit'].append(arcessere_tss_unit(add_info)) D['type_of_addition'].append(arcessere_type_of_addition(add_info)) D['type_of_aeration'].append(arcessere_type_of_aeration(add_info)) # D['volatile_suspended_solids_concentration_start'].append(arcessere_volatile_ss_start(add_info)) # D['volatile_suspended_solids_concentration_end'].append(arcessere_volatile_ss_end(add_info)) # D['volatile_suspended_solids_concentration_unit'].append(arcessere_volatile_ss_unit(add_info)) return D def arcessere_compound_id(node): try: id_from_node = node.get_default_structure().get_id() except: return '' else: return id_from_node def arcessere_compound_name(node): try: name_from_node = node.get_default_structure().get_name() except ValueError: return '' else: return name_from_node #.split(',')[0] only pick up one compound name def arcessere_smiles(node): try: smiles_from_node = node.get_default_structure().get_smiles() except: return '' else: return smiles_from_node def canonicalize_smiles(smiles_from_node): mol = Chem.MolFromSmiles(smiles_from_node) # creates mol object from SMILES uncharger = rdMolStandardize.Uncharger() # easier to access uncharged = uncharger.uncharge(mol) # protonates or deprotonates the mol object new_smiles = rdkit.Chem.rdmolfiles.MolToSmiles(uncharged) # converts mol object to canonical SMILES can_smiles = Chem.CanonSmiles(new_smiles) return can_smiles def arcessere_acidity(add_info): try: raw_pH = add_info.get_acidity().get_value() except: return np.NaN else: if ';' in raw_pH: if ' - ' in raw_pH: pH = range_to_average(raw_pH.split(';')[0]) else: pH = float(raw_pH.split(';')[0]) return np.round(pH, 1) def arcessere_acidity_unit(add_info): try: pH_unit = add_info.get_acidity().get_unit() except: return '' else: return pH_unit def arcessere_addition_of_nutrients(add_info): try: addition_of_nutrients = add_info.get_additionofnutrients().get_value() except: return '' else: return addition_of_nutrients def arcessere_biological_treatment_technology(add_info): try: biological_treatment_technology = add_info.get_biologicaltreatmenttechnology().get_value() except: return '' else: return biological_treatment_technology def arcessere_bioreactor_type(add_info): ######################################### try: bioreactor_type = add_info.get_bioreactor().get_value().split(',')[0] except: return '' else: return bioreactor_type def arcessere_bioreactor_value(add_info): try: bioreactor = float(add_info.get_bioreactor().get_value().split(',')[1]) except ValueError: return np.NaN except: return np.NaN else: return bioreactor def arcessere_bioreactor_value_unit(add_info): try: bioreactor_unit = add_info.get_bioreactor().get_unit() except: return '' else: return bioreactor_unit def arcessere_confidencelevel(add_info): try: confidencelevel = add_info.get_confidencelevel().get_value() except: return np.NaN else: return float(confidencelevel) def arcessere_inoculum_source(add_info): try: inoculumsource = add_info.get_inoculumsource().get_value() except: return '' else: return inoculumsource def arcessere_location(add_info): try: location = add_info.get_location().get_value() except: return '' else: return location def arcessere_minormajor(add_info): try: minormajor = add_info.get_minormajor().get_value() except: return '' else: return minormajor def arcessere_nitrogen_content_type(add_info): try: nitrogencontent = add_info.get_nitrogencontent().get_value() except: return '' else: if '₂' in nitrogencontent.split(';')[0]: return nitrogencontent.split(';')[0].replace('₂', '\u2082') elif '₃' in nitrogencontent.split(';')[0]: return nitrogencontent.split(';')[0].replace('₃', '\u2083') elif '₄' in nitrogencontent.split(';')[0]: return nitrogencontent.split(';')[0].replace('₄', '\u2084') def arcessere_nitrogen_content_influent(add_info): try: nitrogencontent = add_info.get_nitrogencontent().get_value() except: return np.NaN else: return float(nitrogencontent.split(';')[1]) def arcessere_original_sludge_amount(add_info): try: originalsludgeamount = add_info.get_originalsludgeamount().get_value() except: return np.NaN else: return originalsludgeamount def arcessere_original_sludge_amount_unit(add_info): try: originalsludgeamount_unit = add_info.get_originalsludgeamount().get_unit() except: return '' else: return originalsludgeamount_unit def arcessere_oxygen_demand_type(add_info): #checking scenario 298 try: oxygendemand = add_info.get_oxygendemand().get_value() except: return '' else: return oxygendemand.split(';')[0] #return chemical oxygen demand or biological oxygen demand def arcessere_oxygen_demand_value(add_info): try: oxygendemand = add_info.get_oxygendemand().get_value() except: return np.NaN else: return float(oxygendemand.split(';')[1]) def arcessere_oxygen_uptake_rate(add_info): try: our = add_info.get_oxygenuptakerate.get_value() except: return np.NaN else: return range_to_average(our) def arcessere_oxygen_uptake_rate_unit(add_info): try: sludgeretentiontime_unit = add_info.get_oxygenuptakerate().get_unit() except: return '' else: # return sludgeretentiontime_unit if "⁻¹" in sludgeretentiontime_unit.split(' ')[1] and "⁻¹" in sludgeretentiontime_unit.split(' ')[2]: return sludgeretentiontime_unit.split(' ')[0] + '/(L * h)' def arcessere_phosphorus_content(add_info): try: phosphoruscontent = add_info.get_phosphoruscontent().get_value() except: return np.NaN else: return phosphoruscontent.split(';')[0] def arcessere_proposed_intermediate(add_info): try: proposedintermediate = add_info.get_proposedintermediate().get_value() except: return '' else: return proposedintermediate def arcessere_purpose_of_wwtp(add_info): try: purposeofwwtp = add_info.get_purposeofwwtp().get_value() except: return '' else: return purposeofwwtp def arcessere_redox(add_info): try: redox = add_info.get_redox().get_value() except: return '' else: return redox def arcessere_sludge_retention_time_type(add_info): try: sludgeretentiontime = add_info.get_sludgeretentiontime().get_value() except: return '' else: return sludgeretentiontime.split(';')[0] def arcessere_sludge_retention_time(add_info): try: sludgeretentiontime = add_info.get_sludgeretentiontime().get_value() except: return np.NaN else: return float(sludgeretentiontime.split(';')[1]) def arcessere_sludge_retention_time_unit(add_info): try: sludgeretentiontime_unit = add_info.get_sludgeretentiontime().get_unit() except: return '' else: return sludgeretentiontime_unit def arcessere_source_of_liquid_matrix(add_info): try: sourceofliquidmatrix = add_info.get_sourceofliquidmatrix().get_value() except: return '' else: return sourceofliquidmatrix def arcessere_tss_start(add_info): try: tts = add_info.get_tts().get_value() except: return np.NaN #not sure if this return type is correct or not else: return float(tts.split(' _ ')[0].split(' - ')[0]) def arcessere_tss_end(add_info): try: tts = add_info.get_tts().get_value() except: return np.NaN #not sure if this return type is correct or not else: return float(tts.split(' _ ')[0].split(' - ')[1]) def arcessere_tss_unit(add_info): try: tts_unit = add_info.get_tts().get_unit() except: return '' else: return tts_unit def arcessere_type_of_addition(add_info): try: typeofaddition = add_info.get_typeofaddition().get_value() except: return '' else: return typeofaddition def arcessere_type_of_aeration(add_info): try: typeofaeration = add_info.get_typeofaeration().get_value() except: return '' else: return typeofaeration def arcessere_volatile_ss_start(add_info): try: vss_start = add_info.get_volatiletts().get_value() except: return np.NaN #not sure if this return type is correct or not else: if ' - ' in vss_start: return float(vss_start.split(' - ')[0]) else: return float(vss_start) def arcessere_volatile_ss_end(add_info): try: vss_end = add_info.get_volatiletts().get_value() except: return np.NaN #not sure if this return type is correct or not else: if ' - ' in vss_end: return float(vss_end.split(' - ')[1]) else: return float(vss_end) def arcessere_volatile_ss_unit(add_info): try: vss_unit = add_info.get_volatiletts().get_unit() except: return '' else: return vss_unit def range_to_average(input_string): if '-' in input_string: min = float(input_string.split(' - ')[0]) max = float(input_string.split(' - ')[1]) average = np.average([min, max]) elif ';' in input_string: min = float(input_string.split(';')[0]) max = float(input_string.split(';')[1]) average = np.average([min, max]) else: average = input_string return average def arcessere_rate_constant(add_info): try: rate_constant = add_info.get_rateconstant().get_value() except: return np.NaN else: # return float(rate_constant.split(';')[2].split(' - ')[0]) min = rate_constant.split(';')[2].split(' - ')[0] max = rate_constant.split(';')[2].split(' - ')[1] if min != 'NaN' and max != 'NaN': average = np.average([float(min), float(max)]) elif min != 'NaN' and max == 'NaN': average = float(min) elif min == 'NaN' and max != 'NaN': average = float(max) return average def arcessere_rate_constant_unit(add_info): try: rate_constant_unit = add_info.get_rateconstant().get_unit() except: return '' else: if 'μg' in rate_constant_unit: return rate_constant_unit.replace('μg', '\u338D') else: return rate_constant_unit def arcessere_reaction_order(add_info): try: rate_constant = add_info.get_rateconstant().get_value() except: return '' else: return rate_constant.split(';')[0] def arcessere_rate_constant_comment(add_info): try: rate_constant_comment = add_info.get_rateconstant().get_value().split(';')[3] except: return '' else: return rate_constant_comment def arcessere_halflife(add_info): try: hf = add_info.get_halflife().get_value() except: return np.NaN else: # return float(hf.split(';')[3].split(' - ')[0]) min = float(hf.split(';')[3].split(' - ')[0]) max = float(hf.split(';')[3].split(' - ')[1]) average = np.average([min, max]) return average def arcessere_halflife_unit(add_info): try: hf_unit = add_info.get_halflife().get_unit() except: return '' else: return hf_unit def arcessere_halflife_model(add_info): try: hl = add_info.get_halflife().get_value() except: return '' else: return hl.split(';')[0] def arcessere_halflife_comment(add_info): try: hl = add_info.get_halflife().get_value() except: return '' else: return hl.split(';')[2] def arcessere_halflife_source(add_info): try: hl = add_info.get_halflife().get_value() except: return '' else: return hl.split(';')[4] def arcessere_temperature(add_info): try: temp = add_info.get_temperature().get_value() except: return np.NaN else: min = float(temp.split(';')[0]) max = float(temp.split(';')[1]) return np.round(np.average([min, max]), 0) def arcessere_temperature_unit(add_info): try: temp_unit = add_info.get_temperature().get_unit() except: return '' else: return '\u2103' #Do not iterate all the scenarios at once. # scenario_datatypes = set() # for scenario in all_scenarios[350:494]: # scen = Scenario(eP.requester, id=scenario.get_id()) # addinfo = scen.get_additional_information() # if addinfo: # for dt in addinfo.get_data_types(): # scenario_datatypes.add(dt) # print(scenario_atatypes) # # "Dissolvedoxygenconcentrationaerationtype": "No attribute", # "finalcompoundconcentration": "No attribute", # "solventforcompoundsolution": "No attribute", # "dissolvedorganiccarbonamionauptakerate": 0, # "typeofaerationoxygenuptakerate": 0, # # scenario_data_types_count = { # "acidity": 439, # "additionofnutrients": 38, # "biologicaltreatmenttechnology": 356, # "bioreactor_type": 491, # "bioreactor_value": 491, # "confidencelevel": 164, # "halflife": 15, # "halflife_model": 15, # "halflife_comment": 15, # "inoculumsource": 449, # "location": 473, # "minormajor": 18, # "nitrogencontent_type": 105, # "nitrogencontent_influent": 105, # "originalsludgeamount": 408, # "oxygendemand": 99, # "phosphoruscontent": 88, # "proposedintermediate": 18, # "purposeofwwtp": 475, # "rateconstant": 218, # "rateconstant_comment": 218, # "reaction_order": 218, # "redox": 482, # "sludgeretentiontime": 381, # "sludgeretentiontimetype": 381, # "sourceofliquidmatrix": 403, # "temperature": 403, # "tts_start": 398, # "tts_end": 398, # "typeofaddition": 426, # "typeofaeration": 470, # "volatiletts_start": 15, # "volatiletts_end": 15, # } # df2 = pd.DataFrame.from_dict(scenario_data_types_count, orient='index') # print(df2) # sns.countplot(data=df2, x=) # plt.show() __main__() \ No newline at end of file