# Script by Jasmin, Jan 2022, Eawag
# Expand compound entries with all half-lives and associated data
# Goal: create full data set
import pandas as pd
import numpy as np
import sys
import re
import getpass  # FIX: getpass.getpass() is called below but the module was never imported

sys.path.insert(0, 'C:\\Users\\leetseng\\enviPath-python\\')  # /Users/jasmin/enviPath-python/enviPath_python/
from enviPath_python.enviPath import *
from enviPath_python.objects import *

file_location = '/Halflife_modeling/'

# Define the enviPath instance to use; password is prompted interactively
# so no credential is stored in the script.
INSTANCE_HOST = 'https://envipath.org'
username = 'leetseng'
password = getpass.getpass()
eP = enviPath(INSTANCE_HOST)
eP.login(username, password)

# input / output files
input_file_path = file_location + 'input/soil_compounds_final.txt'
output_file_path_full = file_location + 'output/full_dataset_half-lives.txt'


# todo :
# 1. create file similar to tomu's data set - can we improve model just by re-obtaining all the data ?
# 2.
# try to improve model
# try to switch to the sludge dataset
# try to collect the information and fill into the dictionary
# transfer the function to extract all compounds


def __main__():
    """Read the compound list, fetch every half-life and its scenario
    metadata from enviPath, and write the expanded table to a TSV file."""
    data = pd.read_csv(input_file_path, sep='\t')
    # available headers: Index Name ID Inchikey_full Inchikey_first_14 Full_SMILES Canonical_SMIES Is_composite
    # Cropped_canonical_SMILES Cropped_canonical_SMILES_no_stereo SMILES_pH_7 Number_halflives Comment Additional_HL_source
    # Mol_weigth pKa pKa_5to8 charge_pH_7 log(Koc) Koc_source

    # One list per output column; every half-life appends exactly one entry
    # to each list so the dict stays rectangular.
    D = {'index': [], 'compound_id': [], 'smiles': [], 'reduced_smiles': [], 'halflife': [], 'scenario_id': [], 'study_name': [],
         'halflife_model': [], 'halflife_comment': [], 'spike_compound': [], 'acidity': [], 'CEC': [], 'OC': [],
         'biomass_start': [], 'biomass_end': [], 'biomass': [], 'temperature': [], 'wst_value': [],
         'wst_type': [], 'humidity': [], 'humidity_conditions': [], 'soil_texture': [], 'sand': [], 'silt': [], 'clay': [], 'log(Koc)': [], 'Koc_source': []}

    for index, row in data.iterrows():
        compound_id = row['ID']
        print("COMPOUND: {}\n".format(compound_id))
        D = add_halflives(D, compound_id, row)
        # Some rows name a second enviPath compound as an extra half-life
        # source; the cell is NaN (a float) when absent.
        if isinstance(row['Additional_HL_source'], str):
            D = add_halflives(D, row['Additional_HL_source'], row)

    hl_data = pd.DataFrame.from_dict(D)  # convert dict into DF
    hl_data.to_csv(output_file_path_full, sep='\t')


def fetch_acidity(info):
    """Return the scenario pH rounded to 1 decimal, or NaN if unavailable.

    The raw value may be a single number, a 'lo - hi' range (averaged),
    or several ';'-separated entries (only the first is used).
    """
    try:
        raw_pH = info.get_acidity().get_value()
    except Exception:  # acidity not recorded for this scenario
        return np.NaN
    if ';' in raw_pH:
        first = raw_pH.split(';')[0]
        pH = range_to_average(first) if '-' in first else float(first)
    elif '-' in raw_pH:  # if range, get mean value
        pH = range_to_average(raw_pH)
    else:
        pH = float(raw_pH)
    return np.round(pH, 1)


def range_to_average(input_string):
    """Average a 'lo - hi' range string (separator is ' - ' with spaces)."""
    parts = input_string.split(' - ')
    lo = float(parts[0])
    hi = float(parts[1])
    return np.average([lo, hi])


def fetch_cec(info):
    """Return the cation exchange capacity value, or NaN if unavailable."""
    try:
        return info.get_cec().get_value()
    except Exception:
        return np.NaN


def fetch_organic_content(info):
    """Return organic content expressed as organic carbon (OC), NaN if unavailable.

    The raw field is ';'-separated, e.g. '1.2;OM'.  A numeric token must
    precede its unit token, because the 'OC'/'OM' branches reuse the most
    recently parsed number.  OM is converted with OC = OM / 1.7
    (source: Schwarzenbach).
    """
    try:
        raw = info.get_omcontent().get_value()
    except Exception:
        return np.NaN
    oc = np.NaN
    val = np.NaN  # FIX: avoid NameError if a unit token appears before any number
    for token in raw.split(';'):
        if token == 'OC':
            oc = val
        elif token == 'OM':
            oc = val / 1.7  # OC = OM / 1.7, source: Schwarzenbach
        elif '<' in token:
            val = float(token[1:])  # '<x' is taken at face value x
            print("Warning: {} was converted to {}".format(token, val))
        elif token == '' or token == '-':
            val = np.NaN
        else:
            val = float(token)
    return oc


def fetch_biomass(info):
    """Return (start, end) biomass as floats, or (NaN, NaN) if unavailable."""
    try:
        raw = info.get_biomass().get_value()
    except Exception:
        return np.NaN, np.NaN
    parts = raw.split(' - ')
    return float(parts[0]), float(parts[1])


def fetch_temperature(info):
    """Return the mean of the 'min;max' temperature, rounded to an integer;
    NaN if unavailable."""
    try:
        raw = info.get_temperature().get_value()
    except Exception:
        return np.NaN
    lo = float(raw.split(';')[0])
    hi = float(raw.split(';')[1])
    return np.round(np.average([lo, hi]), 0)


def fetch_wst(info):
    """Return (value, type) of the water storage capacity; (NaN, '') if unavailable."""
    try:
        raw = info.get_waterstoragecapacity().get_value()
    except Exception:
        return np.NaN, ''
    parts = raw.replace(" ", "").split('-')
    if len(parts) < 4:
        return float(parts[0]), parts[1]
    # Longer entries carry no usable numeric value; keep only the type token.
    return np.NaN, parts[2]


def fetch_humidity(info):
    """Return (humidity, conditions); (NaN, '') if unavailable."""
    try:
        raw = info.get_humidity().get_value()
    except Exception:
        return np.NaN, ''
    if isinstance(raw, float):  # already numeric, no conditions attached
        return raw, ''
    parts = raw.split(' - ')
    return float(parts[0]), parts[1]


def fetch_soiltexture1(info):
    """Return the textual soil-texture classification, '' if unavailable."""
    try:
        return info.get_soiltexture1().get_value()
    except Exception:
        return ''


def fetch_spikecompound(info):
    """Return the SMILES of the spike compound, '' if unavailable."""
    try:
        link = info.get_spikecompound().get_compoundLink()
        spike_cpd = CompoundStructure(eP.requester, id=link)
        return spike_cpd.get_smiles()
    except Exception:
        return ''


def fetch_soiltexture2(info):
    """Return (sand, silt, clay) percentages; NaNs if unavailable or unparseable."""
    try:
        raw = info.get_soiltexture2().get_value()
    except Exception:
        return np.NaN, np.NaN, np.NaN
    values = re.findall(r'\s([\d.]+)%', raw)
    if not values:
        return np.NaN, np.NaN, np.NaN
    # order in the raw string is sand, silt, clay
    return get_float_or_nan(values[0]), get_float_or_nan(values[1]), get_float_or_nan(values[2])


def fetch_halflife_model(info):
    """Return the half-life model name (first ';' field), '' if unavailable."""
    try:
        raw = info.get_halflife().get_value()
    except Exception:
        return ''
    return raw.split(';')[0]


def fetch_halflife_comment(info):
    """Return the half-life comment (third ';' field), '' if unavailable."""
    try:
        raw = info.get_halflife().get_value()
    except Exception:
        return ''
    fields = raw.split(';')
    # FIX: degrade gracefully (like the other fetchers) instead of raising
    # IndexError when fewer than three fields are present.
    return fields[2] if len(fields) > 2 else ''


def get_float_or_nan(x):
    """float(x), or NaN when x is not a parsable number."""
    try:
        return float(x)
    except (TypeError, ValueError):
        return np.NaN


def add_halflives(D, compound_id, row):
    """Append one record per half-life of `compound_id` to the column dict D.

    Every list in D receives exactly one entry per half-life so the dict
    stays rectangular.  Returns the (mutated) dict.
    """
    compound_index = row['Index']
    reduced_smiles = row['Cropped_canonical_SMILES_no_stereo']
    compound_structure = CompoundStructure(eP.requester, id=compound_id)
    compound = Compound(eP.requester, id=compound_id)
    print(compound_id)
    halflives = compound_structure.get_halflifes()
    smiles = compound.get_smiles()
    for hl in halflives:
        # compound info  (FIX: this and the three labels below had lost their
        # '#' and were bare expressions, i.e. syntax errors)
        D['index'].append(compound_index)
        D['compound_id'].append(compound_id)
        D['smiles'].append(smiles)
        D['reduced_smiles'].append(reduced_smiles)  # cropped_canonical_smiles_no_stereo
        D['halflife'].append(float(hl.hl))
        D['scenario_id'].append(hl.scenarioId)
        D['log(Koc)'].append(row['log(Koc)'])
        D['Koc_source'].append(row['Koc_source'])
        print(hl.scenarioId)
        # fetch data structures
        scenario = Scenario(eP.requester, id=hl.scenarioId)
        add_info = scenario.get_additional_information()
        # add halflife details
        D['halflife_model'].append(fetch_halflife_model(add_info))
        D['halflife_comment'].append(fetch_halflife_comment(add_info))
        D['study_name'].append(scenario.get_name().split(' - ')[0])
        D['spike_compound'].append(fetch_spikecompound(add_info))
        # fetch data points
        D['acidity'].append(fetch_acidity(add_info))
        D['CEC'].append(fetch_cec(add_info))  # cation exchange capacity
        D['OC'].append(fetch_organic_content(add_info))  # organic content as organic carbon (oc)
        start, end = fetch_biomass(add_info)
        D['biomass_start'].append(start)
        D['biomass_end'].append(end)
        D['biomass'].append(np.round(np.average([start, end]), 2))
        D['temperature'].append(fetch_temperature(add_info))
        wst_value, wst_type = fetch_wst(add_info)  # water storage capacity
        D['wst_value'].append(wst_value)
        D['wst_type'].append(wst_type)
        hum, hum_cond = fetch_humidity(add_info)
        D['humidity'].append(hum)
        D['humidity_conditions'].append(hum_cond)
        D['soil_texture'].append(fetch_soiltexture1(add_info))
        _sand, _silt, _clay = fetch_soiltexture2(add_info)
        D['sand'].append(_sand)
        D['silt'].append(_silt)
        D['clay'].append(_clay)
    return D


# FIX: guard the entry point so importing this module does not trigger the
# full enviPath download; running it as a script behaves exactly as before.
if __name__ == '__main__':
    __main__()