diff --git a/expand_dataset.py b/expand_dataset.py
deleted file mode 100644
index 24e7a02..0000000
--- a/expand_dataset.py
+++ /dev/null
@@ -1,261 +0,0 @@
-# Script by Jasmin, Jan 2022, Eawag
-# Expand compound entries with all half-lives and associated data
-# Goal: create full data set
-import pandas as pd
-import numpy as np
-import sys
-import re
-sys.path.insert(0, 'C:\\Users\\leetseng\\enviPath-python\\') #/Users/jasmin/enviPath-python/enviPath_python/
-from enviPath_python.enviPath import *
-from enviPath_python.objects import *
-file_location = '/Halflife_modeling/'
-
-# Define the instance to use
-INSTANCE_HOST = 'https://envipath.org'
-username = 'leetseng'
-password = 'Joan1388' #password = getpass.getpass()
-eP = enviPath(INSTANCE_HOST)
-eP.login(username, password)
-
-# files
-input_file_path = file_location + 'input/soil_compounds_final.txt'
-output_file_path_full = file_location + 'output/full_dataset_half-lives.txt'
-
-
-# todo:
-# 1. create file similar to tomu's data set - can we improve model just by re-obtaining all the data?
-# 2. try to improve model
-
-# try to switch to the sludge dataset
-# try to collect the information and fill into the dictionary
-# transfer the function to extract all compounds
-
-
-def __main__():
-    # data frames
-    data = pd.read_csv(input_file_path, sep='\t')
-    # available headers: Index Name ID Inchikey_full Inchikey_first_14 Full_SMILES Canonical_SMIES Is_composite
-    # Cropped_canonical_SMILES Cropped_canonical_SMILES_no_stereo SMILES_pH_7 Number_halflives Comment Additional_HL_source
-    # Mol_weigth pKa pKa_5to8 charge_pH_7 log(Koc) Koc_source
-
-    D = {'index': [], 'compound_id': [], 'smiles': [], 'reduced_smiles': [], 'halflife': [], 'scenario_id': [], 'study_name': [],
-         'halflife_model': [], 'halflife_comment': [], 'spike_compound': [], 'acidity': [], 'CEC': [], 'OC': [],
-         'biomass_start': [], 'biomass_end': [], 'biomass': [], 'temperature': [], 'wst_value': [],
-         'wst_type': [], 'humidity': [], 'humidity_conditions': [], 'soil_texture': [], 'sand': [], 'silt': [], 'clay': [], 'log(Koc)': [], 'Koc_source': []}
-
-    for index, row in data.iterrows():
-        compound_id = row['ID']
-        print("COMPOUND: {}\n".format(compound_id))
-        D = add_halflives(D, compound_id, row)
-        if type(row['Additional_HL_source']) == str:
-            D = add_halflives(D, row['Additional_HL_source'], row)
-
-    hl_data = pd.DataFrame.from_dict(D) # convert dict into DF
-    hl_data.to_csv(output_file_path_full, sep='\t')
-
-
-def fetch_acidity(info):
-    try:
-        raw_pH = info.get_acidity().get_value()
-    except:
-        return np.NaN
-    else:
-        if ';' in raw_pH:
-            if '-' in raw_pH.split(';')[0]:
-                pH = range_to_average(raw_pH.split(';')[0])
-            else:
-                pH = float(raw_pH.split(';')[0])
-        elif '-' in raw_pH: # if range, get mean value
-            pH = range_to_average(raw_pH)
-        else:
-            pH = float(raw_pH)
-        return np.round(pH, 1)
-
-def range_to_average(input_string):
-    min = float(input_string.split(' - ')[0])
-    max = float(input_string.split(' - ')[1])
-    avg = np.average([min, max])
-    return avg
-
-def fetch_cec(info):
-    try:
-        cec = info.get_cec().get_value()
-    except:
-        return np.NaN
-    else:
-        return cec
-
-def fetch_organic_content(info):
-    try:
-        raw = info.get_omcontent().get_value()
-    except:
-        return np.NaN
-    else:
-        raw_list = raw.split(';')
-        oc = np.NaN
-        for i in raw_list:
-            if i == 'OC':
-                oc = val
-            elif i == 'OM':
-                oc = val / 1.7 # OC = OM / 1.7, source: Schwarzenbach
-            else:
-                if '<' in i:
-                    val = float(i[1:])
-                    print("Warning: {} was converted to {}".format(i, val))
-                elif i == '' or i == '-':
-                    val = np.NaN
-                else:
-                    val = float(i)
-        return oc
-
-def fetch_biomass(info):
-    try:
-        raw = info.get_biomass().get_value()
-    except:
-        return np.NaN, np.NaN
-    else:
-        l = raw.split(' - ')
-        return float(l[0]), float(l[1])
-
-def fetch_temperature(info):
-    try:
-        raw = info.get_temperature().get_value()
-    except:
-        return np.NaN
-    else:
-        min = float(raw.split(';')[0])
-        max = float(raw.split(';')[1])
-        return np.round(np.average([min, max]), 0)
-
-def fetch_wst(info):
-    try:
-        raw = info.get_waterstoragecapacity().get_value()
-    except:
-        return np.NaN, ''
-    else:
-        raw_list = raw.replace(" ", "").split('-')
-        if len(raw_list) < 4:
-            value = float(raw_list[0])
-            type = raw_list[1]
-        else:
-            value = np.NaN
-            type = raw_list[2]
-        return value, type
-
-def fetch_humidity(info):
-    try:
-        raw = info.get_humidity().get_value()
-    except:
-        return np.NaN, ''
-    else:
-        if type(raw) == float:
-            return raw, ''
-        else:
-            l = raw.split(' - ')
-            return float(l[0]), l[1]
-
-def fetch_soiltexture1(info):
-    try:
-        raw = info.get_soiltexture1().get_value()
-    except:
-        return ''
-    else:
-        return raw
-
-def fetch_spikecompound(info):
-    try:
-        raw = info.get_spikecompound().get_compoundLink()
-        spike_cpd = CompoundStructure(eP.requester, id=raw)
-        spike_smiles = spike_cpd.get_smiles()
-    except:
-        return ''
-    else:
-        return spike_smiles
-
-def fetch_soiltexture2(info):
-    try:
-        raw = info.get_soiltexture2().get_value()
-    except:
-        return np.NaN, np.NaN, np.NaN
-    else:
-        values = re.findall(r'\s([\d.]+)%', raw) # extract numeric percentages from the texture string
-        if values == []:
-            return np.NaN, np.NaN, np.NaN
-        return get_float_or_nan(values[0]), get_float_or_nan(values[1]), get_float_or_nan(values[2]) # sand, silt, clay
-
-def fetch_halflife_model(info):
-    try:
-        raw = info.get_halflife().get_value()
-    except:
-        return ''
-    else:
-        return raw.split(';')[0]
-
-def fetch_halflife_comment(info):
-    try:
-        raw = info.get_halflife().get_value()
-    except:
-        return ''
-    else:
-        return raw.split(';')[2]
-
-def get_float_or_nan(x):
-    try:
-        return float(x)
-    except:
-        return np.NaN
-
-def add_halflives(D, compound_id, row):
-    compound_index = row['Index']
-    reduced_smiles = row['Cropped_canonical_SMILES_no_stereo']
-    compound_structure = CompoundStructure(eP.requester, id=compound_id)
-    compound = Compound(eP.requester, id=compound_id)
-    print(compound_id)
-    halflives = compound_structure.get_halflifes()
-    smiles = compound.get_smiles()
-    for hl in halflives:
-        # compound info
-        D['index'].append(compound_index)
-        D['compound_id'].append(compound_id)
-        D['smiles'].append(smiles)
-        D['reduced_smiles'].append(reduced_smiles) # cropped_canonical_smiles_no_stereo
-        D['halflife'].append(float(hl.hl))
-        D['scenario_id'].append(hl.scenarioId)
-        D['log(Koc)'].append(row['log(Koc)'])
-        D['Koc_source'].append(row['Koc_source'])
-        print(hl.scenarioId)
-        # fetch data structures
-        scenario = Scenario(eP.requester, id=hl.scenarioId)
-        add_info = scenario.get_additional_information()
-        # add halflife details
-        D['halflife_model'].append(fetch_halflife_model(add_info))
-        D['halflife_comment'].append(fetch_halflife_comment(add_info))
-        D['study_name'].append(scenario.get_name().split(' - ')[0])
-        D['spike_compound'].append(fetch_spikecompound(add_info))
-        # fetch data points
-        D['acidity'].append(fetch_acidity(add_info))
-        D['CEC'].append(fetch_cec(add_info)) # cation exchange capacity
-        D['OC'].append(fetch_organic_content(add_info)) # organic content as organic carbon (oc)
-        start, end = fetch_biomass(add_info)
-        D['biomass_start'].append(start)
-        D['biomass_end'].append(end)
-        D['biomass'].append(np.round(np.average([start, end]), 2))
-        D['temperature'].append(fetch_temperature(add_info))
-        wst_value, wst_type = fetch_wst(add_info) # water storage capacity
-        D['wst_value'].append(wst_value)
-        D['wst_type'].append(wst_type)
-        hum, hum_cond = fetch_humidity(add_info)
-        D['humidity'].append(hum)
-        D['humidity_conditions'].append(hum_cond)
-        D['soil_texture'].append(fetch_soiltexture1(add_info))
-        _sand, _silt, _clay = fetch_soiltexture2(add_info)
-        D['sand'].append(_sand)
-        D['silt'].append(_silt)
-        D['clay'].append(_clay)
-    return D
-
-
-__main__()