Page MenuHomec4science

expand_dataset_.py
No OneTemporary

File Metadata

Created
Fri, Apr 19, 08:41

expand_dataset_.py

# Script by Jasmin, Jan 2022, Eawag
# Expand compound entries with all half-lives and associated data
# Goal: create full data set
import getpass
import re
import sys

import numpy as np
import pandas as pd

sys.path.insert(0, 'C:\\Users\\leetseng\\enviPath-python\\') #/Users/jasmin/enviPath-python/enviPath_python/
from enviPath_python.enviPath import *
from enviPath_python.objects import *
file_location = '/Halflife_modeling/'
# Define the instance to use
INSTANCE_HOST = 'https://envipath.org'
username = 'leetseng'
password = getpass.getpass()
eP = enviPath(INSTANCE_HOST)
eP.login(username, password)
# files
input_file_path = file_location+'input/soil_compounds_final.txt'
output_file_path_full = file_location+'output/full_dataset_half-lives.txt'
#todo :
# 1. create file similar to tomu's data set - can we improve model just by re-obtaining all the data ?
# 2. try to improve model
# try to switch to the sludge dataset
# try to collect the information and fill into the dictionary
# transfer the function to extract all compounds
def __main__():
    """Read the curated soil-compound table and write the expanded half-life dataset.

    For every compound row, fetch all half-lives (and those of an optional
    additional source compound) from enviPath and accumulate one record per
    half-life into a dict of columns, then dump it as a tab-separated file.
    """
    # input table: one row per compound
    data = pd.read_csv(input_file_path, sep='\t')
    # available headers: Index Name ID Inchikey_full Inchikey_first_14 Full_SMILES Canonical_SMIES Is_composite
    # Cropped_canonical_SMILES Cropped_canonical_SMILES_no_stereo SMILES_pH_7 Number_halflives Comment Additional_HL_source
    # Mol_weigth pKa pKa_5to8 charge_pH_7 log(Koc) Koc_source
    # column-oriented accumulator; every list must stay the same length
    D = {'index': [], 'compound_id': [], 'smiles': [], 'reduced_smiles': [], 'halflife': [], 'scenario_id': [], 'study_name': [],
         'halflife_model': [], 'halflife_comment': [], 'spike_compound': [], 'acidity': [], 'CEC': [], 'OC': [],
         'biomass_start': [], 'biomass_end': [], 'biomass': [], 'temperature': [], 'wst_value': [],
         'wst_type': [], 'humidity': [], 'humidity_conditions': [], 'soil_texture': [], 'sand': [], 'silt': [], 'clay': [], 'log(Koc)': [], 'Koc_source': []}
    for index, row in data.iterrows():
        compound_id = row['ID']
        print("COMPOUND: {}\n".format(compound_id))
        D = add_halflives(D, compound_id, row)
        # a non-NaN string here points at a second compound entry with more half-lives
        if isinstance(row['Additional_HL_source'], str):
            D = add_halflives(D, row['Additional_HL_source'], row)
    hl_data = pd.DataFrame.from_dict(D)  # convert dict into DF
    hl_data.to_csv(output_file_path_full, sep='\t')
def fetch_acidity(info):
    """Return the soil pH from a scenario's additional information, rounded to 1 decimal.

    The raw value may be a single number, a range ('a - b', averaged via
    range_to_average), or a ';'-separated list (only the first entry is
    used). Returns np.NaN when the scenario has no acidity entry.
    """
    try:
        raw_pH = info.get_acidity().get_value()
    except Exception:  # no acidity recorded for this scenario
        return np.NaN
    if ';' in raw_pH:
        # several measurements reported; keep only the first one
        first = raw_pH.split(';')[0]
        pH = range_to_average(first) if '-' in first else float(first)
    elif '-' in raw_pH:
        # a range, e.g. '6.0 - 7.0' -> use the mean
        pH = range_to_average(raw_pH)
    else:
        pH = float(raw_pH)
    return np.round(pH, 1)
def range_to_average(input_string):
    """Return the mean of a "low - high" range string, e.g. '5.0 - 7.0' -> 6.0.

    Note: the separator is ' - ' with surrounding spaces; extra fields
    beyond the first two are ignored.
    """
    # avoid shadowing the builtins min/max
    low, high = (float(part) for part in input_string.split(' - ')[:2])
    return np.average([low, high])
def fetch_cec(info):
    """Return the cation exchange capacity value as stored, or np.NaN if absent."""
    try:
        return info.get_cec().get_value()
    except Exception:  # no CEC recorded for this scenario
        return np.NaN
def fetch_organic_content(info):
    """Return organic content as organic carbon (OC, %), or np.NaN if absent.

    The raw field is ';'-separated, e.g. '2.1;OC' or '3.4;OM': numeric
    value(s) followed by a unit token. 'OM' (organic matter) is converted
    via OC = OM / 1.7 (Schwarzenbach). A '<x' entry is taken as x; empty
    or '-' entries count as NaN.
    """
    try:
        raw = info.get_omcontent().get_value()
    except Exception:  # no organic-matter content recorded
        return np.NaN
    oc = np.NaN
    # pre-initialize so a unit token appearing before any value yields NaN
    # instead of raising NameError
    val = np.NaN
    for token in raw.split(';'):
        if token == 'OC':
            oc = val
        elif token == 'OM':
            oc = val / 1.7  # OC = OM / 1.7, source: Schwarzenbach
        elif '<' in token:
            val = float(token[1:])  # '<0.5' -> 0.5 (upper bound taken as value)
            print("Warning: {} was converted to {}".format(token, val))
        elif token == '' or token == '-':
            val = np.NaN
        else:
            val = float(token)
    return oc
def fetch_biomass(info):
    """Return (start, end) biomass parsed from a 'start - end' string.

    Returns (np.NaN, np.NaN) when the scenario has no biomass entry.
    """
    try:
        raw = info.get_biomass().get_value()
    except Exception:  # no biomass recorded for this scenario
        return np.NaN, np.NaN
    start, end = raw.split(' - ')[:2]
    return float(start), float(end)
def fetch_temperature(info):
    """Return the mean of a 'min;max' temperature string, rounded to an integer.

    Returns np.NaN when the scenario has no temperature entry.
    """
    try:
        raw = info.get_temperature().get_value()
    except Exception:  # no temperature recorded for this scenario
        return np.NaN
    parts = raw.split(';')
    low, high = float(parts[0]), float(parts[1])
    return np.round(np.average([low, high]), 0)
def fetch_wst(info):
    """Return (value, kind) of the water storage capacity, or (np.NaN, '') if absent.

    The raw string is split on '-' after removing all spaces. Short forms
    carry 'value-kind...'; longer forms (>= 4 fields) have no usable
    numeric value and carry the kind in field index 2.
    """
    try:
        raw = info.get_waterstoragecapacity().get_value()
    except Exception:  # no water storage capacity recorded
        return np.NaN, ''
    fields = raw.replace(" ", "").split('-')
    if len(fields) < 4:
        # avoid shadowing the builtin `type` for the kind label
        return float(fields[0]), fields[1]
    return np.NaN, fields[2]
def fetch_humidity(info):
    """Return (humidity, conditions); (np.NaN, '') when the entry is absent.

    A bare float value has no conditions; otherwise the raw string is
    'value - conditions'.
    """
    try:
        raw = info.get_humidity().get_value()
    except Exception:  # no humidity recorded for this scenario
        return np.NaN, ''
    # isinstance also accepts float subclasses (e.g. numpy floats) that
    # would otherwise crash on .split below
    if isinstance(raw, float):
        return raw, ''
    parts = raw.split(' - ')
    return float(parts[0]), parts[1]
def fetch_soiltexture1(info):
    """Return the soil texture class string as stored, or '' if absent."""
    try:
        return info.get_soiltexture1().get_value()
    except Exception:  # no soil texture recorded for this scenario
        return ''
def fetch_spikecompound(info):
    """Return the SMILES of the spiked compound, or '' when unavailable.

    Resolves the spike compound's link via the global enviPath session
    (eP); any lookup failure falls back to an empty string.
    """
    try:
        link = info.get_spikecompound().get_compoundLink()
        spike_cpd = CompoundStructure(eP.requester, id=link)
        return spike_cpd.get_smiles()
    except Exception:  # no spike compound, or the remote lookup failed
        return ''
def fetch_soiltexture2(info):
    """Return (sand, silt, clay) percentages parsed from the texture string.

    Expects up to three whitespace-preceded percentage values, e.g.
    '... 40.0% ... 35.0% ... 25.0%'. Missing entries and unparsable
    values yield np.NaN.
    """
    try:
        raw = info.get_soiltexture2().get_value()
    except Exception:  # no detailed soil texture recorded
        return np.NaN, np.NaN, np.NaN
    values = re.findall(r'\s([\d.]+)%', raw)
    if not values:
        return np.NaN, np.NaN, np.NaN
    # pad so fewer than three matches become NaN instead of an IndexError
    values += [''] * (3 - len(values))
    return (get_float_or_nan(values[0]),
            get_float_or_nan(values[1]),
            get_float_or_nan(values[2]))  # sand, silt, clay
def fetch_halflife_model(info):
    """Return the kinetic model name (first ';'-field of the halflife value), or ''."""
    try:
        raw = info.get_halflife().get_value()
    except Exception:  # no halflife details recorded
        return ''
    return raw.split(';')[0]
def fetch_halflife_comment(info):
    """Return the halflife comment (third ';'-field of the halflife value), or ''.

    NOTE(review): raises IndexError if the stored value has fewer than
    three ';'-fields — presumably the enviPath format guarantees three.
    """
    try:
        raw = info.get_halflife().get_value()
    except Exception:  # no halflife details recorded
        return ''
    return raw.split(';')[2]
def get_float_or_nan(x):
    """Return float(x), or np.NaN when x cannot be converted."""
    try:
        return float(x)
    except (TypeError, ValueError):  # non-numeric string or incompatible type
        return np.NaN
def add_halflives(D, compound_id, row):
    """Append one record per half-life of `compound_id` to the dict of lists D.

    Fetches the compound and every half-life scenario from enviPath (via
    the global session eP) and appends a value to every column list of D,
    keeping all lists the same length. Returns the mutated D.

    Note: the section markers below ('compound info', 'fetch data
    structures', ...) were plain text in the extracted source, i.e.
    syntax errors; they are restored here as comments.
    """
    compound_index = row['Index']
    reduced_smiles = row['Cropped_canonical_SMILES_no_stereo']
    compound_structure = CompoundStructure(eP.requester, id=compound_id)
    compound = Compound(eP.requester, id=compound_id)
    print(compound_id)
    halflives = compound_structure.get_halflifes()
    smiles = compound.get_smiles()
    for hl in halflives:
        # compound info
        D['index'].append(compound_index)
        D['compound_id'].append(compound_id)
        D['smiles'].append(smiles)
        D['reduced_smiles'].append(reduced_smiles)  # cropped_canonical_smiles_no_stereo
        D['halflife'].append(float(hl.hl))
        D['scenario_id'].append(hl.scenarioId)
        D['log(Koc)'].append(row['log(Koc)'])
        D['Koc_source'].append(row['Koc_source'])
        print(hl.scenarioId)
        # fetch data structures
        scenario = Scenario(eP.requester, id=hl.scenarioId)
        add_info = scenario.get_additional_information()
        # add halflife details
        D['halflife_model'].append(fetch_halflife_model(add_info))
        D['halflife_comment'].append(fetch_halflife_comment(add_info))
        D['study_name'].append(scenario.get_name().split(' - ')[0])
        D['spike_compound'].append(fetch_spikecompound(add_info))
        # fetch data points
        D['acidity'].append(fetch_acidity(add_info))
        D['CEC'].append(fetch_cec(add_info))  # cation exchange capacity
        D['OC'].append(fetch_organic_content(add_info))  # organic content as organic carbon (oc)
        start, end = fetch_biomass(add_info)
        D['biomass_start'].append(start)
        D['biomass_end'].append(end)
        D['biomass'].append(np.round(np.average([start, end]), 2))
        D['temperature'].append(fetch_temperature(add_info))
        wst_value, wst_type = fetch_wst(add_info)  # water storage capacity
        D['wst_value'].append(wst_value)
        D['wst_type'].append(wst_type)
        hum, hum_cond = fetch_humidity(add_info)
        D['humidity'].append(hum)
        D['humidity_conditions'].append(hum_cond)
        D['soil_texture'].append(fetch_soiltexture1(add_info))
        _sand, _silt, _clay = fetch_soiltexture2(add_info)
        D['sand'].append(_sand)
        D['silt'].append(_silt)
        D['clay'].append(_clay)
    return D
__main__()

Event Timeline