Page MenuHomec4science

expand_dataset_.py
No OneTemporary

File Metadata

Created
Fri, Apr 19, 08:41

expand_dataset_.py

# Script by Jasmin, Jan 2022, Eawag
# Expand compound entries with all half-lives and associated data
# Goal: create full data set
import getpass
import re
import sys

import numpy as np
import pandas as pd

sys.path.insert(0, 'C:\\Users\\leetseng\\enviPath-python\\') #/Users/jasmin/enviPath-python/enviPath_python/
from enviPath_python.enviPath import *
from enviPath_python.objects import *
file_location = '/Halflife_modeling/'
# Define the instance to use
INSTANCE_HOST = 'https://envipath.org'
username = 'leetseng'
password = getpass.getpass()
eP = enviPath(INSTANCE_HOST)
eP.login(username, password)
# files
input_file_path = file_location+'input/soil_compounds_final.txt'
output_file_path_full = file_location+'output/full_dataset_half-lives.txt'
#todo :
# 1. create file similar to tomu's data set - can we improve model just by re-obtaining all the data ?
# 2. try to improve model
# try to switch to the sludge dataset
# try to collect the information and fill into the dictionary
# transfer the function to extract all compounds
def __main__():
    """Read the curated soil-compound table and write the expanded half-life dataset.

    For every compound row, fetch all half-lives (and those of an optional
    additional source compound) from enviPath and accumulate one record per
    half-life into a dict of columns, then dump it as a tab-separated file.
    """
    # input table: one row per compound
    data = pd.read_csv(input_file_path, sep='\t')
    # available headers: Index Name ID Inchikey_full Inchikey_first_14 Full_SMILES Canonical_SMIES Is_composite
    # Cropped_canonical_SMILES Cropped_canonical_SMILES_no_stereo SMILES_pH_7 Number_halflives Comment Additional_HL_source
    # Mol_weigth pKa pKa_5to8 charge_pH_7 log(Koc) Koc_source
    # column-oriented accumulator; every list must stay the same length
    D = {'index': [], 'compound_id': [], 'smiles': [], 'reduced_smiles': [], 'halflife': [], 'scenario_id': [], 'study_name': [],
         'halflife_model': [], 'halflife_comment': [], 'spike_compound': [], 'acidity': [], 'CEC': [], 'OC': [],
         'biomass_start': [], 'biomass_end': [], 'biomass': [], 'temperature': [], 'wst_value': [],
         'wst_type': [], 'humidity': [], 'humidity_conditions': [], 'soil_texture': [], 'sand': [], 'silt': [], 'clay': [], 'log(Koc)': [], 'Koc_source': []}
    for index, row in data.iterrows():
        compound_id = row['ID']
        print("COMPOUND: {}\n".format(compound_id))
        D = add_halflives(D, compound_id, row)
        # a non-NaN string here points at a second compound entry with more half-lives
        if isinstance(row['Additional_HL_source'], str):
            D = add_halflives(D, row['Additional_HL_source'], row)
    hl_data = pd.DataFrame.from_dict(D)  # convert dict into DF
    hl_data.to_csv(output_file_path_full, sep='\t')
def fetch_acidity(info):
    """Return the soil pH from a scenario's additional information, rounded to 1 decimal.

    The raw value may be a single number, a range ('a - b', averaged via
    range_to_average), or a ';'-separated list (only the first entry is
    used). Returns np.NaN when the scenario has no acidity entry.
    """
    try:
        raw_pH = info.get_acidity().get_value()
    except Exception:  # no acidity recorded for this scenario
        return np.NaN
    if ';' in raw_pH:
        # several measurements reported; keep only the first one
        first = raw_pH.split(';')[0]
        pH = range_to_average(first) if '-' in first else float(first)
    elif '-' in raw_pH:
        # a range, e.g. '6.0 - 7.0' -> use the mean
        pH = range_to_average(raw_pH)
    else:
        pH = float(raw_pH)
    return np.round(pH, 1)
def range_to_average(input_string):
    """Return the mean of a "low - high" range string, e.g. '5.0 - 7.0' -> 6.0.

    Note: the separator is ' - ' with surrounding spaces; extra fields
    beyond the first two are ignored.
    """
    # avoid shadowing the builtins min/max
    low, high = (float(part) for part in input_string.split(' - ')[:2])
    return np.average([low, high])
def fetch_cec(info):
    """Return the cation exchange capacity value as stored, or np.NaN if absent."""
    try:
        return info.get_cec().get_value()
    except Exception:  # no CEC recorded for this scenario
        return np.NaN
def fetch_organic_content(info):
    """Return organic content as organic carbon (OC, %), or np.NaN if absent.

    The raw field is ';'-separated, e.g. '2.1;OC' or '3.4;OM': numeric
    value(s) followed by a unit token. 'OM' (organic matter) is converted
    via OC = OM / 1.7 (Schwarzenbach). A '<x' entry is taken as x; empty
    or '-' entries count as NaN.
    """
    try:
        raw = info.get_omcontent().get_value()
    except Exception:  # no organic-matter content recorded
        return np.NaN
    oc = np.NaN
    # pre-initialize so a unit token appearing before any value yields NaN
    # instead of raising NameError
    val = np.NaN
    for token in raw.split(';'):
        if token == 'OC':
            oc = val
        elif token == 'OM':
            oc = val / 1.7  # OC = OM / 1.7, source: Schwarzenbach
        elif '<' in token:
            val = float(token[1:])  # '<0.5' -> 0.5 (upper bound taken as value)
            print("Warning: {} was converted to {}".format(token, val))
        elif token == '' or token == '-':
            val = np.NaN
        else:
            val = float(token)
    return oc
def fetch_biomass(info):
    """Return (start, end) biomass parsed from a 'start - end' string.

    Returns (np.NaN, np.NaN) when the scenario has no biomass entry.
    """
    try:
        raw = info.get_biomass().get_value()
    except Exception:  # no biomass recorded for this scenario
        return np.NaN, np.NaN
    start, end = raw.split(' - ')[:2]
    return float(start), float(end)
def fetch_temperature(info):
    """Return the mean of a 'min;max' temperature string, rounded to an integer.

    Returns np.NaN when the scenario has no temperature entry.
    """
    try:
        raw = info.get_temperature().get_value()
    except Exception:  # no temperature recorded for this scenario
        return np.NaN
    parts = raw.split(';')
    low, high = float(parts[0]), float(parts[1])
    return np.round(np.average([low, high]), 0)
def fetch_wst(info):
    """Return (value, kind) of the water storage capacity, or (np.NaN, '') if absent.

    The raw string is split on '-' after removing all spaces. Short forms
    carry 'value-kind...'; longer forms (>= 4 fields) have no usable
    numeric value and carry the kind in field index 2.
    """
    try:
        raw = info.get_waterstoragecapacity().get_value()
    except Exception:  # no water storage capacity recorded
        return np.NaN, ''
    fields = raw.replace(" ", "").split('-')
    if len(fields) < 4:
        # avoid shadowing the builtin `type` for the kind label
        return float(fields[0]), fields[1]
    return np.NaN, fields[2]
def fetch_humidity(info):
    """Return (humidity, conditions); (np.NaN, '') when the entry is absent.

    A bare float value has no conditions; otherwise the raw string is
    'value - conditions'.
    """
    try:
        raw = info.get_humidity().get_value()
    except Exception:  # no humidity recorded for this scenario
        return np.NaN, ''
    # isinstance also accepts float subclasses (e.g. numpy floats) that
    # would otherwise crash on .split below
    if isinstance(raw, float):
        return raw, ''
    parts = raw.split(' - ')
    return float(parts[0]), parts[1]
def fetch_soiltexture1(info):
    """Return the soil texture class string as stored, or '' if absent."""
    try:
        return info.get_soiltexture1().get_value()
    except Exception:  # no soil texture recorded for this scenario
        return ''
def fetch_spikecompound(info):
    """Return the SMILES of the spiked compound, or '' when unavailable.

    Resolves the spike compound's link via the global enviPath session
    (eP); any lookup failure falls back to an empty string.
    """
    try:
        link = info.get_spikecompound().get_compoundLink()
        spike_cpd = CompoundStructure(eP.requester, id=link)
        return spike_cpd.get_smiles()
    except Exception:  # no spike compound, or the remote lookup failed
        return ''
def fetch_soiltexture2(info):
    """Return (sand, silt, clay) percentages parsed from the texture string.

    Expects up to three whitespace-preceded percentage values, e.g.
    '... 40.0% ... 35.0% ... 25.0%'. Missing entries and unparsable
    values yield np.NaN.
    """
    try:
        raw = info.get_soiltexture2().get_value()
    except Exception:  # no detailed soil texture recorded
        return np.NaN, np.NaN, np.NaN
    values = re.findall(r'\s([\d.]+)%', raw)
    if not values:
        return np.NaN, np.NaN, np.NaN
    # pad so fewer than three matches become NaN instead of an IndexError
    values += [''] * (3 - len(values))
    return (get_float_or_nan(values[0]),
            get_float_or_nan(values[1]),
            get_float_or_nan(values[2]))  # sand, silt, clay
def fetch_halflife_model(info):
    """Return the kinetic model name (first ';'-field of the halflife value), or ''."""
    try:
        raw = info.get_halflife().get_value()
    except Exception:  # no halflife details recorded
        return ''
    return raw.split(';')[0]
def fetch_halflife_comment(info):
    """Return the halflife comment (third ';'-field of the halflife value), or ''.

    NOTE(review): raises IndexError if the stored value has fewer than
    three ';'-fields — presumably the enviPath format guarantees three.
    """
    try:
        raw = info.get_halflife().get_value()
    except Exception:  # no halflife details recorded
        return ''
    return raw.split(';')[2]
def get_float_or_nan(x):
    """Return float(x), or np.NaN when x cannot be converted."""
    try:
        return float(x)
    except (TypeError, ValueError):  # non-numeric string or incompatible type
        return np.NaN
def add_halflives(D, compound_id, row):
    """Append one record per half-life of `compound_id` to the dict of lists D.

    Fetches the compound and every half-life scenario from enviPath (via
    the global session eP) and appends a value to every column list of D,
    keeping all lists the same length. Returns the mutated D.

    Note: the section markers below ('compound info', 'fetch data
    structures', ...) were plain text in the extracted source, i.e.
    syntax errors; they are restored here as comments.
    """
    compound_index = row['Index']
    reduced_smiles = row['Cropped_canonical_SMILES_no_stereo']
    compound_structure = CompoundStructure(eP.requester, id=compound_id)
    compound = Compound(eP.requester, id=compound_id)
    print(compound_id)
    halflives = compound_structure.get_halflifes()
    smiles = compound.get_smiles()
    for hl in halflives:
        # compound info
        D['index'].append(compound_index)
        D['compound_id'].append(compound_id)
        D['smiles'].append(smiles)
        D['reduced_smiles'].append(reduced_smiles)  # cropped_canonical_smiles_no_stereo
        D['halflife'].append(float(hl.hl))
        D['scenario_id'].append(hl.scenarioId)
        D['log(Koc)'].append(row['log(Koc)'])
        D['Koc_source'].append(row['Koc_source'])
        print(hl.scenarioId)
        # fetch data structures
        scenario = Scenario(eP.requester, id=hl.scenarioId)
        add_info = scenario.get_additional_information()
        # add halflife details
        D['halflife_model'].append(fetch_halflife_model(add_info))
        D['halflife_comment'].append(fetch_halflife_comment(add_info))
        D['study_name'].append(scenario.get_name().split(' - ')[0])
        D['spike_compound'].append(fetch_spikecompound(add_info))
        # fetch data points
        D['acidity'].append(fetch_acidity(add_info))
        D['CEC'].append(fetch_cec(add_info))  # cation exchange capacity
        D['OC'].append(fetch_organic_content(add_info))  # organic content as organic carbon (oc)
        start, end = fetch_biomass(add_info)
        D['biomass_start'].append(start)
        D['biomass_end'].append(end)
        D['biomass'].append(np.round(np.average([start, end]), 2))
        D['temperature'].append(fetch_temperature(add_info))
        wst_value, wst_type = fetch_wst(add_info)  # water storage capacity
        D['wst_value'].append(wst_value)
        D['wst_type'].append(wst_type)
        hum, hum_cond = fetch_humidity(add_info)
        D['humidity'].append(hum)
        D['humidity_conditions'].append(hum_cond)
        D['soil_texture'].append(fetch_soiltexture1(add_info))
        _sand, _silt, _clay = fetch_soiltexture2(add_info)
        D['sand'].append(_sand)
        D['silt'].append(_silt)
        D['clay'].append(_clay)
    return D
__main__()

Event Timeline