Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F88348047
expand_dataset_.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Fri, Oct 18, 07:59
Size
8 KB
Mime Type
text/x-python
Expires
Sun, Oct 20, 07:59 (2 d)
Engine
blob
Format
Raw Data
Handle
21757815
Attached To
rTWTEST master_thesis_Lee
expand_dataset_.py
View Options
# Script by Jasmin, Jan 2022, Eawag
# Expand compound entries with all half-lives and associated data
# Goal: create full data set
import
pandas
as
pd
import
numpy
as
np
import
sys
import
re
sys
.
path
.
insert
(
0
,
'C:
\\
Users
\\
leetseng
\\
enviPath-python
\\
'
)
#/Users/jasmin/enviPath-python/enviPath_python/
from
enviPath_python.enviPath
import
*
from
enviPath_python.objects
import
*
# Base directory; the input and output file paths below are built from it.
file_location = '/Halflife_modeling/'

# Define the instance to use
INSTANCE_HOST = 'https://envipath.org'
username = 'leetseng'

# Imported explicitly (and after the wildcard imports above) so that `getpass`
# is guaranteed to be the standard-library module instead of whatever name a
# wildcard import happens to expose.
import getpass

# The password is prompted for interactively rather than stored in the script.
password = getpass.getpass()

# Module-level client; the functions below use it through `eP.requester`.
eP = enviPath(INSTANCE_HOST)
eP.login(username, password)

# files
input_file_path = file_location + 'input/soil_compounds_final.txt'
output_file_path_full = file_location + 'output/full_dataset_half-lives.txt'
#todo :
# 1. create file similar to tomu's data set - can we improve model just by re-obtaining all the data ?
# 2. try to improve model
# try to switch to the sludge dataset
# try to collect the information and fill into the dictionary
# transfer the function to extract all compounds
def __main__():
    """Build the expanded half-life data set and write it to disk.

    Reads the tab-separated compound table at ``input_file_path``, lets
    ``add_halflives`` append the half-life records of every row's compound
    (and of the entry in ``Additional_HL_source`` when that cell holds a
    string), then writes the collected columns as a tab-separated file to
    ``output_file_path_full``.
    """
    # data frames
    data = pd.read_csv(input_file_path, sep='\t')
    # available headers: Index Name ID Inchikey_full Inchikey_first_14 Full_SMILES Canonical_SMIES Is_composite
    # Cropped_canonical_SMILES Cropped_canonical_SMILES_no_stereo SMILES_pH_7 Number_halflives Comment Additional_HL_source
    # Mol_weigth pKa pKa_5to8 charge_pH_7 log(Koc) Koc_source

    # Output columns, in the order they appear in the written file.
    columns = [
        'index', 'compound_id', 'smiles', 'reduced_smiles', 'halflife',
        'scenario_id', 'study_name', 'halflife_model', 'halflife_comment',
        'spike_compound', 'acidity', 'CEC', 'OC', 'biomass_start',
        'biomass_end', 'biomass', 'temperature', 'wst_value', 'wst_type',
        'humidity', 'humidity_conditions', 'soil_texture', 'sand', 'silt',
        'clay', 'log(Koc)', 'Koc_source',
    ]
    D = {column: [] for column in columns}

    for _, row in data.iterrows():
        compound_id = row['ID']
        print("COMPOUND: {}\n".format(compound_id))
        D = add_halflives(D, compound_id, row)
        # Only a string value is treated as an additional half-life source;
        # any other cell content (e.g. a missing value) is skipped.
        additional_source = row['Additional_HL_source']
        if isinstance(additional_source, str):
            D = add_halflives(D, additional_source, row)

    hl_data = pd.DataFrame.from_dict(D)  # convert dict into DF
    hl_data.to_csv(output_file_path_full, sep='\t')
def fetch_acidity(info):
    """Return the pH stored in ``info`` rounded to one decimal, or NaN.

    ``info`` must provide ``get_acidity().get_value()``. Only the text before
    the first ';' is used. If that text contains '-', it is treated as a range
    and replaced by its mean via ``range_to_average``; otherwise it is parsed
    as a single float.

    Returns NaN when the value cannot be obtained from ``info``.
    """
    try:
        raw_pH = info.get_acidity().get_value()
    except Exception:
        # Best effort: any failure to obtain the value means "not available".
        # Catching Exception (not a bare except) keeps Ctrl-C working.
        return np.nan

    # Without a ';' the split returns the whole string, so one code path
    # covers both the "value;extra" and the plain "value" form.
    ph_text = raw_pH.split(';')[0]
    if '-' in ph_text:
        # if range, get mean value
        pH = range_to_average(ph_text)
    else:
        pH = float(ph_text)
    return np.round(pH, 1)
def range_to_average(input_string):
    """Return the mean of the first two bounds of a textual range.

    The bounds are expected to be separated by ' - ' (e.g. '6.5 - 7.5').
    Callers in this file decide that a value is a range by testing for a bare
    '-', so a range written without spaces (e.g. '6.5-7.5') is accepted too;
    in that case the string is split at its last '-'.

    Raises ValueError if a bound is not a valid float and IndexError if only
    one bound is present.
    """
    if ' - ' in input_string:
        bounds = input_string.split(' - ')
    else:
        bounds = input_string.rsplit('-', 1)
    lower = float(bounds[0])
    upper = float(bounds[1])
    return np.average([lower, upper])
def fetch_cec(info):
    """Return the CEC (cation exchange capacity) value of ``info``, or NaN.

    The value from ``info.get_cec().get_value()`` is returned unchanged.
    NaN is returned when it cannot be obtained.
    """
    try:
        return info.get_cec().get_value()
    except Exception:
        # Best effort: treat any lookup failure as a missing value, but let
        # KeyboardInterrupt / SystemExit propagate.
        return np.nan
def fetch_organic_content(info):
    """Return the organic carbon (OC) content stored in ``info``, or NaN.

    The raw value from ``info.get_omcontent().get_value()`` is a ';'-separated
    string. Numeric fields are remembered as the current value; a following
    'OC' field takes that value as is, and a following 'OM' field converts it
    with OC = OM / 1.7. A field starting with '<' is parsed without its first
    character (a warning is printed); '' and '-' count as NaN.

    Returns NaN when the value cannot be obtained, when no 'OC'/'OM' tag is
    present, or when a tag appears before any numeric field.
    """
    try:
        raw = info.get_omcontent().get_value()
    except Exception:
        # Best effort: a value that cannot be read is reported as missing.
        return np.nan

    oc = np.nan
    # Initialised so that a tag preceding any number yields NaN instead of
    # raising UnboundLocalError.
    val = np.nan
    for field in raw.split(';'):
        if field == 'OC':
            oc = val
        elif field == 'OM':
            oc = val / 1.7  # OC = OM / 1.7, source: Schwarzenbach
        elif '<' in field:
            val = float(field[1:])
            print("Warning: {} was converted to {}".format(field, val))
        elif field == '' or field == '-':
            val = np.nan
        else:
            val = float(field)
    return oc
def fetch_biomass(info):
    """Return the two biomass values of ``info`` as a ``(first, second)`` tuple.

    The raw value from ``info.get_biomass().get_value()`` is split on ' - '
    and its first two parts are converted to float. ``(NaN, NaN)`` is returned
    when the value cannot be obtained.

    Raises ValueError / IndexError if the value is present but malformed.
    """
    try:
        raw = info.get_biomass().get_value()
    except Exception:
        # Best effort: missing biomass information is not an error.
        return np.nan, np.nan
    parts = raw.split(' - ')
    return float(parts[0]), float(parts[1])
def fetch_temperature(info):
    """Return the mean of the two temperature values of ``info``, or NaN.

    The raw value from ``info.get_temperature().get_value()`` is split on ';';
    the first two fields are averaged and the result is rounded to zero
    decimals. NaN is returned when the value cannot be obtained.

    Raises ValueError / IndexError if the value is present but malformed.
    """
    try:
        raw = info.get_temperature().get_value()
    except Exception:
        # Best effort: missing temperature information is not an error.
        return np.nan
    fields = raw.split(';')
    lower = float(fields[0])
    upper = float(fields[1])
    return np.round(np.average([lower, upper]), 0)
def fetch_wst(info):
    """Return ``(value, type)`` of the water storage capacity in ``info``.

    The raw value from ``info.get_waterstoragecapacity().get_value()`` has its
    spaces removed and is split on '-'. With fewer than four parts, the first
    part is parsed as the float value and the second part is the type. With
    four or more parts, the value is NaN and the third part is the type.

    Returns ``(NaN, '')`` when the value cannot be obtained.
    """
    try:
        raw = info.get_waterstoragecapacity().get_value()
    except Exception:
        # Best effort: missing information is not an error.
        return np.nan, ''
    parts = raw.replace(" ", "").split('-')
    if len(parts) < 4:
        return float(parts[0]), parts[1]
    return np.nan, parts[2]
def fetch_humidity(info):
    """Return ``(humidity, conditions)`` for the humidity entry of ``info``.

    If ``info.get_humidity().get_value()`` is already a float (including float
    subclasses such as ``numpy.float64``), it is returned with an empty
    conditions string. Otherwise the value is split on ' - ': the first part
    is parsed as float and the second part is returned as the conditions.

    Returns ``(NaN, '')`` when the value cannot be obtained.
    """
    try:
        raw = info.get_humidity().get_value()
    except Exception:
        # Best effort: missing humidity information is not an error.
        return np.nan, ''
    if isinstance(raw, float):
        return raw, ''
    parts = raw.split(' - ')
    return float(parts[0]), parts[1]
def fetch_soiltexture1(info):
    """Return the value of ``info.get_soiltexture1()``, or '' if unavailable."""
    try:
        return info.get_soiltexture1().get_value()
    except Exception:
        # Best effort: a missing entry becomes an empty string. Catching
        # Exception (not a bare except) keeps Ctrl-C working.
        return ''
def fetch_spikecompound(info):
    """Return the SMILES of the spike compound referenced by ``info``, or ''.

    The compound link from ``info.get_spikecompound().get_compoundLink()`` is
    used as the ``id`` of a ``CompoundStructure`` created with the module-level
    ``eP.requester``, and that structure's SMILES is returned.

    Returns '' when any of these steps fails.
    """
    try:
        compound_link = info.get_spikecompound().get_compoundLink()
        spike_cpd = CompoundStructure(eP.requester, id=compound_link)
        return spike_cpd.get_smiles()
    except Exception:
        # Best effort: both a missing entry and a failed lookup yield ''.
        # Catching Exception (not a bare except) keeps Ctrl-C working.
        return ''
def fetch_soiltexture2(info):
    """Return the ``(sand, silt, clay)`` percentages stored in ``info``.

    Every number that is preceded by whitespace and followed by '%' is
    extracted from ``info.get_soiltexture2().get_value()``; the first three
    matches are returned, in order, as sand, silt and clay, each converted
    with ``get_float_or_nan``.

    Returns ``(NaN, NaN, NaN)`` when the value cannot be obtained or when
    fewer than three percentages are found. In the latter case the matches
    cannot be assigned to the three fractions by position, and indexing them
    would raise IndexError.
    """
    missing = (np.nan, np.nan, np.nan)
    try:
        raw = info.get_soiltexture2().get_value()
    except Exception:
        # Best effort: missing texture information is not an error.
        return missing
    values = re.findall(r'\s([\d.]+)%', raw)
    if len(values) < 3:
        return missing
    # sand, silt, clay
    return get_float_or_nan(values[0]), get_float_or_nan(values[1]), get_float_or_nan(values[2])
def fetch_halflife_model(info):
    """Return the first ';'-separated field of the half-life value, or ''.

    The value is read from ``info.get_halflife().get_value()``; '' is returned
    when it cannot be obtained.
    """
    try:
        raw = info.get_halflife().get_value()
    except Exception:
        # Best effort: a missing entry becomes an empty string. Catching
        # Exception (not a bare except) keeps Ctrl-C working.
        return ''
    return raw.split(';')[0]
def fetch_halflife_comment(info):
    """Return the third ';'-separated field of the half-life value, or ''.

    The value is read from ``info.get_halflife().get_value()``. '' is returned
    when it cannot be obtained or has fewer than three fields; the latter
    would otherwise raise IndexError.
    """
    try:
        raw = info.get_halflife().get_value()
    except Exception:
        # Best effort: a missing entry becomes an empty string. Catching
        # Exception (not a bare except) keeps Ctrl-C working.
        return ''
    fields = raw.split(';')
    return fields[2] if len(fields) > 2 else ''
def get_float_or_nan(x):
    """Return ``float(x)``, or NaN when ``x`` cannot be converted to a float."""
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are mapped to NaN; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        return np.nan
def add_halflives(D, compound_id, row):
    """Append one record per half-life of ``compound_id`` to ``D``.

    Parameters:
        D: dict mapping every output column name to a list. The lists are
            extended in place and the same dict is returned.
        compound_id: identifier passed as ``id`` to both ``CompoundStructure``
            and ``Compound`` (created with the module-level ``eP.requester``).
        row: row of the input table; supplies 'Index',
            'Cropped_canonical_SMILES_no_stereo', 'log(Koc)' and 'Koc_source'.

    Returns:
        ``D``, with one new entry in every column for each half-life returned
        by ``CompoundStructure.get_halflifes()``.
    """
    compound_index = row['Index']
    reduced_smiles = row['Cropped_canonical_SMILES_no_stereo']
    compound_structure = CompoundStructure(eP.requester, id=compound_id)
    compound = Compound(eP.requester, id=compound_id)
    print(compound_id)
    halflives = compound_structure.get_halflifes()
    smiles = compound.get_smiles()

    for hl in halflives:
        # compound info
        halflife = float(hl.hl)
        print(hl.scenarioId)

        # fetch data structures
        scenario = Scenario(eP.requester, id=hl.scenarioId)
        add_info = scenario.get_additional_information()

        # add halflife details
        halflife_model = fetch_halflife_model(add_info)
        halflife_comment = fetch_halflife_comment(add_info)
        study_name = scenario.get_name().split(' - ')[0]
        spike_compound = fetch_spikecompound(add_info)

        # fetch data points
        acidity = fetch_acidity(add_info)
        cec = fetch_cec(add_info)  # cation exchange capacity
        oc = fetch_organic_content(add_info)  # organic content as organic carbon (oc)
        start, end = fetch_biomass(add_info)
        temperature = fetch_temperature(add_info)
        wst_value, wst_type = fetch_wst(add_info)  # water storage capacity,
        hum, hum_cond = fetch_humidity(add_info)
        soil_texture = fetch_soiltexture1(add_info)
        _sand, _silt, _clay = fetch_soiltexture2(add_info)

        record = {
            'index': compound_index,
            'compound_id': compound_id,
            'smiles': smiles,
            'reduced_smiles': reduced_smiles,  # cropped_canonical_smiles_no_stereo
            'halflife': halflife,
            'scenario_id': hl.scenarioId,
            'log(Koc)': row['log(Koc)'],
            'Koc_source': row['Koc_source'],
            'halflife_model': halflife_model,
            'halflife_comment': halflife_comment,
            'study_name': study_name,
            'spike_compound': spike_compound,
            'acidity': acidity,
            'CEC': cec,
            'OC': oc,
            'biomass_start': start,
            'biomass_end': end,
            'biomass': np.round(np.average([start, end]), 2),
            'temperature': temperature,
            'wst_value': wst_value,
            'wst_type': wst_type,
            'humidity': hum,
            'humidity_conditions': hum_cond,
            'soil_texture': soil_texture,
            'sand': _sand,
            'silt': _silt,
            'clay': _clay,
        }
        # Append only once the whole record is assembled, so that an error
        # while fetching one field cannot leave the columns with unequal
        # lengths.
        for column, value in record.items():
            D[column].append(value)
    return D
# Run the export only when this file is executed as a script.
if __name__ == '__main__':
    __main__()
Event Timeline
Log In to Comment