Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F83461625
calculate_target_variables.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Tue, Sep 17, 06:42
Size
10 KB
Mime Type
text/x-python
Expires
Thu, Sep 19, 06:42 (2 d)
Engine
blob
Format
Raw Data
Handle
20842118
Attached To
rTWTEST master_thesis_Lee
calculate_target_variables.py
View Options
import
pandas
as
pd
import
numpy
as
np
import
seaborn
as
sns
import
matplotlib.pyplot
as
plt
from
scipy.stats
import
gmean
from
Bayesian
import
*
# Directory prefixes (Windows paths ending in a separator). File names are
# appended to them by plain string concatenation throughout this file.
output_file_path = "C:\\Users\\leetseng\\TWtest\\output\\"
input_file_path = "C:\\Users\\leetseng\\TWtest\\input\\"
# Raw input table, read as tab-separated values as soon as the module is
# imported (module-level side effect).
df_raw = pd.read_csv(input_file_path + "sludge_raw_use_this_for_baymean_test.tsv", sep='\t')  # sludgeDatasetMergeCalculated.tsv #sludgeDatasetMergeCalculatedAvglogHL.tsv
# Flag checked by get_bayesian_stats() before it builds its input arrays.
CURATE_DATA_POINTS = False
def main():
    """Run the whole pipeline on the module-level raw table.

    Steps: add the per-compound target variables, drop rows without a
    half-life (writing a describe() summary on the way), add the Bayesian
    mean/std columns, export the table as TSV and finally plot.

    Every step mutates and returns the same DataFrame object, so df_raw
    itself is modified.
    """
    enriched = calculate_target_variable(df_raw)
    filtered = describe_dropna_halflife(enriched)
    with_bayes = calculate_bay_mean_std(filtered)

    table_path = output_file_path + 'sludge_bay_PriorMuStd_2.tsv'
    with_bayes.to_csv(table_path, sep='\t')

    plot_path = output_file_path + 'Distribution_comparison_PriorMuStd_2.pdf'
    # NOTE(review): plot_distribution is not defined in this file; presumably
    # it is provided by `from Bayesian import *`. Confirm that a module-level
    # function of this name exists, otherwise this call raises NameError.
    plot_distribution(output_path=plot_path)
def calculate_target_variable(df):
    """Add per-compound aggregate columns to df in place and return it.

    Every aggregate is computed over all rows sharing the row's
    'reduced_smiles' and is repeated on each of those rows.

    Args:
        df: table with the columns 'reduced_smiles', 'halflife',
            'halflife_log', 'hl_biomass_corrected', 'acidity', 'temperature'
            and 'total_suspended_solids_concentration_start'.

    Returns:
        The same DataFrame object, with eleven new columns appended.
    """
    # (source column, gmean / median / std / spread target columns, spread function)
    half_life_targets = (
        ('halflife',
         'hl_log_gmean', 'hl_log_median', 'hl_log_std', 'hl_log_spread',
         get_hl_spread),
        ('hl_biomass_corrected',
         'biomass_hl_log_gmean', 'biomass_hl_log_median', 'biomass_hl_log_std',
         'biomass_hl_log_spread',
         get_biomass_hl_spread),
    )
    for source, gmean_col, median_col, std_col, spread_col, spread_of in half_life_targets:
        df[gmean_col] = np.log10(get_geometric_mean(df, source))
        df[median_col] = np.log10(get_median(df, source))
        # NOTE(review): this is log10 of the std of the raw values, which is
        # not the std of the log10 values -- confirm this is intended.
        df[std_col] = np.log10(get_std(df, source))
        df[spread_col] = spread_of(df)

    # NOTE(review): 'biomass_log_std' is the std of the untransformed
    # suspended-solids column; no log is applied despite the name.
    condition_targets = (
        ('acidity_std', 'acidity'),
        ('temperature_std', 'temperature'),
        ('biomass_log_std', 'total_suspended_solids_concentration_start'),
    )
    for target, source in condition_targets:
        df[target] = get_std(df, source)
    return df
def describe_dropna_halflife(df):
    """Drop rows without a half-life, then print and export summary statistics.

    Rows with NaN in 'halflife' or 'halflife_log' are removed from df in
    place. The describe() table of the remaining rows is printed and written
    as TSV into the output directory.

    Returns:
        The same (now filtered) DataFrame object that was passed in.
    """
    # NaN half-lives must go, otherwise the bmean/bstd calculation raises ValueError.
    df.dropna(subset=['halflife', 'halflife_log'], inplace=True)
    # df.to_csv(output_file_path + 'sludge_calculated_test_for_baycalculation.tsv', sep='\t') #'sludge_calculated.tsv'
    summary = df.copy().describe()
    print("Summary of loaded data:\n------------------\n", summary)
    summary_path = output_file_path + 'sludge_calculated_test_for_baycalculation_describe.tsv'
    summary.to_csv(summary_path, sep='\t')
    return df
def calculate_bay_mean_std(df):
    """Append the Bayesian mean and std of the half-life as two columns.

    The values come from get_bayesian_stats(df) and are stored in
    'hl_bayesian_mean' and 'hl_bayesian_std'. df is modified in place and
    returned.
    """
    posterior_means, posterior_stds = get_bayesian_stats(df)
    df['hl_bayesian_mean'] = posterior_means
    df['hl_bayesian_std'] = posterior_stds
    # df.to_csv(output_file_path+'sludge_calculated_test_for_baycalculation.tsv', sep='\t')
    return df
def get_std(df, column):
    """Return, for every row, the standard deviation of `column` within its compound.

    Rows belong to the same compound when their 'reduced_smiles' are equal.
    The deviation is computed with np.nanstd (NaN ignored, NumPy's default
    ddof=0). A deviation of exactly 0 is reported as NaN instead.

    Args:
        df: DataFrame with a 'reduced_smiles' column and `column`.
        column: name of the numeric column to aggregate.

    Returns:
        A list with one value per row of df, in row order.
    """
    per_compound = {}
    new = []
    for smiles in df['reduced_smiles']:
        # Compute each compound once instead of once per row.
        if smiles not in per_compound:
            this = df.loc[df['reduced_smiles'] == smiles]
            std = np.nanstd(this[column])
            # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
            per_compound[smiles] = np.nan if std == 0 else std
        new.append(per_compound[smiles])
    return new
def get_mean(df, column):
    """Return, for every row, the NaN-ignoring mean of `column` within its compound.

    Rows belong to the same compound when their 'reduced_smiles' are equal.
    The result is a list with one value per row of df, in row order.
    """
    smiles_column = df['reduced_smiles']
    return [
        np.nanmean(df.loc[smiles_column == smiles][column])
        for smiles in smiles_column
    ]
def get_median(df, column):
    """Return, for every row, the NaN-ignoring median of `column` within its compound.

    Rows belong to the same compound when their 'reduced_smiles' are equal.
    The result is a list with one value per row of df, in row order.
    """
    medians = []
    compound_ids = df['reduced_smiles']
    for compound_id in compound_ids:
        same_compound = df.loc[compound_ids == compound_id]
        medians.append(np.nanmedian(same_compound[column]))
    return medians
def get_hl_spread(df):
    """Return, for every row, the range (max - min) of 'halflife_log' within its compound.

    Rows belong to the same compound when their 'reduced_smiles' are equal.
    Missing values are skipped: Series.max()/min() ignore NaN, whereas the
    builtin max()/min() give results that depend on where the NaN sits in
    the data. A compound with no usable value (including rows whose
    'reduced_smiles' is missing) gets NaN instead of raising ValueError.

    Args:
        df: DataFrame with the columns 'reduced_smiles' and 'halflife_log'.

    Returns:
        A list with one spread per row of df, in row order.
    """
    spread_by_smiles = {}
    new = []
    for smiles in df['reduced_smiles']:
        # Compute each compound once instead of once per row.
        if smiles not in spread_by_smiles:
            values = df.loc[df['reduced_smiles'] == smiles, 'halflife_log']
            spread_by_smiles[smiles] = values.max() - values.min()
        new.append(spread_by_smiles[smiles])
    return new
def get_biomass_hl_spread(df):
    """Return, for every row, the range of log10('hl_biomass_corrected') within its compound.

    Rows belong to the same compound when their 'reduced_smiles' are equal.
    Missing values are skipped: Series.max()/min() ignore NaN, whereas the
    builtin max()/min() give results that depend on where the NaN sits in
    the data. A compound with no usable value (including rows whose
    'reduced_smiles' is missing) gets NaN instead of raising ValueError.

    Args:
        df: DataFrame with the columns 'reduced_smiles' and
            'hl_biomass_corrected'.

    Returns:
        A list with one spread per row of df, in row order.
    """
    spread_by_smiles = {}
    new = []
    for smiles in df['reduced_smiles']:
        # Compute each compound once instead of once per row.
        if smiles not in spread_by_smiles:
            raw_values = df.loc[df['reduced_smiles'] == smiles, 'hl_biomass_corrected']
            log_values = np.log10(raw_values)
            spread_by_smiles[smiles] = log_values.max() - log_values.min()
        new.append(spread_by_smiles[smiles])
    return new
def legal(value, name):
    """Tell whether `value` is usable, i.e. neither NaN nor 0.

    When the value is rejected, a message naming the problem is printed and
    False is returned; otherwise True is returned silently.
    """
    problem = None
    if np.isnan(value):
        problem = f"Problem: no {name}"
    elif value == 0:
        problem = f"Problem: {name} is 0"

    if problem is None:
        return True
    print(problem)
    return False
def g_mean(x):
    """Return the geometric mean of x, computed as exp(mean(log(x)))."""
    return np.exp(np.log(x).mean())
def get_geometric_mean(df, column):
    """Return, for every row, the geometric mean of `column` within its compound.

    Rows belong to the same compound when their 'reduced_smiles' are equal;
    the aggregation itself is delegated to g_mean(). The result is a list
    with one value per row of df, in row order.
    """
    compound_ids = df['reduced_smiles']
    geometric_means = []
    for compound_id in compound_ids:
        same_compound = df.loc[compound_ids == compound_id]
        geometric_means.append(g_mean(same_compound[column]))
    return geometric_means
# def get_gmean(df, column):
# new = []
# for index, row in df.iterrows():
# this = df.loc[df['reduced_smiles'] == row['reduced_smiles']
# v = this[column]
# if not legal(v, f"{column}"):
# return np.NaN
#
# if legal(v, f"{column}"):
# std = gmean(v)
# new.append(std)
# return new
def process_comment_list(comment_list):
    """Reduce each comment to a one-character marker.

    A comment containing '<' maps to '<'; otherwise one containing '>' maps
    to '>'; everything else maps to ''. Any entry that is not a string
    (NaN of any float type for a missing comment, None, numbers) also maps
    to '' instead of failing on the substring test.

    Args:
        comment_list: iterable of comments, e.g. a DataFrame column.

    Returns:
        A list of '<', '>' or '' with one entry per input comment.
    """
    new_list = []
    for comment in comment_list:
        if not isinstance(comment, str):
            # Missing or non-text comment: nothing to extract.
            new_list.append('')
        elif '<' in comment:
            new_list.append('<')
        elif '>' in comment:
            new_list.append('>')
        else:
            new_list.append('')
    return new_list
def get_bayesian_stats(df):
    """Compute a posterior mean and std of 'halflife_log' for every row's compound.

    Rows belong to the same compound when their 'reduced_smiles' are equal.
    For each compound the 'halflife_log' values and the '<'/'>' markers
    extracted from 'halflife_comment' are handed to a `Bayesian` object. The
    result is computed once per compound and reused for its other rows.

    `Bayesian` is not defined in this file; presumably it is provided by
    `from Bayesian import *`.

    Args:
        df: DataFrame with the columns 'reduced_smiles', 'halflife_log' and
            'halflife_comment'.

    Returns:
        (mean_list, std_list): two lists with one value per row of df, each
        rounded to two decimals.

    Raises:
        NotImplementedError: if CURATE_DATA_POINTS is set. No curation step
            exists, and the flag previously led to an UnboundLocalError
            because `y` was never assigned.
    """
    mean_list = []
    std_list = []
    results = {}  # {reduced_smiles: (mean, std)}
    for smiles in df['reduced_smiles']:
        if smiles in results:
            mean, std = results[smiles]
        else:
            this = df.loc[df['reduced_smiles'] == smiles]
            comment_list_raw = process_comment_list(this["halflife_comment"])
            y_raw = np.array(this['halflife_log'])
            if CURATE_DATA_POINTS:
                raise NotImplementedError(
                    "CURATE_DATA_POINTS is set, but data point curation is not implemented"
                )
            y = y_raw
            comment_list = comment_list_raw
            print("\nCOMPOUND reduced_smiles {}".format(smiles))
            print("Compute bayes for {} with comments {}".format(y, comment_list))
            bayesian = Bayesian(y=y, comment_list=comment_list)
            bayesian.set_prior_mu(mean=0.2, std=2)  # (Original: mean=1.5, std=2) Set prior_mu_std as 2
            bayesian.set_prior_sigma(mean=0.4, std=0.9)  # (Original: mean=0.2, std=0.5)
            mean, std = bayesian.get_posterior_distribution()
            results[smiles] = (mean, std)
            print('mean: {}, std: {}'.format(mean, std))
            # Plot the data distribution.
            # NOTE(review): every compound is plotted to the same path, so
            # presumably each call overwrites the previous file and only the
            # last compound's plot survives -- confirm against Bayesian.
            bayesian.plot_distribution(output_path=output_file_path + 'Distribution_comparison_PriorMuStd_2.pdf')
        mean_list.append(round(mean, 2))
        std_list.append(round(std, 2))
    return mean_list, std_list
# column_groupby_smiles_sludge(df_raw, 'log_hl_biomass_corrected')
# def rename_column(df):
# df_rename = df.rename(columns={
# f'mean_log_hl_combined': 'hl_log_mean',
# f'median_log_hl_combined': 'hl_log_median',
# f'geomean_log_hl_combined': 'hl_log_gmean',
# f'std_log_hl_combined': 'hl_log_std',
# f'mean_log_hl_biomass_corrected': 'biomass_hl_log_mean',
# f'median_log_hl_biomass_corrected': 'biomass_hl_log_median',
# f'geomean_log_hl_biomass_corrected': 'biomass_hl_log_gmean',
# f'std_log_hl_biomass_corrected': 'biomass_hl_log_std'
# })
# return df_rename
def column_groupby_smiles_sludge(df, column):
    """Add per-compound mean, median, geometric mean and std columns, then export.

    Rows belong to the same compound when their 'reduced_smiles' are equal.
    The new columns are named 'mean_<column>', 'median_<column>',
    'geomean_<column>' and 'std_<column>'; the known 'log_hl_combined' and
    'log_hl_biomass_corrected' variants are then renamed to their short
    'hl_log_*' / 'biomass_hl_log_*' names. df is modified in place and
    written as TSV into the output directory.

    Args:
        df: DataFrame with the columns 'reduced_smiles',
            'hl_biomass_corrected' and `column`.
        column: name of the numeric column to aggregate.

    Returns:
        The modified DataFrame. (Previously the result of
        df.to_csv(path) was returned, which is always None.)
    """
    mean_column_list = []
    median_column_list = []
    geomean_column_list = []
    std_column_list = []
    for smiles in df['reduced_smiles']:
        x = df.loc[df['reduced_smiles'] == smiles]
        mean_column_list.append(np.nanmean(x[column]))
        median_column_list.append(np.nanmedian(x[column]))
        # NOTE(review): the geometric mean is always taken from
        # 'hl_biomass_corrected', whatever `column` is -- confirm this is
        # intended when `column` is not the biomass-corrected half-life.
        geomean_column_list.append(np.log10(gmean(x['hl_biomass_corrected'])))
        std_column_list.append(np.nanstd(x[column]))

    df[f'mean_{column}'] = mean_column_list
    df[f'median_{column}'] = median_column_list
    df[f'geomean_{column}'] = geomean_column_list
    df[f'std_{column}'] = std_column_list
    df.rename(columns={
        'mean_log_hl_combined': 'hl_log_mean',
        'median_log_hl_combined': 'hl_log_median',
        'geomean_log_hl_combined': 'hl_log_gmean',
        'std_log_hl_combined': 'hl_log_std',
        'mean_log_hl_biomass_corrected': 'biomass_hl_log_mean',
        'median_log_hl_biomass_corrected': 'biomass_hl_log_median',
        'geomean_log_hl_biomass_corrected': 'biomass_hl_log_gmean',
        'std_log_hl_biomass_corrected': 'biomass_hl_log_std',
    }, inplace=True)
    # to_csv() returns None when given a path, so return the frame itself.
    df.to_csv(output_file_path + "sludgeDatasetMergeCalculatedAvglogHLBiomass.tsv", sep='\t')
    return df
# Call main() only when this file is executed as a script.
if __name__ == '__main__':
    main()
"""
The following code is my initial script for fetching the avg values of logDT50 and logDT50'
def column_groupby_smiles_sludge(df, column):
mean_column_list = []
median_column_list = []
geomean_column_list = []
std_column_list = []
for index, row in df.iterrows():
# if not legal(row[column], f"row[{column}]"):
# return np.NaN
#
# elif legal(row[column], f"row[{column}]"):
x = df.loc[df['smiles'] == row['smiles']]
mean_ = np.nanmean(x[column])
median_ = np.nanmedian(x[column])
g_ = gmean(x['hl_biomass_corrected'])
geomean_ = np.log10(g_)
std_ = np.nanstd(x[column])
mean_column_list.append(mean_)
median_column_list.append(median_)
geomean_column_list.append(geomean_)
std_column_list.append(std_)
df[f'mean_{column}'] = mean_column_list
df[f'median_{column}'] = median_column_list
df[f'geomean_{column}'] = geomean_column_list
df[f'std_{column}'] = std_column_list
df.rename(columns={
'mean_log_hl_combined': 'hl_log_mean',
'median_log_hl_combined': 'hl_log_median',
'geomean_log_hl_combined': 'hl_log_gmean',
'std_log_hl_combined': 'hl_log_std',
'mean_log_hl_biomass_corrected': 'biomass_hl_log_mean',
'median_log_hl_biomass_corrected': 'biomass_hl_log_median',
'geomean_log_hl_biomass_corrected': 'biomass_hl_log_gmean',
'std_log_hl_biomass_corrected': 'biomass_hl_log_std'
}, inplace=True)
df_new = df.to_csv(output_file_path + "sludgeDatasetMergeCalculatedAvglogHLBiomass.tsv", sep='\t')
return df_new
"""
Event Timeline
Log In to Comment