calculate_target_variables.py
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Tue, Sep 17, 06:42

calculate_target_variables.py
View Options

	import pandas as pd
	import numpy as np
	import seaborn as sns
	import matplotlib.pyplot as plt
	from scipy.stats import gmean
	from Bayesian import *

	output_file_path = "C:\\Users\\leetseng\\TWtest\\output\\"
	input_file_path = "C:\\Users\\leetseng\\TWtest\\input\\"
	df_raw = pd.read_csv(input_file_path + "sludge_raw_use_this_for_baymean_test.tsv", sep='\t') # sludgeDatasetMergeCalculated.tsv #sludgeDatasetMergeCalculatedAvglogHL.tsv
	CURATE_DATA_POINTS = False

	def main():
	df = calculate_target_variable(df_raw)
	df1 = describe_dropna_halflife(df)
	df_ = calculate_bay_mean_std(df1)
	df_.to_csv(output_file_path+'sludge_bay_PriorMuStd_2.tsv', sep='\t')
	plot_distribution(output_path=output_file_path + 'Distribution_comparison_PriorMuStd_2.pdf')

	def calculate_target_variable(df):
	df['hl_log_gmean'] = np.log10(get_geometric_mean(df, 'halflife'))
	df['hl_log_median'] = np.log10(get_median(df, 'halflife'))
	df['hl_log_std'] = np.log10(get_std(df, 'halflife'))
	df['hl_log_spread'] = get_hl_spread(df)
	df['biomass_hl_log_gmean'] = np.log10(get_geometric_mean(df, 'hl_biomass_corrected'))
	df['biomass_hl_log_median'] = np.log10(get_median(df, 'hl_biomass_corrected'))
	df['biomass_hl_log_std'] = np.log10(get_std(df, 'hl_biomass_corrected'))
	df['biomass_hl_log_spread'] = get_biomass_hl_spread(df)
	df['acidity_std'] = get_std(df, 'acidity')
	df['temperature_std'] = get_std(df, 'temperature')
	df['biomass_log_std'] = get_std(df, 'total_suspended_solids_concentration_start')
	return df

	def describe_dropna_halflife(df):
	df.dropna(subset=['halflife', 'halflife_log'], inplace=True) # Remove the NaN in halflife column, otherwise you get ValueError in bmean, bstd calculation.
	# df.to_csv(output_file_path + 'sludge_calculated_test_for_baycalculation.tsv', sep='\t') #'sludge_calculated.tsv'
	df_ = df.copy()
	description = df_.describe()
	print("Summary of loaded data:\n------------------\n", description)
	description.to_csv(output_file_path + 'sludge_calculated_test_for_baycalculation_describe.tsv', sep='\t')
	return df

	def calculate_bay_mean_std(df):

	bmean, bstd = get_bayesian_stats(df)
	df['hl_bayesian_mean'] = bmean
	df['hl_bayesian_std'] = bstd
	# df.to_csv(output_file_path+'sludge_calculated_test_for_baycalculation.tsv', sep='\t')
	return df


	def get_std(df, column):
	new = []
	for index, row in df.iterrows():
	this = df.loc[df['reduced_smiles'] == row['reduced_smiles']]
	std = np.nanstd(this[column])
	if std == 0:
	new.append(np.NaN)
	else:
	new.append(std)
	return new

	def get_mean(df, column):
	new = []
	for index, row in df.iterrows():
	this = df.loc[df['reduced_smiles'] == row['reduced_smiles']]
	std = np.nanmean(this[column])
	new.append(std)
	return new

	def get_median(df, column):
	new = []
	for index, row in df.iterrows():
	this = df.loc[df['reduced_smiles'] == row['reduced_smiles']]
	std = np.nanmedian(this[column])
	new.append(std)
	return new

	def get_hl_spread(df):
	new = []
	for index, row in df.iterrows():
	this = df.loc[df['reduced_smiles'] == row['reduced_smiles']]
	spread = max(this['halflife_log']) - min(this['halflife_log'])
	new.append(spread)
	return new

	def get_biomass_hl_spread(df):
	new = []
	for index, row in df.iterrows():
	this = df.loc[df['reduced_smiles'] == row['reduced_smiles']]
	spread = max(np.log10(this['hl_biomass_corrected'])) - min(np.log10(this['hl_biomass_corrected']))
	new.append(spread)
	return new


	def legal(value, name):
	if np.isnan(value):
	print(f"Problem: no {name}")
	return False
	elif value == 0:
	print(f"Problem: {name} is 0")
	return False
	return True

	def g_mean(x):
	a = np.log(x)
	return np.exp(a.mean())

	def get_geometric_mean(df, column):
	new = []
	for index, row in df.iterrows():
	this = df.loc[df['reduced_smiles'] == row['reduced_smiles']]
	gmean = g_mean(this[column])
	new.append(gmean)
	return new

	# def get_gmean(df, column):
	# new = []
	# for index, row in df.iterrows():
	# this = df.loc[df['reduced_smiles'] == row['reduced_smiles']
	# v = this[column]
	# if not legal(v, f"{column}"):
	# return np.NaN
	#
	# if legal(v, f"{column}"):
	# std = gmean(v)
	# new.append(std)
	# return new

	def process_comment_list(comment_list):
	new_list = []
	for comment in comment_list:
	if type(comment) == float:
	new_list.append('')
	elif '<' in comment:
	new_list.append('<')
	elif '>' in comment:
	new_list.append('>')
	else:
	new_list.append('')
	return new_list

	def get_bayesian_stats(df):
	mean_list = []
	std_list = []
	results = {} # {'index': (mean, std)}
	for index, row in df.iterrows():
	if row['reduced_smiles'] in results.keys():
	mean, std = results[row['reduced_smiles']]
	else:
	this = df.loc[df['reduced_smiles'] == row['reduced_smiles']]
	comment_list_raw = process_comment_list(this["halflife_comment"])
	y_raw = np.array(this['halflife_log'])
	if CURATE_DATA_POINTS == True:
	pass
	else:
	y = y_raw
	comment_list = comment_list_raw
	print("\nCOMPOUND reduced_smiles {}".format(row['reduced_smiles']))
	print("Compute bayes for {} with comments {}".format(y, comment_list))
	bayesian = Bayesian(y=y, comment_list=comment_list)
	bayesian.set_prior_mu(mean=0.2, std=2) #(Original: mean=1.5, std=2) Set prior_mu_std as 2
	bayesian.set_prior_sigma(mean=0.4, std=0.9) #(Original: mean=0.2, std=0.5)
	mean, std = bayesian.get_posterior_distribution()
	results[row['reduced_smiles']] = (mean, std)
	print('mean: {}, std: {}'.format(mean, std))
	bayesian.plot_distribution(output_path=output_file_path + 'Distribution_comparison_PriorMuStd_2.pdf') ## I add this line to plot the data distribution.
	mean_list.append(round(mean, 2))
	std_list.append(round(std, 2))

	return mean_list, std_list

	# column_groupby_smiles_sludge(df_raw, 'log_hl_biomass_corrected')
	# def rename_column(df):
	# df_rename = df.rename(columns={
	# f'mean_log_hl_combined': 'hl_log_mean',
	# f'median_log_hl_combined': 'hl_log_median',
	# f'geomean_log_hl_combined': 'hl_log_gmean',
	# f'std_log_hl_combined': 'hl_log_std',
	# f'mean_log_hl_biomass_corrected': 'biomass_hl_log_mean',
	# f'median_log_hl_biomass_corrected': 'biomass_hl_log_median',
	# f'geomean_log_hl_biomass_corrected': 'biomass_hl_log_gmean',
	# f'std_log_hl_biomass_corrected': 'biomass_hl_log_std'
	# })
	# return df_rename

	def column_groupby_smiles_sludge(df, column):
	mean_column_list = []
	median_column_list = []
	geomean_column_list = []
	std_column_list = []
	for index, row in df.iterrows():
	# if not legal(row[column], f"row[{column}]"):
	# return np.NaN
	#
	# elif legal(row[column], f"row[{column}]"):

	x = df.loc[df['reduced_smiles'] == row['reduced_smiles']]
	mean_ = np.nanmean(x[column])
	median_ = np.nanmedian(x[column])
	g_ = gmean(x['hl_biomass_corrected'])
	geomean_ = np.log10(g_)
	std_ = np.nanstd(x[column])
	mean_column_list.append(mean_)
	median_column_list.append(median_)
	geomean_column_list.append(geomean_)
	std_column_list.append(std_)
	df[f'mean_{column}'] = mean_column_list
	df[f'median_{column}'] = median_column_list
	df[f'geomean_{column}'] = geomean_column_list
	df[f'std_{column}'] = std_column_list
	df.rename(columns={
	'mean_log_hl_combined': 'hl_log_mean',
	'median_log_hl_combined': 'hl_log_median',
	'geomean_log_hl_combined': 'hl_log_gmean',
	'std_log_hl_combined': 'hl_log_std',
	'mean_log_hl_biomass_corrected': 'biomass_hl_log_mean',
	'median_log_hl_biomass_corrected': 'biomass_hl_log_median',
	'geomean_log_hl_biomass_corrected': 'biomass_hl_log_gmean',
	'std_log_hl_biomass_corrected': 'biomass_hl_log_std'
	}, inplace=True)
	df_new = df.to_csv(output_file_path + "sludgeDatasetMergeCalculatedAvglogHLBiomass.tsv", sep='\t')

	return df_new

	if __name__ == '__main__':
	main()

	"""
	The following code is my initial script for fetching the avg values of logDT50 and logDT50'

	def column_groupby_smiles_sludge(df, column):
	mean_column_list = []
	median_column_list = []
	geomean_column_list = []
	std_column_list = []
	for index, row in df.iterrows():
	# if not legal(row[column], f"row[{column}]"):
	# return np.NaN
	#
	# elif legal(row[column], f"row[{column}]"):

	x = df.loc[df['smiles'] == row['smiles']]
	mean_ = np.nanmean(x[column])
	median_ = np.nanmedian(x[column])
	g_ = gmean(x['hl_biomass_corrected'])
	geomean_ = np.log10(g_)
	std_ = np.nanstd(x[column])
	mean_column_list.append(mean_)
	median_column_list.append(median_)
	geomean_column_list.append(geomean_)
	std_column_list.append(std_)
	df[f'mean_{column}'] = mean_column_list
	df[f'median_{column}'] = median_column_list
	df[f'geomean_{column}'] = geomean_column_list
	df[f'std_{column}'] = std_column_list
	df.rename(columns={
	'mean_log_hl_combined': 'hl_log_mean',
	'median_log_hl_combined': 'hl_log_median',
	'geomean_log_hl_combined': 'hl_log_gmean',
	'std_log_hl_combined': 'hl_log_std',
	'mean_log_hl_biomass_corrected': 'biomass_hl_log_mean',
	'median_log_hl_biomass_corrected': 'biomass_hl_log_median',
	'geomean_log_hl_biomass_corrected': 'biomass_hl_log_gmean',
	'std_log_hl_biomass_corrected': 'biomass_hl_log_std'
	}, inplace=True)
	df_new = df.to_csv(output_file_path + "sludgeDatasetMergeCalculatedAvglogHLBiomass.tsv", sep='\t')

	return df_new
	"""

calculate_target_variables.pyNo OneTemporaryActions

File Metadata

calculate_target_variables.pyView Options

Event Timeline

calculate_target_variables.py
No OneTemporary
Actions

calculate_target_variables.py
View Options