calculation.py

import pandas as pd
import numpy as np

file_location = "C:\\Users\\leetseng\\TWtest"
input_file_path_1 = file_location + '\\input\\sludgeDatasetFromOriginal.tsv'  # previously 'input/sludgeWithSmiles.tsv'
input_file_path_2 = file_location + '\\input\\sludgeDatasetFromLeo.tsv'
input_file_path_3 = file_location + '\\input\\sludgeDatasetFromRich.tsv'
output_file_path_full = file_location + '\\output\\sludgeDatasetMergeCalculated.tsv'
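# Aside: one way to avoid mixing '/' and '\\' in the paths above is pathlib,
# which joins segments with the correct separator for the OS. This is only a
# sketch; the *_alt names are hypothetical and are not used by the script.
from pathlib import Path
base_dir = Path(r"C:\Users\leetseng\TWtest")
input_file_path_1_alt = base_dir / 'input' / 'sludgeDatasetFromOriginal.tsv'
output_file_path_full_alt = base_dir / 'output' / 'sludgeDatasetMergeCalculated.tsv'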
data1 = pd.read_csv(input_file_path_1, sep='\t')
data2 = pd.read_csv(input_file_path_2, sep='\t')
data3 = pd.read_csv(input_file_path_3, sep='\t')
# The three files come from different iterations, so their row indices are not
# continuous; reset them and re-number the merged frame from 1.
data1 = data1.reset_index(drop=True)
data2 = data2.reset_index(drop=True)
data3 = data3.reset_index(drop=True)
data_merge = pd.concat([data1, data2, data3], ignore_index=True)
data_merge.index = np.arange(1, len(data_merge) + 1)
data_merge.index.name = 'index'
# print(data_merge.head(2))
print(data_merge.columns.tolist())
# Sanity checks: compare the data size before and after the merge, make sure no
# rows were dropped, and confirm the rate constant is available before
# computing half-lives from it.
# data_merge['TSS_mean'] = data_merge[['total_suspended_solids_concentration_start', 'total_suspended_solids_concentration_end']].mean(axis=1)
print('a', len(data_merge['rateconstant']))
print('b', len(data_merge['total_suspended_solids_concentration_start']))
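# Optional sanity check (a minimal sketch of the "size before and after" note
# above): the merged frame should contain every row of the three inputs,
# i.e. nothing was dropped by the concatenation.
assert len(data_merge) == len(data1) + len(data2) + len(data3), \
    'merged row count does not match the sum of the input files'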
def main():
    k_list = []
    k_biomass_list = []
    hl_list = []
    hl_biomass_list = []
    for index, row in data_merge.iterrows():
        # Compute k first so it can be reused for the half-life; the
        # 'k_combined' column does not yet exist on the row during iteration.
        k = get_k(row)
        k_biomass = get_k_biomass(row)
        k_list.append(k)
        k_biomass_list.append(k_biomass)
        DT50 = get_DT50(row, k)
        DT50_biomass = get_DT50_biomass(row)
        hl_list.append(DT50)
        hl_biomass_list.append(DT50_biomass)
    data_merge['k_combined'] = k_list  # k_combined = the reported k plus k derived from the half-life
    data_merge['k_biomass_corrected'] = k_biomass_list
    data_merge['hl_combined'] = hl_list
    data_merge['hl_biomass_corrected'] = hl_biomass_list
    # np.log10 propagates NaN for missing values (zero or negative values give
    # -inf/NaN with a RuntimeWarning), so rows without data stay NaN in the log columns.
    data_merge['log_k_combined'] = np.log10(data_merge['k_combined'])
    data_merge['log_k_biomass_corrected'] = np.log10(data_merge['k_biomass_corrected'])
    data_merge['log_hl_combined'] = np.log10(data_merge['hl_combined'])
    data_merge['log_hl_biomass_corrected'] = np.log10(data_merge['hl_biomass_corrected'])
    data_merge.to_csv(output_file_path_full, mode='w', sep='\t')
def get_k(row):
    # Rate constant in 1/day: take the reported value when present, otherwise
    # derive it from the half-life according to the reaction order.
    has_k = pd.notna(row['rateconstant']) and row['rateconstant'] != 0
    has_hl = pd.notna(row['halflife'])
    if has_k and row['rateconstant_unit'] == '1 / day':
        rateconstant = row['rateconstant']
    elif has_k and row['rateconstant_unit'] == 'L / (g TSS * day)':
        # Convert the biomass-normalised constant using the TSS at the start of the experiment.
        rateconstant = row['rateconstant'] / row['total_suspended_solids_concentration_start']
    elif has_k and has_hl:
        rateconstant = row['rateconstant']
    elif not has_k and has_hl and row['reaction_order'] == 'Zero order':
        rateconstant = row['total_suspended_solids_concentration_start'] / (2 * row['halflife'])
    elif not has_k and has_hl and row['reaction_order'] == 'First order':
        rateconstant = np.log(2) / row['halflife']
    elif not has_k and has_hl and row['reaction_order'] == 'Pseudo first order':
        rateconstant = np.log(2) / (row['halflife'] * row['total_suspended_solids_concentration_start'])
    elif not has_k and has_hl and pd.isna(row['reaction_order']):
        # Default: treat an unspecified reaction order as first order.
        rateconstant = np.log(2) / row['halflife']
    else:
        rateconstant = np.nan
    return rateconstant
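# Illustrative self-check with a hypothetical row: when only a first-order
# half-life of 2 days is reported, get_k falls back to ln(2)/halflife ≈ 0.347 1/day.
_example_row_k = pd.Series({
    'rateconstant': np.nan,
    'rateconstant_unit': np.nan,
    'halflife': 2.0,
    'reaction_order': 'First order',
    'total_suspended_solids_concentration_start': 1.0,
})
assert abs(get_k(_example_row_k) - np.log(2) / 2.0) < 1e-9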
def get_k_biomass(row):
    # Biomass-corrected rate constant: only computable when a non-zero rate
    # constant with a recognised unit is reported.
    has_k = pd.notna(row['rateconstant']) and row['rateconstant'] != 0
    if has_k and row['rateconstant_unit'] == '1 / day':
        rateconstant_biomass = row['total_suspended_solids_concentration_start'] * row['rateconstant']
    elif has_k and row['rateconstant_unit'] == 'L / (g TSS * day)':
        rateconstant_biomass = row['rateconstant']
    else:
        rateconstant_biomass = np.nan
    return rateconstant_biomass
def get_DT50(row, k_combined):
    # Half-life: take the reported value when present, otherwise derive it from
    # the combined rate constant computed by get_k (passed in as k_combined).
    if pd.isna(row['halflife']):
        if row['reaction_order'] == 'Zero order':
            hl = row['total_suspended_solids_concentration_start'] / (2 * k_combined)
        elif row['reaction_order'] == 'First order':
            hl = np.log(2) / k_combined
        elif row['reaction_order'] == 'Pseudo first order':
            hl = np.log(2) / (k_combined * row['total_suspended_solids_concentration_start'])
        elif pd.notna(row['rateconstant']) and k_combined != 0:
            hl = np.log(2) / k_combined
        else:
            hl = np.nan
    elif pd.notna(row['halflife']) and row['halflife'] != 0:
        hl = row['halflife']
    else:
        hl = np.nan
        print('No such value!')
    return hl
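# Illustrative self-check with hypothetical values: a reported half-life is
# returned as-is, independent of the rate constant that is passed in.
_example_row_hl = pd.Series({
    'halflife': 5.0,
    'rateconstant': 0.1,
    'reaction_order': 'First order',
    'total_suspended_solids_concentration_start': 1.0,
})
assert get_DT50(_example_row_hl, np.log(2) / 5.0) == 5.0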
def get_DT50_biomass(row):
    # Biomass-corrected half-life, only when a non-zero rate constant is reported.
    if pd.notna(row['rateconstant']) and row['rateconstant'] != 0:
        hl_biomass = row['halflife'] / row['total_suspended_solids_concentration_start']
    else:
        hl_biomass = np.nan
    return hl_biomass
if __name__ == '__main__':
    main()
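# Optional sketch for the log10 columns written in main(): np.log10 already
# propagates NaN for missing values, but zero or negative entries trigger a
# RuntimeWarning and yield -inf/NaN. Masking non-positive values first keeps
# the log columns clean. safe_log10 is a hypothetical helper and is not called
# by the pipeline above.
def safe_log10(series):
    # Values <= 0 are masked to NaN before taking the log; NaN inputs stay NaN.
    return np.log10(series.where(series > 0))
# Example (hypothetical): data_merge['log_k_combined'] = safe_log10(data_merge['k_combined'])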
# print('k std: {}'.format(data_merge['k_combined'].std()))
# print('k biomass corrected std: {}'.format(data_merge['k_biomass_corrected'].std()))
# print('hl std: {}'.format(data_merge['hl_combined'].std()))
# print('hl biomass corrected std: {}'.format(data_merge['hl_biomass_corrected'].std()))
# print('log k normal std: {}'.format(data_merge['log_k_combined'].std()))
# print('log k biomass corrected std: {}'.format(data_merge['log_k_biomass_corrected'].std()))
# print('log hl std: {}'.format(data_merge['log_hl_combined'].std()))
# print('log hl biomass corrected std: {}'.format(data_merge['log_hl_biomass_corrected'].std()))
#create the set of SMILES
# list_of_canonicalize_smiles = data_merge['canonicalize_smiles'].values.tolist()
# set_of_canonicalize_smiles = set(list_of_canonicalize_smiles)
# print(set_of_canonicalize_smiles)
#simple calculate two column without any filter
# data_merge['TSS_mean'] = data_merge[['total_suspended_solids_concentration_start', 'total_suspended_solids_concentration_end']].mean(1)
# data_merge['k_normal'] = data_merge['TSS_mean'] * data_merge['rateconstant'] #data_merge['k_normal'] = data_merge.apply(lambda x: x['TSS_mean'] * x['rateconstant'], axis=1)
# data_merge['k_biomass_corrected'] = data_merge['rateconstant'] / data_merge['TSS_mean'] #data_merge['k_biomass_corrected'] = data_merge.apply(lambda x: x['rateconstant'] / x['TSS_mean'], axis=1)
#filter the empty row or the zero value in the merge data
# valid_data = data_merge[data_merge['rateconstant'].notnull() & data_merge['rateconstant'] != 0].copy() #you need to let python know to work on original data or the copy of data, or it might modify your original data
# print(valid_data.shape)
# print(data_merge['rateconstant'].isnull().value_counts())
# print(data_merge['rateconstant'].isnull().describe())
# def get_k_relevent_data(data):
# if data[data['rateconstant_unit'] == 'L / (g TSS * day)'].copy(): # .any() for the missing value in Series
# data['k_normal'] = data.apply(lambda x: x['TSS_mean'] * x['rateconstant'], axis=1)
# data['k_biomass_corrected'] = data.apply(lambda x: x['rateconstant'], axis=1)
# data['log(k_normal)'] = data.apply(lambda x: np.log10(x['k_normal']), axis=1)
# data['log(k_biomass_corrected)'] = data.apply(lambda x: np.log10(x['k_biomass_corrected']), axis=1)
# elif data[data['rateconstant_unit'] == '1 / day'].copy():
# data['k_normal'] = data.apply(lambda x: x['rateconstant'], axis=1)
# data['k_biomass_corrected'] = data.apply(lambda x: x['rateconstant'] / x['TSS_mean'], axis=1)
# data['log(k_normal)'] = data.apply(lambda x: np.log10(x['k_normal']), axis=1)
# data['log(k_biomass_corrected)'] = data.apply(lambda x: np.log10(x['k_biomass_corrected']), axis=1)
# else:
# pass
#
# return data['k_normal'], data['k_biomass_corrected'], data['log(k_normal)'], data['log(k_biomass_corrected)']
#
# print(get_k_relevent_data(valid_data))
#
#
# def get_hl_relevent_data(data):
#
# for row in valid_data.iterrows():
# if valid_data.loc[valid_data['rateconstant_unit'] == 'L / (g TSS * day)', :]:
# valid_data['k_normal'] = valid_data['TSS_mean'] * valid_data['rateconstant']
# k_tss['k_biomass_corrected'] = k_tss['rateconstant']
# k_tss['log(k_normal)'] = np.log10(k_tss['k_normal'])
# k_tss['log(k_biomass_corrected)'] = np.log10(k_tss['k_biomass_corrected'])
#
#
#
# k_tss = valid_data.loc[valid_data['rateconstant_unit'] == 'L / (g TSS * day)'].copy()
# k_tss['k_normal'] = k_tss['TSS_mean'] * k_tss['rateconstant']
# k_tss['k_biomass_corrected'] = k_tss['rateconstant']
# k_tss['log(k_normal)'] = np.log10(k_tss['k_normal'])
# k_tss['log(k_biomass_corrected)'] = np.log10(k_tss['k_biomass_corrected'])
#
# k = valid_data.loc[valid_data['rateconstant_unit'] == '1 / day'].copy()
# k_tss['k_normal'] = k['rateconstant']
# k_tss['k_biomass_corrected'] = k['rateconstant'] / k['TSS_mean']
# k_tss['log(k_normal)'] = np.log10(k['k_normal'])
# k_tss['log(k_biomass_corrected)'] = np.log10(k['k_biomass_corrected'])
#
# print('log(k_biomass_corrected)', k_tss['log(k_biomass_corrected)'])
#
#
#
# #all(axis=1)
# #
# # def arcessere_k_normal_k_biomass_corrected(data):
# for row in data_merge.iterrows():
# if data_merge[data_merge['rateconstant'].isnull() & data_merge['rateconstant'] != 0 ].cpoy(): #if data['rateconstant'].isnull().all(axis=1):
# continue
# else:
# if data_merge['rateconstant_unit'].values.any() == 'L / (g TSS * day)': #.any() for the missing value in Series
# data_merge['k_normal'] = data_merge.apply(lambda x: x['TSS_mean'] * x['rateconstant'], axis=1)
# data_merge['k_biomass_corrected'] = data_merge.apply(lambda x: x['rateconstant'], axis=1)
# data_merge['log(k_normal)'] = data_merge.apply(lambda x: np.log(x['k_normal']), axis=1)
# data_merge['log(k_biomass_corrected)']= data_merge.apply(lambda x: np.log(x['k_biomass_corrected']), axis=1)
# elif data_merge['rateconstant_unit'].values.any() == '1 / day':
# data_merge['k_normal'] = data_merge.apply(lambda x: x['rateconstant'], axis=1)
# data_merge['k_biomass_corrected'] = data_merge.apply(lambda x: x['rateconstant'] / x['TSS_mean'], axis=1)
# data_merge['log(k_normal)'] = data_merge.apply(lambda x: np.log(x['k_normal']), axis=1)
# data_merge['log(k_biomass_corrected)'] = data_merge.apply(lambda x: np.log(x['k_biomass_corrected']), axis=1)
#
# elif data_merge['rateconstant_unit'].values.any() == '':
# pass
#
# elif data_merge['reaction_order'].values.any() == 'First order':
# data_merge['hl_normal'] = data_merge.apply(lambda x: (0.693/(x['k_normal'])), axis=1)
# data_merge['hl_biomass_corrected'] = data_merge.apply(lambda x: (0.693/(x['k_biomass_corrected'])), axis=1)
# data_merge['log(hl_normal)'] = data_merge.apply(lambda x: log(x['hl_normal']), axis=1)
# data_merge['log(hl_biomass_corrected)']= data_merge.apply(lambda x: log(x['hl_biomass_corrected']), axis=1)
# elif data_merge['reaction_order'].values.any() == 'Pseudo first order':
# data_merge['hl_normal'] = data_merge.apply(lambda x: (0.693/(x['k_normal'])), axis=1)
# else:
# continue
# # print(data['k_normal'])
# # print(data['k_biomass_corrected'])
# # print(data['log (k_normal)'])
# print('k std: {}'.format(data_merge['k_normal'].std()))
# print('k biomass corrected std: {}'.format(data_merge['k_biomass_corrected'].std()))
# print('hl std: {}'.format(data_merge['hl_normal'].std()))
# print('hl biomass corrected std: {}'.format(data_merge['hl_biomass_corrected'].std()))
# print('log k normal std: {}'.format(data_merge['log(k_normal)'].std()))
# print('log k biomass corrected std: {}'.format(data_merge['log(k_biomass_corrected)'].std()))
# print('log hl std: {}'.format(data_merge['log(hl_normal)'].std()))
# print('log hl biomass corrected std: {}'.format(data_merge['log(hl_biomass_corrected)'].std()))
# def arcessere_HF_normal_HF_biomass_corrected(data):
# for row in data.iterrows():
# if data['reaction_order'].values.any() == 'First order':
# data['hl_normal'] = data.apply(lambda x: (0.693/x['k_normal']), axis=1)
# data['hl_biomass_corrected'] = data.apply(lambda x: (0.693/x['k_biomass_corrected']),axis=1)
# data['log (hl_normal)'] = data.apply(lambda x: log(x['hl_normal']), axis=1)
# data['log (hl_biomass_corrected)']= data.apply(lambda x: log(x['hl_biomass_corrected']), axis=1)
# elif data['reaction_order'].values.any() == 'Pseudo first order':
# data['hl_normal'] = data.apply(lambda x: (0.693/x['k_normal']), axis=1)
# data['hl_biomass_corrected'] = data.apply(lambda x: (0.693/x['k_biomass_corrected']), axis=1)
# data['log (hl_normal)']= data.apply(lambda x: log(x['hl_normal']), axis=1)
# data['log (hl_biomass_corrected)']= data.apply(lambda x: log(x['hl_biomass_corrected']), axis=1)
# elif data['reaction_order'].values.any() == 'Second order':
# pass
# else:
# continue
# #calculate the std for each input
# print('rateconstant std: {}'.format(data['k_normal'].std()))
# print('rateconstant biomass corrected std: {}'.format(data['k_biomass_corrected'].std()))
# print('half-life std: {}'.format(data['hl_normal'].std()))
# print('half-life biomass corrected std: {}'.format(data['hl_biomass_corrected'].std()))
# print('log k normal std: {}'.format(data['log (k_normal)'].std()))
# print('log k biomass corrected std: {}'.format(data['log (k_biomass_corrected)'].std()))
# print('log hl std: {}'.format(data['log (hl_normal)'].std()))
# print('log hl biomass corrected std: {}'.format(data['log (hl_biomass_corrected)'].std()))
#
# data['k_normal']
# data['k_biomass_corrected']
# data['hf_normal']
# data['hf_biomass_corrected']
# data['log_k_normal']
# data['log_k_biomass_corrected']
# data['log_hl_normal']
# data['log_hl_biomass_corrected']
# df = pd.DataFrame.from_dict(D, orient='index')
# df.to_csv(output_file_path_padel, sep='\t')
# #2022/11/3
# k_normal_list = []
# k_biomass_corrected_list = []
# hl_normal_list = []
# hl_biomass_corrected = []
#
# # log_hl_list = []
# # log_hl_biomass_corrected_list = []
# # log_k_list = []
# # log_k_biomass_corrected = []
# for i, row in data_merge.iterrows():
# data_merge['TSS_mean'] = data_merge[['total_suspended_solids_concentration_start', 'total_suspended_solids_concentration_end']].mean(1)
# data_merge['k_normal'] = data_merge['TSS_mean'] * data_merge['rateconstant']
# data_merge['k_biomass_corrected'] = data_merge['rateconstant'] / data_merge['TSS_mean']
# if row['rateconstant_unit'].lower() == 'L / (g TSS * day)':
# k_normal_list.append(row['TSS_mean'] * row['rateconstant'])
# k_biomass_corrected_list.append(row['rateconstant'])
# elif row['rateconstant_unit'] == '1 / day':
# k_normal_list.append(row['rateconstant'])
# k_biomass_corrected_list.append(row['rateconstant'] / row['TSS_mean'])
# elif row['reaction_order'] == 'Zero order':
# pass
# elif row['reaction_order'] == 'First order':
# hl_normal_list.append(np.log(2)/(row['k_normal']))
# hl_biomass_corrected.append(np.log(2)/(row['k_biomass_corrected']))
# elif row['reaction_order'] == 'Pseudo first order':
# pass
# else:
# pass
# data_merge['k_normal'] = k_normal_list
# data_merge['k_biomass_corrected'] = k_biomass_corrected_list
# data_merge['hl_normal'] = hl_normal_list
# data_merge['hl_biomass_corrected'] = hl_biomass_corrected
#
# data_merge['log(k_normal)'] = np.log10(data_merge['k_normal'])
# data_merge['log(k_biomass_corrected)'] = np.log10(data_merge['k_biomass_corrected'])
# data_merge['log(hl_normal)'] = np.log10(data_merge['hl_normal'])
# data_merge['log(hl_biomass_corrected)'] = np.log10(data_merge['hl_biomass_corrected'])
# if row['rateconstant_unit'] == 'L / (g TSS * day)': #.any() for the missing value in Series
# data_merge['k_normal'] = data_merge.apply(lambda x: x['TSS_mean'] * x['rateconstant'], axis=1) #apply on the whole row
# #how to add single value in dataframe?
# data_merge['k_biomass_corrected'] = data_merge.apply(lambda x: x['rateconstant'], axis=1)
# data_merge['log(k_normal)'] = data_merge.apply(lambda x: np.log10(x['k_normal']), axis=1)
# data_merge['log(k_biomass_corrected)']= data_merge.apply(lambda x: np.log10(x['k_biomass_corrected']), axis=1)
# elif row['rateconstant_unit'].values.any() == '1 / day':
# data_merge['k_normal'] = data_merge.apply(lambda x: x['rateconstant'], axis=1)
# data_merge['k_biomass_corrected'] = data_merge.apply(lambda x: x['rateconstant'] / x['TSS_mean'], axis=1) ### Here might have some problems
# data_merge['log(k_normal)'] = data_merge.apply(lambda x: np.log10(x['k_normal']), axis=1)
# data_merge['log(k_biomass_corrected)'] = data_merge.apply(lambda x: np.log10(x['k_biomass_corrected']), axis=1)
# elif data_merge['rateconstant_unit'].values.any() == '':
# pass
# elif data_merge['reaction_order'].values.any() == 'Zero order':
# pass
# elif data_merge['reaction_order'].values.any() == 'First order':
# data_merge['hl_normal'] = data_merge.apply(lambda x: (np.log(2)/(x['k_normal'])), axis=1)
# data_merge['hl_biomass_corrected'] = data_merge.apply(lambda x: (np.log(2)/(x['k_biomass_corrected'])), axis=1)
# data_merge['log(hl_normal)'] = data_merge.apply(lambda x: log10(x['hl_normal']), axis=1)
# data_merge['log(hl_biomass_corrected)']= data_merge.apply(lambda x: log10(x['hl_biomass_corrected']), axis=1)
# elif data_merge['reaction_order'].values.any() == 'Pseudo first order':
# data_merge['hl_normal'] = data_merge.apply(lambda x: (np.log(2)/(x['k_normal'])), axis=1)
# else:
# continue
