combined_learning_last.py

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.patches import PathPatch
from collections import defaultdict
import xgboost as xgb
from sklearn import ensemble, neighbors, linear_model, gaussian_process
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.cross_decomposition import PLSRegression
from sklearn.linear_model import ridge_regression, SGDRegressor, ElasticNet, Ridge
from scipy.cluster import hierarchy
from scipy.spatial.distance import squareform
pd.options.display.max_columns = None
output_path = "C:\\Users\\leetseng\\TWtest\\output\\bay_area\\"
output_path_des = output_path + "descriptors\\"
# This file already contains the Bayesian (bay) mean and std.
# The reduced SMILES in the previous files were generated with a different toolkit, which led to inconsistent SMILES.
file_name_soil = "soil_model_input_full.txt" #soil_model_input_bayes_curated_full.txt
file_name_sludge = "sludge_bay_PriorMuStd_3.tsv" #sludge_bayesian_PriorMuStd_09.tsv
df_soil_original = pd.read_csv(output_path + file_name_soil, sep='\t')
df_sludge_original = pd.read_csv(output_path + file_name_sludge, sep='\t')
#Use the copy of dataframe for the later test and manipulation
df_soil_ = df_soil_original.copy()
df_sludge_ = df_sludge_original.copy()
df_sludge_.loc[:, 'package'] = 0 # 0: sludge, 1: soil
df_soil_.loc[:, 'package'] = 1
output_path_model = output_path + "modeling\\"
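# --- Experiment configuration ---
# ITERATIONS: number of random train/test splits averaged for every evaluation.
# DESCRIPTORS: label of the descriptor set, used in output file names and plot titles.
# DENDROGRAM: if True, plot the feature-clustering dendrogram inside check_collinearity().
# CLUSTER_THRESHOLD: distance cut-off used to merge correlated descriptors into clusters.
# COMBINED_LEARNING: if True, augment the sludge training data with the soil data.
# TOPS: sizes of the feature subsets ('10', '20', ... or 'all') evaluated for each model.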
TAGS = ''
ITERATIONS = 20
DESCRIPTORS = 'MACCS'
DENDROGRAM = False
CLUSTER_THRESHOLD = 0.01
COMBINED_LEARNING = True
TOPS = ['10', '20', '30', '40', 'all']
MODELS = [
ensemble.RandomForestRegressor(),
xgb.XGBRegressor(),
ensemble.AdaBoostRegressor(),
SVR(),
neighbors.KNeighborsRegressor()
# ElasticNet()
# RANSACRegressor(),
# TheilSenRegressor()
# Ridge()
# KernelRidge(),
# OrthogonalMatchingPursuit()
# Lars()
# HuberRegressor()
# MLPRegressor(hidden_layer_sizes=(1200, 120), max_iter=1000)
# SGDRegressor(),
# reg,
# PLSRegression(n_components=3)
# gaussian_process.GaussianProcessRegressor()
]
model_name = [
"Random Forest",
"XGBoost",
"AdaBoost",
"SVR",
"k-NN"
# "ElasticNet"
# "RANSAC",
# "TheilSenRegressor"
# "Ridge Regressor",
# "Kernel Ridge",
# "OMP"
# "LARS"
# "Huber Regressor"
# "MLPregressor"
# "SGD Regressor",
# "StackReg",
# "Partial Least-Squares Regressor"
# "Gaussian Process"
]
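# MODELS and model_name must stay index-aligned: run_model() and store_result() look up
# the display name of MODELS[m] as model_name[m], so comment entries in or out in pairs.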
def adjust_box_widths(g, fac):
"""
    Adjust the widths of a seaborn-generated boxplot.
"""
# iterating through Axes instances
for ax in g.axes:
# iterating through axes artists:
for c in ax.get_children():
# searching for PathPatches
if isinstance(c, PathPatch):
# getting current width of box:
p = c.get_path()
verts = p.vertices
verts_sub = verts[:-1]
xmin = np.min(verts_sub[:, 0])
xmax = np.max(verts_sub[:, 0])
xmid = 0.5*(xmin+xmax)
xhalf = 0.5*(xmax - xmin)
# setting new width of box
xmin_new = xmid-fac*xhalf
xmax_new = xmid+fac*xhalf
verts_sub[verts_sub[:, 0] == xmin, 0] = xmin_new
verts_sub[verts_sub[:, 0] == xmax, 0] = xmax_new
# setting new width of median line
for l in ax.lines:
if np.all(l.get_xdata() == [xmin, xmax]):
l.set_xdata([xmin_new, xmax_new])
def non_nan_mean(x):
if x.empty:
return None
else:
x = x.dropna()
return np.mean(x)
def check_collinearity(input_df): # uses the global CLUSTER_THRESHOLD
print('\nCheck collinearity..................\t')
feature_names = input_df.columns
corr = input_df.corr('spearman').to_numpy()
corr = (corr + corr.T) / 2 # Ensure the correlation matrix is symmetric
np.fill_diagonal(corr, 1)
corr = np.nan_to_num(corr)
distance_matrix = 1 - np.abs(corr) # Convert the correlation matrix to a distance matrix
dist_linkage = hierarchy.ward(squareform(distance_matrix)) # hierarchical clustering using Ward's linkage, return a ndarray
if DENDROGRAM:
size = len(feature_names) / 6.5
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(size * 2, size))
dendro = hierarchy.dendrogram(dist_linkage, labels=feature_names, ax=ax1, leaf_rotation=90)
dendro_idx = np.arange(0, len(dendro["ivl"]))
ax2.imshow(corr[dendro["leaves"], :][:, dendro["leaves"]])
        ax2.set_xticks(dendro_idx)
ax2.set_yticks(dendro_idx)
ax2.set_xticklabels(dendro["ivl"], rotation="vertical")
ax2.set_yticklabels(dendro["ivl"])
fig.tight_layout()
plt.show()
cluster_ids = hierarchy.fcluster(dist_linkage, CLUSTER_THRESHOLD, criterion="distance") # return a list
cluster_id_to_feature_ids = defaultdict(list) # create a default dict
for idx, cluster_id in enumerate(cluster_ids): # generate the tuple [(0, ""), (1, ""), (2, "")]
cluster_id_to_feature_ids[cluster_id].append(idx) # use the 'cluster_id' as the key
    selected_feature_ids = [v[0] for v in cluster_id_to_feature_ids.values()] # Each v is a list with one or more feature indices; take the first element v[0] as the cluster representative
    # Transpose the input_df as an np.array so that each row holds the data of one feature,
    # then keep only the rows whose indices appear in selected_feature_ids.
    # Finally convert the list back to an np.array and transpose it again to restore the original column/row orientation.
new_X = np.array([np.array(input_df).T[i] for i in selected_feature_ids]).T
selected_features = [feature_names[i] for i in selected_feature_ids]
print('Descriptors after applying cluster threshold of {}: {}'.format(CLUSTER_THRESHOLD, len(selected_features)))
    return new_X, selected_features
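# Minimal usage sketch of check_collinearity (illustrative only; the column names below are made up):
#   demo_df = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [2, 4, 6, 8], 'c': [4, 1, 3, 2]})
#   demo_X, demo_features = check_collinearity(demo_df)
#   # 'a' and 'b' are perfectly rank-correlated, so only one of them is kept as the cluster representative.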
def generate_df_for_collinearity_analysis(X_sludge_des, y_sludge_des, soil_XY_merge_des): #soil_bayesian_merge_padel_reduced
if COMBINED_LEARNING:
X_sludge_des_2 = X_sludge_des.copy()
y_sludge_des_2 = y_sludge_des.copy()
soil_XY_merge_des_2 = soil_XY_merge_des.copy()
X_combined_des = pd.concat([X_sludge_des_2.fillna(0), soil_XY_merge_des_2.iloc[:, 2:].fillna(0)], join='inner') #X_sludge_padel, soil_bayesian_merge_padel_reduced.iloc[:, 2:]
y_combined_des = pd.concat([y_sludge_des_2, soil_XY_merge_des_2['hl_log_bayesian_mean']], join='inner')
        # For combined learning, first check collinearity on soil and sludge together, since the concatenated dataset might contain relationships not visible in either set alone.
selected_X_combined_des_array, selected_features_combined = check_collinearity(X_combined_des) #Return the nd.array and list of selected features # cluster threshold: 0.02, Descriptors: 1085
return selected_X_combined_des_array, selected_features_combined, X_combined_des, y_combined_des
else:
X_combined_des = "Not exist"
y_combined_des = "Not exist"
selected_X_sludge_des_array, selected_features_sludge = check_collinearity(X_sludge_des) # cluster threshold: 0.02, Descriptors: 1068
return selected_X_sludge_des_array, selected_features_sludge, X_combined_des, y_combined_des
def get_feature_importance_RF(X, y):
model = ensemble.RandomForestRegressor()
r2_list = []
rmse_list = []
mae_list = []
importance_list = []
i = 1
print('\nFind feature importance using RF..................')
while i <= ITERATIONS:
print('Running {} iterations'.format(i))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state= i+10) #, random_state=i
pipe = make_pipeline(StandardScaler(), model)
pipe.fit(X_train, y_train)
y_test_pred = pipe.predict(X_test)
r2_list.append(r2_score(y_test, y_test_pred))
        rmse_list.append(np.sqrt(mean_squared_error(y_test, y_test_pred))) # RMSE is the square root of the MSE
mae_list.append(mean_absolute_error(y_test, y_test_pred))
importance_list.append(model.feature_importances_)
i += 1
R2_mean = np.mean(r2_list)
RMSE_mean = np.mean(rmse_list)
MAE_mean = np.mean(mae_list)
mean_feature_importance = np.mean(np.array(importance_list).T, axis=1)
print("R2: {}, RMSE: {}, MAE: {}".format(round(R2_mean, 2), round(RMSE_mean, 2), round(MAE_mean, 2)))
return mean_feature_importance
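# Note: make_pipeline does not clone the estimator, so model.feature_importances_ above reads the forest
# fitted in the current iteration; the importances are then averaged over all ITERATIONS random splits.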
def plot_feature_importance(ls_feature, mean_feature_importance, topx):
    tree_importance_sorted_idx = np.argsort(mean_feature_importance)[::-1] # feature indices sorted by descending mean importance
tree_indices_rule = np.arange(0, len(mean_feature_importance))[::-1]
fig, ax1 = plt.subplots(1, 1, figsize=(6, 6))
ax1.barh(tree_indices_rule[:topx], mean_feature_importance[tree_importance_sorted_idx][:topx], height=0.4)
ax1.set_yticks(tree_indices_rule[:topx])
ax1.set_yticklabels(ls_feature[tree_importance_sorted_idx][:topx]) #The features from sludge data set.
fig.tight_layout()
# plt.show()
plt.savefig(output_path + 'figures\\{}_CL{}_RF_top_{}_feature_importance.png'.format(DESCRIPTORS, COMBINED_LEARNING, topx))
plt.close()
def find_top_features_subset(TOPS, features_, tree_importance_sorted_idx):
features = {}
for top in TOPS:
if not top.isdigit():
features[top] = [f for f in features_[tree_importance_sorted_idx][:]]
continue
features[top] = [f for f in features_[tree_importance_sorted_idx][:int(top)]]
features[top].append('package')
return features
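# The 'package' indicator (0 = sludge, 1 = soil) is appended to every numeric top-N subset so the
# models can tell the two data sources apart when combined learning concatenates them.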
def return_topX_features(TOPS, input_array, y_input, input_descriptor): # behaviour depends on the COMBINED_LEARNING flag
print('\nReturn topX features subset..................')
if COMBINED_LEARNING:
rf_des_feature_importance_CL = get_feature_importance_RF(input_array, y_input) #selected_X_combined_padel_array, y_combined_padel
tree_importance_sorted_idx_CL = np.argsort(rf_des_feature_importance_CL)[::-1]
features_des_CL = np.array(input_descriptor.columns[2:]) # X_combined_padel
features_CL = find_top_features_subset(TOPS, features_des_CL, tree_importance_sorted_idx_CL)
features_10 = features_CL['10']
features_20 = features_CL['20']
features_30 = features_CL['30']
features_40 = features_CL['40']
features_all = features_CL['all']
plot_feature_importance(features_des_CL, rf_des_feature_importance_CL, 30)
return features_10, features_20, features_30, features_40, features_all, tree_importance_sorted_idx_CL
else:
rf_des_feature_importance = get_feature_importance_RF(input_array, y_input) #selected_X_sludge_padel_array, y_sludge_padel
tree_importance_sorted_idx = np.argsort(rf_des_feature_importance)[::-1]
features_des = np.array(input_descriptor.columns[2:]) # sludge_bayesian_merge_padel
features = find_top_features_subset(TOPS, features_des, tree_importance_sorted_idx)
features_10 = features['10']
features_20 = features['20']
features_30 = features['30']
features_40 = features['40']
features_all = features['all']
plot_feature_importance(features_des, rf_des_feature_importance, 30)
return features_10, features_20, features_30, features_40, features_all, tree_importance_sorted_idx
def run_model(X_sludge_des, y_sludge_des, model, m, model_name, features_10, features_20, features_30, features_40, features_all, soil_XY_merge_des):
r2_list = []
rmse_list = []
mae_list = []
model_used = []
combined_learning_used = []
features_used = []
descriptor_used = []
# cv_score_list = []
dict_features_list = {'top 10': features_10, 'top 20': features_20, 'top 30': features_30, 'top 40': features_40, 'all': features_all}
for list_name, features_topX in dict_features_list.items(): # dict_features_list[list_name]
i = 1
print("\nEvaluate model {} with {} features".format(model_name[m], list_name))
# split the sludge dataset into validation set and input set
# X_sludge_pre, X_validate, y_sludge_pre, y_validate = train_test_split(X[features_topX], y, test_size=0.10, random_state=0)
# print("X_sludge shape before random splitting", X_sludge_pre.shape)
while i <= ITERATIONS:
# # Randomly split the preprocessing sludge dataset into training set and test set
X_train_des, X_test_des, y_train_des, y_test_des = train_test_split(X_sludge_des, y_sludge_des,
test_size=0.2, random_state=i+10)
# cv = KFold(n_splits=5)
if COMBINED_LEARNING:
                print('\tRunning iteration {} of combined learning.\tKeep calm and carry on!'.format(i))
                # Concatenate the sludge training split with the soil data
X_train_CL = pd.concat([X_train_des.fillna(0), soil_XY_merge_des.iloc[:, 2:].fillna(0)], join='inner', ignore_index=True) # We lost 30 cpds in this step
y_train_CL = pd.concat([y_train_des, soil_XY_merge_des["hl_log_bayesian_mean"]], join='inner', ignore_index=True)
pipe = make_pipeline(StandardScaler(), model) #RobustScaler()
pipe.fit(X_train_CL[features_topX], y_train_CL)
y_test_pred = pipe.predict(X_test_des[features_topX])
r2_list.append(r2_score(y_test_des, y_test_pred))
                rmse_list.append(np.sqrt(mean_squared_error(y_test_des, y_test_pred))) # RMSE is the square root of the MSE
mae_list.append(mean_absolute_error(y_test_des, y_test_pred))
model_used.append(model_name[m])
combined_learning_used.append(str(COMBINED_LEARNING))
features_used.append(list_name)
descriptor_used.append(str(DESCRIPTORS))
# cv_score_list.append(cross_val_score(pipe, X_train_CL[features_topX], y_train_CL, cv=cv))
else:
                print('\tRunning iteration {} of Eawag-Sludge only.\tKeep calm and carry on!'.format(i))
# X_train_sludge, X_test_sludge, y_train_sludge, y_test_sludge = train_test_split(X_sludge_padel[features_topX], y_sludge_padel, test_size=0.20)
pipe = make_pipeline(StandardScaler(), model)
pipe.fit(X_train_des[features_topX], y_train_des) # X_train, y_train
y_test_pred = pipe.predict(X_test_des[features_topX])
r2_list.append(r2_score(y_test_des, y_test_pred))
                rmse_list.append(np.sqrt(mean_squared_error(y_test_des, y_test_pred))) # RMSE is the square root of the MSE
mae_list.append(mean_absolute_error(y_test_des, y_test_pred))
model_used.append(model_name[m])
combined_learning_used.append(str(COMBINED_LEARNING))
features_used.append(list_name)
descriptor_used.append(str(DESCRIPTORS))
# cv_score_list.append(cross_val_score(pipe, X_train_CL[features_topX], y_train_CL, cv=cv))
i += 1
print('R2:', round(np.mean(r2_list), 2), '\tRMSE:', round(np.mean(rmse_list), 2), '\tMAE:', round(np.mean(mae_list), 2))
return r2_list, rmse_list, mae_list, model_used, combined_learning_used, features_used, descriptor_used
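# Evaluation scheme: regardless of COMBINED_LEARNING, every model is scored on a held-out split of the
# sludge data only; combined learning merely augments the corresponding training split with the soil data.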
def store_result(MODELS, model_name, X, y, features_10, features_20, features_30, features_40, features_all, soil_XY_merge_des): #
print("Prepare to collect the result..................")
model_dict_ = {'R2': [], 'RMSE': [], 'MAE': [], 'Model': [], 'Combined_learning': [], 'Selected_features': [], 'Descriptor': []}
    for m in range(len(MODELS)): # the index m pairs each model with its display name in model_name
r2_list, rmse_list, mae_list, model_used, combined_learning_used, features_used, descriptor_used = run_model(X, y, MODELS[m], m, model_name, features_10, features_20, features_30, features_40, features_all, soil_XY_merge_des) # X_sludge_padel, y_sludge_padel for sludge only
model_dict_['R2'].extend(r2_list) # X_combined_padel, y_combined_padel
model_dict_['RMSE'].extend(rmse_list)
model_dict_['MAE'].extend(mae_list)
# model_dict_['CV_SCORE'].extend(cv_score_list)
model_dict_['Selected_features'].extend(features_used)
model_dict_['Model'].extend(model_used)
model_dict_['Descriptor'].extend(descriptor_used)
model_dict_['Combined_learning'].extend(combined_learning_used)
return model_dict_
def plot_rmse(df):
if COMBINED_LEARNING:
TAGS_rmse = ""
dt = df
else:
TAGS_rmse = "Without"
dt = df
fig = plt.figure(1, figsize=(12, 6))
sns.set_style("whitegrid")
sns.set_theme(style="ticks", palette="pastel")
sns.boxplot(x="Model", y="RMSE", width=0.6,hue="Selected_features", palette=["#47ad62", "#4d5154", "#2c79de", "#8a1506", "#ed6511"], data=dt, showfliers=False)
adjust_box_widths(fig, 0.6)
sns.despine(offset=10, trim=False)
plt.xlabel('')
plt.ylabel('RMSE')
plt.title('{} Combined Learning, {} descriptor'.format(TAGS_rmse, DESCRIPTORS), fontsize=20)
plt.savefig(output_path + 'figures\\rmse_CL_{}_{}_iter{}.png'.format(COMBINED_LEARNING, DESCRIPTORS, ITERATIONS))
plt.close()
def plot_r2(df):
if COMBINED_LEARNING:
TAGS = ""
dt = df
else:
TAGS = "Without"
dt = df
fig = plt.figure(1, figsize=(12, 6))
sns.set_style("whitegrid")
sns.set_theme(style="ticks", palette="pastel")
sns.boxplot(x="Model", y="R2", width=0.6, hue="Selected_features", palette=["#47ad62", "#4d5154", "#2c79de", "#8a1506", "#ed6511"], data=dt, showfliers=False)
adjust_box_widths(fig, 0.7)
sns.despine(offset=10, trim=False)
# plt.axhline(0.1, linewidth=2, color='#bf6613')
plt.xlabel('')
plt.ylabel('$R^2$')
plt.title('{} Combined Learning, {} descriptor'.format(TAGS, DESCRIPTORS), fontsize=20)
plt.savefig(output_path+ 'figures\\r2_CL_{}_{}_iter{}.png'.format(COMBINED_LEARNING, DESCRIPTORS, ITERATIONS))
plt.close() # prevent overwrite
def get_sludge_subset_target_variable(des_X, target_Y): # des_X: descriptor DataFrame; target_Y: df_soil_ or df_sludge_; both should come from the same package
# des_X_ = des_X.copy()
# des_X_.loc[:, 'package'] = 0 # assign the sludge label = 0
target_Y_ = target_Y.copy()
aggs = [non_nan_mean]
df_subset_XY = target_Y_[["reduced_smiles", "hl_log_bayesian_mean", "package"]] # "temperature", "log_hl_combined", "log_hl_biomass_corrected"
XY = df_subset_XY.groupby(["reduced_smiles"]).agg(aggs).reset_index()
XY.columns = XY.columns.droplevel(1)
XY_merge_des = pd.merge(XY, des_X, how='left', on='reduced_smiles')
XY_merge_des.dropna(axis=0, inplace=True)
# XY_merge_des.drop(columns=['Unnamed: 0','topoShape'], inplace=True)
return XY_merge_des
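# Duplicate SMILES in the target table are collapsed with the NaN-ignoring mean of hl_log_bayesian_mean
# (and of 'package') before the left merge with the descriptor table; rows with missing descriptors are dropped.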
def index_merge_descriptor(df1, df2):
df_new = pd.merge(df1, df2, how='inner', on='index')
# print(df_new.shape)
return df_new
def merge_descriptor(df1, df2):
df_new = pd.merge(df1, df2, how='inner', on='reduced_smiles')
# print(df_new.shape)
return df_new
def main():
# df_soil_index_maccs = pd.read_csv(output_path + 'soil_MACCS_descriptors.txt', sep='\t').drop_duplicates()
# df_soil_index_rule = pd.read_csv(output_path + 'soil_rule_descriptors_TRIG.txt', sep='\t').drop_duplicates()
# df_soil_index_padel = pd.read_csv(output_path + 'soil_PaDEL_descriptors.txt', sep='\t').drop_duplicates()
# df_index_smiles = pd.read_csv(output_path + 'soil_model_input_all_reduced.txt', sep='\t')
# index_smiles = df_index_smiles.iloc[:, :3].drop(columns=['compound_id'])
# index_smiles.to_csv(output_output_path_model + 'descriptors\\soil_index_smiles.tsv', sep='\t')
sludge_padel_descriptor = pd.read_csv(output_path_des + 'sludge_padel_descriptor.tsv', sep='\t')
sludge_maccs_descriptor = pd.read_csv(output_path_des + 'sludge_maccs_descriptor.tsv', sep='\t') # shape = (160, 485)
sludge_rule_descriptor = pd.read_csv(output_path_des + 'sludge_rule_descriptor.tsv', sep='\t') # shape = (160, 192)
sludge_padel_maccs_descriptor = pd.read_csv(output_path_des + 'sludge_padel_maccs_descriptor.tsv', sep='\t')
sludge_padel_rule_descriptor = pd.read_csv(output_path_des + 'sludge_padel_rule_descriptor.tsv', sep='\t')
sludge_maccs_rule_descriptor = pd.read_csv(output_path_des + 'sludge_maccs_rule_descriptor.tsv', sep='\t')
sludge_all_descriptor = pd.read_csv(output_path_des + 'sludge_all_descriptor.tsv', sep='\t')
soil_padel_descriptor = pd.read_csv(output_path_des + 'soil_padel_descriptor.tsv', sep='\t')
soil_maccs_descriptor = pd.read_csv(output_path_des + 'soil_maccs_descriptor.tsv', sep='\t')
soil_rule_descriptor = pd.read_csv(output_path_des + 'soil_rule_descriptor.tsv', sep='\t')
soil_padel_maccs_descriptor = pd.read_csv(output_path_des + 'soil_padel_maccs_descriptor.tsv', sep='\t')
soil_padel_rule_descriptor = pd.read_csv(output_path_des + 'soil_padel_rule_descriptor.tsv', sep='\t')
soil_maccs_rule_descriptor = pd.read_csv(output_path_des + 'soil_maccs_rule_descriptor.tsv', sep='\t')
soil_all_descriptor = pd.read_csv(output_path_des + 'soil_all_descriptor.tsv', sep='\t')
soil_XY_merge_padel = pd.read_csv(output_path_des + "soil_XY_merge_padel.tsv", sep='\t')
soil_XY_merge_maccs = pd.read_csv(output_path_des + "soil_XY_merge_maccs.tsv", sep='\t')
soil_XY_merge_rule = pd.read_csv(output_path_des + "soil_XY_merge_rule.tsv", sep='\t')
soil_XY_merge_padel_maccs = pd.read_csv(output_path_des + "soil_XY_merge_padel_maccs.tsv", sep='\t')
soil_XY_merge_padel_rule = pd.read_csv(output_path_des + "soil_XY_merge_padel_rule.tsv", sep='\t')
soil_XY_merge_maccs_rule = pd.read_csv(output_path_des + "soil_XY_merge_maccs_rule.tsv", sep='\t')
soil_XY_merge_all = pd.read_csv(output_path_des + "soil_XY_merge_all.tsv", sep='\t')
# sludge_XY_merge_padel = pd.read_csv(output_path+ "sludge_XY_merge_padel.tsv", sep='\t')
# sludge_XY_merge_maccs = pd.read_csv(output_path + "sludge_XY_merge_maccs.tsv", sep='\t')
# sludge_XY_merge_rule = pd.read_csv(output_path + "sludge_XY_merge_rule.tsv", sep='\t')
# sludge_XY_merge_padel_maccs = pd.read_csv(output_path + "sludge_XY_merge_padel_maccs.tsv", sep='\t')
# sludge_XY_merge_padel_rule = pd.read_csv(output_path + "sludge_XY_merge_padel_rule.tsv", sep='\t')
# sludge_XY_merge_maccs_rule = pd.read_csv(output_path + "sludge_XY_merge_maccs_rule.tsv", sep='\t')
# sludge_XY_merge_all = pd.read_csv(output_path + "sludge_XY_merge_all.tsv", sep='\t')
# for des_X in des_sludgeX_list:
# sludge_XY_merge_des = get_subset_target_variable(des_X, df_sludge_, 'hl_log_bayesian_mean')
# X_sludge_des, X_validate, y_sludge_des, y_validate = train_test_split(sludge_XY_merge_des.iloc[:, 2:], sludge_XY_merge_des['hl_log_bayesian_mean'], test_size=0.10, random_state=62)
soil_XY_merge_des = soil_XY_merge_padel
sludge_XY_merge_des = get_sludge_subset_target_variable(sludge_padel_descriptor, df_sludge_)
X_sludge_des, X_validate, y_sludge_des, y_validate = train_test_split(sludge_XY_merge_des.iloc[:, 2:], sludge_XY_merge_des['hl_log_bayesian_mean'], test_size=0.10, random_state=62)
if COMBINED_LEARNING:
selected_X_combined_des_array, selected_features_combined, X_combined_des, y_combined_des = generate_df_for_collinearity_analysis(X_sludge_des, y_sludge_des, soil_XY_merge_des)
features_10, features_20, features_30, features_40, features_all, tree_importance_sorted_idx_CL = return_topX_features(TOPS, selected_X_combined_des_array, y_combined_des, X_combined_des)
df_top_feature_des = pd.DataFrame.from_dict(store_result(MODELS, model_name, X_sludge_des, y_sludge_des, features_10, features_20, features_30, features_40, features_all, soil_XY_merge_des))
# COMBINED_LEARNING, ITERATIONS, MODELS, X, y, features_10, features_20, features_30, features_40, features_all
plot_rmse(df_top_feature_des)
plot_r2(df_top_feature_des)
df_top_feature_des.to_csv(output_path_model+"{}_CL{}_iter{}_cluster_threshold{}.tsv".format(DESCRIPTORS, COMBINED_LEARNING, ITERATIONS, CLUSTER_THRESHOLD), sep='\t', index=False)
else:
selected_X_sludge_des_array, selected_features_sludge, X_combined_des, y_combined_des = generate_df_for_collinearity_analysis(X_sludge_des, y_sludge_des, soil_XY_merge_des)
features_10, features_20, features_30, features_40, features_all, tree_importance_sorted_idx = return_topX_features(TOPS, selected_X_sludge_des_array, y_sludge_des, sludge_XY_merge_des)
df_top_feature_des_noCL = pd.DataFrame.from_dict(store_result(MODELS, model_name, X_sludge_des, y_sludge_des, features_10, features_20, features_30, features_40, features_all, soil_XY_merge_des))
plot_rmse(df_top_feature_des_noCL)
plot_r2(df_top_feature_des_noCL)
df_top_feature_des_noCL.to_csv(output_path_model+"{}_CL{}_iter{}_cluster_threshold{}.tsv".format(DESCRIPTORS, COMBINED_LEARNING, ITERATIONS, CLUSTER_THRESHOLD), sep='\t', index=False)
dict_features_list = {'top 10': features_10, 'top 20': features_20, 'top 30': features_30, 'top 40': features_40, 'all': features_all}
if __name__ == '__main__':
main()
