diff --git a/combined_learning_last.py b/combined_learning_last.py
new file mode 100644
index 0000000..2cbd2c7
--- /dev/null
+++ b/combined_learning_last.py
@@ -0,0 +1,492 @@
+import pandas as pd
+import numpy as np
+import seaborn as sns
+import matplotlib.pyplot as plt
+from matplotlib.patches import PathPatch
+from collections import defaultdict
+import xgboost as xgb
+from sklearn import ensemble, neighbors, linear_model, gaussian_process
+from sklearn.svm import SVR
+from sklearn.neural_network import MLPRegressor
+from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
+from sklearn.pipeline import Pipeline, make_pipeline
+from sklearn.preprocessing import StandardScaler, RobustScaler
+from sklearn.model_selection import train_test_split, cross_val_score, KFold
+from sklearn.cross_decomposition import PLSRegression
+from sklearn.linear_model import ridge_regression, SGDRegressor, ElasticNet, Ridge
+from scipy.cluster import hierarchy
+from scipy.spatial.distance import squareform
+pd.options.display.max_columns = None
+
+
+output_path = "C:\\Users\\leetseng\\TWtest\\output\\bay_area\\"
+output_path_des = output_path + "descriptors\\"
+
+# These files already contain the Bayesian mean and Bayesian std.
+# The reduced SMILES in the previous files were generated with a different toolkit, which led to inconsistent SMILES.
+file_name_soil = "soil_model_input_full.txt"  # soil_model_input_bayes_curated_full.txt
+file_name_sludge = "sludge_bay_PriorMuStd_3.tsv"  # sludge_bayesian_PriorMuStd_09.tsv
+df_soil_original = pd.read_csv(output_path + file_name_soil, sep='\t')
+df_sludge_original = pd.read_csv(output_path + file_name_sludge, sep='\t')
+
+# Work on copies of the dataframes for later testing and manipulation
+df_soil_ = df_soil_original.copy()
+df_sludge_ = df_sludge_original.copy()
+df_sludge_.loc[:, 'package'] = 0  # 0: sludge, 1: soil
+df_soil_.loc[:, 'package'] = 1
+
+output_path_model = output_path + "modeling\\"
+TAGS = ''
+ITERATIONS = 20
+DESCRIPTORS = 'MACCS'
+DENDROGRAM = False
+CLUSTER_THRESHOLD = 0.01
+COMBINED_LEARNING = True
+TOPS = ['10', '20', '30', '40', 'all']
+
+MODELS = [
+    ensemble.RandomForestRegressor(),
+    xgb.XGBRegressor(),
+    ensemble.AdaBoostRegressor(),
+    SVR(),
+    neighbors.KNeighborsRegressor()
+    # ElasticNet()
+    # RANSACRegressor(),
+    # TheilSenRegressor()
+    # Ridge()
+    # KernelRidge(),
+    # OrthogonalMatchingPursuit()
+    # Lars()
+    # HuberRegressor()
+    # MLPRegressor(hidden_layer_sizes=(1200, 120), max_iter=1000)
+    # SGDRegressor(),
+    # reg,
+    # PLSRegression(n_components=3)
+
+    # gaussian_process.GaussianProcessRegressor()
+]
+
+model_name = [
+    "Random Forest",
+    "XGBoost",
+    "AdaBoost",
+    "SVR",
+    "k-NN"
+    # "ElasticNet"
+    # "RANSAC",
+    # "TheilSenRegressor"
+    # "Ridge Regressor",
+    # "Kernel Ridge",
+    # "OMP"
+    # "LARS"
+    # "Huber Regressor"
+    # "MLPregressor"
+    # "SGD Regressor",
+    # "StackReg",
+    # "Partial Least-Squares Regressor"
+    # "Gaussian Process"
+]
+
+
+def adjust_box_widths(g, fac):
+    """
+    Adjust the widths of a seaborn-generated boxplot.
+    """
+    # iterating through Axes instances
+    for ax in g.axes:
+
+        # iterating through axes artists:
+        for c in ax.get_children():
+
+            # searching for PathPatches
+            if isinstance(c, PathPatch):
+                # getting current width of box:
+                p = c.get_path()
+                verts = p.vertices
+                verts_sub = verts[:-1]
+                xmin = np.min(verts_sub[:, 0])
+                xmax = np.max(verts_sub[:, 0])
+                xmid = 0.5*(xmin+xmax)
+                xhalf = 0.5*(xmax - xmin)
+
+                # setting new width of box
+                xmin_new = xmid-fac*xhalf
+                xmax_new = xmid+fac*xhalf
+                verts_sub[verts_sub[:, 0] == xmin, 0] = xmin_new
+                verts_sub[verts_sub[:, 0] == xmax, 0] = xmax_new
+
+                # setting new width of median line
+                for l in ax.lines:
+                    if np.all(l.get_xdata() == [xmin, xmax]):
+                        l.set_xdata([xmin_new, xmax_new])
+
+def non_nan_mean(x):
+    if x.empty:
+        return None
+    else:
+        x = x.dropna()
+        return np.mean(x)
+
+def check_collinearity(input_df):  # CLUSTER_THRESHOLD,
+    print('\nCheck collinearity..................\t')
+    feature_names = input_df.columns
+    corr = input_df.corr('spearman').to_numpy()
+    corr = (corr + corr.T) / 2  # Ensure the correlation matrix is symmetric
+    np.fill_diagonal(corr, 1)
+    corr = np.nan_to_num(corr)
+    distance_matrix = 1 - np.abs(corr)  # Convert the correlation matrix to a distance matrix
+    dist_linkage = hierarchy.ward(squareform(distance_matrix))  # hierarchical clustering using Ward's linkage; returns the linkage matrix (ndarray)
+
+    if DENDROGRAM:
+        size = len(feature_names) / 6.5
+        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(size * 2, size))
+        dendro = hierarchy.dendrogram(dist_linkage, labels=feature_names, ax=ax1, leaf_rotation=90)
+        dendro_idx = np.arange(0, len(dendro["ivl"]))
+        ax2.imshow(corr[dendro["leaves"], :][:, dendro["leaves"]])
+        ax2.set_xticks(dendro_idx)
+        ax2.set_yticks(dendro_idx)
+        ax2.set_xticklabels(dendro["ivl"], rotation="vertical")
+        ax2.set_yticklabels(dendro["ivl"])
+        fig.tight_layout()
+        plt.show()
+
+    cluster_ids = hierarchy.fcluster(dist_linkage, CLUSTER_THRESHOLD, criterion="distance")  # returns an array of cluster labels
+    cluster_id_to_feature_ids = defaultdict(list)  # create a default dict
+
+    for idx, cluster_id in enumerate(cluster_ids):  # enumerate yields (feature index, cluster_id) pairs
+        cluster_id_to_feature_ids[cluster_id].append(idx)  # use the 'cluster_id' as the key
+    selected_feature_ids = [v[0] for v in cluster_id_to_feature_ids.values()]  # Each v is a list of one or more feature indices; take only the first element v[0] as the cluster's representative feature
+    # Transform the input_df into an np.array and transpose it so that each row holds the data of one feature.
+    # Use [i] to select the features listed in selected_feature_ids.
+    # After the list comprehension, convert the list back to an np.array and transpose it to restore the original row/column orientation.
+    new_X = np.array([np.array(input_df).T[i] for i in selected_feature_ids]).T
+    selected_features = [feature_names[i] for i in selected_feature_ids]
+
+    print('Descriptors after applying cluster threshold of {}: {}'.format(CLUSTER_THRESHOLD, len(selected_features)))
+    D = {'new_X': new_X, 'selected_features': selected_features}
+    return new_X, selected_features
+
+def generate_df_for_collinearity_analysis(X_sludge_des, y_sludge_des, soil_XY_merge_des):  # soil_bayesian_merge_padel_reduced
+
+    if COMBINED_LEARNING:
+        X_sludge_des_2 = X_sludge_des.copy()
+        y_sludge_des_2 = y_sludge_des.copy()
+        soil_XY_merge_des_2 = soil_XY_merge_des.copy()
+
+        X_combined_des = pd.concat([X_sludge_des_2.fillna(0), soil_XY_merge_des_2.iloc[:, 2:].fillna(0)], join='inner')  # X_sludge_padel, soil_bayesian_merge_padel_reduced.iloc[:, 2:]
+        y_combined_des = pd.concat([y_sludge_des_2, soil_XY_merge_des_2['hl_log_bayesian_mean']], join='inner')
+        # For combined learning, check collinearity on soil and sludge together first, since the concatenated dataset might contain undiscovered relationships.
+        selected_X_combined_des_array, selected_features_combined = check_collinearity(X_combined_des)  # Returns the ndarray and the list of selected features  # cluster threshold: 0.02, Descriptors: 1085
+        return selected_X_combined_des_array, selected_features_combined, X_combined_des, y_combined_des
+    else:
+        X_combined_des = "Not exist"
+        y_combined_des = "Not exist"
+        selected_X_sludge_des_array, selected_features_sludge = check_collinearity(X_sludge_des)  # cluster threshold: 0.02, Descriptors: 1068
+        return selected_X_sludge_des_array, selected_features_sludge, X_combined_des, y_combined_des
+
+def get_feature_importance_RF(X, y):
+    model = ensemble.RandomForestRegressor()
+    r2_list = []
+    rmse_list = []
+    mae_list = []
+    importance_list = []
+
+    i = 1
+    print('\nFind feature importance using RF..................')
+    while i <= ITERATIONS:
+        print('Running iteration {}'.format(i))
+        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i+10)  # , random_state=i
+        pipe = make_pipeline(StandardScaler(), model)
+        pipe.fit(X_train, y_train)
+        y_test_pred = pipe.predict(X_test)
+        r2_list.append(r2_score(y_test, y_test_pred))
+        rmse_list.append(mean_squared_error(y_test, y_test_pred))
+        mae_list.append(mean_absolute_error(y_test, y_test_pred))
+        importance_list.append(model.feature_importances_)
+        i += 1
+
+    R2_mean = np.mean(r2_list)
+    RMSE_mean = np.mean(rmse_list)
+    MAE_mean = np.mean(mae_list)
+    mean_feature_importance = np.mean(np.array(importance_list).T, axis=1)
+    print("R2: {}, RMSE: {}, MAE: {}".format(round(R2_mean, 2), round(RMSE_mean, 2), round(MAE_mean, 2)))
+    return mean_feature_importance
+
+
+def plot_feature_importance(ls_feature, mean_feature_importance, topx):
+    tree_importance_sorted_idx = np.argsort(mean_feature_importance)[::-1]  # sort features by mean importance (descending)
+    tree_indices_rule = np.arange(0, len(mean_feature_importance))[::-1]
+    fig, ax1 = plt.subplots(1, 1, figsize=(6, 6))
+    ax1.barh(tree_indices_rule[:topx], mean_feature_importance[tree_importance_sorted_idx][:topx], height=0.4)
+    ax1.set_yticks(tree_indices_rule[:topx])
+    ax1.set_yticklabels(ls_feature[tree_importance_sorted_idx][:topx])  # the feature names from the sludge data set
+ fig.tight_layout() + # plt.show() + plt.savefig(output_path + 'figures\\{}_CL{}_RF_top_{}_feature_importance.png'.format(DESCRIPTORS, COMBINED_LEARNING, topx)) + plt.close() + + +def find_top_features_subset(TOPS, features_, tree_importance_sorted_idx): + features = {} + + for top in TOPS: + if not top.isdigit(): + features[top] = [f for f in features_[tree_importance_sorted_idx][:]] + features[top] + continue + + features[top] = [f for f in features_[tree_importance_sorted_idx][:int(top)]] + features[top].append('package') + + return features + + +def return_topX_features(TOPS, input_array, y_input, input_descriptor): # Input True or False + print('\nReturn topX features subset..................') + if COMBINED_LEARNING: + rf_des_feature_importance_CL = get_feature_importance_RF(input_array, y_input) #selected_X_combined_padel_array, y_combined_padel + tree_importance_sorted_idx_CL = np.argsort(rf_des_feature_importance_CL)[::-1] + features_des_CL = np.array(input_descriptor.columns[2:]) # X_combined_padel + features_CL = find_top_features_subset(TOPS, features_des_CL, tree_importance_sorted_idx_CL) + features_10 = features_CL['10'] + features_20 = features_CL['20'] + features_30 = features_CL['30'] + features_40 = features_CL['40'] + features_all = features_CL['all'] + plot_feature_importance(features_des_CL, rf_des_feature_importance_CL, 30) + return features_10, features_20, features_30, features_40, features_all, tree_importance_sorted_idx_CL + + else: + rf_des_feature_importance = get_feature_importance_RF(input_array, y_input) #selected_X_sludge_padel_array, y_sludge_padel + tree_importance_sorted_idx = np.argsort(rf_des_feature_importance)[::-1] + features_des = np.array(input_descriptor.columns[2:]) # sludge_bayesian_merge_padel + features = find_top_features_subset(TOPS, features_des, tree_importance_sorted_idx) + features_10 = features['10'] + features_20 = features['20'] + features_30 = features['30'] + features_40 = features['40'] + features_all = features['all'] + plot_feature_importance(features_des, rf_des_feature_importance, 30) + return features_10, features_20, features_30, features_40, features_all, tree_importance_sorted_idx + +def run_model(X_sludge_des, y_sludge_des, model, m, model_name, features_10, features_20, features_30, features_40, features_all, soil_XY_merge_des): + r2_list = [] + rmse_list = [] + mae_list = [] + model_used = [] + combined_learning_used = [] + features_used = [] + descriptor_used = [] + # cv_score_list = [] + + dict_features_list = {'top 10': features_10, 'top 20': features_20, 'top 30': features_30, 'top 40': features_40, 'all': features_all} + for list_name, features_topX in dict_features_list.items(): # dict_features_list[list_name] + i = 1 + print("\nEvaluate model {} with {} features".format(model_name[m], list_name)) + # split the sludge dataset into validation set and input set + # X_sludge_pre, X_validate, y_sludge_pre, y_validate = train_test_split(X[features_topX], y, test_size=0.10, random_state=0) + # print("X_sludge shape before random splitting", X_sludge_pre.shape) + + while i <= ITERATIONS: + # # Randomly split the preprocessing sludge dataset into training set and test set + X_train_des, X_test_des, y_train_des, y_test_des = train_test_split(X_sludge_des, y_sludge_des, + test_size=0.2, random_state=i+10) + # cv = KFold(n_splits=5) + if COMBINED_LEARNING: + print('\tRun the {} iterations of combined learning.\tKeep calm and carry on!'.format(i)) + # Concate the split sludge data with soil data + X_train_CL = 
pd.concat([X_train_des.fillna(0), soil_XY_merge_des.iloc[:, 2:].fillna(0)], join='inner', ignore_index=True) # We lost 30 cpds in this step + y_train_CL = pd.concat([y_train_des, soil_XY_merge_des["hl_log_bayesian_mean"]], join='inner', ignore_index=True) + pipe = make_pipeline(StandardScaler(), model) #RobustScaler() + pipe.fit(X_train_CL[features_topX], y_train_CL) + y_test_pred = pipe.predict(X_test_des[features_topX]) + + r2_list.append(r2_score(y_test_des, y_test_pred)) + rmse_list.append(mean_squared_error(y_test_des, y_test_pred)) + mae_list.append(mean_absolute_error(y_test_des, y_test_pred)) + model_used.append(model_name[m]) + combined_learning_used.append(str(COMBINED_LEARNING)) + features_used.append(list_name) + descriptor_used.append(str(DESCRIPTORS)) + # cv_score_list.append(cross_val_score(pipe, X_train_CL[features_topX], y_train_CL, cv=cv)) + + else: + print('\tRun the {} iterations of Eawag-Sludge.\tKeep calm and carry on!'.format(i)) + # X_train_sludge, X_test_sludge, y_train_sludge, y_test_sludge = train_test_split(X_sludge_padel[features_topX], y_sludge_padel, test_size=0.20) + pipe = make_pipeline(StandardScaler(), model) + pipe.fit(X_train_des[features_topX], y_train_des) # X_train, y_train + y_test_pred = pipe.predict(X_test_des[features_topX]) + r2_list.append(r2_score(y_test_des, y_test_pred)) + rmse_list.append(mean_squared_error(y_test_des, y_test_pred)) + mae_list.append(mean_absolute_error(y_test_des, y_test_pred)) + model_used.append(model_name[m]) + combined_learning_used.append(str(COMBINED_LEARNING)) + features_used.append(list_name) + descriptor_used.append(str(DESCRIPTORS)) + # cv_score_list.append(cross_val_score(pipe, X_train_CL[features_topX], y_train_CL, cv=cv)) + i += 1 + print('R2:', round(np.mean(r2_list), 2), '\tRMSE:', round(np.mean(rmse_list), 2), '\tMAE:', round(np.mean(mae_list), 2)) + return r2_list, rmse_list, mae_list, model_used, combined_learning_used, features_used, descriptor_used + +def store_result(MODELS, model_name, X, y, features_10, features_20, features_30, features_40, features_all, soil_XY_merge_des): # + print("Prepare to collect the result..................") + model_dict_ = {'R2': [], 'RMSE': [], 'MAE': [], 'Model': [], 'Combined_learning': [], 'Selected_features': [], 'Descriptor': []} + for m in range(len(MODELS)): # for modle in MODELS: + r2_list, rmse_list, mae_list, model_used, combined_learning_used, features_used, descriptor_used = run_model(X, y, MODELS[m], m, model_name, features_10, features_20, features_30, features_40, features_all, soil_XY_merge_des) # X_sludge_padel, y_sludge_padel for sludge only + model_dict_['R2'].extend(r2_list) # X_combined_padel, y_combined_padel + model_dict_['RMSE'].extend(rmse_list) + model_dict_['MAE'].extend(mae_list) + # model_dict_['CV_SCORE'].extend(cv_score_list) + model_dict_['Selected_features'].extend(features_used) + model_dict_['Model'].extend(model_used) + model_dict_['Descriptor'].extend(descriptor_used) + model_dict_['Combined_learning'].extend(combined_learning_used) + return model_dict_ + +def plot_rmse(df): + + if COMBINED_LEARNING: + TAGS_rmse = "" + dt = df + else: + TAGS_rmse = "Without" + dt = df + + fig = plt.figure(1, figsize=(12, 6)) + sns.set_style("whitegrid") + sns.set_theme(style="ticks", palette="pastel") + sns.boxplot(x="Model", y="RMSE", width=0.6,hue="Selected_features", palette=["#47ad62", "#4d5154", "#2c79de", "#8a1506", "#ed6511"], data=dt, showfliers=False) + adjust_box_widths(fig, 0.6) + sns.despine(offset=10, trim=False) + plt.xlabel('') + 
plt.ylabel('RMSE') + plt.title('{} Combined Learning, {} descriptor'.format(TAGS_rmse, DESCRIPTORS), fontsize=20) + plt.savefig(output_path + 'figures\\rmse_CL_{}_{}_iter{}.png'.format(COMBINED_LEARNING, DESCRIPTORS, ITERATIONS)) + plt.close() + +def plot_r2(df): + + if COMBINED_LEARNING: + TAGS = "" + dt = df + else: + TAGS = "Without" + dt = df + + fig = plt.figure(1, figsize=(12, 6)) + sns.set_style("whitegrid") + sns.set_theme(style="ticks", palette="pastel") + sns.boxplot(x="Model", y="R2", width=0.6, hue="Selected_features", palette=["#47ad62", "#4d5154", "#2c79de", "#8a1506", "#ed6511"], data=dt, showfliers=False) + adjust_box_widths(fig, 0.7) + sns.despine(offset=10, trim=False) + # plt.axhline(0.1, linewidth=2, color='#bf6613') + plt.xlabel('') + plt.ylabel('$R^2$') + plt.title('{} Combined Learning, {} descriptor'.format(TAGS, DESCRIPTORS), fontsize=20) + plt.savefig(output_path+ 'figures\\r2_CL_{}_{}_iter{}.png'.format(COMBINED_LEARNING, DESCRIPTORS, ITERATIONS)) + plt.close() # prevent overwrite + +def get_sludge_subset_target_variable(des_X, target_Y): # des_X: df of descriptor, Y: df_soil or df_sludge, X, should be from the same package + # des_X_ = des_X.copy() + # des_X_.loc[:, 'package'] = 0 # assign the sludge label = 0 + target_Y_ = target_Y.copy() + aggs = [non_nan_mean] + df_subset_XY = target_Y_[["reduced_smiles", "hl_log_bayesian_mean", "package"]] # "temperature", "log_hl_combined", "log_hl_biomass_corrected" + XY = df_subset_XY.groupby(["reduced_smiles"]).agg(aggs).reset_index() + XY.columns = XY.columns.droplevel(1) + XY_merge_des = pd.merge(XY, des_X, how='left', on='reduced_smiles') + XY_merge_des.dropna(axis=0, inplace=True) + # XY_merge_des.drop(columns=['Unnamed: 0','topoShape'], inplace=True) + return XY_merge_des + +def index_merge_descriptor(df1, df2): + df_new = pd.merge(df1, df2, how='inner', on='index') + # print(df_new.shape) + return df_new + + +def merge_descriptor(df1, df2): + df_new = pd.merge(df1, df2, how='inner', on='reduced_smiles') + # print(df_new.shape) + return df_new + + +def main(): + + # df_soil_index_maccs = pd.read_csv(output_path + 'soil_MACCS_descriptors.txt', sep='\t').drop_duplicates() + # df_soil_index_rule = pd.read_csv(output_path + 'soil_rule_descriptors_TRIG.txt', sep='\t').drop_duplicates() + # df_soil_index_padel = pd.read_csv(output_path + 'soil_PaDEL_descriptors.txt', sep='\t').drop_duplicates() + # df_index_smiles = pd.read_csv(output_path + 'soil_model_input_all_reduced.txt', sep='\t') + # index_smiles = df_index_smiles.iloc[:, :3].drop(columns=['compound_id']) + # index_smiles.to_csv(output_output_path_model + 'descriptors\\soil_index_smiles.tsv', sep='\t') + + sludge_padel_descriptor = pd.read_csv(output_path_des + 'sludge_padel_descriptor.tsv', sep='\t') + sludge_maccs_descriptor = pd.read_csv(output_path_des + 'sludge_maccs_descriptor.tsv', sep='\t') # shape = (160, 485) + sludge_rule_descriptor = pd.read_csv(output_path_des + 'sludge_rule_descriptor.tsv', sep='\t') # shape = (160, 192) + sludge_padel_maccs_descriptor = pd.read_csv(output_path_des + 'sludge_padel_maccs_descriptor.tsv', sep='\t') + sludge_padel_rule_descriptor = pd.read_csv(output_path_des + 'sludge_padel_rule_descriptor.tsv', sep='\t') + sludge_maccs_rule_descriptor = pd.read_csv(output_path_des + 'sludge_maccs_rule_descriptor.tsv', sep='\t') + sludge_all_descriptor = pd.read_csv(output_path_des + 'sludge_all_descriptor.tsv', sep='\t') + + + soil_padel_descriptor = pd.read_csv(output_path_des + 'soil_padel_descriptor.tsv', sep='\t') + 
soil_maccs_descriptor = pd.read_csv(output_path_des + 'soil_maccs_descriptor.tsv', sep='\t') + soil_rule_descriptor = pd.read_csv(output_path_des + 'soil_rule_descriptor.tsv', sep='\t') + soil_padel_maccs_descriptor = pd.read_csv(output_path_des + 'soil_padel_maccs_descriptor.tsv', sep='\t') + soil_padel_rule_descriptor = pd.read_csv(output_path_des + 'soil_padel_rule_descriptor.tsv', sep='\t') + soil_maccs_rule_descriptor = pd.read_csv(output_path_des + 'soil_maccs_rule_descriptor.tsv', sep='\t') + soil_all_descriptor = pd.read_csv(output_path_des + 'soil_all_descriptor.tsv', sep='\t') + + soil_XY_merge_padel = pd.read_csv(output_path_des + "soil_XY_merge_padel.tsv", sep='\t') + soil_XY_merge_maccs = pd.read_csv(output_path_des + "soil_XY_merge_maccs.tsv", sep='\t') + soil_XY_merge_rule = pd.read_csv(output_path_des + "soil_XY_merge_rule.tsv", sep='\t') + soil_XY_merge_padel_maccs = pd.read_csv(output_path_des + "soil_XY_merge_padel_maccs.tsv", sep='\t') + soil_XY_merge_padel_rule = pd.read_csv(output_path_des + "soil_XY_merge_padel_rule.tsv", sep='\t') + soil_XY_merge_maccs_rule = pd.read_csv(output_path_des + "soil_XY_merge_maccs_rule.tsv", sep='\t') + soil_XY_merge_all = pd.read_csv(output_path_des + "soil_XY_merge_all.tsv", sep='\t') + + # sludge_XY_merge_padel = pd.read_csv(output_path+ "sludge_XY_merge_padel.tsv", sep='\t') + # sludge_XY_merge_maccs = pd.read_csv(output_path + "sludge_XY_merge_maccs.tsv", sep='\t') + # sludge_XY_merge_rule = pd.read_csv(output_path + "sludge_XY_merge_rule.tsv", sep='\t') + # sludge_XY_merge_padel_maccs = pd.read_csv(output_path + "sludge_XY_merge_padel_maccs.tsv", sep='\t') + # sludge_XY_merge_padel_rule = pd.read_csv(output_path + "sludge_XY_merge_padel_rule.tsv", sep='\t') + # sludge_XY_merge_maccs_rule = pd.read_csv(output_path + "sludge_XY_merge_maccs_rule.tsv", sep='\t') + # sludge_XY_merge_all = pd.read_csv(output_path + "sludge_XY_merge_all.tsv", sep='\t') + + + # for des_X in des_sludgeX_list: + + # sludge_XY_merge_des = get_subset_target_variable(des_X, df_sludge_, 'hl_log_bayesian_mean') + # X_sludge_des, X_validate, y_sludge_des, y_validate = train_test_split(sludge_XY_merge_des.iloc[:, 2:], sludge_XY_merge_des['hl_log_bayesian_mean'], test_size=0.10, random_state=62) + + soil_XY_merge_des = soil_XY_merge_padel + sludge_XY_merge_des = get_sludge_subset_target_variable(sludge_padel_descriptor, df_sludge_) + + X_sludge_des, X_validate, y_sludge_des, y_validate = train_test_split(sludge_XY_merge_des.iloc[:, 2:], sludge_XY_merge_des['hl_log_bayesian_mean'], test_size=0.10, random_state=62) + + + if COMBINED_LEARNING: + selected_X_combined_des_array, selected_features_combined, X_combined_des, y_combined_des = generate_df_for_collinearity_analysis(X_sludge_des, y_sludge_des, soil_XY_merge_des) + features_10, features_20, features_30, features_40, features_all, tree_importance_sorted_idx_CL = return_topX_features(TOPS, selected_X_combined_des_array, y_combined_des, X_combined_des) + df_top_feature_des = pd.DataFrame.from_dict(store_result(MODELS, model_name, X_sludge_des, y_sludge_des, features_10, features_20, features_30, features_40, features_all, soil_XY_merge_des)) + # COMBINED_LEARNING, ITERATIONS, MODELS, X, y, features_10, features_20, features_30, features_40, features_all + plot_rmse(df_top_feature_des) + plot_r2(df_top_feature_des) + df_top_feature_des.to_csv(output_path_model+"{}_CL{}_iter{}_cluster_threshold{}.tsv".format(DESCRIPTORS, COMBINED_LEARNING, ITERATIONS, CLUSTER_THRESHOLD), sep='\t', index=False) + else: + 
selected_X_sludge_des_array, selected_features_sludge, X_combined_des, y_combined_des = generate_df_for_collinearity_analysis(X_sludge_des, y_sludge_des, soil_XY_merge_des)
+        features_10, features_20, features_30, features_40, features_all, tree_importance_sorted_idx = return_topX_features(TOPS, selected_X_sludge_des_array, y_sludge_des, sludge_XY_merge_des)
+        df_top_feature_des_noCL = pd.DataFrame.from_dict(store_result(MODELS, model_name, X_sludge_des, y_sludge_des, features_10, features_20, features_30, features_40, features_all, soil_XY_merge_des))
+        plot_rmse(df_top_feature_des_noCL)
+        plot_r2(df_top_feature_des_noCL)
+        df_top_feature_des_noCL.to_csv(output_path_model+"{}_CL{}_iter{}_cluster_threshold{}.tsv".format(DESCRIPTORS, COMBINED_LEARNING, ITERATIONS, CLUSTER_THRESHOLD), sep='\t', index=False)
+        dict_features_list = {'top 10': features_10, 'top 20': features_20, 'top 30': features_30, 'top 40': features_40, 'all': features_all}
+
+if __name__ == '__main__':
+    main()
+
+
+
+
+
+
diff --git a/combined_learning_padel.py b/combined_learning_padel.py
new file mode 100644
index 0000000..339c1ee
--- /dev/null
+++ b/combined_learning_padel.py
@@ -0,0 +1,442 @@
+import pandas as pd
+import numpy as np
+import seaborn as sns
+import matplotlib.pyplot as plt
+from matplotlib.patches import PathPatch
+from collections import defaultdict
+import xgboost as xgb
+from sklearn import ensemble, neighbors, linear_model, gaussian_process
+from sklearn.svm import SVR
+from sklearn.neural_network import MLPRegressor
+from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
+from sklearn.pipeline import Pipeline, make_pipeline
+from sklearn.preprocessing import StandardScaler, RobustScaler
+from sklearn.model_selection import train_test_split, cross_val_score, KFold
+from sklearn.cross_decomposition import PLSRegression
+from sklearn.linear_model import ridge_regression, SGDRegressor, ElasticNet, Ridge
+from scipy.cluster import hierarchy
+from scipy.spatial.distance import squareform
+pd.options.display.max_columns = None
+
+
+# Load the soil dataset
+
+output_path = "C:\\Users\\leetseng\\TWtest\\output\\bay_area\\"
+output_path_des = output_path + "descriptors\\"
+output_path_model = output_path + "modeling\\"
+# soil_model_input_all_full.tsv is the new dataframe from Jasmin, who used RDKit to generate the reduced SMILES.
+# These files already contain the Bayesian mean and Bayesian std.
+file_name_soil = "soil_model_input_full.txt" #soil_model_input_bayes_curated_full.txt +file_name_sludge = "sludge_bay_PriorMuStd_3.tsv" #sludge_bayesian_PriorMuStd_09.tsv +df_soil_original = pd.read_csv(output_path + file_name_soil, sep='\t') +df_sludge_original = pd.read_csv(output_path + file_name_sludge, sep='\t') + +#Use the copy of dataframe for the later test and manipulation +df_soil_ = df_soil_original.copy() +df_sludge_ = df_sludge_original.copy() +df_sludge_.loc[:, 'package'] = 0 # O: sludge, 1: soil +df_soil_.loc[:, 'package'] = 1 + +list_of_reduced_smiles_soil = df_soil_['reduced_smiles'].values.tolist() #'canonicalize_smiles' +set_of_reduced_smiles_soil = set(list_of_reduced_smiles_soil) +print("Numbers of compound in soil: ", len(set_of_reduced_smiles_soil)) + +list_of_reduced_smiles_sludge = df_sludge_['reduced_smiles'].values.tolist() #'canonicalize_smiles' +set_of_reduced_smiles_sludge = set(list_of_reduced_smiles_sludge) +print("Numbers of compound in sludge: ", len(set_of_reduced_smiles_sludge)) + +# Load all the descriptor here +df_padel_sludge_remove_columns = pd.read_csv(output_path+'descriptors\\sludge_padel_rule_descriptor.tsv', sep='\t') +sludge_padel_descriptor = df_padel_sludge_remove_columns.copy() +# sludge_padel_descriptor.drop(columns=['Unnamed: 0'], inplace=True) + + +from sklearn.kernel_ridge import KernelRidge +from sklearn.linear_model import HuberRegressor, OrthogonalMatchingPursuit, Lars, RANSACRegressor, TheilSenRegressor + +TAGS = '' +ITERATIONS = 20 +DESCRIPTORS = 'PaDEL + btrules on Holdout Set' +DENDROGRAM = False +CLUSTER_THRESHOLD = 0.01 #0.00000000001 +COMBINED_LEARNING = False +TOPS = ['10', '20', '30', '40', 'all'] + +MODELS = [ + ensemble.RandomForestRegressor(), + xgb.XGBRegressor(), + ensemble.AdaBoostRegressor(), + SVR(), + neighbors.KNeighborsRegressor() + # ElasticNet() + # RANSACRegressor(), + # TheilSenRegressor() + # Ridge() + # KernelRidge(), + # OrthogonalMatchingPursuit() + # Lars() + # HuberRegressor() + # MLPRegressor(hidden_layer_sizes=(1200, 120), max_iter=1000) + # SGDRegressor(), + # reg, + # PLSRegression(n_components=3) + # gaussian_process.GaussianProcessRegressor() +] + +model_name = [ + "Random Forest", + "XGBoost", + "AdaBoost", + "SVR", + "k-NN" + # "ElasticNet" + # "RANSAC", + # "TheilSenRegressor" + # "Ridge Regressor", + # "Kernel Ridge", + # "OMP" + # "LARS" + # "Huber Regressor" + # "MLPregressor" + # "SGD Regressor", + # "StackReg", + # "Partial Least-Squares Regressor" + # "Gaussian Process" +] + +def adjust_box_widths(g, fac): + """ + Adjust the withs of a seaborn-generated boxplot. 
+ """ + # iterating through Axes instances + for ax in g.axes: + + # iterating through axes artists: + for c in ax.get_children(): + + # searching for PathPatches + if isinstance(c, PathPatch): + # getting current width of box: + p = c.get_path() + verts = p.vertices + verts_sub = verts[:-1] + xmin = np.min(verts_sub[:, 0]) + xmax = np.max(verts_sub[:, 0]) + xmid = 0.5*(xmin+xmax) + xhalf = 0.5*(xmax - xmin) + + # setting new width of box + xmin_new = xmid-fac*xhalf + xmax_new = xmid+fac*xhalf + verts_sub[verts_sub[:, 0] == xmin, 0] = xmin_new + verts_sub[verts_sub[:, 0] == xmax, 0] = xmax_new + + # setting new width of median line + for l in ax.lines: + if np.all(l.get_xdata() == [xmin, xmax]): + l.set_xdata([xmin_new, xmax_new]) + +def non_nan_mean(x): + if x.empty: + return None + else: + x = x.dropna() + return np.mean(x) + +def check_collinearity(input_df): # CLUSTER_THRESHOLD, + print('\nCheck collinearity..................\t') + feature_names = input_df.columns + corr = input_df.corr('spearman').to_numpy() + corr = (corr + corr.T) / 2 # Ensure the correlation matrix is symmetric + np.fill_diagonal(corr, 1) + corr = np.nan_to_num(corr) + distance_matrix = 1 - np.abs(corr) # Convert the correlation matrix to a distance matrix + dist_linkage = hierarchy.ward( + squareform(distance_matrix)) # hierarchical clustering using Ward's linkage, return a ndarray + + if DENDROGRAM: + size = len(feature_names) / 6.5 + fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(size * 2, size)) + dendro = hierarchy.dendrogram(dist_linkage, labels=feature_names, ax=ax1, leaf_rotation=90) + dendro_idx = np.arange(0, len(dendro["ivl"])) + ax2.imshow(corr[dendro["leaves"], :][:, dendro["leaves"]]) + ax2.set_xticks(feature_names[dendro_idx]) + ax2.set_yticks(dendro_idx) + ax2.set_xticklabels(dendro["ivl"], rotation="vertical") + ax2.set_yticklabels(dendro["ivl"]) + fig.tight_layout() + plt.show() + + cluster_ids = hierarchy.fcluster(dist_linkage, CLUSTER_THRESHOLD, criterion="distance") # return a list + cluster_id_to_feature_ids = defaultdict(list) # create a default dict + + for idx, cluster_id in enumerate(cluster_ids): # generate the tuple [(0, ""), (1, ""), (2, "")] + cluster_id_to_feature_ids[cluster_id].append(idx) # use the 'cluster_id' as the key + selected_feature_ids = [v[0] for v in cluster_id_to_feature_ids.values()] # Each v is a list that might contins single or multiple cluster_id, here only thake the first element v[0] as the representative cludter_id + # Transform the inupt_df into np.array and tranpose so that the list in each row is the data of each features. + # We use the [i] to select the features that list in selected_features_ids. + # After the List Comprehension, you need to convert the list back to the np.array so that you can transpose to original direction of the column and row. 
+ new_X = np.array([np.array(input_df).T[i] for i in selected_feature_ids]).T + selected_features = [feature_names[i] for i in selected_feature_ids] + + print('Descriptors after applying cluster threshold of {}: {}'.format(CLUSTER_THRESHOLD, len(selected_features))) + D = {'new_X': new_X, 'selected_features': selected_features} + return new_X, selected_features + +def generate_df_for_collinearity_analysis(X_sludge_padel, y_sludge_padel, soil_bayesian_merge_padel_reduced): + + if COMBINED_LEARNING: + X_combined_padel = pd.concat([X_sludge_padel, soil_bayesian_merge_padel_reduced.iloc[:, 2:]], join='inner') + y_combined_padel = pd.concat([y_sludge_padel, soil_bayesian_merge_padel_reduced['hl_log_bayesian_mean']], join='inner') + # When you run the combined learning, you first check the collinearity on soil and sludge together since the concatenated dataset might have some undiscover relationship. + selected_X_combined_padel_array, selected_features_combined = check_collinearity(X_combined_padel) #Return the nd.array and list of selected features # cluster threshold: 0.02, Descriptors: 1085 + return selected_X_combined_padel_array, selected_features_combined, X_combined_padel, y_combined_padel + else: + X_combined_padel = "Not exist" + y_combined_padel = "Not exist" + selected_X_sludge_padel_array, selected_features_sludge = check_collinearity(X_sludge_padel) # cluster threshold: 0.02, Descriptors: 1068 + return selected_X_sludge_padel_array, selected_features_sludge, X_combined_padel, y_combined_padel + +def get_feature_importance_RF(X, y): + model = ensemble.RandomForestRegressor() + r2_list = [] + rmse_list = [] + mae_list = [] + importance_list = [] + + i = 1 + print('\nFind feature importance using RF..................') + while i <= ITERATIONS: + print('Running {} iterations'.format(i)) + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=i+10) + pipe = make_pipeline(StandardScaler(), model) + pipe.fit(X_train, y_train) + y_test_pred = pipe.predict(X_test) + r2_list.append(r2_score(y_test, y_test_pred)) + rmse_list.append(mean_squared_error(y_test, y_test_pred)) + mae_list.append(mean_absolute_error(y_test, y_test_pred)) + importance_list.append(model.feature_importances_) + i += 1 + + R2_mean = np.mean(r2_list) + RMSE_mean = np.mean(rmse_list) + MAE_mean = np.mean(mae_list) + mean_feature_importance = np.mean(np.array(importance_list).T, axis=1) + print("R2: {}, RMSE: {}, MAE: {}".format(round(R2_mean, 2), round(RMSE_mean, 2), round(MAE_mean, 2))) + return mean_feature_importance + + +def plot_feature_importance(ls_feature, mean_feature_importance, topx): + tree_importance_sorted_idx = np.argsort(mean_feature_importance)[::-1] #This is the last model + tree_indices_rule = np.arange(0, len(mean_feature_importance))[::-1] + fig, ax1 = plt.subplots(1, 1, figsize=(6, 6)) + ax1.barh(tree_indices_rule[:topx], mean_feature_importance[tree_importance_sorted_idx][:topx], height=0.4) + ax1.set_yticks(tree_indices_rule[:topx]) + ax1.set_yticklabels(ls_feature[tree_importance_sorted_idx][:topx]) #The features from sludge data set. 
+ fig.tight_layout() + # plt.show() + plt.savefig(output_path + 'figures\\modelling\\RRR_{}_CL{}_RF_top_{}_feature_importance_iter{}_clsThreshold_{}.png'.format(DESCRIPTORS, COMBINED_LEARNING, topx, ITERATIONS, CLUSTER_THRESHOLD)) + plt.close() + + +def find_top_features_subset(TOPS, features_, tree_importance_sorted_idx_): + features = {} + + for top in TOPS: + if not top.isdigit(): + features[top] = [f for f in features_[tree_importance_sorted_idx_][:]] + features[top] + continue + + features[top] = [f for f in features_[tree_importance_sorted_idx_][:int(top)]] + features[top].append('package') + + return features + + +def return_topX_features(TOPS, input_array, y_input, input_descriptor): # Input True or False + print('\nReturn topX features subset..................') + if COMBINED_LEARNING: + rf_padel_feature_importance_CL = get_feature_importance_RF(input_array, y_input) #selected_X_combined_padel_array, y_combined_padel + tree_importance_sorted_idx_CL = np.argsort(rf_padel_feature_importance_CL)[::-1] + features_padel_CL = np.array(input_descriptor.columns) # X_combined_padel + features_CL = find_top_features_subset(TOPS, features_padel_CL, tree_importance_sorted_idx_CL) + features_10 = features_CL['10'] + features_20 = features_CL['20'] + features_30 = features_CL['30'] + features_40 = features_CL['40'] + features_all = features_CL['all'] + plot_feature_importance(features_padel_CL, rf_padel_feature_importance_CL, 30) + return features_10, features_20, features_30, features_40, features_all, tree_importance_sorted_idx_CL + + else: + rf_padel_feature_importance = get_feature_importance_RF(input_array, y_input) #selected_X_sludge_padel_array, y_sludge_padel + tree_importance_sorted_idx = np.argsort(rf_padel_feature_importance)[::-1] + features_padel = np.array(input_descriptor.columns[2:]) # sludge_bayesian_merge_padel + features = find_top_features_subset(TOPS, features_padel, tree_importance_sorted_idx) + features_10 = features['10'] + features_20 = features['20'] + features_30 = features['30'] + features_40 = features['40'] + features_all = features['all'] + plot_feature_importance(features_padel, rf_padel_feature_importance, 30) + return features_10, features_20, features_30, features_40, features_all, tree_importance_sorted_idx + +def run_model(X_sludge_padel, y_sludge_padel, model, m, model_name, features_10, features_20, features_30, features_40, features_all, soil_bayesian_merge_padel_reduced): + r2_list = [] + rmse_list = [] + mae_list = [] + model_used = [] + combined_learning_used = [] + features_used = [] + descriptor_used = [] + # cv_score_list = [] + + dict_features_list = {'top 10': features_10, 'top 20': features_20, 'top 30': features_30, 'top 40': features_40, 'all': features_all} + for list_name, features_topX in dict_features_list.items(): # dict_features_list[list_name] + i = 1 + print("\nEvaluate model {} with {} features".format(model_name[m], list_name)) + # split the sludge dataset into validation set and input set + # X_sludge_pre, X_validate, y_sludge_pre, y_validate = train_test_split(X[features_topX], y, test_size=0.10, random_state=0) + # print("X_sludge shape before random splitting", X_sludge_pre.shape) + + while i <= ITERATIONS: + # # Randomly split the preprocessing sludge dataset into training set and test set + X_train_sludge, X_test_sludge, y_train_sludge, y_test_sludge = train_test_split(X_sludge_padel, y_sludge_padel, test_size=0.20, random_state=i+20) + # cv = KFold(n_splits=5) + if COMBINED_LEARNING: + print('\tRun the {} iterations of combined 
learning.\tKeep calm and carry on!'.format(i)) + # Concate the split sludge data with soil data + X_train_CL = pd.concat([X_train_sludge, soil_bayesian_merge_padel_reduced.iloc[:, 2:]], join='inner', ignore_index=True) # We lost 30 cpds in this step + y_train_CL = pd.concat([y_train_sludge, soil_bayesian_merge_padel_reduced["hl_log_bayesian_mean"]], join='inner', ignore_index=True) + pipe = make_pipeline(StandardScaler(), model) #RobustScaler() + pipe.fit(X_train_CL[features_topX], y_train_CL) + y_test_pred = pipe.predict(X_test_sludge[features_topX]) + r2_list.append(r2_score(y_test_sludge, y_test_pred)) + rmse_list.append(mean_squared_error(y_test_sludge, y_test_pred)) + mae_list.append(mean_absolute_error(y_test_sludge, y_test_pred)) + model_used.append(model_name[m]) + combined_learning_used.append(str(COMBINED_LEARNING)) + features_used.append(list_name) + descriptor_used.append(str(DESCRIPTORS)) + # cv_score_list.append(cross_val_score(pipe, X_train_CL[features_topX], y_train_CL, cv=cv)) + + else: + print('\tRun the {} iterations of Eawag-Sludge.\tKeep calm and carry on!'.format(i)) + # X_train_sludge, X_test_sludge, y_train_sludge, y_test_sludge = train_test_split(X_sludge_padel[features_topX], y_sludge_padel, test_size=0.20) + pipe = make_pipeline(StandardScaler(), model) + pipe.fit(X_train_sludge[features_topX], y_train_sludge) # X_train, y_train + y_test_pred = pipe.predict(X_test_sludge[features_topX]) + r2_list.append(r2_score(y_test_sludge, y_test_pred)) + rmse_list.append(mean_squared_error(y_test_sludge, y_test_pred)) + mae_list.append(mean_absolute_error(y_test_sludge, y_test_pred)) + model_used.append(model_name[m]) + combined_learning_used.append(str(COMBINED_LEARNING)) + features_used.append(list_name) + descriptor_used.append(str(DESCRIPTORS)) + # cv_score_list.append(cross_val_score(pipe, X_train_CL[features_topX], y_train_CL, cv=cv)) + i += 1 + print('R2:', round(np.mean(r2_list), 2), '\tRMSE:', round(np.mean(rmse_list), 2), '\tMAE:', round(np.mean(mae_list), 2)) + return r2_list, rmse_list, mae_list, model_used, combined_learning_used, features_used, descriptor_used + +def store_result(MODELS, model_name, X, y, features_10, features_20, features_30, features_40, features_all, soil_bayesian_merge_padel_reduced): # + print("Prepare to collect the result..................") + model_dict_ = {'R2': [], 'RMSE': [], 'MAE': [], 'Model': [], 'Combined_learning': [], 'Selected_features': [], 'Descriptor': []} + for m in range(len(MODELS)): # for modle in MODELS: + r2_list, rmse_list, mae_list, model_used, combined_learning_used, features_used, descriptor_used = run_model(X, y, MODELS[m], m, model_name, features_10, features_20, features_30, features_40, features_all, soil_bayesian_merge_padel_reduced) # X_sludge_padel, y_sludge_padel for sludge only + model_dict_['R2'].extend(r2_list) # X_combined_padel, y_combined_padel + model_dict_['RMSE'].extend(rmse_list) + model_dict_['MAE'].extend(mae_list) + # model_dict_['CV_SCORE'].extend(cv_score_list) + model_dict_['Selected_features'].extend(features_used) + model_dict_['Model'].extend(model_used) + model_dict_['Descriptor'].extend(descriptor_used) + model_dict_['Combined_learning'].extend(combined_learning_used) + return model_dict_ + +def plot_rmse(df): + + if COMBINED_LEARNING: + TAGS_rmse = "" + dt = df + else: + TAGS_rmse = "Without" + dt = df + + fig = plt.figure(1, figsize=(12, 6)) + sns.set_style("whitegrid") + sns.set_theme(style="ticks", palette="pastel") + sns.boxplot(x="Model", y="RMSE", 
width=0.6,hue="Selected_features", palette=["#47ad62", "#4d5154", "#2c79de", "#8a1506", "#ed6511"], data=dt, showfliers=False) + adjust_box_widths(fig, 0.6) + sns.despine(offset=10, trim=False) + plt.xlabel('') + plt.ylabel('RMSE') + plt.title('{} Combined Learning, {}'.format(TAGS_rmse, DESCRIPTORS), fontsize=20) + plt.savefig(output_path + 'figures\\modelling\\RRR_rmse_CL_{}_{}_iter{}_ClsThres_{}.png'.format(COMBINED_LEARNING, DESCRIPTORS, ITERATIONS, CLUSTER_THRESHOLD)) + plt.close() + +def plot_r2(df): + + if COMBINED_LEARNING: + TAGS = "" + dt = df + else: + TAGS = "Without" + dt = df + + fig = plt.figure(1, figsize=(12, 6)) + sns.set_style("whitegrid") + sns.set_theme(style="ticks", palette="pastel") + sns.boxplot(x="Model", y="R2", width=0.6, hue="Selected_features", palette=["#47ad62", "#4d5154", "#2c79de", "#8a1506", "#ed6511"], data=dt, showfliers=False) + adjust_box_widths(fig, 0.7) + sns.despine(offset=10, trim=False) + # plt.axhline(0.1, linewidth=2, color='#bf6613') + plt.xlabel('') + plt.ylabel('$R^2$') + plt.title('{} Combined Learning, {}'.format(TAGS, DESCRIPTORS), fontsize=20) + plt.savefig(output_path + 'figures\\modelling\\RRR_r2_CL_{}_{}_iter{}_CLsThres_{}.png'.format(COMBINED_LEARNING, DESCRIPTORS, ITERATIONS, CLUSTER_THRESHOLD)) + plt.close() # prevent overwrite + + +def main(): + + aggs = [non_nan_mean] + df_subset_sludge = df_sludge_[["reduced_smiles", "hl_log_bayesian_mean", "package"]] #"temperature", "log_hl_combined", "log_hl_biomass_corrected" + df_subset_sludge_1 = df_subset_sludge.groupby(["reduced_smiles"]).agg(aggs).reset_index() + df_subset_sludge_1.columns = df_subset_sludge_1.columns.droplevel(1) + sludge_bayesian_merge_padel = pd.merge(df_subset_sludge_1, sludge_padel_descriptor, how='left', on='reduced_smiles').dropna() + + soil_bayesian_merge_padel_reduced = pd.read_csv(output_path_des +"soil_XY_merge_padel_rule.tsv", sep="\t") + soil_bayesian_merge_padel_reduced.dropna(axis=0, inplace=True) + # soil_bayesian_merge_padel_reduced.drop(columns=['Unnamed: 0', 'topoShape'], inplace=True) #contains subset of all target variable (without pH), and reduced descriptors + + # Keep the hold-out set + X_sludge_padel, X_validate, y_sludge_padel, y_validate = train_test_split(sludge_bayesian_merge_padel.iloc[:, 2:], + sludge_bayesian_merge_padel['hl_log_bayesian_mean'], + test_size=0.10, + random_state=62) + if COMBINED_LEARNING: + selected_X_combined_padel_array, selected_features_combined, X_combined_padel, y_combined_padel = generate_df_for_collinearity_analysis(X_validate, y_validate, soil_bayesian_merge_padel_reduced) + features_10, features_20, features_30, features_40, features_all, tree_importance_sorted_idx_CL = return_topX_features(TOPS, selected_X_combined_padel_array, y_combined_padel, X_combined_padel) + df_top_feature_padel = pd.DataFrame.from_dict(store_result(MODELS, model_name, X_sludge_padel, y_sludge_padel, features_10, features_20, features_30, features_40, features_all, soil_bayesian_merge_padel_reduced)) + # COMBINED_LEARNING, ITERATIONS, MODELS, X, y, features_10, features_20, features_30, features_40, features_all + plot_rmse(df_top_feature_padel) + plot_r2(df_top_feature_padel) + df_top_feature_padel.to_csv(output_path_model + "{}_CL{}_iter{}_cluster_threshold{}.tsv".format(DESCRIPTORS, COMBINED_LEARNING, ITERATIONS, CLUSTER_THRESHOLD), sep='\t', index=False) + else: + selected_X_sludge_padel_array, selected_features_sludge, X_combined_padel, y_combined_padel = generate_df_for_collinearity_analysis(X_validate, y_validate, 
soil_bayesian_merge_padel_reduced)
+        features_10, features_20, features_30, features_40, features_all, tree_importance_sorted_idx = return_topX_features(TOPS, selected_X_sludge_padel_array, y_validate, sludge_bayesian_merge_padel)
+        df_top_feature_padel_noCL = pd.DataFrame.from_dict(store_result(MODELS, model_name, X_sludge_padel, y_sludge_padel, features_10, features_20, features_30, features_40, features_all, soil_bayesian_merge_padel_reduced))
+        plot_rmse(df_top_feature_padel_noCL)
+        plot_r2(df_top_feature_padel_noCL)
+        df_top_feature_padel_noCL.to_csv(output_path_model + "{}_CL{}_iter{}_cluster_threshold{}.tsv".format(DESCRIPTORS, COMBINED_LEARNING, ITERATIONS, CLUSTER_THRESHOLD), sep='\t', index=False)
+        dict_features_list = {'top 10': features_10, 'top 20': features_20, 'top 30': features_30, 'top 40': features_40, 'all': features_all}
+
+if __name__ == '__main__':
+    main()
+
+
+
+
+