diff --git a/combined_learning.py b/combined_learning.py
index 36f289f..0b16fb4 100644
--- a/combined_learning.py
+++ b/combined_learning.py
@@ -1,447 +1,419 @@
 import pandas as pd
 import numpy as np
 import seaborn as sns
 import matplotlib.pyplot as plt
 from matplotlib.patches import PathPatch
 from collections import defaultdict
-from sklearn import ensemble
-from sklearn.svm import SVR
-from sklearn.model_selection import train_test_split, cross_val_score
-from sklearn.feature_selection import VarianceThreshold
+import xgboost as xgb
 from sklearn import ensemble, neighbors, linear_model, gaussian_process
-from sklearn.linear_model import ElasticNet
-from sklearn.neighbors import KNeighborsRegressor
+from sklearn.svm import SVR
 from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
-from sklearn.preprocessing import StandardScaler
 from sklearn.pipeline import Pipeline, make_pipeline
+from sklearn.preprocessing import StandardScaler
+from sklearn.model_selection import train_test_split, cross_val_score
 from sklearn.cross_decomposition import PLSRegression
-import xgboost as xgb
-from sklearn.preprocessing import FunctionTransformer
-from sklearn.ensemble import StackingRegressor
-from sklearn.cross_decomposition import PLSRegression
-from sklearn.linear_model import ridge_regression, SGDRegressor
-from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
+from sklearn.linear_model import ridge_regression, SGDRegressor, ElasticNet
 from scipy.cluster import hierarchy
 from scipy.spatial.distance import squareform
 
 pd.options.display.max_columns = None
 
 # Load the soil dataset
 input_file_path = "C:\\Users\\leetseng\\TWtest\\input\\"
 output_file_path = "C:\\Users\\leetseng\\TWtest\\output\\"
 # soil_model_input_all_full.tsv is the new df from Jasmin, where she used RDKit to generate the reduced SMILES.
 # This file already contains the Bayesian mean and Bayesian std.
 file_name_soil = "soil_model_input_all_full.tsv"  # soil_model_input_bayes_curated_full.txt
 file_name_sludge = "sludge_bayesian_PriorMuStd_2.tsv"  # sludge_bayesian_PriorMuStd_09.tsv
 df_soil_original = pd.read_csv(input_file_path + file_name_soil, sep='\t')
 df_sludge_original = pd.read_csv(input_file_path + file_name_sludge, sep='\t')
 
 # Work on copies of the dataframes for the later tests and manipulation
 df_soil_ = df_soil_original.copy()
 df_sludge_ = df_sludge_original.copy()
 df_sludge_.loc[:, 'package'] = 0  # 0: sludge, 1: soil
 df_soil_.loc[:, 'package'] = 1
 
 list_of_reduced_smiles_soil = df_soil_['reduced_smiles'].values.tolist()  # 'canonicalize_smiles'
 set_of_reduced_smiles_soil = set(list_of_reduced_smiles_soil)
 print("Number of compounds in soil: ", len(set_of_reduced_smiles_soil))
 list_of_reduced_smiles_sludge = df_sludge_['reduced_smiles'].values.tolist()  # 'canonicalize_smiles'
 set_of_reduced_smiles_sludge = set(list_of_reduced_smiles_sludge)
 print("Number of compounds in sludge: ", len(set_of_reduced_smiles_sludge))
 
 # Load all the descriptors here
 df_padel_sludge_remove_columns = pd.read_csv(output_file_path + 'descriptors\\sludge_trimpadel_descriptor.tsv', sep='\t')
 sludge_padel_descriptor = df_padel_sludge_remove_columns.copy()
 sludge_padel_descriptor.drop(columns=['Unnamed: 0'], inplace=True)
 
+TAGS = ''
+ITERATIONS = 20
+DESCRIPTORS = 'PaDEL'
+DENDROGRAM = False
+CLUSTER_THRESHOLD = 0.01
+COMBINED_LEARNING = True
+TOPS = ['10', '20', '30', '40', 'all']
+
""" # iterating through Axes instances for ax in g.axes: # iterating through axes artists: for c in ax.get_children(): # searching for PathPatches if isinstance(c, PathPatch): # getting current width of box: p = c.get_path() verts = p.vertices verts_sub = verts[:-1] xmin = np.min(verts_sub[:, 0]) xmax = np.max(verts_sub[:, 0]) xmid = 0.5*(xmin+xmax) xhalf = 0.5*(xmax - xmin) # setting new width of box xmin_new = xmid-fac*xhalf xmax_new = xmid+fac*xhalf verts_sub[verts_sub[:, 0] == xmin, 0] = xmin_new verts_sub[verts_sub[:, 0] == xmax, 0] = xmax_new # setting new width of median line for l in ax.lines: if np.all(l.get_xdata() == [xmin, xmax]): l.set_xdata([xmin_new, xmax_new]) def non_nan_mean(x): if x.empty: return None else: x = x.dropna() return np.mean(x) -def check_collinearity(CLUSTER_THRESHOLD, DENDROGRAM, input_df): +def check_collinearity(input_df): # CLUSTER_THRESHOLD, print('\nCheck collinearity..................\t') feature_names = input_df.columns corr = input_df.corr('spearman').to_numpy() corr = (corr + corr.T) / 2 # Ensure the correlation matrix is symmetric np.fill_diagonal(corr, 1) corr = np.nan_to_num(corr) distance_matrix = 1 - np.abs(corr) # Convert the correlation matrix to a distance matrix dist_linkage = hierarchy.ward( squareform(distance_matrix)) # hierarchical clustering using Ward's linkage, return a ndarray if DENDROGRAM: size = len(feature_names) / 6.5 fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(size * 2, size)) dendro = hierarchy.dendrogram(dist_linkage, labels=feature_names, ax=ax1, leaf_rotation=90) dendro_idx = np.arange(0, len(dendro["ivl"])) ax2.imshow(corr[dendro["leaves"], :][:, dendro["leaves"]]) ax2.set_xticks(feature_names[dendro_idx]) ax2.set_yticks(dendro_idx) ax2.set_xticklabels(dendro["ivl"], rotation="vertical") ax2.set_yticklabels(dendro["ivl"]) fig.tight_layout() plt.show() cluster_ids = hierarchy.fcluster(dist_linkage, CLUSTER_THRESHOLD, criterion="distance") # return a list - # print("Cludter_ids:", cluster_ids) cluster_id_to_feature_ids = defaultdict(list) # create a default dict for idx, cluster_id in enumerate(cluster_ids): # generate the tuple [(0, ""), (1, ""), (2, "")] cluster_id_to_feature_ids[cluster_id].append(idx) # use the 'cluster_id' as the key selected_feature_ids = [v[0] for v in cluster_id_to_feature_ids.values()] # Each v is a list that might contins single or multiple cluster_id, here only thake the first element v[0] as the representative cludter_id # Transform the inupt_df into np.array and tranpose so that the list in each row is the data of each features. # We use the [i] to select the features that list in selected_features_ids. # After the List Comprehension, you need to convert the list back to the np.array so that you can transpose to original direction of the column and row. 
-def generate_df_for_collinearity_analysis(CLUSTER_THRESHOLD, DENDROGRAM, COMBINED_LEARNING_COLLINEARITY, X_sludge_padel, y_sludge_padel, soil_bayesian_merge_padel_reduced):
+def generate_df_for_collinearity_analysis(X_sludge_padel, y_sludge_padel, soil_bayesian_merge_padel_reduced):
-    if COMBINED_LEARNING_COLLINEARITY:
+    if COMBINED_LEARNING:
         X_combined_padel = pd.concat([X_sludge_padel, soil_bayesian_merge_padel_reduced.iloc[:, 2:]], join='inner')
         y_combined_padel = pd.concat([y_sludge_padel, soil_bayesian_merge_padel_reduced['hl_log_bayesian_mean']], join='inner')
         # For combined learning, first check collinearity on soil and sludge together, since the concatenated dataset might contain undiscovered relationships.
-        selected_X_combined_padel_array, selected_features_combined = check_collinearity(CLUSTER_THRESHOLD, DENDROGRAM, X_combined_padel)  # Returns the ndarray and the list of selected features  # cluster threshold: 0.02, descriptors: 1085
+        selected_X_combined_padel_array, selected_features_combined = check_collinearity(X_combined_padel)  # Returns the ndarray and the list of selected features  # cluster threshold: 0.02, descriptors: 1085
         return selected_X_combined_padel_array, selected_features_combined, X_combined_padel, y_combined_padel
     else:
         X_combined_padel = "Not exist"
         y_combined_padel = "Not exist"
-        selected_X_sludge_padel_array, selected_features_sludge = check_collinearity(CLUSTER_THRESHOLD, DENDROGRAM, X_sludge_padel)  # cluster threshold: 0.02, descriptors: 1068
+        selected_X_sludge_padel_array, selected_features_sludge = check_collinearity(X_sludge_padel)  # cluster threshold: 0.02, descriptors: 1068
         return selected_X_sludge_padel_array, selected_features_sludge, X_combined_padel, y_combined_padel
 
-def get_feature_importance_RF(X, y, iterations):
+def get_feature_importance_RF(X, y):
     model = ensemble.RandomForestRegressor()
     r2_list = []
     rmse_list = []
     mae_list = []
     importance_list = []
     i = 1
     print('\nFind feature importance using RF..................')
-    while i <= iterations:
+    while i <= ITERATIONS:
         print('Running iteration {}'.format(i))
         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
         pipe = make_pipeline(StandardScaler(), model)
         pipe.fit(X_train, y_train)
         y_test_pred = pipe.predict(X_test)
         r2_list.append(r2_score(y_test, y_test_pred))
         rmse_list.append(mean_squared_error(y_test, y_test_pred))  # NOTE: mean_squared_error returns the MSE, not the RMSE
         mae_list.append(mean_absolute_error(y_test, y_test_pred))
         importance_list.append(model.feature_importances_)  # the pipeline fits `model` in place, so its importances are available here
         i += 1
     R2_mean = np.mean(r2_list)
     RMSE_mean = np.mean(rmse_list)
     MAE_mean = np.mean(mae_list)
     mean_feature_importance = np.mean(np.array(importance_list).T, axis=1)
-    print("R2: {}, RMSE: {}, MAE: {}".format(round(R2_mean,2), round(RMSE_mean,2), round(MAE_mean,2)))
+    print("R2: {}, RMSE: {}, MAE: {}".format(round(R2_mean, 2), round(RMSE_mean, 2), round(MAE_mean, 2)))
     return mean_feature_importance
 
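
# Why `model.feature_importances_` is readable after fitting the pipeline
# above: make_pipeline stores a reference to the very same estimator object,
# so fitting the pipeline fits `model` in place. A small check on synthetic
# data (all names below are invented for the illustration):
import numpy as np
from sklearn import ensemble
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(1)
X_toy = rng.normal(size=(100, 3))
y_toy = 3 * X_toy[:, 0] + rng.normal(size=100)      # only feature 0 is informative
rf = ensemble.RandomForestRegressor(random_state=0)
make_pipeline(StandardScaler(), rf).fit(X_toy, y_toy)
print(rf.feature_importances_)                      # feature 0 should dominate
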
-def plot_feature_importance(DESCRIPTORS, ls_feature, mean_feature_importance, topx, COMBINED_LEARNING_COLLINEARITY):
+def plot_feature_importance(ls_feature, mean_feature_importance, topx):
     tree_importance_sorted_idx = np.argsort(mean_feature_importance)[::-1]  # importances from the last model
     tree_indices_rule = np.arange(0, len(mean_feature_importance))[::-1]
     fig, ax1 = plt.subplots(1, 1, figsize=(6, 6))
     ax1.barh(tree_indices_rule[:topx], mean_feature_importance[tree_importance_sorted_idx][:topx], height=0.4)
     ax1.set_yticks(tree_indices_rule[:topx])
     ax1.set_yticklabels(ls_feature[tree_importance_sorted_idx][:topx])  # The features from the sludge dataset.
     fig.tight_layout()
     # plt.show()
-    plt.savefig(output_file_path + 'figures\\modelling\\{}_CL{}_RF_top_{}_feature_importance.png'.format(DESCRIPTORS, COMBINED_LEARNING_COLLINEARITY, topx))
+    plt.savefig(output_file_path + 'figures\\modelling\\{}_CL{}_RF_top_{}_feature_importance.png'.format(DESCRIPTORS, COMBINED_LEARNING, topx))
     plt.close()
 
 def find_top_features_subset(TOPS, features_, tree_importance_sorted_idx_):
     features = {}
     for top in TOPS:
         if not top.isdigit():
             features[top] = [f for f in features_[tree_importance_sorted_idx_][:]]
             continue
         features[top] = [f for f in features_[tree_importance_sorted_idx_][:int(top)]]
         features[top].append('package')
     return features
 
-def return_topX_features(DESCRIPTORS, TOPS, COMBINED_LEARNING_COLLINEARITY, input_array, y_input, input_descriptor, iterations):  # Input True or False
+def return_topX_features(TOPS, input_array, y_input, input_descriptor):
     print('\nReturn topX features subset..................')
-    if COMBINED_LEARNING_COLLINEARITY:
-        rf_padel_feature_importance_CL = get_feature_importance_RF(input_array, y_input, iterations)  # selected_X_combined_padel_array, y_combined_padel
+    if COMBINED_LEARNING:
+        rf_padel_feature_importance_CL = get_feature_importance_RF(input_array, y_input)  # selected_X_combined_padel_array, y_combined_padel
         tree_importance_sorted_idx_CL = np.argsort(rf_padel_feature_importance_CL)[::-1]
         features_padel_CL = np.array(input_descriptor.columns)  # X_combined_padel
         features_CL = find_top_features_subset(TOPS, features_padel_CL, tree_importance_sorted_idx_CL)
         features_10 = features_CL['10']
         features_20 = features_CL['20']
         features_30 = features_CL['30']
         features_40 = features_CL['40']
         features_all = features_CL['all']
-        plot_feature_importance(DESCRIPTORS, features_padel_CL, rf_padel_feature_importance_CL, 30, COMBINED_LEARNING_COLLINEARITY)
+        plot_feature_importance(features_padel_CL, rf_padel_feature_importance_CL, 30)
         return features_10, features_20, features_30, features_40, features_all, tree_importance_sorted_idx_CL
     else:
-        rf_padel_feature_importance = get_feature_importance_RF(input_array, y_input, iterations)  # selected_X_sludge_padel_array, y_sludge_padel
+        rf_padel_feature_importance = get_feature_importance_RF(input_array, y_input)  # selected_X_sludge_padel_array, y_sludge_padel
         tree_importance_sorted_idx = np.argsort(rf_padel_feature_importance)[::-1]
         features_padel = np.array(input_descriptor.columns[2:])  # sludge_bayesian_merge_padel
         features = find_top_features_subset(TOPS, features_padel, tree_importance_sorted_idx)
         features_10 = features['10']
         features_20 = features['20']
         features_30 = features['30']
         features_40 = features['40']
         features_all = features['all']
-        plot_feature_importance(DESCRIPTORS, features_padel, rf_padel_feature_importance, 30, COMBINED_LEARNING_COLLINEARITY)
+        plot_feature_importance(features_padel, rf_padel_feature_importance, 30)
        return features_10, features_20, features_30, features_40, features_all, tree_importance_sorted_idx
 
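
# Expected behaviour of find_top_features_subset on an invented ranking:
# numeric keys in TOPS yield the top-N ranked features plus the 'package'
# indicator column, while the non-numeric key 'all' keeps every feature.
import numpy as np

toy_features = np.array(['MW', 'logP', 'TPSA', 'nRing'])      # invented names
toy_importance = np.array([0.1, 0.4, 0.2, 0.3])
toy_order = np.argsort(toy_importance)[::-1]                  # logP, nRing, TPSA, MW
toy_subsets = find_top_features_subset(['2', 'all'], toy_features, toy_order)
print(toy_subsets['2'])    # ['logP', 'nRing', 'package']
print(toy_subsets['all'])  # ['logP', 'nRing', 'TPSA', 'MW']
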
-def run_model(DESCRIPTORS, COMBINED_LEARNING, ITERATIONS, X_sludge_padel, y_sludge_padel, model, m, model_name, features_10, features_20, features_30, features_40, features_all, soil_bayesian_merge_padel_reduced):
+def run_model(X_sludge_padel, y_sludge_padel, model, m, model_name, features_10, features_20, features_30, features_40, features_all, soil_bayesian_merge_padel_reduced):
     r2_list = []
     rmse_list = []
     mae_list = []
     model_used = []
     combined_learning_used = []
     features_used = []
     descriptor_used = []
     dict_features_list = {'top 10': features_10, 'top 20': features_20, 'top 30': features_30, 'top 40': features_40, 'all': features_all}
     for list_name, features_topX in dict_features_list.items():
         i = 1
         print("\nEvaluate model {} with {} features".format(model_name[m], list_name))
         # split the sludge dataset into a validation set and an input set
         # X_sludge_pre, X_validate, y_sludge_pre, y_validate = train_test_split(X[features_topX], y, test_size=0.10, random_state=0)
         # print("X_sludge shape before random splitting", X_sludge_pre.shape)
         while i <= ITERATIONS:
             # Randomly split the preprocessed sludge dataset into a training set and a test set
-            X_train_sludge, X_test_sludge, y_train_sludge, y_test_sludge = train_test_split(X_sludge_padel,
-                                                                                            y_sludge_padel,
-                                                                                            test_size=0.20)
+            X_train_sludge, X_test_sludge, y_train_sludge, y_test_sludge = train_test_split(X_sludge_padel, y_sludge_padel, test_size=0.20)
             if COMBINED_LEARNING:
                 print('\tRunning iteration {} of combined learning.\tKeep calm and carry on!'.format(i))
                 # Concatenate the split sludge data with the soil data
                 X_train_CL = pd.concat([X_train_sludge, soil_bayesian_merge_padel_reduced.iloc[:, 2:]], join='inner', ignore_index=True)  # We lose 30 cpds in this step
                 y_train_CL = pd.concat([y_train_sludge, soil_bayesian_merge_padel_reduced["hl_log_bayesian_mean"]], join='inner', ignore_index=True)
                 pipe = make_pipeline(StandardScaler(), model)
                 pipe.fit(X_train_CL[features_topX], y_train_CL)
                 y_test_pred = pipe.predict(X_test_sludge[features_topX])
                 r2_list.append(r2_score(y_test_sludge, y_test_pred))
                 rmse_list.append(mean_squared_error(y_test_sludge, y_test_pred))
                 mae_list.append(mean_absolute_error(y_test_sludge, y_test_pred))
                 model_used.append(model_name[m])
                 combined_learning_used.append(str(COMBINED_LEARNING))
                 features_used.append(list_name)
                 descriptor_used.append(str(DESCRIPTORS))
             else:
                 print('\tRunning iteration {} of Eawag-Soil.\tKeep calm and carry on!'.format(i))
                 # X_train_sludge, X_test_sludge, y_train_sludge, y_test_sludge = train_test_split(X_sludge_padel[features_topX], y_sludge_padel, test_size=0.20)
                 pipe = make_pipeline(StandardScaler(), model)
                 pipe.fit(X_train_sludge[features_topX], y_train_sludge)  # X_train, y_train
                 y_test_pred = pipe.predict(X_test_sludge[features_topX])
                 r2_list.append(r2_score(y_test_sludge, y_test_pred))
                 rmse_list.append(mean_squared_error(y_test_sludge, y_test_pred))
                 mae_list.append(mean_absolute_error(y_test_sludge, y_test_pred))
                 model_used.append(model_name[m])
                 combined_learning_used.append(str(COMBINED_LEARNING))
                 features_used.append(list_name)
                 descriptor_used.append(str(DESCRIPTORS))
             i += 1
         print('R2:', round(np.mean(r2_list), 2), '\tRMSE:', round(np.mean(rmse_list), 2), '\tMAE:', round(np.mean(mae_list), 2), "\n")
     return r2_list, rmse_list, mae_list, model_used, combined_learning_used, features_used, descriptor_used
 
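
# The core asymmetry of the combined-learning loop above, condensed into one
# hedged sketch: auxiliary soil rows are appended to the sludge *training*
# split only, and the score is always computed on held-out sludge data.
# X_sludge/y_sludge/X_soil/y_soil are placeholder frames assumed to share
# feature columns.
import pandas as pd
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

def evaluate_combined(X_sludge, y_sludge, X_soil, y_soil):
    X_tr, X_te, y_tr, y_te = train_test_split(X_sludge, y_sludge, test_size=0.20)
    X_tr = pd.concat([X_tr, X_soil], join='inner', ignore_index=True)  # soil goes into training only
    y_tr = pd.concat([y_tr, y_soil], ignore_index=True)
    pipe = make_pipeline(StandardScaler(), ensemble.RandomForestRegressor())
    pipe.fit(X_tr, y_tr)
    return pipe.score(X_te, y_te)  # R^2 on sludge-only test data
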
print("Prepare to collect the result..................") model_dict_ = {'R2': [], 'RMSE': [], 'MAE': [], 'Model': [], 'Combined_learning': [], 'Selected_features': [], 'Descriptor': []} for m in range(len(MODELS)): # for modle in MODELS: - r2_list, rmse_list, mae_list, model_used, combined_learning_used, features_used, descriptor_used = run_model(DESCRIPTORS, COMBINED_LEARNING, ITERATIONS, X, y, MODELS[m], m, model_name, features_10, features_20, features_30, features_40, features_all, soil_bayesian_merge_padel_reduced) # X_sludge_padel, y_sludge_padel for sludge only + r2_list, rmse_list, mae_list, model_used, combined_learning_used, features_used, descriptor_used = run_model(X, y, MODELS[m], m, model_name, features_10, features_20, features_30, features_40, features_all, soil_bayesian_merge_padel_reduced) # X_sludge_padel, y_sludge_padel for sludge only model_dict_['R2'].extend(r2_list) # X_combined_padel, y_combined_padel model_dict_['RMSE'].extend(rmse_list) model_dict_['MAE'].extend(mae_list) model_dict_['Selected_features'].extend(features_used) model_dict_['Model'].extend(model_used) model_dict_['Descriptor'].extend(descriptor_used) model_dict_['Combined_learning'].extend(combined_learning_used) return model_dict_ -def plot_rmse(COMBINED_LEARNING, DESCRIPTORS, df): +def plot_rmse(df): if COMBINED_LEARNING: TAGS_rmse = "" dt = df else: TAGS_rmse = "Without" dt = df fig = plt.figure(1, figsize=(12, 6)) sns.set_style("whitegrid") sns.set_theme(style="ticks", palette="pastel") sns.boxplot(x="Model", y="RMSE", width=0.6,hue="Selected_features", palette=["#47ad62", "#4d5154", "#2c79de", "#8a1506", "#ed6511"], data=dt, showfliers=False) adjust_box_widths(fig, 0.6) sns.despine(offset=10, trim=False) plt.xlabel('') - plt.ylabel('RMSE') #'$R^2$' + plt.ylabel('RMSE') plt.title('{} Combined Learning & {} descriptor'.format(TAGS_rmse, DESCRIPTORS), fontsize=20) plt.savefig(output_file_path + 'figures\\modelling\\rmse_CL_{}_{}.png'.format(COMBINED_LEARNING, DESCRIPTORS)) plt.close() -def plot_r2(COMBINED_LEARNING, DESCRIPTORS, df): +def plot_r2(df): if COMBINED_LEARNING: TAGS = "" dt = df else: TAGS = "Without" dt = df fig = plt.figure(1, figsize=(12, 6)) sns.set_style("whitegrid") sns.set_theme(style="ticks", palette="pastel") - sns.boxplot(x="Model", y="R2", width=0.6,hue="Selected_features", palette=["#47ad62", "#4d5154", "#2c79de", "#8a1506", "#ed6511"], data=dt, showfliers=False) + sns.boxplot(x="Model", y="R2", width=0.6, hue="Selected_features", palette=["#47ad62", "#4d5154", "#2c79de", "#8a1506", "#ed6511"], data=dt, showfliers=False) adjust_box_widths(fig, 0.7) sns.despine(offset=10, trim=False) # plt.axhline(0.1, linewidth=2, color='#bf6613') plt.xlabel('') - plt.ylabel('$R^2$') #'$R^2$' + plt.ylabel('$R^2$') plt.title('{} Combined Learning & {} descriptor'.format(TAGS, DESCRIPTORS), fontsize=20) plt.savefig(output_file_path + 'figures\\modelling\\r2_CL_{}_{}.png'.format(COMBINED_LEARNING, DESCRIPTORS)) plt.close() # prevent overwrite def main(): # Load the soil dataset # input_file_path = "C:\\Users\\leetseng\\TWtest\\input\\" # output_file_path = "C:\\Users\\leetseng\\TWtest\\output\\" - TAGS = '' - ITERATIONS = 20 - DESCRIPTORS = 'PaDEL' - DENDROGRAM = False - CLUSTER_THRESHOLD = 0.02 - COMBINED_LEARNING = True - COMBINED_LEARNING_COLLINEARITY = True - TOPS = ['10', '20', '30', '40', 'all'] - MODELS = [ - RandomForestRegressor(), + ensemble.RandomForestRegressor(), xgb.XGBRegressor(), SVR(), neighbors.KNeighborsRegressor(), - AdaBoostRegressor() + ensemble.AdaBoostRegressor() # 
 def main():
     # Load the soil dataset
     # input_file_path = "C:\\Users\\leetseng\\TWtest\\input\\"
     # output_file_path = "C:\\Users\\leetseng\\TWtest\\output\\"
-    TAGS = ''
-    ITERATIONS = 20
-    DESCRIPTORS = 'PaDEL'
-    DENDROGRAM = False
-    CLUSTER_THRESHOLD = 0.02
-    COMBINED_LEARNING = True
-    COMBINED_LEARNING_COLLINEARITY = True
-    TOPS = ['10', '20', '30', '40', 'all']
-
     MODELS = [
-        RandomForestRegressor(),
+        ensemble.RandomForestRegressor(),
         xgb.XGBRegressor(),
         SVR(),
         neighbors.KNeighborsRegressor(),
-        AdaBoostRegressor()
+        ensemble.AdaBoostRegressor()
         # SGDRegressor(),
         # reg,
         # PLSRegression(n_components=3)
         # linear_model.ElasticNet(),
         # gaussian_process.GaussianProcessRegressor()
     ]
     model_name = [
         "Random Forest",
         "XGBoost",
         "Support Vector Regressor",
         "k-Nearest Neighbors",
         "AdaBoost Regressor"
         # "SGD Regressor",
         # "StackReg",
         # "Partial Least-Squares Regressor"
         # "ElasticNet",
         # "Gaussian Process"
     ]
 
     aggs = [non_nan_mean]
     df_subset_sludge = df_sludge_[["reduced_smiles", "hl_log_bayesian_mean", "package"]]  # "temperature", "log_hl_combined", "log_hl_biomass_corrected"
     df_subset_sludge_1 = df_subset_sludge.groupby(["reduced_smiles"]).agg(aggs).reset_index()
     df_subset_sludge_1.columns = df_subset_sludge_1.columns.droplevel(1)
     sludge_bayesian_merge_padel = pd.merge(df_subset_sludge_1, sludge_padel_descriptor, how='left', on='reduced_smiles').dropna()
 
     soil_bayesian_merge_padel_reduced = pd.read_csv(output_file_path + "soil_bayesian_merge_padel_reduced.tsv", sep="\t")
     soil_bayesian_merge_padel_reduced.dropna(axis=0, inplace=True)
     soil_bayesian_merge_padel_reduced.drop(columns=['Unnamed: 0', 'topoShape'], inplace=True)  # contains a subset of all target variables (without pH) and the reduced descriptors
 
     # Keep the hold-out set
     X_sludge_padel, X_validate, y_sludge_padel, y_validate = train_test_split(sludge_bayesian_merge_padel.iloc[:, 2:], sludge_bayesian_merge_padel['hl_log_bayesian_mean'], test_size=0.10, random_state=62)
 
     if COMBINED_LEARNING:
-        selected_X_combined_padel_array, selected_features_combined, X_combined_padel, y_combined_padel = generate_df_for_collinearity_analysis(CLUSTER_THRESHOLD, DENDROGRAM, COMBINED_LEARNING_COLLINEARITY, X_sludge_padel, y_sludge_padel, soil_bayesian_merge_padel_reduced)
-        features_10, features_20, features_30, features_40, features_all, tree_importance_sorted_idx_CL = return_topX_features(DESCRIPTORS, TOPS, COMBINED_LEARNING_COLLINEARITY, selected_X_combined_padel_array, y_combined_padel, X_combined_padel, ITERATIONS)
-        df_top_feature_padel = pd.DataFrame.from_dict(store_result(DESCRIPTORS, COMBINED_LEARNING, ITERATIONS, MODELS, model_name, X_sludge_padel, y_sludge_padel, features_10, features_20, features_30, features_40, features_all, soil_bayesian_merge_padel_reduced))
+        selected_X_combined_padel_array, selected_features_combined, X_combined_padel, y_combined_padel = generate_df_for_collinearity_analysis(X_sludge_padel, y_sludge_padel, soil_bayesian_merge_padel_reduced)
+        features_10, features_20, features_30, features_40, features_all, tree_importance_sorted_idx_CL = return_topX_features(TOPS, selected_X_combined_padel_array, y_combined_padel, X_combined_padel)
+        df_top_feature_padel = pd.DataFrame.from_dict(store_result(MODELS, model_name, X_sludge_padel, y_sludge_padel, features_10, features_20, features_30, features_40, features_all, soil_bayesian_merge_padel_reduced))
         # COMBINED_LEARNING, ITERATIONS, MODELS, X, y, features_10, features_20, features_30, features_40, features_all
-        plot_rmse(COMBINED_LEARNING, DESCRIPTORS, df_top_feature_padel)
-        plot_r2(COMBINED_LEARNING, DESCRIPTORS, df_top_feature_padel)
+        plot_rmse(df_top_feature_padel)
+        plot_r2(df_top_feature_padel)
     else:
-        selected_X_sludge_padel_array, selected_features_sludge, X_combined_padel, y_combined_padel = generate_df_for_collinearity_analysis(CLUSTER_THRESHOLD, DENDROGRAM, COMBINED_LEARNING_COLLINEARITY, X_sludge_padel, y_sludge_padel, soil_bayesian_merge_padel_reduced)
-        features_10, features_20, features_30, features_40, features_all, tree_importance_sorted_idx = return_topX_features(DESCRIPTORS, TOPS, COMBINED_LEARNING_COLLINEARITY, selected_X_sludge_padel_array, y_sludge_padel, sludge_bayesian_merge_padel, ITERATIONS)
-        df_top_feature_padel_noCL = pd.DataFrame.from_dict(store_result(DESCRIPTORS, COMBINED_LEARNING, ITERATIONS, MODELS, model_name, X_sludge_padel, y_sludge_padel, features_10, features_20, features_30, features_40, features_all, soil_bayesian_merge_padel_reduced))
-        plot_rmse(COMBINED_LEARNING, DESCRIPTORS, df_top_feature_padel_noCL)
-        plot_r2(COMBINED_LEARNING, DESCRIPTORS, df_top_feature_padel_noCL)
-        # else:
-        #     selected_X_sludge_padel_array, selected_features_sludge, X_combined_padel, y_combined_padel = generate_df_for_collinearity_analysis(CLUSTER_THRESHOLD, DENDROGRAM, COMBINED_LEARNING_COLLINEARITY, X_sludge_padel, y_sludge_padel, soil_bayesian_merge_padel_reduced)
-
-        # if COMBINED_LEARNING:
-
-        #     features_10, features_20, features_30, features_40, features_all = return_topX_features(COMBINED_LEARNING_COLLINEARITY, selected_X_sludge_padel_array, y_sludge_padel, sludge_bayesian_merge_padel)
-
-
+        selected_X_sludge_padel_array, selected_features_sludge, X_combined_padel, y_combined_padel = generate_df_for_collinearity_analysis(X_sludge_padel, y_sludge_padel, soil_bayesian_merge_padel_reduced)
+        features_10, features_20, features_30, features_40, features_all, tree_importance_sorted_idx = return_topX_features(TOPS, selected_X_sludge_padel_array, y_sludge_padel, sludge_bayesian_merge_padel)
+        df_top_feature_padel_noCL = pd.DataFrame.from_dict(store_result(MODELS, model_name, X_sludge_padel, y_sludge_padel, features_10, features_20, features_30, features_40, features_all, soil_bayesian_merge_padel_reduced))
+        plot_rmse(df_top_feature_padel_noCL)
+        plot_r2(df_top_feature_padel_noCL)
 
     dict_features_list = {'top 10': features_10, 'top 20': features_20, 'top 30': features_30, 'top 40': features_40, 'all': features_all}
 
-    # if COMBINED_LEARNING:
-    #     df_top_feature_padel = pd.DataFrame.from_dict(store_result(MODELS, X_sludge_padel, y_sludge_padel))
-    # else:
-    #     df_top_feature_padel_noCL = pd.DataFrame.from_dict(store_result(MODELS, X_sludge_padel, y_sludge_padel))
-
-
-
 
 if __name__ == '__main__':
     main()