diff --git a/combined_learning.py b/combined_learning.py
index 36f289f..0b16fb4 100644
--- a/combined_learning.py
+++ b/combined_learning.py
@@ -1,447 +1,419 @@
 import pandas as pd
 import numpy as np
 import seaborn as sns
 import matplotlib.pyplot as plt
 from matplotlib.patches import PathPatch
 from collections import defaultdict
-from sklearn import ensemble
-from sklearn.svm import SVR
-from sklearn.model_selection import train_test_split, cross_val_score
-from sklearn.feature_selection import VarianceThreshold
+import xgboost as xgb
 from sklearn import ensemble, neighbors, linear_model, gaussian_process
-from sklearn.linear_model import ElasticNet
-from sklearn.neighbors import KNeighborsRegressor
+from sklearn.svm import SVR
 from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
-from sklearn.preprocessing import StandardScaler
 from sklearn.pipeline import Pipeline, make_pipeline
+from sklearn.preprocessing import StandardScaler
+from sklearn.model_selection import train_test_split, cross_val_score
 from sklearn.cross_decomposition import PLSRegression
-import xgboost as xgb
-from sklearn.preprocessing import FunctionTransformer
-from sklearn.ensemble import StackingRegressor
-from sklearn.cross_decomposition import PLSRegression
-from sklearn.linear_model import ridge_regression, SGDRegressor
-from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
+from sklearn.linear_model import ridge_regression, SGDRegressor, ElasticNet
 from scipy.cluster import hierarchy
 from scipy.spatial.distance import squareform
 
 pd.options.display.max_columns = None
 
 # Load the soil dataset
 input_file_path = "C:\\Users\\leetseng\\TWtest\\input\\"
 output_file_path = "C:\\Users\\leetseng\\TWtest\\output\\"
 # soil_model_input_all_full.tsv is the new df from Jasmin, where she used RDKit to generate the reduced SMILES.
 # This file already contains the Bayesian mean and Bayesian std.
 file_name_soil = "soil_model_input_all_full.tsv"  # soil_model_input_bayes_curated_full.txt
 file_name_sludge = "sludge_bayesian_PriorMuStd_2.tsv"  # sludge_bayesian_PriorMuStd_09.tsv
 df_soil_original = pd.read_csv(input_file_path + file_name_soil, sep='\t')
 df_sludge_original = pd.read_csv(input_file_path + file_name_sludge, sep='\t')
 
 # Work on copies of the dataframes for the later tests and manipulation
 df_soil_ = df_soil_original.copy()
 df_sludge_ = df_sludge_original.copy()
 df_sludge_.loc[:, 'package'] = 0  # 0: sludge, 1: soil
 df_soil_.loc[:, 'package'] = 1
 
 list_of_reduced_smiles_soil = df_soil_['reduced_smiles'].values.tolist()  # 'canonicalize_smiles'
 set_of_reduced_smiles_soil = set(list_of_reduced_smiles_soil)
 print("Number of compounds in soil: ", len(set_of_reduced_smiles_soil))
 list_of_reduced_smiles_sludge = df_sludge_['reduced_smiles'].values.tolist()  # 'canonicalize_smiles'
 set_of_reduced_smiles_sludge = set(list_of_reduced_smiles_sludge)
 print("Number of compounds in sludge: ", len(set_of_reduced_smiles_sludge))
 
 # Load all the descriptors here
 df_padel_sludge_remove_columns = pd.read_csv(output_file_path + 'descriptors\\sludge_trimpadel_descriptor.tsv', sep='\t')
 sludge_padel_descriptor = df_padel_sludge_remove_columns.copy()
 sludge_padel_descriptor.drop(columns=['Unnamed: 0'], inplace=True)
 
+TAGS = ''
+ITERATIONS = 20
+DESCRIPTORS = 'PaDEL'
+DENDROGRAM = False
+CLUSTER_THRESHOLD = 0.01
+COMBINED_LEARNING = True
+TOPS = ['10', '20', '30', '40', 'all']
+
""" # iterating through Axes instances for ax in g.axes: # iterating through axes artists: for c in ax.get_children(): # searching for PathPatches if isinstance(c, PathPatch): # getting current width of box: p = c.get_path() verts = p.vertices verts_sub = verts[:-1] xmin = np.min(verts_sub[:, 0]) xmax = np.max(verts_sub[:, 0]) xmid = 0.5*(xmin+xmax) xhalf = 0.5*(xmax - xmin) # setting new width of box xmin_new = xmid-fac*xhalf xmax_new = xmid+fac*xhalf verts_sub[verts_sub[:, 0] == xmin, 0] = xmin_new verts_sub[verts_sub[:, 0] == xmax, 0] = xmax_new # setting new width of median line for l in ax.lines: if np.all(l.get_xdata() == [xmin, xmax]): l.set_xdata([xmin_new, xmax_new]) def non_nan_mean(x): if x.empty: return None else: x = x.dropna() return np.mean(x) -def check_collinearity(CLUSTER_THRESHOLD, DENDROGRAM, input_df): +def check_collinearity(input_df): # CLUSTER_THRESHOLD, print('\nCheck collinearity..................\t') feature_names = input_df.columns corr = input_df.corr('spearman').to_numpy() corr = (corr + corr.T) / 2 # Ensure the correlation matrix is symmetric np.fill_diagonal(corr, 1) corr = np.nan_to_num(corr) distance_matrix = 1 - np.abs(corr) # Convert the correlation matrix to a distance matrix dist_linkage = hierarchy.ward( squareform(distance_matrix)) # hierarchical clustering using Ward's linkage, return a ndarray if DENDROGRAM: size = len(feature_names) / 6.5 fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(size * 2, size)) dendro = hierarchy.dendrogram(dist_linkage, labels=feature_names, ax=ax1, leaf_rotation=90) dendro_idx = np.arange(0, len(dendro["ivl"])) ax2.imshow(corr[dendro["leaves"], :][:, dendro["leaves"]]) ax2.set_xticks(feature_names[dendro_idx]) ax2.set_yticks(dendro_idx) ax2.set_xticklabels(dendro["ivl"], rotation="vertical") ax2.set_yticklabels(dendro["ivl"]) fig.tight_layout() plt.show() cluster_ids = hierarchy.fcluster(dist_linkage, CLUSTER_THRESHOLD, criterion="distance") # return a list - # print("Cludter_ids:", cluster_ids) cluster_id_to_feature_ids = defaultdict(list) # create a default dict for idx, cluster_id in enumerate(cluster_ids): # generate the tuple [(0, ""), (1, ""), (2, "")] cluster_id_to_feature_ids[cluster_id].append(idx) # use the 'cluster_id' as the key selected_feature_ids = [v[0] for v in cluster_id_to_feature_ids.values()] # Each v is a list that might contins single or multiple cluster_id, here only thake the first element v[0] as the representative cludter_id # Transform the inupt_df into np.array and tranpose so that the list in each row is the data of each features. # We use the [i] to select the features that list in selected_features_ids. # After the List Comprehension, you need to convert the list back to the np.array so that you can transpose to original direction of the column and row. 
-def generate_df_for_collinearity_analysis(CLUSTER_THRESHOLD, DENDROGRAM, COMBINED_LEARNING_COLLINEARITY, X_sludge_padel, y_sludge_padel, soil_bayesian_merge_padel_reduced):
+def generate_df_for_collinearity_analysis(X_sludge_padel, y_sludge_padel, soil_bayesian_merge_padel_reduced):
-    if COMBINED_LEARNING_COLLINEARITY:
+    if COMBINED_LEARNING:
         X_combined_padel = pd.concat([X_sludge_padel, soil_bayesian_merge_padel_reduced.iloc[:, 2:]], join='inner')
         y_combined_padel = pd.concat([y_sludge_padel, soil_bayesian_merge_padel_reduced['hl_log_bayesian_mean']], join='inner')
         # For combined learning, first check collinearity on soil and sludge together, since the concatenated dataset might contain undiscovered relationships.
-        selected_X_combined_padel_array, selected_features_combined = check_collinearity(CLUSTER_THRESHOLD, DENDROGRAM, X_combined_padel)  # Returns the ndarray and the list of selected features  # cluster threshold: 0.02, descriptors: 1085
+        selected_X_combined_padel_array, selected_features_combined = check_collinearity(X_combined_padel)  # Returns the ndarray and the list of selected features  # cluster threshold: 0.02, descriptors: 1085
         return selected_X_combined_padel_array, selected_features_combined, X_combined_padel, y_combined_padel
     else:
         X_combined_padel = "Not exist"
         y_combined_padel = "Not exist"
-        selected_X_sludge_padel_array, selected_features_sludge = check_collinearity(CLUSTER_THRESHOLD, DENDROGRAM, X_sludge_padel)  # cluster threshold: 0.02, descriptors: 1068
+        selected_X_sludge_padel_array, selected_features_sludge = check_collinearity(X_sludge_padel)  # cluster threshold: 0.02, descriptors: 1068
         return selected_X_sludge_padel_array, selected_features_sludge, X_combined_padel, y_combined_padel
 
-def get_feature_importance_RF(X, y, iterations):
+def get_feature_importance_RF(X, y):
     model = ensemble.RandomForestRegressor()
     r2_list = []
     rmse_list = []
     mae_list = []
     importance_list = []
     i = 1
     print('\nFind feature importance using RF..................')
-    while i <= iterations:
+    while i <= ITERATIONS:
         print('Running iteration {}'.format(i))
         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
         pipe = make_pipeline(StandardScaler(), model)
         pipe.fit(X_train, y_train)
         y_test_pred = pipe.predict(X_test)
         r2_list.append(r2_score(y_test, y_test_pred))
         rmse_list.append(mean_squared_error(y_test, y_test_pred))  # NOTE: mean_squared_error returns the MSE, not the RMSE
         mae_list.append(mean_absolute_error(y_test, y_test_pred))
         importance_list.append(model.feature_importances_)  # the pipeline fits `model` in place, so its importances are available here
         i += 1
     R2_mean = np.mean(r2_list)
     RMSE_mean = np.mean(rmse_list)
     MAE_mean = np.mean(mae_list)
     mean_feature_importance = np.mean(np.array(importance_list).T, axis=1)
-    print("R2: {}, RMSE: {}, MAE: {}".format(round(R2_mean,2), round(RMSE_mean,2), round(MAE_mean,2)))
+    print("R2: {}, RMSE: {}, MAE: {}".format(round(R2_mean, 2), round(RMSE_mean, 2), round(MAE_mean, 2)))
     return mean_feature_importance
 
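
# Why `model.feature_importances_` is readable after fitting the pipeline
# above: make_pipeline stores a reference to the very same estimator object,
# so fitting the pipeline fits `model` in place. A small check on synthetic
# data (all names below are invented for the illustration):
import numpy as np
from sklearn import ensemble
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(1)
X_toy = rng.normal(size=(100, 3))
y_toy = 3 * X_toy[:, 0] + rng.normal(size=100)      # only feature 0 is informative
rf = ensemble.RandomForestRegressor(random_state=0)
make_pipeline(StandardScaler(), rf).fit(X_toy, y_toy)
print(rf.feature_importances_)                      # feature 0 should dominate
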
-def plot_feature_importance(DESCRIPTORS, ls_feature, mean_feature_importance, topx, COMBINED_LEARNING_COLLINEARITY):
+def plot_feature_importance(ls_feature, mean_feature_importance, topx):
     tree_importance_sorted_idx = np.argsort(mean_feature_importance)[::-1]  # importances from the last model
     tree_indices_rule = np.arange(0, len(mean_feature_importance))[::-1]
     fig, ax1 = plt.subplots(1, 1, figsize=(6, 6))
     ax1.barh(tree_indices_rule[:topx], mean_feature_importance[tree_importance_sorted_idx][:topx], height=0.4)
     ax1.set_yticks(tree_indices_rule[:topx])
     ax1.set_yticklabels(ls_feature[tree_importance_sorted_idx][:topx])  # The features from the sludge dataset.
     fig.tight_layout()
     # plt.show()
-    plt.savefig(output_file_path + 'figures\\modelling\\{}_CL{}_RF_top_{}_feature_importance.png'.format(DESCRIPTORS, COMBINED_LEARNING_COLLINEARITY, topx))
+    plt.savefig(output_file_path + 'figures\\modelling\\{}_CL{}_RF_top_{}_feature_importance.png'.format(DESCRIPTORS, COMBINED_LEARNING, topx))
     plt.close()
 
 def find_top_features_subset(TOPS, features_, tree_importance_sorted_idx_):
     features = {}
     for top in TOPS:
         if not top.isdigit():
             features[top] = [f for f in features_[tree_importance_sorted_idx_][:]]
             continue
         features[top] = [f for f in features_[tree_importance_sorted_idx_][:int(top)]]
         features[top].append('package')
     return features
 
-def return_topX_features(DESCRIPTORS, TOPS, COMBINED_LEARNING_COLLINEARITY, input_array, y_input, input_descriptor, iterations):  # Input True or False
+def return_topX_features(TOPS, input_array, y_input, input_descriptor):
     print('\nReturn topX features subset..................')
-    if COMBINED_LEARNING_COLLINEARITY:
-        rf_padel_feature_importance_CL = get_feature_importance_RF(input_array, y_input, iterations)  # selected_X_combined_padel_array, y_combined_padel
+    if COMBINED_LEARNING:
+        rf_padel_feature_importance_CL = get_feature_importance_RF(input_array, y_input)  # selected_X_combined_padel_array, y_combined_padel
         tree_importance_sorted_idx_CL = np.argsort(rf_padel_feature_importance_CL)[::-1]
         features_padel_CL = np.array(input_descriptor.columns)  # X_combined_padel
         features_CL = find_top_features_subset(TOPS, features_padel_CL, tree_importance_sorted_idx_CL)
         features_10 = features_CL['10']
         features_20 = features_CL['20']
         features_30 = features_CL['30']
         features_40 = features_CL['40']
         features_all = features_CL['all']
-        plot_feature_importance(DESCRIPTORS, features_padel_CL, rf_padel_feature_importance_CL, 30, COMBINED_LEARNING_COLLINEARITY)
+        plot_feature_importance(features_padel_CL, rf_padel_feature_importance_CL, 30)
         return features_10, features_20, features_30, features_40, features_all, tree_importance_sorted_idx_CL
     else:
-        rf_padel_feature_importance = get_feature_importance_RF(input_array, y_input, iterations)  # selected_X_sludge_padel_array, y_sludge_padel
+        rf_padel_feature_importance = get_feature_importance_RF(input_array, y_input)  # selected_X_sludge_padel_array, y_sludge_padel
         tree_importance_sorted_idx = np.argsort(rf_padel_feature_importance)[::-1]
         features_padel = np.array(input_descriptor.columns[2:])  # sludge_bayesian_merge_padel
         features = find_top_features_subset(TOPS, features_padel, tree_importance_sorted_idx)
         features_10 = features['10']
         features_20 = features['20']
         features_30 = features['30']
         features_40 = features['40']
         features_all = features['all']
-        plot_feature_importance(DESCRIPTORS, features_padel, rf_padel_feature_importance, 30, COMBINED_LEARNING_COLLINEARITY)
+        plot_feature_importance(features_padel, rf_padel_feature_importance, 30)
        return features_10, features_20, features_30, features_40, features_all, tree_importance_sorted_idx
 
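
# Expected behaviour of find_top_features_subset on an invented ranking:
# numeric keys in TOPS yield the top-N ranked features plus the 'package'
# indicator column, while the non-numeric key 'all' keeps every feature.
import numpy as np

toy_features = np.array(['MW', 'logP', 'TPSA', 'nRing'])      # invented names
toy_importance = np.array([0.1, 0.4, 0.2, 0.3])
toy_order = np.argsort(toy_importance)[::-1]                  # logP, nRing, TPSA, MW
toy_subsets = find_top_features_subset(['2', 'all'], toy_features, toy_order)
print(toy_subsets['2'])    # ['logP', 'nRing', 'package']
print(toy_subsets['all'])  # ['logP', 'nRing', 'TPSA', 'MW']
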
-def run_model(DESCRIPTORS, COMBINED_LEARNING, ITERATIONS, X_sludge_padel, y_sludge_padel, model, m, model_name, features_10, features_20, features_30, features_40, features_all, soil_bayesian_merge_padel_reduced):
+def run_model(X_sludge_padel, y_sludge_padel, model, m, model_name, features_10, features_20, features_30, features_40, features_all, soil_bayesian_merge_padel_reduced):
     r2_list = []
     rmse_list = []
     mae_list = []
     model_used = []
     combined_learning_used = []
     features_used = []
     descriptor_used = []
     dict_features_list = {'top 10': features_10, 'top 20': features_20, 'top 30': features_30, 'top 40': features_40, 'all': features_all}
     for list_name, features_topX in dict_features_list.items():
         i = 1
         print("\nEvaluate model {} with {} features".format(model_name[m], list_name))
         # split the sludge dataset into a validation set and an input set
         # X_sludge_pre, X_validate, y_sludge_pre, y_validate = train_test_split(X[features_topX], y, test_size=0.10, random_state=0)
         # print("X_sludge shape before random splitting", X_sludge_pre.shape)
         while i <= ITERATIONS:
             # Randomly split the preprocessed sludge dataset into a training set and a test set
-            X_train_sludge, X_test_sludge, y_train_sludge, y_test_sludge = train_test_split(X_sludge_padel,
-                                                                                            y_sludge_padel,
-                                                                                            test_size=0.20)
+            X_train_sludge, X_test_sludge, y_train_sludge, y_test_sludge = train_test_split(X_sludge_padel, y_sludge_padel, test_size=0.20)
             if COMBINED_LEARNING:
                 print('\tRunning iteration {} of combined learning.\tKeep calm and carry on!'.format(i))
                 # Concatenate the split sludge data with the soil data
                 X_train_CL = pd.concat([X_train_sludge, soil_bayesian_merge_padel_reduced.iloc[:, 2:]], join='inner', ignore_index=True)  # We lose 30 cpds in this step
                 y_train_CL = pd.concat([y_train_sludge, soil_bayesian_merge_padel_reduced["hl_log_bayesian_mean"]], join='inner', ignore_index=True)
                 pipe = make_pipeline(StandardScaler(), model)
                 pipe.fit(X_train_CL[features_topX], y_train_CL)
                 y_test_pred = pipe.predict(X_test_sludge[features_topX])
                 r2_list.append(r2_score(y_test_sludge, y_test_pred))
                 rmse_list.append(mean_squared_error(y_test_sludge, y_test_pred))
                 mae_list.append(mean_absolute_error(y_test_sludge, y_test_pred))
                 model_used.append(model_name[m])
                 combined_learning_used.append(str(COMBINED_LEARNING))
                 features_used.append(list_name)
                 descriptor_used.append(str(DESCRIPTORS))
             else:
                 print('\tRunning iteration {} of Eawag-Soil.\tKeep calm and carry on!'.format(i))
                 # X_train_sludge, X_test_sludge, y_train_sludge, y_test_sludge = train_test_split(X_sludge_padel[features_topX], y_sludge_padel, test_size=0.20)
                 pipe = make_pipeline(StandardScaler(), model)
                 pipe.fit(X_train_sludge[features_topX], y_train_sludge)  # X_train, y_train
                 y_test_pred = pipe.predict(X_test_sludge[features_topX])
                 r2_list.append(r2_score(y_test_sludge, y_test_pred))
                 rmse_list.append(mean_squared_error(y_test_sludge, y_test_pred))
                 mae_list.append(mean_absolute_error(y_test_sludge, y_test_pred))
                 model_used.append(model_name[m])
                 combined_learning_used.append(str(COMBINED_LEARNING))
                 features_used.append(list_name)
                 descriptor_used.append(str(DESCRIPTORS))
             i += 1
         print('R2:', round(np.mean(r2_list), 2), '\tRMSE:', round(np.mean(rmse_list), 2), '\tMAE:', round(np.mean(mae_list), 2), "\n")
     return r2_list, rmse_list, mae_list, model_used, combined_learning_used, features_used, descriptor_used
 
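
# The core asymmetry of the combined-learning loop above, condensed into one
# hedged sketch: auxiliary soil rows are appended to the sludge *training*
# split only, and the score is always computed on held-out sludge data.
# X_sludge/y_sludge/X_soil/y_soil are placeholder frames assumed to share
# feature columns.
import pandas as pd
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

def evaluate_combined(X_sludge, y_sludge, X_soil, y_soil):
    X_tr, X_te, y_tr, y_te = train_test_split(X_sludge, y_sludge, test_size=0.20)
    X_tr = pd.concat([X_tr, X_soil], join='inner', ignore_index=True)  # soil goes into training only
    y_tr = pd.concat([y_tr, y_soil], ignore_index=True)
    pipe = make_pipeline(StandardScaler(), ensemble.RandomForestRegressor())
    pipe.fit(X_tr, y_tr)
    return pipe.score(X_te, y_te)  # R^2 on sludge-only test data
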
print("Prepare to collect the result..................") model_dict_ = {'R2': [], 'RMSE': [], 'MAE': [], 'Model': [], 'Combined_learning': [], 'Selected_features': [], 'Descriptor': []} for m in range(len(MODELS)): # for modle in MODELS: - r2_list, rmse_list, mae_list, model_used, combined_learning_used, features_used, descriptor_used = run_model(DESCRIPTORS, COMBINED_LEARNING, ITERATIONS, X, y, MODELS[m], m, model_name, features_10, features_20, features_30, features_40, features_all, soil_bayesian_merge_padel_reduced) # X_sludge_padel, y_sludge_padel for sludge only + r2_list, rmse_list, mae_list, model_used, combined_learning_used, features_used, descriptor_used = run_model(X, y, MODELS[m], m, model_name, features_10, features_20, features_30, features_40, features_all, soil_bayesian_merge_padel_reduced) # X_sludge_padel, y_sludge_padel for sludge only model_dict_['R2'].extend(r2_list) # X_combined_padel, y_combined_padel model_dict_['RMSE'].extend(rmse_list) model_dict_['MAE'].extend(mae_list) model_dict_['Selected_features'].extend(features_used) model_dict_['Model'].extend(model_used) model_dict_['Descriptor'].extend(descriptor_used) model_dict_['Combined_learning'].extend(combined_learning_used) return model_dict_ -def plot_rmse(COMBINED_LEARNING, DESCRIPTORS, df): +def plot_rmse(df): if COMBINED_LEARNING: TAGS_rmse = "" dt = df else: TAGS_rmse = "Without" dt = df fig = plt.figure(1, figsize=(12, 6)) sns.set_style("whitegrid") sns.set_theme(style="ticks", palette="pastel") sns.boxplot(x="Model", y="RMSE", width=0.6,hue="Selected_features", palette=["#47ad62", "#4d5154", "#2c79de", "#8a1506", "#ed6511"], data=dt, showfliers=False) adjust_box_widths(fig, 0.6) sns.despine(offset=10, trim=False) plt.xlabel('') - plt.ylabel('RMSE') #'$R^2$' + plt.ylabel('RMSE') plt.title('{} Combined Learning & {} descriptor'.format(TAGS_rmse, DESCRIPTORS), fontsize=20) plt.savefig(output_file_path + 'figures\\modelling\\rmse_CL_{}_{}.png'.format(COMBINED_LEARNING, DESCRIPTORS)) plt.close() -def plot_r2(COMBINED_LEARNING, DESCRIPTORS, df): +def plot_r2(df): if COMBINED_LEARNING: TAGS = "" dt = df else: TAGS = "Without" dt = df fig = plt.figure(1, figsize=(12, 6)) sns.set_style("whitegrid") sns.set_theme(style="ticks", palette="pastel") - sns.boxplot(x="Model", y="R2", width=0.6,hue="Selected_features", palette=["#47ad62", "#4d5154", "#2c79de", "#8a1506", "#ed6511"], data=dt, showfliers=False) + sns.boxplot(x="Model", y="R2", width=0.6, hue="Selected_features", palette=["#47ad62", "#4d5154", "#2c79de", "#8a1506", "#ed6511"], data=dt, showfliers=False) adjust_box_widths(fig, 0.7) sns.despine(offset=10, trim=False) # plt.axhline(0.1, linewidth=2, color='#bf6613') plt.xlabel('') - plt.ylabel('$R^2$') #'$R^2$' + plt.ylabel('$R^2$') plt.title('{} Combined Learning & {} descriptor'.format(TAGS, DESCRIPTORS), fontsize=20) plt.savefig(output_file_path + 'figures\\modelling\\r2_CL_{}_{}.png'.format(COMBINED_LEARNING, DESCRIPTORS)) plt.close() # prevent overwrite def main(): # Load the soil dataset # input_file_path = "C:\\Users\\leetseng\\TWtest\\input\\" # output_file_path = "C:\\Users\\leetseng\\TWtest\\output\\" - TAGS = '' - ITERATIONS = 20 - DESCRIPTORS = 'PaDEL' - DENDROGRAM = False - CLUSTER_THRESHOLD = 0.02 - COMBINED_LEARNING = True - COMBINED_LEARNING_COLLINEARITY = True - TOPS = ['10', '20', '30', '40', 'all'] - MODELS = [ - RandomForestRegressor(), + ensemble.RandomForestRegressor(), xgb.XGBRegressor(), SVR(), neighbors.KNeighborsRegressor(), - AdaBoostRegressor() + ensemble.AdaBoostRegressor() # 
 def main():
     # Load the soil dataset
     # input_file_path = "C:\\Users\\leetseng\\TWtest\\input\\"
     # output_file_path = "C:\\Users\\leetseng\\TWtest\\output\\"
-    TAGS = ''
-    ITERATIONS = 20
-    DESCRIPTORS = 'PaDEL'
-    DENDROGRAM = False
-    CLUSTER_THRESHOLD = 0.02
-    COMBINED_LEARNING = True
-    COMBINED_LEARNING_COLLINEARITY = True
-    TOPS = ['10', '20', '30', '40', 'all']
-
     MODELS = [
-        RandomForestRegressor(),
+        ensemble.RandomForestRegressor(),
         xgb.XGBRegressor(),
         SVR(),
         neighbors.KNeighborsRegressor(),
-        AdaBoostRegressor()
+        ensemble.AdaBoostRegressor()
         # SGDRegressor(),
         # reg,
         # PLSRegression(n_components=3)
         # linear_model.ElasticNet(),
         # gaussian_process.GaussianProcessRegressor()
     ]
     model_name = [
         "Random Forest",
         "XGBoost",
         "Support Vector Regressor",
         "k-Nearest Neighbors",
         "AdaBoost Regressor"
         # "SGD Regressor",
         # "StackReg",
         # "Partial Least-Squares Regressor"
         # "ElasticNet",
         # "Gaussian Process"
     ]
 
     aggs = [non_nan_mean]
     df_subset_sludge = df_sludge_[["reduced_smiles", "hl_log_bayesian_mean", "package"]]  # "temperature", "log_hl_combined", "log_hl_biomass_corrected"
     df_subset_sludge_1 = df_subset_sludge.groupby(["reduced_smiles"]).agg(aggs).reset_index()
     df_subset_sludge_1.columns = df_subset_sludge_1.columns.droplevel(1)
     sludge_bayesian_merge_padel = pd.merge(df_subset_sludge_1, sludge_padel_descriptor, how='left', on='reduced_smiles').dropna()
 
     soil_bayesian_merge_padel_reduced = pd.read_csv(output_file_path + "soil_bayesian_merge_padel_reduced.tsv", sep="\t")
     soil_bayesian_merge_padel_reduced.dropna(axis=0, inplace=True)
     soil_bayesian_merge_padel_reduced.drop(columns=['Unnamed: 0', 'topoShape'], inplace=True)  # contains a subset of all target variables (without pH) and the reduced descriptors
 
     # Keep the hold-out set
     X_sludge_padel, X_validate, y_sludge_padel, y_validate = train_test_split(sludge_bayesian_merge_padel.iloc[:, 2:], sludge_bayesian_merge_padel['hl_log_bayesian_mean'], test_size=0.10, random_state=62)
 
     if COMBINED_LEARNING:
-        selected_X_combined_padel_array, selected_features_combined, X_combined_padel, y_combined_padel = generate_df_for_collinearity_analysis(CLUSTER_THRESHOLD, DENDROGRAM, COMBINED_LEARNING_COLLINEARITY, X_sludge_padel, y_sludge_padel, soil_bayesian_merge_padel_reduced)
-        features_10, features_20, features_30, features_40, features_all, tree_importance_sorted_idx_CL = return_topX_features(DESCRIPTORS, TOPS, COMBINED_LEARNING_COLLINEARITY, selected_X_combined_padel_array, y_combined_padel, X_combined_padel, ITERATIONS)
-        df_top_feature_padel = pd.DataFrame.from_dict(store_result(DESCRIPTORS, COMBINED_LEARNING, ITERATIONS, MODELS, model_name, X_sludge_padel, y_sludge_padel, features_10, features_20, features_30, features_40, features_all, soil_bayesian_merge_padel_reduced))
+        selected_X_combined_padel_array, selected_features_combined, X_combined_padel, y_combined_padel = generate_df_for_collinearity_analysis(X_sludge_padel, y_sludge_padel, soil_bayesian_merge_padel_reduced)
+        features_10, features_20, features_30, features_40, features_all, tree_importance_sorted_idx_CL = return_topX_features(TOPS, selected_X_combined_padel_array, y_combined_padel, X_combined_padel)
+        df_top_feature_padel = pd.DataFrame.from_dict(store_result(MODELS, model_name, X_sludge_padel, y_sludge_padel, features_10, features_20, features_30, features_40, features_all, soil_bayesian_merge_padel_reduced))
         # COMBINED_LEARNING, ITERATIONS, MODELS, X, y, features_10, features_20, features_30, features_40, features_all
-        plot_rmse(COMBINED_LEARNING, DESCRIPTORS, df_top_feature_padel)
-        plot_r2(COMBINED_LEARNING, DESCRIPTORS, df_top_feature_padel)
+        plot_rmse(df_top_feature_padel)
+        plot_r2(df_top_feature_padel)
     else:
-        selected_X_sludge_padel_array, selected_features_sludge, X_combined_padel, y_combined_padel = generate_df_for_collinearity_analysis(CLUSTER_THRESHOLD, DENDROGRAM, COMBINED_LEARNING_COLLINEARITY, X_sludge_padel, y_sludge_padel, soil_bayesian_merge_padel_reduced)
-        features_10, features_20, features_30, features_40, features_all, tree_importance_sorted_idx = return_topX_features(DESCRIPTORS, TOPS, COMBINED_LEARNING_COLLINEARITY, selected_X_sludge_padel_array, y_sludge_padel, sludge_bayesian_merge_padel, ITERATIONS)
-        df_top_feature_padel_noCL = pd.DataFrame.from_dict(store_result(DESCRIPTORS, COMBINED_LEARNING, ITERATIONS, MODELS, model_name, X_sludge_padel, y_sludge_padel, features_10, features_20, features_30, features_40, features_all, soil_bayesian_merge_padel_reduced))
-        plot_rmse(COMBINED_LEARNING, DESCRIPTORS, df_top_feature_padel_noCL)
-        plot_r2(COMBINED_LEARNING, DESCRIPTORS, df_top_feature_padel_noCL)
-        # else:
-        #     selected_X_sludge_padel_array, selected_features_sludge, X_combined_padel, y_combined_padel = generate_df_for_collinearity_analysis(CLUSTER_THRESHOLD, DENDROGRAM, COMBINED_LEARNING_COLLINEARITY, X_sludge_padel, y_sludge_padel, soil_bayesian_merge_padel_reduced)
-
-        # if COMBINED_LEARNING:
-
-        #     features_10, features_20, features_30, features_40, features_all = return_topX_features(COMBINED_LEARNING_COLLINEARITY, selected_X_sludge_padel_array, y_sludge_padel, sludge_bayesian_merge_padel)
-
-
+        selected_X_sludge_padel_array, selected_features_sludge, X_combined_padel, y_combined_padel = generate_df_for_collinearity_analysis(X_sludge_padel, y_sludge_padel, soil_bayesian_merge_padel_reduced)
+        features_10, features_20, features_30, features_40, features_all, tree_importance_sorted_idx = return_topX_features(TOPS, selected_X_sludge_padel_array, y_sludge_padel, sludge_bayesian_merge_padel)
+        df_top_feature_padel_noCL = pd.DataFrame.from_dict(store_result(MODELS, model_name, X_sludge_padel, y_sludge_padel, features_10, features_20, features_30, features_40, features_all, soil_bayesian_merge_padel_reduced))
+        plot_rmse(df_top_feature_padel_noCL)
+        plot_r2(df_top_feature_padel_noCL)
 
     dict_features_list = {'top 10': features_10, 'top 20': features_20, 'top 30': features_30, 'top 40': features_40, 'all': features_all}
 
-    # if COMBINED_LEARNING:
-    #     df_top_feature_padel = pd.DataFrame.from_dict(store_result(MODELS, X_sludge_padel, y_sludge_padel))
-    # else:
-    #     df_top_feature_padel_noCL = pd.DataFrame.from_dict(store_result(MODELS, X_sludge_padel, y_sludge_padel))
-
-
-
 
 if __name__ == '__main__':
     main()