# Cut the hierarchical-clustering linkage into flat clusters at
# CLUSTER_THRESHOLD. With criterion="distance" the threshold is a cophenetic
# distance; fcluster returns a 1-D NumPy array holding one cluster id per
# clustered item (presumably one per feature, in column order -- confirm
# against how dist_linkage was built).
cluster_ids = hierarchy.fcluster(
    dist_linkage, CLUSTER_THRESHOLD, criterion="distance"
)

# Group item positions by cluster id: {cluster_id: [idx, idx, ...]}.
# enumerate yields (position, cluster_id) pairs, and the cluster id is the key.
cluster_id_to_feature_ids = defaultdict(list)
for idx, cluster_id in enumerate(cluster_ids):
    cluster_id_to_feature_ids[cluster_id].append(idx)

# Each dict value is the list of feature indices that fell into one cluster
# (one or more entries). Keep only the first index of each list as that
# cluster's representative; because indices are appended in ascending order,
# this is the lowest index in the cluster.
selected_feature_ids = [
    member_ids[0] for member_ids in cluster_id_to_feature_ids.values()
]
# Convert input_df to an np.array and transpose it so that each row holds the data of one feature.
# Index with [i] to pick out the features listed in selected_feature_ids.
# After the list comprehension, convert the resulting list back to an np.array so that it can be transposed back to the original row/column orientation.
# When running combined learning, first check collinearity on the soil and sludge data together, since the concatenated dataset might contain undiscovered relationships.
# Run the collinearity check on the combined PaDEL descriptor set.
# Per the original note, the call returns an nd-array together with the list
# of selected features.
# Recorded alongside this call: cluster threshold 0.02, descriptors 1085.
(
    selected_X_combined_padel_array,
    selected_features_combined,
) = check_collinearity(X_combined_padel)
# Run the model selected by key m and unpack the per-run results.
# Per the original note: for a sludge-only run, use X_sludge_padel and
# y_sludge_padel (presumably in place of X and y -- confirm with the caller).
(
    r2_list,
    rmse_list,
    mae_list,
    model_used,
    combined_learning_used,
    features_used,
    descriptor_used,
) = run_model(
    X,
    y,
    MODELS[m],
    m,
    model_name,
    features_10,
    features_20,
    features_30,
    features_40,
    features_all,
    soil_bayesian_merge_padel_reduced,
)