utils_randomforest.py

#!/usr/bin/env python
# coding: utf-8
import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
import matplotlib.pyplot as plt

# Split data into three different sets.
# ratio_t is the proportion of the test set
# ratio_v is the proportion of the validation set
# The rest will be the training set
def split_data(X, y, ratio_t, ratio_v):
    s = X.shape[0]
    s1 = int(ratio_t * s)
    s2 = int(ratio_v * s) + s1
    # make sure that the sizes of all sets are consistent
    assert s1 < s2 and s2 <= s
    indices = np.arange(s)
    np.random.shuffle(indices)
    # prepare the indices of the three different sets
    idx_test = indices[:s1]
    idx_val = indices[s1:s2]
    idx_train = indices[s2:]
    X_test = X[idx_test]
    X_val = X[idx_val]
    X_train = X[idx_train]
    y_test = y[idx_test]
    y_val = y[idx_val]
    y_train = y[idx_train]
    return X_train, X_val, X_test, y_train, y_val, y_test
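
# Example usage (illustrative sketch only, not part of the original pipeline):
# the synthetic arrays below simply show the expected split proportions.
def _example_split_data():
    X_demo = np.random.rand(100, 5)
    y_demo = np.random.rand(100)
    X_tr, X_v, X_te, y_tr, y_v, y_te = split_data(X_demo, y_demo, ratio_t=0.1, ratio_v=0.2)
    # with 100 rows: 10 test, 20 validation, 70 training
    print(X_tr.shape, X_v.shape, X_te.shape)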

# Plot feature importance for the model
def plot_importance(cols, model):
    features = cols
    importances = model.feature_importances_
    indices = np.argsort(importances)
    plt.subplots(figsize=(11, 9))
    plt.title('Feature Importances')
    plt.barh(range(len(indices)), importances[indices], color='pink', align='center')
    plt.yticks(range(len(indices)), [features[i] for i in indices])
    plt.xlabel('Relative Importance')
    plt.show()
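
# Example usage (illustrative sketch only): fits a toy RandomForestRegressor on
# synthetic data; the feature names are placeholders, not real data set columns.
def _example_plot_importance():
    X_demo = np.random.rand(200, 3)
    y_demo = 2 * X_demo[:, 0] + np.random.rand(200)
    model = RandomForestRegressor(n_estimators=10, random_state=0).fit(X_demo, y_demo)
    plot_importance(['feat_a', 'feat_b', 'feat_c'], model)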

# Normalize or standardize the training set,
# then apply the same transform to the validation and test sets
def pre_processing(X_train, X_val, X_test, norm=False, std=False):
    """
    Apply a transformation to the data.
    Options are standardization or normalization (using the L2 norm).
    By default, no transformation is applied to the data.
    """
    if norm:
        n = Normalizer()
        X_train = n.fit_transform(X_train)
        X_test = n.transform(X_test)
        X_val = n.transform(X_val)
    if std:
        sc = StandardScaler()
        X_train = sc.fit_transform(X_train)
        X_test = sc.transform(X_test)
        X_val = sc.transform(X_val)
    return X_train, X_val, X_test
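
# Example usage (illustrative sketch only): standardizes synthetic splits; after the
# call, each column of the training set has roughly zero mean and unit variance.
def _example_pre_processing():
    X_tr, X_v, X_te = np.random.rand(70, 4), np.random.rand(20, 4), np.random.rand(10, 4)
    X_tr, X_v, X_te = pre_processing(X_tr, X_v, X_te, std=True)
    print(X_tr.mean(axis=0).round(2), X_tr.std(axis=0).round(2))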

# Combine data for 2016 and 2017
# Default data set proportions: train 0.7 - val 0.2 - test 0.1
# Also return the column list
def prepare_combined_data_forCV(file_names, ratio_t=0.1, ratio_v=0.2, norm=False, std=False):
    if type(file_names) == str:
        file_names = [file_names]
    dfs = []
    for file_name in file_names:
        local_df = pd.read_csv(file_name)
        dfs.append(local_df)
    df = pd.concat(dfs, sort=False)
    y = df['Produktion [kWh]**'].values
    df = df.drop(columns=["Produktion [kWh]**", "Anlage_Ort # Emplacement de installation"])
    cols = df.columns
    X_u = df.values
    X_train, X_val, X_test, y_train, y_val, y_test = split_data(X_u, y, ratio_t, ratio_v)
    X_train, X_val, X_test = pre_processing(X_train, X_val, X_test, norm, std)
    print("There are:")
    print(len(cols), "features for each data point.")
    print(X_train.shape[0], "data points in the training set.")
    print(X_val.shape[0], "data points in the validation set.")
    print(X_test.shape[0], "data points in the test set.")
    return X_train, X_val, X_test, y_train, y_val, y_test, cols
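
# Example usage (illustrative sketch only): the CSV file names below are hypothetical
# placeholders; the real files must contain the 'Produktion [kWh]**' and
# 'Anlage_Ort # Emplacement de installation' columns expected by this module.
def _example_prepare_combined_data():
    X_tr, X_v, X_te, y_tr, y_v, y_te, cols = prepare_combined_data_forCV(
        ["data_2016.csv", "data_2017.csv"], ratio_t=0.1, ratio_v=0.2, std=True)
    return X_tr, X_v, X_te, y_tr, y_v, y_te, cols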

# Drop useless columns
# Also return the REMAINING feature list
def prepare_data_drop(file_names, cols_to_drop, norm=False, std=False):
    if type(file_names) == str:
        file_names = [file_names]
    dfs = []
    for file_name in file_names:
        local_df = pd.read_csv(file_name)
        dfs.append(local_df)
    df = pd.concat(dfs, sort=False)
    y = df['Produktion [kWh]**'].values  # per 1000 inhabitants; the inhabitants column is dropped
    df = df.drop(columns=["Produktion [kWh]**", "Anlage_Ort # Emplacement de installation"])
    for c in cols_to_drop:
        if c in df:
            df = df.drop(columns=c)
    cols = df.columns
    X_u = df.values
    X = StandardScaler().fit_transform(X_u)
    print("There are:")
    print(len(cols), "reduced features for each data point.")
    print(X.shape[0], "data points in total.")
    return X, y, cols

# Prepare data for the training set
# Drop useless columns
# Return X and y for prediction, as well as the remaining columns
def prepare_data(file_names, norm=False, std=False):
    if type(file_names) == str:
        file_names = [file_names]
    dfs = []
    for file_name in file_names:
        local_df = pd.read_csv(file_name)
        dfs.append(local_df)
    df = pd.concat(dfs, sort=False)
    y = df['Produktion [kWh]**'].values
    df = df.drop(columns=["Produktion [kWh]**", "Anlage_Ort # Emplacement de installation"])
    cols = df.columns
    X_u = df.values
    # standardize the data
    X = StandardScaler().fit_transform(X_u)
    print("There are:")
    print(len(cols), "features for each data point.")
    print(X.shape[0], "data points.")
    return X, y, cols

# Perform a cross-validation for the Random Forest
# Hyperparameters to tune are "n_estimators" and "max_depth"
# Aim to minimize the RMSE on the validation set
def rf_cv(X_train, X_val, y_train, y_val, max_depths, n_estimators):
    rmse_val = []
    best_depths = []
    for n_e in n_estimators:
        rmse_val_tmp = []
        for d in max_depths:
            # create the random forest with the hyperparameters to tune
            regr = RandomForestRegressor(max_depth=d, n_estimators=n_e, random_state=0)
            # train
            regr.fit(X_train, y_train)
            # evaluate
            y_pred_val = regr.predict(X_val)
            rmse_te = np.sqrt(metrics.mean_squared_error(y_val, y_pred_val))
            rmse_val_tmp.append(rmse_te)
        best_d_tmp = np.argmin(rmse_val_tmp)
        best_rmse_val_d = rmse_val_tmp[best_d_tmp]
        best_depths.append(best_d_tmp)  # index of the best depth for this n_estimators
        rmse_val.append(best_rmse_val_d)  # best rmse for the best depth
        print("n_estimators=", n_e, " depth=", max_depths[best_d_tmp], " with rmse_val ", best_rmse_val_d)
    idx = np.argmin(rmse_val)
    best_estimator = n_estimators[idx]  # best n_estimators for the best rmse
    best_depth = max_depths[best_depths[idx]]  # best depth
    print("idx is", idx)
    print("best n_estimators is", best_estimator)
    print("best_depth is", best_depth)
    return best_estimator, best_depth, rmse_val, best_depths
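
# Example usage (illustrative sketch only): runs the grid search on synthetic data
# to show the expected argument types (lists of candidate hyperparameter values).
def _example_rf_cv():
    X_tr, y_tr = np.random.rand(80, 5), np.random.rand(80)
    X_v, y_v = np.random.rand(20, 5), np.random.rand(20)
    best_n, best_d, rmses, depth_idx = rf_cv(X_tr, X_v, y_tr, y_v,
                                             max_depths=[2, 4, 8], n_estimators=[10, 50])
    return best_n, best_d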

# Based on the best "n_estimators" and "max_depth" found, perform a second cross-validation
# The hyperparameter to tune is "max_features"
# Aim to minimize the RMSE on the validation set
def rf_cv_2f(X_train, X_val, y_train, y_val, bestD, bestN, max_features_range):
    rmse_val = []
    for f in range(len(max_features_range)):
        # create the random forest with the hyperparameter to tune
        regr = RandomForestRegressor(max_features=int(max_features_range[f]), max_depth=bestD,
                                     n_estimators=bestN, random_state=0)
        print("Testing max_features =", int(max_features_range[f]))
        regr.fit(X_train, y_train)
        # evaluate
        y_pred_val = regr.predict(X_val)
        rmse_te = np.sqrt(metrics.mean_squared_error(y_val, y_pred_val))
        rmse_val.append(rmse_te)
    idx = np.argmin(rmse_val)
    best_f = max_features_range[idx]  # best max_features
    print("idx is", idx)
    print("best max_features is", best_f, "for bestD", bestD, "and bestN", bestN)
    return int(best_f), rmse_val
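
# Example usage (illustrative sketch only): full tuning workflow on synthetic data;
# tune n_estimators/max_depth with rf_cv, then max_features with rf_cv_2f, and refit
# a final model with the selected values. The candidate grids are arbitrary examples.
def _example_full_tuning():
    X_tr, y_tr = np.random.rand(80, 6), np.random.rand(80)
    X_v, y_v = np.random.rand(20, 6), np.random.rand(20)
    best_n, best_d, _, _ = rf_cv(X_tr, X_v, y_tr, y_v, [2, 4, 8], [10, 50])
    best_f, _ = rf_cv_2f(X_tr, X_v, y_tr, y_v, best_d, best_n, np.arange(1, 7))
    final = RandomForestRegressor(n_estimators=best_n, max_depth=best_d,
                                  max_features=best_f, random_state=0)
    final.fit(np.vstack([X_tr, X_v]), np.concatenate([y_tr, y_v]))
    return final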
