utils_randomforest.py

#!/usr/bin/env python
# coding: utf-8
import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
import matplotlib.pyplot as plt

# Split data into three different sets.
# ratio_t is the proportion of the test set
# ratio_v is the proportion of the validation set
# The rest will be the training set
def split_data(X, y, ratio_t, ratio_v):
    s = X.shape[0]
    s1 = int(ratio_t * s)
    s2 = int(ratio_v * s) + s1
    # make sure that the sizes of all sets are consistent
    assert s1 < s2 and s2 <= s
    indices = np.arange(s)
    np.random.shuffle(indices)
    # prepare the indices of the three different sets
    idx_test = indices[:s1]
    idx_val = indices[s1:s2]
    idx_train = indices[s2:]
    X_test = X[idx_test]
    X_val = X[idx_val]
    X_train = X[idx_train]
    y_test = y[idx_test]
    y_val = y[idx_val]
    y_train = y[idx_train]
    return X_train, X_val, X_test, y_train, y_val, y_test
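
# Example usage (illustrative sketch only, not part of the original pipeline):
# the synthetic arrays below simply show the expected split proportions.
def _example_split_data():
    X_demo = np.random.rand(100, 5)
    y_demo = np.random.rand(100)
    X_tr, X_v, X_te, y_tr, y_v, y_te = split_data(X_demo, y_demo, ratio_t=0.1, ratio_v=0.2)
    # with 100 rows: 10 test, 20 validation, 70 training
    print(X_tr.shape, X_v.shape, X_te.shape)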

# Plot feature importance for the model
def plot_importance(cols, model):
    features = cols
    importances = model.feature_importances_
    indices = np.argsort(importances)
    plt.subplots(figsize=(11, 9))
    plt.title('Feature Importances')
    plt.barh(range(len(indices)), importances[indices], color='pink', align='center')
    plt.yticks(range(len(indices)), [features[i] for i in indices])
    plt.xlabel('Relative Importance')
    plt.show()
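
# Example usage (illustrative sketch only): fits a toy RandomForestRegressor on
# synthetic data; the feature names are placeholders, not real data set columns.
def _example_plot_importance():
    X_demo = np.random.rand(200, 3)
    y_demo = 2 * X_demo[:, 0] + np.random.rand(200)
    model = RandomForestRegressor(n_estimators=10, random_state=0).fit(X_demo, y_demo)
    plot_importance(['feat_a', 'feat_b', 'feat_c'], model)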

# Normalize or standardize the training set,
# then apply the same transform to the validation and test sets
def pre_processing(X_train, X_val, X_test, norm=False, std=False):
    """
    Apply a transformation to the data.
    Options are standardization or normalization (using the L2 norm).
    By default, no transformation is applied to the data.
    """
    if norm:
        n = Normalizer()
        X_train = n.fit_transform(X_train)
        X_test = n.transform(X_test)
        X_val = n.transform(X_val)
    if std:
        sc = StandardScaler()
        X_train = sc.fit_transform(X_train)
        X_test = sc.transform(X_test)
        X_val = sc.transform(X_val)
    return X_train, X_val, X_test
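
# Example usage (illustrative sketch only): standardizes synthetic splits; after the
# call, each column of the training set has roughly zero mean and unit variance.
def _example_pre_processing():
    X_tr, X_v, X_te = np.random.rand(70, 4), np.random.rand(20, 4), np.random.rand(10, 4)
    X_tr, X_v, X_te = pre_processing(X_tr, X_v, X_te, std=True)
    print(X_tr.mean(axis=0).round(2), X_tr.std(axis=0).round(2))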

# Combine data for 2016 and 2017
# Default data set proportions: train 0.7 - val 0.2 - test 0.1
# Also return the column list
def prepare_combined_data_forCV(file_names, ratio_t=0.1, ratio_v=0.2, norm=False, std=False):
    if type(file_names) == str:
        file_names = [file_names]
    dfs = []
    for file_name in file_names:
        local_df = pd.read_csv(file_name)
        dfs.append(local_df)
    df = pd.concat(dfs, sort=False)
    y = df['Produktion [kWh]**'].values
    df = df.drop(columns=["Produktion [kWh]**", "Anlage_Ort # Emplacement de installation"])
    cols = df.columns
    X_u = df.values
    X_train, X_val, X_test, y_train, y_val, y_test = split_data(X_u, y, ratio_t, ratio_v)
    X_train, X_val, X_test = pre_processing(X_train, X_val, X_test, norm, std)
    print("There are:")
    print(len(cols), "features for each data point.")
    print(X_train.shape[0], "data points in the training set.")
    print(X_val.shape[0], "data points in the validation set.")
    print(X_test.shape[0], "data points in the test set.")
    return X_train, X_val, X_test, y_train, y_val, y_test, cols
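
# Example usage (illustrative sketch only): the CSV file names below are hypothetical
# placeholders; the real files must contain the 'Produktion [kWh]**' and
# 'Anlage_Ort # Emplacement de installation' columns expected by this module.
def _example_prepare_combined_data():
    X_tr, X_v, X_te, y_tr, y_v, y_te, cols = prepare_combined_data_forCV(
        ["data_2016.csv", "data_2017.csv"], ratio_t=0.1, ratio_v=0.2, std=True)
    return X_tr, X_v, X_te, y_tr, y_v, y_te, cols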

# Drop useless columns
# Also return the REMAINING feature list
def prepare_data_drop(file_names, cols_to_drop, norm=False, std=False):
    if type(file_names) == str:
        file_names = [file_names]
    dfs = []
    for file_name in file_names:
        local_df = pd.read_csv(file_name)
        dfs.append(local_df)
    df = pd.concat(dfs, sort=False)
    y = df['Produktion [kWh]**'].values  # per 1000 inhabitants; the inhabitants column is dropped
    df = df.drop(columns=["Produktion [kWh]**", "Anlage_Ort # Emplacement de installation"])
    for c in cols_to_drop:
        if c in df:
            df = df.drop(columns=c)
    cols = df.columns
    X_u = df.values
    X = StandardScaler().fit_transform(X_u)
    print("There are:")
    print(len(cols), "reduced features for each data point.")
    print(X.shape[0], "data points in total.")
    return X, y, cols

# Prepare data for the training set
# Drop useless columns
# Return X and y for prediction, as well as the remaining columns
def prepare_data(file_names, norm=False, std=False):
    if type(file_names) == str:
        file_names = [file_names]
    dfs = []
    for file_name in file_names:
        local_df = pd.read_csv(file_name)
        dfs.append(local_df)
    df = pd.concat(dfs, sort=False)
    y = df['Produktion [kWh]**'].values
    df = df.drop(columns=["Produktion [kWh]**", "Anlage_Ort # Emplacement de installation"])
    cols = df.columns
    X_u = df.values
    # standardize the data
    X = StandardScaler().fit_transform(X_u)
    print("There are:")
    print(len(cols), "features for each data point.")
    print(X.shape[0], "data points.")
    return X, y, cols

# Perform a cross-validation for the Random Forest
# Hyperparameters to tune are "n_estimators" and "max_depth"
# Aim to minimize the RMSE on the validation set
def rf_cv(X_train, X_val, y_train, y_val, max_depths, n_estimators):
    rmse_val = []
    best_depths = []
    for n_e in n_estimators:
        rmse_val_tmp = []
        for d in max_depths:
            # create the random forest with the hyperparameters to tune
            regr = RandomForestRegressor(max_depth=d, n_estimators=n_e, random_state=0)
            # train
            regr.fit(X_train, y_train)
            # evaluate
            y_pred_val = regr.predict(X_val)
            rmse_te = np.sqrt(metrics.mean_squared_error(y_val, y_pred_val))
            rmse_val_tmp.append(rmse_te)
        best_d_tmp = np.argmin(rmse_val_tmp)
        best_rmse_val_d = rmse_val_tmp[best_d_tmp]
        best_depths.append(best_d_tmp)  # index of the best depth for this n_estimators
        rmse_val.append(best_rmse_val_d)  # best rmse for the best depth
        print("n_estimators=", n_e, " depth=", max_depths[best_d_tmp], " with rmse_val ", best_rmse_val_d)
    idx = np.argmin(rmse_val)
    best_estimator = n_estimators[idx]  # best n_estimators for the best rmse
    best_depth = max_depths[best_depths[idx]]  # best depth
    print("idx is", idx)
    print("best n_estimators is", best_estimator)
    print("best_depth is", best_depth)
    return best_estimator, best_depth, rmse_val, best_depths
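
# Example usage (illustrative sketch only): runs the grid search on synthetic data
# to show the expected argument types (lists of candidate hyperparameter values).
def _example_rf_cv():
    X_tr, y_tr = np.random.rand(80, 5), np.random.rand(80)
    X_v, y_v = np.random.rand(20, 5), np.random.rand(20)
    best_n, best_d, rmses, depth_idx = rf_cv(X_tr, X_v, y_tr, y_v,
                                             max_depths=[2, 4, 8], n_estimators=[10, 50])
    return best_n, best_d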

# Based on the best "n_estimators" and "max_depth" found, perform a second cross-validation
# The hyperparameter to tune is "max_features"
# Aim to minimize the RMSE on the validation set
def rf_cv_2f(X_train, X_val, y_train, y_val, bestD, bestN, max_features_range):
    rmse_val = []
    for f in range(len(max_features_range)):
        # create the random forest with the hyperparameter to tune
        regr = RandomForestRegressor(max_features=int(max_features_range[f]), max_depth=bestD,
                                     n_estimators=bestN, random_state=0)
        print("Testing max_features =", int(max_features_range[f]))
        regr.fit(X_train, y_train)
        # evaluate
        y_pred_val = regr.predict(X_val)
        rmse_te = np.sqrt(metrics.mean_squared_error(y_val, y_pred_val))
        rmse_val.append(rmse_te)
    idx = np.argmin(rmse_val)
    best_f = max_features_range[idx]  # best max_features
    print("idx is", idx)
    print("best max_features is", best_f, "for bestD", bestD, "and bestN", bestN)
    return int(best_f), rmse_val
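
# Example usage (illustrative sketch only): full tuning workflow on synthetic data;
# tune n_estimators/max_depth with rf_cv, then max_features with rf_cv_2f, and refit
# a final model with the selected values. The candidate grids are arbitrary examples.
def _example_full_tuning():
    X_tr, y_tr = np.random.rand(80, 6), np.random.rand(80)
    X_v, y_v = np.random.rand(20, 6), np.random.rand(20)
    best_n, best_d, _, _ = rf_cv(X_tr, X_v, y_tr, y_v, [2, 4, 8], [10, 50])
    best_f, _ = rf_cv_2f(X_tr, X_v, y_tr, y_v, best_d, best_n, np.arange(1, 7))
    final = RandomForestRegressor(n_estimators=best_n, max_depth=best_d,
                                  max_features=best_f, random_state=0)
    final.fit(np.vstack([X_tr, X_v]), np.concatenate([y_tr, y_v]))
    return final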
