"""RF_variableImportance.py

Train random forest regressors on hourly meteorological data to estimate
feature importances for the target variable SIS: one model on the full
hourly sample and one per selected hour in January (9, 11, 13 h) and June
(6, 11, 16 h). Mean importances, their standard deviations across trees,
and the indices of the top features are written to CSV.
"""

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import pandas as pd
import xarray as xr
import os
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error as mse
from meteo_data import Meteo_Reader
import joblib  # sklearn.externals.joblib was removed in newer scikit-learn releases
import util
data_path = os.path.abspath("/mnt/sda1/hyenergy/data/meteo")
# data_path = os.path.abspath("/Users/alinawalch/Documents/EPFL/data/meteo")
sample0 = pd.read_pickle(os.path.join(data_path, 'datasets', 'hourly_1M_sample.pkl'))
ftr_table0 = sample0.drop(columns = ['index', 'lon','lat','date','year','CH','day','timestamp','SIS'])
tgt_table0 = sample0[['SIS']]
ftr_table = util.standardize(ftr_table0, ftr_table0.mean(), ftr_table0.std())
tgt_table = util.standardize(tgt_table0, tgt_table0.mean(), tgt_table0.std())
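# util.standardize comes from the local util module; it is assumed here to be a
# plain column-wise z-score transform, roughly equivalent to this minimal sketch:
#
#     def standardize(table, mean, std):
#         """Return (table - mean) / std, broadcast column-wise."""
#         return (table - mean) / std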
x0 = ftr_table0.to_numpy()                    # unstandardized features
t0 = tgt_table0.to_numpy().reshape((-1,))     # unstandardized target
x_all = ftr_table.to_numpy()                  # standardized features
t_all = tgt_table.to_numpy().reshape((-1,))   # standardized target
# Row layout: 0 = full sample, 1-3 = January hours (9, 11, 13), 4-6 = June hours (6, 11, 16)
importances = np.zeros((7, x_all.shape[1]))
stds = np.zeros((7, x_all.shape[1]))
inds = np.zeros((7, x_all.shape[1]))          # indices of top features (assumes <= 30 features)
# Random Forest
n_est = 500
#################### MONTHLY ANALYSIS ##################################
### January
data = pd.read_pickle(os.path.join(data_path, 'datasets', 'hourly_all_jan.pkl'))
hrs = [9,11,13]
for i in [0,1,2]:
    h = hrs[i]
    sample = data.loc[(data.hour==h)]
    ftr_jan0 = sample.drop(columns = ['lon','lat','year','CH','day','SIS']).reindex(columns = ftr_table0.columns)
    tgt_jan0 = sample[['SIS']]
    ftr_jan = util.standardize(ftr_jan0, ftr_jan0.mean(), ftr_jan0.std())
    tgt_jan = util.standardize(tgt_jan0, tgt_jan0.mean(), tgt_jan0.std())
    x0 = ftr_jan0.to_numpy()
    t0 = tgt_jan0.to_numpy().reshape((-1,))
    x = ftr_jan.to_numpy()
    t = tgt_jan.to_numpy().reshape((-1,))
    print(x.shape)
    print(t.shape)
    rf = RandomForestRegressor(n_estimators = n_est, max_depth = 100, n_jobs = -1)
    print('Training random forest... ')
    timer_0 = util.Timer()
    rf.fit(x, t)
    timer_0.stop()
    joblib.dump(rf, os.path.join(data_path,'datasets', ('model_importances_jan_%d.pkl' %h)))
    # Rows 1-3: mean and std of impurity-based importances across the trees
    importances[i+1,:] = np.mean([tree.feature_importances_ for tree in rf.estimators_],axis=0)
    stds[i+1,:] = np.std([tree.feature_importances_ for tree in rf.estimators_],axis=0)
    inds[i+1,:] = np.argsort(importances[i+1,:])[-30:]
    print(importances[i+1,:])
    print(stds[i+1,:])
    print(inds[i+1,:])
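# Sanity check (sketch): for scikit-learn forests, rf.feature_importances_ is the
# per-tree mean re-normalised to sum to one, so it should closely match the row
# computed above for the most recently fitted model (January, 13 h).
print(np.allclose(rf.feature_importances_, importances[3,:], atol=1e-6))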
### June
data = pd.read_pickle(os.path.join(data_path, 'datasets', 'hourly_all_jun.pkl'))
hrs = [6,11,16]
for i in [0,1,2]:
    h = hrs[i]
    sample = data.loc[(data.hour==h)]
    ftr_jun0 = sample.drop(columns = ['lon','lat','year','CH','day','SIS']).reindex(columns = ftr_table0.columns)
    tgt_jun0 = sample[['SIS']]
    ftr_jun = util.standardize(ftr_jun0, ftr_jun0.mean(), ftr_jun0.std())
    tgt_jun = util.standardize(tgt_jun0, tgt_jun0.mean(), tgt_jun0.std())
    x0 = ftr_jun0.to_numpy()
    t0 = tgt_jun0.to_numpy().reshape((-1,))
    x = ftr_jun.to_numpy()
    t = tgt_jun.to_numpy().reshape((-1,))
    rf = RandomForestRegressor(n_estimators = n_est, max_depth = 100, n_jobs = -1)
    print('Training random forest... ')
    timer_0 = util.Timer()
    rf.fit(x, t)
    timer_0.stop()
    joblib.dump(rf, os.path.join(data_path,'datasets', ('model_importances_jun_%d.pkl' %h)))
    # Rows 4-6: June models
    importances[i+4,:] = np.mean([tree.feature_importances_ for tree in rf.estimators_],axis=0)
    stds[i+4,:] = np.std([tree.feature_importances_ for tree in rf.estimators_],axis=0)
    inds[i+4,:] = np.argsort(importances[i+4,:])[-30:]
    print(importances[i+4,:])
    print(stds[i+4,:])
    print(inds[i+4,:])
#################### FULL SAMPLE ########################################
# Row 0: model trained on the full standardized hourly sample (hourly_1M_sample.pkl)
rf = RandomForestRegressor(n_estimators = n_est, max_depth = 100, n_jobs = -1)
print('Training random forest... ')
timer_0 = util.Timer()
rf.fit(x_all, t_all)
timer_0.stop()
joblib.dump(rf, os.path.join(data_path,'datasets', 'model_importances.pkl'))
importances[0,:] = np.mean([tree.feature_importances_ for tree in rf.estimators_],axis=0)
stds[0,:] = np.std([tree.feature_importances_ for tree in rf.estimators_],axis=0)
inds[0,:] = np.argsort(importances[0,:])[-30:]
print(importances[0,:])
print(stds[0,:])
print(inds[0,:])
pd.DataFrame(data = importances, columns = ftr_table0.columns).to_csv('importances.csv')
pd.DataFrame(data = stds, columns = ftr_table0.columns).to_csv('stds.csv')
# Note: inds holds feature indices sorted by importance, so the feature-name
# column headers in inds.csv are positional only.
pd.DataFrame(data = inds, columns = ftr_table0.columns).to_csv('inds.csv')
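# Optional cross-check (sketch, assumes scikit-learn >= 0.22 for sklearn.inspection):
# impurity-based importances can favour high-cardinality features, so permutation
# importance on the saved full-sample model gives an independent ranking.
# from sklearn.inspection import permutation_importance
# rf_all = joblib.load(os.path.join(data_path, 'datasets', 'model_importances.pkl'))
# perm = permutation_importance(rf_all, x_all, t_all, n_repeats=5, n_jobs=-1)
# pd.DataFrame({'feature': ftr_table0.columns,
#               'perm_importance': perm.importances_mean,
#               'perm_std': perm.importances_std}).to_csv('perm_importances.csv')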
# plt.figure(figsize=(15,15))
# plt.title("Feature Importances for Random Forest (%d trees)" %n_est)
# ind = inds[0,:].astype(int)                  # top features of the full-sample model
# feature_names = np.array(ftr_table0.columns)
# plt.barh(range(len(ind)), importances[0,ind],
#          color="r", xerr=stds[0,ind], alpha=0.4, align="center")
# plt.yticks(range(len(ind)), feature_names[ind])
# plt.ylim([-1, len(ind)])
# plt.savefig("importances.png")
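# Possible follow-up plot (sketch): compare the importance profiles of the seven
# models side by side; row order follows the layout of the importances array.
# row_labels = ['all', 'jan 09h', 'jan 11h', 'jan 13h', 'jun 06h', 'jun 11h', 'jun 16h']
# positions = np.arange(importances.shape[1])
# width = 1.0 / (len(row_labels) + 1)
# fig, ax = plt.subplots(figsize=(15, 8))
# for r, label in enumerate(row_labels):
#     ax.bar(positions + r * width, importances[r, :], width=width, label=label)
# ax.set_xticks(positions + 3 * width)
# ax.set_xticklabels(ftr_table0.columns, rotation=90)
# ax.set_ylabel("Impurity-based feature importance")
# ax.legend()
# fig.tight_layout()
# fig.savefig("importances_by_model.png")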
