"""RF_variableImportance.py

Train random forest regressors on hourly meteorological data to estimate
feature importances for the target variable SIS: one model on the full
hourly sample and one per selected hour in January (9, 11, 13 h) and June
(6, 11, 16 h). Mean importances, their standard deviations across trees,
and the indices of the top features are written to CSV.
"""

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import pandas as pd
import xarray as xr
import os
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error as mse
from meteo_data import Meteo_Reader
import joblib  # sklearn.externals.joblib was removed in newer scikit-learn releases
import util
data_path = os.path.abspath("/mnt/sda1/hyenergy/data/meteo")
# data_path = os.path.abspath("/Users/alinawalch/Documents/EPFL/data/meteo")
sample0 = pd.read_pickle(os.path.join(data_path, 'datasets', 'hourly_1M_sample.pkl'))
ftr_table0 = sample0.drop(columns = ['index', 'lon','lat','date','year','CH','day','timestamp','SIS'])
tgt_table0 = sample0[['SIS']]
ftr_table = util.standardize(ftr_table0, ftr_table0.mean(), ftr_table0.std())
tgt_table = util.standardize(tgt_table0, tgt_table0.mean(), tgt_table0.std())
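# util.standardize comes from the local util module; it is assumed here to be a
# plain column-wise z-score transform, roughly equivalent to this minimal sketch:
#
#     def standardize(table, mean, std):
#         """Return (table - mean) / std, broadcast column-wise."""
#         return (table - mean) / std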
x0 = ftr_table0.to_numpy()                    # unstandardized features
t0 = tgt_table0.to_numpy().reshape((-1,))     # unstandardized target
x_all = ftr_table.to_numpy()                  # standardized features
t_all = tgt_table.to_numpy().reshape((-1,))   # standardized target
# Row layout: 0 = full sample, 1-3 = January hours (9, 11, 13), 4-6 = June hours (6, 11, 16)
importances = np.zeros((7, x_all.shape[1]))
stds = np.zeros((7, x_all.shape[1]))
inds = np.zeros((7, x_all.shape[1]))          # indices of top features (assumes <= 30 features)
# Random Forest
n_est = 500
#################### MONTHLY ANALYSIS ##################################
### January
data = pd.read_pickle(os.path.join(data_path, 'datasets', 'hourly_all_jan.pkl'))
hrs = [9,11,13]
for i in [0,1,2]:
    h = hrs[i]
    sample = data.loc[(data.hour==h)]
    ftr_jan0 = sample.drop(columns = ['lon','lat','year','CH','day','SIS']).reindex(columns = ftr_table0.columns)
    tgt_jan0 = sample[['SIS']]
    ftr_jan = util.standardize(ftr_jan0, ftr_jan0.mean(), ftr_jan0.std())
    tgt_jan = util.standardize(tgt_jan0, tgt_jan0.mean(), tgt_jan0.std())
    x0 = ftr_jan0.to_numpy()
    t0 = tgt_jan0.to_numpy().reshape((-1,))
    x = ftr_jan.to_numpy()
    t = tgt_jan.to_numpy().reshape((-1,))
    print(x.shape)
    print(t.shape)
    rf = RandomForestRegressor(n_estimators = n_est, max_depth = 100, n_jobs = -1)
    print('Training random forest... ')
    timer_0 = util.Timer()
    rf.fit(x, t)
    timer_0.stop()
    joblib.dump(rf, os.path.join(data_path,'datasets', ('model_importances_jan_%d.pkl' %h)))
    # Rows 1-3: mean and std of impurity-based importances across the trees
    importances[i+1,:] = np.mean([tree.feature_importances_ for tree in rf.estimators_],axis=0)
    stds[i+1,:] = np.std([tree.feature_importances_ for tree in rf.estimators_],axis=0)
    inds[i+1,:] = np.argsort(importances[i+1,:])[-30:]
    print(importances[i+1,:])
    print(stds[i+1,:])
    print(inds[i+1,:])
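# Sanity check (sketch): for scikit-learn forests, rf.feature_importances_ is the
# per-tree mean re-normalised to sum to one, so it should closely match the row
# computed above for the most recently fitted model (January, 13 h).
print(np.allclose(rf.feature_importances_, importances[3,:], atol=1e-6))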
### June
data = pd.read_pickle(os.path.join(data_path, 'datasets', 'hourly_all_jun.pkl'))
hrs = [6,11,16]
for i in [0,1,2]:
    h = hrs[i]
    sample = data.loc[(data.hour==h)]
    ftr_jun0 = sample.drop(columns = ['lon','lat','year','CH','day','SIS']).reindex(columns = ftr_table0.columns)
    tgt_jun0 = sample[['SIS']]
    ftr_jun = util.standardize(ftr_jun0, ftr_jun0.mean(), ftr_jun0.std())
    tgt_jun = util.standardize(tgt_jun0, tgt_jun0.mean(), tgt_jun0.std())
    x0 = ftr_jun0.to_numpy()
    t0 = tgt_jun0.to_numpy().reshape((-1,))
    x = ftr_jun.to_numpy()
    t = tgt_jun.to_numpy().reshape((-1,))
    rf = RandomForestRegressor(n_estimators = n_est, max_depth = 100, n_jobs = -1)
    print('Training random forest... ')
    timer_0 = util.Timer()
    rf.fit(x, t)
    timer_0.stop()
    joblib.dump(rf, os.path.join(data_path,'datasets', ('model_importances_jun_%d.pkl' %h)))
    # Rows 4-6: June models
    importances[i+4,:] = np.mean([tree.feature_importances_ for tree in rf.estimators_],axis=0)
    stds[i+4,:] = np.std([tree.feature_importances_ for tree in rf.estimators_],axis=0)
    inds[i+4,:] = np.argsort(importances[i+4,:])[-30:]
    print(importances[i+4,:])
    print(stds[i+4,:])
    print(inds[i+4,:])
#################### FULL SAMPLE ########################################
# Row 0: model trained on the full standardized hourly sample (hourly_1M_sample.pkl)
rf = RandomForestRegressor(n_estimators = n_est, max_depth = 100, n_jobs = -1)
print('Training random forest... ')
timer_0 = util.Timer()
rf.fit(x_all, t_all)
timer_0.stop()
joblib.dump(rf, os.path.join(data_path,'datasets', 'model_importances.pkl'))
importances[0,:] = np.mean([tree.feature_importances_ for tree in rf.estimators_],axis=0)
stds[0,:] = np.std([tree.feature_importances_ for tree in rf.estimators_],axis=0)
inds[0,:] = np.argsort(importances[0,:])[-30:]
print(importances[0,:])
print(stds[0,:])
print(inds[0,:])
pd.DataFrame(data = importances, columns = ftr_table0.columns).to_csv('importances.csv')
pd.DataFrame(data = stds, columns = ftr_table0.columns).to_csv('stds.csv')
# Note: inds holds feature indices sorted by importance, so the feature-name
# column headers in inds.csv are positional only.
pd.DataFrame(data = inds, columns = ftr_table0.columns).to_csv('inds.csv')
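# Optional cross-check (sketch, assumes scikit-learn >= 0.22 for sklearn.inspection):
# impurity-based importances can favour high-cardinality features, so permutation
# importance on the saved full-sample model gives an independent ranking.
# from sklearn.inspection import permutation_importance
# rf_all = joblib.load(os.path.join(data_path, 'datasets', 'model_importances.pkl'))
# perm = permutation_importance(rf_all, x_all, t_all, n_repeats=5, n_jobs=-1)
# pd.DataFrame({'feature': ftr_table0.columns,
#               'perm_importance': perm.importances_mean,
#               'perm_std': perm.importances_std}).to_csv('perm_importances.csv')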
# plt.figure(figsize=(15,15))
# plt.title("Feature Importances for Random Forest (%d trees)" %n_est)
# ind = inds[0,:].astype(int)                  # top features of the full-sample model
# feature_names = np.array(ftr_table0.columns)
# plt.barh(range(len(ind)), importances[0,ind],
#          color="r", xerr=stds[0,ind], alpha=0.4, align="center")
# plt.yticks(range(len(ind)), feature_names[ind])
# plt.ylim([-1, len(ind)])
# plt.savefig("importances.png")
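# Possible follow-up plot (sketch): compare the importance profiles of the seven
# models side by side; row order follows the layout of the importances array.
# row_labels = ['all', 'jan 09h', 'jan 11h', 'jan 13h', 'jun 06h', 'jun 11h', 'jun 16h']
# positions = np.arange(importances.shape[1])
# width = 1.0 / (len(row_labels) + 1)
# fig, ax = plt.subplots(figsize=(15, 8))
# for r, label in enumerate(row_labels):
#     ax.bar(positions + r * width, importances[r, :], width=width, label=label)
# ax.set_xticks(positions + 3 * width)
# ax.set_xticklabels(ftr_table0.columns, rotation=90)
# ax.set_ylabel("Impurity-based feature importance")
# ax.legend()
# fig.tight_layout()
# fig.savefig("importances_by_model.png")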
