Page MenuHomec4science

RF_uncertainties.py
No OneTemporary

File Metadata

Created
Sun, Apr 27, 04:58

RF_uncertainties.py

import matplotlib
matplotlib.use('qt5agg')
import numpy as np
import pandas as pd
import xarray as xr
import os
import hpelm
import util
from tqdm import tqdm
from ds import Dataset
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error as mse
from tables import open_file, Atom, Filters
from meteo_data import Meteo_Reader
from sklearn.externals import joblib
from visualise_spacetime import plot_map_month, plot_map_year
def mse_norm(y,t,norm):
return mse(norm.normalize(y),norm.normalize(t))
def find_name(filename):
path, name = os.path.split(filename)
root, ext = os.path.splitext(name)
i = 0
while os.path.exists(filename):
i += 1
filename = os.path.join(path, str(i) + '_' + root + ext)
return filename
# # Random Forest with uncertainties
# In the script, a random forest is trained from a training data set, the MSE is computed from the training and test datasets, and the prediction is made for each tree seperately, in order to later compute the mean and the variance of the prediction. This script can only be run if all training data fits into memory. After prediction, the residuals based on the out-of-bag error are computed, and a new forest is trained on the residuals. The variance of the prediction represents the model uncertainty, while the predicted residuals estimate the data uncertainty. [See FG poster]
# ## Setup
# ### Inputs
data_path = sys.argv[1]
# data_path='/Users/alinawalch/Documents/EPFL/data/meteo'
dsname = sys.argv[2]
queryname = sys.argv[3]
n_est = int(sys.argv[4])
tree_depth = 100
save_model = False
# ### Load dataset and training, validation, testing and query matrices
my_ds = Dataset(data_path, dsname, queryname)
# find number of features and number of targets
nf = my_ds.feature_norm.m
nt = my_ds.target_norm.m
# Merge training and validation data for crossvalidation training data and rescale (matrices give normalised data)
x = util.get_matrix(my_ds.train_X0)
t = util.get_matrix(my_ds.train_T0)
test_x = util.get_matrix(my_ds.test_X0)
test_t = util.get_matrix(my_ds.test_T0)
query_x = util.get_matrix(my_ds.query_X0)
if nt == 1:
t=t.reshape((-1,))
test_t=test_t.reshape((-1,))
# ## Random Forest
# Add model to dataset folder structure
modelname = 'RF' + str(n_est)
my_ds.add_model(modelname, queryname)
forest = RandomForestRegressor(n_estimators = n_est, max_depth = tree_depth, oob_score = True, n_jobs = -1)
# ### Forest fitting
timer_0 = util.Timer()
forest.fit(x, t)
timer_0.stop()
logfile = find_name(os.path.join(my_ds.model_path,('log_train.csv')))
with open(logfile, 'w') as csvfile:
w = csv.writer(csvfile, delimiter=',')
w.writerow(['', 'walltime', 'cputime'])
w.writerow(['model', timer_0.walltime, timer_0.cputime])
if save_model:
joblib.dump(tree, os.path.join(my_ds.model_path, 'model.pkl'))
# ### Model uncertainty
# Declare matrices for sums and sum-of-squares (do not save all n_est outputs)
N = t.shape[0]
N_q = query_x.shape[0]
if nt == 1:
sigma = np.zeros(N)
sigma_sq = np.zeros(N)
sigma_q = np.zeros(N_q)
sigma_sq_q = np.zeros(N_q)
else:
sigma = np.zeros((N,nt))
sigma_sq = np.zeros((N,nt))
sigma_q = np.zeros((N_q,nt))
sigma_sq_q = np.zeros((N_q,nt))
k = 10.0
# Predict for each tree seperately and update the sums and sum-of-squares
for tree in tqdm(forest.estimators_):
y = tree.predict(x)
sigma += y/k
sigma_sq += (y/k)**2
y_q = tree.predict(query_x)
sigma_q += y_q/k
sigma_sq_q += (y_q/k)**2
# #### Compute output (mean) and model uncertainty (var) for training data and for query data
mean = sigma * k/n_est
var = k**2/n_est*(sigma_sq-sigma**2/n_est)
del sigma
del sigma_sq
mean_q = sigma_q * k/n_est
var_q = k**2/n_est*(sigma_sq_q-sigma_q**2/n_est)
del sigma_q
del sigma_sq_q
# #### Obtain out-of-bag and testing outputs and compare mean squared errors (normalized, for comparability with ELM)
oob = forest.oob_prediction_
y_te = forest.predict(test_x)
print('Normalised MSE (testing): %f' %mse_norm(y_te,test_t,my_ds.target_norm))
print('Normalised MSE (out-of-bag): %f' %mse_norm(oob,t,my_ds.target_norm))
print('Normalised MSE (training): %f' %mse_norm(mean,t,my_ds.target_norm))
logpred = find_name(os.path.join(my_ds.model_path,('log_pred_mse.csv')))
with open(logpred, 'w') as csvfile:
w = csv.writer(csvfile, delimiter=',')
w.writerow(['', 'mse'])
w.writerow(['testing', mse_norm(y_te,test_t,my_ds.target_norm)])
w.writerow(['out-of-bag', mse_norm(oob,t,my_ds.target_norm)])
w.writerow(['training', mse_norm(mean,t,my_ds.target_norm)])
# ## Residuals
# Compute residuals
res_oob = (oob-t)**2 - var
res = np.maximum(res_oob, 0)
del res_oob
# ### Fit residual forest
# #### Split training data in training (80%) and testing (20%)
ind = np.random.permutation(N)
ind_te = int(0.2*N)
# testing set:
x_te = x[ind[:ind_te],:]
res_te = res[ind[:ind_te]]
# training set:
x = x[ind[ind_te:],:]
res = res[ind[ind_te:]]
# #### Fit forest
residuals = RandomForestRegressor(n_estimators = n_est, max_depth = 100, oob_score = False, n_jobs = -1)
timer_0 = util.Timer()
residuals.fit(x, res)
timer_0.stop()
with open(logfile, 'w') as csvfile:
w = csv.writer(csvfile, delimiter=',')
w.writerow(['residuals', timer_0.walltime, timer_0.cputime])
# #### Check training and testing MSE (not normalized!)
var_data_te = residuals.predict(x_te)
var_data_tr = residuals.predict(x)
print('MSE of residuals (testing): %f' %mse(var_data_te, res_te))
print('MSE of residuals (training): %f' %mse(var_data_tr, res))
with open(logpred, 'w') as csvfile:
w = csv.writer(csvfile, delimiter=',')
w.writerow(['', 'mse'])
w.writerow(['residual_te', mse(var_data_te, res_te)])
w.writerow(['residual_tr', mse(var_data_te, res_te)])
# ### Predict data uncertainty
var_data_q = residuals.predict(query_x)
del residuals
# ## Evaluation
var_tot = var_data_q + var_q
var_tot_perc = abs(var_tot/mean_q)
# ### Make xarray datasets
pred = my_ds.make_xarray(query_x, mean_q, rescale = False)
model_unc = my_ds.make_xarray(query_x, np.sqrt(var_q), rescale = False)
data_unc = my_ds.make_xarray(query_x, np.sqrt(var_data_q), rescale = False)
total_unc = my_ds.make_xarray(query_x, np.sqrt(var_tot), rescale = False)
pred.to_netcdf(os.path.join(my_ds.query_path_out, 'prediction.nc'))
model_unc.to_netcdf(os.path.join(my_ds.query_path_out, 'model_unc.nc'))
data_unc.to_netcdf(os.path.join(my_ds.query_path_out, 'data_unc.nc'))
total_unc.to_netcdf(os.path.join(my_ds.query_path_out, 'total_unc.nc'))

Event Timeline