RF_uncertainties.py
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Sun, Apr 27, 04:58

RF_uncertainties.py
View Options

	import matplotlib
	matplotlib.use('qt5agg')

	import numpy as np
	import pandas as pd
	import xarray as xr
	import os
	import hpelm
	import util
	from tqdm import tqdm
	from ds import Dataset
	from sklearn.ensemble import RandomForestRegressor
	from sklearn.metrics import mean_squared_error as mse
	from tables import open_file, Atom, Filters
	from meteo_data import Meteo_Reader
	from sklearn.externals import joblib

	from visualise_spacetime import plot_map_month, plot_map_year

	def mse_norm(y,t,norm):
	return mse(norm.normalize(y),norm.normalize(t))

	def find_name(filename):
	path, name = os.path.split(filename)
	root, ext = os.path.splitext(name)
	i = 0
	while os.path.exists(filename):
	i += 1
	filename = os.path.join(path, str(i) + '_' + root + ext)
	return filename

	# # Random Forest with uncertainties
	# In the script, a random forest is trained from a training data set, the MSE is computed from the training and test datasets, and the prediction is made for each tree seperately, in order to later compute the mean and the variance of the prediction. This script can only be run if all training data fits into memory. After prediction, the residuals based on the out-of-bag error are computed, and a new forest is trained on the residuals. The variance of the prediction represents the model uncertainty, while the predicted residuals estimate the data uncertainty. [See FG poster]

	# ## Setup
	# ### Inputs

	data_path = sys.argv[1]
	# data_path='/Users/alinawalch/Documents/EPFL/data/meteo'
	dsname = sys.argv[2]
	queryname = sys.argv[3]

	n_est = int(sys.argv[4])
	tree_depth = 100
	save_model = False


	# ### Load dataset and training, validation, testing and query matrices

	my_ds = Dataset(data_path, dsname, queryname)

	# find number of features and number of targets
	nf = my_ds.feature_norm.m
	nt = my_ds.target_norm.m

	# Merge training and validation data for crossvalidation training data and rescale (matrices give normalised data)
	x = util.get_matrix(my_ds.train_X0)
	t = util.get_matrix(my_ds.train_T0)
	test_x = util.get_matrix(my_ds.test_X0)
	test_t = util.get_matrix(my_ds.test_T0)
	query_x = util.get_matrix(my_ds.query_X0)
	if nt == 1:
	t=t.reshape((-1,))
	test_t=test_t.reshape((-1,))

	# ## Random Forest

	# Add model to dataset folder structure
	modelname = 'RF' + str(n_est)
	my_ds.add_model(modelname, queryname)

	forest = RandomForestRegressor(n_estimators = n_est, max_depth = tree_depth, oob_score = True, n_jobs = -1)

	# ### Forest fitting

	timer_0 = util.Timer()
	forest.fit(x, t)
	timer_0.stop()

	logfile = find_name(os.path.join(my_ds.model_path,('log_train.csv')))
	with open(logfile, 'w') as csvfile:
	w = csv.writer(csvfile, delimiter=',')
	w.writerow(['', 'walltime', 'cputime'])
	w.writerow(['model', timer_0.walltime, timer_0.cputime])

	if save_model:
	joblib.dump(tree, os.path.join(my_ds.model_path, 'model.pkl'))

	# ### Model uncertainty

	# Declare matrices for sums and sum-of-squares (do not save all n_est outputs)
	N = t.shape[0]
	N_q = query_x.shape[0]
	if nt == 1:
	sigma = np.zeros(N)
	sigma_sq = np.zeros(N)

	sigma_q = np.zeros(N_q)
	sigma_sq_q = np.zeros(N_q)
	else:
	sigma = np.zeros((N,nt))
	sigma_sq = np.zeros((N,nt))

	sigma_q = np.zeros((N_q,nt))
	sigma_sq_q = np.zeros((N_q,nt))
	k = 10.0

	# Predict for each tree seperately and update the sums and sum-of-squares
	for tree in tqdm(forest.estimators_):
	y = tree.predict(x)
	sigma += y/k
	sigma_sq += (y/k)**2

	y_q = tree.predict(query_x)
	sigma_q += y_q/k
	sigma_sq_q += (y_q/k)**2


	# #### Compute output (mean) and model uncertainty (var) for training data and for query data

	mean = sigma * k/n_est
	var = k*2/n_est(sigma_sq-sigma**2/n_est)
	del sigma
	del sigma_sq

	mean_q = sigma_q * k/n_est
	var_q = k*2/n_est(sigma_sq_q-sigma_q**2/n_est)
	del sigma_q
	del sigma_sq_q

	# #### Obtain out-of-bag and testing outputs and compare mean squared errors (normalized, for comparability with ELM)

	oob = forest.oob_prediction_
	y_te = forest.predict(test_x)

	print('Normalised MSE (testing): %f' %mse_norm(y_te,test_t,my_ds.target_norm))
	print('Normalised MSE (out-of-bag): %f' %mse_norm(oob,t,my_ds.target_norm))
	print('Normalised MSE (training): %f' %mse_norm(mean,t,my_ds.target_norm))

	logpred = find_name(os.path.join(my_ds.model_path,('log_pred_mse.csv')))
	with open(logpred, 'w') as csvfile:
	w = csv.writer(csvfile, delimiter=',')
	w.writerow(['', 'mse'])
	w.writerow(['testing', mse_norm(y_te,test_t,my_ds.target_norm)])
	w.writerow(['out-of-bag', mse_norm(oob,t,my_ds.target_norm)])
	w.writerow(['training', mse_norm(mean,t,my_ds.target_norm)])

	# ## Residuals

	# Compute residuals
	res_oob = (oob-t)**2 - var
	res = np.maximum(res_oob, 0)
	del res_oob


	# ### Fit residual forest

	# #### Split training data in training (80%) and testing (20%)

	ind = np.random.permutation(N)
	ind_te = int(0.2*N)

	# testing set:
	x_te = x[ind[:ind_te],:]
	res_te = res[ind[:ind_te]]

	# training set:
	x = x[ind[ind_te:],:]
	res = res[ind[ind_te:]]


	# #### Fit forest

	residuals = RandomForestRegressor(n_estimators = n_est, max_depth = 100, oob_score = False, n_jobs = -1)

	timer_0 = util.Timer()
	residuals.fit(x, res)
	timer_0.stop()

	with open(logfile, 'w') as csvfile:
	w = csv.writer(csvfile, delimiter=',')
	w.writerow(['residuals', timer_0.walltime, timer_0.cputime])

	# #### Check training and testing MSE (not normalized!)

	var_data_te = residuals.predict(x_te)
	var_data_tr = residuals.predict(x)
	print('MSE of residuals (testing): %f' %mse(var_data_te, res_te))
	print('MSE of residuals (training): %f' %mse(var_data_tr, res))

	with open(logpred, 'w') as csvfile:
	w = csv.writer(csvfile, delimiter=',')
	w.writerow(['', 'mse'])
	w.writerow(['residual_te', mse(var_data_te, res_te)])
	w.writerow(['residual_tr', mse(var_data_te, res_te)])

	# ### Predict data uncertainty

	var_data_q = residuals.predict(query_x)

	del residuals


	# ## Evaluation

	var_tot = var_data_q + var_q
	var_tot_perc = abs(var_tot/mean_q)


	# ### Make xarray datasets

	pred = my_ds.make_xarray(query_x, mean_q, rescale = False)
	model_unc = my_ds.make_xarray(query_x, np.sqrt(var_q), rescale = False)
	data_unc = my_ds.make_xarray(query_x, np.sqrt(var_data_q), rescale = False)
	total_unc = my_ds.make_xarray(query_x, np.sqrt(var_tot), rescale = False)

	pred.to_netcdf(os.path.join(my_ds.query_path_out, 'prediction.nc'))
	model_unc.to_netcdf(os.path.join(my_ds.query_path_out, 'model_unc.nc'))
	data_unc.to_netcdf(os.path.join(my_ds.query_path_out, 'data_unc.nc'))
	total_unc.to_netcdf(os.path.join(my_ds.query_path_out, 'total_unc.nc'))

RF_uncertainties.pyNo OneTemporaryActions

File Metadata

RF_uncertainties.pyView Options

Event Timeline

RF_uncertainties.py
No OneTemporary
Actions

RF_uncertainties.py
View Options