# coding: utf-8
import numpy as np
import pandas as pd
import xarray as xr
import os
import hpelm
import util
import h5py
import sys
from tqdm import tqdm
from ds import Dataset
from ELM_ensemble import HPELM_Ensemble
from sklearn.metrics import mean_squared_error as mse
from tables import open_file, Atom, Filters
# # Extreme Learning Machine Ensemble with uncertainties
# The script makes use of the class **HPELM_Ensemble** (defined in *ELM_ensemble.py*), which is designed in analogy to the scikit-learn estimator structure.
# An ensemble of extreme learning machines is fitted on the training data (and on the validation data if 'V' mode is selected). During training, the out-of-bag samples of each estimator are stored so that the out-of-bag prediction can be reconstructed later. After fitting, the model predicts on the training data and on the query data, storing the outputs of each run. The residuals of the out-of-bag prediction are then computed, and a second ELM ensemble is trained on these residuals. The variance of the ensemble prediction represents the model uncertainty, while the predicted residuals estimate the data uncertainty (a toy sketch of this decomposition is given after the notes below). [See FG poster]
# _**Notes**_:
# - **Should the validation set be different (random) for each ELM?**
# - We should find a better method to implement the bootstrap - this method does not work for **BIG DATA**
# - Computation of residuals is based on above assumption and requires all data to be loaded into memory
# - Parallelising across the different ELMs (embedded in *ELM_ensemble.py*) should significantly improve performance
# - Find out how HPELM implements the parallelisation of the *hdf5_tools* - so that we can use the same approach for the functions in *norms.py*
# - We need a solution for handling very large query datasets!
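# The toy sketch below illustrates the out-of-bag bookkeeping and the variance decomposition
# on synthetic data. It is an illustration only and does not use the HPELM_Ensemble API;
# a simple polynomial fit stands in for a single ELM.
def _toy_oob_uncertainty_sketch(n_samples = 200, n_estimators = 10, seed = 0):
    rng = np.random.RandomState(seed)
    x = rng.uniform(-1, 1, n_samples)
    t = np.sin(3 * x) + rng.normal(0, 0.1, n_samples)          # noisy target
    preds = np.full((n_estimators, n_samples), np.nan)
    for e in range(n_estimators):
        boot = rng.randint(0, n_samples, n_samples)             # bootstrap sample
        oob = ~np.isin(np.arange(n_samples), boot)              # out-of-bag rows of this estimator
        coef = np.polyfit(x[boot], t[boot], deg = 5)            # stand-in for one ELM
        preds[e, oob] = np.polyval(coef, x[oob])                # keep only the out-of-bag predictions
    oob_pred = np.nanmean(preds, axis = 0)                      # reconstructed out-of-bag prediction
    var_model = np.nanvar(preds, axis = 0)                      # model uncertainty (ensemble spread)
    var_data = np.maximum((oob_pred - t)**2 - var_model, 0)     # data uncertainty (residual target)
    return np.sqrt(var_model + var_data)                        # total predictive standard deviation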
# ## Setup
############################ Inputs ############################
# data_path = os.path.abspath("/mnt/sda1/hyenergy/data/meteo")
data_path = sys.argv[1]
dsname = sys.argv[2]
queryname = sys.argv[3]
n_est = int(sys.argv[4])
n_nodes = int(sys.argv[5])
accel = sys.argv[6] if len(sys.argv) > 6 else None  # optional accelerator flag ('CPU' if not given)
#################################################################
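# Example invocation (all paths and names below are hypothetical):
#   python ELM_uncertainties.py /path/to/meteo my_dataset my_query 20 100 GPU
# i.e. data path, dataset name, query name, number of estimators, number of hidden
# nodes per ELM, and an optional accelerator flag (defaults to 'CPU' if omitted).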
# ### Load the dataset and the training matrices (for bootstrapping) as well as the validation and test matrices (for direct error computation)
my_ds = Dataset(data_path, dsname, queryname)
nf = my_ds.feature_norm.m
nt = my_ds.target_norm.m
my_ds.get_matrices(['train', 'test', 'val'])
if accel is None: accel = 'CPU'
modelname = 'ELM' + str(n_est) + '_' + str(n_nodes) + '_' + accel
my_ds.add_model(modelname, queryname)
modelpath = os.path.join(my_ds.model_path,'model')
ens = HPELM_Ensemble(path = modelpath, n_estimators = n_est, n_nodes = n_nodes, n_features = nf, n_targets = nt, oob = True, accelerator = accel)
# ### Model fitting
# If the model has been fitted before, uncomment *ens.load()* and comment out the fitting call below to save computational time.
# Fit on the training data, using the validation set for model selection ('V' mode)
ens.fit(my_ds.train_X, my_ds.train_T, 'V', my_ds.val_X, my_ds.val_T)
# ens.load()
train_time = ens.train_times_
# ### Model uncertainty
# #### Predictions on training, testing and validation
# prediction on training data:
ens.predict(my_ds.train_X, my_ds.train_Y, label = 'tr')
pred_time = ens.prediction_times_
# pass the target into the prediction function in order to obtain the output matrix (for error computation)
_, _ = ens.predict(my_ds.val_X, t = my_ds.val_t, label = 'val')
# the error (and its evolution with ensemble size) is a variable of the ensemble class:
val_mse = ens.mse_
val_time = ens.prediction_times_
# do the same thing for the test dataset:
_, _ = ens.predict(my_ds.test_X, t = my_ds.test_t, label = 'te')
test_mse = ens.mse_
test_time = ens.prediction_times_
# #### Prediction on query dataset
# predict on query set:
ens.predict(my_ds.query_X, my_ds.query_Y, label = 'query' , eval = True, norm = my_ds.target_norm)
query_time = ens.prediction_times_
# ## Residuals
# obtain the rescaled out-of-bag prediction, model variance and total model mean based on the predicted training data
oob, var, mean = ens.oob_prediction(my_ds.train_Y, norm = my_ds.target_norm)
t0 = util.get_matrix(my_ds.train_T)
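# Reasoning for the residual target below: for an out-of-bag prediction y_oob of a target t,
# E[(y_oob - t)^2] is approximately the model variance plus the data noise variance.
# Subtracting the ensemble variance therefore leaves an estimate of the data noise,
# which is clipped at zero because a variance cannot be negative.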
res_oob = (oob-t0)**2 - var
res = np.maximum(res_oob, 0)
del res_oob, oob, var, mean, t0, ens
# ### Fit residual model
# #### Prepare residual files
# Split the data into training, validation and test sets before writing to files, in order to obtain a test set for checking the quality of the residual model:
N = res.shape[0]
# shuffle residuals and features consistently so that the train/validation/test split below is random
ind = np.random.permutation(N)
res = res[ind]
train_x = np.asarray(my_ds.train_x)[ind]
# keep 10% for testing, and 10% of the remaining data for validation (81% / 9% / 10% split)
ind_tr = int(0.81*N)
ind_val = int(0.9*N)
# In order to train the residual model, the residuals must be written to a file and their normalisation computed - this is the core functionality of the **Table_Writer** class. The class writes the file, computes the mean, norm, max and min, and returns these statistics as a pandas DataFrame, which is kept in memory and saved to a CSV file below:
res_table = util.Table_Writer(my_ds.train_path_out, my_ds.target_norm.names, 'residuals')
res_table.open_hdf5(k = 10.0)
res_table.write_hdf5(res[:ind_tr], table = False)
res_norm = res_table.close_hdf5()
res_norm.to_csv(os.path.join(my_ds.model_path, "norm_residuals.csv"))
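# For illustration only (this is not the Table_Writer API): the statistics saved above are
# roughly the per-column summaries one would compute directly from the training residuals:
_res_stats_sketch = pd.DataFrame({'mean': res[:ind_tr].mean(axis = 0),
                                  'min': res[:ind_tr].min(axis = 0),
                                  'max': res[:ind_tr].max(axis = 0)},
                                 index = my_ds.target_norm.names)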
# The norm computed above can be loaded into a **Normalizer** object, which provides the *normalize* and *rescale* functions used when working with the data. Below, the residual norm is loaded and the residuals (which act as the targets of the residual model) are normalised and split; the training and validation portions are written to the files *res_T_tr* and *res_T_val*, while the test portion is only used directly from memory and hence does not need to be stored.
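# A minimal sketch of such a mean-based normaliser, for illustration only - the assumption
# that status 'mean' means dividing by the stored column means may not match the actual
# util.Normalizer implementation:
class _MeanNormalizerSketch(object):
    """Toy stand-in for a Normalizer with status 'mean' (illustration only)."""
    def __init__(self, mean):
        self.mean = np.asarray(mean, dtype = float)
    def normalize(self, x):
        return x / self.mean        # scale each column by its stored mean
    def rescale(self, x_norm):
        return x_norm * self.mean   # invert the normalisation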
res_T_tr = os.path.join(my_ds.train_path_out, 'train_residuals_t.hdf5')
res_T_val = os.path.join(my_ds.train_path_out, 'val_residuals_t.hdf5')
res_norm = util.Normalizer(os.path.join(my_ds.model_path, "norm_residuals.csv"))
res_norm.set_status('mean')
res = res_norm.normalize(res)
res_tr = res[:ind_tr]
res_val = res[ind_tr:ind_val]
res_te = res[ind_val:]
util.make_hdf5(res_tr, res_T_tr)
util.make_hdf5(res_val, res_T_val)
# Now the training dataset has changed, so the features must also be rewritten:
res_X_tr = os.path.join(my_ds.train_path_out, 'train_residuals_f.hdf5')
res_X_val = os.path.join(my_ds.train_path_out, 'val_residuals_f.hdf5')
res_X_te = os.path.join(my_ds.test_path_out, 'test_residuals_f.hdf5')
util.make_hdf5(train_x[:ind_tr], res_X_tr)
util.make_hdf5(train_x[ind_tr:ind_val], res_X_val)
util.make_hdf5(train_x[ind_val:], res_X_te)
# #### Declare and fit model
residualpath = os.path.join(my_ds.model_path,'residuals')
res_ens = HPELM_Ensemble(path = residualpath, n_estimators = n_est, n_nodes = n_nodes, n_features = nf, n_targets = nt, oob = False, accelerator = accel)
res_ens.fit(res_X_tr, res_T_tr, 'V', res_X_val, res_T_val)
# ### Predict on residual model
# #### Compute errors
# pass the target into the prediction function in order to obtain the output matrix (for error computation)
_, _ = res_ens.predict(res_X_val, t = res_val, label = 'val')
# the error (and its evolution with ensemble size) is a variable of the ensemble class:
res_val_mse = res_ens.mse_
res_val_time = res_ens.prediction_times_
# do the same thing for the test dataset:
_, _ = res_ens.predict(res_X_te, t = res_te, label = 'te')
res_test_mse = res_ens.mse_
res_test_time = res_ens.prediction_times_
# #### Predict on query grid
res_query_Y = os.path.join(my_ds.query_path_out, 'residual_prediction.hdf5')
res_ens.predict(my_ds.query_X, res_query_Y, eval = True, norm = res_norm, label = 'query') # eval=True directly computes the ensemble mean and variance of the prediction
res_query_time = res_ens.prediction_times_
query_y_var_f = os.path.splitext(my_ds.query_Y)[0] + '_var.hdf5'
del res, res_ens
# ## Evaluation
try:
    X = util.get_matrix(my_ds.query_X0)
    Y = util.get_matrix(my_ds.query_Y)
    var_model = util.get_matrix(query_y_var_f)
    var_data = util.get_matrix(os.path.join(my_ds.query_path_out, 'residual_prediction.hdf5'))
    # model and data uncertainties are treated as independent, so their variances add
    var_tot = var_model + var_data
    # write the prediction and the uncertainty maps (as standard deviations) to netCDF
    my_ds.make_xarray(X, Y, rescale = False).to_netcdf(os.path.join(my_ds.query_path_out, 'prediction.nc'))
    my_ds.make_xarray(X, np.sqrt(np.maximum(var_model, 0)), rescale = False).to_netcdf(os.path.join(my_ds.query_path_out, 'model_unc.nc'))
    my_ds.make_xarray(X, np.sqrt(np.maximum(var_data, 0)), rescale = False).to_netcdf(os.path.join(my_ds.query_path_out, 'data_unc.nc'))
    my_ds.make_xarray(X, np.sqrt(np.maximum(var_tot, 0)), rescale = False).to_netcdf(os.path.join(my_ds.query_path_out, 'total_unc.nc'))
except Exception as e:
    print('failed to load query data and to create xarray datasets (possibly an out-of-memory issue): %s' % e)
