# coding: utf-8
import numpy as np
import pandas as pd
import xarray as xr
import os
import hpelm
import util
import h5py
import sys
from tqdm import tqdm
from ds import Dataset
from ELM_ensemble import HPELM_Ensemble
from sklearn.metrics import mean_squared_error as mse
from tables import open_file, Atom, Filters
# # Extreme Learning Machine Ensemble with uncertainties
# The script makes use of the class **HPELM_Ensemble** (defined in *ELM_ensemble.py*), which is designed in analogy to the scikit-learn estimator structure.
# An ensemble of extreme learning machines is fitted on the training data (and on the validation data if 'V' mode is selected). During training, the out-of-bag samples of each estimator are stored so that the out-of-bag prediction can be reconstructed later. After fitting, the model predicts on the training data and on the query data, storing the outputs of each run. The residuals of the out-of-bag prediction are then computed, and a second ELM ensemble is trained on these residuals. The variance of the ensemble prediction represents the model uncertainty, while the predicted residuals estimate the data uncertainty (a toy sketch of this decomposition is given after the notes below). [See FG poster]
# _**Notes**_:
# - **Should the validation set be different (random) for each ELM?**
# - We should find a better method to implement the bootstrap - this method does not work for **BIG DATA**
# - Computation of residuals is based on above assumption and requires all data to be loaded into memory
# - Parallelising across the different ELMs (embedded in *ELM_ensemble.py*) should significantly improve performance
# - Find out how HPELM implements the parallelisation of the *hdf5_tools* - so that we can use the same approach for the functions in *norms.py*
# - We need a solution for handling very large query datasets!
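# The toy sketch below illustrates the out-of-bag bookkeeping and the variance decomposition
# on synthetic data. It is an illustration only and does not use the HPELM_Ensemble API;
# a simple polynomial fit stands in for a single ELM.
def _toy_oob_uncertainty_sketch(n_samples = 200, n_estimators = 10, seed = 0):
    rng = np.random.RandomState(seed)
    x = rng.uniform(-1, 1, n_samples)
    t = np.sin(3 * x) + rng.normal(0, 0.1, n_samples)          # noisy target
    preds = np.full((n_estimators, n_samples), np.nan)
    for e in range(n_estimators):
        boot = rng.randint(0, n_samples, n_samples)             # bootstrap sample
        oob = ~np.isin(np.arange(n_samples), boot)              # out-of-bag rows of this estimator
        coef = np.polyfit(x[boot], t[boot], deg = 5)            # stand-in for one ELM
        preds[e, oob] = np.polyval(coef, x[oob])                # keep only the out-of-bag predictions
    oob_pred = np.nanmean(preds, axis = 0)                      # reconstructed out-of-bag prediction
    var_model = np.nanvar(preds, axis = 0)                      # model uncertainty (ensemble spread)
    var_data = np.maximum((oob_pred - t)**2 - var_model, 0)     # data uncertainty (residual target)
    return np.sqrt(var_model + var_data)                        # total predictive standard deviation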
# ## Setup
############################ Inputs ############################
# data_path = os.path.abspath("/mnt/sda1/hyenergy/data/meteo")
data_path = sys.argv[1]
dsname = sys.argv[2]
queryname = sys.argv[3]
n_est = int(sys.argv[4])
n_nodes = int(sys.argv[5])
accel = sys.argv[6] if len(sys.argv) > 6 else None  # optional accelerator flag ('CPU' if not given)
#################################################################
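# Example invocation (all paths and names below are hypothetical):
#   python ELM_uncertainties.py /path/to/meteo my_dataset my_query 20 100 GPU
# i.e. data path, dataset name, query name, number of estimators, number of hidden
# nodes per ELM, and an optional accelerator flag (defaults to 'CPU' if omitted).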
# ### Load the dataset and the training matrices (for bootstrapping) as well as the validation and test matrices (for direct error computation)
my_ds = Dataset(data_path, dsname, queryname)
nf = my_ds.feature_norm.m
nt = my_ds.target_norm.m
my_ds.get_matrices(['train', 'test', 'val'])
if accel is None: accel = 'CPU'
modelname = 'ELM' + str(n_est) + '_' + str(n_nodes) + '_' + accel
my_ds.add_model(modelname, queryname)
modelpath = os.path.join(my_ds.model_path,'model')
ens = HPELM_Ensemble(path = modelpath, n_estimators = n_est, n_nodes = n_nodes, n_features = nf, n_targets = nt, oob = True, accelerator = accel)
# ### Model fitting
# If the model has been fitted before, uncomment *ens.load()* and comment out the fitting call below to save computational time.
# Fit on the training data, using the validation set for model selection ('V' mode)
ens.fit(my_ds.train_X, my_ds.train_T, 'V', my_ds.val_X, my_ds.val_T)
# ens.load()
train_time = ens.train_times_
# ### Model uncertainty
# #### Predictions on training, testing and validation
# prediction on training data:
ens.predict(my_ds.train_X, my_ds.train_Y, label = 'tr')
pred_time = ens.prediction_times_
# pass the target into the prediction function in order to obtain the output matrix (for error computation)
_, _ = ens.predict(my_ds.val_X, t = my_ds.val_t, label = 'val')
# the error (and its evolution with ensemble size) is a variable of the ensemble class:
val_mse = ens.mse_
val_time = ens.prediction_times_
# do the same thing for the test dataset:
_, _ = ens.predict(my_ds.test_X, t = my_ds.test_t, label = 'te')
test_mse = ens.mse_
test_time = ens.prediction_times_
# #### Prediction on query dataset
# predict on query set:
ens.predict(my_ds.query_X, my_ds.query_Y, label = 'query' , eval = True, norm = my_ds.target_norm)
query_time = ens.prediction_times_
# ## Residuals
# obtain the rescaled out-of-bag prediction, model variance and total model mean based on the predicted training data
oob, var, mean = ens.oob_prediction(my_ds.train_Y, norm = my_ds.target_norm)
t0 = util.get_matrix(my_ds.train_T)
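# Reasoning for the residual target below: for an out-of-bag prediction y_oob of a target t,
# E[(y_oob - t)^2] is approximately the model variance plus the data noise variance.
# Subtracting the ensemble variance therefore leaves an estimate of the data noise,
# which is clipped at zero because a variance cannot be negative.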
res_oob = (oob-t0)**2 - var
res = np.maximum(res_oob, 0)
del res_oob, oob, var, mean, t0, ens
# ### Fit residual model
# #### Prepare residual files
# Split the data into training, validation and test sets before writing to files, in order to obtain a test set for checking the quality of the residual model:
N = res.shape[0]
# shuffle residuals and features consistently so that the train/validation/test split below is random
ind = np.random.permutation(N)
res = res[ind]
train_x = np.asarray(my_ds.train_x)[ind]
# keep 10% for testing, and 10% of the remaining data for validation (81% / 9% / 10% split)
ind_tr = int(0.81*N)
ind_val = int(0.9*N)
# In order to train the residual model, the residuals must be written to a file and their normalisation computed - this is the core functionality of the **Table_Writer** class. The class writes the file, computes the mean, norm, max and min, and returns these statistics as a pandas DataFrame, which is kept in memory and saved to a CSV file below:
res_table = util.Table_Writer(my_ds.train_path_out, my_ds.target_norm.names, 'residuals')
res_table.open_hdf5(k = 10.0)
res_table.write_hdf5(res[:ind_tr], table = False)
res_norm = res_table.close_hdf5()
res_norm.to_csv(os.path.join(my_ds.model_path, "norm_residuals.csv"))
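# For illustration only (this is not the Table_Writer API): the statistics saved above are
# roughly the per-column summaries one would compute directly from the training residuals:
_res_stats_sketch = pd.DataFrame({'mean': res[:ind_tr].mean(axis = 0),
                                  'min': res[:ind_tr].min(axis = 0),
                                  'max': res[:ind_tr].max(axis = 0)},
                                 index = my_ds.target_norm.names)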
# The norm computed above can be loaded into a **Normalizer** object, which provides the *normalize* and *rescale* functions used when working with the data. Below, the residual norm is loaded and the residuals (which act as the targets of the residual model) are normalised and split; the training and validation portions are written to the files *res_T_tr* and *res_T_val*, while the test portion is only used directly from memory and hence does not need to be stored.
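# A minimal sketch of such a mean-based normaliser, for illustration only - the assumption
# that status 'mean' means dividing by the stored column means may not match the actual
# util.Normalizer implementation:
class _MeanNormalizerSketch(object):
    """Toy stand-in for a Normalizer with status 'mean' (illustration only)."""
    def __init__(self, mean):
        self.mean = np.asarray(mean, dtype = float)
    def normalize(self, x):
        return x / self.mean        # scale each column by its stored mean
    def rescale(self, x_norm):
        return x_norm * self.mean   # invert the normalisation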
res_T_tr = os.path.join(my_ds.train_path_out, 'train_residuals_t.hdf5')
res_T_val = os.path.join(my_ds.train_path_out, 'val_residuals_t.hdf5')
res_norm = util.Normalizer(os.path.join(my_ds.model_path, "norm_residuals.csv"))
res_norm.set_status('mean')
res = res_norm.normalize(res)
res_tr = res[:ind_tr]
res_val = res[ind_tr:ind_val]
res_te = res[ind_val:]
util.make_hdf5(res_tr, res_T_tr)
util.make_hdf5(res_val, res_T_val)
# Now the training dataset has changed, so the features must also be rewritten:
res_X_tr = os.path.join(my_ds.train_path_out, 'train_residuals_f.hdf5')
res_X_val = os.path.join(my_ds.train_path_out, 'val_residuals_f.hdf5')
res_X_te = os.path.join(my_ds.test_path_out, 'test_residuals_f.hdf5')
util.make_hdf5(train_x[:ind_tr], res_X_tr)
util.make_hdf5(train_x[ind_tr:ind_val], res_X_val)
util.make_hdf5(train_x[ind_val:], res_X_te)
# #### Declare and fit model
residualpath = os.path.join(my_ds.model_path,'residuals')
res_ens = HPELM_Ensemble(path = residualpath, n_estimators = n_est, n_nodes = n_nodes, n_features = nf, n_targets = nt, oob = False, accelerator = accel)
res_ens.fit(res_X_tr, res_T_tr, 'V', res_X_val, res_T_val)
# ### Predict on residual model
# #### Compute errors
# pass the target into the prediction function in order to obtain the output matrix (for error computation)
_, _ = res_ens.predict(res_X_val, t = res_val, label = 'val')
# the error (and its evolution with ensemble size) is a variable of the ensemble class:
res_val_mse = res_ens.mse_
res_val_time = res_ens.prediction_times_
# do the same thing for the test dataset:
_, _ = res_ens.predict(res_X_te, t = res_te, label = 'te')
res_test_mse = res_ens.mse_
res_test_time = res_ens.prediction_times_
# #### Predict on query grid
res_query_Y = os.path.join(my_ds.query_path_out, 'residual_prediction.hdf5')
res_ens.predict(my_ds.query_X, res_query_Y, eval = True, norm = res_norm, label = 'query') # eval=True directly computes the ensemble mean and variance of the prediction
res_query_time = res_ens.prediction_times_
query_y_var_f = os.path.splitext(my_ds.query_Y)[0] + '_var.hdf5'
del res, res_ens
# ## Evaluation
try:
    X = util.get_matrix(my_ds.query_X0)
    Y = util.get_matrix(my_ds.query_Y)
    var_model = util.get_matrix(query_y_var_f)
    var_data = util.get_matrix(os.path.join(my_ds.query_path_out, 'residual_prediction.hdf5'))
    # model and data uncertainties are treated as independent, so their variances add
    var_tot = var_model + var_data
    # write the prediction and the uncertainty maps (as standard deviations) to netCDF
    my_ds.make_xarray(X, Y, rescale = False).to_netcdf(os.path.join(my_ds.query_path_out, 'prediction.nc'))
    my_ds.make_xarray(X, np.sqrt(np.maximum(var_model, 0)), rescale = False).to_netcdf(os.path.join(my_ds.query_path_out, 'model_unc.nc'))
    my_ds.make_xarray(X, np.sqrt(np.maximum(var_data, 0)), rescale = False).to_netcdf(os.path.join(my_ds.query_path_out, 'data_unc.nc'))
    my_ds.make_xarray(X, np.sqrt(np.maximum(var_tot, 0)), rescale = False).to_netcdf(os.path.join(my_ds.query_path_out, 'total_unc.nc'))
except Exception as e:
    print('failed to load query data and to create xarray datasets (possibly an out-of-memory issue): %s' % e)
