Page MenuHomec4science

ELM_uncertainties_train.py
No OneTemporary

File Metadata

Created
Wed, Apr 30, 10:43

ELM_uncertainties_train.py

# coding: utf-8
import numpy as np
import pandas as pd
import xarray as xr
import os
import hpelm
import util
import h5py
import sys
from tqdm import tqdm
from ds import Dataset
from ELM_ensemble import HPELM_Ensemble
from sklearn.metrics import mean_squared_error as mse
from tables import open_file, Atom, Filters
# # Extreme Learning Machine Ensemble with uncertainties
# The script makes use of the class **ELM_ensemble**, which is designed in analogy to the scikit-learn structure.
# An ensemble of extreme learning machines is fitted with the training data (and the validation data if 'V' mode is selected). While training, the out-of-bag samples are stored for later reconstruction of the out-of-bag prediction. After fitting, the model is used to predict on the training data and on the query data (storing the outputs of each run). After prediction, the residuals based on the out-of-bag data are computed, and a new ELM ensemble is trained on the residuals. The variance of the prediction represents the model uncertainty, while the predicted residuals estimate the data uncertainty. [See FG poster]
# _**Notes**_:
# - **Should the validation set be different (random) for each ELM?**
# - We should find a better method to implement the bootstrap - this method does not work for **BIG DATA**
# - Computation of residuals is based on above assumption and requires all data to be loaded into memory
# - Parallelising across the different ELMs (embedded in *ELM_ensemble.py*) should bring significant improvements to expected performance
# - Find out how HPELM implements the parallelisation of the *hdf5_tools* - so that we can use the same approach for the functions in *norms.py*
# - We need a solution for handling very large query datasets!
# ## Setup
############################ Inputs ############################
# data_path = os.path.abspath("/mnt/sda1/hyenergy/data/meteo")
data_path = sys.argv[1]
dsname = sys.argv[2]
n_est = int(sys.argv[4])
n_nodes = int(sys.argv[5])
accel = str(sys.argv[6])
#################################################################
# ### Load dataset and training matrix (for bootstrapping), validation and testing (for direct error computation)
my_ds = Dataset(data_path, dsname)
nf = my_ds.feature_norm.m
nt = my_ds.target_norm.m
my_ds.get_matrices(['train', 'test', 'val'])
print('Loaded dataset %s' %dsname)
if accel is None: accel = 'CPU'
modelname = 'ELM' + str(n_est) + '_' + str(n_nodes) + '_' + accel
my_ds.add_model(modelname)
modelpath = os.path.join(my_ds.model_path,'model')
ens = HPELM_Ensemble(path = modelpath, n_estimators = n_est, n_nodes = n_nodes, n_features = nf, n_targets = nt, oob = True, accelerator = accel)
# set up logging files (log summary)
logfile_mse = util.find_name(os.path.join(my_ds.model_path,'log_pred_mse.csv'))
util.write_row_to_csv(logfile_mse, [ '', 'mse' ], initialize = True)
logfile_time = util.find_name(os.path.join(my_ds.model_path,'log_time.csv'))
util.write_row_to_csv(logfile_time, ['', 'walltime', 'cputime'], initialize = True)
# ### Model fitting
# If model has been fitted before and to save computational time, uncomment *ens.load()* and comment out the fitting.
# Fitting training data, use validation for model selection
ens.fit(my_ds.train_X, my_ds.train_T, 'V', my_ds.val_X, my_ds.val_T)
# ens.load()
train_time = np.sum( np.asarray(ens.train_times_), axis = 0 )
# write to log
util.write_row_to_csv( logfile_time, [ 'model_train', train_time[0], train_time[1] ] )
# ### Model uncertainty
# #### Predictions on training, testing and validation
# pass the target into the prediction function in order to obtain the output matrix (for error computation)
ens.predict(my_ds.val_X, t = my_ds.val_t, label = 'val')
# the error (and its evolution with ensemble size) is a variable of the ensemble class:
val_mse = ens.mse_[-1]
val_time = np.sum( np.asarray(ens.train_times_), axis = 0 )
# write to log
util.write_row_to_csv( logfile_mse, [ 'validation', val_mse ])
util.write_row_to_csv( logfile_time, [ 'val_pred' , val_time[0], val_time[1] ])
# do the same thing for the test dataset:
ens.predict(my_ds.test_X, t = my_ds.test_t, label = 'te')
test_mse = ens.mse_[-1]
test_time = np.sum( np.asarray(ens.train_times_), axis = 0 )
# write to log
util.write_row_to_csv( logfile_mse, [ 'testing' , test_mse ])
util.write_row_to_csv( logfile_time, [ 'test_pred', test_time[0], test_time[1] ])
# prediction on training data:
ens.predict(my_ds.train_X, my_ds.train_Y, label = 'tr')
pred_time_train = np.sum( np.asarray(ens.train_times_), axis = 0 )
# write to log
util.write_row_to_csv( logfile_time, ['train_pred', pred_time_train[0], pred_time_train[1] ])
# ## Residuals
# obtain the rescaled out-of-bag prediction, model variance and total model mean based on the predicted training data
oob, var, mean = ens.oob_prediction(my_ds.train_Y, norm = my_ds.target_norm)
t0 = util.get_matrix(my_ds.train_T)
res_oob = (oob-t0)**2 - var
res = np.maximum(res_oob, 0)
# compute mean squared errors for oob and training data
oob_mse = mse( my_ds.train_t, my_ds.target_norm.normalize( oob ))
train_mse = mse( my_ds.train_t, my_ds.target_norm.normalize( mean ))
# write to log
util.write_row_to_csv( logfile_mse, [ 'out-of-bag', oob_mse ])
util.write_row_to_csv( logfile_mse, [ 'training' , train_mse ])
del res_oob, oob, var, mean, t0, ens
# ### Fit residual model
# #### Prepare residual files
# Split data into training, validation and testing before writing to files in order to obtain test dataset to check model quality:
N = res.shape[0]
ind = np.random.permutation(N)
# keep 10% for testing, and 10% of the remaining data for validation
ind_tr = int( 0.81 * N )
ind_val = int( 0.9 * N )
# In order to be able to train the residual model, we need to write the residuals to a file and compute their normalisation - this is the core functionality of the **Table_Writer** class. The class writes the file and computes mean, norm, max and min, and returns a pandas DataFrame with it, that is then stored in memory:
res_table = util.Table_Writer(my_ds.train_path_out, my_ds.target_norm.names, 'residuals')
res_table.open_hdf5(k = 10.0)
res_table.write_hdf5(res[:ind_tr], table = False)
res_norm = res_table.close_hdf5()
res_norm.to_csv(os.path.join(my_ds.model_path, "norm_residuals.csv"))
# The norm computed above can be loaded into a **Normalizer** object, which contains the functions *normalize* and *rescale* that are useful when working with the data. Below, the residual norm is loaded, and the normalized residuals for training and validation as well as the targets for training, validation and testing are saved in the file given by the path *res_T_XX*. The targets for testing are never needed from memory and do hence not need to be stored.
res_T_tr = os.path.join(my_ds.train_path_out, 'train_residuals_t.hdf5')
res_T_val = os.path.join(my_ds.train_path_out, 'val_residuals_t.hdf5')
res_norm = util.Normalizer(os.path.join(my_ds.model_path, "norm_residuals.csv"))
res_norm.set_status('mean')
res = res_norm.normalize(res)
res_tr = res[ : ind_tr ]
res_val = res[ ind_tr : ind_val ]
res_te = res[ ind_val : ]
util.make_hdf5(res_tr, res_T_tr)
util.make_hdf5(res_val, res_T_val)
# Now the training dataset has changed, so the features must also be rewritten:
res_X_tr = os.path.join( my_ds.train_path_out, 'train_residuals_f.hdf5' )
res_X_val = os.path.join( my_ds.train_path_out, 'val_residuals_f.hdf5' )
res_X_te = os.path.join( my_ds.test_path_out, 'test_residuals_f.hdf5' )
util.make_hdf5( my_ds.train_x[ :ind_tr ], res_X_tr )
util.make_hdf5( my_ds.train_x[ ind_tr:ind_val ], res_X_val )
util.make_hdf5( my_ds.train_x[ ind_val: ], res_X_te )
# #### Declare and fit model
residualpath = os.path.join(my_ds.model_path,'residuals')
res_ens = HPELM_Ensemble(path = residualpath, n_estimators = n_est, n_nodes = n_nodes, n_features = nf, n_targets = nt, oob = False, accelerator = accel)
res_ens.fit(res_X_tr, res_T_tr, 'V', res_X_val, res_T_val)
res_train_time = np.sum( np.asarray(res_ens.train_times_), axis = 0 )
# write to log
util.write_row_to_csv( logfile_time, [ 'residuals_train', res_train_time[0], res_train_time[1] ] )
# ### Predict on residual model
# #### Compute errors
# pass the target into the prediction function in order to obtain the output matrix (for error computation)
res_ens.predict(res_X_val, t = res_val, label = 'val')
# the error (and its evolution with ensemble size) is a variable of the ensemble class:
res_val_mse = res_ens.mse_[-1]
res_val_time = np.sum( np.asarray( res_ens.prediction_times_ ), axis = 0 )
# write to log
util.write_row_to_csv( logfile_mse, [ 'residual_validation' , res_val_mse ])
util.write_row_to_csv( logfile_time, [ 'residual_val', res_val_time[0], res_val_time[1] ])
# do the same thing for the test dataset:
res_ens.predict(res_X_te, t = res_te, label = 'te')
res_test_mse = res_ens.mse_[-1]
res_test_time = np.sum( np.asarray( res_ens.prediction_times_ ), axis = 0 )
# write to log
util.write_row_to_csv( logfile_mse, [ 'residual_testing' , res_test_mse ])
util.write_row_to_csv( logfile_time, [ 'residual_te', res_test_time[0], res_test_time[1] ])
del res, res_ens

Event Timeline