ELM_uncertainties_train.py
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Wed, Apr 30, 10:43

ELM_uncertainties_train.py
View Options


	# coding: utf-8

	import numpy as np
	import pandas as pd
	import xarray as xr
	import os
	import hpelm
	import util
	import h5py
	import sys
	from tqdm import tqdm
	from ds import Dataset
	from ELM_ensemble import HPELM_Ensemble
	from sklearn.metrics import mean_squared_error as mse
	from tables import open_file, Atom, Filters

	# # Extreme Learning Machine Ensemble with uncertainties
	# The script makes use of the class ELM_ensemble, which is designed in analogy to the scikit-learn structure.
	# An ensemble of extreme learning machines is fitted with the training data (and the validation data if 'V' mode is selected). While training, the out-of-bag samples are stored for later reconstruction of the out-of-bag prediction. After fitting, the model is used to predict on the training data and on the query data (storing the outputs of each run). After prediction, the residuals based on the out-of-bag data are computed, and a new ELM ensemble is trained on the residuals. The variance of the prediction represents the model uncertainty, while the predicted residuals estimate the data uncertainty. [See FG poster]

	# _Notes_:
	# - Should the validation set be different (random) for each ELM?
	# - We should find a better method to implement the bootstrap - this method does not work for BIG DATA
	# - Computation of residuals is based on above assumption and requires all data to be loaded into memory
	# - Parallelising across the different ELMs (embedded in ELM_ensemble.py) should bring significant improvements to expected performance
	# - Find out how HPELM implements the parallelisation of the hdf5_tools - so that we can use the same approach for the functions in norms.py
	# - We need a solution for handling very large query datasets!

	# ## Setup
	############################ Inputs ############################

	# data_path = os.path.abspath("/mnt/sda1/hyenergy/data/meteo")
	data_path = sys.argv[1]
	dsname = sys.argv[2]

	n_est = int(sys.argv[4])
	n_nodes = int(sys.argv[5])
	accel = str(sys.argv[6])

	#################################################################

	# ### Load dataset and training matrix (for bootstrapping), validation and testing (for direct error computation)

	my_ds = Dataset(data_path, dsname)

	nf = my_ds.feature_norm.m
	nt = my_ds.target_norm.m

	my_ds.get_matrices(['train', 'test', 'val'])

	print('Loaded dataset %s' %dsname)

	if accel is None: accel = 'CPU'
	modelname = 'ELM' + str(n_est) + '_' + str(n_nodes) + '_' + accel
	my_ds.add_model(modelname)
	modelpath = os.path.join(my_ds.model_path,'model')

	ens = HPELM_Ensemble(path = modelpath, n_estimators = n_est, n_nodes = n_nodes, n_features = nf, n_targets = nt, oob = True, accelerator = accel)

	# set up logging files (log summary)
	logfile_mse = util.find_name(os.path.join(my_ds.model_path,'log_pred_mse.csv'))
	util.write_row_to_csv(logfile_mse, [ '', 'mse' ], initialize = True)

	logfile_time = util.find_name(os.path.join(my_ds.model_path,'log_time.csv'))
	util.write_row_to_csv(logfile_time, ['', 'walltime', 'cputime'], initialize = True)

	# ### Model fitting
	# If model has been fitted before and to save computational time, uncomment ens.load() and comment out the fitting.

	# Fitting training data, use validation for model selection
	ens.fit(my_ds.train_X, my_ds.train_T, 'V', my_ds.val_X, my_ds.val_T)
	# ens.load()
	train_time = np.sum( np.asarray(ens.train_times_), axis = 0 )

	# write to log
	util.write_row_to_csv( logfile_time, [ 'model_train', train_time[0], train_time[1] ] )

	# ### Model uncertainty
	# #### Predictions on training, testing and validation

	# pass the target into the prediction function in order to obtain the output matrix (for error computation)
	ens.predict(my_ds.val_X, t = my_ds.val_t, label = 'val')
	# the error (and its evolution with ensemble size) is a variable of the ensemble class:
	val_mse = ens.mse_[-1]
	val_time = np.sum( np.asarray(ens.train_times_), axis = 0 )

	# write to log
	util.write_row_to_csv( logfile_mse, [ 'validation', val_mse ])
	util.write_row_to_csv( logfile_time, [ 'val_pred' , val_time[0], val_time[1] ])

	# do the same thing for the test dataset:
	ens.predict(my_ds.test_X, t = my_ds.test_t, label = 'te')
	test_mse = ens.mse_[-1]
	test_time = np.sum( np.asarray(ens.train_times_), axis = 0 )

	# write to log
	util.write_row_to_csv( logfile_mse, [ 'testing' , test_mse ])
	util.write_row_to_csv( logfile_time, [ 'test_pred', test_time[0], test_time[1] ])

	# prediction on training data:
	ens.predict(my_ds.train_X, my_ds.train_Y, label = 'tr')
	pred_time_train = np.sum( np.asarray(ens.train_times_), axis = 0 )

	# write to log
	util.write_row_to_csv( logfile_time, ['train_pred', pred_time_train[0], pred_time_train[1] ])

	# ## Residuals

	# obtain the rescaled out-of-bag prediction, model variance and total model mean based on the predicted training data
	oob, var, mean = ens.oob_prediction(my_ds.train_Y, norm = my_ds.target_norm)

	t0 = util.get_matrix(my_ds.train_T)

	res_oob = (oob-t0)**2 - var
	res = np.maximum(res_oob, 0)

	# compute mean squared errors for oob and training data
	oob_mse = mse( my_ds.train_t, my_ds.target_norm.normalize( oob ))
	train_mse = mse( my_ds.train_t, my_ds.target_norm.normalize( mean ))

	# write to log
	util.write_row_to_csv( logfile_mse, [ 'out-of-bag', oob_mse ])
	util.write_row_to_csv( logfile_mse, [ 'training' , train_mse ])

	del res_oob, oob, var, mean, t0, ens

	# ### Fit residual model
	# #### Prepare residual files

	# Split data into training, validation and testing before writing to files in order to obtain test dataset to check model quality:

	N = res.shape[0]
	ind = np.random.permutation(N)
	# keep 10% for testing, and 10% of the remaining data for validation
	ind_tr = int( 0.81 * N )
	ind_val = int( 0.9 * N )

	# In order to be able to train the residual model, we need to write the residuals to a file and compute their normalisation - this is the core functionality of the Table_Writer class. The class writes the file and computes mean, norm, max and min, and returns a pandas DataFrame with it, that is then stored in memory:

	res_table = util.Table_Writer(my_ds.train_path_out, my_ds.target_norm.names, 'residuals')
	res_table.open_hdf5(k = 10.0)
	res_table.write_hdf5(res[:ind_tr], table = False)
	res_norm = res_table.close_hdf5()
	res_norm.to_csv(os.path.join(my_ds.model_path, "norm_residuals.csv"))


	# The norm computed above can be loaded into a Normalizer object, which contains the functions normalize and rescale that are useful when working with the data. Below, the residual norm is loaded, and the normalized residuals for training and validation as well as the targets for training, validation and testing are saved in the file given by the path res_T_XX. The targets for testing are never needed from memory and do hence not need to be stored.

	res_T_tr = os.path.join(my_ds.train_path_out, 'train_residuals_t.hdf5')
	res_T_val = os.path.join(my_ds.train_path_out, 'val_residuals_t.hdf5')

	res_norm = util.Normalizer(os.path.join(my_ds.model_path, "norm_residuals.csv"))
	res_norm.set_status('mean')
	res = res_norm.normalize(res)

	res_tr = res[ : ind_tr ]
	res_val = res[ ind_tr : ind_val ]
	res_te = res[ ind_val : ]

	util.make_hdf5(res_tr, res_T_tr)
	util.make_hdf5(res_val, res_T_val)

	# Now the training dataset has changed, so the features must also be rewritten:

	res_X_tr = os.path.join( my_ds.train_path_out, 'train_residuals_f.hdf5' )
	res_X_val = os.path.join( my_ds.train_path_out, 'val_residuals_f.hdf5' )
	res_X_te = os.path.join( my_ds.test_path_out, 'test_residuals_f.hdf5' )
	util.make_hdf5( my_ds.train_x[ :ind_tr ], res_X_tr )
	util.make_hdf5( my_ds.train_x[ ind_tr:ind_val ], res_X_val )
	util.make_hdf5( my_ds.train_x[ ind_val: ], res_X_te )

	# #### Declare and fit model

	residualpath = os.path.join(my_ds.model_path,'residuals')
	res_ens = HPELM_Ensemble(path = residualpath, n_estimators = n_est, n_nodes = n_nodes, n_features = nf, n_targets = nt, oob = False, accelerator = accel)

	res_ens.fit(res_X_tr, res_T_tr, 'V', res_X_val, res_T_val)
	res_train_time = np.sum( np.asarray(res_ens.train_times_), axis = 0 )

	# write to log
	util.write_row_to_csv( logfile_time, [ 'residuals_train', res_train_time[0], res_train_time[1] ] )

	# ### Predict on residual model

	# #### Compute errors

	# pass the target into the prediction function in order to obtain the output matrix (for error computation)
	res_ens.predict(res_X_val, t = res_val, label = 'val')
	# the error (and its evolution with ensemble size) is a variable of the ensemble class:
	res_val_mse = res_ens.mse_[-1]
	res_val_time = np.sum( np.asarray( res_ens.prediction_times_ ), axis = 0 )

	# write to log
	util.write_row_to_csv( logfile_mse, [ 'residual_validation' , res_val_mse ])
	util.write_row_to_csv( logfile_time, [ 'residual_val', res_val_time[0], res_val_time[1] ])

	# do the same thing for the test dataset:
	res_ens.predict(res_X_te, t = res_te, label = 'te')
	res_test_mse = res_ens.mse_[-1]
	res_test_time = np.sum( np.asarray( res_ens.prediction_times_ ), axis = 0 )

	# write to log
	util.write_row_to_csv( logfile_mse, [ 'residual_testing' , res_test_mse ])
	util.write_row_to_csv( logfile_time, [ 'residual_te', res_test_time[0], res_test_time[1] ])

	del res, res_ens

ELM_uncertainties_train.pyNo OneTemporaryActions

File Metadata

ELM_uncertainties_train.pyView Options

Event Timeline

ELM_uncertainties_train.py
No OneTemporary
Actions

ELM_uncertainties_train.py
View Options