ELM_ensemble_variant.py
import os
import csv

import numpy as np
import h5py
import hpelm
from sklearn.metrics import mean_squared_error as mse

import util
def write_hdf5(data, file):
    """Write `data` to `file` via util.make_hdf5, overwriting any existing file."""
    if os.path.exists(file):
        os.remove(file)
    util.make_hdf5(data, file)
class HPELM_Ensemble():
    """Ensemble of hpelm.HPELM regressors with optional bagging and out-of-bag (OOB) error estimation."""

    def __init__(self, path, n_estimators, n_nodes, n_features, n_targets, t_nodes='sigm',
                 bootstrap=True, oob=False, max_features=None, precision='double',
                 accelerator=None, save_model=False):
        self.n_est = n_estimators
        self.n_nodes = n_nodes
        self.t_nodes = t_nodes
        self.nf = n_features
        self.nt = n_targets
        self.bootstrap = bootstrap
        self.oob = oob
        self.max_features = max_features
        self.save_model = save_model
        self.model_path = path
        if not os.path.exists(path):
            os.mkdir(path)
        if not os.path.exists(os.path.join(path, 'tmp')):
            os.mkdir(os.path.join(path, 'tmp'))
        if self.max_features is not None:
            # one row of selected feature indices per estimator
            self.feature_selector = np.zeros((n_estimators, self.max_features), dtype=int)
        self.estimators_ = [hpelm.HPELM(n_features, n_targets, accelerator=accelerator,
                                        precision=precision)
                            for _ in range(n_estimators)]
    def fit(self, X, T, val=None, val_X=None, val_T=None, regularization='', error_threshold=1.0):
        # X, T, val_X, val_T are paths to hdf5 files; auxiliary files are
        # stored under self.model_path
        MAX_ATTEMPTS_TRAIN = 10
        self.train_times_ = []
        print('\n\nTraining model')
        logfile = util.find_name(os.path.join(self.model_path, 'log_train.csv'))
        with open(logfile, 'w') as csvfile:
            w = csv.writer(csvfile, delimiter=',')
            w.writerow(['model_ID', 'walltime', 'cputime'])
        if self.bootstrap:
            # load training features and targets from hdf5
            x = util.get_matrix(X)
            t = util.get_matrix(T)
            # set up parameters for the fitting
            n = x.shape[0]
            oob_ds = None
            # if the out-of-bag error is to be computed later, open a file to save the oob mask
            if self.oob:
                oob_file = os.path.join(self.model_path, 'OOB.hdf5')
                f = h5py.File(oob_file, 'w')
                oob_ds = f.create_dataset('data', (n, self.n_est), dtype='i')
        # loop through all the estimators
        for m, model in enumerate(self.estimators_):
            print('Fitting model %d' % m)
            tt = util.Timer()
            # apply bootstrap: resample the training set with replacement
            if self.bootstrap:
                X_tr, T_tr = self._bootstrap_iteration(m, n, x, t, oob_ds)
            else:
                X_tr = X
                T_tr = T
            model.add_neurons(self.n_nodes, self.t_nodes)
            if val is None:
                model.train(X_tr, T_tr, regularization)
            else:
                # re-train (on a fresh bootstrap sample) as often as needed to
                # achieve a validation error below error_threshold, giving up
                # after MAX_ATTEMPTS_TRAIN attempts
                recompute = True
                count = 0
                while recompute and count < MAX_ATTEMPTS_TRAIN:
                    model.train(X_tr, T_tr, val, regularization, Xv=val_X, Tv=val_T)
                    if error_threshold is None:
                        recompute = False
                    else:
                        recompute = self._check_error(model, error_threshold, val_X, val_T)
                    if recompute:
                        print("Error exceeded threshold: re-training the model")
                        count += 1
                        X_tr, T_tr = self._bootstrap_iteration(m, n, x, t, oob_ds)
            if self.save_model:
                model.save(os.path.join(self.model_path, 'model_%02d.hdf5' % m))
            tt.stop(print_any=False)
            with open(logfile, 'a') as csvfile:
                w = csv.writer(csvfile, delimiter=',')
                w.writerow([m, tt.walltime, tt.cputime])
            self.train_times_.append([tt.cputime, tt.walltime])
        if self.bootstrap and self.oob:
            f.close()
    def load(self):
        """Load all estimators from previously saved model files."""
        for m, model in enumerate(self.estimators_):
            model_file = os.path.join(self.model_path, 'model_%02d.hdf5' % m)
            try:
                model.load(model_file)
            except Exception:
                print('ERROR: could not load model in ' + model_file)
    def predict(self, X, Y=None, t=None, eval=False, norm=None, label=''):
        # t: targets, used to calculate errors "on the fly"
        print('\n\nPredicting for %s' % label)
        self.prediction_times_ = []
        if Y is None:
            y_out = []
        else:
            body = os.path.splitext(Y)[0]
        logfile = util.find_name(os.path.join(self.model_path, 'log_pred_%s.csv' % label))
        if t is not None:
            y_pred = np.zeros(t.shape)
            self.mse_ = np.zeros(self.n_est)   # running MSE of the growing ensemble mean
            mse_models = np.zeros(self.n_est)  # MSE of each individual estimator
            get_mse = True
        else:
            get_mse = False
        with open(logfile, 'w') as csvfile:
            w = csv.writer(csvfile, delimiter=',')
            if get_mse:
                w.writerow(['model_ID', 'mse', 'mse_model', 'walltime', 'cputime'])
            else:
                w.writerow(['model_ID', 'walltime', 'cputime'])
        if self.bootstrap and self.max_features is not None:
            features = util.get_matrix(X)
            X_pred = os.path.join(self.model_path, 'tmp', 'pred_x.hdf5')
        else:
            X_pred = X
        for m, model in enumerate(self.estimators_):
            print('Predicting on model %d' % m)
            tt = util.Timer()
            if self.bootstrap and self.max_features is not None:
                # restrict the features to the subset this estimator was trained on
                write_hdf5(features[:, self.feature_selector[m, :]], X_pred)
            if Y is None:
                y_pred_tmp = model.predict(X_pred)
                y_out.append(y_pred_tmp)
                if get_mse:
                    y_pred += y_pred_tmp
                    self.mse_[m] = mse(t, y_pred / (m + 1))
                    mse_models[m] = mse(t, y_pred_tmp)
            else:
                Y_pred = '%s_%02d.hdf5' % (body, m)
                model.predict(X_pred, Y_pred)
            tt.stop(print_any=False)
            with open(logfile, 'a') as csvfile:
                w = csv.writer(csvfile, delimiter=',')
                if get_mse:
                    w.writerow([m, self.mse_[m], mse_models[m], tt.walltime, tt.cputime])
                else:
                    w.writerow([m, tt.walltime, tt.cputime])
            self.prediction_times_.append([tt.cputime, tt.walltime])
        if Y is None:
            if t is not None:
                return y_pred / self.n_est, y_out
            else:
                return y_out
        elif eval:
            util.merge_files(Y, self.n_est, batches=True, norm=norm)
    def oob_prediction(self, Y, norm=None):
        # Combine the per-estimator prediction files written by predict() into
        # the ensemble mean, the ensemble variance, and the out-of-bag (OOB)
        # prediction, using the OOB mask stored in self.model_path/OOB.hdf5.
        # NOTE: for now not done in batches (assumes the dataset is small
        # enough to fit in memory - a necessary condition for the bootstrap!)
        body = os.path.splitext(Y)[0]
        filepath = os.path.split(Y)[0]
        oob_file = os.path.join(self.model_path, 'OOB.hdf5')
        oob_inds = util.get_matrix(oob_file)
        n = oob_inds.shape[0]
        oob_count = np.zeros((n, 1))
        oob_sigma = np.zeros((n, self.nt))
        y_sigma = np.zeros((n, self.nt))
        y_sigma_sq = np.zeros((n, self.nt))
        for m in range(self.n_est):
            inds = oob_inds[:, m].reshape((-1, 1))
            oob_mask = np.repeat(inds, self.nt, axis=1)
            Y_pred = '%s_%02d.hdf5' % (body, m)
            y_tmp = util.get_matrix(Y_pred)
            os.remove(Y_pred)
            if norm is not None:
                y_tmp = norm.rescale(y_tmp)
            y_sigma += y_tmp
            y_sigma_sq += y_tmp ** 2
            oob_count += inds
            oob_sigma += oob_mask * y_tmp
        # samples that were in-bag for every estimator have no OOB prediction;
        # substitute the overall ensemble prediction for them
        zero_inds = np.flatnonzero(oob_count == 0)
        print('%d out of %d samples never out-of-bag' % (len(zero_inds), n))
        print('-> substituted these samples with overall prediction')
        oob_sigma[zero_inds, :] = y_sigma[zero_inds, :]
        oob_count[zero_inds] = self.n_est
        mean = y_sigma / self.n_est
        # one-pass (population) variance over the ensemble: E[y^2] - E[y]^2
        var = 1.0 / self.n_est * (y_sigma_sq - y_sigma ** 2 / self.n_est)
        oob = oob_sigma / oob_count
        util.make_hdf5(mean, Y)
        util.make_hdf5(var, body + '_var.hdf5')
        util.make_hdf5(oob, os.path.join(filepath, 'oob_prediction.hdf5'))
        return oob, var, mean
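    # The one-pass variance above rests on the identity
    #     Var(y) = (1/M) * sum_m y_m**2 - ((1/M) * sum_m y_m)**2,
    # i.e. the (population) variance over the M = n_est ensemble members.
    # A minimal sanity check of the identity, illustrative only and not part
    # of the pipeline (`preds` is a hypothetical stack of member outputs):
    #
    #     preds = np.random.rand(10, 5, 2)   # (members, samples, targets)
    #     one_pass = (preds ** 2).sum(0) / 10 - (preds.sum(0) / 10) ** 2
    #     assert np.allclose(one_pass, preds.var(axis=0))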
    def _check_error(self, model, threshold, feature_file, target_file):
        # get the matrix of target values
        target = util.get_matrix(target_file)
        # compute the prediction of the current model and its mean squared
        # error; report whether the error still exceeds the threshold
        prediction = model.predict(feature_file)
        return mse(target, prediction) >= threshold
    def _bootstrap_iteration(self, model_ID, n_samples, features, targets, oob_ds=None):
        # draw n_samples sample indices with replacement
        ind = np.random.randint(n_samples, size=n_samples)
        if self.oob:
            # mark the samples that were never drawn (out-of-bag) with 1
            oob_vec = np.ones(n_samples)
            oob_vec[np.unique(ind)] = 0
            oob_ds[:, model_ID] = oob_vec
        if self.max_features is not None:
            # draw a random subset of max_features feature columns
            ftrs = np.random.permutation(self.nf)[:self.max_features]
            self.feature_selector[model_ID, :] = ftrs
        else:
            ftrs = np.arange(self.nf)
        train = features[ind][:, ftrs]
        train_t = targets[ind]
        X_tr = os.path.join(self.model_path, 'tmp', 'train_x.hdf5')
        T_tr = os.path.join(self.model_path, 'tmp', 'train_t.hdf5')
        write_hdf5(train, X_tr)
        write_hdf5(train_t, T_tr)
        return X_tr, T_tr
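
# ----------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the original module).
# It assumes feature/target matrices already stored as hdf5 files in the
# layout expected by util.get_matrix / util.make_hdf5; all file names and
# sizes below are hypothetical placeholders.
# ----------------------------------------------------------------------------
if __name__ == '__main__':
    ens = HPELM_Ensemble(path='ensemble_model', n_estimators=10, n_nodes=100,
                         n_features=16, n_targets=1, bootstrap=True, oob=True)
    # train with validation-based re-training ('V' is hpelm's validation flag)
    ens.fit('train_x.hdf5', 'train_t.hdf5', val='V',
            val_X='val_x.hdf5', val_T='val_t.hdf5', error_threshold=1.0)
    # write per-estimator predictions to disk, then reduce them to the
    # ensemble mean, variance, and out-of-bag prediction
    ens.predict('train_x.hdf5', Y='pred.hdf5', label='train')
    oob, var, mean = ens.oob_prediction('pred.hdf5')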
