ELM_ensemble.py
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Sun, Apr 27, 21:31

ELM_ensemble.py
View Options

	import numpy as np
	import pandas as pd
	import xarray as xr
	import os
	import sys
	import time
	import hpelm
	import util
	from ds import Dataset
	from tables import open_file, Atom, Filters
	import csv
	from sklearn.metrics import mean_squared_error as mse
	import h5py

	def find_name(filename):
		"""Return a path that does not exist yet, derived from `filename`.

		If nothing exists at `filename` it is returned unchanged. Otherwise the
		base name is prefixed with a running counter ('1_', '2_', ...) inside the
		same directory until an unused path is found.
		"""
		directory, basename = os.path.split(filename)
		candidate = filename
		attempt = 0
		while os.path.exists(candidate):
			attempt += 1
			# the counter is always applied to the ORIGINAL base name, never stacked
			candidate = os.path.join(directory, '%d_%s' % (attempt, basename))
		return candidate

	class HPELM_Ensemble():
		"""Ensemble of `hpelm.HPELM` models, optionally trained on bootstrap resamples.

		All artefacts (saved models, CSV timing logs, the out-of-bag indicator
		file 'OOB.hdf5' and temporary bootstrap training files under 'tmp/') are
		written below the directory given as `path`.
		"""

		def __init__(self, path, n_estimators, n_nodes, n_features, n_targets, t_nodes = 'sigm', bootstrap = True, oob = False, precision = None, accelerator = None, save_model = True):
			"""Create the output directories and the (untrained) estimators.

			path:         directory for models, logs and temporary files (created if missing)
			n_estimators: number of HPELM models in the ensemble
			n_nodes:      number of neurons passed to HPELM.add_neurons for each model
			n_features:   input dimension passed to hpelm.HPELM
			n_targets:    output dimension passed to hpelm.HPELM
			t_nodes:      neuron type passed to HPELM.add_neurons
			bootstrap:    if True, fit() trains every model on a resample drawn with replacement
			oob:          if True (and bootstrap is True), fit() records out-of-bag indicators
			precision:    forwarded to hpelm.HPELM; defaults to 'single' for accelerator == 'GPU',
			              otherwise 'double'
			accelerator:  forwarded to hpelm.HPELM unchanged
			save_model:   if True, fit() saves each trained model as 'model_XX.hdf5'
			"""
			self.n_est = n_estimators
			self.n_nodes = n_nodes
			self.t_nodes = t_nodes
			self.nf = n_features
			self.nt = n_targets
			self.bootstrap = bootstrap
			self.oob = oob
			self.save_model = save_model
			self.model_path = path

			# makedirs creates `path` (including missing parents) and 'tmp' in one go
			tmp_dir = os.path.join(path, 'tmp')
			if not os.path.isdir(tmp_dir):
				os.makedirs(tmp_dir)

			if precision is None:
				precision = 'single' if accelerator == 'GPU' else 'double'

			self.estimators_ = [
				hpelm.HPELM(n_features, n_targets, accelerator = accelerator, precision = precision)
				for _ in range(n_estimators)
			]

		def fit(self, X, T, val = None, val_X = None, val_T = None):
			"""Train every estimator and log per-model wall/CPU time to 'log_train.csv'.

			X, T:         training inputs / targets, given as files (see util.get_matrix)
			val:          if not None, forwarded positionally to HPELM.train together with
			              Xv = val_X and Tv = val_T (presumably a validation option flag -
			              confirm against the hpelm documentation)
			val_X, val_T: validation files, only used when `val` is given

			With bootstrap enabled the full data set is loaded into memory, each
			model is trained on a resample written to 'tmp/train_x.hdf5' and
			'tmp/train_t.hdf5', and - if `oob` is set - a 0/1 out-of-bag indicator
			column per model is stored in 'OOB.hdf5'. Out-of-bag indicators are
			only defined for bootstrap training, so `oob` is ignored otherwise.

			Sets self.train_times_ to a list of [cputime, walltime] per model.
			"""
			self.train_times_ = []
			print('\n\nTraining model')

			logfile = find_name(os.path.join(self.model_path, 'log_train.csv'))
			with open(logfile, 'w') as csvfile:
				csv.writer(csvfile, delimiter=',').writerow(['model_ID', 'walltime', 'cputime'])

			oob_h5 = None
			oob_ds = None
			if self.bootstrap:
				x = util.get_matrix(X)
				t = util.get_matrix(T)
				n = x.shape[0]

				if self.oob:
					oob_h5 = h5py.File(os.path.join(self.model_path, 'OOB.hdf5'), 'w')
					oob_ds = oob_h5.create_dataset('data', (n, self.n_est), dtype='i')

			# auxiliary files that hold the current bootstrap sample
			X_boot = os.path.join(self.model_path, 'tmp', 'train_x.hdf5')
			T_boot = os.path.join(self.model_path, 'tmp', 'train_t.hdf5')

			try:
				for m, model in enumerate(self.estimators_):
					print('Fitting model %d' %m)
					tt = util.Timer()

					if self.bootstrap:
						# draw n samples with replacement
						ind = np.random.randint(n, size = n)

						if oob_ds is not None:
							# 1 = sample was never drawn for this model (out-of-bag)
							oob_vec = np.ones(n, dtype = 'i')
							oob_vec[ind] = 0
							oob_ds[:, m] = oob_vec

						# remove the previous model's resample before writing the new one
						for stale in (X_boot, T_boot):
							if os.path.exists(stale):
								os.remove(stale)

						util.make_hdf5(x[ind,:], X_boot)
						util.make_hdf5(t[ind], T_boot)
						X_tr, T_tr = X_boot, T_boot
					else:
						X_tr, T_tr = X, T

					model.add_neurons(self.n_nodes, self.t_nodes)
					if val is None:
						model.train(X_tr, T_tr)
					else:
						model.train(X_tr, T_tr, val, Xv = val_X, Tv = val_T)

					if self.save_model:
						model.save(os.path.join(self.model_path, 'model_%02d.hdf5' %m))

					tt.stop(print_any = False)

					with open(logfile, 'a') as csvfile:
						csv.writer(csvfile, delimiter=',').writerow([m, tt.walltime, tt.cputime])

					self.train_times_.append([tt.cputime, tt.walltime])
			finally:
				# close the OOB file even if training fails half-way
				if oob_h5 is not None:
					oob_h5.close()

		def load(self):
			"""Load every estimator from 'model_XX.hdf5' in the model directory.

			Best effort: a model that cannot be loaded is reported on stdout and
			skipped, the remaining models are still loaded.
			"""
			for m, model in enumerate(self.estimators_):
				model_file = os.path.join(self.model_path, 'model_%02d.hdf5' %m)
				try:
					model.load(model_file)
				except Exception as err:
					print('ERROR: could not load model in ' + model_file + ': ' + str(err))

		def predict(self, X, Y = None, t = None, eval = False, norm = None, label = ''):
			"""Run every estimator on X and log per-model timings to 'log_pred_<label>.csv'.

			X:     input data, forwarded to HPELM.predict
			Y:     if None, predictions are kept in memory; otherwise the path of the
			       output file - each model writes to '<Y without extension>_XX.hdf5'
			t:     targets; used to calculate errors "on the fly" (in-memory mode only)
			eval:  in file mode, merge the per-model files via util.merge_files afterwards
			norm:  forwarded to util.merge_files
			label: suffix for the log file name and the progress message

			Returns
			  Y is None, t given: (mean prediction over all models, list of per-model predictions)
			  Y is None, t None:  list of per-model predictions
			  Y given:            None

			Sets self.prediction_times_ ([cputime, walltime] per model) and, when
			t is given, self.mse_ (MSE of the running ensemble mean after each model).
			"""
			print('\n\nPredicting for %s' %label)

			self.prediction_times_ = []

			in_memory = Y is None
			if in_memory:
				y_out = []
			else:
				body = os.path.splitext(Y)[0]

			logfile = find_name(os.path.join(self.model_path, 'log_pred_%s.csv' %label))

			get_mse = t is not None
			if get_mse:
				y_pred = np.zeros(t.shape)
				self.mse_ = np.zeros(self.n_est)

			with open(logfile, 'w') as csvfile:
				w = csv.writer(csvfile, delimiter=',')
				if get_mse:
					w.writerow(['model_ID', 'mse', 'walltime', 'cputime'])
				else:
					w.writerow(['model_ID', 'walltime', 'cputime'])

			for m, model in enumerate(self.estimators_):
				print('Predicting on model %d' %m)
				tt = util.Timer()

				if in_memory:
					y_pred_tmp = model.predict(X)
					y_out.append(y_pred_tmp)

					if get_mse:
						# error of the ensemble mean over the first m+1 models
						y_pred += y_pred_tmp
						self.mse_[m] = mse(t, y_pred/(m+1))
				else:
					# NOTE(review): in file mode no MSE is computed, so with t given
					# the 'mse' column of the log stays 0 - confirm this is intended
					model.predict(X, '%s_%02d.hdf5' %(body,m))

				tt.stop(print_any = False)

				with open(logfile, 'a') as csvfile:
					w = csv.writer(csvfile, delimiter=',')
					if get_mse:
						w.writerow([m, self.mse_[m], tt.walltime, tt.cputime])
					else:
						w.writerow([m, tt.walltime, tt.cputime])

				self.prediction_times_.append([tt.cputime, tt.walltime])

			if in_memory:
				if get_mse:
					return y_pred/self.n_est, y_out
				return y_out
			if eval:
				util.merge_files(Y, self.n_est, batches = True, norm = norm)

		def oob_prediction(self, Y, norm = None):
			"""Combine per-model prediction files into mean, variance and OOB prediction.

			Y:    output path; the per-model files '<Y without extension>_XX.hdf5'
			      written by predict() are read and then DELETED
			norm: optional object whose rescale() is applied to every per-model prediction

			Reads the out-of-bag indicators from 'OOB.hdf5' in the model directory.
			FOR NOW NOT IN BATCHES (assumes ds to be small enough - necessary
			condition for bootstrap!)

			Writes the ensemble mean to Y, the variance to '<Y without extension>_var.hdf5'
			and the out-of-bag prediction to 'oob_prediction.hdf5' next to Y.

			Returns (oob, var, mean).
			"""
			body = os.path.splitext(Y)[0]
			filepath = os.path.split(Y)[0]

			oob_inds = util.get_matrix(os.path.join(self.model_path, 'OOB.hdf5'))
			n = oob_inds.shape[0]

			oob_count = np.zeros((n,1))           # number of models for which a sample is OOB
			oob_sigma = np.zeros((n,self.nt))     # sum of predictions over those models
			y_sigma = np.zeros((n,self.nt))       # sum of predictions over all models
			y_sigma_sq = np.zeros((n,self.nt))    # sum of squared predictions over all models

			for m in range(self.n_est):
				inds = oob_inds[:,m].reshape((-1,1))

				Y_pred = '%s_%02d.hdf5' %(body,m)
				y_tmp = util.get_matrix(Y_pred)
				os.remove(Y_pred)

				if norm is not None:
					y_tmp = norm.rescale(y_tmp)

				y_sigma += y_tmp
				y_sigma_sq += y_tmp ** 2
				oob_count += inds
				# the (n,1) indicator column broadcasts over all targets
				oob_sigma += inds * y_tmp

			oob, var, mean, n_never = self._oob_statistics(y_sigma, y_sigma_sq, oob_sigma, oob_count, self.n_est)

			print('%d out of %d samples never out-of-bag' %(n_never, n))
			print('-> substituted these samples with overall prediction')

			util.make_hdf5(mean, Y)
			util.make_hdf5(var, body + '_var.hdf5')
			util.make_hdf5(oob, os.path.join(filepath, 'oob_prediction.hdf5'))

			return oob, var, mean

		@staticmethod
		def _oob_statistics(y_sigma, y_sigma_sq, oob_sigma, oob_count, n_est):
			"""Turn the accumulated sums into (oob, var, mean, n_never_oob).

			y_sigma, y_sigma_sq: (n, n_targets) sums of predictions / squared predictions
			oob_sigma:           (n, n_targets) sum of predictions over out-of-bag models
			oob_count:           (n, 1) number of models for which each sample is out-of-bag
			n_est:               number of models

			Samples that were never out-of-bag fall back to the overall ensemble mean.
			The inputs are not modified.
			"""
			never_oob = oob_count == 0
			n_never = int(np.count_nonzero(never_oob))

			# boolean (n,1) mask selects whole rows; no index arrays that could hit row 0
			oob_total = np.where(never_oob, y_sigma, oob_sigma)
			oob_votes = np.where(never_oob, n_est, oob_count)

			mean = y_sigma / float(n_est)
			# population variance: E[y^2] - E[y]^2
			var = (y_sigma_sq - y_sigma ** 2 / float(n_est)) / float(n_est)
			oob = oob_total / oob_votes
			return oob, var, mean, n_never

ELM_ensemble.py No One Temporary Actions

File Metadata

ELM_ensemble.py View Options

Event Timeline

ELM_ensemble.py
No OneTemporary
Actions

ELM_ensemble.py
View Options