ELM_ensemble_variant.py
import os
import csv

import numpy as np
import h5py
import hpelm
from sklearn.metrics import mean_squared_error as mse

import util
def write_hdf5(data, file):
    """Write `data` to `file` via util.make_hdf5, overwriting any existing file."""
    if os.path.exists(file):
        os.remove(file)
    util.make_hdf5(data, file)
class HPELM_Ensemble():
    """Ensemble of hpelm.HPELM regressors with optional bagging and out-of-bag (OOB) error estimation."""

    def __init__(self, path, n_estimators, n_nodes, n_features, n_targets, t_nodes='sigm',
                 bootstrap=True, oob=False, max_features=None, precision='double',
                 accelerator=None, save_model=False):
        self.n_est = n_estimators
        self.n_nodes = n_nodes
        self.t_nodes = t_nodes
        self.nf = n_features
        self.nt = n_targets
        self.bootstrap = bootstrap
        self.oob = oob
        self.max_features = max_features
        self.save_model = save_model
        self.model_path = path
        if not os.path.exists(path):
            os.mkdir(path)
        if not os.path.exists(os.path.join(path, 'tmp')):
            os.mkdir(os.path.join(path, 'tmp'))
        if self.max_features is not None:
            # one row of selected feature indices per estimator
            self.feature_selector = np.zeros((n_estimators, self.max_features), dtype=int)
        self.estimators_ = [hpelm.HPELM(n_features, n_targets, accelerator=accelerator,
                                        precision=precision)
                            for _ in range(n_estimators)]
    def fit(self, X, T, val=None, val_X=None, val_T=None, regularization='', error_threshold=1.0):
        # X, T, val_X, val_T are paths to hdf5 files; auxiliary files are
        # stored under self.model_path
        MAX_ATTEMPTS_TRAIN = 10
        self.train_times_ = []
        print('\n\nTraining model')
        logfile = util.find_name(os.path.join(self.model_path, 'log_train.csv'))
        with open(logfile, 'w') as csvfile:
            w = csv.writer(csvfile, delimiter=',')
            w.writerow(['model_ID', 'walltime', 'cputime'])
        if self.bootstrap:
            # load training features and targets from hdf5
            x = util.get_matrix(X)
            t = util.get_matrix(T)
            # set up parameters for the fitting
            n = x.shape[0]
            oob_ds = None
            # if the out-of-bag error is to be computed later, open a file to save the oob mask
            if self.oob:
                oob_file = os.path.join(self.model_path, 'OOB.hdf5')
                f = h5py.File(oob_file, 'w')
                oob_ds = f.create_dataset('data', (n, self.n_est), dtype='i')
        # loop through all the estimators
        for m, model in enumerate(self.estimators_):
            print('Fitting model %d' % m)
            tt = util.Timer()
            # apply bootstrap: resample the training set with replacement
            if self.bootstrap:
                X_tr, T_tr = self._bootstrap_iteration(m, n, x, t, oob_ds)
            else:
                X_tr = X
                T_tr = T
            model.add_neurons(self.n_nodes, self.t_nodes)
            if val is None:
                model.train(X_tr, T_tr, regularization)
            else:
                # re-train (on a fresh bootstrap sample) as often as needed to
                # achieve a validation error below error_threshold, giving up
                # after MAX_ATTEMPTS_TRAIN attempts
                recompute = True
                count = 0
                while recompute and count < MAX_ATTEMPTS_TRAIN:
                    model.train(X_tr, T_tr, val, regularization, Xv=val_X, Tv=val_T)
                    if error_threshold is None:
                        recompute = False
                    else:
                        recompute = self._check_error(model, error_threshold, val_X, val_T)
                    if recompute:
                        print("Error exceeded threshold: re-training the model")
                        count += 1
                        X_tr, T_tr = self._bootstrap_iteration(m, n, x, t, oob_ds)
            if self.save_model:
                model.save(os.path.join(self.model_path, 'model_%02d.hdf5' % m))
            tt.stop(print_any=False)
            with open(logfile, 'a') as csvfile:
                w = csv.writer(csvfile, delimiter=',')
                w.writerow([m, tt.walltime, tt.cputime])
            self.train_times_.append([tt.cputime, tt.walltime])
        if self.bootstrap and self.oob:
            f.close()
    def load(self):
        """Load all estimators from previously saved model files."""
        for m, model in enumerate(self.estimators_):
            model_file = os.path.join(self.model_path, 'model_%02d.hdf5' % m)
            try:
                model.load(model_file)
            except Exception:
                print('ERROR: could not load model in ' + model_file)
    def predict(self, X, Y=None, t=None, eval=False, norm=None, label=''):
        # t: targets, used to calculate errors "on the fly"
        print('\n\nPredicting for %s' % label)
        self.prediction_times_ = []
        if Y is None:
            y_out = []
        else:
            body = os.path.splitext(Y)[0]
        logfile = util.find_name(os.path.join(self.model_path, 'log_pred_%s.csv' % label))
        if t is not None:
            y_pred = np.zeros(t.shape)
            self.mse_ = np.zeros(self.n_est)   # running MSE of the growing ensemble mean
            mse_models = np.zeros(self.n_est)  # MSE of each individual estimator
            get_mse = True
        else:
            get_mse = False
        with open(logfile, 'w') as csvfile:
            w = csv.writer(csvfile, delimiter=',')
            if get_mse:
                w.writerow(['model_ID', 'mse', 'mse_model', 'walltime', 'cputime'])
            else:
                w.writerow(['model_ID', 'walltime', 'cputime'])
        if self.bootstrap and self.max_features is not None:
            features = util.get_matrix(X)
            X_pred = os.path.join(self.model_path, 'tmp', 'pred_x.hdf5')
        else:
            X_pred = X
        for m, model in enumerate(self.estimators_):
            print('Predicting on model %d' % m)
            tt = util.Timer()
            if self.bootstrap and self.max_features is not None:
                # restrict the features to the subset this estimator was trained on
                write_hdf5(features[:, self.feature_selector[m, :]], X_pred)
            if Y is None:
                y_pred_tmp = model.predict(X_pred)
                y_out.append(y_pred_tmp)
                if get_mse:
                    y_pred += y_pred_tmp
                    self.mse_[m] = mse(t, y_pred / (m + 1))
                    mse_models[m] = mse(t, y_pred_tmp)
            else:
                Y_pred = '%s_%02d.hdf5' % (body, m)
                model.predict(X_pred, Y_pred)
            tt.stop(print_any=False)
            with open(logfile, 'a') as csvfile:
                w = csv.writer(csvfile, delimiter=',')
                if get_mse:
                    w.writerow([m, self.mse_[m], mse_models[m], tt.walltime, tt.cputime])
                else:
                    w.writerow([m, tt.walltime, tt.cputime])
            self.prediction_times_.append([tt.cputime, tt.walltime])
        if Y is None:
            if t is not None:
                return y_pred / self.n_est, y_out
            else:
                return y_out
        elif eval:
            util.merge_files(Y, self.n_est, batches=True, norm=norm)
    def oob_prediction(self, Y, norm=None):
        # Combine the per-estimator prediction files written by predict() into
        # the ensemble mean, the ensemble variance, and the out-of-bag (OOB)
        # prediction, using the OOB mask stored in self.model_path/OOB.hdf5.
        # NOTE: for now not done in batches (assumes the dataset is small
        # enough to fit in memory - a necessary condition for the bootstrap!)
        body = os.path.splitext(Y)[0]
        filepath = os.path.split(Y)[0]
        oob_file = os.path.join(self.model_path, 'OOB.hdf5')
        oob_inds = util.get_matrix(oob_file)
        n = oob_inds.shape[0]
        oob_count = np.zeros((n, 1))
        oob_sigma = np.zeros((n, self.nt))
        y_sigma = np.zeros((n, self.nt))
        y_sigma_sq = np.zeros((n, self.nt))
        for m in range(self.n_est):
            inds = oob_inds[:, m].reshape((-1, 1))
            oob_mask = np.repeat(inds, self.nt, axis=1)
            Y_pred = '%s_%02d.hdf5' % (body, m)
            y_tmp = util.get_matrix(Y_pred)
            os.remove(Y_pred)
            if norm is not None:
                y_tmp = norm.rescale(y_tmp)
            y_sigma += y_tmp
            y_sigma_sq += y_tmp ** 2
            oob_count += inds
            oob_sigma += oob_mask * y_tmp
        # samples that were in-bag for every estimator have no OOB prediction;
        # substitute the overall ensemble prediction for them
        zero_inds = np.flatnonzero(oob_count == 0)
        print('%d out of %d samples never out-of-bag' % (len(zero_inds), n))
        print('-> substituted these samples with overall prediction')
        oob_sigma[zero_inds, :] = y_sigma[zero_inds, :]
        oob_count[zero_inds] = self.n_est
        mean = y_sigma / self.n_est
        # one-pass (population) variance over the ensemble: E[y^2] - E[y]^2
        var = 1.0 / self.n_est * (y_sigma_sq - y_sigma ** 2 / self.n_est)
        oob = oob_sigma / oob_count
        util.make_hdf5(mean, Y)
        util.make_hdf5(var, body + '_var.hdf5')
        util.make_hdf5(oob, os.path.join(filepath, 'oob_prediction.hdf5'))
        return oob, var, mean
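    # The one-pass variance above rests on the identity
    #     Var(y) = (1/M) * sum_m y_m**2 - ((1/M) * sum_m y_m)**2,
    # i.e. the (population) variance over the M = n_est ensemble members.
    # A minimal sanity check of the identity, illustrative only and not part
    # of the pipeline (`preds` is a hypothetical stack of member outputs):
    #
    #     preds = np.random.rand(10, 5, 2)   # (members, samples, targets)
    #     one_pass = (preds ** 2).sum(0) / 10 - (preds.sum(0) / 10) ** 2
    #     assert np.allclose(one_pass, preds.var(axis=0))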
    def _check_error(self, model, threshold, feature_file, target_file):
        # get the matrix of target values
        target = util.get_matrix(target_file)
        # compute the prediction of the current model and its mean squared
        # error; report whether the error still exceeds the threshold
        prediction = model.predict(feature_file)
        return mse(target, prediction) >= threshold
    def _bootstrap_iteration(self, model_ID, n_samples, features, targets, oob_ds=None):
        # draw n_samples sample indices with replacement
        ind = np.random.randint(n_samples, size=n_samples)
        if self.oob:
            # mark the samples that were never drawn (out-of-bag) with 1
            oob_vec = np.ones(n_samples)
            oob_vec[np.unique(ind)] = 0
            oob_ds[:, model_ID] = oob_vec
        if self.max_features is not None:
            # draw a random subset of max_features feature columns
            ftrs = np.random.permutation(self.nf)[:self.max_features]
            self.feature_selector[model_ID, :] = ftrs
        else:
            ftrs = np.arange(self.nf)
        train = features[ind][:, ftrs]
        train_t = targets[ind]
        X_tr = os.path.join(self.model_path, 'tmp', 'train_x.hdf5')
        T_tr = os.path.join(self.model_path, 'tmp', 'train_t.hdf5')
        write_hdf5(train, X_tr)
        write_hdf5(train_t, T_tr)
        return X_tr, T_tr
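
# ----------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the original module).
# It assumes feature/target matrices already stored as hdf5 files in the
# layout expected by util.get_matrix / util.make_hdf5; all file names and
# sizes below are hypothetical placeholders.
# ----------------------------------------------------------------------------
if __name__ == '__main__':
    ens = HPELM_Ensemble(path='ensemble_model', n_estimators=10, n_nodes=100,
                         n_features=16, n_targets=1, bootstrap=True, oob=True)
    # train with validation-based re-training ('V' is hpelm's validation flag)
    ens.fit('train_x.hdf5', 'train_t.hdf5', val='V',
            val_X='val_x.hdf5', val_T='val_t.hdf5', error_threshold=1.0)
    # write per-estimator predictions to disk, then reduce them to the
    # ensemble mean, variance, and out-of-bag prediction
    ens.predict('train_x.hdf5', Y='pred.hdf5', label='train')
    oob, var, mean = ens.oob_prediction('pred.hdf5')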
