Page MenuHomec4science

ELM_ensemble.py
No OneTemporary

File Metadata

Created
Sun, Apr 27, 21:31

ELM_ensemble.py

import numpy as np
import pandas as pd
import xarray as xr
import os
import sys
import time
import hpelm
import util
from ds import Dataset
from tables import open_file, Atom, Filters
import csv
from sklearn.metrics import mean_squared_error as mse
import h5py
def find_name(filename):
    """Return a path derived from *filename* that does not exist yet.

    If *filename* itself is free it is returned unchanged.  Otherwise the
    basename is prefixed with ``1_``, ``2_``, ... (same directory) until a
    path is found for which ``os.path.exists`` is false.
    """
    # Guard clause: nothing to do when the requested name is still free.
    if not os.path.exists(filename):
        return filename

    directory, basename = os.path.split(filename)
    counter = 1
    candidate = os.path.join(directory, '%d_%s' % (counter, basename))
    while os.path.exists(candidate):
        counter += 1
        candidate = os.path.join(directory, '%d_%s' % (counter, basename))
    return candidate
class HPELM_Ensemble():
    """Ensemble of ``hpelm.HPELM`` models trained with optional bootstrapping.

    Every estimator is an independent ``hpelm.HPELM`` instance.  Models, logs,
    temporary bootstrap samples and the out-of-bag (OOB) indicator matrix are
    all stored below ``path``.

    Attributes set by the methods:
        estimators_        list of ``hpelm.HPELM`` objects (``__init__``)
        train_times_       ``[cputime, walltime]`` per model (``fit``)
        prediction_times_  ``[cputime, walltime]`` per model (``predict``)
        mse_               running-mean MSE per model (``predict`` with ``t``)
    """

    def __init__(self, path, n_estimators, n_nodes, n_features, n_targets,
                 t_nodes = 'sigm', bootstrap = True, oob = False,
                 precision = None, accelerator = None, save_model = True):
        """Create the estimators and the output directories.

        path         -- directory for models, logs and temporary files
        n_estimators -- number of HPELM models in the ensemble
        n_nodes      -- number of neurons added to every model in ``fit``
        n_features   -- input dimension passed to ``hpelm.HPELM``
        n_targets    -- output dimension passed to ``hpelm.HPELM``
        t_nodes      -- neuron type passed to ``add_neurons``
        bootstrap    -- resample the training set with replacement per model
        oob          -- record which samples are out-of-bag (needs bootstrap)
        precision    -- forwarded to ``hpelm.HPELM``; defaults to 'single'
                        when ``accelerator == 'GPU'`` and 'double' otherwise
        accelerator  -- forwarded unchanged to ``hpelm.HPELM``
        save_model   -- save every trained model as ``model_XX.hdf5``
        """
        self.n_est = n_estimators
        self.n_nodes = n_nodes
        self.t_nodes = t_nodes
        self.nf = n_features
        self.nt = n_targets
        self.bootstrap = bootstrap
        self.oob = oob
        self.save_model = save_model
        self.model_path = path

        # makedirs also creates missing parent directories (mkdir would fail).
        tmp_dir = os.path.join(path, 'tmp')
        if not os.path.exists(tmp_dir):
            os.makedirs(tmp_dir)

        if precision is None:
            precision = 'single' if accelerator == 'GPU' else 'double'

        self.estimators_ = [
            hpelm.HPELM(n_features, n_targets, accelerator = accelerator, precision = precision)
            for _ in range(n_estimators)
        ]

    def fit(self, X, T, val = None, val_X = None, val_T = None):
        """Train every estimator and log timings to ``log_train*.csv``.

        X, T         -- training inputs / targets; handed to ``util.get_matrix``
                        (bootstrap) or directly to ``HPELM.train``
                        (presumably HDF5 file names -- confirm against callers)
        val          -- optional validation-mode argument forwarded
                        positionally to ``HPELM.train``
        val_X, val_T -- validation data forwarded as ``Xv`` / ``Tv``

        With ``bootstrap`` enabled each model is trained on ``n`` rows drawn
        with replacement; the sample is written to ``<path>/tmp`` first.  When
        ``oob`` is enabled as well, an ``(n, n_estimators)`` integer matrix is
        stored in ``<path>/OOB.hdf5`` with 1 marking out-of-bag rows.
        """
        self.train_times_ = []
        print('\n\nTraining model')

        logfile = find_name(os.path.join(self.model_path, 'log_train.csv'))
        with open(logfile, 'w') as csvfile:
            csv.writer(csvfile, delimiter=',').writerow(['model_ID', 'walltime', 'cputime'])

        # OOB bookkeeping is only meaningful for bootstrap samples; requiring
        # both flags avoids touching undefined state when bootstrap is off.
        track_oob = self.bootstrap and self.oob
        oob_h5 = None
        oob_ds = None

        if self.bootstrap:
            x = util.get_matrix(X)
            t = util.get_matrix(T)
            n = x.shape[0]
            boot_x = os.path.join(self.model_path, 'tmp', 'train_x.hdf5')
            boot_t = os.path.join(self.model_path, 'tmp', 'train_t.hdf5')

        if track_oob:
            oob_h5 = h5py.File(os.path.join(self.model_path, 'OOB.hdf5'), 'w')
            oob_ds = oob_h5.create_dataset('data', (n, self.n_est), dtype='i')

        try:
            for m, model in enumerate(self.estimators_):
                print('Fitting model %d' %m)
                tt = util.Timer()

                if self.bootstrap:
                    ind = np.random.randint(n, size = n)
                    if track_oob:
                        # 1 = row was never drawn for this model (out-of-bag)
                        oob_vec = np.ones(n)
                        oob_vec[ind] = 0
                        oob_ds[:, m] = oob_vec

                    # Remove the previous model's sample before writing anew.
                    for stale in (boot_x, boot_t):
                        if os.path.exists(stale):
                            os.remove(stale)
                    util.make_hdf5(x[ind,:], boot_x)
                    util.make_hdf5(t[ind], boot_t)
                    X_tr, T_tr = boot_x, boot_t
                else:
                    X_tr, T_tr = X, T

                model.add_neurons(self.n_nodes, self.t_nodes)
                if val is None:
                    model.train(X_tr, T_tr)
                else:
                    model.train(X_tr, T_tr, val, Xv = val_X, Tv = val_T)

                if self.save_model:
                    model.save(os.path.join(self.model_path, 'model_%02d.hdf5' % m))

                tt.stop(print_any = False)
                with open(logfile, 'a') as csvfile:
                    csv.writer(csvfile, delimiter=',').writerow([m, tt.walltime, tt.cputime])
                self.train_times_.append([tt.cputime, tt.walltime])
        finally:
            # Close the OOB file even when training raises.
            if oob_h5 is not None:
                oob_h5.close()

    def load(self):
        """Load every estimator from ``<path>/model_XX.hdf5`` (best effort).

        A model that fails to load is reported on stdout and skipped so the
        remaining models are still loaded.  Only ``Exception`` subclasses are
        caught; ``KeyboardInterrupt`` / ``SystemExit`` propagate.
        """
        for m, model in enumerate(self.estimators_):
            model_file = os.path.join(self.model_path, 'model_%02d.hdf5' % m)
            try:
                model.load(model_file)
            except Exception as err:
                print('ERROR: could not load model in ' + model_file + ' (%r)' % (err,))

    def predict(self, X, Y = None, t = None, eval = False, norm = None, label = ''):
        """Run every estimator on *X* and log timings to ``log_pred_<label>*.csv``.

        X     -- input forwarded to ``HPELM.predict``
        Y     -- ``None`` for in-memory prediction; otherwise an output file
                 name from which per-model files ``<Y without ext>_XX.hdf5``
                 are derived
        t     -- optional target array; in in-memory mode the MSE of the
                 running ensemble mean is stored in ``self.mse_`` and logged
        eval  -- in file mode, call ``util.merge_files`` afterwards
        norm  -- forwarded to ``util.merge_files``
        label -- used in the log file name and the progress message

        Returns ``(mean_prediction, per_model_predictions)`` when ``Y`` is
        ``None`` and ``t`` is given, the list of per-model predictions when
        only ``Y`` is ``None``, and ``None`` in file mode.
        """
        print('\n\nPredicting for %s' %label)
        self.prediction_times_ = []

        in_memory = Y is None
        if in_memory:
            y_out = []
        else:
            body = os.path.splitext(Y)[0]

        logfile = find_name(os.path.join(self.model_path, 'log_pred_%s.csv' % label))

        get_mse = t is not None
        if get_mse:
            y_pred = np.zeros(t.shape)
            # NOTE(review): in file mode (Y given) no MSE is computed, so the
            # logged 'mse' column stays at this initial 0.
            self.mse_ = np.zeros(self.n_est)

        header = ['model_ID', 'mse', 'walltime', 'cputime'] if get_mse else ['model_ID', 'walltime', 'cputime']
        with open(logfile, 'w') as csvfile:
            csv.writer(csvfile, delimiter=',').writerow(header)

        for m, model in enumerate(self.estimators_):
            print('Predicting on model %d' %m)
            tt = util.Timer()

            if in_memory:
                y_pred_tmp = model.predict(X)
                y_out.append(y_pred_tmp)
                if get_mse:
                    # error of the mean over the first m+1 models
                    y_pred += y_pred_tmp
                    self.mse_[m] = mse(t, y_pred/(m+1))
            else:
                model.predict(X, '%s_%02d.hdf5' % (body, m))

            tt.stop(print_any = False)
            if get_mse:
                row = [m, self.mse_[m], tt.walltime, tt.cputime]
            else:
                row = [m, tt.walltime, tt.cputime]
            with open(logfile, 'a') as csvfile:
                csv.writer(csvfile, delimiter=',').writerow(row)
            self.prediction_times_.append([tt.cputime, tt.walltime])

        if in_memory:
            if get_mse:
                return y_pred/self.n_est, y_out
            return y_out
        if eval:
            util.merge_files(Y, self.n_est, batches = True, norm = norm)

    @staticmethod
    def _finalize_oob(y_sigma, y_sigma_sq, oob_sigma, oob_count, n_est):
        """Turn accumulated sums into (oob, var, mean, n_never_oob).

        y_sigma, y_sigma_sq -- (n, nt) sums of predictions / squared predictions
        oob_sigma           -- (n, nt) sum of predictions of models for which
                               the row was out-of-bag (modified in place)
        oob_count           -- (n, 1) number of such models (modified in place)
        n_est               -- number of models summed over

        Rows that were never out-of-bag fall back to the overall ensemble
        mean.  A boolean row mask is used: indexing with the tuple returned by
        ``np.nonzero`` on the (n, 1) count array would also hit row 0.
        """
        never_oob = (oob_count == 0).ravel()
        n_never = int(never_oob.sum())

        oob_sigma[never_oob, :] = y_sigma[never_oob, :]
        oob_count[never_oob] = n_est

        mean = y_sigma/n_est
        var = 1.0/n_est*(y_sigma_sq-y_sigma**2/n_est)
        oob = oob_sigma/oob_count
        return oob, var, mean, n_never

    def oob_prediction(self, Y, norm = None):
        """Combine per-model prediction files into mean, variance and OOB mean.

        Reads ``<path>/OOB.hdf5`` and the per-model files
        ``<Y without ext>_XX.hdf5`` (each is deleted after reading), optionally
        rescales them with ``norm.rescale`` and writes

            Y                                   -- ensemble mean
            <Y without ext>_var.hdf5            -- variance across models
            <dir of Y>/oob_prediction.hdf5      -- out-of-bag mean

        Not batched: every prediction file is loaded into memory completely.

        Returns ``(oob, var, mean)``.
        """
        body = os.path.splitext(Y)[0]
        filepath = os.path.split(Y)[0]

        oob_inds = util.get_matrix(os.path.join(self.model_path, 'OOB.hdf5'))
        n = oob_inds.shape[0]

        oob_count = np.zeros((n,1))
        oob_sigma = np.zeros((n,self.nt))
        y_sigma = np.zeros((n,self.nt))
        y_sigma_sq = np.zeros((n,self.nt))

        for m in range(self.n_est):
            inds = oob_inds[:,m].reshape((-1,1))
            Y_pred = '%s_%02d.hdf5' % (body, m)
            y_tmp = util.get_matrix(Y_pred)
            os.remove(Y_pred)
            if norm is not None:
                y_tmp = norm.rescale(y_tmp)

            y_sigma += y_tmp
            y_sigma_sq += y_tmp ** 2
            oob_count += inds
            # (n, 1) indicator broadcasts over the nt target columns
            oob_sigma += inds * y_tmp

        oob, var, mean, n_never = self._finalize_oob(
            y_sigma, y_sigma_sq, oob_sigma, oob_count, self.n_est)
        print('%d out of %d samples never out-of-bag' %(n_never, n))
        print('-> substituted these samples with overall prediction')

        util.make_hdf5(mean, Y)
        util.make_hdf5(var, body + '_var.hdf5')
        util.make_hdf5(oob, os.path.join(filepath, 'oob_prediction.hdf5'))
        return oob, var, mean

Event Timeline