Page MenuHomec4science

ELM_ensemble.py
No OneTemporary

File Metadata

Created
Mon, Apr 28, 23:49

ELM_ensemble.py

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import pandas as pd
import xarray as xr
import os
import sys
import time
import hpelm
import util
from ds import Dataset
from tables import open_file, Atom, Filters
import csv
from sklearn.metrics import mean_squared_error as mse
import h5py
def find_name(filename):
    """Return a variant of *filename* that does not yet exist on disk.

    If the path is free it is returned unchanged; otherwise an increasing
    integer prefix ('1_', '2_', ...) is prepended to the base name until an
    unused path is found.
    """
    directory, base = os.path.split(filename)
    stem, suffix = os.path.splitext(base)
    counter = 0
    candidate = filename
    while os.path.exists(candidate):
        counter += 1
        candidate = os.path.join(directory, '%d_%s%s' % (counter, stem, suffix))
    return candidate
class HPELM_Ensemble():
    """Ensemble of HPELM (high-performance extreme learning machine) regressors.

    Each estimator is an independent ``hpelm.hp_elm.HPELM`` model; predictions
    are averaged over the ensemble. Optional bootstrap resampling per estimator
    enables out-of-bag (OOB) error estimation. Models, CSV timing logs and
    temporary HDF5 files are stored under ``path``.
    """

    def __init__(self, path, n_estimators, n_nodes, n_features, n_targets, t_nodes = 'sigm', bootstrap = True, oob = False, precision = 'double', accelerator = None, save_model = True):
        # path:         directory for models, logs and temp files (created if absent)
        # n_estimators: number of ELM models in the ensemble
        # n_nodes:      hidden neurons per model; t_nodes: neuron type (e.g. 'sigm')
        # n_features / n_targets: input / output dimensionality
        # bootstrap:    resample the training set for each model
        # oob:          record out-of-bag masks (requires bootstrap)
        # precision / accelerator: forwarded to hpelm.hp_elm.HPELM
        # save_model:   persist each trained model as model_XX.hdf5
        self.n_est = n_estimators
        self.n_nodes = n_nodes
        self.t_nodes = t_nodes
        self.nf = n_features
        self.nt = n_targets
        self.bootstrap = bootstrap
        self.oob = oob
        self.save_model = save_model
        self.model_path = path
        if not os.path.exists(path): os.mkdir(path)
        if not os.path.exists(os.path.join(path, 'tmp')): os.mkdir(os.path.join(path, 'tmp'))
        self.estimators_ = []
        for i in range(n_estimators):
            self.estimators_.append(hpelm.hp_elm.HPELM(n_features, n_targets, accelerator = accelerator, precision = precision))

    def fit(self, X, T, val = None, val_X = None, val_T = None):
        """Train every estimator; log per-model wall/CPU time to a CSV file.

        X, T, val_X, val_T are file paths (HDF5) as consumed by hpelm.train;
        ``val`` is hpelm's validation-mode flag. With bootstrap enabled the
        data are loaded into memory (util.get_matrix), resampled with
        replacement per model and written back to temp HDF5 files. With
        ``oob`` enabled, a (n_samples x n_estimators) 0/1 mask of out-of-bag
        rows is stored in OOB.hdf5 for later use by ``oob_prediction``.
        """
        self.train_times_ = []
        print('\n\nTraining model')
        logfile = find_name(os.path.join(self.model_path,('log_train.csv')))
        with open(logfile, 'w') as csvfile:
            w = csv.writer(csvfile, delimiter=',')
            w.writerow(['model_ID', 'walltime', 'cputime'])
        if self.bootstrap:
            # NOTE: bootstrap assumes the full dataset fits in memory.
            x = util.get_matrix(X)
            t = util.get_matrix(T)
            n = x.shape[0]
            if self.oob:
                oob_file = os.path.join(self.model_path, 'OOB.hdf5')
                f = h5py.File(oob_file, 'w')
                oob_ds = f.create_dataset('data', (n, self.n_est), dtype='i')
        for m, model in enumerate(self.estimators_):
            print('Fitting model %d' %m)
            tt = util.Timer()
            if self.bootstrap:
                # Sample n row indices with replacement.
                ind = np.random.randint(n, size = n)
                if self.oob:
                    # Rows never drawn for this model are out-of-bag (flag = 1).
                    oob_vec = np.zeros(n)
                    oob_vec[np.delete(range(n), np.unique(ind))] = 1
                    oob_ds[:, m] = oob_vec
                train = x[ind,:]
                train_t = t[ind]
                X_tr = os.path.join(self.model_path, 'tmp', 'train_x.hdf5')
                T_tr = os.path.join(self.model_path, 'tmp', 'train_t.hdf5')
                # Overwrite the temp files from the previous iteration.
                if os.path.exists(X_tr):
                    os.remove(X_tr)
                if os.path.exists(T_tr):
                    os.remove(T_tr)
                util.make_hdf5(train, X_tr)
                util.make_hdf5(train_t, T_tr)
            else:
                X_tr = X
                T_tr = T
            model.add_neurons(self.n_nodes, self.t_nodes)
            if val is None:
                model.train(X_tr, T_tr)
            else:
                model.train(X_tr, T_tr, val, Xv = val_X, Tv = val_T)
            if self.save_model: model.save(os.path.join(self.model_path, ('model_%02d.hdf5' %(m))))
            tt.stop(print_any = False)
            # Re-open in append mode each iteration so the log survives a crash.
            with open(logfile, 'a') as csvfile:
                w = csv.writer(csvfile, delimiter=',')
                w.writerow([m, tt.walltime, tt.cputime])
            self.train_times_.append([tt.cputime, tt.walltime])
        if self.oob: f.close()

    def load(self):
        """Load previously saved estimators (model_XX.hdf5) from model_path."""
        for m, model in enumerate(self.estimators_):
            try:
                model.load(os.path.join(self.model_path, ('model_%02d.hdf5' %(m))))
            # FIX: was a bare ``except:`` which also swallowed KeyboardInterrupt.
            except Exception:
                print('ERROR: could not load model in ' + os.path.join(self.model_path, ('model_%02d.hdf5' %(m))))

    def predict(self, X, Y = None, t = None, eval = False, norm = None, label = ''):
        """Predict with every estimator.

        If Y is None, predictions are kept in memory: returns the list of
        per-model predictions, plus the ensemble mean when target ``t`` is
        given (running MSE per ensemble size is logged to self.mse_).
        Otherwise each model writes its prediction to '<Y-stem>_XX.hdf5'; when
        ``eval`` is True these files are merged via util.merge_files.
        Per-model timings are logged to log_pred_<label>.csv.
        NOTE: ``eval`` shadows the builtin, kept for interface compatibility.
        NOTE: when Y is given and t is not None, self.mse_ stays all-zero
        (errors are only computed for in-memory predictions).
        """
        print('\n\nPredicting for %s' %label)
        self.prediction_times_ = []
        if Y is None:
            y_out = []
        else:
            body = os.path.splitext(Y)[0]
        logfile = find_name(os.path.join(self.model_path,('log_pred_%s.csv' %label)))
        if t is not None:
            y_pred = np.zeros(t.shape)
            self.mse_ = np.zeros(self.n_est)
            get_mse = True
        else: get_mse = False
        with open(logfile, 'w') as csvfile:
            w = csv.writer(csvfile, delimiter=',')
            if get_mse: w.writerow(['model_ID', 'mse', 'walltime', 'cputime'])
            else: w.writerow(['model_ID', 'walltime', 'cputime'])
        for m, model in enumerate(self.estimators_):
            print('Predicting on model %d' %m)
            tt = util.Timer()
            if Y is None:
                y_pred_tmp = model.predict(X)
                y_out.append(y_pred_tmp)
                if get_mse:
                    # Running ensemble mean over the first m+1 models.
                    y_pred += y_pred_tmp
                    self.mse_[m] = mse(t, y_pred/(m+1))
            else:
                Y_pred = ('%s_%02d.hdf5' %(body,m))
                model.predict(X, Y_pred)
            tt.stop(print_any = False)
            with open(logfile, 'a') as csvfile:
                w = csv.writer(csvfile, delimiter=',')
                if get_mse: w.writerow([m, self.mse_[m], tt.walltime, tt.cputime])
                else: w.writerow([m, tt.walltime, tt.cputime])
            self.prediction_times_.append([tt.cputime, tt.walltime])
        if Y is None:
            if t is not None:
                return y_pred/self.n_est, y_out
            else:
                return y_out
        elif eval:
            util.merge_files(Y, self.n_est, batches = True, norm = norm)

    def oob_prediction(self, Y, norm = None):
        """Combine the per-model prediction files '<Y-stem>_XX.hdf5' into
        out-of-bag predictions, ensemble variance and ensemble mean.

        Uses the 0/1 OOB mask stored in OOB.hdf5 by ``fit``. Per-model files
        are deleted after being consumed. Not batched: assumes the dataset
        fits in memory (a precondition of bootstrap anyway).
        Returns (oob, var, mean), each of shape (n_samples, n_targets).
        """
        body = os.path.splitext(Y)[0]
        oob_file = os.path.join(self.model_path, 'OOB.hdf5')
        oob_inds = util.get_matrix(oob_file)
        n = oob_inds.shape[0]
        oob_count = np.zeros((n,1))
        oob_sigma = np.zeros((n,self.nt))
        y_sigma = np.zeros((n,self.nt))
        y_sigma_sq = np.zeros((n,self.nt))
        for m in range(self.n_est):
            inds = oob_inds[:,m].reshape((-1,1))
            oob_mask = np.repeat(inds,self.nt,axis = 1)
            Y_pred = ('%s_%02d.hdf5' %(body,m))
            y_tmp = util.get_matrix(Y_pred)
            os.remove(Y_pred)
            if norm is not None:
                y_tmp = norm.rescale(y_tmp)
            y_sigma += y_tmp
            y_sigma_sq += y_tmp ** 2
            oob_count += inds
            oob_sigma += oob_mask * y_tmp
        # FIX: np.nonzero(oob_count == 0) returns a tuple of per-axis index
        # arrays, so len() of it was the number of axes (always 2), not the
        # number of never-OOB samples, and using the tuple as a row index was
        # ill-formed. Extract the row indices explicitly instead.
        zero_rows = np.nonzero(oob_count[:, 0] == 0)[0]
        print('%d out of %d samples never out-of-bag' %(zero_rows.size, n))
        print('-> substituted these samples with overall prediction')
        # Fall back to the full-ensemble sum/count for rows with no OOB vote.
        oob_sigma[zero_rows,:] = y_sigma[zero_rows,:]
        oob_count[zero_rows] = self.n_est
        mean = y_sigma/self.n_est
        # Population variance: E[y^2] - E[y]^2, computed from running sums.
        var = 1.0/self.n_est*(y_sigma_sq-y_sigma**2/self.n_est)
        oob = oob_sigma/oob_count
        return oob, var, mean

Event Timeline