Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F119711008
ELM_ensemble_variant.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Sat, Jun 28, 19:58
Size
10 KB
Mime Type
text/x-python
Expires
Mon, Jun 30, 19:58 (1 d, 23 h)
Engine
blob
Format
Raw Data
Handle
27039109
Attached To
R8800 solar_potential
ELM_ensemble_variant.py
View Options
import
numpy
as
np
import
pandas
as
pd
import
xarray
as
xr
import
os
import
sys
import
time
import
hpelm
import
util
from
ds
import
Dataset
from
tables
import
open_file
,
Atom
,
Filters
import
csv
from
sklearn.metrics
import
mean_squared_error
as
mse
import
h5py
def write_hdf5(data, file):
    """Write ``data`` to ``file`` via ``util.make_hdf5``, replacing any old file.

    An already existing ``file`` is deleted first so that the helper always
    starts from a clean path.

    Args:
        data: Object handed unchanged to ``util.make_hdf5``.
        file: Destination path of the HDF5 file.
    """
    already_there = os.path.exists(file)
    if already_there:
        # Drop the stale file before the helper recreates it.
        os.remove(file)
    util.make_hdf5(data, file)
class HPELM_Ensemble():
    """Ensemble of ``hpelm.HPELM`` models with optional bagging.

    Every estimator is trained either on the full training files or, when
    ``bootstrap`` is set, on a bootstrap sample of the rows (and optionally a
    random subset of ``max_features`` columns). Auxiliary files (logs, saved
    models, the out-of-bag mask and temporary training/prediction matrices)
    are written below ``path``.

    Attributes set by the constructor:
        n_est, n_nodes, t_nodes, nf, nt, bootstrap, oob, max_features,
        save_model, model_path, estimators_ and, if ``max_features`` is
        given, feature_selector (one row of column indices per estimator).
    """

    def __init__(self, path, n_estimators, n_nodes, n_features, n_targets,
                 t_nodes='sigm', bootstrap=True, oob=False, max_features=None,
                 precision='double', accelerator=None, save_model=False):
        """Create the estimators and the working directories.

        Args:
            path: Directory for models, logs and the ``tmp`` sub-directory.
            n_estimators: Number of HPELM models in the ensemble.
            n_nodes: Number of neurons added to each model in ``fit``.
            n_features: Number of columns of the feature matrix.
            n_targets: Number of columns of the target matrix.
            t_nodes: Neuron type passed to ``add_neurons``.
            bootstrap: Train each model on a bootstrap sample of the rows.
            oob: Record the out-of-bag mask of each bootstrap sample.
            max_features: Number of randomly chosen feature columns per
                model (only effective together with ``bootstrap``).
            precision: Forwarded to ``hpelm.HPELM``.
            accelerator: Forwarded to ``hpelm.HPELM``.
            save_model: Save every model to ``path`` after training.
        """
        self.n_est = n_estimators
        self.n_nodes = n_nodes
        self.t_nodes = t_nodes
        self.nf = n_features
        self.nt = n_targets
        self.bootstrap = bootstrap
        self.oob = oob
        self.max_features = max_features
        self.save_model = save_model
        self.model_path = path

        # Creates ``path`` as well when it is missing.
        tmp_dir = os.path.join(path, 'tmp')
        if not os.path.exists(tmp_dir):
            os.makedirs(tmp_dir)

        if self.max_features is not None:
            # Integer dtype: the rows are used as column indices in predict().
            self.feature_selector = np.zeros(
                (self.n_est, self.max_features), dtype=int)

        # Feature sub-sampling only happens inside the bootstrap step, so
        # the models see ``max_features`` columns only in that configuration.
        if self.bootstrap and self.max_features is not None:
            n_inputs = self.max_features
        else:
            n_inputs = n_features

        self.estimators_ = [
            hpelm.HPELM(n_inputs, n_targets, accelerator=accelerator,
                        precision=precision)
            for _ in range(n_estimators)
        ]

    def fit(self, X, T, val=None, val_X=None, val_T=None, regularization='',
            error_threshold=1.0):
        """Train all estimators and log the timing of each one.

        Args:
            X, T: Files holding the training features and targets.
            val: Validation mode forwarded to ``HPELM.train``; ``None``
                trains without validation.
            val_X, val_T: Files holding validation features and targets.
            regularization: Extra positional argument for ``HPELM.train``.
            error_threshold: With validation enabled, a model whose
                validation MSE is at or above this value is retrained (at
                most ``MAX_ATTEMPTS_TRAIN`` trainings in total). ``None``
                disables the check.
        """
        MAX_ATTEMPTS_TRAIN = 10
        self.train_times_ = []
        print('\n\nTraining model')

        logfile = util.find_name(
            os.path.join(self.model_path, 'log_train.csv'))
        with open(logfile, 'w') as csvfile:
            w = csv.writer(csvfile, delimiter=',')
            w.writerow(['model_ID', 'walltime', 'cputime'])

        x = t = None
        n = 0
        oob_h5 = None
        oob_ds = None
        if self.bootstrap:
            # Bootstrapping works on in-memory matrices.
            x = util.get_matrix(X)
            t = util.get_matrix(T)
            n = x.shape[0]
            if self.oob:
                # One column per estimator; 1 marks an out-of-bag sample.
                oob_h5 = h5py.File(
                    os.path.join(self.model_path, 'OOB.hdf5'), 'w')
                oob_ds = oob_h5.create_dataset(
                    'data', (n, self.n_est), dtype='i')

        try:
            for m, model in enumerate(self.estimators_):
                print('Fitting model %d' % m)
                tt = util.Timer()

                if self.bootstrap:
                    X_tr, T_tr = self._bootstrap_iteration(
                        m, n, x, t, oob_ds)
                else:
                    X_tr, T_tr = X, T

                model.add_neurons(self.n_nodes, self.t_nodes)

                if val is None:
                    model.train(X_tr, T_tr, regularization)
                else:
                    for attempt in range(MAX_ATTEMPTS_TRAIN):
                        model.train(X_tr, T_tr, val, regularization,
                                    Xv=val_X, Tv=val_T)
                        if error_threshold is None:
                            break
                        if not self._check_error(model, error_threshold,
                                                 val_X, val_T):
                            break
                        if attempt + 1 == MAX_ATTEMPTS_TRAIN:
                            # No further training follows: keep the sample
                            # (and OOB mask) the model was trained on.
                            break
                        print("Error exceeded threshold: "
                              "re-training the model")
                        if self.bootstrap:
                            X_tr, T_tr = self._bootstrap_iteration(
                                m, n, x, t, oob_ds)

                if self.save_model:
                    model.save(os.path.join(
                        self.model_path, 'model_%02d.hdf5' % m))

                tt.stop(print_any=False)
                with open(logfile, 'a') as csvfile:
                    w = csv.writer(csvfile, delimiter=',')
                    w.writerow([m, tt.walltime, tt.cputime])
                self.train_times_.append([tt.cputime, tt.walltime])
        finally:
            # Close the OOB file even if training fails half-way.
            if oob_h5 is not None:
                oob_h5.close()

    def load(self):
        """Load every estimator from ``model_%02d.hdf5`` in ``model_path``.

        Loading is best effort: a model that cannot be loaded is reported
        on stdout and the remaining models are still attempted.
        """
        for m, model in enumerate(self.estimators_):
            model_file = os.path.join(
                self.model_path, 'model_%02d.hdf5' % m)
            try:
                model.load(model_file)
            except Exception as err:
                print('ERROR: could not load model in ' + model_file
                      + ': %s' % err)

    def predict(self, X, Y=None, t=None, eval=False, norm=None, label=''):
        """Predict with every estimator and log timings (and MSE).

        Args:
            X: File holding the features to predict on.
            Y: Output file name; when given, model ``m`` writes its
                prediction to ``<Y without extension>_<m>.hdf5``. When
                ``None`` the predictions are kept in memory.
            t: Target matrix used to compute errors "on the fly"; only
                evaluated for in-memory predictions (``Y is None``).
            eval: With ``Y`` given, merge the per-model files through
                ``util.merge_files`` afterwards.
            norm: Forwarded to ``util.merge_files``.
            label: Suffix of the prediction log file name.

        Returns:
            ``(mean_prediction, per_model_predictions)`` if ``Y`` is None
            and ``t`` is given, the list of per-model predictions if only
            ``Y`` is None, otherwise ``None``.
        """
        print('\n\nPredicting for %s' % label)
        self.prediction_times_ = []

        if Y is None:
            y_out = []
        else:
            body = os.path.splitext(Y)[0]

        logfile = util.find_name(os.path.join(
            self.model_path, 'log_pred_%s.csv' % label))

        get_mse = t is not None
        if get_mse:
            y_pred = np.zeros(t.shape)
            self.mse_ = np.zeros(self.n_est)
            mse_models = np.zeros(self.n_est)

        with open(logfile, 'w') as csvfile:
            w = csv.writer(csvfile, delimiter=',')
            if get_mse:
                w.writerow(['model_ID', 'mse', 'mse_model',
                            'walltime', 'cputime'])
            else:
                w.writerow(['model_ID', 'walltime', 'cputime'])

        subsample = self.bootstrap and self.max_features is not None
        if subsample:
            # Each model needs its own column subset, written to a
            # temporary file before prediction.
            features = util.get_matrix(X)
            X_pred = os.path.join(self.model_path, 'tmp', 'pred_x.hdf5')
        else:
            X_pred = X

        for m, model in enumerate(self.estimators_):
            print('Predicting on model %d' % m)
            tt = util.Timer()

            if subsample:
                write_hdf5(features[:, self.feature_selector[m, :]], X_pred)

            if Y is None:
                y_pred_tmp = model.predict(X_pred)
                y_out.append(y_pred_tmp)
                if get_mse:
                    y_pred += y_pred_tmp
                    # Error of the running ensemble mean and of this model.
                    self.mse_[m] = mse(t, y_pred / (m + 1))
                    mse_models[m] = mse(t, y_pred_tmp)
            else:
                Y_pred = '%s_%02d.hdf5' % (body, m)
                model.predict(X_pred, Y_pred)

            tt.stop(print_any=False)
            with open(logfile, 'a') as csvfile:
                w = csv.writer(csvfile, delimiter=',')
                if get_mse:
                    # NOTE(review): for file-based predictions the MSE
                    # columns stay at their initial zeros.
                    w.writerow([m, self.mse_[m], mse_models[m],
                                tt.walltime, tt.cputime])
                else:
                    w.writerow([m, tt.walltime, tt.cputime])
            self.prediction_times_.append([tt.cputime, tt.walltime])

        if Y is None:
            if get_mse:
                return y_pred / self.n_est, y_out
            return y_out
        elif eval:
            util.merge_files(Y, self.n_est, batches=True, norm=norm)

    def oob_prediction(self, Y, norm=None):
        """Combine the per-model prediction files into mean, variance and
        out-of-bag prediction.

        Reads ``OOB.hdf5`` from ``model_path`` and the files
        ``<Y without extension>_<m>.hdf5`` (which are deleted after being
        read). Not processed in batches: all matrices are held in memory.
        Samples that were never out-of-bag receive the overall ensemble
        mean as their out-of-bag prediction.

        Args:
            Y: Output file for the ensemble mean; the variance goes to
                ``<Y without extension>_var.hdf5`` and the out-of-bag
                prediction to ``oob_prediction.hdf5`` next to ``Y``.
            norm: Optional object whose ``rescale`` method is applied to
                every per-model prediction.

        Returns:
            Tuple ``(oob, var, mean)``.
        """
        body = os.path.splitext(Y)[0]
        filepath = os.path.split(Y)[0]

        oob_inds = util.get_matrix(
            os.path.join(self.model_path, 'OOB.hdf5'))
        n = oob_inds.shape[0]

        oob_count = np.zeros((n, 1))
        oob_sigma = np.zeros((n, self.nt))
        y_sigma = np.zeros((n, self.nt))
        y_sigma_sq = np.zeros((n, self.nt))

        for m in range(self.n_est):
            inds = oob_inds[:, m].reshape((-1, 1))
            Y_pred = '%s_%02d.hdf5' % (body, m)
            y_tmp = util.get_matrix(Y_pred)
            os.remove(Y_pred)
            if norm is not None:
                y_tmp = norm.rescale(y_tmp)

            y_sigma += y_tmp
            y_sigma_sq += y_tmp ** 2
            oob_count += inds
            # (n, 1) mask broadcasts over the target columns.
            oob_sigma += inds * y_tmp

        oob, var, mean, n_never = self._finalize_oob(
            y_sigma, y_sigma_sq, oob_sigma, oob_count, self.n_est)
        print('%d out of %d samples never out-of-bag' % (n_never, n))
        if n_never:
            print('-> substituted these samples with overall prediction')

        util.make_hdf5(mean, Y)
        util.make_hdf5(var, body + '_var.hdf5')
        util.make_hdf5(oob, os.path.join(filepath, 'oob_prediction.hdf5'))
        return oob, var, mean

    @staticmethod
    def _finalize_oob(y_sigma, y_sigma_sq, oob_sigma, oob_count, n_est):
        """Turn accumulated sums into (oob, var, mean, n_never_oob).

        Args:
            y_sigma: Sum of all model predictions, shape (n, targets).
            y_sigma_sq: Sum of the squared model predictions.
            oob_sigma: Sum of the predictions of models for which the
                sample was out-of-bag.
            oob_count: Number of such models per sample, shape (n, 1).
            n_est: Number of models in the ensemble.

        The inputs are not modified.
        """
        oob_sigma = np.array(oob_sigma, dtype=float)
        oob_count = np.array(oob_count, dtype=float)

        # Row mask; only rows that were never out-of-bag are substituted.
        never_oob = oob_count[:, 0] == 0
        oob_sigma[never_oob, :] = y_sigma[never_oob, :]
        oob_count[never_oob] = n_est

        mean = y_sigma / n_est
        var = 1.0 / n_est * (y_sigma_sq - y_sigma ** 2 / n_est)
        oob = oob_sigma / oob_count
        return oob, var, mean, int(never_oob.sum())

    def _check_error(self, model, threshold, feature_file, target_file):
        """Return True if the model's MSE on the given files reaches or
        exceeds ``threshold`` (i.e. the model should be retrained)."""
        target = util.get_matrix(target_file)
        prediction = model.predict(feature_file)
        current_mse = mse(target, prediction)
        return bool(current_mse >= threshold)

    def _draw_bootstrap(self, n_samples):
        """Draw one bootstrap sample.

        Returns:
            ``(ind, oob_vec, ftrs)``: row indices drawn with replacement,
            a 0/1 vector with 1 for every row that was not drawn, and the
            selected feature columns (``max_features`` random columns out
            of ``nf``, or all columns if ``max_features`` is None).
        """
        ind = np.random.randint(n_samples, size=n_samples)

        oob_vec = np.ones(n_samples, dtype=int)
        oob_vec[ind] = 0

        if self.max_features is not None:
            ftrs = np.random.permutation(self.nf)[:self.max_features]
        else:
            ftrs = np.arange(self.nf)
        return ind, oob_vec, ftrs

    def _bootstrap_iteration(self, model_ID, n_samples, features, targets,
                             oob_ds=None):
        """Write a bootstrap sample for model ``model_ID`` to temporary
        files and return their paths ``(X_tr, T_tr)``.

        Records the out-of-bag mask in column ``model_ID`` of ``oob_ds``
        (if OOB tracking is on and a dataset is given) and the chosen
        feature columns in ``feature_selector``.
        """
        ind, oob_vec, ftrs = self._draw_bootstrap(n_samples)

        if self.oob and oob_ds is not None:
            oob_ds[:, model_ID] = oob_vec

        # Select rows first, then columns: indexing with both arrays at
        # once would pair them element-wise instead.
        train = features[ind]
        if self.max_features is not None:
            self.feature_selector[model_ID, :] = ftrs
            train = train[:, ftrs]
        train_t = targets[ind]

        X_tr = os.path.join(self.model_path, 'tmp', 'train_x.hdf5')
        T_tr = os.path.join(self.model_path, 'tmp', 'train_t.hdf5')
        write_hdf5(train, X_tr)
        write_hdf5(train_t, T_tr)
        return X_tr, T_tr
Event Timeline
Log In to Comment