RF_tests_2.py
import numpy as np
import pandas as pd
import xarray as xr
import os
import hpelm
import util
from ds import Dataset
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error as mse
from tables import open_file, Atom, Filters
from sklearn.externals import joblib
from meteo_data import Meteo_Reader
from features import Training, Testing
import h5py
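
# Third-party stack: numpy/pandas for arrays and tables, xarray and h5py for
# gridded I/O, scikit-learn for the random forest. `util`, `ds`, `features`
# and `meteo_data` appear to be project-local modules from this repository,
# not PyPI packages. Note that `sklearn.externals.joblib` was deprecated in
# scikit-learn 0.21 and removed in 0.23; on newer versions use
# `import joblib` directly.
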
data_path = os.path.abspath("/mnt/sda1/hyenergy/data/meteo")
# data_path = os.path.abspath("/Users/alinawalch/Documents/EPFL/data/meteo")

testname = 'test_rf_2'
locmasks = ['rand500', 'rand1000']
# locmasks = ['rand100']
t_mask = 'rand'

# querynames = ['query_locs_13d_2000']
querynames = ['query_locs_13d_500', 'query_locs_13d_250']

forest_size = 500
max_treedepth = 100
# forest_size = 10
# max_treedepth = 10
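
# Experiment grid: one forest per (location mask, feature set) pair, with
# forest_size trees capped at max_treedepth; the commented-out values above
# look like reduced settings for quick test runs.
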
ds = 2001
ftrs = ['_3D', '_6D']
ftr_lists = (['x', 'y', 'z', 'month', 'hour'],
             ['x', 'y', 'z', 'medDoG', 'big_NS', 'big_EW', 'month', 'hour'])
lbl_list = ['SIS']

hours = list(range(3, 20))
months = list(range(1, 13))
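
# Two feature sets are compared: '_3D' uses position (x, y, z) plus month and
# hour, while '_6D' adds three further features (medDoG, big_NS, big_EW),
# presumably terrain descriptors. The single target is 'SIS', likely surface
# incoming shortwave radiation in the CM SAF naming. Queries cover hours 3-19
# and all twelve months.
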
if not os.path.exists(os.path.join(data_path, 'datasets', testname + '.csv')):
    with open(os.path.join(data_path, 'datasets', testname + '.csv'), 'w') as f:
        f.write('dataset,t_locs,n_locs,n_features,n_trees,'
                + 'fit_t_cpu,fit_t_wall,tr_mse,tr_mse_0,tr_t_cpu,tr_t_wall,'
                + 'te_mse,te_mse_0,te_t_cpu,te_t_wall,'
                + 'qu_t_cpu,qu_t_wall,query\n')
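
# The header above defines one summary row per (mask, feature set) iteration.
# The row's fields are appended piecewise throughout the loop body below, so
# a failed iteration leaves a partial row that is only closed by the final
# newline write at the end of the loop.
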
for locmask in locmasks:
    for ftr, ftr_list in zip(ftrs, ftr_lists):
        # get precise number of training and testing locations
        mask = np.loadtxt('locations/%s.txt' % locmask)
        n_mask = np.sum(mask)
        # set location masks for the training and test
        train_locs = "locations/" + locmask + "_train.txt"
        test_locs = "locations/" + locmask + "_test.txt"
        dsname = str(ds) + '_' + locmask + '_SIS' + ftr

        print('Making datasets ... ')
        t_set = util.Timer()
        new_set = Training(data_path, dsname, ftr_list, lbl_list)
        new_set.make_dataset(year=ds, sample_name=train_locs, test_name=test_locs)
        new_set.normalize_all(feature_norm='mean', target_norm='mean', val_ratio=1.0)
        for queryname in querynames:
            # queryname = querynames[0]
            print('Making query dataset for %s' % queryname)
            myquery = Testing(data_path, dsname, query_name=queryname)
            myquery.make_query(loc=queryname + '.csv', hour=hours, month=months)
            myquery.normalize_input()
        t_set.stop(print_wallclock=False)
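
        # `Training` appears to build the HDF5 train/test matrices for one
        # year at the masked locations and to store the mean-normalization
        # statistics; `Testing` builds the query inputs (all hour/month
        # combinations at the query locations). Both come from the
        # project-local `features` module.
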
        # load dataset
        print('\nload dataset')
        my_ds = Dataset(data_path, dsname)
        # my_ds.get_matrices(['train', 'test', 'val'])
        n_ftrs = len(my_ds.feature_norm.names)

        # make matrices for training and testing
        x0 = util.get_matrix(my_ds.train_X0)
        t0 = util.get_matrix(my_ds.train_T0).reshape((-1,))
        x = my_ds.feature_norm.normalize(x0)
        t = my_ds.target_norm.normalize(t0)
        test_x0 = util.get_matrix(my_ds.test_X0)
        test_t0 = util.get_matrix(my_ds.test_T0).reshape((-1,))
        test_x = my_ds.feature_norm.normalize(test_x0)
        test_t = my_ds.target_norm.normalize(test_t0)
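
        # Naming convention: a trailing 0 (x0, t0, y0, ...) marks values in
        # physical units; x/t/y without it are normalized. The forest is fit
        # and scored in normalized space, and the *_mse_0 columns report the
        # corresponding errors after rescaling to physical units.
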
        print('Current iteration: ds = %d, mask = %s, n_ftrs = %d, tree size = %d'
              % (ds, locmask, n_ftrs, forest_size))

        ####################
        t_fit = util.Timer(start=False)
        t_tr = util.Timer(start=False)
        t_te = util.Timer(start=False)
        t_qu = util.Timer(start=False)
        tr_mse = tr_mse_0 = te_mse = te_mse_0 = -1
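
        # Timers are created stopped and started around each phase below; the
        # MSE fields are initialized to -1, apparently as failure sentinels,
        # before entering the try-block.
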

        with open(os.path.join(data_path, 'datasets', testname + '.csv'), 'a') as f:
            f.write('%d,%s,%d,%d,%d,' % (ds, t_mask, n_mask, n_ftrs, forest_size))

        try:
            # make a new model
            modelname = 'RF' + str(forest_size) + '_1'
            print('\nadding model')
            my_ds.add_model(modelname)
            tree = RandomForestRegressor(n_estimators=forest_size,
                                         max_depth=max_treedepth, n_jobs=-1)

            print('\ntree fitting (training)')
            t_fit.start()
            tree.fit(x, t)
            t_fit.stop()
            joblib.dump(tree, os.path.join(my_ds.model_path, 'model.pkl'))
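
            # The fitted forest is persisted via joblib (pickle-based); it
            # can be restored later with
            # joblib.load(os.path.join(my_ds.model_path, 'model.pkl')).
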
            with open(os.path.join(my_ds.model_path, 'log.txt'), 'w') as f:
                f.write('t_CPU,t_wall\n%f,%f' % (t_fit.cputime, t_fit.walltime))
            with open(os.path.join(data_path, 'datasets', testname + '.csv'), 'a') as f:
                f.write('%f,%f,' % (t_fit.cputime, t_fit.walltime))
            print('\nsaved tree')

            print('\nprediction on training set')
            t_tr.start()
            y = tree.predict(x)
            tr_mse = mse(t, y)
            y0 = my_ds.target_norm.rescale(y)
            tr_mse_0 = mse(t0, y0)
            t_tr.stop()
            with h5py.File(my_ds.train_Y0, 'w') as f:
                dset = f.create_dataset('prediction', data=y0)
            with h5py.File(my_ds.train_Y, 'w') as f:
                dset = f.create_dataset('prediction', data=y)
            with open(os.path.join(my_ds.train_path_out, 'log.txt'), 'w') as f:
                f.write('t_CPU,t_wall,mse,mse0\n%f,%f,%f,%f'
                        % (t_tr.cputime, t_tr.walltime, tr_mse, tr_mse_0))
            with open(os.path.join(data_path, 'datasets', testname + '.csv'), 'a') as f:
                f.write('%f,%f,%f,%f,' % (tr_mse, tr_mse_0, t_tr.cputime, t_tr.walltime))

            print('\nprediction on test set')
            t_te.start()
            test_y = tree.predict(test_x)
            te_mse = mse(test_t, test_y)
            test_y0 = my_ds.target_norm.rescale(test_y)
            te_mse_0 = mse(test_t0, test_y0)
            t_te.stop()
            with h5py.File(my_ds.test_Y0, 'w') as f:
                dset = f.create_dataset('prediction', data=test_y0)
            with h5py.File(my_ds.test_Y, 'w') as f:
                dset = f.create_dataset('prediction', data=test_y)
            with open(os.path.join(my_ds.test_path_out, 'log.txt'), 'w') as f:
                f.write('t_CPU,t_wall,mse,mse0\n%f,%f,%f,%f'
                        % (t_te.cputime, t_te.walltime, te_mse, te_mse_0))
            with open(os.path.join(data_path, 'datasets', testname + '.csv'), 'a') as f:
                f.write('%f,%f,%f,%f,' % (te_mse, te_mse_0, t_te.cputime, t_te.walltime))
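
            # The test-set block mirrors the training-set block above:
            # predict in normalized space, rescale, and log both MSE values
            # plus CPU/wall times to the per-model log and the summary CSV.
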
            ####################
            for queryname in querynames:
                # queryname = querynames[0]
                ####################
                t_qu = util.Timer(start=False)
                my_ds.load_query(queryname)
                # my_ds.get_matrices(['query'])
                my_ds.add_query_to_model(queryname)
                query_x0 = util.get_matrix(my_ds.query_X0)
                query_x = my_ds.feature_norm.normalize(query_x0)

                print('\nprediction on query set')
                t_qu.start()
                query_y = tree.predict(query_x)
                t_qu.stop()
                query_y0 = my_ds.target_norm.rescale(query_y)
                rf_out = my_ds.make_xarray(query_x0, query_y0, rescale=False)
                with h5py.File(my_ds.query_Y0, 'w') as f:
                    dset = f.create_dataset('prediction', data=query_y0)
                with h5py.File(my_ds.query_Y, 'w') as f:
                    dset = f.create_dataset('prediction', data=query_y)
                with open(os.path.join(my_ds.query_path_out, 'log.txt'), 'w') as f:
                    f.write('t_CPU,t_wall\n%f,%f' % (t_qu.cputime, t_qu.walltime))
                with open(os.path.join(data_path, 'datasets', testname + '.csv'), 'a') as f:
                    f.write('%f,%f,%s' % (t_qu.cputime, t_qu.walltime, queryname))
                rf_out.to_netcdf(os.path.join(my_ds.query_path_out, 'prediction.nc'))
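
                # make_xarray presumably reassembles the flat prediction
                # vector onto the location/hour/month grid so the NetCDF cube
                # can be consumed by downstream mapping tools. Note that the
                # summary-CSV write above adds no separator between the
                # fields of successive query names on the same row.
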
        except Exception as err:
            # catch Exception rather than everything, so KeyboardInterrupt
            # still aborts the run; report the cause before moving on
            print('Error occurred during evaluation of forest - skip current iteration')
            print(err)

        with open(os.path.join(data_path, 'datasets', testname + '.csv'), 'a') as f:
            f.write('\n')