Page MenuHomec4science

RF_tests_2.py
No OneTemporary

File Metadata

Created
Sun, Apr 27, 10:17

RF_tests_2.py

import numpy as np
import pandas as pd
import xarray as xr
import os
import hpelm
import util
from ds import Dataset
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error as mse
from tables import open_file, Atom, Filters
from sklearn.externals import joblib
from meteo_data import Meteo_Reader
from features import Training, Testing
import h5py
# ---------------------------------------------------------------------------
# Run configuration
# ---------------------------------------------------------------------------

# Root folder of the meteo data; the results CSV lives in its 'datasets'
# sub-folder.
# Alternative location (disabled): /Users/alinawalch/Documents/EPFL/data/meteo
data_path = os.path.abspath("/mnt/sda1/hyenergy/data/meteo")

# Base name of the results CSV written by this script.
testname = 'test_rf_2'

# Location masks to iterate over; each is resolved to files under 'locations/'.
# Alternative (disabled): ['rand100']
locmasks = ['rand500', 'rand1000']

# Label written to the 't_locs' column of the results CSV.
t_mask = 'rand'

# Query location sets; each name is also used as '<name>.csv' when the query
# dataset is built.
# Alternative (disabled): ['query_locs_13d_2000']
querynames = ['query_locs_13d_500', 'query_locs_13d_250']

# Random forest hyper-parameters (passed as n_estimators / max_depth).
# Alternative (disabled): forest_size = 10, max_treedepth = 10
forest_size = 500
max_treedepth = 100

# Year handed to the dataset builder; also the prefix of every dataset name.
ds = 2001

# Feature-set suffixes and, in the same order, the feature names of each set.
ftrs = ['_3D', '_6D']
ftr_lists = (
    ['x', 'y', 'z', 'month', 'hour'],
    ['x', 'y', 'z', 'medDoG', 'big_NS', 'big_EW', 'month', 'hour'],
)

# Target (label) names.
lbl_list = ['SIS']

# Hours (3..19) and months (1..12) requested when building query datasets.
hours = [hour for hour in range(3, 20)]
months = [month for month in range(1, 13)]
# Create the results CSV with its header row, but only when the file does not
# exist yet: opening in 'w' mode would otherwise truncate rows written by
# earlier runs.
# NOTE(review): the second column name is written with a leading space
# (' t_locs'); it is reproduced unchanged here.
results_csv = os.path.join(data_path, 'datasets', testname + '.csv')
results_header = (
    'dataset, t_locs,n_locs,n_features,n_trees,'
    'fit_t_cpu,fit_t_wall,tr_mse,tr_mse_0,tr_t_cpu,tr_t_wall,'
    'te_mse,te_mse_0,te_t_cpu,te_t_wall,'
    'qu_t_cpu,qu_t_wall,query\n'
)
if not os.path.exists(results_csv):
    with open(results_csv, 'w') as header_file:
        header_file.write(results_header)
def _append_to_results(text):
    """Append ``text`` verbatim to this run's results CSV.

    The file is ``<data_path>/datasets/<testname>.csv``. It is reopened in
    append mode and closed again on every call, so partial rows are written
    out step by step rather than held back until the end of an iteration.
    """
    with open(os.path.join(data_path, 'datasets', testname + '.csv'), 'a') as out:
        out.write(text)


# Benchmark loop: for every location mask and feature set, build the
# datasets, fit a random forest and record timings / errors of the training,
# test and query predictions in the results CSV.
for locmask in locmasks:
    # get precise number of training and testing locations
    # (depends only on the location mask, so it is loaded once per mask)
    mask = np.loadtxt('locations/%s.txt' % locmask)
    n_mask = np.sum(mask)

    # set location masks for the training and test
    train_locs = "locations/" + locmask + "_train.txt"
    test_locs = "locations/" + locmask + "_test.txt"

    for ftr, ftr_list in zip(ftrs, ftr_lists):
        dsname = str(ds) + '_' + locmask + '_SIS' + ftr

        print('Making datasets ... ')
        t_set = util.Timer()
        new_set = Training(data_path, dsname, ftr_list, lbl_list)
        new_set.make_dataset(year=ds, sample_name=train_locs, test_name=test_locs)
        new_set.normalize_all(feature_norm='mean', target_norm='mean', val_ratio=1.0)

        for queryname in querynames:
            print('Making query dataset for %s' % queryname)
            myquery = Testing(data_path, dsname, query_name=queryname)
            myquery.make_query(loc=queryname + '.csv', hour=hours, month=months)
            myquery.normalize_input()
        t_set.stop(print_wallclock=False)

        # load dataset
        print('\nload dataset')
        my_ds = Dataset(data_path, dsname)
        n_ftrs = len(my_ds.feature_norm.names)

        # make matrices for training and testing; the '0'-suffixed arrays are
        # the values as read, the others are passed through the normalisers
        x0 = util.get_matrix(my_ds.train_X0)
        t0 = util.get_matrix(my_ds.train_T0).reshape((-1,))
        x = my_ds.feature_norm.normalize(x0)
        t = my_ds.target_norm.normalize(t0)

        test_x0 = util.get_matrix(my_ds.test_X0)
        test_t0 = util.get_matrix(my_ds.test_T0).reshape((-1,))
        test_x = my_ds.feature_norm.normalize(test_x0)
        test_t = my_ds.target_norm.normalize(test_t0)

        print('Current iteration: ds = %d, mask = %s, n_ftrs = %d, tree size = %d'
              % (ds, locmask, n_ftrs, forest_size))

        t_fit = util.Timer(start=False)
        t_tr = util.Timer(start=False)
        t_te = util.Timer(start=False)

        # Columns shared by every row of this iteration. Each piece is written
        # as soon as it is known and also remembered in row_prefix, so the
        # shared part can be repeated when a further query needs its own row.
        row_prefix = '%d,%s,%d,%d,%d,' % (ds, t_mask, n_mask, n_ftrs, forest_size)
        _append_to_results(row_prefix)

        try:
            # make a new model
            modelname = 'RF' + str(forest_size) + '_1'
            print('\nadding model')
            my_ds.add_model(modelname)
            tree = RandomForestRegressor(n_estimators=forest_size,
                                         max_depth=max_treedepth,
                                         n_jobs=-1)

            print('\ntree fitting (training)')
            t_fit.start()
            tree.fit(x, t)
            t_fit.stop()

            joblib.dump(tree, os.path.join(my_ds.model_path, 'model.pkl'))
            with open(os.path.join(my_ds.model_path, 'log.txt'), 'w') as log:
                log.write('t_CPU,t_wall\n%f,%f' % (t_fit.cputime, t_fit.walltime))
            fit_cols = '%f,%f,' % (t_fit.cputime, t_fit.walltime)
            _append_to_results(fit_cols)
            row_prefix += fit_cols
            print('\nsaved tree')

            print('\nprediction on training set')
            # the timer covers prediction, rescaling and both MSE evaluations
            t_tr.start()
            y = tree.predict(x)
            tr_mse = mse(t, y)
            y0 = my_ds.target_norm.rescale(y)
            tr_mse_0 = mse(t0, y0)
            t_tr.stop()

            with h5py.File(my_ds.train_Y0, 'w') as h5:
                h5.create_dataset('prediction', data=y0)
            with h5py.File(my_ds.train_Y, 'w') as h5:
                h5.create_dataset('prediction', data=y)
            with open(os.path.join(my_ds.train_path_out, 'log.txt'), 'w') as log:
                log.write('t_CPU,t_wall,mse,mse0\n%f,%f,%f,%f'
                          % (t_tr.cputime, t_tr.walltime, tr_mse, tr_mse_0))
            train_cols = '%f,%f,%f,%f,' % (tr_mse, tr_mse_0, t_tr.cputime, t_tr.walltime)
            _append_to_results(train_cols)
            row_prefix += train_cols

            print('\nprediction on test set')
            t_te.start()
            test_y = tree.predict(test_x)
            te_mse = mse(test_t, test_y)
            test_y0 = my_ds.target_norm.rescale(test_y)
            te_mse_0 = mse(test_t0, test_y0)
            t_te.stop()

            with h5py.File(my_ds.test_Y0, 'w') as h5:
                h5.create_dataset('prediction', data=test_y0)
            with h5py.File(my_ds.test_Y, 'w') as h5:
                h5.create_dataset('prediction', data=test_y)
            with open(os.path.join(my_ds.test_path_out, 'log.txt'), 'w') as log:
                log.write('t_CPU,t_wall,mse,mse0\n%f,%f,%f,%f'
                          % (t_te.cputime, t_te.walltime, te_mse, te_mse_0))
            test_cols = '%f,%f,%f,%f,' % (te_mse, te_mse_0, t_te.cputime, t_te.walltime)
            _append_to_results(test_cols)
            row_prefix += test_cols

            for query_index, queryname in enumerate(querynames):
                t_qu = util.Timer(start=False)
                my_ds.load_query(queryname)
                my_ds.add_query_to_model(queryname)

                query_x0 = util.get_matrix(my_ds.query_X0)
                query_x = my_ds.feature_norm.normalize(query_x0)

                print('\nprediction on query set')
                # only the prediction itself is timed for queries
                t_qu.start()
                query_y = tree.predict(query_x)
                t_qu.stop()

                query_y0 = my_ds.target_norm.rescale(query_y)
                rf_out = my_ds.make_xarray(query_x0, query_y0, rescale=False)

                with h5py.File(my_ds.query_Y0, 'w') as h5:
                    h5.create_dataset('prediction', data=query_y0)
                with h5py.File(my_ds.query_Y, 'w') as h5:
                    h5.create_dataset('prediction', data=query_y)
                with open(os.path.join(my_ds.query_path_out, 'log.txt'), 'w') as log:
                    log.write('t_CPU,t_wall\n%f,%f' % (t_qu.cputime, t_qu.walltime))

                # The CSV header declares a single qu_t_cpu/qu_t_wall/query
                # triple per row, so every query after the first starts a new
                # row that repeats the shared columns instead of being glued
                # onto the previous query's name.
                if query_index > 0:
                    _append_to_results('\n' + row_prefix)
                _append_to_results('%f,%f,%s' % (t_qu.cputime, t_qu.walltime, queryname))

                rf_out.to_netcdf(os.path.join(my_ds.query_path_out, 'prediction.nc'))
        except Exception as err:
            # Best effort: a failing configuration must not abort the whole
            # benchmark, but the cause is reported and Ctrl-C / SystemExit
            # still propagate (they are not subclasses of Exception).
            print('Error occurred during evaluation of forest - skip current iteration')
            print('%s: %s' % (type(err).__name__, err))

        # terminate the (possibly partial) row of this iteration
        _append_to_results('\n')

Event Timeline