RF_tests_2.py
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Sun, Apr 27, 10:17

RF_tests_2.py
View Options

	import numpy as np
	import pandas as pd
	import xarray as xr
	import os
	import hpelm
	import util
	from ds import Dataset
	from sklearn.ensemble import RandomForestRegressor
	from sklearn.metrics import mean_squared_error as mse
	from tables import open_file, Atom, Filters
	from sklearn.externals import joblib
	from meteo_data import Meteo_Reader
	from features import Training, Testing
	import h5py

	# ---------------------------------------------------------------------------
	# Run configuration
	# ---------------------------------------------------------------------------

	# Root folder of the meteo data; the results CSV lives in its 'datasets'
	# sub-folder.
	# Alternative location used on another machine:
	#   data_path = os.path.abspath("/Users/alinawalch/Documents/EPFL/data/meteo")
	data_path = os.path.abspath("/mnt/sda1/hyenergy/data/meteo")

	# Base name (without extension) of the CSV that collects one row per run.
	testname = 'test_rf_2'

	# Location masks to iterate over (alternative: locmasks = ['rand100']).
	locmasks = [
	    'rand500',
	    'rand1000',
	]

	# Label written to the 't_locs' column of every result row.
	t_mask = 'rand'

	# Query location sets evaluated for every fitted forest
	# (alternative: querynames = ['query_locs_13d_2000']).
	querynames = [
	    'query_locs_13d_500',
	    'query_locs_13d_250',
	]

	# Forest hyper-parameters, passed to RandomForestRegressor as n_estimators
	# and max_depth (quick-test alternative: forest_size = 10, max_treedepth = 10).
	forest_size = 500
	max_treedepth = 100

	# Value passed as `year` when the training dataset is built; also the prefix
	# of every dataset name.
	ds = 2001

	# Feature-set suffixes and the feature columns belonging to each suffix;
	# the two sequences are iterated in lockstep.
	ftrs = ['_3D', '_6D']
	ftr_lists = (
	    ['x', 'y', 'z', 'month', 'hour'],
	    ['x', 'y', 'z', 'medDoG', 'big_NS', 'big_EW', 'month', 'hour'],
	)
	lbl_list = ['SIS']

	# Hours 3..19 and months 1..12 (both inclusive) used to build the query sets.
	hours = list(range(3, 20))
	months = list(range(1, 13))

	# Create the results CSV with its header row on the first run only; later
	# runs find the file and just append their rows to it.
	results_csv_path = os.path.join(data_path, 'datasets', testname + '.csv')
	if not os.path.exists(results_csv_path):
	    # Each result row carries one (cpu time, wall time, query name) triple
	    # per entry of `querynames`, so the header declares the same number of
	    # triples. The first triple keeps its original column names; further
	    # triples get a numeric suffix so that column names stay unique.
	    query_columns = ['qu_t_cpu,qu_t_wall,query']
	    query_columns += ['qu_t_cpu_%d,qu_t_wall_%d,query_%d' % (k, k, k)
	                      for k in range(2, len(querynames) + 1)]
	    with open(results_csv_path, 'w') as f:
	        # NOTE(review): the blank after 'dataset,' is kept unchanged so new
	        # files use the same column names as files from earlier runs -
	        # confirm whether downstream readers rely on it.
	        f.write('dataset, t_locs,n_locs,n_features,n_trees,' +
	                'fit_t_cpu,fit_t_wall,tr_mse,tr_mse_0,tr_t_cpu,tr_t_wall,' +
	                'te_mse,te_mse_0,te_t_cpu,te_t_wall,' +
	                ','.join(query_columns) + '\n')

	# Path of the CSV that receives one result row per (location mask, feature
	# set) combination.
	results_csv = os.path.join(data_path, 'datasets', testname + '.csv')


	def _append_result(text):
	    """Append ``text`` verbatim to the shared results CSV.

	    The file is re-opened for every fragment so that everything written so
	    far is on disk even if a later step of the iteration fails.
	    """
	    with open(results_csv, 'a') as f:
	        f.write(text)


	def _save_prediction(path, values):
	    """Store ``values`` as dataset 'prediction' in a newly created HDF5 file.

	    An existing file at ``path`` is overwritten (mode 'w').
	    """
	    with h5py.File(path, 'w') as f:
	        f.create_dataset('prediction', data=values)


	# Main experiment: for every location mask and every feature set, build the
	# datasets, fit a random forest, evaluate it on the training, test and query
	# sets, and log timings and errors.
	for locmask in locmasks:
	    # Everything below depends on the location mask only, so it is computed
	    # once per mask rather than once per feature set.
	    mask = np.loadtxt('locations/%s.txt' % locmask)
	    n_mask = np.sum(mask)  # written to the 'n_locs' column of the result row

	    # location files for the training and the test split
	    train_locs = "locations/" + locmask + "_train.txt"
	    test_locs = "locations/" + locmask + "_test.txt"

	    for ftr, ftr_list in zip(ftrs, ftr_lists):
	        dsname = str(ds) + '_' + locmask + '_SIS' + ftr

	        # ---- build training/test datasets and the query datasets ----------
	        print('Making datasets ... ')
	        t_set = util.Timer()
	        new_set = Training(data_path, dsname, ftr_list, lbl_list)
	        new_set.make_dataset(year=ds, sample_name=train_locs, test_name=test_locs)
	        new_set.normalize_all(feature_norm='mean', target_norm='mean', val_ratio=1.0)
	        for queryname in querynames:
	            print('Making query dataset for %s' % queryname)
	            myquery = Testing(data_path, dsname, query_name=queryname)
	            myquery.make_query(loc=queryname + '.csv', hour=hours, month=months)
	            myquery.normalize_input()
	        t_set.stop(print_wallclock=False)

	        # ---- load dataset ---------------------------------------------------
	        print('\nload dataset')
	        my_ds = Dataset(data_path, dsname)
	        n_ftrs = len(my_ds.feature_norm.names)

	        # Matrices for training and testing. The *0 variables hold the values
	        # as read from the dataset; the others are passed through the
	        # dataset's feature/target normalisers.
	        x0 = util.get_matrix(my_ds.train_X0)
	        t0 = util.get_matrix(my_ds.train_T0).reshape((-1,))
	        x = my_ds.feature_norm.normalize(x0)
	        t = my_ds.target_norm.normalize(t0)

	        test_x0 = util.get_matrix(my_ds.test_X0)
	        test_t0 = util.get_matrix(my_ds.test_T0).reshape((-1,))
	        test_x = my_ds.feature_norm.normalize(test_x0)
	        test_t = my_ds.target_norm.normalize(test_t0)

	        print('Current iteration: ds = %d, mask = %s, n_ftrs = %d, tree size = %d'
	              % (ds, locmask, n_ftrs, forest_size))

	        t_fit = util.Timer(start=False)
	        t_tr = util.Timer(start=False)
	        t_te = util.Timer(start=False)
	        t_qu = util.Timer(start=False)
	        # placeholders, overwritten once the corresponding evaluation succeeds
	        tr_mse = tr_mse_0 = te_mse = te_mse_0 = -1

	        # First columns of the result row; the remaining columns are appended
	        # step by step below, the closing newline after the try block.
	        _append_result('%d,%s,%d,%d,%d,' % (ds, t_mask, n_mask, n_ftrs, forest_size))

	        try:
	            # ---- fit a new model --------------------------------------------
	            modelname = 'RF' + str(forest_size) + '_1'
	            print('\nadding model')
	            my_ds.add_model(modelname)

	            tree = RandomForestRegressor(n_estimators=forest_size,
	                                         max_depth=max_treedepth,
	                                         n_jobs=-1)

	            print('\ntree fitting (training)')
	            t_fit.start()
	            tree.fit(x, t)
	            t_fit.stop()

	            # NOTE(review): joblib is imported from sklearn.externals, which
	            # newer scikit-learn releases no longer provide - confirm the
	            # pinned version or switch to the standalone joblib package.
	            joblib.dump(tree, os.path.join(my_ds.model_path, 'model.pkl'))
	            with open(os.path.join(my_ds.model_path, 'log.txt'), 'w') as f:
	                f.write('t_CPU,t_wall\n%f,%f' % (t_fit.cputime, t_fit.walltime))
	            _append_result('%f,%f,' % (t_fit.cputime, t_fit.walltime))
	            print('\nsaved tree')

	            # ---- training set -----------------------------------------------
	            print('\nprediction on training set')
	            t_tr.start()
	            y = tree.predict(x)
	            tr_mse = mse(t, y)

	            # error after mapping the prediction back with target_norm.rescale
	            y0 = my_ds.target_norm.rescale(y)
	            tr_mse_0 = mse(t0, y0)
	            t_tr.stop()

	            _save_prediction(my_ds.train_Y0, y0)
	            _save_prediction(my_ds.train_Y, y)
	            with open(os.path.join(my_ds.train_path_out, 'log.txt'), 'w') as f:
	                f.write('t_CPU,t_wall,mse,mse0\n%f,%f,%f,%f'
	                        % (t_tr.cputime, t_tr.walltime, tr_mse, tr_mse_0))
	            _append_result('%f,%f,%f,%f,'
	                           % (tr_mse, tr_mse_0, t_tr.cputime, t_tr.walltime))

	            # ---- test set ---------------------------------------------------
	            print('\nprediction on test set')
	            t_te.start()
	            test_y = tree.predict(test_x)
	            te_mse = mse(test_t, test_y)

	            test_y0 = my_ds.target_norm.rescale(test_y)
	            te_mse_0 = mse(test_t0, test_y0)
	            t_te.stop()

	            _save_prediction(my_ds.test_Y0, test_y0)
	            _save_prediction(my_ds.test_Y, test_y)
	            with open(os.path.join(my_ds.test_path_out, 'log.txt'), 'w') as f:
	                f.write('t_CPU,t_wall,mse,mse0\n%f,%f,%f,%f'
	                        % (t_te.cputime, t_te.walltime, te_mse, te_mse_0))
	            _append_result('%f,%f,%f,%f,'
	                           % (te_mse, te_mse_0, t_te.cputime, t_te.walltime))

	            # ---- query sets -------------------------------------------------
	            for q_idx, queryname in enumerate(querynames):
	                t_qu = util.Timer(start=False)

	                my_ds.load_query(queryname)
	                my_ds.add_query_to_model(queryname)

	                query_x0 = util.get_matrix(my_ds.query_X0)
	                query_x = my_ds.feature_norm.normalize(query_x0)

	                print('\nprediction on query set')
	                t_qu.start()
	                query_y = tree.predict(query_x)
	                t_qu.stop()

	                query_y0 = my_ds.target_norm.rescale(query_y)
	                rf_out = my_ds.make_xarray(query_x0, query_y0, rescale=False)

	                _save_prediction(my_ds.query_Y0, query_y0)
	                _save_prediction(my_ds.query_Y, query_y)
	                with open(os.path.join(my_ds.query_path_out, 'log.txt'), 'w') as f:
	                    f.write('t_CPU,t_wall\n%f,%f' % (t_qu.cputime, t_qu.walltime))

	                # Separate successive query records with a comma; without it
	                # the query name of one record runs straight into the CPU
	                # time of the next and the CSV row becomes unparseable.
	                separator = ',' if q_idx else ''
	                _append_result('%s%f,%f,%s'
	                               % (separator, t_qu.cputime, t_qu.walltime, queryname))

	                rf_out.to_netcdf(os.path.join(my_ds.query_path_out, 'prediction.nc'))
	        except Exception as err:
	            # Best effort: a failing configuration must not abort the whole
	            # sweep. Only Exception is caught so that Ctrl-C and SystemExit
	            # still stop the script, and the cause is reported instead of
	            # being discarded.
	            print('Error occurred during evaluation of forest - skip current iteration')
	            print('Cause: %r' % (err,))

	        # Close the result row, whether or not the iteration completed.
	        _append_result('\n')

RF_tests_2.pyNo OneTemporaryActions

File Metadata

RF_tests_2.pyView Options

Event Timeline

RF_tests_2.py
No OneTemporary
Actions

RF_tests_2.py
View Options