ds.py
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Mon, Apr 28, 10:53

ds.py
View Options

	import numpy as np
	import pandas as pd
	import xarray as xr
	import os
	import time
	import h5py
	import util
	from features import Testing
	import norms

	class Dataset:
	    """Keep track of the files that belong to one dataset on disk.

	    The class mostly stores *paths* (as attributes) rather than data.
	    Directory layout used below, relative to ``path``:

	    * ``raw_data/`` and ``locations/``
	    * ``datasets/<ds_name>/train`` and ``datasets/<ds_name>/test``
	    * ``datasets/<ds_name>/query/<query_name>``
	    * ``datasets/<ds_name>/<modelname>/{train,test,query}`` for model
	      outputs (see :meth:`add_model`)

	    Naming convention used for the attributes: ``*_X`` features,
	    ``*_T`` targets, ``*_Y`` predictions.  Attributes with a trailing
	    ``0`` point to ``features.hdf5`` / ``targets.hdf5`` /
	    ``prediction.hdf5``; the ones without point to the files carrying a
	    ``train_`` / ``val_`` / ``test_`` / ``query_`` prefix.
	    NOTE(review): presumably the prefixed files are the normalised
	    versions and the unprefixed ones the rescaled versions - confirm.
	    """

	    def __init__(self, path, ds_name, query_name=None):
	        """Resolve all dataset paths and register the available files.

	        Args:
	            path: Root directory containing ``raw_data``, ``datasets``
	                and ``locations``.
	            ds_name: Name of the dataset folder inside ``datasets``.
	            query_name: Optional name of a query folder to register.

	        If the dataset folder does not exist, a message is printed and
	        the instance is left with only the path attributes set.
	        """
	        self.data_path = os.path.join(path, "raw_data")
	        self.ds_path = os.path.join(path, "datasets", ds_name)
	        self.location_path = os.path.join(path, "locations")
	        self.train_path = os.path.join(self.ds_path, 'train')
	        self.test_path = os.path.join(self.ds_path, 'test')
	        self.query_path = None

	        # Nothing is created here: without the dataset folder there is
	        # nothing to load, so stop after reporting it.
	        if not os.path.exists(self.ds_path):
	            print("Features and targets not yet create - make features first")
	            return

	        self.load_training()
	        self.load_validation()
	        self.load_testing()
	        self.load_metadata()

	        # LOAD TESTING/QUERY DATA
	        if query_name is not None:
	            self.load_query(query_name)

	    ################################################ READ Filenames #######################################################

	    def load_normalization(self):
	        """Create the feature and target normalizers from their CSV files."""
	        self.feature_norm = util.Normalizer(os.path.join(self.ds_path, "norm_features.csv"))
	        self.target_norm = util.Normalizer(os.path.join(self.ds_path, "norm_targets.csv"))

	    def load_metadata(self):
	        """Read the training date range and the training locations.

	        Sets ``train_start`` / ``train_end`` (strings taken from the
	        ``start_date`` / ``end_date`` rows of ``train_dates.csv``) and
	        ``train_locs`` (DataFrame read from ``train_locations.csv``).
	        """
	        dates = pd.read_csv(os.path.join(self.train_path, 'train_dates.csv'), header=None, index_col=0)
	        # ``.values`` is available on every pandas release, whereas
	        # ``as_matrix()`` was removed in pandas 1.0.
	        self.train_start = str(dates.loc['start_date'].values[0])
	        self.train_end = str(dates.loc['end_date'].values[0])

	        self.train_locs = pd.read_csv(os.path.join(self.train_path, 'train_locations.csv'), index_col=0)

	    def load_training(self):
	        """Register the training file paths and load the normalizers.

	        Only done when both ``features.hdf5`` and ``targets.hdf5`` exist
	        in the training folder; otherwise a message is printed.
	        """
	        # LOAD TRAINING DATA
	        features0 = os.path.join(self.train_path, "features.hdf5")
	        targets0 = os.path.join(self.train_path, "targets.hdf5")
	        if not (os.path.exists(features0) and os.path.exists(targets0)):
	            print("Training features and targets not yet created")
	            return

	        self.train_X = os.path.join(self.train_path, "train_features.hdf5")
	        self.train_X0 = features0
	        self.train_T = os.path.join(self.train_path, "train_targets.hdf5")
	        self.train_T0 = targets0
	        self.load_normalization()
	        print("Set training features and targets")

	    def load_validation(self):
	        """Register the validation file paths.

	        The validation files live in the training folder, and their
	        registration is gated on the *training* ``features.hdf5`` /
	        ``targets.hdf5`` being present.
	        """
	        # LOAD VALIDATION DATA
	        features0 = os.path.join(self.train_path, "features.hdf5")
	        targets0 = os.path.join(self.train_path, "targets.hdf5")
	        if not (os.path.exists(features0) and os.path.exists(targets0)):
	            print("Validation features and targets not yet created")
	            return

	        self.val_X = os.path.join(self.train_path, "val_features.hdf5")
	        self.val_T = os.path.join(self.train_path, "val_targets.hdf5")
	        print("Set validation features and targets")
	        self.val_Y = os.path.join(self.train_path, "val_prediction.hdf5")

	    def load_testing(self):
	        """Register the testing file paths.

	        Only done when both ``features.hdf5`` and ``targets.hdf5`` exist
	        in the testing folder; otherwise a message is printed.
	        """
	        # LOAD TESTING DATA
	        features0 = os.path.join(self.test_path, "features.hdf5")
	        targets0 = os.path.join(self.test_path, "targets.hdf5")
	        if not (os.path.exists(features0) and os.path.exists(targets0)):
	            print("Testing features and targets not yet created")
	            return

	        self.test_X = os.path.join(self.test_path, "test_features.hdf5")
	        self.test_X0 = features0
	        self.test_T = os.path.join(self.test_path, "test_targets.hdf5")
	        self.test_T0 = targets0
	        print("Set testing features and targets")

	    def load_query(self, query_name):
	        """Register the feature file paths of the query ``query_name``.

	        ``query_path`` is always set; ``query_X`` / ``query_X0`` only
	        when ``features.hdf5`` exists in the query folder.
	        """
	        self.query_path = os.path.join(self.ds_path, 'query', query_name)
	        # LOAD QUERY DATA
	        features0 = os.path.join(self.query_path, "features.hdf5")
	        if os.path.exists(features0):
	            self.query_X = os.path.join(self.query_path, "query_features.hdf5")
	            self.query_X0 = features0
	            print("Set query features and targets")
	        else:
	            print("Query features and targets not yet created")

	    def rescale_query(self, force_rescaling=False):
	        """Rescale ``query_prediction.hdf5`` in the model's query folder.

	        Reads ``self.query_path_out``, which is only assigned by
	        :meth:`add_model` (with a ``query_name``) or
	        :meth:`add_query_to_model`; one of them must be called first.
	        NOTE(review): the output name passed on is ``'prediction'``
	        while ``query_Y0`` ends in ``prediction.hdf5`` - confirm that
	        ``norms.rescale_hdf5_copy`` handles the extension.
	        """
	        self.rescale_output('query_prediction.hdf5', 'prediction', self.query_path_out,
	                            force_rescaling=force_rescaling)

	    ############################### add model ######################################

	    def _set_query_outputs(self):
	        """Register the query prediction paths if the output folder exists."""
	        if os.path.exists(self.query_path_out):
	            self.query_Y = os.path.join(self.query_path_out, "query_prediction.hdf5")
	            self.query_Y0 = os.path.join(self.query_path_out, "prediction.hdf5")
	        else:
	            print('ERROR: Query not found')

	    def add_model(self, modelname, query_name=None):
	        """Attach a model to the dataset and register its output paths.

	        On first use the folder ``<ds_path>/<modelname>`` is created
	        with ``train``, ``test`` and ``query`` sub-folders, plus one
	        query sub-folder per directory found directly under
	        ``<ds_path>/query``.  If the model folder already exists no
	        folder is created at all.

	        Args:
	            modelname: Name of the model folder inside the dataset folder.
	            query_name: Optional query whose prediction paths should be
	                registered as well.
	        """
	        self.model_path = os.path.join(self.ds_path, modelname)

	        self.train_path_out = os.path.join(self.model_path, 'train')
	        self.test_path_out = os.path.join(self.model_path, 'test')

	        if query_name is not None:
	            self.query_path_out = os.path.join(self.model_path, 'query', query_name)

	        if os.path.exists(self.model_path):
	            print("Model directory already exists - no new folders created")
	        else:
	            os.mkdir(self.model_path)
	            for sub in ('train', 'test', 'query'):
	                os.mkdir(os.path.join(self.model_path, sub))

	            # Only the first level of the dataset's query folder is
	            # mirrored; os.walk yields nothing if that folder is missing.
	            walker = os.walk(os.path.join(self.ds_path, 'query'))
	            _, query_dirs, _ = next(walker, (None, [], []))
	            for directory in query_dirs:
	                os.mkdir(os.path.join(self.model_path, 'query', directory))

	        self.train_Y = os.path.join(self.train_path_out, "train_prediction.hdf5")
	        self.train_Y0 = os.path.join(self.train_path_out, "prediction.hdf5")

	        self.test_Y = os.path.join(self.test_path_out, "test_prediction.hdf5")
	        self.test_Y0 = os.path.join(self.test_path_out, "prediction.hdf5")

	        if query_name is not None:
	            self._set_query_outputs()

	    def add_query_to_model(self, query_name, modelname=None):
	        """Register the prediction paths of a query for a model.

	        Args:
	            query_name: Name of the query folder inside the model's
	                ``query`` folder.
	            modelname: Optional model to switch to.  When omitted the
	                current ``self.model_path`` is used, so a model must
	                have been set before.
	        """
	        if modelname is not None:
	            self.model_path = os.path.join(self.ds_path, modelname)

	        self.query_path_out = os.path.join(self.model_path, 'query', query_name)
	        self._set_query_outputs()

	    ############################### make dataset #####################################

	    def make_xarray(self, x, y, rescale):
	        """Combine features and targets into an xarray object.

	        Args:
	            x: Feature matrix, or a string that is passed to
	                ``util.get_matrix`` to load it.
	            y: Target matrix, or a string that is passed to
	                ``util.get_matrix`` to load it.
	            rescale: If true, ``x`` and ``y`` are passed through the
	                ``rescale`` method of the feature/target normalizer.

	        Returns:
	            The result of ``DataFrame.to_xarray()`` on the table indexed
	            by those of ``x``, ``y``, ``hour``, ``month`` that are
	            feature names; duplicate index entries are averaged.
	        """
	        coords = ['x', 'y', 'hour', 'month']
	        # Keep the order of ``coords``: a plain set intersection would
	        # give an index order that can differ from run to run.
	        available = set(self.feature_norm.names)
	        tbl_idx = [c for c in coords if c in available]

	        if isinstance(x, str):
	            x = util.get_matrix(x)

	        if isinstance(y, str):
	            y = util.get_matrix(y)

	        if rescale:
	            x = self.feature_norm.rescale(x)
	            y = self.target_norm.rescale(y)

	        X = pd.DataFrame(data=x, columns=self.feature_norm.names)
	        Y = pd.DataFrame(data=y, columns=self.target_norm.names)

	        tbl = pd.concat([X, Y], axis=1).set_index(tbl_idx)
	        tbl = tbl.groupby(tbl_idx).mean()  # make table unique

	        return tbl.to_xarray()

	    ###################################################################################

	    def get_matrices(self, variables=('train', 'test', 'val')):
	        """Load the matrices of the requested subsets via ``util.get_matrix``.

	        Args:
	            variables: Iterable with any of ``'train'``, ``'val'``,
	                ``'test'`` and ``'query'``.  ``'query'`` is skipped
	                when no query has been loaded (``query_path`` is None).

	        The results are stored as ``<subset>_x`` (features) and
	        ``<subset>_t`` (targets; not for ``'query'``).
	        """
	        for v in variables:
	            if v == 'train':
	                self.train_x = util.get_matrix(self.train_X)
	                self.train_t = util.get_matrix(self.train_T)
	            elif v == 'val':
	                self.val_x = util.get_matrix(self.val_X)
	                self.val_t = util.get_matrix(self.val_T)
	            elif v == 'test':
	                self.test_x = util.get_matrix(self.test_X)
	                self.test_t = util.get_matrix(self.test_T)
	            elif v == 'query' and self.query_path is not None:
	                self.query_x = util.get_matrix(self.query_X)

	    def rescale_output(self, h5file, target_name, path, set_norm=False, norm_type='none', force_rescaling=False):
	        """Rescale an HDF5 output file with the target normalizer.

	        Args:
	            h5file: Name of the input file inside ``path``.
	            target_name: Name of the output file inside ``path``.
	            path: Folder containing the input file.
	            set_norm: If true, ``norm_type`` is first passed to
	                ``self.target_norm.set_status``.
	            norm_type: Status handed to the normalizer when ``set_norm``
	                is true.
	            force_rescaling: Forwarded to ``norms.rescale_hdf5_copy``.

	        If the input file does not exist a message is printed instead.
	        """
	        infile = os.path.join(path, h5file)
	        outfile = os.path.join(path, target_name)

	        if not os.path.exists(infile):
	            print("Query outputs to rescale do not exist")
	            return

	        if set_norm:
	            self.target_norm.set_status(norm_type)
	        norms.rescale_hdf5_copy(infile, outfile, self.target_norm, force_rescaling=force_rescaling)

ds.pyNo OneTemporaryActions

File Metadata

ds.pyView Options

Event Timeline

ds.py
No OneTemporary
Actions

ds.py
View Options