Page MenuHomec4science

ds.py
No OneTemporary

File Metadata

Created
Mon, Apr 28, 10:53
import numpy as np
import pandas as pd
import xarray as xr
import os
import time
import h5py
import util
from features import Testing
import norms
class Dataset:
    """Bookkeeping of the file locations that make up one dataset.

    The object does not hold the data itself: almost every attribute is a
    path (built with ``os.path.join``) to an HDF5/CSV file below
    ``<path>/datasets/<ds_name>``.  Matrices are only read on demand by
    :meth:`get_matrices` and :meth:`make_xarray`.

    Layout assumed by the code below::

        <path>/raw_data
        <path>/locations
        <path>/datasets/<ds_name>/norm_features.csv, norm_targets.csv
        <path>/datasets/<ds_name>/train/...
        <path>/datasets/<ds_name>/test/...
        <path>/datasets/<ds_name>/query/<query_name>/...
        <path>/datasets/<ds_name>/<modelname>/{train,test,query/<query_name>}
    """

    def __init__(self, path, ds_name, query_name=None):
        """Resolve the dataset folders and register the files found there.

        Args:
            path: project root containing ``raw_data``, ``datasets`` and
                ``locations``.
            ds_name: name of the dataset folder below ``<path>/datasets``.
            query_name: optional name of a query set to register as well.

        If the dataset folder does not exist, a message is printed and the
        constructor returns early; only the path attributes set up to that
        point are then available on the instance.
        """
        self.data_path = os.path.join(path, "raw_data")
        self.ds_path = os.path.join(path, "datasets", ds_name)
        self.location_path = os.path.join(path, "locations")
        self.train_path = os.path.join(self.ds_path, 'train')
        self.test_path = os.path.join(self.ds_path, 'test')
        self.query_path = None

        # Nothing is created here: without an existing dataset folder there
        # is nothing to register, so stop before touching any file.
        if not os.path.exists(self.ds_path):
            print("Features and targets not yet create - make features first")
            return

        self.load_training()
        self.load_validation()
        self.load_testing()
        self.load_metadata()

        # Query data is optional.
        if query_name is not None:
            self.load_query(query_name)

    # ------------------------------------------------------------------
    # Read file names
    # ------------------------------------------------------------------
    def load_normalization(self):
        """Create the feature/target normalizers from the dataset's CSV files."""
        self.feature_norm = util.Normalizer(os.path.join(self.ds_path, "norm_features.csv"))
        self.target_norm = util.Normalizer(os.path.join(self.ds_path, "norm_targets.csv"))

    def load_metadata(self):
        """Read the training period and training locations from CSV.

        ``train_dates.csv`` is read without a header and indexed by its first
        column; the rows labelled ``start_date`` and ``end_date`` provide
        ``train_start`` / ``train_end`` (stored as strings).
        ``train_locations.csv`` is stored as a DataFrame in ``train_locs``.
        """
        dates = pd.read_csv(
            os.path.join(self.train_path, 'train_dates.csv'),
            header=None,
            index_col=0,
        )
        # ``Series.as_matrix()`` was removed in pandas 1.0; ``.values`` yields
        # the same ndarray on both old and current pandas versions.
        self.train_start = str(dates.loc['start_date'].values[0])
        self.train_end = str(dates.loc['end_date'].values[0])
        self.train_locs = pd.read_csv(
            os.path.join(self.train_path, 'train_locations.csv'),
            index_col=0,
        )

    def load_training(self):
        """Register the training file paths and load the normalizers.

        Only acts when both ``features.hdf5`` and ``targets.hdf5`` exist in
        the training folder; otherwise a message is printed and no attribute
        is set.
        """
        raw_features = os.path.join(self.train_path, "features.hdf5")
        raw_targets = os.path.join(self.train_path, "targets.hdf5")
        if os.path.exists(raw_features) and os.path.exists(raw_targets):
            self.train_X = os.path.join(self.train_path, "train_features.hdf5")
            self.train_X0 = raw_features
            self.train_T = os.path.join(self.train_path, "train_targets.hdf5")
            self.train_T0 = raw_targets
            self.load_normalization()
            print("Set training features and targets")
        else:
            print("Training features and targets not yet created")
            return

    def load_validation(self):
        """Register the validation file paths.

        The validation files live in the training folder, and their
        registration is gated on the same ``features.hdf5`` /
        ``targets.hdf5`` files as the training data.
        """
        raw_features = os.path.join(self.train_path, "features.hdf5")
        raw_targets = os.path.join(self.train_path, "targets.hdf5")
        if os.path.exists(raw_features) and os.path.exists(raw_targets):
            self.val_X = os.path.join(self.train_path, "val_features.hdf5")
            self.val_T = os.path.join(self.train_path, "val_targets.hdf5")
            print("Set validation features and targets")
            self.val_Y = os.path.join(self.train_path, "val_prediction.hdf5")
        else:
            print("Validation features and targets not yet created")

    def load_testing(self):
        """Register the testing file paths if the raw test files exist."""
        raw_features = os.path.join(self.test_path, "features.hdf5")
        raw_targets = os.path.join(self.test_path, "targets.hdf5")
        if os.path.exists(raw_features) and os.path.exists(raw_targets):
            self.test_X = os.path.join(self.test_path, "test_features.hdf5")
            self.test_X0 = raw_features
            self.test_T = os.path.join(self.test_path, "test_targets.hdf5")
            self.test_T0 = raw_targets
            print("Set testing features and targets")
        else:
            print("Testing features and targets not yet created")

    def load_query(self, query_name):
        """Register the feature file paths of the query set ``query_name``.

        ``query_path`` is always set; ``query_X`` / ``query_X0`` only when
        ``features.hdf5`` exists in the query folder.
        """
        self.query_path = os.path.join(self.ds_path, 'query', query_name)
        raw_features = os.path.join(self.query_path, "features.hdf5")
        if os.path.exists(raw_features):
            self.query_X = os.path.join(self.query_path, "query_features.hdf5")
            self.query_X0 = raw_features
            print("Set query features and targets")
        else:
            print("Query features and targets not yet created")

    def rescale_query(self, force_rescaling=False):
        """Rescale ``query_prediction.hdf5`` in the model's query folder.

        Reads ``self.query_path_out``, which is only set by :meth:`add_model`
        (with a query name) or :meth:`add_query_to_model`.
        """
        self.rescale_output(
            'query_prediction.hdf5',
            'prediction',
            self.query_path_out,
            force_rescaling=force_rescaling,
        )

    # ------------------------------------------------------------------
    # Add model
    # ------------------------------------------------------------------
    def _set_query_outputs(self, query_name):
        """Point ``query_path_out`` at the model's folder for ``query_name``
        and, if that folder exists, register the prediction file paths."""
        self.query_path_out = os.path.join(self.model_path, 'query', query_name)
        if os.path.exists(self.query_path_out):
            self.query_Y = os.path.join(self.query_path_out, "query_prediction.hdf5")
            self.query_Y0 = os.path.join(self.query_path_out, "prediction.hdf5")
        else:
            print('ERROR: Query not found')

    def add_model(self, modelname, query_name=None):
        """Attach a model's output folders to the dataset.

        If ``<ds_path>/<modelname>`` does not exist yet it is created with
        ``train``, ``test`` and ``query`` sub-folders, plus one folder below
        ``query`` for every directory directly inside ``<ds_path>/query``.
        An existing model folder is left untouched.

        Args:
            modelname: name of the model folder below the dataset folder.
            query_name: optional query set whose prediction paths should be
                registered as well.
        """
        self.model_path = os.path.join(self.ds_path, modelname)
        self.train_path_out = os.path.join(self.model_path, 'train')
        self.test_path_out = os.path.join(self.model_path, 'test')

        if os.path.exists(self.model_path):
            print("Model directory already exists - no new folders created")
        else:
            os.mkdir(self.model_path)
            for sub in ('train', 'test', 'query'):
                os.mkdir(os.path.join(self.model_path, sub))
            # Only the first os.walk() entry is needed: it lists the
            # directories directly below the dataset's query folder.
            _, query_dirs, _ = next(
                os.walk(os.path.join(self.ds_path, 'query')), (None, [], [])
            )
            for directory in query_dirs:
                os.mkdir(os.path.join(self.model_path, 'query', directory))

        self.train_Y = os.path.join(self.train_path_out, "train_prediction.hdf5")
        self.train_Y0 = os.path.join(self.train_path_out, "prediction.hdf5")
        self.test_Y = os.path.join(self.test_path_out, "test_prediction.hdf5")
        self.test_Y0 = os.path.join(self.test_path_out, "prediction.hdf5")

        if query_name is not None:
            self._set_query_outputs(query_name)

    def add_query_to_model(self, query_name, modelname=None):
        """Register the prediction paths of ``query_name`` for a model.

        Args:
            query_name: name of the query folder below ``<model>/query``.
            modelname: if given, switches ``model_path`` to this model first;
                otherwise the ``model_path`` set by an earlier call is used.
        """
        if modelname is not None:
            self.model_path = os.path.join(self.ds_path, modelname)
        self._set_query_outputs(query_name)

    # ------------------------------------------------------------------
    # Make dataset
    # ------------------------------------------------------------------
    def make_xarray(self, x, y, rescale):
        """Combine features and targets into one xarray object.

        Args:
            x: feature matrix, or a file path (str) passed to
                ``util.get_matrix``.
            y: target matrix, or a file path (str) passed to
                ``util.get_matrix``.
            rescale: if true, both matrices go through the ``rescale`` method
                of the corresponding normalizer first.

        Returns:
            ``DataFrame.to_xarray()`` of the joined table, indexed by those of
            ``x``, ``y``, ``hour``, ``month`` that are feature names; rows
            sharing an index are averaged.
        """
        coords = ['x', 'y', 'hour', 'month']
        # Keep the fixed order of ``coords``: a set intersection would make
        # the dimension order of the result vary between interpreter runs.
        feature_names = set(self.feature_norm.names)
        tbl_idx = [name for name in coords if name in feature_names]

        if isinstance(x, str):
            x = util.get_matrix(x)
        if isinstance(y, str):
            y = util.get_matrix(y)
        if rescale:
            x = self.feature_norm.rescale(x)
            y = self.target_norm.rescale(y)

        X = pd.DataFrame(data=x, columns=self.feature_norm.names)
        Y = pd.DataFrame(data=y, columns=self.target_norm.names)
        tbl = pd.concat([X, Y], axis=1)
        tbl.set_index(tbl_idx, inplace=True)
        tbl = tbl.groupby(tbl_idx).mean()  # make table unique
        return tbl.to_xarray()

    def get_matrices(self, variables=('train', 'test', 'val')):
        """Load the requested matrices into memory via ``util.get_matrix``.

        Args:
            variables: iterable of ``'train'``, ``'val'``, ``'test'`` and/or
                ``'query'``.  Other names are ignored; ``'query'`` is ignored
                while ``query_path`` is ``None``.

        Sets ``<name>_x`` and, except for ``'query'``, ``<name>_t``.
        """
        for v in variables:
            if v == 'train':
                self.train_x = util.get_matrix(self.train_X)
                self.train_t = util.get_matrix(self.train_T)
            elif v == 'val':
                self.val_x = util.get_matrix(self.val_X)
                self.val_t = util.get_matrix(self.val_T)
            elif v == 'test':
                self.test_x = util.get_matrix(self.test_X)
                self.test_t = util.get_matrix(self.test_T)
            elif v == 'query' and self.query_path is not None:
                self.query_x = util.get_matrix(self.query_X)

    def rescale_output(self, h5file, target_name, path, set_norm=False, norm_type='none', force_rescaling=False):
        """Rescale an output file with the target normalizer.

        Args:
            h5file: name of the input file inside ``path``.
            target_name: name of the output inside ``path``.
            path: folder containing the input and receiving the output.
            set_norm: if true, ``target_norm.set_status(norm_type)`` is
                called before rescaling.
            norm_type: value handed to ``set_status`` when ``set_norm`` is set.
            force_rescaling: forwarded to ``norms.rescale_hdf5_copy``.

        Prints a message and does nothing when the input file is missing.
        """
        infile = os.path.join(path, h5file)
        outfile = os.path.join(path, target_name)
        if os.path.exists(infile):
            if set_norm:
                self.target_norm.set_status(norm_type)
            norms.rescale_hdf5_copy(infile, outfile, self.target_norm, force_rescaling=force_rescaling)
        else:
            print("Query outputs to rescale do not exist")

Event Timeline