# ds_old.py

import numpy as np
import pandas as pd
import xarray as xr
import os
import time
import h5py
import util
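
# NOTE: Radiation_Reader (used in make_dataset below) is not imported in this
# file; it must be importable from its defining module, for example:
#   from radiation_reader import Radiation_Reader   # hypothetical module name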


class Dataset:

    def __init__(self, path, model_name, query_name=None):
        self.data_path = os.path.join(path, "raw_data")
        self.model_path = os.path.join(path, "datasets", model_name)
        self.location_path = os.path.join(path, "locations")
        # Create the model folder if it does not exist yet:
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)
        # LOAD TRAINING DATA
        if os.path.exists(os.path.join(self.model_path, "features.hdf5")):
            self.features = os.path.join(self.model_path, "features.hdf5")
            self.targets = os.path.join(self.model_path, "targets.hdf5")
            self.load_normalization()
            print("Set features and targets")
        else:
            print("Features and targets not yet created - run make_dataset before modelling")
            return
        # LOAD TESTING/QUERY DATA
        # get_query checks whether the query feature file exists and warns if
        # it has not been created yet, so no separate path check is needed here.
        if query_name is not None:
            self.get_query(query_name)
    ####################################################### QUERY DATA #######################################################

    def get_query(self, query_name):
        """
        check for hdf5 feature table for query data
        Inputs:
        query_name    name of the folder that contains the query data set
        """
        self.query_path = os.path.join(self.model_path, query_name)
        self.features_query = os.path.join(self.query_path, "features_query.hdf5")
        if not os.path.exists(self.features_query):
            print("Query features not yet created - run make_query before modelling")
        else:
            print("Set %s as query feature set" % self.features_query)
    def make_query(self, name, loc, hour=list(range(3, 20)), month=list(range(1, 13)), year=None, input_vars=None, normalise=True):
        """
        write hdf5 feature table for query data
        Inputs:
        name               name of the folder that contains the query data set
        loc                filename of the location set
        hour, month, year  LISTS of values that should be added to the query set
        input_vars         include other variables - NOT YET IMPLEMENTED
        normalise          perform normalisation of the data
        """
        # Check that all prerequisite files are in the correct directory
        if not os.path.exists(os.path.join(self.location_path, loc)):
            print("ERROR: Location file not found - Check %s for %s" % (self.location_path, loc))
            return
        if not os.path.exists(os.path.join(self.model_path, 'features.hdf5')):
            print("ERROR: Training files not found - Check %s" % (self.model_path))
            return
        timer = time.perf_counter()
        # Create dataset according to features
        pts = pd.read_csv(os.path.join(self.location_path, loc))  # read locations
        pts = pts.reset_index().groupby(['x', 'y']).agg('first').to_xarray().drop('index')  # create xarray dataset from locations
        # Add the requested time coordinates
        if hour is not None:
            pts.coords['hour'] = hour
        if month is not None:
            pts.coords['month'] = month
        if year is not None:
            pts.coords['year'] = year
        pts = pts.to_dataframe().dropna().reset_index()  # turn back into a table of feature points
        # Make the feature table, checking that all training features are present
        missing = [c for c in self.feature_norm.columns if c not in pts.columns]
        if missing:
            print("ERROR: Not all features given as input - missing %s" % missing)
            return
        arr = pts[list(self.feature_norm.columns)].values
        # Create files and folders
        self.query_path = os.path.join(self.model_path, name)
        self.features_query = os.path.join(self.query_path, "features_query.hdf5")
        if not os.path.exists(self.query_path):
            os.mkdir(self.query_path)
        # Normalise with the training statistics and write to HDF5
        ftr_file = h5py.File(self.features_query, "w")
        ftr_ds = ftr_file.create_dataset('features', data=arr)
        if normalise:
            ftr_ds[:, :], _, _ = util.normalize(ftr_ds, Xnorm=self.feature_norm.values)
        ftr_file.close()
        timer = time.perf_counter() - timer
        print("Dataset successfully created in %.2f seconds" % (timer))
    ################################################### TRAINING DATA ########################################################

    def make_dataset(self, features, targets, start_date, end_date, variables_to_read, raw_format='H', sample_name=None, norm_features="mean", norm_targets="mean", append=False):
        """
        Create HDF5 tables of training features and targets (4 files: features.hdf5, targets.hdf5, feature_norm.csv, target_norm.csv)
        Inputs:
        features, targets              lists of column names to use as features and as targets
        start_date, end_date           range of dates to be considered
        variables_to_read, raw_format  list of data variables (SIS, SISDIR, SISCF, ALB etc.) & format in which data is read
        sample_name                    name of file containing a subsample (if None, all locations are considered)
        norm_features, norm_targets    desired type of normalisation for features and for targets; one of ['range', 'mean', 'none']
        append                         True: add data to end of file - NOT YET IMPLEMENTED!!!!
        """
        # Set up reader for raw data
        reader = Radiation_Reader(self.data_path, variables=variables_to_read, data_format=raw_format, split_raw=True)
        locations = 'CH'
        if sample_name is not None:
            reader.read_sample(filename=sample_name)
            locations = 'sample'
        month_start = pd.date_range(start_date, end_date, freq='MS')
        month_end = pd.date_range(start_date, end_date, freq='M')
        # Set up HDF5 files for features and targets
        self.features = os.path.join(self.model_path, "features.hdf5")
        self.targets = os.path.join(self.model_path, "targets.hdf5")
        n_f = len(features)
        n_t = len(targets)
        ftr_file = h5py.File(self.features, "w")
        tgt_file = h5py.File(self.targets, "w")
        ftr_ds = ftr_file.create_dataset('features', (0, n_f), maxshape=(None, n_f))
        tgt_ds = tgt_file.create_dataset('targets', (0, n_t), maxshape=(None, n_t))
        timer = time.perf_counter()
        for (curr_month, curr_month_end) in zip(month_start, month_end):
            # READ DATA
            reader.add_data(curr_month, curr_month_end, reset=True, print_log=False)
            # CREATE FEATURE AND TARGET ARRAYS
            tbl = reader.make_table(print_log=False)
            ftrs = tbl[features].values
            tgts = tbl[targets].values
            # APPEND TO THE RESIZABLE HDF5 DATASETS
            idx = ftr_ds.shape[0]
            ftr_ds.resize(ftr_ds.shape[0] + ftrs.shape[0], axis=0)
            ftr_ds[idx:, :] = ftrs
            tgt_ds.resize(tgt_ds.shape[0] + tgts.shape[0], axis=0)
            tgt_ds[idx:, :] = tgts
        timer = time.perf_counter() - timer
        print("Finished reading data in %.2f seconds" % (timer))
        # Normalise data (could also standardise to mean and std deviation by using util.standardize)
        ftr_ds[:, :], ftr_norm, ftr_idx = self.normalise_training(ftr_ds, norm_features)
        self.feature_norm = pd.DataFrame(data=ftr_norm, columns=features, index=ftr_idx)
        tgt_ds[:, :], tgt_norm, tgt_idx = self.normalise_training(tgt_ds, norm_targets)
        self.target_norm = pd.DataFrame(data=tgt_norm, columns=targets, index=tgt_idx)
        # Close HDF5 files
        ftr_file.close()
        tgt_file.close()
        # Write all auxiliary files
        locs = reader.loc_mask.where(reader.loc_mask[locations] == 1).to_dataframe().dropna().reset_index()
        locs[['lon', 'lat', 'x', 'y']].to_csv(os.path.join(self.model_path, "training_locations.csv"))
        pd.Series({'start_date': start_date, 'end_date': end_date, 'format': raw_format}).to_csv(os.path.join(self.model_path, "training_dates.csv"))
        self.feature_norm.to_csv(os.path.join(self.model_path, "feature_norm.csv"))
        self.target_norm.to_csv(os.path.join(self.model_path, "target_norm.csv"))
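    # NOTE: append=True is not implemented. The resizable datasets above
    # (maxshape=(None, n)) would in principle support it by opening the HDF5
    # files in "a" mode and extending them instead of recreating them.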
    ################################################ READ NORMALISATION #######################################################

    def load_normalization(self):
        self.feature_norm = pd.read_csv(os.path.join(self.model_path, "feature_norm.csv"), index_col=0)
        self.target_norm = pd.read_csv(os.path.join(self.model_path, "target_norm.csv"), index_col=0)
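    # The util helpers are assumed, from their use here and in make_query, to
    # follow these signatures (not part of this file):
    #   util.normalize(X, Xnorm=None) -> (X_scaled, Xmax, Xmin)   # min/max scaling
    #   util.standardize(X)           -> (X_scaled, Xmean, Xstd)  # z-scoring
    # where Xnorm lets precomputed training statistics be reused.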
    def normalise_training(self, X, norm):
        if norm == "range":
            Xnorm, Xmax, Xmin = util.normalize(X)
            norm_index = ['max', 'min']
            norm_values = [Xmax, Xmin]
        elif norm == "mean":
            Xnorm, Xmean, Xstd = util.standardize(X)
            norm_index = ['mean', 'std']
            norm_values = [Xmean, Xstd]
        else:
            Xnorm = X
            norm_values = []
            norm_index = []
        return Xnorm, norm_values, norm_index
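

# Minimal usage sketch. Everything below is illustrative: the project root,
# model name, location file and variable/feature names are assumptions, not
# part of the original file, and the directories must already exist.
if __name__ == "__main__":
    BASE = "/path/to/project"  # hypothetical project root
    ds = Dataset(BASE, "model_v1")
    # Build the training tables once (writes features.hdf5, targets.hdf5,
    # feature_norm.csv and target_norm.csv into BASE/datasets/model_v1):
    ds.make_dataset(features=['hour', 'month', 'x', 'y'],
                    targets=['SIS'],
                    start_date='2015-01-01', end_date='2015-12-31',
                    variables_to_read=['SIS'])
    # Build a query feature set on a location grid, normalised with the
    # training statistics:
    ds.make_query("query_2016", "grid.csv", year=[2016])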
