# features_old.py

import numpy as np
import pandas as pd
import xarray as xr
import os
import time
from meteo_data import Meteo_Reader
import h5py
import util
import json

class Training():
    def __init__(self, path, model_name, features, targets, data_type='meteo'):
        self.data_path = os.path.join(path, "raw_data")
        self.model_path = os.path.join(path, "datasets", model_name)
        self.location_path = os.path.join(path, "locations")
        self.features = features
        self.targets = targets
        self.data_type = data_type
        # Create a new folder if it does not exist yet:
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)

    ################################## TRAINING DATA ##################################
    def make_dataset(self, start_date, end_date, sample_name=None, test_name=None, k_norm=100):
        # Possible adaptation: use a "select" dictionary to make this class
        # usable for other data types.
        # Set up HDF5 files and running normalisation statistics for the
        # features and the targets (open_hdf5 warns before overwriting).
        ftr_file, ftr_ds, ftr_norm = self.open_hdf5('features', self.features, k=k_norm)
        tgt_file, tgt_ds, tgt_norm = self.open_hdf5('targets', self.targets, k=k_norm)
        # Optional test set; no normalisation statistics are collected for it.
        # (The original stub never wired up test_name, so the file names
        # 'features_test'/'targets_test' below are an assumption.)
        ftr_tst_file = tgt_tst_file = ftr_tst_ds = tgt_tst_ds = None
        if test_name is not None:
            ftr_tst_file, ftr_tst_ds = self.open_hdf5('features_test', self.features, make_norm=False)
            tgt_tst_file, tgt_tst_ds = self.open_hdf5('targets_test', self.targets, make_norm=False)
        if self.data_type == 'meteo':
            self.load_meteo(ftr_ds, tgt_ds, ftr_norm, tgt_norm,
                            start_date, end_date,
                            ftrs_tst=ftr_tst_ds, tgts_tst=tgt_tst_ds,
                            sample=sample_name, test=test_name)
        ftr_file.close()
        tgt_file.close()
        if test_name is not None:
            ftr_tst_file.close()
            tgt_tst_file.close()

    ##################################################################################
    def load_meteo(self, ftrs, tgts, ftr_norm, tgt_norm,
                   start_date, end_date,
                   ftrs_tst=None, tgts_tst=None,
                   sample=None, test=None):
        reader = Meteo_Reader(self.data_path)
        locations = 'CH'
        # Read only those variables that appear in the features or the targets.
        vars_to_read = list(set(reader._all_vars) & (set(self.features) | set(self.targets)))
        if sample is not None:
            reader.read_sample(filename=sample, sample_raw=False)
            locations = 'sample'
        if test is not None:
            reader.read_sample(filename=test, sample_name='sample_test', sample_raw=False)
        # Process the data one month at a time.
        month_start = pd.date_range(start_date, end_date, freq='MS')
        month_end = pd.date_range(start_date, end_date, freq='M')
        timer = time.perf_counter()
        for (curr_month, curr_month_end) in zip(month_start, month_end):
            timer1 = time.perf_counter()
            reader.read_data(curr_month, curr_month_end, vars_to_read, reset=True, print_log=False)
            train_data = reader.get_subset()
            # Create feature and target arrays and append them to the HDF5
            # datasets (to_hdf5 also updates the normalisation statistics).
            tbl = reader.make_table(indata=train_data, print_log=False)
            self.to_hdf5(ftrs, tbl, self.features, nrm=ftr_norm)
            self.to_hdf5(tgts, tbl, self.targets, nrm=tgt_norm)
            if test is not None:
                test_data = reader.get_subset(sample_name='sample_test')
                tbl_tst = reader.make_table(indata=test_data, print_log=False)
                self.to_hdf5(ftrs_tst, tbl_tst, self.features, update=False)
                self.to_hdf5(tgts_tst, tbl_tst, self.targets, update=False)
            print("Iteration: %.2f seconds" % (time.perf_counter() - timer1))
        timer = time.perf_counter() - timer
        print("Finished reading data in %.2f seconds" % timer)
        ftr_norm.evaluate()
        tgt_norm.evaluate()
        locs = reader.loc_mask.where(reader.loc_mask[locations] == 1).to_dataframe().dropna().reset_index()
        # Write all auxiliary information (locations, dates, normalisation) to CSV files.
        locs[['lon', 'lat', 'x', 'y']].to_csv(os.path.join(self.model_path, "training_locations.csv"))
        pd.Series({'start_date': start_date, 'end_date': end_date}).to_csv(os.path.join(self.model_path, "training_dates.csv"))
        ftr_norm.make_table(self.features).to_csv(os.path.join(self.model_path, "feature_norm.csv"))
        tgt_norm.make_table(self.targets).to_csv(os.path.join(self.model_path, "target_norm.csv"))
        metadata = {'start_date': start_date, 'end_date': end_date,
                    'norm_features': ftr_norm.make_table(self.features),
                    'norm_targets': tgt_norm.make_table(self.targets),
                    'locations': locs[['lon', 'lat', 'x', 'y']]}
        return metadata

    ################################### AUXILIARIES ###################################
    def to_hdf5(self, obj, intable, cols, update=True, nrm=None):
        # Append the selected columns to an extendable HDF5 dataset and,
        # optionally, update the running normalisation statistics.
        tbl = intable[cols].to_numpy()
        if update:
            nrm.update(tbl)
        idx = obj.shape[0]
        obj.resize(obj.shape[0] + tbl.shape[0], axis=0)
        obj[idx:, :] = tbl

    def open_hdf5(self, name, cols, make_norm=True, k=None):
        # Create (or overwrite) an HDF5 file that holds a single extendable
        # dataset with one column per entry in `cols`.
        filename = os.path.join(self.model_path, name + ".hdf5")
        if os.path.exists(filename):
            print('Overwriting %s...' % name)
        n = len(cols)
        obj = h5py.File(filename, "w")
        ds = obj.create_dataset(name, (0, n), maxshape=(None, n))
        if make_norm:
            norm = util.Norm(n, k)
            return obj, ds, norm
        return obj, ds
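
# Example usage of Training (a minimal sketch; the project path, model name
# and variable names are hypothetical placeholders, not part of the original):
#   trainer = Training('/path/to/project', 'model_v1',
#                      features=['hour', 'month', 'x', 'y'],
#                      targets=['temperature'])
#   trainer.make_dataset('2016-01-01', '2016-12-31', k_norm=100)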

class Testing():
    def __init__(self, path, model_name, data_type='meteo'):
        self.data_path = os.path.join(path, "raw_data")
        self.model_path = os.path.join(path, "datasets", model_name)
        self.location_path = os.path.join(path, "locations")
        self.data_type = data_type
        # The model folder must already exist, i.e. the training data must
        # have been created first.
        if not os.path.exists(self.model_path):
            raise FileNotFoundError("Model does not exist - create training data first")
        self.feature_norm = pd.read_csv(os.path.join(self.model_path, "feature_norm.csv"), index_col=0)
        self.target_norm = pd.read_csv(os.path.join(self.model_path, "target_norm.csv"), index_col=0)
    def meteo_query(self, name, loc, hour=list(range(3, 20)), month=list(range(1, 13)),
                    year=None, input_vars=None, normalise=True):
        """
        Write an HDF5 feature table for query data.
        Inputs:
        name        name of the folder that will contain the query data set
        loc         filename of the location set
        hour, month, year
                    LISTS of values that should be added to the query set
        input_vars  include other variables in the query set (NOT YET IMPLEMENTED)
        normalise   perform normalisation of the data (NOT YET IMPLEMENTED)
        """
        # Check that all prerequisite files are in the correct directory
        if not os.path.exists(os.path.join(self.location_path, loc)):
            print("ERROR: Location file not found - Check %s for %s" % (self.location_path, loc))
            return
        if not os.path.exists(os.path.join(self.model_path, 'features.hdf5')):
            print("ERROR: Training files not found - Check %s" % self.model_path)
            return
        timer = time.perf_counter()
        # Create the dataset according to the features
        pts = pd.read_csv(os.path.join(self.location_path, loc))  # read locations
        pts = pts.reset_index().groupby(['x', 'y']).agg('first').to_xarray().drop('index')  # xarray dataset of locations
        # Add coordinates as required
        if hour is not None:
            pts.coords['hour'] = hour
        if month is not None:
            pts.coords['month'] = month
        if year is not None:
            pts.coords['year'] = year
        pts = pts.to_dataframe().dropna().reset_index()  # turn back into a table of feature points
        # Make the feature table; every trained feature must be present
        missing = [c for c in self.feature_norm.columns if c not in pts.columns]
        if missing:
            print("ERROR: Not all features given as input - missing %s" % missing)
            return
        arr = pts[self.feature_norm.columns].to_numpy()
        # Create files and folders
        self.query_path = os.path.join(self.model_path, name)
        self.features_query = os.path.join(self.query_path, "features_query.hdf5")
        if not os.path.exists(self.query_path):
            os.mkdir(self.query_path)
        # Write to HDF5 (normalisation of the query features is not applied yet)
        ftr_file = h5py.File(self.features_query, "w")
        ftr_file.create_dataset('features', data=arr)
        ftr_file.close()
        timer = time.perf_counter() - timer
        print("Dataset successfully created in %.2f seconds" % timer)

class Table_Object():
    def __init__(self, path, variables, name):
        self.path = path
        self.cols = variables
        self.name = name
        self.n = len(self.cols)

    def add_hdf5(self, add_norm=True, k=None):
        # Create (or overwrite) the HDF5 file backing this table, with one
        # extendable dataset and, optionally, a normalisation helper.
        filename = os.path.join(self.path, self.name + ".hdf5")
        if os.path.exists(filename):
            print('Overwriting %s...' % self.name)
        self.file = h5py.File(filename, "w")
        self.ds = self.file.create_dataset(self.name, (0, self.n), maxshape=(None, self.n))
        if add_norm:
            self.norm = util.Norm(self.n, k)

    def close_file(self):
        self.file.close()
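
# A minimal usage sketch for the query side, guarded so it only runs when the
# module is executed directly. The project path, model name, file names and
# variable list are hypothetical placeholders, and the training step (see the
# Training example above) is assumed to have been run already.
if __name__ == '__main__':
    project_path = '/path/to/project'   # hypothetical project layout
    # Build a query feature table for a stored location set.
    tester = Testing(project_path, 'model_v1')
    tester.meteo_query('query_2017', 'locations.csv', year=[2017])
    # Stand-alone extendable HDF5 table via Table_Object.
    tbl = Table_Object(project_path, ['hour', 'month', 'x', 'y'], 'scratch_table')
    tbl.add_hdf5(add_norm=True, k=100)
    tbl.close_file()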
