# features_old.py

import numpy as np
import pandas as pd
import xarray as xr
import os
import time
from meteo_data import Meteo_Reader
import h5py
import util
import json

class Training():
    def __init__(self, path, model_name, features, targets, data_type='meteo'):
        self.data_path = os.path.join(path, "raw_data")
        self.model_path = os.path.join(path, "datasets", model_name)
        self.location_path = os.path.join(path, "locations")
        self.features = features
        self.targets = targets
        self.data_type = data_type
        # Create a new folder if it does not exist yet:
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)

    ################################## TRAINING DATA ##################################
    def make_dataset(self, start_date, end_date, sample_name=None, test_name=None, k_norm=100):
        # Possible adaptation: use a "select" dictionary to make this class
        # usable for other data types.
        # Set up HDF5 files and running normalisation statistics for the
        # features and the targets (open_hdf5 warns before overwriting).
        ftr_file, ftr_ds, ftr_norm = self.open_hdf5('features', self.features, k=k_norm)
        tgt_file, tgt_ds, tgt_norm = self.open_hdf5('targets', self.targets, k=k_norm)
        # Optional test set; no normalisation statistics are collected for it.
        # (The original stub never wired up test_name, so the file names
        # 'features_test'/'targets_test' below are an assumption.)
        ftr_tst_file = tgt_tst_file = ftr_tst_ds = tgt_tst_ds = None
        if test_name is not None:
            ftr_tst_file, ftr_tst_ds = self.open_hdf5('features_test', self.features, make_norm=False)
            tgt_tst_file, tgt_tst_ds = self.open_hdf5('targets_test', self.targets, make_norm=False)
        if self.data_type == 'meteo':
            self.load_meteo(ftr_ds, tgt_ds, ftr_norm, tgt_norm,
                            start_date, end_date,
                            ftrs_tst=ftr_tst_ds, tgts_tst=tgt_tst_ds,
                            sample=sample_name, test=test_name)
        ftr_file.close()
        tgt_file.close()
        if test_name is not None:
            ftr_tst_file.close()
            tgt_tst_file.close()

    ##################################################################################
    def load_meteo(self, ftrs, tgts, ftr_norm, tgt_norm,
                   start_date, end_date,
                   ftrs_tst=None, tgts_tst=None,
                   sample=None, test=None):
        reader = Meteo_Reader(self.data_path)
        locations = 'CH'
        # Read only those variables that appear in the features or the targets.
        vars_to_read = list(set(reader._all_vars) & (set(self.features) | set(self.targets)))
        if sample is not None:
            reader.read_sample(filename=sample, sample_raw=False)
            locations = 'sample'
        if test is not None:
            reader.read_sample(filename=test, sample_name='sample_test', sample_raw=False)
        # Process the data one month at a time.
        month_start = pd.date_range(start_date, end_date, freq='MS')
        month_end = pd.date_range(start_date, end_date, freq='M')
        timer = time.perf_counter()
        for (curr_month, curr_month_end) in zip(month_start, month_end):
            timer1 = time.perf_counter()
            reader.read_data(curr_month, curr_month_end, vars_to_read, reset=True, print_log=False)
            train_data = reader.get_subset()
            # Create feature and target arrays and append them to the HDF5
            # datasets (to_hdf5 also updates the normalisation statistics).
            tbl = reader.make_table(indata=train_data, print_log=False)
            self.to_hdf5(ftrs, tbl, self.features, nrm=ftr_norm)
            self.to_hdf5(tgts, tbl, self.targets, nrm=tgt_norm)
            if test is not None:
                test_data = reader.get_subset(sample_name='sample_test')
                tbl_tst = reader.make_table(indata=test_data, print_log=False)
                self.to_hdf5(ftrs_tst, tbl_tst, self.features, update=False)
                self.to_hdf5(tgts_tst, tbl_tst, self.targets, update=False)
            print("Iteration: %.2f seconds" % (time.perf_counter() - timer1))
        timer = time.perf_counter() - timer
        print("Finished reading data in %.2f seconds" % timer)
        ftr_norm.evaluate()
        tgt_norm.evaluate()
        locs = reader.loc_mask.where(reader.loc_mask[locations] == 1).to_dataframe().dropna().reset_index()
        # Write all auxiliary information (locations, dates, normalisation) to CSV files.
        locs[['lon', 'lat', 'x', 'y']].to_csv(os.path.join(self.model_path, "training_locations.csv"))
        pd.Series({'start_date': start_date, 'end_date': end_date}).to_csv(os.path.join(self.model_path, "training_dates.csv"))
        ftr_norm.make_table(self.features).to_csv(os.path.join(self.model_path, "feature_norm.csv"))
        tgt_norm.make_table(self.targets).to_csv(os.path.join(self.model_path, "target_norm.csv"))
        metadata = {'start_date': start_date, 'end_date': end_date,
                    'norm_features': ftr_norm.make_table(self.features),
                    'norm_targets': tgt_norm.make_table(self.targets),
                    'locations': locs[['lon', 'lat', 'x', 'y']]}
        return metadata

    ################################### AUXILIARIES ###################################
    def to_hdf5(self, obj, intable, cols, update=True, nrm=None):
        # Append the selected columns to an extendable HDF5 dataset and,
        # optionally, update the running normalisation statistics.
        tbl = intable[cols].to_numpy()
        if update:
            nrm.update(tbl)
        idx = obj.shape[0]
        obj.resize(obj.shape[0] + tbl.shape[0], axis=0)
        obj[idx:, :] = tbl

    def open_hdf5(self, name, cols, make_norm=True, k=None):
        # Create (or overwrite) an HDF5 file that holds a single extendable
        # dataset with one column per entry in `cols`.
        filename = os.path.join(self.model_path, name + ".hdf5")
        if os.path.exists(filename):
            print('Overwriting %s...' % name)
        n = len(cols)
        obj = h5py.File(filename, "w")
        ds = obj.create_dataset(name, (0, n), maxshape=(None, n))
        if make_norm:
            norm = util.Norm(n, k)
            return obj, ds, norm
        return obj, ds
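
# Example usage of Training (a minimal sketch; the project path, model name
# and variable names are hypothetical placeholders, not part of the original):
#   trainer = Training('/path/to/project', 'model_v1',
#                      features=['hour', 'month', 'x', 'y'],
#                      targets=['temperature'])
#   trainer.make_dataset('2016-01-01', '2016-12-31', k_norm=100)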

class Testing():
    def __init__(self, path, model_name, data_type='meteo'):
        self.data_path = os.path.join(path, "raw_data")
        self.model_path = os.path.join(path, "datasets", model_name)
        self.location_path = os.path.join(path, "locations")
        self.data_type = data_type
        # The model folder must already exist, i.e. the training data must
        # have been created first.
        if not os.path.exists(self.model_path):
            raise FileNotFoundError("Model does not exist - create training data first")
        self.feature_norm = pd.read_csv(os.path.join(self.model_path, "feature_norm.csv"), index_col=0)
        self.target_norm = pd.read_csv(os.path.join(self.model_path, "target_norm.csv"), index_col=0)
    def meteo_query(self, name, loc, hour=list(range(3, 20)), month=list(range(1, 13)),
                    year=None, input_vars=None, normalise=True):
        """
        Write an HDF5 feature table for query data.
        Inputs:
        name        name of the folder that will contain the query data set
        loc         filename of the location set
        hour, month, year
                    LISTS of values that should be added to the query set
        input_vars  include other variables in the query set (NOT YET IMPLEMENTED)
        normalise   perform normalisation of the data (NOT YET IMPLEMENTED)
        """
        # Check that all prerequisite files are in the correct directory
        if not os.path.exists(os.path.join(self.location_path, loc)):
            print("ERROR: Location file not found - Check %s for %s" % (self.location_path, loc))
            return
        if not os.path.exists(os.path.join(self.model_path, 'features.hdf5')):
            print("ERROR: Training files not found - Check %s" % self.model_path)
            return
        timer = time.perf_counter()
        # Create the dataset according to the features
        pts = pd.read_csv(os.path.join(self.location_path, loc))  # read locations
        pts = pts.reset_index().groupby(['x', 'y']).agg('first').to_xarray().drop('index')  # xarray dataset of locations
        # Add coordinates as required
        if hour is not None:
            pts.coords['hour'] = hour
        if month is not None:
            pts.coords['month'] = month
        if year is not None:
            pts.coords['year'] = year
        pts = pts.to_dataframe().dropna().reset_index()  # turn back into a table of feature points
        # Make the feature table; every trained feature must be present
        missing = [c for c in self.feature_norm.columns if c not in pts.columns]
        if missing:
            print("ERROR: Not all features given as input - missing %s" % missing)
            return
        arr = pts[self.feature_norm.columns].to_numpy()
        # Create files and folders
        self.query_path = os.path.join(self.model_path, name)
        self.features_query = os.path.join(self.query_path, "features_query.hdf5")
        if not os.path.exists(self.query_path):
            os.mkdir(self.query_path)
        # Write to HDF5 (normalisation of the query features is not applied yet)
        ftr_file = h5py.File(self.features_query, "w")
        ftr_file.create_dataset('features', data=arr)
        ftr_file.close()
        timer = time.perf_counter() - timer
        print("Dataset successfully created in %.2f seconds" % timer)

class Table_Object():
    def __init__(self, path, variables, name):
        self.path = path
        self.cols = variables
        self.name = name
        self.n = len(self.cols)

    def add_hdf5(self, add_norm=True, k=None):
        # Create (or overwrite) the HDF5 file backing this table, with one
        # extendable dataset and, optionally, a normalisation helper.
        filename = os.path.join(self.path, self.name + ".hdf5")
        if os.path.exists(filename):
            print('Overwriting %s...' % self.name)
        self.file = h5py.File(filename, "w")
        self.ds = self.file.create_dataset(self.name, (0, self.n), maxshape=(None, self.n))
        if add_norm:
            self.norm = util.Norm(self.n, k)

    def close_file(self):
        self.file.close()
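
# A minimal usage sketch for the query side, guarded so it only runs when the
# module is executed directly. The project path, model name, file names and
# variable list are hypothetical placeholders, and the training step (see the
# Training example above) is assumed to have been run already.
if __name__ == '__main__':
    project_path = '/path/to/project'   # hypothetical project layout
    # Build a query feature table for a stored location set.
    tester = Testing(project_path, 'model_v1')
    tester.meteo_query('query_2017', 'locations.csv', year=[2017])
    # Stand-alone extendable HDF5 table via Table_Object.
    tbl = Table_Object(project_path, ['hour', 'month', 'x', 'y'], 'scratch_table')
    tbl.add_hdf5(add_norm=True, k=100)
    tbl.close_file()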
