# ds_old.py

import numpy as np
import pandas as pd
import xarray as xr
import os
import time
import h5py
import util
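
# NOTE: Radiation_Reader (used in make_dataset below) is not imported in this
# file; it must be importable from its defining module, for example:
#   from radiation_reader import Radiation_Reader   # hypothetical module name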


class Dataset:

    def __init__(self, path, model_name, query_name=None):
        self.data_path = os.path.join(path, "raw_data")
        self.model_path = os.path.join(path, "datasets", model_name)
        self.location_path = os.path.join(path, "locations")
        # Create the model folder if it does not exist yet:
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)
        # LOAD TRAINING DATA
        if os.path.exists(os.path.join(self.model_path, "features.hdf5")):
            self.features = os.path.join(self.model_path, "features.hdf5")
            self.targets = os.path.join(self.model_path, "targets.hdf5")
            self.load_normalization()
            print("Set features and targets")
        else:
            print("Features and targets not yet created - run make_dataset before modelling")
            return
        # LOAD TESTING/QUERY DATA
        # get_query checks whether the query feature file exists and warns if
        # it has not been created yet, so no separate path check is needed here.
        if query_name is not None:
            self.get_query(query_name)
    ####################################################### QUERY DATA #######################################################

    def get_query(self, query_name):
        """
        check for hdf5 feature table for query data
        Inputs:
        query_name    name of the folder that contains the query data set
        """
        self.query_path = os.path.join(self.model_path, query_name)
        self.features_query = os.path.join(self.query_path, "features_query.hdf5")
        if not os.path.exists(self.features_query):
            print("Query features not yet created - run make_query before modelling")
        else:
            print("Set %s as query feature set" % self.features_query)
    def make_query(self, name, loc, hour=list(range(3, 20)), month=list(range(1, 13)), year=None, input_vars=None, normalise=True):
        """
        write hdf5 feature table for query data
        Inputs:
        name               name of the folder that contains the query data set
        loc                filename of the location set
        hour, month, year  LISTS of values that should be added to the query set
        input_vars         include other variables - NOT YET IMPLEMENTED
        normalise          perform normalisation of the data
        """
        # Check that all prerequisite files are in the correct directory
        if not os.path.exists(os.path.join(self.location_path, loc)):
            print("ERROR: Location file not found - Check %s for %s" % (self.location_path, loc))
            return
        if not os.path.exists(os.path.join(self.model_path, 'features.hdf5')):
            print("ERROR: Training files not found - Check %s" % (self.model_path))
            return
        timer = time.perf_counter()
        # Create dataset according to features
        pts = pd.read_csv(os.path.join(self.location_path, loc))  # read locations
        pts = pts.reset_index().groupby(['x', 'y']).agg('first').to_xarray().drop('index')  # create xarray dataset from locations
        # Add the requested time coordinates
        if hour is not None:
            pts.coords['hour'] = hour
        if month is not None:
            pts.coords['month'] = month
        if year is not None:
            pts.coords['year'] = year
        pts = pts.to_dataframe().dropna().reset_index()  # turn back into a table of feature points
        # Make the feature table, checking that all training features are present
        missing = [c for c in self.feature_norm.columns if c not in pts.columns]
        if missing:
            print("ERROR: Not all features given as input - missing %s" % missing)
            return
        arr = pts[list(self.feature_norm.columns)].values
        # Create files and folders
        self.query_path = os.path.join(self.model_path, name)
        self.features_query = os.path.join(self.query_path, "features_query.hdf5")
        if not os.path.exists(self.query_path):
            os.mkdir(self.query_path)
        # Normalise with the training statistics and write to HDF5
        ftr_file = h5py.File(self.features_query, "w")
        ftr_ds = ftr_file.create_dataset('features', data=arr)
        if normalise:
            ftr_ds[:, :], _, _ = util.normalize(ftr_ds, Xnorm=self.feature_norm.values)
        ftr_file.close()
        timer = time.perf_counter() - timer
        print("Dataset successfully created in %.2f seconds" % (timer))
    ################################################### TRAINING DATA ########################################################

    def make_dataset(self, features, targets, start_date, end_date, variables_to_read, raw_format='H', sample_name=None, norm_features="mean", norm_targets="mean", append=False):
        """
        Create HDF5 tables of training features and targets (4 files: features.hdf5, targets.hdf5, feature_norm.csv, target_norm.csv)
        Inputs:
        features, targets              lists of column names to use as features and as targets
        start_date, end_date           range of dates to be considered
        variables_to_read, raw_format  list of data variables (SIS, SISDIR, SISCF, ALB etc.) & format in which data is read
        sample_name                    name of file containing a subsample (if None, all locations are considered)
        norm_features, norm_targets    desired type of normalisation for features and for targets; one of ['range', 'mean', 'none']
        append                         True: add data to end of file - NOT YET IMPLEMENTED!!!!
        """
        # Set up reader for raw data
        reader = Radiation_Reader(self.data_path, variables=variables_to_read, data_format=raw_format, split_raw=True)
        locations = 'CH'
        if sample_name is not None:
            reader.read_sample(filename=sample_name)
            locations = 'sample'
        month_start = pd.date_range(start_date, end_date, freq='MS')
        month_end = pd.date_range(start_date, end_date, freq='M')
        # Set up HDF5 files for features and targets
        self.features = os.path.join(self.model_path, "features.hdf5")
        self.targets = os.path.join(self.model_path, "targets.hdf5")
        n_f = len(features)
        n_t = len(targets)
        ftr_file = h5py.File(self.features, "w")
        tgt_file = h5py.File(self.targets, "w")
        ftr_ds = ftr_file.create_dataset('features', (0, n_f), maxshape=(None, n_f))
        tgt_ds = tgt_file.create_dataset('targets', (0, n_t), maxshape=(None, n_t))
        timer = time.perf_counter()
        for (curr_month, curr_month_end) in zip(month_start, month_end):
            # READ DATA
            reader.add_data(curr_month, curr_month_end, reset=True, print_log=False)
            # CREATE FEATURE AND TARGET ARRAYS
            tbl = reader.make_table(print_log=False)
            ftrs = tbl[features].values
            tgts = tbl[targets].values
            # APPEND TO THE RESIZABLE HDF5 DATASETS
            idx = ftr_ds.shape[0]
            ftr_ds.resize(ftr_ds.shape[0] + ftrs.shape[0], axis=0)
            ftr_ds[idx:, :] = ftrs
            tgt_ds.resize(tgt_ds.shape[0] + tgts.shape[0], axis=0)
            tgt_ds[idx:, :] = tgts
        timer = time.perf_counter() - timer
        print("Finished reading data in %.2f seconds" % (timer))
        # Normalise data (could also standardise to mean and std deviation by using util.standardize)
        ftr_ds[:, :], ftr_norm, ftr_idx = self.normalise_training(ftr_ds, norm_features)
        self.feature_norm = pd.DataFrame(data=ftr_norm, columns=features, index=ftr_idx)
        tgt_ds[:, :], tgt_norm, tgt_idx = self.normalise_training(tgt_ds, norm_targets)
        self.target_norm = pd.DataFrame(data=tgt_norm, columns=targets, index=tgt_idx)
        # Close HDF5 files
        ftr_file.close()
        tgt_file.close()
        # Write all auxiliary files
        locs = reader.loc_mask.where(reader.loc_mask[locations] == 1).to_dataframe().dropna().reset_index()
        locs[['lon', 'lat', 'x', 'y']].to_csv(os.path.join(self.model_path, "training_locations.csv"))
        pd.Series({'start_date': start_date, 'end_date': end_date, 'format': raw_format}).to_csv(os.path.join(self.model_path, "training_dates.csv"))
        self.feature_norm.to_csv(os.path.join(self.model_path, "feature_norm.csv"))
        self.target_norm.to_csv(os.path.join(self.model_path, "target_norm.csv"))
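    # NOTE: append=True is not implemented. The resizable datasets above
    # (maxshape=(None, n)) would in principle support it by opening the HDF5
    # files in "a" mode and extending them instead of recreating them.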
    ################################################ READ NORMALISATION #######################################################

    def load_normalization(self):
        self.feature_norm = pd.read_csv(os.path.join(self.model_path, "feature_norm.csv"), index_col=0)
        self.target_norm = pd.read_csv(os.path.join(self.model_path, "target_norm.csv"), index_col=0)
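    # The util helpers are assumed, from their use here and in make_query, to
    # follow these signatures (not part of this file):
    #   util.normalize(X, Xnorm=None) -> (X_scaled, Xmax, Xmin)   # min/max scaling
    #   util.standardize(X)           -> (X_scaled, Xmean, Xstd)  # z-scoring
    # where Xnorm lets precomputed training statistics be reused.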
    def normalise_training(self, X, norm):
        if norm == "range":
            Xnorm, Xmax, Xmin = util.normalize(X)
            norm_index = ['max', 'min']
            norm_values = [Xmax, Xmin]
        elif norm == "mean":
            Xnorm, Xmean, Xstd = util.standardize(X)
            norm_index = ['mean', 'std']
            norm_values = [Xmean, Xstd]
        else:
            Xnorm = X
            norm_values = []
            norm_index = []
        return Xnorm, norm_values, norm_index
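

# Minimal usage sketch. Everything below is illustrative: the project root,
# model name, location file and variable/feature names are assumptions, not
# part of the original file, and the directories must already exist.
if __name__ == "__main__":
    BASE = "/path/to/project"  # hypothetical project root
    ds = Dataset(BASE, "model_v1")
    # Build the training tables once (writes features.hdf5, targets.hdf5,
    # feature_norm.csv and target_norm.csv into BASE/datasets/model_v1):
    ds.make_dataset(features=['hour', 'month', 'x', 'y'],
                    targets=['SIS'],
                    start_date='2015-01-01', end_date='2015-12-31',
                    variables_to_read=['SIS'])
    # Build a query feature set on a location grid, normalised with the
    # training statistics:
    ds.make_query("query_2016", "grid.csv", year=[2016])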
