Page MenuHomec4science

hourly_model.py
No OneTemporary

File Metadata

Created
Wed, Apr 30, 23:40

hourly_model.py

import os
import sys
import xarray as xr
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from meteo_data import Meteo_Reader
import util
# ### Command-line arguments
# usage: hourly_model.py <path> <locname> <feature1,feature2,...> <target> <knn|rf>
if len(sys.argv) < 6:
    # Give a readable usage message instead of a bare IndexError.
    sys.exit('usage: %s <path> <locname> <feature1,feature2,...> <target> <knn|rf>' % sys.argv[0])
path = sys.argv[1]                  # root folder; 'raw_data', 'locations' and 'results' are resolved below it
locname = sys.argv[2]               # name of the query-location csv in <path>/locations (without '.csv')
features = sys.argv[3].split(",")   # comma-separated names of the feature columns
target = sys.argv[4]                # name of the variable in the training dataset that is modelled
model = sys.argv[5] # put 'knn' or 'rf'
if model not in ('knn', 'rf'):
    # Fail here, before the output file is opened and the training data is
    # loaded, rather than with a KeyError inside the per-hour loop.
    sys.exit("unknown model '%s': put 'knn' or 'rf'" % model)
print(features)
print(target)
# knn hyperparameters
n_neighbors = 5
weight_type = 'distance'
# RF hyperparameters
forest_size = 500
# ### Output collector
# Will write all outputs to an hdf5 file.
# The output name encodes the model, the target and the number of features.
name = ('%s_%s_2001_%dD' %(model, target, len(features)))
variables = ['x', 'y', 'month', 'hour', target]
output_table = util.Table_Writer(os.path.join(path, 'results'), variables, name)
output_table.open_hdf5(add_norm = False)
# Load training data (monthly-mean-hourly) as well as the location mask and select the variables of interest for training.
mmh = xr.open_dataset(os.path.join(path,'raw_data', '2001_mmh.nc'))
solar = xr.merge([mmh[target], mmh.hourmask])
reader = Meteo_Reader(os.path.join(path, 'raw_data'))
# Load the hourmask and create an array with month, hour pairs.
# Only the (month, hour) pairs whose mask value is not NaN are kept.
mask = mmh.hourmask
hour_month_combinations = mask.to_dataframe().dropna().reset_index().loc[:,['month', 'hour']]
# ## Per - hour execution
# For every (month, hour) pair: fit the selected regressor on the training
# table of that hour, predict on the query locations and append the
# predictions to the hdf5 output.
try:
    # The query locations are the same for every month and hour, so read the
    # csv file and select the features once instead of in every iteration.
    pts = pd.read_csv(os.path.join(path,'locations',locname+'.csv'))
    query_points = pts.reset_index().loc[:,features]
    # Initialisers for the different models. They are callables so that only
    # the selected model is constructed, and a fresh (unfitted) one is
    # created for every iteration.
    initialize_model = {
        'rf' : lambda: RandomForestRegressor(n_estimators = forest_size, max_depth = 100, n_jobs = -1),
        'knn' : lambda: KNeighborsRegressor(n_neighbors=n_neighbors, weights = weight_type, n_jobs = -1)
    }
    make_regressor = initialize_model[model]
    for month, hour in zip(hour_month_combinations.month, hour_month_combinations.hour):
        t_all = util.Timer()
        print('\n\nMonth: %d, hour: %d' %(month, hour))
        # select subset of the data for modelling
        date = ('2001%02d01' %month)
        solar_hour = solar.sel(month = month, date = date, hour = hour)
        # convert data into a pandas dataframe and split into feature and target tables
        solar_table = reader.make_table(indata = solar_hour, ftrs = features)
        training_features = solar_table.loc[:,features]
        training_targets = solar_table.loc[:,target]
        regressor = make_regressor()
        # Fit selected model
        tt = util.Timer()
        regressor.fit(training_features, training_targets)
        print('\nFitted model')
        tt.stop()
        # Predict on the query data
        tt = util.Timer()
        query_output = regressor.predict(query_points)
        print('\nPredicted on query')
        tt.stop()
        # Prepare query table for writing. Work on a copy so that the shared
        # query_points table keeps containing the feature columns only.
        query_features = query_points.copy()
        query_features[target] = query_output
        query_features['month'] = month
        query_features['hour'] = hour
        # write to the hdf5 file
        output_table.write_hdf5(query_features)
        print('\nFinished iteration')
        # NOTE(review): restart() rather than stop() is kept from the original;
        # presumably it reports the time of the whole iteration - confirm in util.
        t_all.restart()
finally:
    # close the hdf5 file, also when an iteration fails
    output_table.close_hdf5()

Event Timeline