crossval_PV.py
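Cross-validation of several regression models (linear regression, random forest, an ELM ensemble, SVR and k-nearest neighbours) for predicting annual rooftop photovoltaic yield (kWh/m2), with the French-speaking regions of Switzerland held out as the test set.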

import pandas as pd
import numpy as np
import util
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import r2_score as r2
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import Normalizer
from ELM_ensemble import ELM_Ensemble
import os
import sys
## FILE PATHS ##
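# Command-line arguments: output directory for the result CSVs, base name
# of the stats files, and the number of rooftops to sample for training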
ws = sys.argv[1]
STATS = sys.argv[2]
SAMPLE_SIZE = int(sys.argv[3])
print(ws)
data = pd.read_csv('/work/hyenergy/output/solar_potential/technical_potential/predict_pv/data_all.csv')
COMMUNE_FP = '/work/hyenergy/raw/GWR_Reg-BL/GDEListe_20150101.txt'
commune_info = pd.read_csv( COMMUNE_FP, sep = '\t', encoding = 'cp1250' )
print('Loaded data successfully')
label_name = ['PV_kWh_m2']
identifier = 'DF_UID'
all_features = ['SIS', 'illum_class', 'AUSRICHTUNG', 'NEIGUNG']
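# Feature columns: SIS (likely satellite-derived surface incoming solar
# radiation), illum_class (illumination class), and the German-named roof
# attributes AUSRICHTUNG (azimuth) and NEIGUNG (tilt); the target is the
# annual PV yield in kWh/m2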
## FUNCTION DEFINITIONS ##
# ERROR MEASUREMENTS
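# Each metric is a lambda called as f(prediction, target); the argument
# order is swapped internally because the sklearn metrics expect
# (y_true, y_pred)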
error = {
    'MSE' : lambda y, t: mse(t, y),
    'RMSE': lambda y, t: np.sqrt(mse(t, y)),
    'MAE' : lambda y, t: mae(t, y),
    'MBE' : lambda y, t: np.mean(t - y),
    'R2'  : lambda y, t: r2(t, y)
}
def get_errors(true, pred, label = ''):
    # create a pandas Series with the errors and print them
    true = true.reshape((-1, 1))
    pred = pred.reshape((-1, 1))
    err_series = pd.Series(data = np.zeros(len(error)), index = error.keys())
    for key, val in error.items():
        err_series[key] = val(pred, true)
        print('%s %s: %f' % (label, key, err_series[key]))
    return err_series
def get_errors_perc(true, pred, label = ''):
    # as get_errors, but in relative terms: errors are computed on the
    # ratio prediction/target, so targets must be nonzero (R2 is kept
    # in absolute terms)
    pred = pred.reshape((-1,))
    true = true.reshape((-1,))
    pred_ratio = (pred / true).reshape((-1, 1))
    err_series = pd.Series(data = np.zeros(len(error)), index = error.keys())
    for key, val in error.items():
        if key == 'R2':
            err_series[key] = val(pred, true)
            print('%s %s: %f' % (label, key, err_series[key]))
        else:
            err_series[key] = val(pred_ratio, np.ones(pred_ratio.shape))
            print('%s %s (%%): %f' % (label, key, err_series[key]))
    return err_series
def rescale(vector, training):
    # undo z-score normalization using the training statistics
    return (vector * training.std()) + training.mean()

def normalize(vector, training):
    # z-score normalization with the mean and std of the training targets
    return (vector - training.mean()) / training.std()
####### ======================================================= ##########
# Find French-speaking communes in Valais based on the start of their
# district name ('District' rather than the German 'Bezirk')
districts_valais = commune_info[ commune_info.GDEKT == 'VS' ]
french_communes = [bzg[0:8] == 'District' for bzg in districts_valais.GDEBZNA.values]
french_cantons = ['NE', 'JU', 'VD', 'GE', 'FR'] # plus the French communes in Valais; FR is included even though it is bilingual
# Split the data by canton: French-speaking regions are used for testing, the rest for training
testing_data = data[ (data.Kanton.isin( french_cantons ))
                   | (data.Commune_ID.isin( districts_valais[french_communes].GDENR.values )) ]
testing_data = testing_data.reset_index(drop = True)
training_data = data[ ~data.DF_UID.isin( testing_data.DF_UID.values ) ]
training_data = training_data.reset_index(drop = True).sample( SAMPLE_SIZE )
print('Got training and testing data')
# Define features and targets
feature_names = all_features
# NORMALIZE DATA
X = training_data.loc[ : , feature_names ].values
T = training_data.loc[ : , label_name ].values.reshape((-1,))
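# Note: sklearn's Normalizer rescales each sample (row) to unit L2 norm;
# it does not standardize features column-wise. The targets, by contrast,
# are z-scored with the normalize() helper defined above.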
norm = Normalizer()
x = norm.fit_transform(X)
t = normalize(T, T)
print('Got normalized features and targets')
## DEFINE ML MODELS AND PARAMETERS
# ML MODEL PARAMETERS
n_est = 500            # number of trees in the random forest
tree_depth = 10        # maximum tree depth (defined but unused below)
n_samples_leaf = 3     # minimum number of samples per leaf (random forest)
ELM_nodes = 10         # hidden nodes per extreme learning machine
ELM_est = 100          # number of ELMs in the ensemble
SVM_paramC = 1         # SVR regularization parameter C
SVM_gamma = 500        # SVR RBF kernel coefficient
k_CV = 5               # number of cross-validation folds
# DEFINITION OF ESTIMATORS
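# Wrapped in a function so that every call returns freshly constructed,
# unfitted estimators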
def set_estimators():
    estimator = {
        'LIN': LinearRegression( n_jobs = -1 ),
        'RF' : RandomForestRegressor( n_estimators = n_est, min_samples_leaf = n_samples_leaf, n_jobs = -1 ),
        'ELM': ELM_Ensemble( n_estimators = ELM_est, n_nodes = ELM_nodes ),
        'SVM': SVR( C = SVM_paramC, kernel = 'rbf', gamma = SVM_gamma ),
        'KNN': KNeighborsRegressor( n_jobs = -1 )
    }
    return estimator
# cross-validation of ML models
estimator = set_estimators()
err_cv = pd.DataFrame(data = np.zeros((len(error), len(estimator))), index = error.keys(), columns = estimator.keys())
err_pc = pd.DataFrame(data = np.zeros((len(error), len(estimator))), index = error.keys(), columns = estimator.keys())
t_cv_wall = []
t_cv_cpu = []
NAMES = np.hstack(['model', err_cv.index.values, ['time_wall', 'time_cpu']])
STATS_NAME = os.path.join(ws, "%s_%dk.csv" %(STATS, SAMPLE_SIZE // 1000))
util.write_row_to_csv( STATS_NAME, NAMES, initialize = True )
STATS_NAME_PERC = os.path.join(ws, "%s_perc_%dk.csv" %(STATS, SAMPLE_SIZE // 1000))
util.write_row_to_csv( STATS_NAME_PERC, NAMES, initialize = True )
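# For each model: obtain out-of-fold predictions via k-fold cross-validation,
# rescale them back to the original target units, then compute and log the
# errors and timings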
for name, est in estimator.items():
    print('\nTraining %s\n' % name)
    tt = util.Timer()
    y = cross_val_predict(est, x, t, cv = k_CV) # cannot use n_jobs = -1 as not all are scikit-learn estimators
    tt.stop()
    Y = rescale(y, T)
    err_cv[name] = get_errors(T, Y.reshape((-1,)), label = 'Cross-validation')
    err_pc[name] = get_errors_perc(T, Y, label = 'Cross-validation')
    t_cv_wall.append(tt.walltime)
    t_cv_cpu.append(tt.cputime)
    VALUES = np.hstack([name, err_cv[name].values, [tt.walltime, tt.cputime]])
    util.write_row_to_csv( STATS_NAME, VALUES )
    VALUES_PERC = np.hstack([name, err_pc[name].values, [tt.walltime, tt.cputime]])
    util.write_row_to_csv( STATS_NAME_PERC, VALUES_PERC )
err_cv.loc['t_cv_wall', :] = t_cv_wall
err_cv.loc['t_cv_cpu' , :] = t_cv_cpu
err_pc.loc['t_cv_wall', :] = t_cv_wall
err_pc.loc['t_cv_cpu' , :] = t_cv_cpu
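# Note: this overwrites the incrementally written CSVs with the complete
# error tables (metrics and timings as rows, models as columns)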
err_cv.to_csv( STATS_NAME )
err_pc.to_csv( STATS_NAME_PERC )
print('Error stats saved in %s and %s' %(STATS_NAME, STATS_NAME_PERC))
