crossval_PV.py
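Cross-validation of several regression models (linear regression, random forest, an ELM ensemble, SVR and k-nearest neighbours) for predicting annual rooftop photovoltaic yield (kWh/m2), with the French-speaking regions of Switzerland held out as the test set.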

import pandas as pd
import numpy as np
import util
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import r2_score as r2
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import Normalizer
from ELM_ensemble import ELM_Ensemble
import os
import sys
## FILE PATHS ##
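# Command-line arguments: output directory for the result CSVs, base name
# of the stats files, and the number of rooftops to sample for training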
ws = sys.argv[1]
STATS = sys.argv[2]
SAMPLE_SIZE = int(sys.argv[3])
print(ws)
data = pd.read_csv('/work/hyenergy/output/solar_potential/technical_potential/predict_pv/data_all.csv')
COMMUNE_FP = '/work/hyenergy/raw/GWR_Reg-BL/GDEListe_20150101.txt'
commune_info = pd.read_csv( COMMUNE_FP, sep = '\t', encoding = 'cp1250' )
print('Loaded data successfully')
label_name = ['PV_kWh_m2']
identifier = 'DF_UID'
all_features = ['SIS', 'illum_class', 'AUSRICHTUNG', 'NEIGUNG']
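# Feature columns: SIS (likely satellite-derived surface incoming solar
# radiation), illum_class (illumination class), and the German-named roof
# attributes AUSRICHTUNG (azimuth) and NEIGUNG (tilt); the target is the
# annual PV yield in kWh/m2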
## FUNCTION DEFINITIONS ##
# ERROR MEASUREMENTS
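# Each metric is a lambda called as f(prediction, target); the argument
# order is swapped internally because the sklearn metrics expect
# (y_true, y_pred)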
error = {
    'MSE' : lambda y, t: mse(t, y),
    'RMSE': lambda y, t: np.sqrt(mse(t, y)),
    'MAE' : lambda y, t: mae(t, y),
    'MBE' : lambda y, t: np.mean(t - y),
    'R2'  : lambda y, t: r2(t, y)
}
def get_errors(true, pred, label = ''):
    # create a pandas Series with the errors and print them
    true = true.reshape((-1, 1))
    pred = pred.reshape((-1, 1))
    err_series = pd.Series(data = np.zeros(len(error)), index = error.keys())
    for key, val in error.items():
        err_series[key] = val(pred, true)
        print('%s %s: %f' % (label, key, err_series[key]))
    return err_series
def get_errors_perc(true, pred, label = ''):
    # as get_errors, but in relative terms: errors are computed on the
    # ratio prediction/target, so targets must be nonzero (R2 is kept
    # in absolute terms)
    pred = pred.reshape((-1,))
    true = true.reshape((-1,))
    pred_ratio = (pred / true).reshape((-1, 1))
    err_series = pd.Series(data = np.zeros(len(error)), index = error.keys())
    for key, val in error.items():
        if key == 'R2':
            err_series[key] = val(pred, true)
            print('%s %s: %f' % (label, key, err_series[key]))
        else:
            err_series[key] = val(pred_ratio, np.ones(pred_ratio.shape))
            print('%s %s (%%): %f' % (label, key, err_series[key]))
    return err_series
def rescale(vector, training):
    # undo z-score normalization using the training statistics
    return (vector * training.std()) + training.mean()

def normalize(vector, training):
    # z-score normalization with the mean and std of the training targets
    return (vector - training.mean()) / training.std()
####### ======================================================= ##########
# Find French-speaking communes in Valais based on the start of their
# district name ('District' rather than the German 'Bezirk')
districts_valais = commune_info[ commune_info.GDEKT == 'VS' ]
french_communes = [bzg[0:8] == 'District' for bzg in districts_valais.GDEBZNA.values]
french_cantons = ['NE', 'JU', 'VD', 'GE', 'FR'] # plus the French communes in Valais; FR is included even though it is bilingual
# Split the data by canton: French-speaking regions are used for testing, the rest for training
testing_data = data[ (data.Kanton.isin( french_cantons ))
                   | (data.Commune_ID.isin( districts_valais[french_communes].GDENR.values )) ]
testing_data = testing_data.reset_index(drop = True)
training_data = data[ ~data.DF_UID.isin( testing_data.DF_UID.values ) ]
training_data = training_data.reset_index(drop = True).sample( SAMPLE_SIZE )
print('Got training and testing data')
# Define features and targets
feature_names = all_features
# NORMALIZE DATA
X = training_data.loc[ : , feature_names ].values
T = training_data.loc[ : , label_name ].values.reshape((-1,))
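# Note: sklearn's Normalizer rescales each sample (row) to unit L2 norm;
# it does not standardize features column-wise. The targets, by contrast,
# are z-scored with the normalize() helper defined above.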
norm = Normalizer()
x = norm.fit_transform(X)
t = normalize(T, T)
print('Got normalized features and targets')
## DEFINE ML MODELS AND PARAMETERS
# ML MODEL PARAMETERS
n_est = 500            # number of trees in the random forest
tree_depth = 10        # maximum tree depth (defined but unused below)
n_samples_leaf = 3     # minimum number of samples per leaf (random forest)
ELM_nodes = 10         # hidden nodes per extreme learning machine
ELM_est = 100          # number of ELMs in the ensemble
SVM_paramC = 1         # SVR regularization parameter C
SVM_gamma = 500        # SVR RBF kernel coefficient
k_CV = 5               # number of cross-validation folds
# DEFINITION OF ESTIMATORS
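# Wrapped in a function so that every call returns freshly constructed,
# unfitted estimators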
def set_estimators():
    estimator = {
        'LIN': LinearRegression( n_jobs = -1 ),
        'RF' : RandomForestRegressor( n_estimators = n_est, min_samples_leaf = n_samples_leaf, n_jobs = -1 ),
        'ELM': ELM_Ensemble( n_estimators = ELM_est, n_nodes = ELM_nodes ),
        'SVM': SVR( C = SVM_paramC, kernel = 'rbf', gamma = SVM_gamma ),
        'KNN': KNeighborsRegressor( n_jobs = -1 )
    }
    return estimator
# cross-validation of ML models
estimator = set_estimators()
err_cv = pd.DataFrame(data = np.zeros((len(error), len(estimator))), index = error.keys(), columns = estimator.keys())
err_pc = pd.DataFrame(data = np.zeros((len(error), len(estimator))), index = error.keys(), columns = estimator.keys())
t_cv_wall = []
t_cv_cpu = []
NAMES = np.hstack(['model', err_cv.index.values, ['time_wall', 'time_cpu']])
STATS_NAME = os.path.join(ws, "%s_%dk.csv" %(STATS, SAMPLE_SIZE // 1000))
util.write_row_to_csv( STATS_NAME, NAMES, initialize = True )
STATS_NAME_PERC = os.path.join(ws, "%s_perc_%dk.csv" %(STATS, SAMPLE_SIZE // 1000))
util.write_row_to_csv( STATS_NAME_PERC, NAMES, initialize = True )
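# For each model: obtain out-of-fold predictions via k-fold cross-validation,
# rescale them back to the original target units, then compute and log the
# errors and timings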
for name, est in estimator.items():
    print('\nTraining %s\n' % name)
    tt = util.Timer()
    y = cross_val_predict(est, x, t, cv = k_CV) # cannot use n_jobs = -1 as not all are scikit-learn estimators
    tt.stop()
    Y = rescale(y, T)
    err_cv[name] = get_errors(T, Y.reshape((-1,)), label = 'Cross-validation')
    err_pc[name] = get_errors_perc(T, Y, label = 'Cross-validation')
    t_cv_wall.append(tt.walltime)
    t_cv_cpu.append(tt.cputime)
    VALUES = np.hstack([name, err_cv[name].values, [tt.walltime, tt.cputime]])
    util.write_row_to_csv( STATS_NAME, VALUES )
    VALUES_PERC = np.hstack([name, err_pc[name].values, [tt.walltime, tt.cputime]])
    util.write_row_to_csv( STATS_NAME_PERC, VALUES_PERC )
err_cv.loc['t_cv_wall', :] = t_cv_wall
err_cv.loc['t_cv_cpu' , :] = t_cv_cpu
err_pc.loc['t_cv_wall', :] = t_cv_wall
err_pc.loc['t_cv_cpu' , :] = t_cv_cpu
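# Note: this overwrites the incrementally written CSVs with the complete
# error tables (metrics and timings as rows, models as columns)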
err_cv.to_csv( STATS_NAME )
err_pc.to_csv( STATS_NAME_PERC )
print('Error stats saved in %s and %s' %(STATS_NAME, STATS_NAME_PERC))
