crossval_PV.py (attached to R8800 solar_potential)
import pandas as pd
import xarray as xr
import numpy as np
import util
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import r2_score as r2
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split
from sklearn.base import clone
from ELM_ensemble import ELM_Ensemble
import os
import sys
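# Note: util and ELM_ensemble are local project modules, not PyPI packages.
# Their interfaces below are assumptions inferred from how they are used here:
#   util.Timer()             - stopwatch with .stop() and .walltime/.cputime attributes
#   util.write_row_to_csv(path, row, initialize=False)
#                             - appends one row to a CSV; initialize=True starts a new file
#   ELM_Ensemble(n_estimators, n_nodes)
#                             - scikit-learn-style extreme learning machine ensemble regressor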
## FILE PATHS ##
ws = sys.argv[1]
STATS = sys.argv[2]
SAMPLE_SIZE = int(sys.argv[3])
print(ws)
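# Usage, inferred from the sys.argv reads above (argument names are illustrative):
#   python crossval_PV.py <output_dir> <stats_file_basename> <training_sample_size>
# e.g. python crossval_PV.py /work/hyenergy/output cv_stats 10000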
data = pd.read_csv('/work/hyenergy/output/solar_potential/technical_potential/predict_pv/data_all.csv')

COMMUNE_FP = '/work/hyenergy/raw/GWR_Reg-BL/GDEListe_20150101.txt'
commune_info = pd.read_csv(COMMUNE_FP, sep='\t', encoding='cp1250')

print('Loaded data successfully')

label_name = ['PV_kWh_m2']
identifier = 'DF_UID'
all_features = ['SIS', 'illum_class', 'AUSRICHTUNG', 'NEIGUNG']
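# Likely meaning of the feature columns (an interpretation, not stated in this
# file): SIS is satellite-derived surface solar irradiance, illum_class an
# illumination class, AUSRICHTUNG the roof azimuth (German for "orientation")
# and NEIGUNG the roof tilt ("inclination"); the target PV_kWh_m2 is annual PV
# yield per square metre.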
## FUNCTION DEFINITIONS ##

# ERROR MEASUREMENTS
error = {'MSE':  lambda y, t: mse(t, y),
         'RMSE': lambda y, t: np.sqrt(mse(t, y)),
         'MAE':  lambda y, t: mae(t, y),
         'MBE':  lambda y, t: np.mean(t - y),
         'R2':   lambda y, t: r2(t, y)}
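# Convention: each metric is invoked below as val(pred, true), i.e. y is the
# prediction and t the target, which matches scikit-learn's (y_true, y_pred)
# argument order inside mse/mae/r2. MBE = mean(t - y), so a positive bias
# means the model underpredicts on average.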
def get_errors(true, pred, label=''):
    # create a pandas series with errors and print them
    true = true.reshape((-1, 1))
    pred = pred.reshape((-1, 1))
    err_series = pd.Series(data=np.zeros(len(error)), index=error.keys())
    for key, val in error.items():
        print('%s %s: %f' % (label, key, val(pred, true)))
        err_series[key] = val(pred, true)
    return err_series
def get_errors_perc(true, pred, label=''):
    # create a pandas series with errors and print them
    pred = pred.reshape((-1,))
    true = true.reshape((-1,))
    pred_ratio = (pred / true).reshape((-1, 1))
    err_series = pd.Series(data=np.zeros(len(error)), index=error.keys())
    for key, val in error.items():
        if key == 'R2':
            print('%s %s: %f' % (label, key, val(pred, true)))
            err_series[key] = val(pred, true)
        else:
            print('%s %s (%%): %f' % (label, key, val(pred_ratio, np.ones(pred_ratio.shape))))
            err_series[key] = val(pred_ratio, np.ones(pred_ratio.shape))
    return err_series
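# get_errors_perc expresses errors relative to the target: every metric except
# R2 is evaluated on the ratio pred/true against a vector of ones, so e.g. MAE
# becomes mean(|pred/true - 1|), a fractional (percentage-style) error. R2 is
# kept on the raw values since it is already scale-free.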
def rescale(vector, training):
    return (vector * training.std()) + training.mean()

def normalize(vector, training):
    return (vector - training.mean()) / training.std()
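# rescale is the inverse of normalize with respect to the same reference data.
# Illustrative values (not from the source): if training has mean 10 and std 2,
# then normalize(14.0, training) == 2.0 and rescale(2.0, training) == 14.0.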
####### ======================================================= ##########

# Find French-speaking communes in Valais based on the start of their district
# name ('District' rather than 'Bezirk').
districts_valais = commune_info[commune_info.GDEKT == 'VS']
french_communes = [bzg[0:8] == 'District' for bzg in districts_valais.GDEBZNA.values]

french_cantons = ['NE', 'JU', 'VD', 'GE', 'FR']
# plus the French-speaking communes in Valais; FR is counted as French-speaking
# even though it is bilingual
# get training and testing data (from Cantons)
testing_data = data[(data.Kanton.isin(french_cantons))
                    | (data.Commune_ID.isin(districts_valais[french_communes].GDENR.values))]
testing_data = testing_data.reset_index().drop('index', axis=1)

training_data = data[np.logical_not(data.DF_UID.isin(testing_data.DF_UID.values))]
training_data = training_data.reset_index().drop('index', axis=1).sample(SAMPLE_SIZE)

print('Got training and testing data')
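# The split is geographic rather than random: all French-speaking regions are
# held out as testing_data, and the models are cross-validated below on a
# SAMPLE_SIZE random sample of the remaining rows. Within this script,
# testing_data only serves to exclude rows from training; it is not evaluated here.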
# Define features and targets
feature_names = all_features

# NORMALIZE DATA
X = training_data.loc[:, feature_names].values
T = training_data.loc[:, label_name].values.reshape((-1,))

norm = Normalizer()
x = norm.fit_transform(X)
t = normalize(T, T)

print('Got normalized features and targets')
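# Two different scalings are in play here: scikit-learn's Normalizer rescales
# each feature row to unit L2 norm (per sample, not per feature), while the
# targets are z-scored against their own mean and std via normalize(T, T);
# rescale(y, T) in the loop below inverts the latter to recover kWh/m2.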
## DEFINE ML MODELS AND PARAMETERS

# ML MODEL PARAMETERS
n_est = 500
tree_depth = 10
n_samples_leaf = 3
ELM_nodes = 10
ELM_est = 100
SVM_paramC = 1
SVM_gamma = 500
k_CV = 5
# DEFINITION OF ESTIMATORS
def set_estimators():
    estimator = {'LIN': LinearRegression(n_jobs=-1),
                 'RF':  RandomForestRegressor(n_estimators=n_est,
                                              min_samples_leaf=n_samples_leaf,
                                              n_jobs=-1),
                 'ELM': ELM_Ensemble(n_estimators=ELM_est, n_nodes=ELM_nodes),
                 'SVM': SVR(C=SVM_paramC, kernel='rbf', gamma=SVM_gamma),
                 'KNN': KNeighborsRegressor(n_jobs=-1)}
    return estimator
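# set_estimators() builds the estimators fresh on each call, so every model
# enters cross-validation untrained. Note that tree_depth is defined above but
# not passed to any estimator, and MLPRegressor is imported but never used.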
# cross-validation of ML models
estimator = set_estimators()

err_cv = pd.DataFrame(data=np.zeros((len(error), len(estimator))),
                      index=error.keys(), columns=estimator.keys())
err_pc = pd.DataFrame(data=np.zeros((len(error), len(estimator))),
                      index=error.keys(), columns=estimator.keys())
t_cv_wall = []
t_cv_cpu = []

NAMES = np.hstack(['model', err_cv.index.values, ['time_wall', 'time_cpu']])

STATS_NAME = os.path.join(ws, "%s_%dk.csv" % (STATS, SAMPLE_SIZE / 1000))
util.write_row_to_csv(STATS_NAME, NAMES, initialize=True)

STATS_NAME_PERC = os.path.join(ws, "%s_perc_%dk.csv" % (STATS, SAMPLE_SIZE / 1000))
util.write_row_to_csv(STATS_NAME_PERC, NAMES, initialize=True)
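# Both CSVs start with a header row: 'model', the five error names, and the two
# timing columns. Under Python 3, SAMPLE_SIZE / 1000 is a float, but %d truncates
# it to an integer, so e.g. SAMPLE_SIZE = 10000 yields "<STATS>_10k.csv".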
for name, est in estimator.items():
    print('\nTraining %s\n' % name)
    est_CV = estimator[name]

    tt = util.Timer()
    # cannot use n_jobs = -1 as not all are scikit-learn estimators
    y = cross_val_predict(est_CV, x, t, cv=k_CV)
    tt.stop()

    Y = rescale(y, T)
    err_cv[name] = get_errors(T, Y.reshape((-1,)), label='Cross-validation')
    err_pc[name] = get_errors_perc(T, Y, label='Cross-validation')

    t_cv_wall.append(tt.walltime)
    t_cv_cpu.append(tt.cputime)

    VALUES = np.hstack([name, err_cv[name].values, [tt.walltime, tt.cputime]])
    util.write_row_to_csv(STATS_NAME, VALUES)

    VALUES_PERC = np.hstack([name, err_pc[name].values, [tt.walltime, tt.cputime]])
    util.write_row_to_csv(STATS_NAME_PERC, VALUES_PERC)
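# Each model's metric row is appended to the CSVs as soon as its k_CV-fold
# cross_val_predict finishes, so results accumulate incrementally even if a
# later, slower model is interrupted.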
err_cv.loc['t_cv_wall', :] = t_cv_wall
err_cv.loc['t_cv_cpu', :] = t_cv_cpu
err_pc.loc['t_cv_wall', :] = t_cv_wall
err_pc.loc['t_cv_cpu', :] = t_cv_cpu

err_cv.to_csv(STATS_NAME)
err_pc.to_csv(STATS_NAME_PERC)

print('Error stats saved in %s' % STATS_NAME)
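# These final to_csv calls reuse STATS_NAME and STATS_NAME_PERC, so the
# row-per-model files written during the loop are overwritten with the
# transposed layout: metrics plus the two timing rows as rows, models as columns.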