utils_randomforest.py
#!/usr/bin/env python
# coding: utf-8
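#
# Utility functions for preparing the 'Produktion [kWh]**' production data
# (one or more combined CSV files) and for tuning a RandomForestRegressor on a
# held-out validation set: n_estimators and max_depth first, then max_features.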

import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
import matplotlib.pyplot as plt


# Split the data into three different sets.
# ratio_t is the proportion of the test set
# ratio_v is the proportion of the validation set
# The rest will be the training set
def split_data(X, y, ratio_t, ratio_v):
    s = X.shape[0]
    s1 = int(ratio_t * s)
    s2 = int(ratio_v * s) + s1
    # make sure that the sizes of all sets are consistent
    assert (s1 < s2 and s2 <= s)
    indices = np.arange(s)
    np.random.shuffle(indices)
    # prepare the indices of the three different sets
    idx_test = indices[:s1]
    idx_val = indices[s1:s2]
    idx_train = indices[s2:]
    X_test = X[idx_test]
    X_val = X[idx_val]
    X_train = X[idx_train]
    y_test = y[idx_test]
    y_val = y[idx_val]
    y_train = y[idx_train]
    return X_train, X_val, X_test, y_train, y_val, y_test


# Plot feature importance for the model
def plot_importance(cols, model):
    features = cols
    importances = model.feature_importances_
    indices = np.argsort(model.feature_importances_)
    plt.subplots(figsize=(11, 9))
    plt.title('Feature Importances')
    plt.barh(range(len(indices)), importances[indices], color='pink', align='center')
    plt.yticks(range(len(indices)), [features[i] for i in indices])
    plt.xlabel('Relative Importance')
    plt.show()


# Normalize or standardize the training set
# Then apply the same transform to the validation and test sets
def pre_processing(X_train, X_val, X_test, norm=False, std=False):
    """
    Apply a transformation to the data.
    Options are standardization or normalization (using the L2 norm).
    By default, no transformation is applied to the data.
    """
    if norm:
        n = Normalizer()
        X_train = n.fit_transform(X_train)
        X_test = n.transform(X_test)
        X_val = n.transform(X_val)
    if std:
        sc = StandardScaler()
        X_train = sc.fit_transform(X_train)
        X_test = sc.transform(X_test)
        X_val = sc.transform(X_val)
    return X_train, X_val, X_test


# Combine the data for 2016 and 2017
# Default data set proportions: train 0.7 - val 0.2 - test 0.1
# Also return the column list
def prepare_combined_data_forCV(file_names, ratio_t=0.1, ratio_v=0.2, norm=False, std=False):
    if type(file_names) == str:
        file_names = [file_names]
    dfs = []
    for file_name in file_names:
        local_df = pd.read_csv(file_name)
        dfs.append(local_df)
    df = pd.concat(dfs, sort=False)
    y = df['Produktion [kWh]**'].values
    df = df.drop(columns=["Produktion [kWh]**",
                          "Anlage_Ort # Emplacement de installation"])
    cols = df.columns
    X_u = df.values
    X_train, X_val, X_test, y_train, y_val, y_test = split_data(X_u, y, ratio_t, ratio_v)
    X_train, X_val, X_test = pre_processing(X_train, X_val, X_test, norm, std)
    print("There are:")
    print(len(cols), "features for each data point.")
    print(X_train.shape[0], "data points in the training set.")
    print(X_val.shape[0], "data points in the validation set.")
    print(X_test.shape[0], "data points in the test set.")
    return X_train, X_val, X_test, y_train, y_val, y_test, cols


# Drop useless columns
# Also return the REMAINING feature list
def prepare_data_drop(file_names, cols_to_drop, norm=False, std=False):
    if type(file_names) == str:
        file_names = [file_names]
    dfs = []
    for file_name in file_names:
        local_df = pd.read_csv(file_name)
        dfs.append(local_df)
    df = pd.concat(dfs, sort=False)
    y = df['Produktion [kWh]**'].values
    # drop the target and location columns
    df = df.drop(columns=["Produktion [kWh]**",
                          "Anlage_Ort # Emplacement de installation"])
    # drop the requested columns when they are present
    for c in cols_to_drop:
        if c in df:
            df = df.drop(columns=c)
    cols = df.columns
    X_u = df.values
    X = StandardScaler().fit_transform(X_u)
    print("There are:")
    print(len(cols), "reduced features for each data point.")
    print(X.shape[0], "data points in total.")
    return X, y, cols


# Prepare the data for training
# Drop useless columns
# Return X and y for prediction, as well as the remaining columns
def prepare_data(file_names, norm=False, std=False):
    if type(file_names) == str:
        file_names = [file_names]
    dfs = []
    for file_name in file_names:
        local_df = pd.read_csv(file_name)
        dfs.append(local_df)
    df = pd.concat(dfs, sort=False)
    y = df['Produktion [kWh]**'].values
    df = df.drop(columns=["Produktion [kWh]**",
                          "Anlage_Ort # Emplacement de installation"])
    cols = df.columns
    X_u = df.values
    # standardize the full feature matrix
    X = StandardScaler().fit_transform(X_u)
    print("There are:")
    print(len(cols), "reduced features for each data point.")
    print(X.shape[0], "data points.")
    return X, y, cols


# Perform a cross validation for the Random Forest
# Hyperparameters to tune are "n_estimators" and "max_depth"
# Aim to reduce the RMSE
def rf_cv(X_train, X_val, y_train, y_val, max_depths, n_estimators):
    rmse_val = []
    best_depths = []
    for n_e in n_estimators:
        rmse_val_tmp = []
        for d in max_depths:
            # create the random forest with the parameters to tune
            regr = RandomForestRegressor(max_depth=d, n_estimators=n_e, random_state=0)
            # train
            regr.fit(X_train, y_train)
            # evaluate
            y_pred_val = regr.predict(X_val)
            rmse_te = np.sqrt(metrics.mean_squared_error(y_val, y_pred_val))
            rmse_val_tmp.append(rmse_te)
        best_d_tmp = np.argmin(rmse_val_tmp)
        best_rmse_val_d = rmse_val_tmp[best_d_tmp]
        best_depths.append(best_d_tmp)      # index of the best depth for this n_estimators
        rmse_val.append(best_rmse_val_d)    # best rmse for the best depth
        print("n_estimators=", n_e, " depth=", max_depths[best_d_tmp],
              " with rmse_val ", best_rmse_val_d)
    idx = np.argmin(rmse_val)
    best_estimator = n_estimators[idx]           # best n_estimators for the best rmse
    best_depth = max_depths[best_depths[idx]]    # best depth
    print("idx is ", idx)
    print("best n_estimators is", best_estimator)
    print("best_depth", best_depth)
    return best_estimator, best_depth, rmse_val, best_depths


# Based on the best "n_estimators" and "max_depth" found, perform a second cross validation
# The hyperparameter to tune is "max_features"
# Aim to reduce the RMSE
def rf_cv_2f(X_train, X_val, y_train, y_val, bestD, bestN, max_features_range):
    rmse_val = []
    for f in range(len(max_features_range)):
        # parameter to tune: max_features
        regr = RandomForestRegressor(max_features=int(max_features_range[f]),
                                     max_depth=bestD, n_estimators=bestN,
                                     random_state=0)
        print("Testing max_features =", int(max_features_range[f]))
        regr.fit(X_train, y_train)
        # evaluate
        y_pred_val = regr.predict(X_val)
        rmse_te = np.sqrt(metrics.mean_squared_error(y_val, y_pred_val))
        rmse_val.append(rmse_te)
    idx = np.argmin(rmse_val)
    best_f = max_features_range[idx]    # best max_features
    print("idx is ", idx)
    print("best max_features is", best_f, " for bestD ", bestD, " and bestN ", bestN)
    return int(best_f), rmse_val
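

# ---------------------------------------------------------------------------
# Usage sketch (not part of the original module): the CSV file names below are
# hypothetical placeholders, and the search grids are illustrative assumptions,
# not values taken from the original work.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    # Load, split and standardize the combined data (hypothetical file names).
    X_train, X_val, X_test, y_train, y_val, y_test, cols = prepare_combined_data_forCV(
        ['data_2016.csv', 'data_2017.csv'], ratio_t=0.1, ratio_v=0.2, std=True)

    # First validation search: n_estimators and max_depth (illustrative grids).
    best_n, best_d, _, _ = rf_cv(X_train, X_val, y_train, y_val,
                                 max_depths=[5, 10, 20, 30],
                                 n_estimators=[50, 100, 200])

    # Second validation search: max_features, given the best depth and size.
    best_f, _ = rf_cv_2f(X_train, X_val, y_train, y_val,
                         bestD=best_d, bestN=best_n,
                         max_features_range=np.linspace(1, len(cols), 5))

    # Fit the final model, report the test RMSE and plot feature importances.
    final = RandomForestRegressor(n_estimators=best_n, max_depth=best_d,
                                  max_features=best_f, random_state=0)
    final.fit(X_train, y_train)
    rmse_test = np.sqrt(metrics.mean_squared_error(y_test, final.predict(X_test)))
    print("test RMSE:", rmse_test)
    plot_importance(cols, final)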