Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F111155537
RF_variableImportance.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Tue, Apr 29, 22:23
Size
4 KB
Mime Type
text/x-python
Expires
Thu, May 1, 22:23 (2 d)
Engine
blob
Format
Raw Data
Handle
25847153
Attached To
R8800 solar_potential
RF_variableImportance.py
View Options
import
numpy
as
np
import
matplotlib.pyplot
as
plt
from
matplotlib.colors
import
LinearSegmentedColormap
import
pandas
as
pd
import
xarray
as
xr
import
os
from
sklearn.ensemble
import
RandomForestRegressor
from
sklearn.metrics
import
mean_squared_error
as
mse
from
meteo_data
import
Meteo_Reader
from
sklearn.externals
import
joblib
import
util
# Root directory holding the meteo datasets (server path; laptop path kept below).
data_path = os.path.abspath("/mnt/sda1/hyenergy/data/meteo")
# data_path = os.path.abspath("/Users/alinawalch/Documents/EPFL/data/meteo")

# One-million-row hourly sample used to fit the "all data" reference forest.
sample0 = pd.read_pickle(os.path.join(data_path, 'datasets', 'hourly_1M_sample.pkl'))

# Features: drop identifier/time columns and the target 'SIS' (surface solar irradiance
# -- presumably; TODO confirm against the dataset documentation).
ftr_table0 = sample0.drop(columns=['index', 'lon', 'lat', 'date', 'year',
                                   'CH', 'day', 'timestamp', 'SIS'])
tgt_table0 = sample0[['SIS']]

# Standardize features and target with their own mean/std (util.standardize is
# project-local; assumed to be (x - mean) / std -- TODO confirm).
ftr_table = util.standardize(ftr_table0, ftr_table0.mean(), ftr_table0.std())
tgt_table = util.standardize(tgt_table0, tgt_table0.mean(), tgt_table0.std())

# FIX: DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# .to_numpy() is the drop-in replacement and returns the same ndarray.
x0 = ftr_table0.to_numpy()          # unstandardized features (kept for parity with loops below)
t0 = tgt_table0.to_numpy().reshape((-1,))
x_all = ftr_table.to_numpy()        # standardized features used for training
t_all = tgt_table.to_numpy().reshape((-1,))

# Result matrices, one row per trained model:
# row 0 = all data, rows 1-3 = January hours, rows 4-6 = June hours.
importances = np.zeros((7, x_all.shape[1]))
stds = np.zeros((7, x_all.shape[1]))
inds = np.zeros((7, x_all.shape[1]))

# Random Forest
n_est = 500  # number of trees per forest
#################### MONTHLY ANALYSIS ##################################
# January: fit one forest per selected hour of day and record per-feature
# importance statistics in rows 1-3 of the result matrices.
data = pd.read_pickle(os.path.join(data_path, 'datasets', 'hourly_all_jan.pkl'))
hrs = [9, 11, 13]
for i in [0, 1, 2]:
    h = hrs[i]
    sample = data.loc[(data.hour == h)]
    # Reindex to the reference feature columns so importance rows are
    # comparable across all trained models.
    ftr_jan0 = (sample.drop(columns=['lon', 'lat', 'year', 'CH', 'day', 'SIS'])
                .reindex(columns=ftr_table0.columns))
    tgt_jan0 = sample[['SIS']]
    ftr_jan = util.standardize(ftr_jan0, ftr_jan0.mean(), ftr_jan0.std())
    tgt_jan = util.standardize(tgt_jan0, tgt_jan0.mean(), tgt_jan0.std())
    # FIX: as_matrix() was removed in pandas 1.0 -> to_numpy().
    x0 = ftr_jan0.to_numpy()
    t0 = tgt_jan0.to_numpy().reshape((-1,))
    x = ftr_jan.to_numpy()
    t = tgt_jan.to_numpy().reshape((-1,))
    print(x.shape)
    print(t.shape)
    rf = RandomForestRegressor(n_estimators=n_est, max_depth=100, n_jobs=-1)
    print('Training random forest... ')
    timer_0 = util.Timer()
    rf.fit(x, t)
    timer_0.stop()
    # Persist the fitted model for this hour.
    joblib.dump(rf, os.path.join(data_path, 'datasets',
                                 ('model_importances_jan_%d.pkl' % h)))
    # Mean and spread of per-tree feature importances.
    per_tree = [tree.feature_importances_ for tree in rf.estimators_]
    importances[i + 1, :] = np.mean(per_tree, axis=0)
    stds[i + 1, :] = np.std(per_tree, axis=0)
    # NOTE(review): the [-30:] slice only fits into a full-width row if there
    # are exactly 30 features -- TODO confirm the feature count.
    inds[i + 1, :] = np.argsort(importances[i + 1, :])[-30:]
    print(importances[i + 1, :])
    print(stds[i + 1, :])
    print(inds[i + 1, :])
### June
# Same per-hour analysis as January; results go into rows 4-6.
data = pd.read_pickle(os.path.join(data_path, 'datasets', 'hourly_all_jun.pkl'))
hrs = [6, 11, 16]
for i in [0, 1, 2]:
    h = hrs[i]
    sample = data.loc[(data.hour == h)]
    # Reindex to the reference feature columns so importance rows are
    # comparable across all trained models.
    ftr_jun0 = (sample.drop(columns=['lon', 'lat', 'year', 'CH', 'day', 'SIS'])
                .reindex(columns=ftr_table0.columns))
    tgt_jun0 = sample[['SIS']]
    ftr_jun = util.standardize(ftr_jun0, ftr_jun0.mean(), ftr_jun0.std())
    tgt_jun = util.standardize(tgt_jun0, tgt_jun0.mean(), tgt_jun0.std())
    # FIX: as_matrix() was removed in pandas 1.0 -> to_numpy().
    x0 = ftr_jun0.to_numpy()
    t0 = tgt_jun0.to_numpy().reshape((-1,))
    x = ftr_jun.to_numpy()
    t = tgt_jun.to_numpy().reshape((-1,))
    rf = RandomForestRegressor(n_estimators=n_est, max_depth=100, n_jobs=-1)
    print('Training random forest... ')
    timer_0 = util.Timer()
    rf.fit(x, t)
    timer_0.stop()
    # Persist the fitted model for this hour.
    joblib.dump(rf, os.path.join(data_path, 'datasets',
                                 ('model_importances_jun_%d.pkl' % h)))
    # Mean and spread of per-tree feature importances.
    per_tree = [tree.feature_importances_ for tree in rf.estimators_]
    importances[i + 4, :] = np.mean(per_tree, axis=0)
    stds[i + 4, :] = np.std(per_tree, axis=0)
    # NOTE(review): the [-30:] slice only fits into a full-width row if there
    # are exactly 30 features -- TODO confirm the feature count.
    inds[i + 4, :] = np.argsort(importances[i + 4, :])[-30:]
    print(importances[i + 4, :])
    print(stds[i + 4, :])
    print(inds[i + 4, :])
# Reference forest trained on the full standardized 1M sample; its statistics
# fill row 0 of the result matrices.
rf = RandomForestRegressor(n_estimators=n_est, max_depth=100, n_jobs=-1)
print('Training random forest... ')
timer_0 = util.Timer()
rf.fit(x_all, t_all)
timer_0.stop()
# Persist the fitted model.
joblib.dump(rf, os.path.join(data_path, 'datasets', 'model_importances.pkl'))
# Collect each tree's feature importances once, then reduce.
per_tree = [tree.feature_importances_ for tree in rf.estimators_]
importances[0, :] = np.mean(per_tree, axis=0)
stds[0, :] = np.std(per_tree, axis=0)
inds[0, :] = np.argsort(importances[0, :])[-30:]
print(importances[0, :])
print(stds[0, :])
print(inds[0, :])
# Write each statistics matrix to its own CSV, with the reference feature
# names as column headers.
for matrix, out_name in ((importances, 'importances.csv'),
                         (stds, 'stds.csv'),
                         (inds, 'inds.csv')):
    pd.DataFrame(data=matrix, columns=ftr_table0.columns).to_csv(out_name)
# plt.figure(figsize=(15,15))
# plt.title("Feature Importances for Random Forest (%d trees)" %n_est)
# plt.barh(range(30),importance[ind],
# color="r", xerr=std[ind], alpha=0.4, align="center")
# plt.yticks(range(30), feature_names[ind])
# plt.ylim([-1, len(ind)])
# plt.savefig("importances.png")
Event Timeline
Log In to Comment