ds.py (R8800 solar_potential)
import numpy as np
import pandas as pd
import xarray as xr
import os
import time
import h5py

import util
from features import Testing
import norms
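
# Overview: the Dataset class below manages the on-disk layout of this project.
# It resolves paths to the HDF5 feature, target and prediction files for the
# train / validation / test / query splits, loads the feature and target
# normalizers, and offers helpers to read the matrices and assemble them into
# labelled xarray datasets.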
class Dataset():

    def __init__(self, path, ds_name, query_name=None):
        self.data_path = os.path.join(path, "raw_data")
        self.ds_path = os.path.join(path, "datasets", ds_name)
        self.location_path = os.path.join(path, "locations")
        self.train_path = os.path.join(self.ds_path, 'train')
        self.test_path = os.path.join(self.ds_path, 'test')
        self.query_path = None

        # Abort if the dataset folder does not exist yet:
        if not os.path.exists(self.ds_path):
            print("Features and targets not yet created - make features first")
            return

        self.load_training()
        self.load_validation()
        self.load_testing()
        self.load_metadata()

        # LOAD TESTING/QUERY DATA
        if query_name is not None:
            self.load_query(query_name)
    ################################################ READ Filenames #######################################################

    def load_normalization(self):
        self.feature_norm = util.Normalizer(os.path.join(self.ds_path, "norm_features.csv"))
        self.target_norm = util.Normalizer(os.path.join(self.ds_path, "norm_targets.csv"))
    def load_metadata(self):
        dates = pd.read_csv(os.path.join(self.train_path, 'train_dates.csv'),
                            header=None, index_col=0)
        # .to_numpy() replaces the deprecated .as_matrix() from older pandas
        self.train_start = str(dates.loc['start_date'].to_numpy()[0])
        self.train_end = str(dates.loc['end_date'].to_numpy()[0])
        self.train_locs = pd.read_csv(os.path.join(self.train_path, 'train_locations.csv'),
                                      index_col=0)
    def load_training(self):
        # LOAD TRAINING DATA
        if (os.path.exists(os.path.join(self.train_path, "features.hdf5"))
                and os.path.exists(os.path.join(self.train_path, "targets.hdf5"))):
            self.train_X = os.path.join(self.train_path, "train_features.hdf5")
            self.train_X0 = os.path.join(self.train_path, "features.hdf5")
            self.train_T = os.path.join(self.train_path, "train_targets.hdf5")
            self.train_T0 = os.path.join(self.train_path, "targets.hdf5")
            self.load_normalization()
            print("Set training features and targets")
        else:
            print("Training features and targets not yet created")
            return
    def load_validation(self):
        # LOAD VALIDATION DATA
        if (os.path.exists(os.path.join(self.train_path, "features.hdf5"))
                and os.path.exists(os.path.join(self.train_path, "targets.hdf5"))):
            self.val_X = os.path.join(self.train_path, "val_features.hdf5")
            self.val_T = os.path.join(self.train_path, "val_targets.hdf5")
            print("Set validation features and targets")
            self.val_Y = os.path.join(self.train_path, "val_prediction.hdf5")
        else:
            print("Validation features and targets not yet created")
    def load_testing(self):
        # LOAD TESTING DATA
        if (os.path.exists(os.path.join(self.test_path, "features.hdf5"))
                and os.path.exists(os.path.join(self.test_path, "targets.hdf5"))):
            self.test_X = os.path.join(self.test_path, "test_features.hdf5")
            self.test_X0 = os.path.join(self.test_path, "features.hdf5")
            self.test_T = os.path.join(self.test_path, "test_targets.hdf5")
            self.test_T0 = os.path.join(self.test_path, "targets.hdf5")
            print("Set testing features and targets")
        else:
            print("Testing features and targets not yet created")
    def load_query(self, query_name):
        self.query_path = os.path.join(self.ds_path, 'query', query_name)
        # LOAD QUERY DATA
        if os.path.exists(os.path.join(self.query_path, "features.hdf5")):
            self.query_X = os.path.join(self.query_path, "query_features.hdf5")
            self.query_X0 = os.path.join(self.query_path, "features.hdf5")
            print("Set query features and targets")
        else:
            print("Query features and targets not yet created")
    def rescale_query(self, force_rescaling=False):
        self.rescale_output('query_prediction.hdf5', 'prediction', self.query_path_out,
                            force_rescaling=force_rescaling)
    ############################### add model ######################################

    def add_model(self, modelname, query_name=None):
        self.model_path = os.path.join(self.ds_path, modelname)
        self.train_path_out = os.path.join(self.model_path, 'train')
        self.test_path_out = os.path.join(self.model_path, 'test')
        if query_name is not None:
            self.query_path_out = os.path.join(self.model_path, 'query', query_name)

        if os.path.exists(self.model_path):
            print("Model directory already exists - no new folders created")
        else:
            os.mkdir(self.model_path)
            os.mkdir(os.path.join(self.model_path, 'train'))
            os.mkdir(os.path.join(self.model_path, 'test'))
            os.mkdir(os.path.join(self.model_path, 'query'))
            # Mirror the existing query sub-directories (top level only) into
            # the new model folder.
            for subdir, dirs, files in os.walk(os.path.join(self.ds_path, 'query')):
                for directory in dirs:
                    os.mkdir(os.path.join(self.model_path, 'query', directory))
                break

        self.train_Y = os.path.join(self.train_path_out, "train_prediction.hdf5")
        self.train_Y0 = os.path.join(self.train_path_out, "prediction.hdf5")
        self.test_Y = os.path.join(self.test_path_out, "test_prediction.hdf5")
        self.test_Y0 = os.path.join(self.test_path_out, "prediction.hdf5")

        if query_name is not None:
            if os.path.exists(self.query_path_out):
                self.query_Y = os.path.join(self.query_path_out, "query_prediction.hdf5")
                self.query_Y0 = os.path.join(self.query_path_out, "prediction.hdf5")
            else:
                print('ERROR: Query not found')
    def add_query_to_model(self, query_name, modelname=None):
        if modelname is not None:
            self.model_path = os.path.join(self.ds_path, modelname)
        self.query_path_out = os.path.join(self.model_path, 'query', query_name)
        if os.path.exists(self.query_path_out):
            self.query_Y = os.path.join(self.query_path_out, "query_prediction.hdf5")
            self.query_Y0 = os.path.join(self.query_path_out, "prediction.hdf5")
        else:
            print('ERROR: Query not found')
    ############################### make dataset #####################################

    def make_xarray(self, x, y, rescale):
        coords = ['x', 'y', 'hour', 'month']
        tbl_idx = list(set(coords) & set(self.feature_norm.names))

        if isinstance(x, str):
            x = util.get_matrix(x)
        if isinstance(y, str):
            y = util.get_matrix(y)

        if rescale:
            x = self.feature_norm.rescale(x)
            y = self.target_norm.rescale(y)

        X = pd.DataFrame(data=x, columns=self.feature_norm.names)
        Y = pd.DataFrame(data=y, columns=self.target_norm.names)
        tbl = pd.concat([X, Y], axis=1)
        tbl.set_index(tbl_idx, inplace=True)
        tbl = tbl.groupby(tbl_idx).mean()  # make table unique
        return tbl.to_xarray()
    ###################################################################################

    def get_matrices(self, variables=['train', 'test', 'val']):
        for v in variables:
            if v == 'train':
                self.train_x = util.get_matrix(self.train_X)
                self.train_t = util.get_matrix(self.train_T)
            if v == 'val':
                self.val_x = util.get_matrix(self.val_X)
                self.val_t = util.get_matrix(self.val_T)
            if v == 'test':
                self.test_x = util.get_matrix(self.test_X)
                self.test_t = util.get_matrix(self.test_T)
            if v == 'query' and self.query_path is not None:
                self.query_x = util.get_matrix(self.query_X)
    def rescale_output(self, h5file, target_name, path, set_norm=False, norm_type='none',
                       force_rescaling=False):
        infile = os.path.join(path, h5file)
        outfile = os.path.join(path, target_name)
        if os.path.exists(infile):
            if set_norm:
                self.target_norm.set_status(norm_type)
            norms.rescale_hdf5_copy(infile, outfile, self.target_norm,
                                    force_rescaling=force_rescaling)
        else:
            print("Query outputs to rescale do not exist")
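

# A minimal usage sketch, not part of the original file. The root directory
# '/data/solar', dataset name 'ds0', model name 'mlp' and query name 'query0'
# are hypothetical placeholders; the HDF5 feature/target files and the
# query_prediction.hdf5 output are assumed to have been produced by the
# feature-generation and model-training steps.
if __name__ == '__main__':
    ds = Dataset('/data/solar', 'ds0', query_name='query0')

    # Register output paths for a model and load the raw training matrices.
    ds.add_model('mlp', query_name='query0')
    ds.get_matrices(variables=['train', 'val'])

    # Once a model has written query_prediction.hdf5, undo the normalization
    # and turn the validation split into a labelled xarray dataset.
    ds.rescale_query(force_rescaling=True)
    arr = ds.make_xarray(ds.val_X, ds.val_T, rescale=True)
    print(arr)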