# ds_old.py (repository R8800 solar_potential)
import numpy as np
import pandas as pd
import xarray as xr
import os
import time

# NOTE: Radiation_Reader is used below but was never imported in the original
# file; it is assumed here to live in the features module alongside Features.
from features import Features, Radiation_Reader
import h5py
import util

class Dataset:

    def __init__(self, path, model_name, query_name=None):
        self.data_path = os.path.join(path, "raw_data")
        self.model_path = os.path.join(path, "datasets", model_name)
        self.location_path = os.path.join(path, "locations")

        # Create a new folder if it does not exist yet:
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)

        # LOAD TRAINING DATA
        if os.path.exists(os.path.join(self.model_path, "features.hdf5")):
            self.features = os.path.join(self.model_path, "features.hdf5")
            self.targets = os.path.join(self.model_path, "targets.hdf5")
            self.load_normalization()
            print("Set features and targets")
        else:
            print("Features and targets not yet created - run make_dataset before modelling")
            return

        # LOAD TESTING/QUERY DATA
        if query_name is not None:
            # Build the path explicitly; the original referenced self.query_path
            # before it was ever assigned (it is only set in get_query/make_query).
            if os.path.exists(os.path.join(self.model_path, query_name, "features_query.hdf5")):
                self.get_query(query_name)
            else:
                print("Query features not yet created - run make_query before modelling")
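
    # A minimal constructor sketch ("/path/to/project" and "gbm_test" are
    # hypothetical names for illustration, not part of the repository):
    #
    #   ds = Dataset("/path/to/project", "gbm_test")
    #
    # On the first call this only creates <path>/datasets/gbm_test and asks for
    # make_dataset; once features.hdf5 and targets.hdf5 exist, the same call
    # reloads the stored normalisation constants, and passing query_name also
    # re-attaches a previously built query set.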
    ####################################################### QUERY DATA #######################################################

    def get_query(self, query_name):
        """
        Check for the hdf5 feature table of a query data set
        Inputs:
        query_name    name of the folder that contains the query data set
        """
        self.query_path = os.path.join(self.model_path, query_name)
        self.features_query = os.path.join(self.query_path, "features_query.hdf5")
        if not os.path.exists(self.features_query):
            print("Query features not yet created - run make_query before modelling")
        else:
            print("Set %s as query feature set" % self.features_query)

    def make_query(self, name, loc, hour=list(range(3, 20)), month=list(range(1, 13)),
                   year=None, input_vars=None, normalise=True):
        """
        Write hdf5 feature table for query data
        Inputs:
        name                 name of the folder that contains the query data set
        loc                  filename of the location set
        hour, month, year    lists of values that should be added to the query set
        input_vars           include other variables in the query - NOT YET IMPLEMENTED
        normalise            perform normalisation of the data
        """
        # Check that all prerequisite files are in the correct directory
        if not os.path.exists(os.path.join(self.location_path, loc)):
            print("ERROR: Location file not found - Check %s for %s" % (self.location_path, loc))
            return
        if not os.path.exists(os.path.join(self.model_path, 'features.hdf5')):
            print("ERROR: Training files not found - Check %s" % (self.model_path))
            return

        timer = time.perf_counter()  # time.clock() was removed in Python 3.8

        # Create dataset according to features
        pts = pd.read_csv(os.path.join(self.location_path, loc))  # read locations
        # create xarray dataset from locations (Dataset.drop() is now drop_vars())
        pts = pts.reset_index().groupby(['x', 'y']).agg('first').to_xarray().drop_vars('index')

        # add data as required
        if hour is not None:
            pts.coords['hour'] = hour
        if month is not None:
            pts.coords['month'] = month
        if year is not None:
            pts.coords['year'] = year
        pts = pts.to_dataframe().dropna().reset_index()  # turn back into table of feature points

        # Make feature table; DataFrame.as_matrix() was removed from pandas,
        # reindex keeps the column order used during training
        arr = pts.reindex(columns=self.feature_norm.columns).to_numpy()
        if arr.shape[1] != len(self.feature_norm.columns):
            print("ERROR: Not all features given as input")
            return

        # create files and folders
        self.query_path = os.path.join(self.model_path, name)
        self.features_query = os.path.join(self.query_path, "features_query.hdf5")
        if not os.path.exists(self.query_path):
            os.mkdir(self.query_path)

        # normalise and write to HDF5
        ftr_file = h5py.File(self.features_query, "w")
        ftr_ds = ftr_file.create_dataset('features', data=arr)
        if normalise:
            ftr_ds[:, :], _, _ = util.normalize(ftr_ds, Xnorm=self.feature_norm.values)
        ftr_file.close()

        timer = time.perf_counter() - timer
        print("Dataset successfully created in %.2f seconds" % timer)
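
    # A minimal usage sketch for make_query ("summer_noon" and
    # "sample_locations.csv" are hypothetical names; the location file must
    # contain at least the 'x' and 'y' columns plus the trained feature columns):
    #
    #   ds.make_query("summer_noon", "sample_locations.csv",
    #                 hour=[12], month=[6, 7, 8], year=[2015])
    #
    # This writes <model_path>/summer_noon/features_query.hdf5, normalised with
    # the statistics stored in self.feature_norm during training.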

    ################################################### TRAINING DATA ########################################################

    def make_dataset(self, features, targets, start_date, end_date, variables_to_read,
                     raw_format='H', sample_name=None, norm_features="mean",
                     norm_targets="mean", append=False):
        """
        Create HDF5 tables of training features and targets
        (writes features.hdf5, targets.hdf5, feature_norm.csv, target_norm.csv,
        training_locations.csv and training_dates.csv to the model folder)
        Inputs:
        features, targets              lists of feature and target column names
        start_date, end_date           range of dates to be considered
        variables_to_read, raw_format  list of data variables (SIS, SISDIR, SISCF, ALB etc.) & format in which data is read
        sample_name                    name of file containing a subsample (if None, all locations are considered)
        norm_features, norm_targets    desired type of normalisation for features and targets; one of ['range', 'mean', 'none']
        append                         True: add data to end of file - NOT YET IMPLEMENTED
        """
        # Set up reader for raw data
        reader = Radiation_Reader(self.data_path, variables=variables_to_read,
                                  data_format=raw_format, split_raw=True)
        locations = 'CH'
        if sample_name is not None:
            reader.read_sample(filename=sample_name)
            locations = 'sample'

        # Month boundaries over the training period ('MS' = month start, 'M' = month end)
        month_start = pd.date_range(start_date, end_date, freq='MS')
        month_end = pd.date_range(start_date, end_date, freq='M')

        # Set up HDF5 files for features and targets, extendable along axis 0
        self.features = os.path.join(self.model_path, "features.hdf5")
        self.targets = os.path.join(self.model_path, "targets.hdf5")
        n_f = len(features)
        n_t = len(targets)
        ftr_file = h5py.File(self.features, "w")
        tgt_file = h5py.File(self.targets, "w")
        ftr_ds = ftr_file.create_dataset('features', (0, n_f), maxshape=(None, n_f))
        tgt_ds = tgt_file.create_dataset('targets', (0, n_t), maxshape=(None, n_t))

        timer = time.perf_counter()
        for (curr_month, curr_month_end) in zip(month_start, month_end):
            # READ DATA
            reader.add_data(curr_month, curr_month_end, reset=True, print_log=False)

            # CREATE FEATURE AND TARGET ARRAYS (as_matrix() was removed from pandas)
            tbl = reader.make_table(print_log=False)
            ftrs = tbl.reindex(columns=features).to_numpy()
            tgts = tbl.reindex(columns=targets).to_numpy()

            # WRITE TO HDF5: grow both datasets and append this month's rows
            idx = ftr_ds.shape[0]
            ftr_ds.resize(ftr_ds.shape[0] + ftrs.shape[0], axis=0)
            ftr_ds[idx:, :] = ftrs
            tgt_ds.resize(tgt_ds.shape[0] + tgts.shape[0], axis=0)
            tgt_ds[idx:, :] = tgts
        timer = time.perf_counter() - timer
        print("Finished reading data in %.2f seconds" % timer)

        # Normalise data (could also standardize to mean and std deviation by using util.standardize)
        ftr_ds[:, :], ftr_norm, ftr_idx = self.normalise_training(ftr_ds, norm_features)
        self.feature_norm = pd.DataFrame(data=ftr_norm, columns=features, index=ftr_idx)
        tgt_ds[:, :], tgt_norm, tgt_idx = self.normalise_training(tgt_ds, norm_targets)
        self.target_norm = pd.DataFrame(data=tgt_norm, columns=targets, index=tgt_idx)

        # Close HDF5 files and write all auxiliary data
        ftr_file.close()
        tgt_file.close()

        # write all auxiliary files
        locs = (reader.loc_mask.where(reader.loc_mask[locations] == 1)
                .to_dataframe().dropna().reset_index())
        locs[['lon', 'lat', 'x', 'y']].to_csv(os.path.join(self.model_path, "training_locations.csv"))
        pd.Series({'start_date': start_date, 'end_date': end_date, 'format': raw_format}).to_csv(
            os.path.join(self.model_path, "training_dates.csv"))
        self.feature_norm.to_csv(os.path.join(self.model_path, "feature_norm.csv"))
        self.target_norm.to_csv(os.path.join(self.model_path, "target_norm.csv"))
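
    # A minimal usage sketch for make_dataset (feature/target names and dates
    # are assumptions; the actual column names depend on what Radiation_Reader's
    # make_table produces):
    #
    #   ds.make_dataset(features=['x', 'y', 'hour', 'month'], targets=['SIS'],
    #                   start_date='2015-01-01', end_date='2015-12-31',
    #                   variables_to_read=['SIS'])
    #
    # Afterwards the model folder holds features.hdf5, targets.hdf5, the two
    # *_norm.csv files, training_locations.csv and training_dates.csv.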

    ################################################ READ NORMALISATION #######################################################

    def load_normalization(self):
        self.feature_norm = pd.read_csv(os.path.join(self.model_path, "feature_norm.csv"), index_col=0)
        self.target_norm = pd.read_csv(os.path.join(self.model_path, "target_norm.csv"), index_col=0)

    def normalise_training(self, X, norm):
        """
        Normalise a data set and return the normalised values together with
        the normalisation constants ('range': max/min, 'mean': mean/std).
        """
        if norm == "range":
            Xnorm, Xmax, Xmin = util.normalize(X)
            norm_index = ['max', 'min']
            norm_values = [Xmax, Xmin]
        elif norm == "mean":
            Xnorm, Xmean, Xstd = util.standardize(X)
            norm_index = ['mean', 'std']
            norm_values = [Xmean, Xstd]
        else:
            Xnorm = X
            norm_values = []
            norm_index = []
        return Xnorm, norm_values, norm_index
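
    # Return-shape sketch for normalise_training (illustrative; util.normalize
    # and util.standardize are project helpers whose signatures are inferred
    # from the calls above):
    #
    #   Xnorm, vals, idx = ds.normalise_training(X, "mean")
    #   # Xnorm : X standardised column-wise
    #   # vals  : [Xmean, Xstd], one entry per column of X
    #   # idx   : ['mean', 'std'], used as the index of feature_norm / target_norm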