# features_old.py (F110902198, attached to R8800 solar_potential)
import numpy as np
import pandas as pd
import xarray as xr
import os
import time
from meteo_data import Meteo_Reader
import h5py
import util
import json


class Training():
    def __init__(self, path, model_name, features, targets, data_type='meteo'):
        self.data_path = os.path.join(path, "raw_data")
        self.model_path = os.path.join(path, "datasets", model_name)
        self.location_path = os.path.join(path, "locations")
        self.features = features
        self.targets = targets
        self.data_type = data_type
        # Create a new folder if it does not exist yet:
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)
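    # Expected directory layout under `path` (inferred from the joins above;
    # the folder names are the literals used in __init__):
    #   <path>/raw_data/            raw meteo input read by Meteo_Reader
    #   <path>/datasets/<model>/    HDF5 datasets and *_norm.csv written here
    #   <path>/locations/           location files used for query sets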
    ###################################### TRAINING DATA ###################################
    def make_dataset(self, start_date, end_date, sample_name=None, test_name=None, k_norm=100):
        # possible adaption: use "select"-dictionary to make class usable for other data types
        """
        if os.path.exists(os.path.join(self.model_path,'features.hdf5')):
            print('Overwriting features...')
        if os.path.exists(os.path.join(self.model_path,'targets.hdf5')):
            print('Overwriting targets...')
        # Set up HDF5 files for features and targets
        ftr_name = os.path.join(self.model_path,"features.hdf5")
        tgt_name = os.path.join(self.model_path,"targets.hdf5")
        n_f = len(self.features)
        n_t = len(self.targets)
        ftr_norm = util.Norm(n_f, k_norm)
        tgt_norm = util.Norm(n_t, k_norm)
        ftr_file = h5py.File(ftr_name, "w")
        tgt_file = h5py.File(tgt_name, "w")
        ftr_ds = ftr_file.create_dataset('features', (0,n_f), maxshape=(None, n_f))
        tgt_ds = tgt_file.create_dataset('targets', (0,n_t), maxshape=(None, n_t))
        """
        # Set up HDF5 files and running normalisers via open_hdf5 (the target
        # call mirrors the feature call; the original line was truncated):
        ftr_file, ftr_ds, ftr_norm = self.open_hdf5("features", self.features, k=k_norm)
        tgt_file, tgt_ds, tgt_norm = self.open_hdf5("targets", self.targets, k=k_norm)
        if self.data_type == 'meteo':
            self.load_meteo(ftr_ds, tgt_ds, ftr_norm, tgt_norm, start_date, end_date,
                            sample=sample_name)
        ftr_file.close()
        tgt_file.close()
    #################################################################################
    def load_meteo(self, ftrs, tgts, ftr_norm, tgt_norm, start_date, end_date,
                   ftrs_tst=None, tgts_tst=None, sample=None, test=None):
        reader = Meteo_Reader(self.data_path)
        locations = 'CH'
        # read variables that are in features or in targets
        vars_to_read = list(set(reader._all_vars) & set(self.features)
                            | set(reader._all_vars) & set(self.targets))
        if sample is not None:
            reader.read_sample(filename=sample, sample_raw=False)
            locations = 'sample'
        if test is not None:
            reader.read_sample(filename=test, sample_name='sample_test', sample_raw=False)
        month_start = pd.date_range(start_date, end_date, freq='MS')
        month_end = pd.date_range(start_date, end_date, freq='M')
        timer = time.clock()
        for (curr_month, curr_month_end) in zip(month_start, month_end):
            timer1 = time.clock()
            reader.read_data(curr_month, curr_month_end, vars_to_read, reset=True, print_log=False)
            train_data = reader.get_subset()
            # CREATE FEATURE AND TARGET ARRAYS
            tbl = reader.make_table(indata=train_data, print_log=False)
            self.to_hdf5(ftrs, tbl, self.features, nrm=ftr_norm)
            self.to_hdf5(tgts, tbl, self.targets, nrm=tgt_norm)
            if test is not None:
                test_data = reader.get_subset(sample_name='sample_test')
                tbl_tst = reader.make_table(indata=test_data, print_log=False)
                self.to_hdf5(ftrs_tst, tbl_tst, self.features, update=False)
                self.to_hdf5(tgts_tst, tbl_tst, self.targets, update=False)
            #ftr_tbl = tbl.as_matrix(columns = self.features)
            #tgt_tbl = tbl.as_matrix(columns = self.targets)
            #ftr_norm.update(ftr_tbl)
            #tgt_norm.update(tgt_tbl)
            # WRITE TO HDF5
            #idx = ftrs.shape[0]
            #ftrs.resize(ftrs.shape[0]+ftr_tbl.shape[0], axis=0)
            #ftrs[idx:,:] = ftr_tbl
            #tgts.resize(tgts.shape[0]+tgt_tbl.shape[0], axis=0)
            #tgts[idx:,:] = tgt_tbl
            print("Iteration: %.2f seconds" % (time.clock() - timer1))
        timer = time.clock() - timer
        print("Finished reading data in %.2f seconds" % (timer))
        ftr_norm.evaluate()
        tgt_norm.evaluate()
        locs = (reader.loc_mask.where(reader.loc_mask[locations] == 1)
                .to_dataframe().dropna().reset_index())
        # write all auxiliary information to csv
        locs[['lon', 'lat', 'x', 'y']].to_csv(os.path.join(self.model_path, "training_locations.csv"))
        pd.Series({'start_date': start_date,
                   'end_date': end_date}).to_csv(os.path.join(self.model_path, "training_dates.csv"))
        ftr_norm.make_table(self.features).to_csv(os.path.join(self.model_path, "feature_norm.csv"))
        tgt_norm.make_table(self.targets).to_csv(os.path.join(self.model_path, "target_norm.csv"))
        metadata = {'start_date': start_date,
                    'end_date': end_date,
                    'norm_features': ftr_norm.make_table(self.features),
                    'norm_targets': tgt_norm.make_table(self.targets),
                    'locations': locs[['lon', 'lat', 'x', 'y']]}
        return metadata
    ########################### AUXILIARIES ###############################################
    def to_hdf5(self, obj, intable, cols, update=True, nrm=None):
        tbl = intable.as_matrix(columns=cols)
        if update:
            nrm.update(tbl)
        idx = obj.shape[0]
        obj.resize(obj.shape[0] + tbl.shape[0], axis=0)
        obj[idx:, :] = tbl
    def open_hdf5(self, name, cols, make_norm=True, k=None):
        filename = os.path.join(self.model_path, name + ".hdf5")
        if os.path.exists(filename):
            print('Overwriting %s ...' % name)
        n = len(cols)
        obj = h5py.File(filename, "w")
        ds = obj.create_dataset(name, (0, n), maxshape=(None, n))
        if make_norm:
            norm = util.Norm(n, k)
            return obj, ds, norm
        return obj, ds
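
# ---------------------------------------------------------------------------
# Usage sketch for Training (hypothetical: the path, feature/target names and
# sample file below are placeholders, not values taken from this repository):
def _example_training():
    trn = Training("/data/solar", "model_v1",
                   features=['hour', 'month', 'lon', 'lat'],  # placeholder columns
                   targets=['irradiance'])                    # placeholder target
    # Builds <model_path>/features.hdf5 and targets.hdf5 month by month and
    # writes the *_norm.csv and training_*.csv side files:
    trn.make_dataset('2015-01-01', '2015-12-31', sample_name='sample_CH.nc')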


class Testing():
    def __init__(self, path, model_name, data_type='meteo'):
        self.data_path = os.path.join(path, "raw_data")
        self.model_path = os.path.join(path, "datasets", model_name)
        self.location_path = os.path.join(path, "locations")
        self.data_type = data_type
        # The model folder must already exist (created during training):
        if not os.path.exists(self.model_path):
            print("Model does not exist - create training data first")
            return
        self.feature_norm = pd.read_csv(os.path.join(self.model_path, "feature_norm.csv"),
                                        index_col=0)
        self.target_norm = pd.read_csv(os.path.join(self.model_path, "target_norm.csv"),
                                       index_col=0)
    def meteo_query(self, name, loc, hour=list(range(3, 20)), month=list(range(1, 13)),
                    year=None, input_vars=None, normalise=True):
        """
        write hdf5 feature table for query data
        Inputs:
            name               name for folder that contains the query data set
            loc                filename for location set
            hour, month, year  LISTS of values that should be added to the query set
            input_vars         include other variables in the query set (NOT YET IMPLEMENTED)
            normalise          perform normalisation of data
        """
        # Check that all prerequisite files are in the correct directory
        if not os.path.exists(os.path.join(self.location_path, loc)):
            print("ERROR: Location file not found - Check %s for %s" % (self.location_path, loc))
            return
        if not os.path.exists(os.path.join(self.model_path, 'features.hdf5')):
            print("ERROR: Training files not found - Check %s" % (self.model_path))
            return
        timer = time.clock()
        # Create dataset according to features
        pts = pd.read_csv(os.path.join(self.location_path, loc))  # read locations
        pts = (pts.reset_index().groupby(['x', 'y']).agg('first')
               .to_xarray().drop('index'))  # create xarray dataset from locations
        # add data as required
        if hour is not None:
            pts.coords['hour'] = hour
        if month is not None:
            pts.coords['month'] = month
        if year is not None:
            pts.coords['year'] = year
        pts = pts.to_dataframe().dropna().reset_index()  # turn back into table of feature points
        # Make feature table
        arr = pts.as_matrix(columns=self.feature_norm.columns)
        if arr.shape[1] != len(self.feature_norm.columns):
            print("ERROR: Not all features given as input")
            return
        # create files and folders
        self.query_path = os.path.join(self.model_path, name)
        self.features_query = os.path.join(self.query_path, "features_query.hdf5")
        if not os.path.exists(self.query_path):
            os.mkdir(self.query_path)
        # write to HDF5 (note: the `normalise` flag is accepted above but no
        # normalisation is applied at this point)
        ftr_file = h5py.File(self.features_query, "w")
        ftr_ds = ftr_file.create_dataset('features', data=arr)
        ftr_file.close()
        timer = time.clock() - timer
        print("Dataset successfully created in %.2f seconds" % (timer))
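
# ---------------------------------------------------------------------------
# Usage sketch for Testing (hypothetical: folder and file names below are
# placeholders; the location file must exist under <path>/locations/):
def _example_query():
    tst = Testing("/data/solar", "model_v1")
    # Writes <model_path>/summer_grid/features_query.hdf5 with one row per
    # (location, hour, month) combination:
    tst.meteo_query("summer_grid", "ch_locations.csv",
                    hour=list(range(3, 20)), month=[6, 7, 8])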


class Table_Object():
    def __init__(self, path, variables, name):
        self.path = path
        self.cols = variables
        self.name = name
        self.n = len(self.cols)

    def add_hdf5(self, add_norm=True, k=None):
        filename = os.path.join(self.path, self.name + ".hdf5")
        if os.path.exists(filename):
            print('Overwriting %s ...' % self.name)
        self.file = h5py.File(filename, "w")
        self.ds = self.file.create_dataset(self.name, (0, self.n),
                                           maxshape=(None, self.n))
        if add_norm:
            self.norm = util.Norm(self.n, k)

    def close_file(self):
        self.file.close()
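
# ---------------------------------------------------------------------------
# Minimal, self-contained sketch of the resize-and-append pattern used by
# to_hdf5/open_hdf5 and Table_Object above (needs only h5py and numpy; the
# file name is arbitrary):
def _example_append_pattern(filename="append_demo.hdf5"):
    n = 3
    with h5py.File(filename, "w") as f:
        ds = f.create_dataset("demo", (0, n), maxshape=(None, n))
        for _ in range(4):                  # four "monthly" chunks
            chunk = np.random.rand(5, n)    # stand-in for one feature table
            idx = ds.shape[0]               # current end of the dataset
            ds.resize(idx + chunk.shape[0], axis=0)
            ds[idx:, :] = chunk             # append the new rows
        assert ds.shape == (20, n)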