Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F119594845
util_hdf5.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Fri, Jun 27, 21:58
Size
8 KB
Mime Type
text/x-python
Expires
Sun, Jun 29, 21:58 (2 d)
Engine
blob
Format
Raw Data
Handle
27039096
Attached To
R8800 solar_potential
util_hdf5.py
View Options
import
numpy
as
np
import
pandas
as
pd
import
xarray
as
xr
import
csv
from
tables
import
open_file
,
Atom
,
Filters
import
h5py
import
os
import
fasteners
# inter-process file lock
import
time
import
norms
import
pickle
import
util
def get_matrix_pointer(h5file):
    """Open *h5file* with PyTables and return ``(node, h5)``.

    ``node`` is the last node produced by ``h5.walk_nodes()``; the file is
    expected to hold a single dataset, so that last node is the dataset.
    ``h5`` is the still-open file handle - the caller has to close it.
    """
    h5 = open_file(h5file)

    # Walk the whole tree and keep whatever comes last, regardless of its name.
    for visited in h5.walk_nodes():
        node = visited

    return node, h5
def get_matrix(h5file, return_shape=False):
    """Load the matrix stored in *h5file* into memory.

    Parameters
    ----------
    h5file : path of the HDF5 file, handed to ``get_matrix_pointer``.
    return_shape : when true, also return the two dimensions and the dtype
        of the stored node.

    Returns
    -------
    ``X`` or ``(X, N, d, dt)``. ``X`` is the full content of the node, or an
    empty list when reading it failed (a message is printed in that case).

    The file is closed before returning, also when an error propagates.
    """
    node, h5 = get_matrix_pointer(h5file)
    try:
        dt = node.dtype
        N, d = node.shape  # HDF5 files are transposed, for Matlab compatibility
        try:
            X = node[:, :]
        except Exception:
            # Best effort: report and hand back an empty placeholder.
            # KeyboardInterrupt / SystemExit are deliberately not caught.
            print("Cannot load full matrix - matrix size probably exceeds memory")
            X = []
    finally:
        h5.close()  # closing file

    if return_shape:
        return X, N, d, dt
    return X
def write_hdf5(data, file):
    """Write *data* to the HDF5 file *file*, replacing any existing file."""
    stale = os.path.exists(file)
    if stale:
        # Drop the previous file first so that make_hdf5 starts from scratch.
        os.remove(file)

    make_hdf5(data, file)
def make_hdf5(x, h5file):
    """Create *h5file* (mode ``'w'``) and store *x* in a dataset named ``'data'``."""
    with h5py.File(h5file, 'w') as out:
        out.create_dataset('data', data=x)
######################### NORMALIZATION ########################################
class Normalizer():
    """Normalise / rescale data column-wise using statistics stored in a CSV.

    The CSV is read with its first column as index and must provide the rows
    ``mean``, ``std``, ``min``, ``max`` and ``status``. ``status`` selects the
    active mode: ``'mean'`` (subtract mean, divide by std) or ``'range'``
    (map [min, max] onto [lower_bound, upper_bound]). Any other value makes
    ``normalize`` / ``rescale`` print a message and return the input as is.

    Attributes set by ``load_info``: ``mean``, ``std``, ``min``, ``max``
    (float arrays, one entry per column), ``status``, ``m`` (number of
    columns), ``names`` (the CSV column labels) and ``file`` (the CSV path).
    """

    def __init__(self, filename):
        self.load_info(filename)

    @staticmethod
    def _row_as_float(table, label):
        """Return the row *label* of *table* as a float64 array."""
        # np.float64 instead of the np.float alias, which NumPy removed.
        return table.loc[label, :].values.astype(np.float64)

    def load_info(self, filename):
        """Read the statistics table from *filename* and cache it on self."""
        norm_pd = pd.read_csv(filename, index_col=0)

        self.mean = self._row_as_float(norm_pd, 'mean')
        self.std = self._row_as_float(norm_pd, 'std')
        self.min = self._row_as_float(norm_pd, 'min')
        self.max = self._row_as_float(norm_pd, 'max')
        # The status row repeats one value per column; the first one is used.
        self.status = norm_pd.loc['status', :].values[0]

        self.m = len(self.mean)
        self.names = norm_pd.columns

        self.std[self.std == 0] = 1  # prevent division by zero for std=0
        self.file = filename

    def normalize(self, X, lower_bound=0, upper_bound=1):
        """Normalise *X* according to the current status.

        The bounds are only used in ``'range'`` mode.
        """
        if self.status == 'mean':
            return self.normalize_mean(X)
        elif self.status == 'range':
            return self.normalize_range(X, lower_bound, upper_bound)
        else:
            print('cannot normalize - mode not known')
            return X

    def rescale(self, X, lower_bound=0, upper_bound=1):
        """Invert ``normalize`` for the current status.

        The bounds are only used in ``'range'`` mode.
        """
        if self.status == 'mean':
            return self.rescale_mean(X)
        elif self.status == 'range':
            return self.rescale_range(X, lower_bound, upper_bound)
        else:
            print('cannot rescale - mode not known')
            return X

    def normalize_range(self, X, lower_bound=0, upper_bound=1):
        """Map *X* linearly so that [min, max] becomes [lower_bound, upper_bound].

        NOTE: there is no guard for columns where max == min (division by
        zero), unlike the std == 0 guard applied in ``load_info``.
        """
        span = self.max - self.min
        return lower_bound + (X - self.min) * (upper_bound - lower_bound) / span

    def rescale_range(self, X, lower_bound=0, upper_bound=1):
        """Inverse of ``normalize_range`` for the same bounds."""
        span = self.max - self.min
        return self.min + (X - lower_bound) * span / (upper_bound - lower_bound)

    def normalize_mean(self, X):
        """Return ``(X - mean) / std``."""
        return (X - self.mean) / (self.std)

    def rescale_mean(self, X):
        """Inverse of ``normalize_mean``: ``mean + X * std``."""
        return self.mean + X * self.std

    def set_status(self, new_status):
        """Persist *new_status* in the CSV's ``status`` row and on this object."""
        norm_pd = pd.read_csv(self.file, index_col=0)
        # A scalar is broadcast over every column of the row.
        norm_pd.loc['status'] = new_status
        norm_pd.to_csv(self.file)

        self.status = new_status
        print("set normalization status to " + new_status)

    def load_pickle(self, refname):
        """Load a pickled DataFrame and return ``(raw, normalised)`` arrays.

        Only the columns listed in ``self.names`` are taken, in that order.

        NOTE(security): unpickling executes arbitrary code; only call this on
        files from a trusted source.
        """
        ref = pd.read_pickle(refname)
        # DataFrame.as_matrix was removed from pandas; select + .values is
        # the equivalent.
        x0 = ref[list(self.names)].values
        x = self.normalize(x0)
        return x0, x
class Norm():
    """Accumulate per-column min, max, mean and std over successive batches.

    ``update`` is called once per batch, ``evaluate`` turns the accumulated
    sums into ``mean`` / ``std``, and ``make_table`` exports the result as a
    DataFrame with the rows ``mean``, ``std``, ``max``, ``min``, ``status``.

    Parameters
    ----------
    m : number of columns; the accumulators are only created when ``m > 0``.
    k : divisor applied to the data before summing (presumably to keep the
        partial sums small - confirm with the author); ``evaluate`` undoes it.
    column_names : labels used for the columns of the exported table.
    """

    def __init__(self, m=0, k=100, column_names=None):
        if m > 0:
            # initialise tables
            # one row per batch: m scaled sums, m scaled squared sums, row count
            self.norm_tbl = np.zeros((0, 2 * m + 1))
            self.max = np.ones((1, m)) * (float('-inf'))
            self.min = np.ones((1, m)) * (float('inf'))
            self.mean = np.zeros((1, m))
            self.std = np.zeros((1, m))

        self.names = column_names
        self.k = k
        self.m = m

    def update(self, X):
        """Fold the batch *X* (rows = samples, columns = variables) into the statistics."""
        # minimum and maximum
        Xmax = np.amax(X, axis=0)
        Xmin = np.amin(X, axis=0)
        self.max = np.maximum(self.max, Xmax)
        self.min = np.minimum(self.min, Xmin)

        # mean and std
        divided_sum_of_elements = np.sum(X, axis=0) / self.k
        divided_squared_sum_of_elements = np.sum(np.square(X / self.k), axis=0)
        n_items = X.shape[0]

        new_row = np.hstack([divided_sum_of_elements,
                             divided_squared_sum_of_elements,
                             n_items])
        self.norm_tbl = np.vstack([self.norm_tbl, new_row])

    def evaluate(self):
        """Compute ``self.mean`` and ``self.std`` from all batches seen so far."""
        sigma = np.sum(self.norm_tbl[:, :self.m], axis=0)
        sigma_sq = np.sum(self.norm_tbl[:, self.m:-1], axis=0)
        N = np.sum(self.norm_tbl[:, -1])

        self.mean = sigma * self.k / N
        variance = self.k ** 2 / (N) * (sigma_sq - np.square(sigma) / N)
        # E[X^2] - mean^2 can round to a tiny negative number for (nearly)
        # constant columns; clamp so that sqrt yields 0 instead of NaN.
        self.std = np.sqrt(np.maximum(variance, 0))

    def make_table(self):
        """Return the statistics as a DataFrame; the ``status`` row is ``'none'``."""
        tbl = np.vstack([self.mean, self.std, self.max, self.min])
        lst = ['mean', 'std', 'max', 'min']
        tbl = pd.DataFrame(data=tbl, columns=self.names, index=lst)
        sts = pd.DataFrame(data=np.repeat('none', self.m).reshape((1, -1)),
                           index=['status'], columns=self.names)
        return pd.concat([tbl, sts])
####################### OPERATIONS ON HDF5 TABLES ###############################
def merge_files(Y, M, norm=None, outname=None, batches=False):
    """Combine the ``M`` partial result files ``<name>_00.hdf5`` .. into mean and variance.

    ``<name>`` is the base name of *Y* without extension; the partial files
    are looked up in the directory of *Y*.

    Parameters
    ----------
    Y : path whose directory and stem identify the partial files.
    M : number of partial files.
    norm : optional object with a ``rescale`` method, applied to every
        matrix before accumulation.
    outname : only forwarded to ``norms.merge_files`` in batch mode.
    batches : when true, delegate everything to ``norms.merge_files`` and
        return ``(0, 0)``.

    Returns
    -------
    ``(mean, var)``: element-wise mean and variance over the ``M`` matrices.

    Side effect: every partial file is deleted after it has been read.
    """
    path = os.path.split(Y)[0]
    body = os.path.splitext(Y)[0]
    name = os.path.split(body)[1]

    if batches:
        norms.merge_files(path, name, M, outname, norm)
        return 0, 0

    # Peek into the first file only to learn the shape of the matrices.
    f = open_file(os.path.join(path, '%s_00.hdf5' % (name)), "r")
    try:
        for node in f.walk_nodes():
            pass  # find a node with whatever name
        shape = node.shape
    finally:
        # Close before the loop below reopens and removes this very file.
        f.close()

    sigma = np.zeros(shape)
    sigma_sq = np.zeros(shape)

    for i in range(M):
        h5file = os.path.join(path, '%s_%02d.hdf5' % (name, i))
        y = get_matrix(h5file)
        os.remove(h5file)

        if norm is not None:
            y = norm.rescale(y)

        sigma += y
        sigma_sq += y ** 2

    mean = sigma / M
    var = 1.0 / M * (sigma_sq - sigma ** 2 / M)
    return mean, var
class Table_Writer():
    """Append row batches to a resizable HDF5 dataset, optionally tracking statistics.

    Usage: ``open_hdf5`` -> repeated ``write_hdf5`` -> ``close_hdf5``.

    Parameters
    ----------
    path : directory in which ``<name>.hdf5`` is created.
    variables : column labels; their count fixes the dataset width ``m``.
    name : file stem and also the name of the dataset inside the file.
    """

    def __init__(self, path, variables, name):
        self.path = path
        self.cols = variables
        self.name = name
        self.m = len(self.cols)
        self.n = 0  # number of rows written so far
        self.make_norm = False

    def open_hdf5(self, add_norm=True, k=100, dtype='Float64'):
        """Create (or overwrite) the file and an empty ``(0, m)`` dataset.

        With *add_norm* a ``Norm(m, k, cols)`` accumulator is attached and
        updated on every write.
        """
        filename = os.path.join(self.path, self.name + ".hdf5")
        if os.path.exists(filename):
            print('Overwriting %s...' % self.name)

        # The capitalised alias 'Float64' is rejected by recent NumPy
        # releases; translate it to the canonical spelling.
        if isinstance(dtype, str) and dtype == 'Float64':
            dtype = 'float64'

        self.file = h5py.File(filename, "w")
        self.ds = self.file.create_dataset(self.name, (0, self.m),
                                           maxshape=(None, self.m),
                                           dtype=dtype)

        if add_norm:
            self.make_norm = True
            self.norm = Norm(self.m, k, self.cols)

    def write_hdf5(self, arr, table=True):
        """Append *arr* to the dataset.

        With ``table=True`` *arr* is a DataFrame from which the columns
        ``self.cols`` are taken; otherwise it is used as a 2-D array as is.
        """
        if table:
            # DataFrame.as_matrix was removed from pandas; select + .values
            # is the equivalent.
            arr = arr[list(self.cols)].values

        if arr.shape[0] == 0:
            # Nothing to append; this also keeps Norm.update from calling
            # np.amax on a zero-size batch, which raises.
            return

        idx = self.ds.shape[0]
        self.ds.resize(self.ds.shape[0] + arr.shape[0], axis=0)
        self.ds[idx:, :] = arr

        if self.make_norm:
            self.norm.update(arr)

        self.n = self.ds.shape[0]

    def close_hdf5(self):
        """Close the file; return the statistics table, or None without norm."""
        self.file.close()
        if self.make_norm:
            self.norm.evaluate()
            return self.norm.make_table()
        return None
class Split_Training():
    """Normalise incoming data and distribute its rows over a training and a validation file.

    Both output files live in ``<path>/train`` and are called
    ``train_<data_type>.hdf5`` and ``val_<data_type>.hdf5``. The
    normalisation parameters are read from ``<path>/norm_<data_type>.csv``.
    ``file_found`` tells whether at least one of the two output files already
    existed when the object was created.
    """

    def __init__(self, path, data_type, set_norm, norm_type, split_ratio):
        train_dir = os.path.join(path, 'train')
        train_label = 'train_' + data_type
        val_label = 'val_' + data_type

        def hdf5_in_train_dir(stem):
            return os.path.join(train_dir, stem + '.hdf5')

        has_train = os.path.exists(hdf5_in_train_dir(train_label))
        has_val = os.path.exists(hdf5_in_train_dir(val_label))
        self.file_found = has_train or has_val

        self.file = hdf5_in_train_dir(data_type)

        # ASSUME THAT norm_file IS LOCATED ONE LEVEL ABOVE HDF5 files
        csv_name = "norm_" + data_type + ".csv"
        self.norm = Normalizer(os.path.join(path, csv_name))
        if set_norm:
            self.norm.set_status(norm_type)

        columns = self.norm.names
        self.train = Table_Writer(train_dir, columns, train_label)
        self.val = Table_Writer(train_dir, columns, val_label)

        self.split_ratio = split_ratio

    def open_all(self):
        """Open both output files, without statistics tracking."""
        for writer in (self.train, self.val):
            writer.open_hdf5(add_norm=False)

    def split_and_norm(self, X, nums=None):
        """Normalise *X* and append its rows to the training / validation files.

        *nums* holds one flag per row: 1 sends the row to training, 0 to
        validation. When omitted, the flags are drawn at random with
        probability ``split_ratio`` for 1. The flags that were used are
        returned.
        """
        row_count = X.shape[0]
        scaled = self.norm.normalize(X)

        if nums is None:
            p_train = self.split_ratio
            nums = np.random.choice([0, 1], size=row_count,
                                    p=[1 - p_train, p_train])

        train_rows = scaled[nums == 1, :]
        val_rows = scaled[nums == 0, :]

        self.train.write_hdf5(train_rows, table=False)
        self.val.write_hdf5(val_rows, table=False)
        return nums

    def close_all(self):
        """Close both output files."""
        for writer in (self.train, self.val):
            writer.close_hdf5()
Event Timeline
Log In to Comment