Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F110949260
meteo_data.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Mon, Apr 28, 22:22
Size
9 KB
Mime Type
text/x-python
Expires
Wed, Apr 30, 22:22 (2 d)
Engine
blob
Format
Raw Data
Handle
25841223
Attached To
R8800 solar_potential
meteo_data.py
View Options
import
numpy
as
np
import
time
import
pandas
as
pd
import
xarray
as
xr
import
os
import
util
from
meteo_raw
import
Meteo_Raw
class Meteo_Reader():
    """Read yearly/monthly netCDF meteo files and derive samples and tables.

    The reader keeps the currently loaded dataset in ``self.data`` and a
    location mask dataset (``locations_mask.nc``) in ``self.loc_mask``.
    Sampling masks are stored as additional variables of ``self.loc_mask``.
    """

    def __init__(self, data_path, variables=None, data_format='H'):
        """Set up the reader and load the location mask.

        Parameters:
        - data_path: folder containing ``locations_mask.nc`` and the
          yearly (``<year>.nc``) / monthly (``<YYYY-MM>.nc``) data files
        - variables: list with any combination of the names in
          ``self._all_vars``; ``None`` selects all of them
        - data_format: 'H': hourly values; 'MMH': monthly-mean-hourly
        """
        self._all_vars = ['SIS', 'SISDIR', 'SISCF', 'SISDIRCF',
                          'KI', 'ALB', 'KI_SIS', 'KI_SISDIR']
        self.path = data_path
        self.format = data_format

        # Copy the master list so that later edits of ``self.variables``
        # cannot silently change ``self._all_vars``.
        self.variables = list(self._all_vars) if variables is None else variables
        self.drop_vars = list(set(self._all_vars) - set(self.variables))

        mask_file = os.path.join(data_path, 'locations_mask.nc')
        print(data_path)
        print(mask_file)
        self.loc_mask = xr.open_dataset(mask_file)
        # Names of the location features available in the mask file.
        self._loc_ftrs = self.loc_mask.to_dataframe().columns
        # self.loc_mask['sample'] = self.loc_mask['CH']

        # INITIALISATIONS
        self.init_data = True
        self.init_reading = True
        self.sample = False
        self.loaded_hourmask = False

    def _select_variables(self, variables):
        """Update ``self.variables`` (if given) and recompute ``self.drop_vars``."""
        if variables is not None:
            self.variables = variables
        self.drop_vars = list(set(self._all_vars) - set(self.variables))

    ##########################################################################################

    def read_yearly(self, year, variables=None, hourmask=True):
        """Load ``<year>.nc`` from the data folder into ``self.data``.

        Parameters:
        - year: year to read (used to build the file name)
        - variables: optional new variable selection (see ``__init__``)
        - hourmask: if True, merge the hour mask for ``year`` into the data

        Sets ``self.start`` / ``self.end`` to the first / last date of the
        loaded data, formatted as ``%Y%m%d`` strings.
        """
        self._select_variables(variables)

        filename = str(year) + '.nc'
        filepath = os.path.join(self.path, filename)
        if not os.path.exists(filepath):
            print("file %s does not exist - please create file from raw data" % filename)

        try:
            self.data = xr.open_dataset(filepath)
            self.data = self.data.drop(self.drop_vars)
            if self.sample:
                self.data = self.data.where(self.loc_mask['sample'] == 1)
        except Exception:
            # Best effort: report and continue, but do not swallow
            # KeyboardInterrupt / SystemExit as a bare ``except`` would.
            print("failed to read data in %s" % filename)

        if hourmask:
            self.data = self.add_hourmask(year=year)

        dates = self.data.date.values
        self.start = pd.to_datetime(dates[0]).strftime("%Y%m%d")
        self.end = pd.to_datetime(dates[-1]).strftime("%Y%m%d")

    def read_data(self, start_date, end_date, variables=None, hourmask=True,
                  reset=False, print_log=True):
        """Load all monthly files between two dates into ``self.data``.

        Parameters:
        - start_date, end_date: bounds passed to ``pandas.date_range``; one
          file ``<YYYY-MM>.nc`` is read per month start in that range
        - variables: optional new variable selection (see ``__init__``)
        - hourmask: if True, add the hour mask to the data
        - reset: if True, replace ``self.data`` instead of merging into it
        - print_log: if True, print the elapsed reading time

        Missing monthly files are created through ``Meteo_Raw.make_monthly``.
        Months that fail to load are reported and skipped.

        Raises:
        - ValueError: if no monthly data could be read at all
        """
        self._select_variables(variables)
        reader = Meteo_Raw(self.path, variables=self._all_vars)

        if reset:
            self.init_reading = True

        month_start = pd.date_range(start_date, end_date, freq='MS')
        month_end = pd.date_range(start_date, end_date, freq='M')

        # time.clock() was removed in Python 3.8; perf_counter() is the
        # documented replacement for measuring elapsed time.
        timer = time.perf_counter()

        dataset = None
        for curr_month, curr_month_end in zip(month_start, month_end):
            monthfile = curr_month.strftime('%Y-%m') + '.nc'
            monthpath = os.path.join(self.path, monthfile)
            print(monthfile)

            if not os.path.exists(monthpath):
                print("file %s does not exist - creating file from raw data" % monthfile)
                reader.make_monthly(curr_month, curr_month_end)

            try:
                new_data = xr.open_dataset(monthpath)
                new_data = new_data.drop(self.drop_vars)
                if self.sample:
                    new_data = new_data.where(self.loc_mask['sample'] == 1)

                if self.format == 'MMH':
                    # create monthly mean hourly for the current month
                    new_data = new_data.mean(dim='date')
                    new_data.coords['date'] = curr_month
                elif self.format != 'H':
                    print("Unknown format to read data")

                if hourmask:
                    new_data = self.add_hourmask(year=curr_month.year,
                                                 month=curr_month.month,
                                                 data=new_data)

                if dataset is None:
                    dataset = new_data
                else:
                    dataset = xr.concat([dataset, new_data], 'date')
            except Exception:
                # Best effort: skip this month but keep reading the others.
                print("failed to read or concatenate data in %s" % monthfile)

        if dataset is None:
            # Previously this surfaced as an UnboundLocalError further down.
            raise ValueError("no monthly data could be read between %s and %s"
                             % (start_date, end_date))

        if self.init_reading:
            self.data = dataset
            self.init_reading = False
        else:
            self.data = xr.merge([self.data, dataset])

        elapsed = time.perf_counter() - timer
        if print_log:
            print("Finished reading in %.2f seconds" % (elapsed))

        if hourmask:
            # Without a year this only has an effect for the 'MMH' format.
            self.data = self.add_hourmask()

        # NOTE(review): start is kept as a Timestamp while end is a string,
        # unlike read_yearly where both are strings - confirm with callers
        # before unifying.
        self.start = month_start[0]
        self.end = month_end[-1].strftime("%Y%m%d")

    ########################################### SUBSETS ###################################################

    def read_sample(self, matrix_in=None, filename=None, sample_name='sample',
                    sample_raw=True):
        """Store a sampling matrix as variable ``sample_name`` of the mask.

        Parameters:
        - matrix_in: array with the sampling locations
        - filename: text file to load the matrix from (takes precedence
          over ``matrix_in``)
        - sample_name: name of the mask variable to create
        - sample_raw: if True, subsequent reads are restricted to the sample

        Raises:
        - ValueError: if neither ``matrix_in`` nor ``filename`` is given
        """
        # read input matrix for sampling locations
        if filename is not None:
            matrix_in = np.loadtxt(filename)
        if matrix_in is None:
            raise ValueError("either matrix_in or filename must be given")

        coords = self.loc_mask.coords
        try:
            self.loc_mask[sample_name] = xr.DataArray(matrix_in, coords=coords)
        except ValueError:
            # Retry with the transposed matrix if the orientation did not fit.
            self.loc_mask[sample_name] = xr.DataArray(matrix_in.T, coords=coords)

        if sample_raw:
            self.sample = True

    def create_regular_sample(self, downsampling_ratio, filename=None,
                              sample_raw=True):
        """Create a regular grid sample and store it as ``loc_mask['sample']``.

        Every ``floor(sqrt(downsampling_ratio))``-th point along both axes
        is selected and then multiplied with the 'CH' mask.

        Parameters:
        - downsampling_ratio: ratio of total to sampled points (>= 1)
        - filename: if given, the sample matrix is saved there as text
        - sample_raw: if True, subsequent reads are restricted to the sample

        Raises:
        - ValueError: if ``downsampling_ratio`` is smaller than 1
        """
        if not downsampling_ratio >= 1:
            # A smaller ratio yields a zero (or NaN) slice step.
            raise ValueError("downsampling_ratio must be >= 1, got %r"
                             % (downsampling_ratio,))

        n_lat = len(self.loc_mask.lat.values)
        n_lon = len(self.loc_mask.lon.values)
        smpl_int = int(np.floor(np.sqrt(downsampling_ratio)))

        # CREATE MATRIX OF THE SAME SIZE AS LOC_MASK
        matrix_reg = np.zeros((n_lat, n_lon))
        matrix_reg[::smpl_int, ::smpl_int] = 1
        matrix_reg = matrix_reg * self.loc_mask.CH.values.T

        self.loc_mask['sample'] = xr.DataArray(matrix_reg, coords=self.loc_mask.coords)
        if sample_raw:
            self.sample = True
        if filename is not None:
            np.savetxt(filename, matrix_reg)

    def _draw_random_mask(self, mask_name, n_draw):
        """Return a 0/1 matrix marking ``n_draw`` random points of a mask.

        Candidates are the rows of the flattened location mask that remain
        after keeping only points where ``loc_mask[mask_name] == 1``.
        """
        n_lat = len(self.loc_mask.lat.values)
        n_lon = len(self.loc_mask.lon.values)

        masked = self.loc_mask.where(self.loc_mask[mask_name] == 1)
        # indices in the flattened mask that belong to the requested mask
        candidates = masked.to_dataframe().reset_index().dropna().index
        chosen = np.random.permutation(candidates)[:n_draw]

        flat = np.zeros((n_lon * n_lat, 1))
        flat[chosen] = 1
        return flat.reshape((n_lat, n_lon))

    def create_random_sample(self, n_sample, filename=None, sample_raw=True):
        """Create a random sample inside the 'CH' mask.

        Parameters:
        - n_sample: number of points to draw
        - filename: if given, the sample matrix is saved there as text
        - sample_raw: if True, subsequent reads are restricted to the sample

        The result is stored as ``loc_mask['sample']``.
        """
        matrix_rand = self._draw_random_mask('CH', n_sample)
        self.loc_mask['sample'] = xr.DataArray(matrix_rand, coords=self.loc_mask.coords)
        if sample_raw:
            self.sample = True
        if filename is not None:
            np.savetxt(filename, matrix_rand)

    def split_sample(self, ratio=0.8, n_split=None, filename=None):
        """Split ``loc_mask['sample']`` randomly into train and test masks.

        Parameters:
        - ratio: share of the sample assigned to the training mask; only
          used when ``n_split`` is None
        - n_split: explicit number of training points
        - filename: if given, the masks are saved as ``<stem>_train.txt``
          and ``<stem>_test.txt`` (stem = ``filename`` without extension)

        The masks are stored as ``loc_mask['sample_train']`` and
        ``loc_mask['sample_test']``.
        """
        if n_split is None:
            n_split = int(round(ratio * np.sum(self.loc_mask['sample'].values)))

        matrix_rand = self._draw_random_mask('sample', n_split)
        self.loc_mask['sample_train'] = xr.DataArray(matrix_rand,
                                                     coords=self.loc_mask.coords)
        self.loc_mask['sample_test'] = (self.loc_mask['sample']
                                        - self.loc_mask['sample_train'])

        if filename is not None:
            stem = os.path.splitext(filename)[0]
            np.savetxt(stem + '_train.txt', matrix_rand)
            np.savetxt(stem + '_test.txt', self.loc_mask['sample_test'].values)

    def get_subset(self, matrix_in=None, filename=None, sample_name=None,
                   print_log=True):
        """Return ``self.data`` restricted to the points of a sample mask.

        Parameters:
        - matrix_in, filename: optional new sample definition, forwarded to
          ``read_sample`` (without restricting subsequent reads)
        - sample_name: mask variable to use; defaults to 'sample'
        - print_log: accepted for interface compatibility; not used here
        """
        if sample_name is None:
            sample_name = 'sample'
        if matrix_in is not None or filename is not None:
            self.read_sample(matrix_in, filename, sample_name, sample_raw=False)
        return self.data.where(self.loc_mask[sample_name] == 1)

    def add_hourmask(self, year=None, month=None, data=None):
        """Merge the hour mask into a dataset and return the result.

        Parameters:
        - year: year of the data; selects which mask (if any) is applied
        - month: optional month used to cut the daily mask
        - data: dataset to extend; defaults to ``self.data``

        For format 'MMH' the file ``hour_mask_mmh.nc`` is merged only when
        ``year`` is None. For other formats ``hour_mask_day.nc`` is merged
        only when ``year`` is given. In all remaining cases ``data`` is
        returned unchanged.
        """
        if data is None:
            data = self.data

        if self.format == 'MMH':
            if year is not None:
                return data
            self.hourmask = xr.open_dataset(os.path.join(self.path, 'hour_mask_mmh.nc'))
            return xr.merge([data, self.hourmask])

        if year is None:
            return data

        self.hourmask = xr.open_dataset(os.path.join(self.path, 'hour_mask_day.nc'))
        dates = list(pd.date_range(str(year) + '0101', str(year) + '1231', freq='D'))
        if len(dates) == 366:
            # Leap year: drop index 59 (29 February) before assigning the
            # dates to the mask.
            dates.pop(59)
        self.hourmask.coords['date'] = dates

        if month is not None:
            month_start, month_end = util.month_to_dates(year, month)
            self.hourmask = self.hourmask.sel(date=slice(month_start, month_end))

        return xr.merge([data, self.hourmask])

    ########################################### CREATE TABLES ############################################

    def make_table(self, indata=None, ftrs=None, print_log=True):
        """Flatten a dataset into a table with date helper columns.

        Parameters:
        - indata: dataset to convert; defaults to ``self.data``
        - ftrs: location features to attach; defaults to all features of
          the location mask. Names not present in the mask are ignored.
        - print_log: if True, print the elapsed time

        Returns the table with rows containing missing values dropped and
        the additional columns 'month', 'day' (day of year) and 'timestamp'.

        Note: the location features are assigned onto ``indata`` itself, so
        the passed dataset (or ``self.data``) is modified.
        """
        if indata is None:
            indata = self.data

        # time.clock() was removed in Python 3.8.
        timer = time.perf_counter()

        if ftrs is None:
            ftrs = self._loc_ftrs
        for f in ftrs:
            if f in self._loc_ftrs:
                indata[f] = self.loc_mask[f]

        table = indata.to_dataframe().dropna().reset_index()
        dates = pd.to_datetime(table['date'].values)
        table['month'] = dates.month
        table['day'] = dates.dayofyear
        table['timestamp'] = util.to_timestamp(table['date'].values, table['hour'].values)

        if print_log:
            print("Created table in %.2f seconds" % (time.perf_counter() - timer))
        return table

    ###################################### UTIL ##############################################

    def date_range(self, start_date, end_date):
        """Return the dates between two bounds at the reader's resolution.

        Daily dates for format 'H', month starts for format 'MMH'. For any
        other format nothing is returned (None).
        """
        if self.format == 'H':
            return pd.date_range(start_date, end_date, freq='D')
        elif self.format == 'MMH':
            return pd.date_range(start_date, end_date, freq='MS')
Event Timeline
Log In to Comment