Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F110804487
meteo_raw.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Mon, Apr 28, 04:50
Size
13 KB
Mime Type
text/x-python
Expires
Wed, Apr 30, 04:50 (1 d, 23 h)
Engine
blob
Format
Raw Data
Handle
25831867
Attached To
R8800 solar_potential
meteo_raw.py
View Options
import
numpy
as
np
from
tqdm
import
tqdm
import
time
import
pandas
as
pd
import
xarray
as
xr
import
os
import
util
class
Meteo_Raw
():
def
__init__
(
self
,
data_path
,
start_time
=
3
,
end_time
=
19
,
variables
=
None
,
data_format
=
'H'
,
split_raw
=
True
):
'''
Initialisation settings:
- path: root of raw data, with subfolder 'global', 'direct', 'albedo'
- start_time, end_time: define mask for time selection (min 0, max 23)
- vars: list of variables of any combination of ['SIS', 'SISDIR', 'ALB', 'KI', 'SISCF']
- data_format: 'H': raw data (hourly values); 'MMH': monthly-mean-hourly; 'WWH': weekly-mean-hourly
- split_data: Only for 'H' - True for performing split into hour and date
'''
_all_vars
=
[
'SIS'
,
'SISDIR'
,
'SISCF'
,
'SISDIRCF'
,
'KI'
,
'ALB'
]
self
.
path
=
data_path
self
.
offset
=
abs
((
start_time
+
end_time
)
/
2
)
# for dropping of irrelevant data
self
.
rnge
=
abs
((
end_time
-
start_time
)
/
2
)
# for dropping of irrelevant data
if
variables
==
None
:
self
.
variables
=
_all_vars
else
:
self
.
variables
=
variables
self
.
format
=
data_format
self
.
split_raw
=
split_raw
if
not
split_raw
and
data_format
==
'H'
:
self
.
split_dim
=
'time'
else
:
self
.
split_dim
=
'date'
# dimension for concatenation of new data
self
.
loc_mask
=
xr
.
open_dataset
(
os
.
path
.
join
(
data_path
,
'locations_mask.nc'
))
self
.
_loc_ftrs
=
self
.
loc_mask
.
to_dataframe
()
.
columns
# INITIALISATIONS
self
.
init_data
=
True
self
.
init_reading
=
True
self
.
sample
=
False
##########################################################################################################
########################################## RAW DATA READING ############################################
##########################################################################################################
def
make_monthly
(
self
,
start_date
,
end_date
):
month_start
=
pd
.
date_range
(
start_date
,
end_date
,
freq
=
'MS'
)
month_end
=
pd
.
date_range
(
start_date
,
end_date
,
freq
=
'M'
)
for
curr_month
,
curr_month_end
in
zip
(
month_start
,
month_end
):
try
:
self
.
add_data
(
curr_month
,
curr_month_end
,
reset
=
True
)
file
=
curr_month
.
strftime
(
"%Y-%m"
)
+
'.nc'
self
.
data
.
to_netcdf
(
os
.
path
.
join
(
self
.
path
,
file
))
except
:
print
(
"Could not process month
%s
"
%
curr_month
.
strftime
(
"%Y-%m"
))
self
.
start
=
None
self
.
end
=
None
def
add_data
(
self
,
start_date
,
end_date
,
reset
=
False
,
print_log
=
True
):
self
.
start
=
start_date
self
.
end
=
end_date
if
reset
:
self
.
init_data
=
True
init_merge
=
True
for
ds
in
self
.
variables
:
if
ds
==
'SIS'
:
if
print_log
:
print
(
'Adding global ... '
)
new_data
=
self
.
read_global
(
print_log
)
elif
ds
==
'SISDIR'
:
if
print_log
:
print
(
'Adding direct ... '
)
new_data
=
self
.
read_direct
(
print_log
)
elif
ds
==
'ALB'
:
if
print_log
:
print
(
'Adding albedo ... '
)
new_data
=
self
.
read_albedo
(
print_log
)
elif
ds
==
'KI'
:
if
print_log
:
print
(
'Adding clearness index ... '
)
new_data
=
self
.
read_clearness
(
print_log
)
elif
ds
==
'SISCF'
:
if
print_log
:
print
(
'Adding global clear sky ... '
)
new_data
=
self
.
read_clearsky_global
(
print_log
)
elif
ds
==
'SISDIRCF'
:
if
print_log
:
print
(
'Adding direct clear sky ... '
)
new_data
=
self
.
read_clearsky_direct
(
print_log
)
else
:
print
(
"ERROR: VARIABLE NOT FOUND"
)
return
if
init_merge
:
new_dataset
=
new_data
init_merge
=
False
else
:
print
(
'Merging data ... '
)
new_dataset
=
xr
.
merge
([
new_dataset
,
new_data
])
if
self
.
init_data
:
self
.
data
=
new_dataset
self
.
init_data
=
False
else
:
print
(
'Adding dataset ... '
)
self
.
data
=
xr
.
concat
([
self
.
data
,
new_dataset
],
self
.
split_dim
)
if
print_log
:
print
(
"Reading completed"
)
######################################### DATA SELECTION ###############################################
def
choose_format
(
self
,
filename
,
data_path
,
print_log
=
True
):
if
self
.
format
==
'H'
:
if
print_log
:
print
(
'Adding raw data ... '
)
data_out
=
self
.
read_data
(
self
.
start
,
self
.
end
,
filename
,
data_path
,
split
=
self
.
split_raw
)
elif
self
.
format
==
'MMH'
:
if
print_log
:
print
(
'Adding monthly mean hourly data ... '
)
data_out
=
self
.
read_monthly_mean_hourly
(
filename
,
data_path
)
elif
self
.
format
==
'WMH'
:
if
print_log
:
print
(
'Adding weekly mean hourly data ... '
)
data_out
=
self
.
read_weekly_mean_hourly
(
filename
,
data_path
)
else
:
print
(
"ERROR: DATA FORMAT NOT FOUND"
)
return
return
data_out
################################# INITIALIZE DATA TYPE READERS #########################################
def
read_global
(
self
,
print_log
=
True
):
data_path
=
os
.
path
.
join
(
self
.
path
,
'global'
)
filename
=
lambda
date
:
'msg.SIS.H_ch02.lonlat_'
+
date
+
'000000.nc'
# create data set:
if
print_log
:
print
(
"Reading global radiation data from
%s
to
%s
..."
%
(
pd
.
to_datetime
(
self
.
start
)
.
strftime
(
"%Y-%m-
%d
"
),
pd
.
to_datetime
(
self
.
end
)
.
strftime
(
"%Y-%m-
%d
"
)))
timer
=
time
.
clock
()
global_data
=
self
.
choose_format
(
filename
,
data_path
,
print_log
)
timer
=
time
.
clock
()
-
timer
if
print_log
:
print
(
"Finished reading in
%.2f
seconds
\n
"
%
(
timer
))
return
global_data
def
read_direct
(
self
,
print_log
=
True
):
data_path
=
os
.
path
.
join
(
self
.
path
,
'direct'
)
filename
=
lambda
date
:
'msg.SISDIR.H_ch02.lonlat_'
+
date
+
'000000.nc'
# create data set:
if
print_log
:
print
(
"Reading direct radiation data from
%s
to
%s
...."
%
(
pd
.
to_datetime
(
self
.
start
)
.
strftime
(
"%Y-%m-
%d
"
),
pd
.
to_datetime
(
self
.
start
)
.
strftime
(
"%Y-%m-
%d
"
)))
timer
=
time
.
clock
()
direct_data
=
self
.
choose_format
(
filename
,
data_path
,
print_log
)
timer
=
time
.
clock
()
-
timer
if
print_log
:
print
(
"Finished reading in
%.2f
seconds
\n
"
%
(
timer
))
return
direct_data
def
read_albedo
(
self
,
print_log
=
True
):
data_path
=
os
.
path
.
join
(
self
.
path
,
'albedo'
)
filename
=
lambda
date
:
'msg.ALB.H_ch02.lonlat_'
+
date
+
'000000.nc'
# create data set:
if
print_log
:
print
(
"Reading surface albedo data from
%s
to
%s
...."
%
(
pd
.
to_datetime
(
self
.
start
)
.
strftime
(
"%Y-%m-
%d
"
),
pd
.
to_datetime
(
self
.
end
)
.
strftime
(
"%Y-%m-
%d
"
)))
timer
=
time
.
clock
()
albedo_data
=
self
.
choose_format
(
filename
,
data_path
,
print_log
)
timer
=
time
.
clock
()
-
timer
if
print_log
:
print
(
"Finished reading in
%.2f
seconds
\n
"
%
(
timer
))
return
albedo_data
def
read_clearness
(
self
,
print_log
=
True
):
data_path
=
os
.
path
.
join
(
self
.
path
,
'clearness'
)
filename
=
lambda
date
:
'msg.KI.H_ch02.lonlat_'
+
date
+
'000000.nc'
# create data set:
if
print_log
:
print
(
"Reading clearness index data from
%s
to
%s
...."
%
(
pd
.
to_datetime
(
self
.
start
)
.
strftime
(
"%Y-%m-
%d
"
),
pd
.
to_datetime
(
self
.
end
)
.
strftime
(
"%Y-%m-
%d
"
)))
timer
=
time
.
clock
()
clearness_data
=
self
.
choose_format
(
filename
,
data_path
,
print_log
)
timer
=
time
.
clock
()
-
timer
if
print_log
:
print
(
"Finished reading in
%.2f
seconds
\n
"
%
(
timer
))
return
clearness_data
def
read_clearsky_global
(
self
,
print_log
=
True
):
data_path
=
os
.
path
.
join
(
self
.
path
,
'clearsky-global'
)
filename
=
lambda
date
:
'msg.SISCF.H_ch02.lonlat_'
+
date
+
'000000.nc'
# create data set:
if
print_log
:
print
(
"Reading global clear sky radiation data from
%s
to
%s
...."
%
(
pd
.
to_datetime
(
self
.
start
)
.
strftime
(
"%Y-%m-
%d
"
),
pd
.
to_datetime
(
self
.
end
)
.
strftime
(
"%Y-%m-
%d
"
)))
timer
=
time
.
clock
()
clearness_data
=
self
.
choose_format
(
filename
,
data_path
,
print_log
)
timer
=
time
.
clock
()
-
timer
if
print_log
:
print
(
"Finished reading in
%.2f
seconds
\n
"
%
(
timer
))
return
clearness_data
def
read_clearsky_direct
(
self
,
print_log
=
True
):
data_path
=
os
.
path
.
join
(
self
.
path
,
'clearsky-direct'
)
filename
=
lambda
date
:
'msg.SISDIRCF.H_ch02.lonlat_'
+
date
+
'000000.nc'
# create data set:
if
print_log
:
print
(
"Reading direct clear sky radiation data from
%s
to
%s
...."
%
(
pd
.
to_datetime
(
self
.
start
)
.
strftime
(
"%Y-%m-
%d
"
),
pd
.
to_datetime
(
self
.
end
)
.
strftime
(
"%Y-%m-
%d
"
)))
timer
=
time
.
clock
()
clearness_data
=
self
.
choose_format
(
filename
,
data_path
,
print_log
)
timer
=
time
.
clock
()
-
timer
if
print_log
:
print
(
"Finished reading in
%.2f
seconds
\n
"
%
(
timer
))
return
clearness_data
########################################## MONTHLY MEAN HOURLY ##########################################
def
read_monthly_mean_hourly
(
self
,
filename
,
data_path
,
print_log
=
False
):
month_start
=
pd
.
date_range
(
self
.
start
,
self
.
end
,
freq
=
'MS'
)
# create list of first day in month
month_end
=
pd
.
date_range
(
self
.
start
,
self
.
end
,
freq
=
'M'
)
# create list in last day of month
self
.
start
=
month_start
[
0
]
# homogenise start and end
self
.
end
=
month_end
[
-
1
]
if
print_log
:
print
(
"Reading all data from
%s
to
%s
...."
%
(
pd
.
to_datetime
(
self
.
start
)
.
strftime
(
"%Y-%m-
%d
"
),
pd
.
to_datetime
(
self
.
end
)
.
strftime
(
"%Y-%m-
%d
"
)))
timer
=
time
.
clock
()
init
=
True
for
(
curr_month
,
curr_month_end
)
in
zip
(
month_start
,
month_end
):
new_data
=
self
.
read_data
(
curr_month
,
curr_month_end
,
filename
,
data_path
,
split
=
False
)
hourly_mean
=
new_data
.
groupby
(
'time.hour'
)
.
mean
(
dim
=
'time'
)
# create monthly mean hourly for the current month
hourly_mean
.
coords
[
'date'
]
=
curr_month
# label current data - get only month number with: curr_month.month
if
init
:
# for first iteration
data_out
=
hourly_mean
init
=
False
else
:
data_out
=
xr
.
concat
([
data_out
,
hourly_mean
],
'date'
)
# add new slice
timer
=
time
.
clock
()
-
timer
if
print_log
:
print
(
"Finished reading in
%.2f
seconds"
%
(
timer
))
return
data_out
########################################## WEEKLY MEAN HOURLY #######################################
def
read_weekly_mean_hourly
(
self
,
filename
,
data_path
,
print_log
=
False
):
week_start
=
pd
.
date_range
(
self
.
start
,
self
.
end
,
freq
=
'W-MON'
)
# create list of first day in month
week_end
=
pd
.
date_range
(
week_start
[
0
],
self
.
end
,
freq
=
'W-SUN'
)
# create list in last day of month
self
.
start
=
week_start
[
0
]
# homogenise start and end
self
.
end
=
week_end
[
-
1
]
if
print_log
:
print
(
"Reading all data from
%s
to
%s
...."
%
(
pd
.
to_datetime
(
self
.
start
)
.
strftime
(
"%Y-%m-
%d
"
),
pd
.
to_datetime
(
self
.
end
)
.
strftime
(
"%Y-%m-
%d
"
)))
timer
=
time
.
clock
()
init
=
True
for
(
curr_week
,
curr_week_end
)
in
zip
(
week_start
,
week_end
):
new_data
=
self
.
read_data
(
curr_week
,
curr_week_end
,
filename
,
data_path
,
split
=
False
)
hourly_mean
=
new_data
.
groupby
(
'time.hour'
)
.
mean
(
dim
=
'time'
)
# create monthly mean hourly for the current month
hourly_mean
.
coords
[
'date'
]
=
curr_week
# label current data
if
init
:
# for first iteration
data_out
=
hourly_mean
init
=
False
else
:
data_out
=
xr
.
concat
([
data_out
,
hourly_mean
],
'date'
)
# add new slice
timer
=
time
.
clock
()
-
timer
if
print_log
:
print
(
"Finished reading in
%.2f
seconds"
%
(
timer
))
return
data_out
##################################### READING RAW DATA ###############################################
def
read_data
(
self
,
start_date
,
end_date
,
filename
,
data_path
,
split
):
if
split
:
split_dim
=
'date'
else
:
split_dim
=
'time'
dates
=
pd
.
date_range
(
start_date
,
end_date
)
init
=
True
for
date
in
tqdm
(
dates
):
file
=
filename
(
date
.
strftime
(
"%Y%m
%d
"
))
try
:
new_data
=
xr
.
open_dataset
(
os
.
path
.
join
(
data_path
,
file
))
if
init
:
# for first iteration
self
.
loc_mask
=
self
.
loc_mask
.
reindex_like
(
new_data
,
'nearest'
)
new_data
=
new_data
.
where
(
abs
(
new_data
[
'time.hour'
]
-
self
.
offset
)
<=
self
.
rnge
,
drop
=
True
)
# only keep relevant hours
new_data
=
new_data
.
where
(
self
.
loc_mask
[
'CH'
]
==
1
,
drop
=
True
)
.
drop
(
'geographic_projection'
)
# only keep relevant locations & data
if
self
.
sample
:
new_data
=
new_data
.
where
(
self
.
loc_mask
[
'sample'
]
==
1
)
# remove all samples that are not of interest, but do not reduce overall size
if
split
:
new_data
=
new_data
.
groupby
(
'time.hour'
)
.
mean
(
dim
=
'time'
)
# create monthly mean hourly for the current month
new_data
.
coords
[
'date'
]
=
date
if
init
:
# for first iteration
data_out
=
new_data
# drop geographic projection (cut out unnecessary data)
init
=
False
else
:
data_out
=
xr
.
concat
([
data_out
,
new_data
],
split_dim
)
except
:
print
(
"Unexpected error:"
,
sys
.
exc_info
()[
0
])
print
(
"Check file in
%s
"
%
(
os
.
path
.
join
(
data_path
,
file
)))
return
data_out
####################################### SPLITTING DATASET #######################################
def
split_dataset
(
self
,
dates
=
[],
print_log
=
True
):
try
:
if
len
(
dates
)
==
0
:
# if nothing was passed to the funcion
dates
=
self
.
date_range
(
self
.
start
,
self
.
end
)
date_range
=
pd
.
to_datetime
(
dates
)
.
strftime
(
"%Y%m
%d
"
)
if
print_log
:
print
(
"Splitting all data from
%s
to
%s
...
\n
This is faster if split_raw is selected in initialisation of reader!"
%
(
pd
.
to_datetime
(
dates
[
0
])
.
strftime
(
"%Y-%m-
%d
"
),
pd
.
to_datetime
(
dates
[
-
1
])
.
strftime
(
"%Y-%m-
%d
"
)))
timer
=
time
.
clock
()
init
=
True
for
day
in
date_range
:
day_data
=
self
.
data
.
sel
(
time
=
slice
(
day
))
data
=
day_data
.
groupby
(
'time.hour'
)
.
mean
(
dim
=
'time'
)
data
.
coords
[
'date'
]
=
pd
.
to_datetime
(
day
)
if
init
:
splitted_data
=
data
init
=
False
else
:
splitted_data
=
xr
.
concat
([
splitted_data
,
data
],
'date'
)
# add new slice
timer
=
time
.
clock
()
-
timer
if
print_log
:
print
(
"Finished splitting data in
%.2f
seconds"
%
(
timer
))
except
:
print
(
"Error: Verify that date has right format; set ['datestring']"
)
return
return
splitted_data
###################################### UTIL ################################################
def
date_range
(
self
,
start_date
,
end_date
):
if
self
.
format
==
'H'
:
return
pd
.
date_range
(
start_date
,
end_date
,
freq
=
'D'
)
elif
self
.
format
==
'MMH'
:
return
pd
.
date_range
(
start_date
,
end_date
,
freq
=
'MS'
)
elif
self
.
format
==
'WMH'
:
return
pd
.
date_range
(
start_date
,
end_date
,
freq
=
'W-MON'
)
Event Timeline
Log In to Comment