Page MenuHomec4science

meteo_raw.py
No OneTemporary

File Metadata

Created
Mon, Apr 28, 04:50

meteo_raw.py

import numpy as np
from tqdm import tqdm
import time
import pandas as pd
import xarray as xr
import os
import util
class Meteo_Raw():
def __init__(self, data_path, start_time = 3, end_time = 19, variables = None, data_format = 'H', split_raw = True):
'''
Initialisation settings:
- path: root of raw data, with subfolder 'global', 'direct', 'albedo'
- start_time, end_time: define mask for time selection (min 0, max 23)
- vars: list of variables of any combination of ['SIS', 'SISDIR', 'ALB', 'KI', 'SISCF']
- data_format: 'H': raw data (hourly values); 'MMH': monthly-mean-hourly; 'WWH': weekly-mean-hourly
- split_data: Only for 'H' - True for performing split into hour and date
'''
_all_vars = ['SIS', 'SISDIR', 'SISCF', 'SISDIRCF', 'KI', 'ALB']
self.path = data_path
self.offset = abs((start_time + end_time)/2) # for dropping of irrelevant data
self.rnge = abs((end_time - start_time)/2) # for dropping of irrelevant data
if variables == None:
self.variables = _all_vars
else:
self.variables = variables
self.format = data_format
self.split_raw = split_raw
if not split_raw and data_format == 'H':
self.split_dim = 'time'
else: self.split_dim = 'date' # dimension for concatenation of new data
self.loc_mask = xr.open_dataset(os.path.join(data_path,'locations_mask.nc'))
self._loc_ftrs = self.loc_mask.to_dataframe().columns
# INITIALISATIONS
self.init_data = True
self.init_reading = True
self.sample = False
##########################################################################################################
########################################## RAW DATA READING ############################################
##########################################################################################################
def make_monthly(self, start_date, end_date):
month_start = pd.date_range(start_date, end_date, freq = 'MS')
month_end = pd.date_range(start_date, end_date, freq = 'M')
for curr_month, curr_month_end in zip(month_start, month_end):
try:
self.add_data(curr_month, curr_month_end, reset = True)
file = curr_month.strftime("%Y-%m") +'.nc'
self.data.to_netcdf(os.path.join(self.path,file))
except:
print("Could not process month %s" %curr_month.strftime("%Y-%m"))
self.start = None
self.end = None
def add_data(self, start_date, end_date, reset = False, print_log = True):
self.start = start_date
self.end = end_date
if reset: self.init_data = True
init_merge = True
for ds in self.variables:
if ds == 'SIS':
if print_log: print('Adding global ... ')
new_data = self.read_global(print_log)
elif ds == 'SISDIR':
if print_log: print('Adding direct ... ')
new_data = self.read_direct(print_log)
elif ds == 'ALB':
if print_log: print('Adding albedo ... ')
new_data = self.read_albedo(print_log)
elif ds == 'KI':
if print_log: print('Adding clearness index ... ')
new_data = self.read_clearness(print_log)
elif ds == 'SISCF':
if print_log: print('Adding global clear sky ... ')
new_data = self.read_clearsky_global(print_log)
elif ds == 'SISDIRCF':
if print_log: print('Adding direct clear sky ... ')
new_data = self.read_clearsky_direct(print_log)
else:
print("ERROR: VARIABLE NOT FOUND")
return
if init_merge:
new_dataset = new_data
init_merge = False
else:
print('Merging data ... ')
new_dataset = xr.merge([new_dataset, new_data])
if self.init_data:
self.data = new_dataset
self.init_data = False
else:
print('Adding dataset ... ')
self.data = xr.concat([self.data,new_dataset], self.split_dim)
if print_log: print ("Reading completed")
######################################### DATA SELECTION ###############################################
def choose_format(self, filename, data_path, print_log = True):
if self.format == 'H':
if print_log: print('Adding raw data ... ')
data_out = self.read_data(self.start, self.end, filename, data_path, split = self.split_raw)
elif self.format == 'MMH':
if print_log: print('Adding monthly mean hourly data ... ')
data_out = self.read_monthly_mean_hourly(filename, data_path)
elif self.format == 'WMH':
if print_log: print('Adding weekly mean hourly data ... ')
data_out = self.read_weekly_mean_hourly(filename, data_path)
else:
print("ERROR: DATA FORMAT NOT FOUND")
return
return data_out
################################# INITIALIZE DATA TYPE READERS #########################################
def read_global(self, print_log = True):
data_path = os.path.join(self.path,'global')
filename = lambda date: 'msg.SIS.H_ch02.lonlat_' + date + '000000.nc'
# create data set:
if print_log: print ("Reading global radiation data from %s to %s ..."
%(pd.to_datetime(self.start).strftime("%Y-%m-%d"), pd.to_datetime(self.end).strftime("%Y-%m-%d")))
timer = time.clock()
global_data = self.choose_format(filename, data_path, print_log)
timer = time.clock() - timer
if print_log: print ("Finished reading in %.2f seconds\n" %(timer))
return global_data
def read_direct(self, print_log = True):
data_path = os.path.join(self.path,'direct')
filename = lambda date: 'msg.SISDIR.H_ch02.lonlat_' + date + '000000.nc'
# create data set:
if print_log: print ("Reading direct radiation data from %s to %s ...."
%(pd.to_datetime(self.start).strftime("%Y-%m-%d"), pd.to_datetime(self.start).strftime("%Y-%m-%d")))
timer = time.clock()
direct_data = self.choose_format(filename, data_path, print_log)
timer = time.clock() - timer
if print_log: print ("Finished reading in %.2f seconds\n" %(timer))
return direct_data
def read_albedo(self, print_log = True):
data_path = os.path.join(self.path,'albedo')
filename = lambda date: 'msg.ALB.H_ch02.lonlat_' + date + '000000.nc'
# create data set:
if print_log: print ("Reading surface albedo data from %s to %s ...."
%(pd.to_datetime(self.start).strftime("%Y-%m-%d"), pd.to_datetime(self.end).strftime("%Y-%m-%d")))
timer = time.clock()
albedo_data = self.choose_format(filename, data_path, print_log)
timer = time.clock() - timer
if print_log: print ("Finished reading in %.2f seconds\n" %(timer))
return albedo_data
def read_clearness(self, print_log = True):
data_path = os.path.join(self.path,'clearness')
filename = lambda date: 'msg.KI.H_ch02.lonlat_' + date + '000000.nc'
# create data set:
if print_log: print ("Reading clearness index data from %s to %s ...."
%(pd.to_datetime(self.start).strftime("%Y-%m-%d"), pd.to_datetime(self.end).strftime("%Y-%m-%d")))
timer = time.clock()
clearness_data = self.choose_format(filename, data_path, print_log)
timer = time.clock() - timer
if print_log: print ("Finished reading in %.2f seconds\n" %(timer))
return clearness_data
def read_clearsky_global(self, print_log = True):
data_path = os.path.join(self.path,'clearsky-global')
filename = lambda date: 'msg.SISCF.H_ch02.lonlat_' + date + '000000.nc'
# create data set:
if print_log: print ("Reading global clear sky radiation data from %s to %s ...."
%(pd.to_datetime(self.start).strftime("%Y-%m-%d"), pd.to_datetime(self.end).strftime("%Y-%m-%d")))
timer = time.clock()
clearness_data = self.choose_format(filename, data_path, print_log)
timer = time.clock() - timer
if print_log: print ("Finished reading in %.2f seconds\n" %(timer))
return clearness_data
def read_clearsky_direct(self, print_log = True):
data_path = os.path.join(self.path,'clearsky-direct')
filename = lambda date: 'msg.SISDIRCF.H_ch02.lonlat_' + date + '000000.nc'
# create data set:
if print_log: print ("Reading direct clear sky radiation data from %s to %s ...."
%(pd.to_datetime(self.start).strftime("%Y-%m-%d"), pd.to_datetime(self.end).strftime("%Y-%m-%d")))
timer = time.clock()
clearness_data = self.choose_format(filename, data_path, print_log)
timer = time.clock() - timer
if print_log: print ("Finished reading in %.2f seconds\n" %(timer))
return clearness_data
########################################## MONTHLY MEAN HOURLY ##########################################
def read_monthly_mean_hourly(self, filename, data_path, print_log = False):
month_start = pd.date_range(self.start, self.end, freq = 'MS') # create list of first day in month
month_end = pd.date_range(self.start, self.end, freq = 'M') # create list in last day of month
self.start = month_start[0] # homogenise start and end
self.end = month_end[-1]
if print_log: print ("Reading all data from %s to %s ...."
%(pd.to_datetime(self.start).strftime("%Y-%m-%d"), pd.to_datetime(self.end).strftime("%Y-%m-%d")))
timer = time.clock()
init = True
for (curr_month, curr_month_end) in zip(month_start, month_end):
new_data = self.read_data(curr_month, curr_month_end, filename, data_path, split = False)
hourly_mean = new_data.groupby('time.hour').mean(dim = 'time') # create monthly mean hourly for the current month
hourly_mean.coords['date'] = curr_month # label current data - get only month number with: curr_month.month
if init: # for first iteration
data_out = hourly_mean
init = False
else:
data_out = xr.concat([data_out, hourly_mean], 'date') # add new slice
timer = time.clock() - timer
if print_log: print ("Finished reading in %.2f seconds" %(timer))
return data_out
########################################## WEEKLY MEAN HOURLY #######################################
def read_weekly_mean_hourly(self, filename, data_path, print_log = False):
week_start = pd.date_range(self.start, self.end, freq = 'W-MON') # create list of first day in month
week_end = pd.date_range(week_start[0], self.end, freq = 'W-SUN') # create list in last day of month
self.start = week_start[0] # homogenise start and end
self.end = week_end[-1]
if print_log: print ("Reading all data from %s to %s ...."
%(pd.to_datetime(self.start).strftime("%Y-%m-%d"), pd.to_datetime(self.end).strftime("%Y-%m-%d")))
timer = time.clock()
init = True
for (curr_week, curr_week_end) in zip(week_start, week_end):
new_data = self.read_data(curr_week, curr_week_end, filename, data_path, split = False)
hourly_mean = new_data.groupby('time.hour').mean(dim = 'time') # create monthly mean hourly for the current month
hourly_mean.coords['date'] = curr_week # label current data
if init: # for first iteration
data_out = hourly_mean
init = False
else:
data_out = xr.concat([data_out, hourly_mean], 'date') # add new slice
timer = time.clock() - timer
if print_log: print ("Finished reading in %.2f seconds" %(timer))
return data_out
##################################### READING RAW DATA ###############################################
def read_data(self, start_date, end_date, filename, data_path, split):
if split: split_dim = 'date'
else: split_dim = 'time'
dates = pd.date_range(start_date, end_date)
init = True
for date in tqdm(dates):
file = filename(date.strftime("%Y%m%d"))
try:
new_data = xr.open_dataset(os.path.join(data_path,file))
if init: # for first iteration
self.loc_mask = self.loc_mask.reindex_like(new_data,'nearest')
new_data = new_data.where(abs(new_data['time.hour'] - self.offset) <= self.rnge, drop = True) # only keep relevant hours
new_data = new_data.where(self.loc_mask['CH'] == 1, drop=True).drop('geographic_projection') # only keep relevant locations & data
if self.sample: new_data = new_data.where(self.loc_mask['sample'] == 1) # remove all samples that are not of interest, but do not reduce overall size
if split:
new_data = new_data.groupby('time.hour').mean(dim = 'time') # create monthly mean hourly for the current month
new_data.coords['date'] = date
if init: # for first iteration
data_out = new_data # drop geographic projection (cut out unnecessary data)
init = False
else:
data_out = xr.concat([data_out,new_data], split_dim)
except:
print("Unexpected error:", sys.exc_info()[0])
print("Check file in %s" %(os.path.join(data_path,file)))
return data_out
####################################### SPLITTING DATASET #######################################
def split_dataset(self, dates = [], print_log = True):
try:
if len(dates) == 0: # if nothing was passed to the funcion
dates = self.date_range(self.start, self.end)
date_range = pd.to_datetime(dates).strftime("%Y%m%d")
if print_log: print ("Splitting all data from %s to %s ...\nThis is faster if split_raw is selected in initialisation of reader!"
%(pd.to_datetime(dates[0]).strftime("%Y-%m-%d"), pd.to_datetime(dates[-1]).strftime("%Y-%m-%d")))
timer = time.clock()
init = True
for day in date_range:
day_data = self.data.sel(time = slice(day))
data = day_data.groupby('time.hour').mean(dim = 'time')
data.coords['date'] = pd.to_datetime(day)
if init:
splitted_data = data
init = False
else:
splitted_data = xr.concat([splitted_data, data], 'date') # add new slice
timer = time.clock() - timer
if print_log: print ("Finished splitting data in %.2f seconds" %(timer))
except:
print("Error: Verify that date has right format; set ['datestring']")
return
return splitted_data
###################################### UTIL ################################################
def date_range(self, start_date, end_date):
if self.format == 'H':
return pd.date_range(start_date, end_date, freq = 'D')
elif self.format == 'MMH':
return pd.date_range(start_date, end_date, freq = 'MS')
elif self.format == 'WMH':
return pd.date_range(start_date, end_date, freq = 'W-MON')

Event Timeline