Page MenuHomec4science

Radiation_Reader.py
No OneTemporary

File Metadata

Created
Thu, May 1, 04:53

Radiation_Reader.py

import numpy as np
import scipy as sc
from tqdm import tqdm
import time
import pandas as pd
import xarray as xr
import os
import sys
import util
class Radiation_Reader:
# sunrise = [6, 6, 5, 4, 3, 3, 3, 4, 4, 5, 6, 6] # actual sunrise hours (in GMT)
# sunset = [16, 17, 17, 18, 19, 19, 19, 18, 18, 17, 16, 15] # actual sunset hours (in GMT)
def __init__(self, data_path, start_time=3, end_time=19, variables=None, data_format='H', split_raw=True):
    '''
    Initialisation settings:
    - data_path: root of raw data, with subfolders 'global', 'direct', 'albedo', ...
    - start_time, end_time: define mask for time selection (min 0, max 23)
    - variables: list of any combination of ['SIS', 'SISDIR', 'ALB', 'KI', 'SISCF', 'SISDIRCF']
      (defaults to ['SIS', 'SISDIR'])
    - data_format: 'H': raw data (hourly values); 'MMH': monthly-mean-hourly; 'WMH': weekly-mean-hourly
    - split_raw: Only for 'H' - True for performing split into hour and date
    '''
    # Avoid the shared mutable-default-argument pitfall: the old default
    # list object was shared between every instance of the class.
    if variables is None:
        variables = ['SIS', 'SISDIR']
    self.path = data_path
    # Centre and half-width of the hour-of-day window; used later to drop
    # hours outside [start_time, end_time] (see read_data).
    self.offset = abs((start_time + end_time)/2)  # for dropping of irrelevant data
    self.rnge = abs((end_time - start_time)/2)  # for dropping of irrelevant data
    self.variables = variables
    self.format = data_format
    self.split_raw = split_raw
    if not split_raw and data_format == 'H':
        self.split_dim = 'time'
    else:
        self.split_dim = 'date'  # dimension for concatenation of new data
    self.loc_mask = xr.open_dataset(os.path.join(data_path, 'locations_mask.nc'))
    # INITIALISATIONS
    self.init_data = True      # first call to add_data still pending
    self.init_reading = True   # first call to read_data still pending
    self.sample = False        # whether a location sub-sample mask is active
def date_range(self, start_date, end_date):
if self.format == 'H':
return pd.date_range(start_date, end_date, freq = 'D')
elif self.format == 'MMH':
return pd.date_range(start_date, end_date, freq = 'MS')
elif self.format == 'WMH':
return pd.date_range(start_date, end_date, freq = 'W-MON')
########################################### READ DATA #############################################
def read_data(self, start_date, end_date, reset=False, print_log=True):
    """Read (creating first, if missing) the monthly netCDF files covering
    [start_date, end_date] and store the concatenated result in self.data.

    NOTE(review): this method is shadowed by the second `read_data`
    definition further down in the class (same name, different signature),
    so as written it is unreachable - confirm which of the two is intended
    and rename one of them.
    """
    if reset:
        self.init_reading = True
    # VERY CRUDE JUST TO MAKE OLD AND NEW WORK TOGETHER - SHOULD BE RE-WRITTEN
    elif not self.init_reading:  # fixed NameError: was bare `init_reading`
        old_data = self.data
    month_start = pd.date_range(start_date, end_date, freq='MS')
    month_end = pd.date_range(start_date, end_date, freq='M')
    init = True
    for curr_month, curr_month_end in zip(month_start, month_end):
        monthfile = curr_month.strftime('%Y-%m') + '.nc'
        print(monthfile)
        if not os.path.exists(os.path.join(self.path, monthfile)):
            print("file %s does not exist - creating file from raw data" %monthfile)
            self.make_monthly(curr_month, curr_month_end)
        try:
            new_data = xr.open_dataset(os.path.join(self.path, monthfile))
            if init:
                dataset = new_data
                init = False
            else:
                dataset = xr.concat([dataset, new_data], self.split_dim)
        except Exception:  # narrowed from a bare except
            print("failed to read or concatenate data in %s" %monthfile)
    if self.init_reading:
        self.data = dataset
        self.init_reading = False
    else:
        self.data = xr.merge([old_data, dataset])
    self.start = start_date
    self.end = month_end[-1].strftime("%Y%m%d")
########################################## MONTHLY MEAN HOURLY ##########################################
def read_monthly_mean_hourly(self, filename, data_path, print_log=False):
    """Read raw hourly files between self.start and self.end and reduce them
    to one mean diurnal cycle per calendar month (new dimension 'date',
    labelled with the first day of each month)."""
    month_start = pd.date_range(self.start, self.end, freq='MS')  # first day of each month
    month_end = pd.date_range(self.start, self.end, freq='M')     # last day of each month
    self.start = month_start[0]  # homogenise start and end
    self.end = month_end[-1]
    if print_log: print ("Reading all data from %s to %s ...."
          %(pd.to_datetime(self.start).strftime("%Y-%m-%d"), pd.to_datetime(self.end).strftime("%Y-%m-%d")))
    timer = time.perf_counter()  # time.clock() was removed in Python 3.8
    init = True
    for (curr_month, curr_month_end) in zip(month_start, month_end):
        new_data = self.read_data(curr_month, curr_month_end, filename, data_path, split=False)
        hourly_mean = new_data.groupby('time.hour').mean(dim='time')  # mean diurnal cycle for the month
        hourly_mean.coords['date'] = curr_month  # label slice with its month start
        if init:  # for first iteration
            data_out = hourly_mean
            init = False
        else:
            data_out = xr.concat([data_out, hourly_mean], 'date')  # add new slice
    timer = time.perf_counter() - timer
    if print_log: print ("Finished reading in %.2f seconds" %(timer))
    return data_out
########################################## WEEKLY MEAN HOURLY #######################################
def read_weekly_mean_hourly(self, filename, data_path, print_log=False):
    """Read raw hourly files between self.start and self.end and reduce them
    to one mean diurnal cycle per week (Mon-Sun, new dimension 'date',
    labelled with the Monday of each week)."""
    week_start = pd.date_range(self.start, self.end, freq='W-MON')  # Mondays in the period
    week_end = pd.date_range(week_start[0], self.end, freq='W-SUN')  # matching Sundays
    self.start = week_start[0]  # homogenise start and end
    self.end = week_end[-1]
    if print_log: print ("Reading all data from %s to %s ...."
          %(pd.to_datetime(self.start).strftime("%Y-%m-%d"), pd.to_datetime(self.end).strftime("%Y-%m-%d")))
    timer = time.perf_counter()  # time.clock() was removed in Python 3.8
    init = True
    for (curr_week, curr_week_end) in zip(week_start, week_end):
        new_data = self.read_data(curr_week, curr_week_end, filename, data_path, split=False)
        hourly_mean = new_data.groupby('time.hour').mean(dim='time')  # mean diurnal cycle for the week
        hourly_mean.coords['date'] = curr_week  # label slice with its week start
        if init:  # for first iteration
            data_out = hourly_mean
            init = False
        else:
            data_out = xr.concat([data_out, hourly_mean], 'date')  # add new slice
    timer = time.perf_counter() - timer
    if print_log: print ("Finished reading in %.2f seconds" %(timer))
    return data_out
########################################### SUBSETS ###################################################
def read_sample(self, matrix_in=None, filename=None, sample_raw=True):
    """Register a sampling mask (given as an array or loaded from a text
    file) under self.loc_mask['sample'] and return the indices, within the
    Swiss domain, of the sampled locations."""
    # A file, when given, takes precedence over the in-memory matrix.
    if filename is not None:
        matrix_in = np.loadtxt(filename)
    self.loc_mask['sample'] = xr.DataArray(matrix_in, coords=self.loc_mask.coords)
    if sample_raw:
        self.sample = True  # raw reading will honour the sample mask
    inside_ch = self.loc_mask.sample.where(self.loc_mask['CH'] == 1, drop=True)
    return np.argwhere(inside_ch.values == 1)
def create_regular_sample(self, downsampling_ratio, filename=None, sample_raw=True):
    """Build a regular (every k-th cell in both directions) sampling mask
    restricted to Switzerland, store it as self.loc_mask['sample']
    (optionally also saving it to a text file) and return the indices of
    the sampled locations within the Swiss domain."""
    n_lat = len(self.loc_mask.lat.values)
    n_lon = len(self.loc_mask.lon.values)
    # Grid step so that roughly one in `downsampling_ratio` cells is kept.
    step = int(np.floor(np.sqrt(downsampling_ratio)))
    # Matrix of the same size as loc_mask, 1 on the regular grid, 0 elsewhere.
    regular = np.zeros((n_lat, n_lon))
    regular[::step, ::step] = 1
    regular = regular * self.loc_mask.CH.values.T  # restrict to Switzerland
    self.loc_mask['sample'] = xr.DataArray(regular, coords=self.loc_mask.coords)
    if sample_raw:
        self.sample = True  # raw reading will honour the sample mask
    if filename is not None:
        np.savetxt(filename, regular)
    inside_ch = self.loc_mask.sample.where(self.loc_mask['CH'] == 1, drop=True)
    return np.argwhere(inside_ch.values == 1)
def create_random_sample(self, n_sample, filename=None, sample_raw=True):
    """Draw n_sample random locations inside Switzerland, store the
    resulting 0/1 mask as self.loc_mask['sample'] (optionally also saving
    it to a text file) and return (indices of the sampled locations within
    the Swiss domain, the 2-D mask matrix)."""
    n_lat = len(self.loc_mask.lat.values)
    n_lon = len(self.loc_mask.lon.values)
    # Flat row indices of all grid cells lying inside Switzerland.
    ch_only = self.loc_mask.where(self.loc_mask['CH'] == 1)
    ch_rows = ch_only.to_dataframe().reset_index().dropna().index
    # Pick n_sample of them at random.
    chosen = np.random.permutation(ch_rows)[:n_sample]
    # Matrix of the same size as loc_mask, 1 at the chosen cells.
    flat_mask = np.zeros((n_lon*n_lat, 1))
    flat_mask[chosen] = 1
    sample_matrix = flat_mask.reshape((n_lat, n_lon))
    self.loc_mask['sample'] = xr.DataArray(sample_matrix, coords=self.loc_mask.coords)
    if sample_raw:
        self.sample = True  # raw reading will honour the sample mask
    if filename is not None:
        np.savetxt(filename, sample_matrix)
    inside_ch = self.loc_mask.sample.where(self.loc_mask['CH'] == 1, drop=True)
    return np.argwhere(inside_ch.values == 1), sample_matrix
def get_subset(self, lon_idx, lat_idx, print_log=True):
    """Extract the time series at each (lon, lat) index pair, convert each
    to a table and concatenate them. Also returns a location mask in which
    every selected cell is flagged with the value 2."""
    loc_mask_local = self.loc_mask.where(self.loc_mask['CH'] == 1, drop=True)
    timer = time.perf_counter()  # time.clock() was removed in Python 3.8
    init = True
    for lon, lat in tqdm(zip(lon_idx, lat_idx)):
        loc_mask_local.CH.values[lon, lat] = 2  # mark the sampled cell
        new_slice, _, _, _ = self.get_locslice(lat_idx=lat, lon_idx=lon, random=False)
        new_tbl = self.make_table(new_slice, print_log=False)
        if init:
            subset = new_tbl
            init = False
        else:
            subset = pd.concat([subset, new_tbl])
    if print_log: print ("Created subset in %.2f seconds" %(time.perf_counter() - timer))
    return subset, loc_mask_local
########################################### CREATE TABLES ############################################
def make_table(self, indata=None, print_log=True):
    """Flatten a dataset (default: self.data) into a pandas table, add
    month / day-of-year / timestamp columns and join the per-location
    metadata from self.loc_mask."""
    if indata is None:
        indata = self.data
    timer = time.perf_counter()  # time.clock() was removed in Python 3.8
    coords = self.loc_mask.to_dataframe().dropna().reset_index()
    table = indata.to_dataframe().dropna().reset_index()
    table['month'] = pd.to_datetime(table['date'].values).month
    table['day'] = pd.to_datetime(table['date'].values).dayofyear
    table['timestamp'] = util.to_timestamp(table['date'].values, table['hour'].values)
    # Left-join the location metadata on the (lon, lat) grid coordinates.
    table = pd.merge(left=table, right=coords, how="left", left_on=["lon", "lat"], right_on=["lon", "lat"])
    if print_log: print ("Created table in %.2f seconds" %(time.perf_counter() - timer))
    return table
########################################## DATA SLICES ################################################
def get_timeslice(self, hour = -1, date = -1, hour_idx = -1, date_idx = -1):
"""
Select a single (date, hour) slice of self.data.
Inputs:
hour / date: values used to select the slice directly (-1 = not given)
hour_idx / date_idx: positional indices into the hour/date coordinates (-1 = not given)
If neither a value nor an index is given for hour or date (and the data is
already split into date/hour), a random one is drawn.
Outputs:
(timeslice, date, hour) on success; None if the selection fails
(the error is printed instead of raised).
"""
try:
if self.split_dim == 'date': # IF DATA IS SPLIT
if hour == -1 and hour_idx == -1: # select random time
print("Random selection of hour")
hour_idx = np.random.randint(self.data.sizes['hour'])
if date == -1 and date_idx == -1: # select random date
print("Random selection of date")
date_idx = np.random.randint(self.data.sizes['date'])
# resolve indices to coordinate values when no explicit value was given
if hour == -1:
hour = self.data.hour.values[hour_idx]
if date == -1: # if time was chosen by index
date = self.data.date.values[date_idx]
timeslice = self.data.sel(date = pd.to_datetime(date), hour = hour)
else: # IF DATA IS NOT SPLIT YET
if hour == -1 or date == -1:
print("Error: Please specify date and time")
return
# split only the requested date on the fly, then select the hour
timeslice = self.split_dataset(dates = date).sel(hour = hour)
except:
print("Unexpected error:", sys.exc_info())
print("Verify that date (%s) and hour (%d) are in the data set!" %(date, hour))
return
return timeslice, date, hour
def get_locslice(self, lat_idx = -1, lon_idx = -1, start_date = -1, end_date = -1, random = True):
"""
Select the time series at one grid location.
Inputs:
lat_idx / lon_idx: positional indices of the location (-1 = not given)
start_date / end_date: bounds of the extracted period (-1 = use self.start / self.end)
random: if True, draw a random location inside Switzerland instead of using the indices
Outputs:
(locslice, lon, lat, loc_mask_local) on success; None if the indices are out of
range (the error is printed). loc_mask_local is the CH mask with the chosen
cell set to 2 (1 = in Switzerland, 2 = selected).
"""
if random == True: # select random time
CH = False
# keep drawing until the random cell lies inside Switzerland
while CH == False:
lat_idx = np.random.randint(self.data.sizes['lat'])
lon_idx = np.random.randint(self.data.sizes['lon'])
if self.loc_mask.isel(lon = lon_idx, lat = lat_idx).CH.values == 1: CH = True
print("find random location in Switzerland ...")
# NOTE(review): with the original indentation lost it is unclear whether the
# two lines below belong to the `random` branch; get_subset(random=False)
# relies on loc_mask_local being assigned here - confirm against the original.
loc_mask_local = self.loc_mask.where(self.loc_mask['CH'] == 1, drop=True)
loc_mask_local.CH.values[lon_idx,lat_idx] = 2
if start_date == -1:
start_date = self.start
if end_date == -1:
end_date = self.end
dates = self.date_range(start_date, end_date).strftime("%Y%m%d")
if self.split_dim == 'time':
# data not yet split into (date, hour): split the requested dates on the fly
locslice = self.split_dataset(dates = dates)
else:
locslice = self.data.sel(date = pd.to_datetime(dates))
try:
locslice = locslice.isel(lon = lon_idx, lat = lat_idx)
lon = self.data.lon.item(lon_idx)
lat = self.data.lat.item(lat_idx)
except:
print("Unexpected error:", sys.exc_info()[0])
print("Verify that longitude (idx %d) and latitude (idx %d) are in the data set!" %(lon_idx, lat_idx))
return
return locslice, lon, lat, loc_mask_local
##########################################################################################################
########################################## RAW DATA READING ############################################
##########################################################################################################
########################################## KEY FUNCTIONS #############################################
def make_monthly(self, start_date, end_date):
    """Build one netCDF file per month from the raw daily files and write it
    to self.path as 'YYYY-MM.nc'. Months that fail are reported and skipped."""
    month_start = pd.date_range(start_date, end_date, freq='MS')
    month_end = pd.date_range(start_date, end_date, freq='M')
    for curr_month, curr_month_end in zip(month_start, month_end):
        try:
            self.add_data(curr_month, curr_month_end, reset=True)
            file = curr_month.strftime("%Y-%m") + '.nc'
            self.data.to_netcdf(os.path.join(self.path, file))
        except Exception as err:
            # Narrowed from a bare except (which also swallowed SystemExit /
            # KeyboardInterrupt) and now reports the actual cause.
            print("Could not process month %s (%s)" % (curr_month.strftime("%Y-%m"), err))
    self.start = start_date
    self.end = month_end[-1].strftime("%Y%m%d")
def add_data(self, start_date, end_date, reset=False, print_log=True):
    """Read every variable in self.variables for [start_date, end_date],
    merge them into one dataset and append it to (or initialise) self.data."""
    self.start = start_date
    self.end = end_date
    if reset:
        self.init_data = True
    # Variable code -> (log message, reader method) dispatch table.
    readers = {
        'SIS': ('Adding global ... ', self.read_global),
        'SISDIR': ('Adding direct ... ', self.read_direct),
        'ALB': ('Adding albedo ... ', self.read_albedo),
        'KI': ('Adding clearness index ... ', self.read_clearness),
        'SISCF': ('Adding global clear sky ... ', self.read_clearsky_global),
        'SISDIRCF': ('Adding direct clear sky ... ', self.read_clearsky_direct),
    }
    first = True
    for ds in self.variables:
        if ds not in readers:
            print("ERROR: VARIABLE NOT FOUND")
            return
        label, reader = readers[ds]
        if print_log: print(label)
        new_data = reader(print_log)
        if first:
            new_dataset = new_data
            first = False
        else:
            print('Merging data ... ')
            new_dataset = xr.merge([new_dataset, new_data])
    if self.init_data:
        self.data = new_dataset
        self.init_data = False
    else:
        print('Adding dataset ... ')
        self.data = xr.concat([self.data, new_dataset], self.split_dim)
    if print_log: print ("Reading completed")
######################################### DATA SELECTION ###############################################
def choose_format(self, filename, data_path, print_log=True):
    """Dispatch to the reader matching self.format ('H', 'MMH' or 'WMH');
    prints an error and returns None for an unknown format."""
    if self.format == 'H':
        if print_log: print('Adding raw data ... ')
        return self.read_data(self.start, self.end, filename, data_path, split=self.split_raw)
    if self.format == 'MMH':
        if print_log: print('Adding monthly mean hourly data ... ')
        return self.read_monthly_mean_hourly(filename, data_path)
    if self.format == 'WMH':
        if print_log: print('Adding weekly mean hourly data ... ')
        return self.read_weekly_mean_hourly(filename, data_path)
    print("ERROR: DATA FORMAT NOT FOUND")
    return None
################################# INITIALIZE DATA TYPE READERS #########################################
def _read_variable(self, subdir, var_code, log_header, print_log=True):
    """Shared worker for the read_* convenience methods below.

    - subdir: sub-folder of self.path holding the variable's daily files
    - var_code: variable token in the raw file names (e.g. 'SIS')
    - log_header: printf-style header with two %s slots (start, end dates)
    Returns the dataset produced by choose_format for [self.start, self.end].
    """
    data_path = os.path.join(self.path, subdir)
    filename = lambda date: 'msg.' + var_code + '.H_ch02.lonlat_' + date + '000000.nc'
    if print_log: print (log_header
          %(pd.to_datetime(self.start).strftime("%Y-%m-%d"), pd.to_datetime(self.end).strftime("%Y-%m-%d")))
    timer = time.perf_counter()  # time.clock() was removed in Python 3.8
    data = self.choose_format(filename, data_path, print_log)
    timer = time.perf_counter() - timer
    if print_log: print ("Finished reading in %.2f seconds\n" %(timer))
    return data

def read_global(self, print_log=True):
    """Read global radiation (SIS) from the 'global' subfolder."""
    return self._read_variable('global', 'SIS',
                               "Reading global radiation data from %s to %s ...", print_log)

def read_direct(self, print_log=True):
    """Read direct radiation (SISDIR) from the 'direct' subfolder.
    Fixed: the end of the logged period is now self.end (the original
    printed self.start twice)."""
    return self._read_variable('direct', 'SISDIR',
                               "Reading direct radiation data from %s to %s ....", print_log)

def read_albedo(self, print_log=True):
    """Read surface albedo (ALB) from the 'albedo' subfolder."""
    return self._read_variable('albedo', 'ALB',
                               "Reading surface albedo data from %s to %s ....", print_log)

def read_clearness(self, print_log=True):
    """Read clearness index (KI) from the 'clearness' subfolder."""
    return self._read_variable('clearness', 'KI',
                               "Reading clearness index data from %s to %s ....", print_log)

def read_clearsky_global(self, print_log=True):
    """Read global clear-sky radiation (SISCF) from 'clearsky-global'."""
    return self._read_variable('clearsky-global', 'SISCF',
                               "Reading global clear sky radiation data from %s to %s ....", print_log)

def read_clearsky_direct(self, print_log=True):
    """Read direct clear-sky radiation (SISDIRCF) from 'clearsky-direct'."""
    return self._read_variable('clearsky-direct', 'SISDIRCF',
                               "Reading direct clear sky radiation data from %s to %s ....", print_log)
##################################### READING RAW DATA ###############################################
def read_data(self, start_date, end_date, filename, data_path, split):
    """Read the daily raw netCDF files in [start_date, end_date], keep only
    the configured hour window and Swiss locations, optionally collapse each
    day to one diurnal cycle (split=True), and return the concatenation.

    NOTE(review): this definition shadows the earlier `read_data` in this
    class (different signature); the earlier one is unreachable - confirm
    intent and rename one of them.
    """
    split_dim = 'date' if split else 'time'
    dates = pd.date_range(start_date, end_date)
    init = True
    data_out = None  # stays None if every file fails to read (was a NameError)
    for date in tqdm(dates):
        file = filename(date.strftime("%Y%m%d"))
        try:
            new_data = xr.open_dataset(os.path.join(data_path, file))
            if init:  # first file: align the location mask with the file's grid
                self.loc_mask = self.loc_mask.reindex_like(new_data, 'nearest')
            new_data = new_data.where(abs(new_data['time.hour'] - self.offset) <= self.rnge, drop=True)  # only keep relevant hours
            new_data = new_data.where(self.loc_mask['CH'] == 1, drop=True).drop('geographic_projection')  # only keep relevant locations & data
            if self.sample: new_data = new_data.where(self.loc_mask['sample'] == 1)  # remove all samples that are not of interest, but do not reduce overall size
            if split:
                new_data = new_data.groupby('time.hour').mean(dim='time')  # collapse the day to one diurnal cycle
                new_data.coords['date'] = date
            if init:  # for first iteration
                data_out = new_data
                init = False
            else:
                data_out = xr.concat([data_out, new_data], split_dim)
        except Exception:  # narrowed from a bare except
            print("Unexpected error:", sys.exc_info()[0])
            print("Check file in %s" %(os.path.join(data_path, file)))
    return data_out
####################################### SPLITTING DATASET #######################################
def split_dataset(self, dates=None, print_log=True):
    """Split the un-split hourly dataset (dimension 'time') into a dataset
    indexed by ('date', 'hour'); returns None (after printing an error) if
    the split fails. `dates` defaults to the whole loaded period.
    (Default changed from a shared mutable `[]` to None - same behaviour.)"""
    try:
        if dates is None or len(dates) == 0:  # if nothing was passed to the function
            dates = self.date_range(self.start, self.end)
        date_range = pd.to_datetime(dates).strftime("%Y%m%d")
        if print_log: print ("Splitting all data from %s to %s ...\nThis is faster if split_raw is selected in initialisation of reader!"
              %(pd.to_datetime(dates[0]).strftime("%Y-%m-%d"), pd.to_datetime(dates[-1]).strftime("%Y-%m-%d")))
        timer = time.perf_counter()  # time.clock() was removed in Python 3.8
        init = True
        for day in date_range:
            # NOTE(review): slice(day) is slice(None, day) and therefore selects
            # ALL times up to `day`, not just that day - confirm whether
            # slice(day, day) was intended.
            day_data = self.data.sel(time = slice(day))
            data = day_data.groupby('time.hour').mean(dim='time')
            data.coords['date'] = pd.to_datetime(day)
            if init:
                splitted_data = data
                init = False
            else:
                splitted_data = xr.concat([splitted_data, data], 'date')  # add new slice
        timer = time.perf_counter() - timer
        if print_log: print ("Finished splitting data in %.2f seconds" %(timer))
    except Exception:  # narrowed from a bare except
        print("Error: Verify that date has right format; set ['datestring']")
        return
    return splitted_data

Event Timeline