Page MenuHomec4science

Radiation_Reader.py
No OneTemporary

File Metadata

Created
Thu, May 1, 04:53

Radiation_Reader.py

import numpy as np
import scipy as sc
from tqdm import tqdm
import time
import pandas as pd
import xarray as xr
import os
import sys
import util
class Radiation_Reader:
# sunrise = [6, 6, 5, 4, 3, 3, 3, 4, 4, 5, 6, 6] # actual sunrise hours (in GMT)
# sunset = [16, 17, 17, 18, 19, 19, 19, 18, 18, 17, 16, 15] # actual sunset hours (in GMT)
def __init__(self, data_path, start_time=3, end_time=19, variables=None, data_format='H', split_raw=True):
    '''
    Initialisation settings:
    - data_path: root of raw data, with subfolders 'global', 'direct', 'albedo', ...
    - start_time, end_time: define mask for time selection (min 0, max 23)
    - variables: list of any combination of ['SIS', 'SISDIR', 'ALB', 'KI', 'SISCF', 'SISDIRCF']
      (defaults to ['SIS', 'SISDIR'])
    - data_format: 'H': raw data (hourly values); 'MMH': monthly-mean-hourly; 'WMH': weekly-mean-hourly
    - split_raw: Only for 'H' - True for performing split into hour and date
    '''
    # Avoid the shared mutable-default-argument pitfall: the old default
    # list object was shared between every instance of the class.
    if variables is None:
        variables = ['SIS', 'SISDIR']
    self.path = data_path
    # Centre and half-width of the hour-of-day window; used later to drop
    # hours outside [start_time, end_time] (see read_data).
    self.offset = abs((start_time + end_time)/2)  # for dropping of irrelevant data
    self.rnge = abs((end_time - start_time)/2)  # for dropping of irrelevant data
    self.variables = variables
    self.format = data_format
    self.split_raw = split_raw
    if not split_raw and data_format == 'H':
        self.split_dim = 'time'
    else:
        self.split_dim = 'date'  # dimension for concatenation of new data
    self.loc_mask = xr.open_dataset(os.path.join(data_path, 'locations_mask.nc'))
    # INITIALISATIONS
    self.init_data = True      # first call to add_data still pending
    self.init_reading = True   # first call to read_data still pending
    self.sample = False        # whether a location sub-sample mask is active
def date_range(self, start_date, end_date):
if self.format == 'H':
return pd.date_range(start_date, end_date, freq = 'D')
elif self.format == 'MMH':
return pd.date_range(start_date, end_date, freq = 'MS')
elif self.format == 'WMH':
return pd.date_range(start_date, end_date, freq = 'W-MON')
########################################### READ DATA #############################################
def read_data(self, start_date, end_date, reset=False, print_log=True):
    """Read (creating first, if missing) the monthly netCDF files covering
    [start_date, end_date] and store the concatenated result in self.data.

    NOTE(review): this method is shadowed by the second `read_data`
    definition further down in the class (same name, different signature),
    so as written it is unreachable - confirm which of the two is intended
    and rename one of them.
    """
    if reset:
        self.init_reading = True
    # VERY CRUDE JUST TO MAKE OLD AND NEW WORK TOGETHER - SHOULD BE RE-WRITTEN
    elif not self.init_reading:  # fixed NameError: was bare `init_reading`
        old_data = self.data
    month_start = pd.date_range(start_date, end_date, freq='MS')
    month_end = pd.date_range(start_date, end_date, freq='M')
    init = True
    for curr_month, curr_month_end in zip(month_start, month_end):
        monthfile = curr_month.strftime('%Y-%m') + '.nc'
        print(monthfile)
        if not os.path.exists(os.path.join(self.path, monthfile)):
            print("file %s does not exist - creating file from raw data" %monthfile)
            self.make_monthly(curr_month, curr_month_end)
        try:
            new_data = xr.open_dataset(os.path.join(self.path, monthfile))
            if init:
                dataset = new_data
                init = False
            else:
                dataset = xr.concat([dataset, new_data], self.split_dim)
        except Exception:  # narrowed from a bare except
            print("failed to read or concatenate data in %s" %monthfile)
    if self.init_reading:
        self.data = dataset
        self.init_reading = False
    else:
        self.data = xr.merge([old_data, dataset])
    self.start = start_date
    self.end = month_end[-1].strftime("%Y%m%d")
########################################## MONTHLY MEAN HOURLY ##########################################
def read_monthly_mean_hourly(self, filename, data_path, print_log=False):
    """Read raw hourly files between self.start and self.end and reduce them
    to one mean diurnal cycle per calendar month (new dimension 'date',
    labelled with the first day of each month)."""
    month_start = pd.date_range(self.start, self.end, freq='MS')  # first day of each month
    month_end = pd.date_range(self.start, self.end, freq='M')     # last day of each month
    self.start = month_start[0]  # homogenise start and end
    self.end = month_end[-1]
    if print_log: print ("Reading all data from %s to %s ...."
          %(pd.to_datetime(self.start).strftime("%Y-%m-%d"), pd.to_datetime(self.end).strftime("%Y-%m-%d")))
    timer = time.perf_counter()  # time.clock() was removed in Python 3.8
    init = True
    for (curr_month, curr_month_end) in zip(month_start, month_end):
        new_data = self.read_data(curr_month, curr_month_end, filename, data_path, split=False)
        hourly_mean = new_data.groupby('time.hour').mean(dim='time')  # mean diurnal cycle for the month
        hourly_mean.coords['date'] = curr_month  # label slice with its month start
        if init:  # for first iteration
            data_out = hourly_mean
            init = False
        else:
            data_out = xr.concat([data_out, hourly_mean], 'date')  # add new slice
    timer = time.perf_counter() - timer
    if print_log: print ("Finished reading in %.2f seconds" %(timer))
    return data_out
########################################## WEEKLY MEAN HOURLY #######################################
def read_weekly_mean_hourly(self, filename, data_path, print_log=False):
    """Read raw hourly files between self.start and self.end and reduce them
    to one mean diurnal cycle per week (Mon-Sun, new dimension 'date',
    labelled with the Monday of each week)."""
    week_start = pd.date_range(self.start, self.end, freq='W-MON')  # Mondays in the period
    week_end = pd.date_range(week_start[0], self.end, freq='W-SUN')  # matching Sundays
    self.start = week_start[0]  # homogenise start and end
    self.end = week_end[-1]
    if print_log: print ("Reading all data from %s to %s ...."
          %(pd.to_datetime(self.start).strftime("%Y-%m-%d"), pd.to_datetime(self.end).strftime("%Y-%m-%d")))
    timer = time.perf_counter()  # time.clock() was removed in Python 3.8
    init = True
    for (curr_week, curr_week_end) in zip(week_start, week_end):
        new_data = self.read_data(curr_week, curr_week_end, filename, data_path, split=False)
        hourly_mean = new_data.groupby('time.hour').mean(dim='time')  # mean diurnal cycle for the week
        hourly_mean.coords['date'] = curr_week  # label slice with its week start
        if init:  # for first iteration
            data_out = hourly_mean
            init = False
        else:
            data_out = xr.concat([data_out, hourly_mean], 'date')  # add new slice
    timer = time.perf_counter() - timer
    if print_log: print ("Finished reading in %.2f seconds" %(timer))
    return data_out
########################################### SUBSETS ###################################################
def read_sample(self, matrix_in=None, filename=None, sample_raw=True):
    """Register a sampling mask (given as an array or loaded from a text
    file) under self.loc_mask['sample'] and return the indices, within the
    Swiss domain, of the sampled locations."""
    # A file, when given, takes precedence over the in-memory matrix.
    if filename is not None:
        matrix_in = np.loadtxt(filename)
    self.loc_mask['sample'] = xr.DataArray(matrix_in, coords=self.loc_mask.coords)
    if sample_raw:
        self.sample = True  # raw reading will honour the sample mask
    inside_ch = self.loc_mask.sample.where(self.loc_mask['CH'] == 1, drop=True)
    return np.argwhere(inside_ch.values == 1)
def create_regular_sample(self, downsampling_ratio, filename=None, sample_raw=True):
    """Build a regular (every k-th cell in both directions) sampling mask
    restricted to Switzerland, store it as self.loc_mask['sample']
    (optionally also saving it to a text file) and return the indices of
    the sampled locations within the Swiss domain."""
    n_lat = len(self.loc_mask.lat.values)
    n_lon = len(self.loc_mask.lon.values)
    # Grid step so that roughly one in `downsampling_ratio` cells is kept.
    step = int(np.floor(np.sqrt(downsampling_ratio)))
    # Matrix of the same size as loc_mask, 1 on the regular grid, 0 elsewhere.
    regular = np.zeros((n_lat, n_lon))
    regular[::step, ::step] = 1
    regular = regular * self.loc_mask.CH.values.T  # restrict to Switzerland
    self.loc_mask['sample'] = xr.DataArray(regular, coords=self.loc_mask.coords)
    if sample_raw:
        self.sample = True  # raw reading will honour the sample mask
    if filename is not None:
        np.savetxt(filename, regular)
    inside_ch = self.loc_mask.sample.where(self.loc_mask['CH'] == 1, drop=True)
    return np.argwhere(inside_ch.values == 1)
def create_random_sample(self, n_sample, filename=None, sample_raw=True):
    """Draw n_sample random locations inside Switzerland, store the
    resulting 0/1 mask as self.loc_mask['sample'] (optionally also saving
    it to a text file) and return (indices of the sampled locations within
    the Swiss domain, the 2-D mask matrix)."""
    n_lat = len(self.loc_mask.lat.values)
    n_lon = len(self.loc_mask.lon.values)
    # Flat row indices of all grid cells lying inside Switzerland.
    ch_only = self.loc_mask.where(self.loc_mask['CH'] == 1)
    ch_rows = ch_only.to_dataframe().reset_index().dropna().index
    # Pick n_sample of them at random.
    chosen = np.random.permutation(ch_rows)[:n_sample]
    # Matrix of the same size as loc_mask, 1 at the chosen cells.
    flat_mask = np.zeros((n_lon*n_lat, 1))
    flat_mask[chosen] = 1
    sample_matrix = flat_mask.reshape((n_lat, n_lon))
    self.loc_mask['sample'] = xr.DataArray(sample_matrix, coords=self.loc_mask.coords)
    if sample_raw:
        self.sample = True  # raw reading will honour the sample mask
    if filename is not None:
        np.savetxt(filename, sample_matrix)
    inside_ch = self.loc_mask.sample.where(self.loc_mask['CH'] == 1, drop=True)
    return np.argwhere(inside_ch.values == 1), sample_matrix
def get_subset(self, lon_idx, lat_idx, print_log=True):
    """Extract the time series at each (lon, lat) index pair, convert each
    to a table and concatenate them. Also returns a location mask in which
    every selected cell is flagged with the value 2."""
    loc_mask_local = self.loc_mask.where(self.loc_mask['CH'] == 1, drop=True)
    timer = time.perf_counter()  # time.clock() was removed in Python 3.8
    init = True
    for lon, lat in tqdm(zip(lon_idx, lat_idx)):
        loc_mask_local.CH.values[lon, lat] = 2  # mark the sampled cell
        new_slice, _, _, _ = self.get_locslice(lat_idx=lat, lon_idx=lon, random=False)
        new_tbl = self.make_table(new_slice, print_log=False)
        if init:
            subset = new_tbl
            init = False
        else:
            subset = pd.concat([subset, new_tbl])
    if print_log: print ("Created subset in %.2f seconds" %(time.perf_counter() - timer))
    return subset, loc_mask_local
########################################### CREATE TABLES ############################################
def make_table(self, indata=None, print_log=True):
    """Flatten a dataset (default: self.data) into a pandas table, add
    month / day-of-year / timestamp columns and join the per-location
    metadata from self.loc_mask."""
    if indata is None:
        indata = self.data
    timer = time.perf_counter()  # time.clock() was removed in Python 3.8
    coords = self.loc_mask.to_dataframe().dropna().reset_index()
    table = indata.to_dataframe().dropna().reset_index()
    table['month'] = pd.to_datetime(table['date'].values).month
    table['day'] = pd.to_datetime(table['date'].values).dayofyear
    table['timestamp'] = util.to_timestamp(table['date'].values, table['hour'].values)
    # Left-join the location metadata on the (lon, lat) grid coordinates.
    table = pd.merge(left=table, right=coords, how="left", left_on=["lon", "lat"], right_on=["lon", "lat"])
    if print_log: print ("Created table in %.2f seconds" %(time.perf_counter() - timer))
    return table
########################################## DATA SLICES ################################################
def get_timeslice(self, hour = -1, date = -1, hour_idx = -1, date_idx = -1):
"""
Select a single (date, hour) slice of self.data.
Inputs:
hour / date: values used to select the slice directly (-1 = not given)
hour_idx / date_idx: positional indices into the hour/date coordinates (-1 = not given)
If neither a value nor an index is given for hour or date (and the data is
already split into date/hour), a random one is drawn.
Outputs:
(timeslice, date, hour) on success; None if the selection fails
(the error is printed instead of raised).
"""
try:
if self.split_dim == 'date': # IF DATA IS SPLIT
if hour == -1 and hour_idx == -1: # select random time
print("Random selection of hour")
hour_idx = np.random.randint(self.data.sizes['hour'])
if date == -1 and date_idx == -1: # select random date
print("Random selection of date")
date_idx = np.random.randint(self.data.sizes['date'])
# resolve indices to coordinate values when no explicit value was given
if hour == -1:
hour = self.data.hour.values[hour_idx]
if date == -1: # if time was chosen by index
date = self.data.date.values[date_idx]
timeslice = self.data.sel(date = pd.to_datetime(date), hour = hour)
else: # IF DATA IS NOT SPLIT YET
if hour == -1 or date == -1:
print("Error: Please specify date and time")
return
# split only the requested date on the fly, then select the hour
timeslice = self.split_dataset(dates = date).sel(hour = hour)
except:
print("Unexpected error:", sys.exc_info())
print("Verify that date (%s) and hour (%d) are in the data set!" %(date, hour))
return
return timeslice, date, hour
def get_locslice(self, lat_idx = -1, lon_idx = -1, start_date = -1, end_date = -1, random = True):
"""
Select the time series at one grid location.
Inputs:
lat_idx / lon_idx: positional indices of the location (-1 = not given)
start_date / end_date: bounds of the extracted period (-1 = use self.start / self.end)
random: if True, draw a random location inside Switzerland instead of using the indices
Outputs:
(locslice, lon, lat, loc_mask_local) on success; None if the indices are out of
range (the error is printed). loc_mask_local is the CH mask with the chosen
cell set to 2 (1 = in Switzerland, 2 = selected).
"""
if random == True: # select random time
CH = False
# keep drawing until the random cell lies inside Switzerland
while CH == False:
lat_idx = np.random.randint(self.data.sizes['lat'])
lon_idx = np.random.randint(self.data.sizes['lon'])
if self.loc_mask.isel(lon = lon_idx, lat = lat_idx).CH.values == 1: CH = True
print("find random location in Switzerland ...")
# NOTE(review): with the original indentation lost it is unclear whether the
# two lines below belong to the `random` branch; get_subset(random=False)
# relies on loc_mask_local being assigned here - confirm against the original.
loc_mask_local = self.loc_mask.where(self.loc_mask['CH'] == 1, drop=True)
loc_mask_local.CH.values[lon_idx,lat_idx] = 2
if start_date == -1:
start_date = self.start
if end_date == -1:
end_date = self.end
dates = self.date_range(start_date, end_date).strftime("%Y%m%d")
if self.split_dim == 'time':
# data not yet split into (date, hour): split the requested dates on the fly
locslice = self.split_dataset(dates = dates)
else:
locslice = self.data.sel(date = pd.to_datetime(dates))
try:
locslice = locslice.isel(lon = lon_idx, lat = lat_idx)
lon = self.data.lon.item(lon_idx)
lat = self.data.lat.item(lat_idx)
except:
print("Unexpected error:", sys.exc_info()[0])
print("Verify that longitude (idx %d) and latitude (idx %d) are in the data set!" %(lon_idx, lat_idx))
return
return locslice, lon, lat, loc_mask_local
##########################################################################################################
########################################## RAW DATA READING ############################################
##########################################################################################################
########################################## KEY FUNCTIONS #############################################
def make_monthly(self, start_date, end_date):
    """Build one netCDF file per month from the raw daily files and write it
    to self.path as 'YYYY-MM.nc'. Months that fail are reported and skipped."""
    month_start = pd.date_range(start_date, end_date, freq='MS')
    month_end = pd.date_range(start_date, end_date, freq='M')
    for curr_month, curr_month_end in zip(month_start, month_end):
        try:
            self.add_data(curr_month, curr_month_end, reset=True)
            file = curr_month.strftime("%Y-%m") + '.nc'
            self.data.to_netcdf(os.path.join(self.path, file))
        except Exception as err:
            # Narrowed from a bare except (which also swallowed SystemExit /
            # KeyboardInterrupt) and now reports the actual cause.
            print("Could not process month %s (%s)" % (curr_month.strftime("%Y-%m"), err))
    self.start = start_date
    self.end = month_end[-1].strftime("%Y%m%d")
def add_data(self, start_date, end_date, reset=False, print_log=True):
    """Read every variable in self.variables for [start_date, end_date],
    merge them into one dataset and append it to (or initialise) self.data."""
    self.start = start_date
    self.end = end_date
    if reset:
        self.init_data = True
    # Variable code -> (log message, reader method) dispatch table.
    readers = {
        'SIS': ('Adding global ... ', self.read_global),
        'SISDIR': ('Adding direct ... ', self.read_direct),
        'ALB': ('Adding albedo ... ', self.read_albedo),
        'KI': ('Adding clearness index ... ', self.read_clearness),
        'SISCF': ('Adding global clear sky ... ', self.read_clearsky_global),
        'SISDIRCF': ('Adding direct clear sky ... ', self.read_clearsky_direct),
    }
    first = True
    for ds in self.variables:
        if ds not in readers:
            print("ERROR: VARIABLE NOT FOUND")
            return
        label, reader = readers[ds]
        if print_log: print(label)
        new_data = reader(print_log)
        if first:
            new_dataset = new_data
            first = False
        else:
            print('Merging data ... ')
            new_dataset = xr.merge([new_dataset, new_data])
    if self.init_data:
        self.data = new_dataset
        self.init_data = False
    else:
        print('Adding dataset ... ')
        self.data = xr.concat([self.data, new_dataset], self.split_dim)
    if print_log: print ("Reading completed")
######################################### DATA SELECTION ###############################################
def choose_format(self, filename, data_path, print_log=True):
    """Dispatch to the reader matching self.format ('H', 'MMH' or 'WMH');
    prints an error and returns None for an unknown format."""
    if self.format == 'H':
        if print_log: print('Adding raw data ... ')
        return self.read_data(self.start, self.end, filename, data_path, split=self.split_raw)
    if self.format == 'MMH':
        if print_log: print('Adding monthly mean hourly data ... ')
        return self.read_monthly_mean_hourly(filename, data_path)
    if self.format == 'WMH':
        if print_log: print('Adding weekly mean hourly data ... ')
        return self.read_weekly_mean_hourly(filename, data_path)
    print("ERROR: DATA FORMAT NOT FOUND")
    return None
################################# INITIALIZE DATA TYPE READERS #########################################
def _read_variable(self, subdir, var_code, log_header, print_log=True):
    """Shared worker for the read_* convenience methods below.

    - subdir: sub-folder of self.path holding the variable's daily files
    - var_code: variable token in the raw file names (e.g. 'SIS')
    - log_header: printf-style header with two %s slots (start, end dates)
    Returns the dataset produced by choose_format for [self.start, self.end].
    """
    data_path = os.path.join(self.path, subdir)
    filename = lambda date: 'msg.' + var_code + '.H_ch02.lonlat_' + date + '000000.nc'
    if print_log: print (log_header
          %(pd.to_datetime(self.start).strftime("%Y-%m-%d"), pd.to_datetime(self.end).strftime("%Y-%m-%d")))
    timer = time.perf_counter()  # time.clock() was removed in Python 3.8
    data = self.choose_format(filename, data_path, print_log)
    timer = time.perf_counter() - timer
    if print_log: print ("Finished reading in %.2f seconds\n" %(timer))
    return data

def read_global(self, print_log=True):
    """Read global radiation (SIS) from the 'global' subfolder."""
    return self._read_variable('global', 'SIS',
                               "Reading global radiation data from %s to %s ...", print_log)

def read_direct(self, print_log=True):
    """Read direct radiation (SISDIR) from the 'direct' subfolder.
    Fixed: the end of the logged period is now self.end (the original
    printed self.start twice)."""
    return self._read_variable('direct', 'SISDIR',
                               "Reading direct radiation data from %s to %s ....", print_log)

def read_albedo(self, print_log=True):
    """Read surface albedo (ALB) from the 'albedo' subfolder."""
    return self._read_variable('albedo', 'ALB',
                               "Reading surface albedo data from %s to %s ....", print_log)

def read_clearness(self, print_log=True):
    """Read clearness index (KI) from the 'clearness' subfolder."""
    return self._read_variable('clearness', 'KI',
                               "Reading clearness index data from %s to %s ....", print_log)

def read_clearsky_global(self, print_log=True):
    """Read global clear-sky radiation (SISCF) from 'clearsky-global'."""
    return self._read_variable('clearsky-global', 'SISCF',
                               "Reading global clear sky radiation data from %s to %s ....", print_log)

def read_clearsky_direct(self, print_log=True):
    """Read direct clear-sky radiation (SISDIRCF) from 'clearsky-direct'."""
    return self._read_variable('clearsky-direct', 'SISDIRCF',
                               "Reading direct clear sky radiation data from %s to %s ....", print_log)
##################################### READING RAW DATA ###############################################
def read_data(self, start_date, end_date, filename, data_path, split):
    """Read the daily raw netCDF files in [start_date, end_date], keep only
    the configured hour window and Swiss locations, optionally collapse each
    day to one diurnal cycle (split=True), and return the concatenation.

    NOTE(review): this definition shadows the earlier `read_data` in this
    class (different signature); the earlier one is unreachable - confirm
    intent and rename one of them.
    """
    split_dim = 'date' if split else 'time'
    dates = pd.date_range(start_date, end_date)
    init = True
    data_out = None  # stays None if every file fails to read (was a NameError)
    for date in tqdm(dates):
        file = filename(date.strftime("%Y%m%d"))
        try:
            new_data = xr.open_dataset(os.path.join(data_path, file))
            if init:  # first file: align the location mask with the file's grid
                self.loc_mask = self.loc_mask.reindex_like(new_data, 'nearest')
            new_data = new_data.where(abs(new_data['time.hour'] - self.offset) <= self.rnge, drop=True)  # only keep relevant hours
            new_data = new_data.where(self.loc_mask['CH'] == 1, drop=True).drop('geographic_projection')  # only keep relevant locations & data
            if self.sample: new_data = new_data.where(self.loc_mask['sample'] == 1)  # remove all samples that are not of interest, but do not reduce overall size
            if split:
                new_data = new_data.groupby('time.hour').mean(dim='time')  # collapse the day to one diurnal cycle
                new_data.coords['date'] = date
            if init:  # for first iteration
                data_out = new_data
                init = False
            else:
                data_out = xr.concat([data_out, new_data], split_dim)
        except Exception:  # narrowed from a bare except
            print("Unexpected error:", sys.exc_info()[0])
            print("Check file in %s" %(os.path.join(data_path, file)))
    return data_out
####################################### SPLITTING DATASET #######################################
def split_dataset(self, dates=None, print_log=True):
    """Split the un-split hourly dataset (dimension 'time') into a dataset
    indexed by ('date', 'hour'); returns None (after printing an error) if
    the split fails. `dates` defaults to the whole loaded period.
    (Default changed from a shared mutable `[]` to None - same behaviour.)"""
    try:
        if dates is None or len(dates) == 0:  # if nothing was passed to the function
            dates = self.date_range(self.start, self.end)
        date_range = pd.to_datetime(dates).strftime("%Y%m%d")
        if print_log: print ("Splitting all data from %s to %s ...\nThis is faster if split_raw is selected in initialisation of reader!"
              %(pd.to_datetime(dates[0]).strftime("%Y-%m-%d"), pd.to_datetime(dates[-1]).strftime("%Y-%m-%d")))
        timer = time.perf_counter()  # time.clock() was removed in Python 3.8
        init = True
        for day in date_range:
            # NOTE(review): slice(day) is slice(None, day) and therefore selects
            # ALL times up to `day`, not just that day - confirm whether
            # slice(day, day) was intended.
            day_data = self.data.sel(time = slice(day))
            data = day_data.groupby('time.hour').mean(dim='time')
            data.coords['date'] = pd.to_datetime(day)
            if init:
                splitted_data = data
                init = False
            else:
                splitted_data = xr.concat([splitted_data, data], 'date')  # add new slice
        timer = time.perf_counter() - timer
        if print_log: print ("Finished splitting data in %.2f seconds" %(timer))
    except Exception:  # narrowed from a bare except
        print("Error: Verify that date has right format; set ['datestring']")
        return
    return splitted_data

Event Timeline