meteo_raw.py
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Mon, Apr 28, 04:50

meteo_raw.py
View Options

	import numpy as np
	from tqdm import tqdm
	import time
	import pandas as pd
	import xarray as xr
	import os
	import util

	class Meteo_Raw():

	def __init__(self, data_path, start_time = 3, end_time = 19, variables = None, data_format = 'H', split_raw = True):
	'''
	Initialisation settings:
	- path: root of raw data, with subfolder 'global', 'direct', 'albedo'
	- start_time, end_time: define mask for time selection (min 0, max 23)
	- vars: list of variables of any combination of ['SIS', 'SISDIR', 'ALB', 'KI', 'SISCF']
	- data_format: 'H': raw data (hourly values); 'MMH': monthly-mean-hourly; 'WWH': weekly-mean-hourly
	- split_data: Only for 'H' - True for performing split into hour and date
	'''
	_all_vars = ['SIS', 'SISDIR', 'SISCF', 'SISDIRCF', 'KI', 'ALB']

	self.path = data_path

	self.offset = abs((start_time + end_time)/2) # for dropping of irrelevant data
	self.rnge = abs((end_time - start_time)/2) # for dropping of irrelevant data

	if variables == None:
	self.variables = _all_vars
	else:
	self.variables = variables
	self.format = data_format
	self.split_raw = split_raw
	if not split_raw and data_format == 'H':
	self.split_dim = 'time'
	else: self.split_dim = 'date' # dimension for concatenation of new data

	self.loc_mask = xr.open_dataset(os.path.join(data_path,'locations_mask.nc'))
	self._loc_ftrs = self.loc_mask.to_dataframe().columns

	# INITIALISATIONS
	self.init_data = True
	self.init_reading = True
	self.sample = False

	##########################################################################################################
	########################################## RAW DATA READING ############################################
	##########################################################################################################

	def make_monthly(self, start_date, end_date):
	month_start = pd.date_range(start_date, end_date, freq = 'MS')
	month_end = pd.date_range(start_date, end_date, freq = 'M')

	for curr_month, curr_month_end in zip(month_start, month_end):
	try:
	self.add_data(curr_month, curr_month_end, reset = True)
	file = curr_month.strftime("%Y-%m") +'.nc'
	self.data.to_netcdf(os.path.join(self.path,file))
	except:
	print("Could not process month %s" %curr_month.strftime("%Y-%m"))

	self.start = None
	self.end = None

	def add_data(self, start_date, end_date, reset = False, print_log = True):

	self.start = start_date
	self.end = end_date

	if reset: self.init_data = True
	init_merge = True

	for ds in self.variables:
	if ds == 'SIS':
	if print_log: print('Adding global ... ')
	new_data = self.read_global(print_log)

	elif ds == 'SISDIR':
	if print_log: print('Adding direct ... ')
	new_data = self.read_direct(print_log)

	elif ds == 'ALB':
	if print_log: print('Adding albedo ... ')
	new_data = self.read_albedo(print_log)

	elif ds == 'KI':
	if print_log: print('Adding clearness index ... ')
	new_data = self.read_clearness(print_log)

	elif ds == 'SISCF':
	if print_log: print('Adding global clear sky ... ')
	new_data = self.read_clearsky_global(print_log)

	elif ds == 'SISDIRCF':
	if print_log: print('Adding direct clear sky ... ')
	new_data = self.read_clearsky_direct(print_log)

	else:
	print("ERROR: VARIABLE NOT FOUND")
	return

	if init_merge:
	new_dataset = new_data
	init_merge = False
	else:
	print('Merging data ... ')
	new_dataset = xr.merge([new_dataset, new_data])

	if self.init_data:
	self.data = new_dataset
	self.init_data = False
	else:
	print('Adding dataset ... ')
	self.data = xr.concat([self.data,new_dataset], self.split_dim)

	if print_log: print ("Reading completed")

	######################################### DATA SELECTION ###############################################

	def choose_format(self, filename, data_path, print_log = True):

	if self.format == 'H':
	if print_log: print('Adding raw data ... ')
	data_out = self.read_data(self.start, self.end, filename, data_path, split = self.split_raw)

	elif self.format == 'MMH':
	if print_log: print('Adding monthly mean hourly data ... ')
	data_out = self.read_monthly_mean_hourly(filename, data_path)

	elif self.format == 'WMH':
	if print_log: print('Adding weekly mean hourly data ... ')
	data_out = self.read_weekly_mean_hourly(filename, data_path)

	else:
	print("ERROR: DATA FORMAT NOT FOUND")
	return

	return data_out

	################################# INITIALIZE DATA TYPE READERS #########################################

	def read_global(self, print_log = True):

	data_path = os.path.join(self.path,'global')

	filename = lambda date: 'msg.SIS.H_ch02.lonlat_' + date + '000000.nc'

	# create data set:
	if print_log: print ("Reading global radiation data from %s to %s ..."
	%(pd.to_datetime(self.start).strftime("%Y-%m-%d"), pd.to_datetime(self.end).strftime("%Y-%m-%d")))

	timer = time.clock()

	global_data = self.choose_format(filename, data_path, print_log)

	timer = time.clock() - timer
	if print_log: print ("Finished reading in %.2f seconds\n" %(timer))

	return global_data

	def read_direct(self, print_log = True):

	data_path = os.path.join(self.path,'direct')

	filename = lambda date: 'msg.SISDIR.H_ch02.lonlat_' + date + '000000.nc'

	# create data set:
	if print_log: print ("Reading direct radiation data from %s to %s ...."
	%(pd.to_datetime(self.start).strftime("%Y-%m-%d"), pd.to_datetime(self.start).strftime("%Y-%m-%d")))

	timer = time.clock()

	direct_data = self.choose_format(filename, data_path, print_log)

	timer = time.clock() - timer
	if print_log: print ("Finished reading in %.2f seconds\n" %(timer))

	return direct_data

	def read_albedo(self, print_log = True):

	data_path = os.path.join(self.path,'albedo')

	filename = lambda date: 'msg.ALB.H_ch02.lonlat_' + date + '000000.nc'

	# create data set:
	if print_log: print ("Reading surface albedo data from %s to %s ...."
	%(pd.to_datetime(self.start).strftime("%Y-%m-%d"), pd.to_datetime(self.end).strftime("%Y-%m-%d")))

	timer = time.clock()

	albedo_data = self.choose_format(filename, data_path, print_log)

	timer = time.clock() - timer
	if print_log: print ("Finished reading in %.2f seconds\n" %(timer))

	return albedo_data

	def read_clearness(self, print_log = True):

	data_path = os.path.join(self.path,'clearness')

	filename = lambda date: 'msg.KI.H_ch02.lonlat_' + date + '000000.nc'

	# create data set:
	if print_log: print ("Reading clearness index data from %s to %s ...."
	%(pd.to_datetime(self.start).strftime("%Y-%m-%d"), pd.to_datetime(self.end).strftime("%Y-%m-%d")))

	timer = time.clock()

	clearness_data = self.choose_format(filename, data_path, print_log)

	timer = time.clock() - timer
	if print_log: print ("Finished reading in %.2f seconds\n" %(timer))

	return clearness_data

	def read_clearsky_global(self, print_log = True):

	data_path = os.path.join(self.path,'clearsky-global')

	filename = lambda date: 'msg.SISCF.H_ch02.lonlat_' + date + '000000.nc'

	# create data set:
	if print_log: print ("Reading global clear sky radiation data from %s to %s ...."
	%(pd.to_datetime(self.start).strftime("%Y-%m-%d"), pd.to_datetime(self.end).strftime("%Y-%m-%d")))

	timer = time.clock()

	clearness_data = self.choose_format(filename, data_path, print_log)

	timer = time.clock() - timer
	if print_log: print ("Finished reading in %.2f seconds\n" %(timer))

	return clearness_data


	def read_clearsky_direct(self, print_log = True):

	data_path = os.path.join(self.path,'clearsky-direct')

	filename = lambda date: 'msg.SISDIRCF.H_ch02.lonlat_' + date + '000000.nc'

	# create data set:
	if print_log: print ("Reading direct clear sky radiation data from %s to %s ...."
	%(pd.to_datetime(self.start).strftime("%Y-%m-%d"), pd.to_datetime(self.end).strftime("%Y-%m-%d")))

	timer = time.clock()

	clearness_data = self.choose_format(filename, data_path, print_log)

	timer = time.clock() - timer
	if print_log: print ("Finished reading in %.2f seconds\n" %(timer))

	return clearness_data

	########################################## MONTHLY MEAN HOURLY ##########################################

	def read_monthly_mean_hourly(self, filename, data_path, print_log = False):

	month_start = pd.date_range(self.start, self.end, freq = 'MS') # create list of first day in month
	month_end = pd.date_range(self.start, self.end, freq = 'M') # create list in last day of month

	self.start = month_start[0] # homogenise start and end
	self.end = month_end[-1]

	if print_log: print ("Reading all data from %s to %s ...."
	%(pd.to_datetime(self.start).strftime("%Y-%m-%d"), pd.to_datetime(self.end).strftime("%Y-%m-%d")))
	timer = time.clock()

	init = True
	for (curr_month, curr_month_end) in zip(month_start, month_end):

	new_data = self.read_data(curr_month, curr_month_end, filename, data_path, split = False)

	hourly_mean = new_data.groupby('time.hour').mean(dim = 'time') # create monthly mean hourly for the current month
	hourly_mean.coords['date'] = curr_month # label current data - get only month number with: curr_month.month

	if init: # for first iteration
	data_out = hourly_mean
	init = False
	else:
	data_out = xr.concat([data_out, hourly_mean], 'date') # add new slice

	timer = time.clock() - timer
	if print_log: print ("Finished reading in %.2f seconds" %(timer))

	return data_out

	########################################## WEEKLY MEAN HOURLY #######################################

	def read_weekly_mean_hourly(self, filename, data_path, print_log = False):

	week_start = pd.date_range(self.start, self.end, freq = 'W-MON') # create list of first day in month
	week_end = pd.date_range(week_start[0], self.end, freq = 'W-SUN') # create list in last day of month

	self.start = week_start[0] # homogenise start and end
	self.end = week_end[-1]

	if print_log: print ("Reading all data from %s to %s ...."
	%(pd.to_datetime(self.start).strftime("%Y-%m-%d"), pd.to_datetime(self.end).strftime("%Y-%m-%d")))
	timer = time.clock()

	init = True
	for (curr_week, curr_week_end) in zip(week_start, week_end):

	new_data = self.read_data(curr_week, curr_week_end, filename, data_path, split = False)

	hourly_mean = new_data.groupby('time.hour').mean(dim = 'time') # create monthly mean hourly for the current month
	hourly_mean.coords['date'] = curr_week # label current data

	if init: # for first iteration
	data_out = hourly_mean
	init = False
	else:
	data_out = xr.concat([data_out, hourly_mean], 'date') # add new slice

	timer = time.clock() - timer
	if print_log: print ("Finished reading in %.2f seconds" %(timer))

	return data_out

	##################################### READING RAW DATA ###############################################

	def read_data(self, start_date, end_date, filename, data_path, split):
	if split: split_dim = 'date'
	else: split_dim = 'time'

	dates = pd.date_range(start_date, end_date)
	init = True

	for date in tqdm(dates):
	file = filename(date.strftime("%Y%m%d"))

	try:
	new_data = xr.open_dataset(os.path.join(data_path,file))
	if init: # for first iteration
	self.loc_mask = self.loc_mask.reindex_like(new_data,'nearest')

	new_data = new_data.where(abs(new_data['time.hour'] - self.offset) <= self.rnge, drop = True) # only keep relevant hours
	new_data = new_data.where(self.loc_mask['CH'] == 1, drop=True).drop('geographic_projection') # only keep relevant locations & data

	if self.sample: new_data = new_data.where(self.loc_mask['sample'] == 1) # remove all samples that are not of interest, but do not reduce overall size

	if split:
	new_data = new_data.groupby('time.hour').mean(dim = 'time') # create monthly mean hourly for the current month
	new_data.coords['date'] = date

	if init: # for first iteration
	data_out = new_data # drop geographic projection (cut out unnecessary data)
	init = False
	else:
	data_out = xr.concat([data_out,new_data], split_dim)

	except:
	print("Unexpected error:", sys.exc_info()[0])
	print("Check file in %s" %(os.path.join(data_path,file)))

	return data_out

	####################################### SPLITTING DATASET #######################################

	def split_dataset(self, dates = [], print_log = True):

	try:
	if len(dates) == 0: # if nothing was passed to the funcion
	dates = self.date_range(self.start, self.end)

	date_range = pd.to_datetime(dates).strftime("%Y%m%d")

	if print_log: print ("Splitting all data from %s to %s ...\nThis is faster if split_raw is selected in initialisation of reader!"
	%(pd.to_datetime(dates[0]).strftime("%Y-%m-%d"), pd.to_datetime(dates[-1]).strftime("%Y-%m-%d")))
	timer = time.clock()

	init = True
	for day in date_range:
	day_data = self.data.sel(time = slice(day))
	data = day_data.groupby('time.hour').mean(dim = 'time')
	data.coords['date'] = pd.to_datetime(day)

	if init:
	splitted_data = data
	init = False
	else:
	splitted_data = xr.concat([splitted_data, data], 'date') # add new slice

	timer = time.clock() - timer
	if print_log: print ("Finished splitting data in %.2f seconds" %(timer))

	except:
	print("Error: Verify that date has right format; set ['datestring']")
	return

	return splitted_data


	###################################### UTIL ################################################

	def date_range(self, start_date, end_date):

	if self.format == 'H':
	return pd.date_range(start_date, end_date, freq = 'D')

	elif self.format == 'MMH':
	return pd.date_range(start_date, end_date, freq = 'MS')

	elif self.format == 'WMH':
	return pd.date_range(start_date, end_date, freq = 'W-MON')

meteo_raw.pyNo OneTemporaryActions

File Metadata

meteo_raw.pyView Options

Event Timeline

meteo_raw.py
No OneTemporary
Actions

meteo_raw.py
View Options