# meteo_data.py
import numpy as np
import time
import pandas as pd
import xarray as xr
import os
import util
from meteo_raw import Meteo_Raw
class Meteo_Reader():
    def __init__(self, data_path, variables = None, data_format = 'H'):
        '''
        Initialisation settings:
        - data_path: root of raw data, with subfolders 'global', 'direct', 'albedo'
        - variables: list of variables, any combination of ['SIS', 'SISDIR', 'ALB', 'KI', 'SISCF']; defaults to all variables
        - data_format: 'H': raw data (hourly values); 'MMH': monthly-mean-hourly
        '''
        self._all_vars = ['SIS', 'SISDIR', 'SISCF', 'SISDIRCF', 'KI', 'ALB', 'KI_SIS', 'KI_SISDIR']
        self.path = data_path
        self.format = data_format
        if variables is None:
            self.variables = self._all_vars
        else:
            self.variables = variables
        self.drop_vars = list(set(self._all_vars) - set(self.variables))
        print(data_path)
        print(os.path.join(data_path, 'locations_mask.nc'))
        self.loc_mask = xr.open_dataset(os.path.join(data_path, 'locations_mask.nc'))
        self._loc_ftrs = self.loc_mask.to_dataframe().columns
        # self.loc_mask['sample'] = self.loc_mask['CH']
        # INITIALISATIONS
        self.init_data = True
        self.init_reading = True
        self.sample = False
        self.loaded_hourmask = False
    ##########################################################################################
    def read_yearly(self, year, variables = None, hourmask = True):
        # read one pre-processed year (<year>.nc) into self.data
        if variables is not None:
            self.variables = variables
            self.drop_vars = list(set(self._all_vars) - set(self.variables))
        filename = str(year) + '.nc'
        if not os.path.exists(os.path.join(self.path, filename)):
            print("file %s does not exist - please create file from raw data" %filename)
        try:
            self.data = xr.open_dataset(os.path.join(self.path, filename))
            self.data = self.data.drop(self.drop_vars)
            if self.sample: self.data = self.data.where(self.loc_mask['sample'] == 1)
        except Exception:
            print("failed to read data in %s" %filename)
        if hourmask: self.data = self.add_hourmask(year = year)
        self.start = pd.to_datetime(self.data.date.values[0]).strftime("%Y%m%d")
        self.end = pd.to_datetime(self.data.date.values[-1]).strftime("%Y%m%d")
    def read_data(self, start_date, end_date, variables = None, hourmask = True, reset = False, print_log = True):
        # read pre-processed monthly files between start_date and end_date, creating missing files from raw data
        if variables is not None:
            self.variables = variables
            self.drop_vars = list(set(self._all_vars) - set(self.variables))
        reader = Meteo_Raw(self.path, variables = self._all_vars)
        if reset:
            self.init_reading = True
        month_start = pd.date_range(start_date, end_date, freq = 'MS')
        month_end = pd.date_range(start_date, end_date, freq = 'M')
        timer = time.perf_counter()  # time.clock() was removed in Python 3.8
        init = True
        for curr_month, curr_month_end in zip(month_start, month_end):
            monthfile = curr_month.strftime('%Y-%m') + '.nc'
            print(monthfile)
            if not os.path.exists(os.path.join(self.path, monthfile)):
                print("file %s does not exist - creating file from raw data" %monthfile)
                reader.make_monthly(curr_month, curr_month_end)
            try:
                new_data = xr.open_dataset(os.path.join(self.path, monthfile))
                new_data = new_data.drop(self.drop_vars)
                if self.sample: new_data = new_data.where(self.loc_mask['sample'] == 1)
                if self.format == 'MMH':
                    new_data = new_data.mean(dim = 'date') # create monthly mean hourly for the current month
                    new_data.coords['date'] = curr_month
                elif self.format != 'H':
                    print("Unknown format to read data")
                if hourmask: new_data = self.add_hourmask(year = curr_month.year, month = curr_month.month, data = new_data)
                if init:
                    dataset = new_data
                    init = False
                else:
                    dataset = xr.concat([dataset, new_data], 'date')
            except Exception:
                print("failed to read or concatenate data in %s" %monthfile)
        if self.init_reading:
            self.data = dataset
            self.init_reading = False
        else:
            self.data = xr.merge([self.data, dataset])
        timer = time.perf_counter() - timer
        if print_log: print ("Finished reading in %.2f seconds" %(timer))
        if hourmask: self.data = self.add_hourmask()
        self.start = month_start[0]
        self.end = month_end[-1].strftime("%Y%m%d")
    ########################################### SUBSETS ###################################################
    def read_sample(self, matrix_in = None, filename = None, sample_name = 'sample', sample_raw = True):
        # read input matrix for sampling locations
        if filename is not None:
            matrix_in = np.loadtxt(filename)
        try:
            self.loc_mask[sample_name] = xr.DataArray(matrix_in, coords = self.loc_mask.coords)
        except ValueError:
            self.loc_mask[sample_name] = xr.DataArray(matrix_in.T, coords = self.loc_mask.coords)
        if sample_raw: self.sample = True
    def create_regular_sample(self, downsampling_ratio, filename = None, sample_raw = True):
        n_lat = len(self.loc_mask.lat.values)
        n_lon = len(self.loc_mask.lon.values)
        smpl_int = int(np.floor(np.sqrt(downsampling_ratio)))
        # CREATE MATRIX OF THE SAME SIZE AS LOC_MASK
        matrix_reg = np.zeros((n_lat, n_lon))
        matrix_reg[::smpl_int, ::smpl_int] = 1
        matrix_reg = matrix_reg * self.loc_mask.CH.values.T
        self.loc_mask['sample'] = xr.DataArray(matrix_reg, coords = self.loc_mask.coords)
        if sample_raw: self.sample = True
        if filename is not None:
            np.savetxt(filename, matrix_reg)
    def create_random_sample(self, n_sample, filename = None, sample_raw = True):
        n_lat = len(self.loc_mask.lat.values)
        n_lon = len(self.loc_mask.lon.values)
        # CREATE MATRIX OF THE SAME SIZE AS LOC_MASK
        loc_mask_local = self.loc_mask.where(self.loc_mask['CH'] == 1)
        indices_CH = loc_mask_local.to_dataframe().reset_index().dropna().index # obtain list of indices in original dataset that are in CH
        indices_random = np.random.permutation(indices_CH)[:n_sample] # obtain list of random indices
        matrix_rand = np.zeros((n_lon*n_lat, 1))
        matrix_rand[indices_random] = 1 # get matrix with random values set to 1
        matrix_rand = matrix_rand.reshape((n_lat, n_lon))
        self.loc_mask['sample'] = xr.DataArray(matrix_rand, coords = self.loc_mask.coords)
        if sample_raw: self.sample = True
        if filename is not None:
            np.savetxt(filename, matrix_rand)
    def split_sample(self, ratio = 0.8, n_split = None, filename = None):
        # split the current sample into 'sample_train' and 'sample_test' location masks
        n_lat = len(self.loc_mask.lat.values)
        n_lon = len(self.loc_mask.lon.values)
        if n_split is None:
            n_split = int(round(ratio * np.sum(self.loc_mask['sample'].values)))
        loc_mask_local = self.loc_mask.where(self.loc_mask['sample'] == 1)
        indices_sub = loc_mask_local.to_dataframe().reset_index().dropna().index # obtain list of indices in original dataset that are in the sample
        indices_random = np.random.permutation(indices_sub)[:n_split] # obtain list of random indices
        matrix_rand = np.zeros((n_lon*n_lat, 1))
        matrix_rand[indices_random] = 1
        matrix_rand = matrix_rand.reshape((n_lat, n_lon))
        self.loc_mask['sample_train'] = xr.DataArray(matrix_rand, coords = self.loc_mask.coords)
        self.loc_mask['sample_test'] = self.loc_mask['sample'] - self.loc_mask['sample_train']
        if filename is not None:
            path = os.path.splitext(filename)[0]
            train_file = path + '_train.txt'
            test_file = path + '_test.txt'
            np.savetxt(train_file, matrix_rand)
            np.savetxt(test_file, self.loc_mask['sample_test'].values)
    def get_subset(self, matrix_in = None, filename = None, sample_name = None, print_log = True):
        # return self.data restricted to the locations flagged in loc_mask[sample_name]
        if sample_name is None:
            sample_name = 'sample'
        if not (matrix_in is None and filename is None):
            self.read_sample(matrix_in, filename, sample_name, sample_raw = False)
        subset = self.data.where(self.loc_mask[sample_name] == 1)
        return subset
    def add_hourmask(self, year = None, month = None, data = None):
        # merge the hour mask matching the current format (monthly-mean-hourly or daily) into the data
        if data is None:
            data = self.data
        if self.format == 'MMH':
            if year is None:
                self.hourmask = xr.open_dataset(os.path.join(self.path, 'hour_mask_mmh.nc'))
                return xr.merge([data, self.hourmask])
            else: return data
        elif year is not None:
            self.hourmask = xr.open_dataset(os.path.join(self.path, 'hour_mask_day.nc'))
            dates = list(pd.date_range(str(year) + '0101', str(year) + '1231', freq = 'D'))
            if len(dates) == 366:
                dates.pop(59) # leap year: drop Feb 29 (index 59) so the dates align with the 365-day hour mask
            self.hourmask.coords['date'] = dates
            if month is not None:
                month_start, month_end = util.month_to_dates(year, month)
                self.hourmask = self.hourmask.sel(date = slice(month_start, month_end))
            return xr.merge([data, self.hourmask])
        else: return data
    ########################################### CREATE TABLES ############################################
    def make_table(self, indata = None, ftrs = None, print_log = True):
        # flatten the dataset into a pandas table with location features and date/time columns
        if indata is None:
            indata = self.data
        timer = time.perf_counter()  # time.clock() was removed in Python 3.8
        if ftrs is None:
            ftrs = self._loc_ftrs
        for f in ftrs:
            if f in self._loc_ftrs:
                indata[f] = self.loc_mask[f]
        table = indata.to_dataframe().dropna().reset_index()
        table['month'] = pd.to_datetime(table['date'].values).month
        table['day'] = pd.to_datetime(table['date'].values).dayofyear
        table['timestamp'] = util.to_timestamp(table['date'].values, table['hour'].values)
        if print_log: print ("Created table in %.2f seconds" %(time.perf_counter() - timer))
        return table
    ###################################### UTIL ##############################################
    def date_range(self, start_date, end_date):
        if self.format == 'H':
            return pd.date_range(start_date, end_date, freq = 'D')
        elif self.format == 'MMH':
            return pd.date_range(start_date, end_date, freq = 'MS')
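
###################################### USAGE EXAMPLE (SKETCH) ##############################################
# Minimal usage sketch. The data root, variable list and date range below are placeholders; the
# directory is assumed to contain 'locations_mask.nc', the hour-mask files and the pre-processed
# monthly .nc files that the methods above expect.
if __name__ == '__main__':
    reader = Meteo_Reader('/path/to/meteo_data', variables = ['SIS', 'SISDIR'], data_format = 'H')
    reader.create_regular_sample(downsampling_ratio = 16)  # keep every 4th grid point in lat and lon within CH
    reader.read_data('2015-01-01', '2015-03-31', hourmask = True)
    reader.split_sample(ratio = 0.8)                        # random 80/20 split of the sampled locations
    train_table = reader.make_table(indata = reader.get_subset(sample_name = 'sample_train'))
    print(train_table.head())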
