# meteo_data.py
import numpy as np
import time
import pandas as pd
import xarray as xr
import os
import util
from meteo_raw import Meteo_Raw
class Meteo_Reader():
    def __init__(self, data_path, variables = None, data_format = 'H'):
        '''
        Initialisation settings:
        - data_path: root of raw data, with subfolders 'global', 'direct', 'albedo'
        - variables: list of variables, any combination of ['SIS', 'SISDIR', 'ALB', 'KI', 'SISCF']; defaults to all variables
        - data_format: 'H': raw data (hourly values); 'MMH': monthly-mean-hourly
        '''
        self._all_vars = ['SIS', 'SISDIR', 'SISCF', 'SISDIRCF', 'KI', 'ALB', 'KI_SIS', 'KI_SISDIR']
        self.path = data_path
        self.format = data_format
        if variables is None:
            self.variables = self._all_vars
        else:
            self.variables = variables
        self.drop_vars = list(set(self._all_vars) - set(self.variables))
        print(data_path)
        print(os.path.join(data_path, 'locations_mask.nc'))
        self.loc_mask = xr.open_dataset(os.path.join(data_path, 'locations_mask.nc'))
        self._loc_ftrs = self.loc_mask.to_dataframe().columns
        # self.loc_mask['sample'] = self.loc_mask['CH']
        # INITIALISATIONS
        self.init_data = True
        self.init_reading = True
        self.sample = False
        self.loaded_hourmask = False
    ##########################################################################################
    def read_yearly(self, year, variables = None, hourmask = True):
        # read one pre-processed year (<year>.nc) into self.data
        if variables is not None:
            self.variables = variables
            self.drop_vars = list(set(self._all_vars) - set(self.variables))
        filename = str(year) + '.nc'
        if not os.path.exists(os.path.join(self.path, filename)):
            print("file %s does not exist - please create file from raw data" %filename)
        try:
            self.data = xr.open_dataset(os.path.join(self.path, filename))
            self.data = self.data.drop(self.drop_vars)
            if self.sample: self.data = self.data.where(self.loc_mask['sample'] == 1)
        except Exception:
            print("failed to read data in %s" %filename)
        if hourmask: self.data = self.add_hourmask(year = year)
        self.start = pd.to_datetime(self.data.date.values[0]).strftime("%Y%m%d")
        self.end = pd.to_datetime(self.data.date.values[-1]).strftime("%Y%m%d")
    def read_data(self, start_date, end_date, variables = None, hourmask = True, reset = False, print_log = True):
        # read pre-processed monthly files between start_date and end_date, creating missing files from raw data
        if variables is not None:
            self.variables = variables
            self.drop_vars = list(set(self._all_vars) - set(self.variables))
        reader = Meteo_Raw(self.path, variables = self._all_vars)
        if reset:
            self.init_reading = True
        month_start = pd.date_range(start_date, end_date, freq = 'MS')
        month_end = pd.date_range(start_date, end_date, freq = 'M')
        timer = time.perf_counter()  # time.clock() was removed in Python 3.8
        init = True
        for curr_month, curr_month_end in zip(month_start, month_end):
            monthfile = curr_month.strftime('%Y-%m') + '.nc'
            print(monthfile)
            if not os.path.exists(os.path.join(self.path, monthfile)):
                print("file %s does not exist - creating file from raw data" %monthfile)
                reader.make_monthly(curr_month, curr_month_end)
            try:
                new_data = xr.open_dataset(os.path.join(self.path, monthfile))
                new_data = new_data.drop(self.drop_vars)
                if self.sample: new_data = new_data.where(self.loc_mask['sample'] == 1)
                if self.format == 'MMH':
                    new_data = new_data.mean(dim = 'date') # create monthly mean hourly for the current month
                    new_data.coords['date'] = curr_month
                elif self.format != 'H':
                    print("Unknown format to read data")
                if hourmask: new_data = self.add_hourmask(year = curr_month.year, month = curr_month.month, data = new_data)
                if init:
                    dataset = new_data
                    init = False
                else:
                    dataset = xr.concat([dataset, new_data], 'date')
            except Exception:
                print("failed to read or concatenate data in %s" %monthfile)
        if self.init_reading:
            self.data = dataset
            self.init_reading = False
        else:
            self.data = xr.merge([self.data, dataset])
        timer = time.perf_counter() - timer
        if print_log: print ("Finished reading in %.2f seconds" %(timer))
        if hourmask: self.data = self.add_hourmask()
        self.start = month_start[0]
        self.end = month_end[-1].strftime("%Y%m%d")
    ########################################### SUBSETS ###################################################
    def read_sample(self, matrix_in = None, filename = None, sample_name = 'sample', sample_raw = True):
        # read input matrix for sampling locations
        if filename is not None:
            matrix_in = np.loadtxt(filename)
        try:
            self.loc_mask[sample_name] = xr.DataArray(matrix_in, coords = self.loc_mask.coords)
        except ValueError:
            self.loc_mask[sample_name] = xr.DataArray(matrix_in.T, coords = self.loc_mask.coords)
        if sample_raw: self.sample = True
    def create_regular_sample(self, downsampling_ratio, filename = None, sample_raw = True):
        n_lat = len(self.loc_mask.lat.values)
        n_lon = len(self.loc_mask.lon.values)
        smpl_int = int(np.floor(np.sqrt(downsampling_ratio)))
        # CREATE MATRIX OF THE SAME SIZE AS LOC_MASK
        matrix_reg = np.zeros((n_lat, n_lon))
        matrix_reg[::smpl_int, ::smpl_int] = 1
        matrix_reg = matrix_reg * self.loc_mask.CH.values.T
        self.loc_mask['sample'] = xr.DataArray(matrix_reg, coords = self.loc_mask.coords)
        if sample_raw: self.sample = True
        if filename is not None:
            np.savetxt(filename, matrix_reg)
    def create_random_sample(self, n_sample, filename = None, sample_raw = True):
        n_lat = len(self.loc_mask.lat.values)
        n_lon = len(self.loc_mask.lon.values)
        # CREATE MATRIX OF THE SAME SIZE AS LOC_MASK
        loc_mask_local = self.loc_mask.where(self.loc_mask['CH'] == 1)
        indices_CH = loc_mask_local.to_dataframe().reset_index().dropna().index # obtain list of indices in original dataset that are in CH
        indices_random = np.random.permutation(indices_CH)[:n_sample] # obtain list of random indices
        matrix_rand = np.zeros((n_lon*n_lat, 1))
        matrix_rand[indices_random] = 1 # get matrix with random values set to 1
        matrix_rand = matrix_rand.reshape((n_lat, n_lon))
        self.loc_mask['sample'] = xr.DataArray(matrix_rand, coords = self.loc_mask.coords)
        if sample_raw: self.sample = True
        if filename is not None:
            np.savetxt(filename, matrix_rand)
    def split_sample(self, ratio = 0.8, n_split = None, filename = None):
        # split the current sample into 'sample_train' and 'sample_test' location masks
        n_lat = len(self.loc_mask.lat.values)
        n_lon = len(self.loc_mask.lon.values)
        if n_split is None:
            n_split = int(round(ratio * np.sum(self.loc_mask['sample'].values)))
        loc_mask_local = self.loc_mask.where(self.loc_mask['sample'] == 1)
        indices_sub = loc_mask_local.to_dataframe().reset_index().dropna().index # obtain list of indices in original dataset that are in the sample
        indices_random = np.random.permutation(indices_sub)[:n_split] # obtain list of random indices
        matrix_rand = np.zeros((n_lon*n_lat, 1))
        matrix_rand[indices_random] = 1
        matrix_rand = matrix_rand.reshape((n_lat, n_lon))
        self.loc_mask['sample_train'] = xr.DataArray(matrix_rand, coords = self.loc_mask.coords)
        self.loc_mask['sample_test'] = self.loc_mask['sample'] - self.loc_mask['sample_train']
        if filename is not None:
            path = os.path.splitext(filename)[0]
            train_file = path + '_train.txt'
            test_file = path + '_test.txt'
            np.savetxt(train_file, matrix_rand)
            np.savetxt(test_file, self.loc_mask['sample_test'].values)
    def get_subset(self, matrix_in = None, filename = None, sample_name = None, print_log = True):
        # return self.data restricted to the locations flagged in loc_mask[sample_name]
        if sample_name is None:
            sample_name = 'sample'
        if not (matrix_in is None and filename is None):
            self.read_sample(matrix_in, filename, sample_name, sample_raw = False)
        subset = self.data.where(self.loc_mask[sample_name] == 1)
        return subset
    def add_hourmask(self, year = None, month = None, data = None):
        # merge the hour mask matching the current format (monthly-mean-hourly or daily) into the data
        if data is None:
            data = self.data
        if self.format == 'MMH':
            if year is None:
                self.hourmask = xr.open_dataset(os.path.join(self.path, 'hour_mask_mmh.nc'))
                return xr.merge([data, self.hourmask])
            else: return data
        elif year is not None:
            self.hourmask = xr.open_dataset(os.path.join(self.path, 'hour_mask_day.nc'))
            dates = list(pd.date_range(str(year) + '0101', str(year) + '1231', freq = 'D'))
            if len(dates) == 366:
                dates.pop(59) # leap year: drop Feb 29 (index 59) so the dates align with the 365-day hour mask
            self.hourmask.coords['date'] = dates
            if month is not None:
                month_start, month_end = util.month_to_dates(year, month)
                self.hourmask = self.hourmask.sel(date = slice(month_start, month_end))
            return xr.merge([data, self.hourmask])
        else: return data
    ########################################### CREATE TABLES ############################################
    def make_table(self, indata = None, ftrs = None, print_log = True):
        # flatten the dataset into a pandas table with location features and date/time columns
        if indata is None:
            indata = self.data
        timer = time.perf_counter()  # time.clock() was removed in Python 3.8
        if ftrs is None:
            ftrs = self._loc_ftrs
        for f in ftrs:
            if f in self._loc_ftrs:
                indata[f] = self.loc_mask[f]
        table = indata.to_dataframe().dropna().reset_index()
        table['month'] = pd.to_datetime(table['date'].values).month
        table['day'] = pd.to_datetime(table['date'].values).dayofyear
        table['timestamp'] = util.to_timestamp(table['date'].values, table['hour'].values)
        if print_log: print ("Created table in %.2f seconds" %(time.perf_counter() - timer))
        return table
    ###################################### UTIL ##############################################
    def date_range(self, start_date, end_date):
        if self.format == 'H':
            return pd.date_range(start_date, end_date, freq = 'D')
        elif self.format == 'MMH':
            return pd.date_range(start_date, end_date, freq = 'MS')
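
###################################### USAGE EXAMPLE (SKETCH) ##############################################
# Minimal usage sketch. The data root, variable list and date range below are placeholders; the
# directory is assumed to contain 'locations_mask.nc', the hour-mask files and the pre-processed
# monthly .nc files that the methods above expect.
if __name__ == '__main__':
    reader = Meteo_Reader('/path/to/meteo_data', variables = ['SIS', 'SISDIR'], data_format = 'H')
    reader.create_regular_sample(downsampling_ratio = 16)  # keep every 4th grid point in lat and lon within CH
    reader.read_data('2015-01-01', '2015-03-31', hourmask = True)
    reader.split_sample(ratio = 0.8)                        # random 80/20 split of the sampled locations
    train_table = reader.make_table(indata = reader.get_subset(sample_name = 'sample_train'))
    print(train_table.head())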
