open_msdataset_source.py

"""
Class to represent MITgcm mds file storage format.
"""
# python 3 compatibility
from __future__ import print_function, division
from glob import glob
import os
import re
import numpy as np
import warnings
from io import StringIO
import inspect
import xarray as xr
import dask.array as da
# we keep the metadata in its own module to keep this one cleaner
from .variables import dimensions, \
    horizontal_coordinates_spherical, horizontal_coordinates_cartesian, \
    horizontal_coordinates_curvcart, \
    vertical_coordinates, horizontal_grid_variables, vertical_grid_variables, \
    volume_grid_variables, state_variables, aliases
# would it be better to import mitgcm_variables and then automate the search
# for variable dictionaries?
from .utils import parse_meta_file, read_mds, parse_available_diagnostics
# should we hard code this?
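# LLC = "lat-lon-cap": a global MITgcm grid layout made of 13 square faces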
LLC_NUM_FACES = 13
LLC_FACE_DIMNAME = 'face'
def open_mdsdataset(data_dir, grid_dir=None,
                    iters='all', prefix=None, read_grid=True,
                    delta_t=1, ref_date=None, calendar='gregorian',
                    geometry='sphericalpolar',
                    grid_vars_to_coords=True, swap_dims=None,
                    endian=">", chunks=None,
                    ignore_unknown_vars=False, default_dtype=None,
                    nx=None, ny=None, nz=None,
                    llc_method="smallchunks"):
"""Open MITgcm-style mds (.data / .meta) file output as xarray datset.
Parameters
----------
data_dir : string
Path to the directory where the mds .data and .meta files are stored
    grid_dir : string, optional
        Path to the directory where the grid mds .data and .meta files are
        stored, if different from ``data_dir``.
iters : list, optional
        The iteration numbers of the files to be read. If ``None``, no data
files will be read. If ``'all'`` (default), all iterations will be read.
prefix : list, optional
List of different filename prefixes to read. Default (``None``) is to
read all available files.
read_grid : bool, optional
Whether to read the grid data
delta_t : number, optional
The timestep used in the model. (Can't be inferred.)
ref_date : string, optional
        An ISO date string corresponding to the zero timestep,
        e.g. "1990-1-1 0:0:0" (see CF conventions [1]_).
calendar : string, optional
A calendar allowed by CF conventions [1]_
geometry : {'sphericalpolar', 'cartesian', 'llc', 'curvilinear'}
MITgcm grid geometry specifier
grid_vars_to_coords : boolean, optional
Whether to promote grid variables to coordinate status
swap_dims : boolean, optional
Whether to swap the logical dimensions for physical ones. If ``None``,
will be set to ``False`` for ``geometry==llc`` and ``True`` otherwise.
endian : {'=', '>', '<'}, optional
Endianness of variables. Default for MITgcm is ">" (big endian)
chunks : int or dict, optional
        If chunks is provided, it is used to load the new dataset into dask
        arrays.
ignore_unknown_vars : boolean, optional
Don't raise an error if unknown variables are encountered while reading
the dataset.
default_dtype : numpy.dtype, optional
A datatype to fall back on if the metadata can't be read.
nx, ny, nz : int, optional
The numerical dimensions of the model. These will be inferred from
``XC.meta`` and ``RC.meta`` if they are not specified. If
        ``geometry==llc``, ``ny`` does not have to be specified.
    llc_method : {"smallchunks", "bigchunks"}, optional
        Which routine to use for reading LLC data. "smallchunks" splits the
        file into an individual dask chunk of size (nx x nx) for each face of
        each level (i.e. the total number of chunks is 13 * nz). "bigchunks"
        loads the whole raw data file (either into memory or as a
        numpy.memmap), splits it into faces, and concatenates those faces
        together using ``dask.array.concatenate``. The two methods have
        different memory and I/O performance depending on the details of the
        system configuration.

    Returns
    -------
dset : xarray.Dataset
        Dataset object containing all coordinates and variables.

    References
    ----------
.. [1] http://cfconventions.org/Data/cf-conventions/cf-conventions-1.7/build/ch04s04.html
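
    Examples
    --------
    A minimal sketch of typical usage, assuming this module is importable as
    part of the ``xmitgcm`` package; the run directory, prefixes, and
    timestep below are illustrative, not from any real dataset.

    >>> from xmitgcm import open_mdsdataset
    >>> # lazily open every iteration of every available variable
    >>> ds = open_mdsdataset('./run')
    >>> # read selected prefixes and decode time against a reference date
    >>> ds = open_mdsdataset('./run', prefix=['T', 'S'], delta_t=900,
    ...                      ref_date='1990-1-1 0:0:0')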
"""
# get frame info for history
frame = inspect.currentframe()
_, _, _, arg_values = inspect.getargvalues(frame)
del arg_values['frame']
function_name = inspect.getframeinfo(frame)[2]
# auto-detect whether to swap dims
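    # (llc and curvilinear grids have no 1D physical coordinates to swap in,
    # so they keep their logical index dimensions by default)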
    if swap_dims is None:
        if not read_grid:
            swap_dims = False
        else:
            swap_dims = geometry not in ('llc', 'curvilinear')
# some checks for argument consistency
if swap_dims and not read_grid:
raise ValueError("If swap_dims==True, read_grid must be True.")
# We either have a single iter, in which case we create a fresh store,
# or a list of iters, in which case we combine.
if iters == 'all':
iters = _get_all_iternums(data_dir, file_prefixes=prefix)
if iters is None:
iternum = None
else:
try:
iternum = int(iters)
        # if not, we probably have some kind of list
except TypeError:
if len(iters) == 1:
iternum = int(iters[0])
else:
# We have to check to make sure we have the same prefixes at
# each timestep...otherwise we can't combine the datasets.
first_prefixes = prefix or _get_all_matching_prefixes(
data_dir, iters[0])
for iternum in iters:
these_prefixes = _get_all_matching_prefixes(
data_dir, iternum, prefix
)
# don't care about order
if set(these_prefixes) != set(first_prefixes):
raise IOError("Could not find the expected file "
"prefixes %s at iternum %g. (Instead "
"found %s)" % (repr(first_prefixes),
iternum,
repr(these_prefixes)))
# chunk at least by time
chunks = chunks or {}
                # recursively open one dataset per iteration
kwargs = dict(
grid_dir=grid_dir, delta_t=delta_t, swap_dims=False,
prefix=prefix, ref_date=ref_date, calendar=calendar,
geometry=geometry,
grid_vars_to_coords=False,
endian=endian, chunks=chunks,
ignore_unknown_vars=ignore_unknown_vars,
default_dtype=default_dtype,
nx=nx, ny=ny, nz=nz, llc_method=llc_method)
datasets = [open_mdsdataset(
data_dir, iters=iternum, read_grid=False, **kwargs)
for iternum in iters]
# now add the grid
if read_grid:
                    kwargs.pop('iters', None)
                    kwargs.pop('read_grid', None)
datasets.insert(0,
open_mdsdataset(data_dir, iters=None, read_grid=True,
**kwargs))
                # combine the per-iteration datasets into one
ds = xr.auto_combine(datasets)
if swap_dims:
ds = _swap_dimensions(ds, geometry)
if grid_vars_to_coords:
ds = _set_coords(ds)
return ds
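    # base case: a single iteration (or no data at all), read via one store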
store = _MDSDataStore(data_dir, grid_dir, iternum, delta_t, read_grid,
prefix, ref_date, calendar,
geometry, endian,
ignore_unknown_vars=ignore_unknown_vars,
default_dtype=default_dtype,
nx=nx, ny=ny, nz=nz, llc_method=llc_method)
ds = xr.Dataset.load_store(store)
if swap_dims:
ds = _swap_dimensions(ds, geometry)
if grid_vars_to_coords:
ds = _set_coords(ds)
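    # when a reference date is given, decode_cf converts the raw time values
    # into datetimes using the CF 'units' and 'calendar' attributes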
if ref_date:
ds = xr.decode_cf(ds)
    # do we need more fancy logic (like open_dataset), or is this enough?
if chunks is not None:
ds = ds.chunk(chunks)
# set attributes for CF conventions
ds.attrs['Conventions'] = "CF-1.6"
ds.attrs['title'] = "netCDF wrapper of MITgcm MDS binary data"
ds.attrs['source'] = "MITgcm"
arg_string = ', '.join(['%s=%s' % (str(k), repr(v))
for (k, v) in arg_values.items()])
    ds.attrs['history'] = ('Created by calling '
                           '`%s(%s)`' % (function_name, arg_string))
return ds
