open_msdataset_source.pyx
Attached To: rCTRACKER ctracker3
"""
Class to represent MITgcm mds file storage format.
"""
# python 3 compatibility
from __future__ import print_function, division
from glob import glob
import os
import re
import numpy as np
import warnings
from io import StringIO
import inspect
import xarray as xr
import dask.array as da
# we keep the metadata in its own module to keep this one cleaner
from .variables import dimensions, \
    horizontal_coordinates_spherical, horizontal_coordinates_cartesian, \
    horizontal_coordinates_curvcart, \
    vertical_coordinates, horizontal_grid_variables, vertical_grid_variables, \
    volume_grid_variables, state_variables, aliases
# would it be better to import mitgcm_variables and then automate the search
# for variable dictionaries
from .utils import parse_meta_file, read_mds, parse_available_diagnostics
# should we hard code this?
LLC_NUM_FACES = 13
LLC_FACE_DIMNAME = 'face'


def open_mdsdataset(data_dir, grid_dir=None,
                    iters='all', prefix=None, read_grid=True,
                    delta_t=1, ref_date=None, calendar='gregorian',
                    geometry='sphericalpolar',
                    grid_vars_to_coords=True, swap_dims=None,
                    endian=">", chunks=None,
                    ignore_unknown_vars=False, default_dtype=None,
                    nx=None, ny=None, nz=None,
                    llc_method="smallchunks"):
"""Open MITgcm-style mds (.data / .meta) file output as xarray datset.
Parameters
----------
data_dir : string
Path to the directory where the mds .data and .meta files are stored
grid_dir : string, optional
Path to the directory where the mds .data and .meta files are stored, if
different from ``data_dir``.
iters : list, optional
The iterations numbers of the files to be read. If ``None``, no data
files will be read. If ``'all'`` (default), all iterations will be read.
prefix : list, optional
List of different filename prefixes to read. Default (``None``) is to
read all available files.
read_grid : bool, optional
Whether to read the grid data
delta_t : number, optional
The timestep used in the model. (Can't be inferred.)
ref_date : string, optional
An iSO date string corresponding to the zero timestep,
e.g. "1990-1-1 0:0:0" (See CF conventions [1]_)
calendar : string, optional
A calendar allowed by CF conventions [1]_
geometry : {'sphericalpolar', 'cartesian', 'llc', 'curvilinear'}
MITgcm grid geometry specifier
grid_vars_to_coords : boolean, optional
Whether to promote grid variables to coordinate status
swap_dims : boolean, optional
Whether to swap the logical dimensions for physical ones. If ``None``,
will be set to ``False`` for ``geometry==llc`` and ``True`` otherwise.
endian : {'=', '>', '<'}, optional
Endianness of variables. Default for MITgcm is ">" (big endian)
chunks : int or dict, optional
If chunks is provided, it used to load the new dataset into dask arrays.
ignore_unknown_vars : boolean, optional
Don't raise an error if unknown variables are encountered while reading
the dataset.
default_dtype : numpy.dtype, optional
A datatype to fall back on if the metadata can't be read.
nx, ny, nz : int, optional
The numerical dimensions of the model. These will be inferred from
``XC.meta`` and ``RC.meta`` if they are not specified. If
``geometry==llc``, ``ny`` does not have to specified.
llc_method : {"smallchunks", "bigchunks"}, optional
Which routine to use for reading LLC data. "smallchunks" splits the file
into a individual dask chunk of size (nx x nx) for each face of each
level (i.e. the total number of chunks is 13 * nz). "bigchunks" loads
the whole raw data file (either into memory or as a numpy.memmap),
splits it into faces, and concatenates those faces together using
``dask.array.concatenate``. The different methods will have different
memory and i/o performance depending on the details of the system
configuration.
Returns
-------
dset : xarray.Dataset
Dataset object containing all coordinates and variables.
References
----------
.. [1] http://cfconventions.org/Data/cf-conventions/cf-conventions-1.7/build/ch04s04.html
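
    Examples
    --------
    A minimal usage sketch; the run directory, file prefixes, and timestep
    below are hypothetical placeholders rather than values taken from this
    repository::

        >>> ds = open_mdsdataset('./run', prefix=['T', 'S'],
        ...                      delta_t=900, ref_date='1990-1-1 0:0:0')
        >>> ds = ds.chunk({'time': 1})  # optional dask re-chunking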
"""
# get frame info for history
frame = inspect.currentframe()
_, _, _, arg_values = inspect.getargvalues(frame)
del arg_values['frame']
function_name = inspect.getframeinfo(frame)[2]
# auto-detect whether to swap dims
if swap_dims is None:
if read_grid == False:
swap_dims = False
else:
swap_dims = False if geometry in ('llc', 'curvilinear') else True
# some checks for argument consistency
if swap_dims and not read_grid:
raise ValueError("If swap_dims==True, read_grid must be True.")
    # We either have a single iter, in which case we create a fresh store,
    # or a list of iters, in which case we combine.
    if iters == 'all':
        iters = _get_all_iternums(data_dir, file_prefixes=prefix)
    if iters is None:
        iternum = None
    else:
        try:
            iternum = int(iters)
        # if not, we probably have some kind of list
        except TypeError:
            if len(iters) == 1:
                iternum = int(iters[0])
            else:
                # We have to check to make sure we have the same prefixes at
                # each timestep...otherwise we can't combine the datasets.
                first_prefixes = prefix or _get_all_matching_prefixes(
                    data_dir, iters[0])
                for iternum in iters:
                    these_prefixes = _get_all_matching_prefixes(
                        data_dir, iternum, prefix
                    )
                    # don't care about order
                    if set(these_prefixes) != set(first_prefixes):
                        raise IOError("Could not find the expected file "
                                      "prefixes %s at iternum %g. (Instead "
                                      "found %s)" % (repr(first_prefixes),
                                                     iternum,
                                                     repr(these_prefixes)))
                # chunk at least by time
                chunks = chunks or {}

                # recursively open each dataset at a time
                kwargs = dict(
                    grid_dir=grid_dir, delta_t=delta_t, swap_dims=False,
                    prefix=prefix, ref_date=ref_date, calendar=calendar,
                    geometry=geometry,
                    grid_vars_to_coords=False,
                    endian=endian, chunks=chunks,
                    ignore_unknown_vars=ignore_unknown_vars,
                    default_dtype=default_dtype,
                    nx=nx, ny=ny, nz=nz, llc_method=llc_method)
                datasets = [open_mdsdataset(
                    data_dir, iters=iternum, read_grid=False, **kwargs)
                    for iternum in iters]
                # now add the grid
                if read_grid:
                    if 'iters' in kwargs:
                        kwargs.pop('iters')
                    if 'read_grid' in kwargs:
                        kwargs.pop('read_grid')
                    datasets.insert(0,
                        open_mdsdataset(data_dir, iters=None, read_grid=True,
                                        **kwargs))
                # combine the per-iteration datasets
                ds = xr.auto_combine(datasets)
                if swap_dims:
                    ds = _swap_dimensions(ds, geometry)
                if grid_vars_to_coords:
                    ds = _set_coords(ds)
                return ds
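
    # base case: a single iteration (or a grid-only read) is loaded directly
    # through one _MDSDataStore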
    store = _MDSDataStore(data_dir, grid_dir, iternum, delta_t, read_grid,
                          prefix, ref_date, calendar,
                          geometry, endian,
                          ignore_unknown_vars=ignore_unknown_vars,
                          default_dtype=default_dtype,
                          nx=nx, ny=ny, nz=nz, llc_method=llc_method)
    ds = xr.Dataset.load_store(store)

    if swap_dims:
        ds = _swap_dimensions(ds, geometry)
    if grid_vars_to_coords:
        ds = _set_coords(ds)
    if ref_date:
        ds = xr.decode_cf(ds)

    # do we need more fancy logic (like open_dataset), or is this enough?
    if chunks is not None:
        ds = ds.chunk(chunks)

    # set attributes for CF conventions
    ds.attrs['Conventions'] = "CF-1.6"
    ds.attrs['title'] = "netCDF wrapper of MITgcm MDS binary data"
    ds.attrs['source'] = "MITgcm"
    arg_string = ', '.join(['%s=%s' % (str(k), repr(v))
                            for (k, v) in arg_values.items()])
    ds.attrs['history'] = ('Created by calling '
                           '`%s(%s)`' % (function_name, arg_string))

    return ds