Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F62207142
open_msdataset_source.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Sat, May 11, 15:24
Size
8 KB
Mime Type
text/x-python
Expires
Mon, May 13, 15:24 (2 d)
Engine
blob
Format
Raw Data
Handle
17619028
Attached To
rCTRACKER ctracker3
open_msdataset_source.py
View Options
"""
Class to represent MITgcm mds file storage format.
"""
# python 3 compatibility
from
__future__
import
print_function
,
division
from
glob
import
glob
import
os
import
re
import
numpy
as
np
import
warnings
from
io
import
StringIO
import
inspect
import
xarray
as
xr
import
dask.array
as
da
# we keep the metadata in its own module to keep this one cleaner
from
.variables
import
dimensions
,
\
horizontal_coordinates_spherical
,
horizontal_coordinates_cartesian
,
\
horizontal_coordinates_curvcart
,
\
vertical_coordinates
,
horizontal_grid_variables
,
vertical_grid_variables
,
\
volume_grid_variables
,
state_variables
,
aliases
# would it be better to import mitgcm_variables and then automate the search
# for variable dictionaries
from
.utils
import
parse_meta_file
,
read_mds
,
parse_available_diagnostics
# should we hard code this?
# Number of faces in an LLC grid; the open_mdsdataset docstring counts
# 13 * nz chunks for LLC data, i.e. 13 faces per vertical level.
LLC_NUM_FACES = 13
# presumably the name of the dimension that indexes LLC faces --
# TODO confirm against the code that uses it (not visible in this chunk)
LLC_FACE_DIMNAME = 'face'
def open_mdsdataset(data_dir, grid_dir=None,
                    iters='all', prefix=None, read_grid=True,
                    delta_t=1, ref_date=None, calendar='gregorian',
                    geometry='sphericalpolar',
                    grid_vars_to_coords=True, swap_dims=None,
                    endian=">", chunks=None,
                    ignore_unknown_vars=False, default_dtype=None,
                    nx=None, ny=None, nz=None,
                    llc_method="smallchunks"):
    """Open MITgcm-style mds (.data / .meta) file output as xarray dataset.

    Parameters
    ----------
    data_dir : string
        Path to the directory where the mds .data and .meta files are stored
    grid_dir : string, optional
        Path to the directory where the mds .data and .meta files are stored,
        if different from ``data_dir``.
    iters : list, optional
        The iterations numbers of the files to be read. If ``None``, no data
        files will be read. If ``'all'`` (default), all iterations will be
        read.
    prefix : list, optional
        List of different filename prefixes to read. Default (``None``) is to
        read all available files.
    read_grid : bool, optional
        Whether to read the grid data
    delta_t : number, optional
        The timestep used in the model. (Can't be inferred.)
    ref_date : string, optional
        An ISO date string corresponding to the zero timestep,
        e.g. "1990-1-1 0:0:0" (See CF conventions [1]_)
    calendar : string, optional
        A calendar allowed by CF conventions [1]_
    geometry : {'sphericalpolar', 'cartesian', 'llc', 'curvilinear'}
        MITgcm grid geometry specifier
    grid_vars_to_coords : boolean, optional
        Whether to promote grid variables to coordinate status
    swap_dims : boolean, optional
        Whether to swap the logical dimensions for physical ones. If ``None``,
        will be set to ``False`` when the grid is not read or when
        ``geometry`` is ``'llc'`` or ``'curvilinear'``, and ``True``
        otherwise.
    endian : {'=', '>', '<'}, optional
        Endianness of variables. Default for MITgcm is ">" (big endian)
    chunks : int or dict, optional
        If chunks is provided, it is used to load the new dataset into dask
        arrays.
    ignore_unknown_vars : boolean, optional
        Don't raise an error if unknown variables are encountered while
        reading the dataset.
    default_dtype : numpy.dtype, optional
        A datatype to fall back on if the metadata can't be read.
    nx, ny, nz : int, optional
        The numerical dimensions of the model. These will be inferred from
        ``XC.meta`` and ``RC.meta`` if they are not specified. If
        ``geometry==llc``, ``ny`` does not have to be specified.
    llc_method : {"smallchunks", "bigchunks"}, optional
        Which routine to use for reading LLC data. "smallchunks" splits the
        file into a individual dask chunk of size (nx x nx) for each face of
        each level (i.e. the total number of chunks is 13 * nz). "bigchunks"
        loads the whole raw data file (either into memory or as a
        numpy.memmap), splits it into faces, and concatenates those faces
        together using ``dask.array.concatenate``. The different methods will
        have different memory and i/o performance depending on the details of
        the system configuration.

    Returns
    -------
    dset : xarray.Dataset
        Dataset object containing all coordinates and variables.

    Raises
    ------
    ValueError
        If ``swap_dims`` is true while ``read_grid`` is false.
    IOError
        If several iterations are requested and the file prefixes found for
        one of them differ from those of the first iteration.

    References
    ----------
    .. [1] http://cfconventions.org/Data/cf-conventions/cf-conventions-1.7/build/ch04s04.html
    """

    # Record the call for the ``history`` attribute. Only the declared
    # arguments are copied into a plain dict, taken before any of them is
    # reassigned below. The frame's locals mapping itself is left untouched:
    # deleting keys from it is not portable (it is a write-through proxy on
    # recent Python versions, see PEP 667).
    frame = inspect.currentframe()
    arg_names, _, _, frame_locals = inspect.getargvalues(frame)
    arg_values = dict((name, frame_locals[name]) for name in arg_names)
    function_name = frame.f_code.co_name
    # drop the frame references so they do not keep the locals alive
    del frame, frame_locals

    # auto-detect whether to swap dims
    if swap_dims is None:
        swap_dims = (bool(read_grid)
                     and geometry not in ('llc', 'curvilinear'))

    # some checks for argument consistency
    if swap_dims and not read_grid:
        raise ValueError("If swap_dims==True, read_grid must be True.")

    # We either have a single iter, in which case we create a fresh store,
    # or a list of iters, in which case we combine.
    # The isinstance guard keeps array-like ``iters`` from being compared
    # elementwise against the string, which has no unambiguous truth value.
    if isinstance(iters, str) and iters == 'all':
        iters = _get_all_iternums(data_dir, file_prefixes=prefix)

    if iters is None:
        iternum = None
    else:
        try:
            iternum = int(iters)
        # if not we probably have some kind of list
        except TypeError:
            if len(iters) == 1:
                iternum = int(iters[0])
            else:
                # We have to check to make sure we have the same prefixes at
                # each timestep...otherwise we can't combine the datasets.
                first_prefixes = prefix or _get_all_matching_prefixes(
                    data_dir, iters[0])
                for this_iter in iters:
                    these_prefixes = _get_all_matching_prefixes(
                        data_dir, this_iter, prefix)
                    # don't care about order
                    if set(these_prefixes) != set(first_prefixes):
                        raise IOError("Could not find the expected file "
                                      "prefixes %s at iternum %g. (Instead "
                                      "found %s)" %
                                      (repr(first_prefixes), this_iter,
                                       repr(these_prefixes)))

                # chunk at least by time
                chunks = chunks or {}

                # Recursively open one dataset per iteration. ``iters`` and
                # ``read_grid`` are deliberately left out of ``kwargs``
                # because they are passed explicitly in each call below.
                kwargs = dict(
                    grid_dir=grid_dir, delta_t=delta_t, swap_dims=False,
                    prefix=prefix, ref_date=ref_date, calendar=calendar,
                    geometry=geometry,
                    grid_vars_to_coords=False,
                    endian=endian, chunks=chunks,
                    ignore_unknown_vars=ignore_unknown_vars,
                    default_dtype=default_dtype,
                    nx=nx, ny=ny, nz=nz, llc_method=llc_method)
                datasets = [
                    open_mdsdataset(data_dir, iters=this_iter,
                                    read_grid=False, **kwargs)
                    for this_iter in iters]

                # now add the grid, opened once and placed first
                if read_grid:
                    datasets.insert(0, open_mdsdataset(
                        data_dir, iters=None, read_grid=True, **kwargs))

                # combine the per-iteration datasets (and the grid)
                # NOTE(review): ``xr.auto_combine`` is deprecated/removed in
                # newer xarray releases -- confirm the supported xarray
                # version before upgrading.
                ds = xr.auto_combine(datasets)
                if swap_dims:
                    ds = _swap_dimensions(ds, geometry)
                if grid_vars_to_coords:
                    ds = _set_coords(ds)
                return ds

    # single iteration (or grid only): read through a fresh data store
    store = _MDSDataStore(data_dir, grid_dir, iternum, delta_t, read_grid,
                          prefix, ref_date, calendar,
                          geometry, endian,
                          ignore_unknown_vars=ignore_unknown_vars,
                          default_dtype=default_dtype,
                          nx=nx, ny=ny, nz=nz, llc_method=llc_method)
    ds = xr.Dataset.load_store(store)

    if swap_dims:
        ds = _swap_dimensions(ds, geometry)

    if grid_vars_to_coords:
        ds = _set_coords(ds)

    # the CF decoding step is only run when a reference date was given
    if ref_date:
        ds = xr.decode_cf(ds)

    # do we need more fancy logic (like open_dataset), or is this enough
    if chunks is not None:
        ds = ds.chunk(chunks)

    # set attributes for CF conventions
    ds.attrs['Conventions'] = "CF-1.6"
    ds.attrs['title'] = "netCDF wrapper of MITgcm MDS binary data"
    ds.attrs['source'] = "MITgcm"
    arg_string = ', '.join(['%s=%s' % (str(k), repr(v))
                            for (k, v) in arg_values.items()])
    ds.attrs['history'] = ('Created by calling '
                           '`%s(%s)`' % (function_name, arg_string))

    return ds
Event Timeline
Log In to Comment