Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F71330910
open_msdataset_source.pyx
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Thu, Jul 11, 03:16
Size
8 KB
Mime Type
text/x-python
Expires
Sat, Jul 13, 03:16 (2 d)
Engine
blob
Format
Raw Data
Handle
18933143
Attached To
rCTRACKER ctracker3
open_msdataset_source.pyx
View Options
"""
Class to represent MITgcm mds file storage format.
"""
# python 3 compatibility
from
__future__
import
print_function
,
division
from
glob
import
glob
import
os
import
re
import
numpy
as
np
import
warnings
from
io
import
StringIO
import
inspect
import
xarray
as
xr
import
dask.array
as
da
# we keep the metadata in its own module to keep this one cleaner
from
.variables
import
dimensions
,
\
horizontal_coordinates_spherical
,
horizontal_coordinates_cartesian
,
\
horizontal_coordinates_curvcart
,
\
vertical_coordinates
,
horizontal_grid_variables
,
vertical_grid_variables
,
\
volume_grid_variables
,
state_variables
,
aliases
# would it be better to import mitgcm_variables and then automate the search
# for variable dictionaries
from
.utils
import
parse_meta_file
,
read_mds
,
parse_available_diagnostics
# should we hard code this?
# Face count used for LLC geometry; this is the same 13 that appears in the
# "13 * nz" chunk count quoted in the ``open_mdsdataset`` docstring.
LLC_NUM_FACES = 13
# NOTE(review): presumably the name of the dataset dimension that indexes the
# LLC faces -- confirm against the code that consumes it.
LLC_FACE_DIMNAME = 'face'
def open_mdsdataset(data_dir, grid_dir=None,
                    iters='all', prefix=None, read_grid=True,
                    delta_t=1, ref_date=None, calendar='gregorian',
                    geometry='sphericalpolar',
                    grid_vars_to_coords=True, swap_dims=None,
                    endian=">", chunks=None,
                    ignore_unknown_vars=False, default_dtype=None,
                    nx=None, ny=None, nz=None,
                    llc_method="smallchunks"):
    """Open MITgcm-style mds (.data / .meta) file output as xarray dataset.

    Parameters
    ----------
    data_dir : string
        Path to the directory where the mds .data and .meta files are stored
    grid_dir : string, optional
        Path to the directory where the mds .data and .meta files are stored, if
        different from ``data_dir``.
    iters : int or list or 'all', optional
        The iteration numbers of the files to be read. A single number (or a
        one-element list) opens one iteration; a longer list opens each
        iteration separately and combines the results. If ``None``, no data
        files will be read. If ``'all'`` (default), all iterations will be
        read.
    prefix : list, optional
        List of different filename prefixes to read. Default (``None``) is to
        read all available files.
    read_grid : bool, optional
        Whether to read the grid data
    delta_t : number, optional
        The timestep used in the model. (Can't be inferred.)
    ref_date : string, optional
        An ISO date string corresponding to the zero timestep,
        e.g. "1990-1-1 0:0:0" (See CF conventions [1]_)
    calendar : string, optional
        A calendar allowed by CF conventions [1]_
    geometry : {'sphericalpolar', 'cartesian', 'llc', 'curvilinear'}
        MITgcm grid geometry specifier
    grid_vars_to_coords : boolean, optional
        Whether to promote grid variables to coordinate status
    swap_dims : boolean, optional
        Whether to swap the logical dimensions for physical ones. If ``None``,
        will be set to ``False`` when ``read_grid`` is false or when
        ``geometry`` is ``'llc'`` or ``'curvilinear'``, and ``True`` otherwise.
    endian : {'=', '>', '<'}, optional
        Endianness of variables. Default for MITgcm is ">" (big endian)
    chunks : int or dict, optional
        If chunks is provided, it is used to load the new dataset into dask
        arrays.
    ignore_unknown_vars : boolean, optional
        Don't raise an error if unknown variables are encountered while reading
        the dataset.
    default_dtype : numpy.dtype, optional
        A datatype to fall back on if the metadata can't be read.
    nx, ny, nz : int, optional
        The numerical dimensions of the model. These will be inferred from
        ``XC.meta`` and ``RC.meta`` if they are not specified. If
        ``geometry==llc``, ``ny`` does not have to be specified.
    llc_method : {"smallchunks", "bigchunks"}, optional
        Which routine to use for reading LLC data. "smallchunks" splits the file
        into an individual dask chunk of size (nx x nx) for each face of each
        level (i.e. the total number of chunks is 13 * nz). "bigchunks" loads
        the whole raw data file (either into memory or as a numpy.memmap),
        splits it into faces, and concatenates those faces together using
        ``dask.array.concatenate``. The different methods will have different
        memory and i/o performance depending on the details of the system
        configuration.

    Returns
    -------
    dset : xarray.Dataset
        Dataset object containing all coordinates and variables.

    Raises
    ------
    ValueError
        If ``swap_dims`` is true while ``read_grid`` is false.
    IOError
        If several iterations are requested and the file prefixes found for
        one of them differ from those of the first iteration.

    References
    ----------
    .. [1] http://cfconventions.org/Data/cf-conventions/cf-conventions-1.7/build/ch04s04.html
    """
    # Capture the call arguments for the ``history`` attribute. They are
    # copied into a plain dict so nothing below mutates the frame's own locals
    # mapping (a write-through proxy as of Python 3.13), and the frame
    # reference is dropped right away to avoid a frame <-> locals cycle.
    frame = inspect.currentframe()
    try:
        arg_info = inspect.getargvalues(frame)
        arg_values = dict((name, arg_info.locals[name])
                          for name in arg_info.args)
        function_name = inspect.getframeinfo(frame)[2]
    finally:
        del frame

    # auto-detect whether to swap dims: never without a grid, and not for the
    # geometries listed below
    if swap_dims is None:
        swap_dims = (bool(read_grid)
                     and geometry not in ('llc', 'curvilinear'))

    # some checks for argument consistency
    if swap_dims and not read_grid:
        raise ValueError("If swap_dims==True, read_grid must be True.")

    # We either have a single iter, in which case we create a fresh store,
    # or a list of iters, in which case we combine.
    # The scalar check keeps an array-valued ``iters`` from being compared
    # elementwise against the string.
    if np.ndim(iters) == 0 and iters == 'all':
        iters = _get_all_iternums(data_dir, file_prefixes=prefix)

    if iters is None:
        iternum = None
    else:
        try:
            iternum = int(iters)
        # if not we probably have some kind of list
        except TypeError:
            if len(iters) == 1:
                iternum = int(iters[0])
            else:
                # We have to check to make sure we have the same prefixes at
                # each timestep...otherwise we can't combine the datasets.
                first_prefixes = prefix or _get_all_matching_prefixes(
                    data_dir, iters[0])
                for iternum in iters:
                    these_prefixes = _get_all_matching_prefixes(
                        data_dir, iternum, prefix)
                    # don't care about order
                    if set(these_prefixes) != set(first_prefixes):
                        # %s rather than %g: %g rounds large iteration
                        # numbers to scientific notation.
                        raise IOError("Could not find the expected file "
                                      "prefixes %s at iternum %s. (Instead "
                                      "found %s)" % (repr(first_prefixes),
                                                     iternum,
                                                     repr(these_prefixes)))

                # chunk at least by time
                chunks = chunks or {}

                # recursively open each dataset at a time; dimension swapping
                # and coordinate promotion are deferred to the combined result
                kwargs = dict(
                    grid_dir=grid_dir, delta_t=delta_t, swap_dims=False,
                    prefix=prefix, ref_date=ref_date, calendar=calendar,
                    geometry=geometry,
                    grid_vars_to_coords=False,
                    endian=endian, chunks=chunks,
                    ignore_unknown_vars=ignore_unknown_vars,
                    default_dtype=default_dtype,
                    nx=nx, ny=ny, nz=nz, llc_method=llc_method)
                datasets = [
                    open_mdsdataset(data_dir, iters=iternum, read_grid=False,
                                    **kwargs)
                    for iternum in iters]

                # now add the grid; ``kwargs`` never holds 'iters' or
                # 'read_grid', so both can be passed explicitly
                if read_grid:
                    datasets.insert(0, open_mdsdataset(
                        data_dir, iters=None, read_grid=True, **kwargs))

                # NOTE(review): xr.auto_combine is deprecated in newer xarray
                # releases -- confirm against the pinned xarray version.
                ds = xr.auto_combine(datasets)
                if swap_dims:
                    ds = _swap_dimensions(ds, geometry)
                if grid_vars_to_coords:
                    ds = _set_coords(ds)
                # the combined dataset is returned as-is; the CF/history
                # attributes below are only set on the single-store path
                return ds

    store = _MDSDataStore(data_dir, grid_dir, iternum, delta_t, read_grid,
                          prefix, ref_date, calendar,
                          geometry, endian,
                          ignore_unknown_vars=ignore_unknown_vars,
                          default_dtype=default_dtype,
                          nx=nx, ny=ny, nz=nz, llc_method=llc_method)
    ds = xr.Dataset.load_store(store)
    if swap_dims:
        ds = _swap_dimensions(ds, geometry)
    if grid_vars_to_coords:
        ds = _set_coords(ds)

    if ref_date:
        ds = xr.decode_cf(ds)

    # do we need more fancy logic (like open_dataset), or is this enough
    if chunks is not None:
        ds = ds.chunk(chunks)

    # set attributes for CF conventions
    ds.attrs['Conventions'] = "CF-1.6"
    ds.attrs['title'] = "netCDF wrapper of MITgcm MDS binary data"
    ds.attrs['source'] = "MITgcm"
    arg_string = ', '.join(['%s=%s' % (str(k), repr(v))
                            for (k, v) in arg_values.items()])
    ds.attrs['history'] = ('Created by calling '
                           '`%s(%s)`' % (function_name, arg_string))

    return ds
Event Timeline
Log In to Comment