diff --git a/.gitignore b/.gitignore index c37948f226d..7fbb64c296a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,100 +1,14 @@ +# Python +__pycache__/ *.py[cod] -__pycache__ -.env -.venv -# example caches from Hypothesis -.hypothesis/ +# Virtual environments +.venv/ +venv/ -# temp files from docs build -doc/*.nc -doc/auto_gallery -doc/rasm.zarr +# Sphinx build output +doc/_build/ -# C extensions -*.so - -# Packages -*.egg -*.egg-info -.eggs -dist -build -eggs -parts -bin -var -sdist -develop-eggs -.installed.cfg -lib -lib64 - -# Installer logs -pip-log.txt - -# Unit test / coverage reports -.coverage -.coverage.* -.tox -nosetests.xml -.cache -.prettier_cache -.dmypy.json -.mypy_cache -.ropeproject/ -.tags* -.testmon* -.tmontmp/ -.pytest_cache -dask-worker-space/ - -# asv environments -asv_bench/.asv -asv_bench/pkgs - -# Translations -*.mo - -# Mr Developer -.mr.developer.cfg -.project -.pydevproject - -# IDEs -.idea -*.swp +# OS files .DS_Store -.vscode/ - -# xarray specific -doc/_build -doc/generated/ -doc/api/generated/ -xarray/tests/data/*.grib.*.idx - -# Claude Code -.claude/ - -# Sync tools -Icon* - -.ipynb_checkpoints -doc/team-panel.txt -doc/external-examples-gallery.txt -doc/notebooks-examples-gallery.txt -doc/videos-gallery.txt -doc/*.zarr -doc/*.nc -doc/*.h5 - -# Until we support this properly, excluding from gitignore. (adding it to -# gitignore to make it _easier_ to work with `uv`, not as an indication that I -# think we shouldn't...) -uv.lock -mypy_report/ -xarray-docs/ - -# pixi environments -.pixi -pixi.lock +Thumbs.db diff --git a/doc/user-guide/io-netcdf.rst b/doc/user-guide/io-netcdf.rst new file mode 100644 index 00000000000..e69de29bb2d diff --git a/doc/user-guide/io-other.rst b/doc/user-guide/io-other.rst new file mode 100644 index 00000000000..e69de29bb2d diff --git a/doc/user-guide/io-zarr.rst b/doc/user-guide/io-zarr.rst new file mode 100644 index 00000000000..e69de29bb2d diff --git a/doc/user-guide/io.rst b/doc/user-guide/io.rst index 57be09deb0a..46e353ba2f0 100644 --- a/doc/user-guide/io.rst +++ b/doc/user-guide/io.rst @@ -1,1733 +1,9 @@ -.. currentmodule:: xarray -.. _io: - -Reading and writing files +Reading and Writing Files ========================= -Xarray supports direct serialization and IO to several file formats, from -simple :ref:`io.pickle` files to the more flexible :ref:`io.netcdf` -format (recommended). - -.. jupyter-execute:: - :hide-code: - - import os - - import iris - import ncdata.iris_xarray - import numpy as np - import pandas as pd - import xarray as xr - - np.random.seed(123456) - -You can read different types of files in ``xr.open_dataset`` by specifying the engine to be used: - -.. code:: python - - xr.open_dataset("example.nc", engine="netcdf4") - -The "engine" provides a set of instructions that tells xarray how -to read the data and pack them into a ``Dataset`` (or ``Dataarray``). -These instructions are stored in an underlying "backend". - -Xarray comes with several backends that cover many common data formats. -Many more backends are available via external libraries, or you can `write your own `_. -This diagram aims to help you determine - based on the format of the file you'd like to read - -which type of backend you're using and how to use it. - -Text and boxes are clickable for more information. -Following the diagram is detailed information on many popular backends. -You can learn more about using and developing backends in the -`Xarray tutorial JupyterBook `_. - -.. 
- _comment: mermaid Flowcharg "link" text gets secondary color background, SVG icon fill gets primary color - -.. raw:: html - - - -.. mermaid:: - :config: {"theme":"base","themeVariables":{"fontSize":"20px","primaryColor":"#fff","primaryTextColor":"#fff","primaryBorderColor":"#59c7d6","lineColor":"#e28126","secondaryColor":"#767985"}} - :alt: Flowchart illustrating how to choose the right backend engine to read your data - - flowchart LR - built-in-eng["`**Is your data stored in one of these formats?** - - netCDF4 - - netCDF3 - - Zarr - - DODS/OPeNDAP - - HDF5 - `"] - - built-in("`**You're in luck!** Xarray bundles a backend to automatically read these formats. - Open data using xr.open_dataset(). We recommend - explicitly setting engine='xxxx' for faster loading.`") - - installed-eng["""One of these formats? - - GRIB - - TileDB - - GeoTIFF, JPEG-2000, etc. (via GDAL) - - Sentinel-1 SAFE - """] - - installed("""Install the linked backend library and use it with - xr.open_dataset(file, engine='xxxx').""") - - other["`**Options:** - - Look around to see if someone has created an Xarray backend for your format! - - Create your own backend - - Convert your data to a supported format - `"] - - built-in-eng -->|Yes| built-in - built-in-eng -->|No| installed-eng - - installed-eng -->|Yes| installed - installed-eng -->|No| other - - click built-in-eng "https://docs.xarray.dev/en/stable/get-help/faq.html#how-do-i-open-format-x-file-as-an-xarray-dataset" - - - classDef quesNodefmt font-size:12pt,fill:#0e4666,stroke:#59c7d6,stroke-width:3 - class built-in-eng,installed-eng quesNodefmt - - classDef ansNodefmt font-size:12pt,fill:#4a4a4a,stroke:#17afb4,stroke-width:3 - class built-in,installed,other ansNodefmt - - linkStyle default font-size:18pt,stroke-width:4 - - -.. _io.backend_resolution: - -Backend Selection ------------------ - -When opening a file or URL without explicitly specifying the ``engine`` parameter, -xarray automatically selects an appropriate backend based on the file path or URL. -The backends are tried in order: **netcdf4 → h5netcdf → scipy → pydap → zarr**. - -.. note:: - You can customize the order in which netCDF backends are tried using the - ``netcdf_engine_order`` option in :py:func:`~xarray.set_options`: - - .. code-block:: python - - # Prefer h5netcdf over netcdf4 - xr.set_options(netcdf_engine_order=["h5netcdf", "netcdf4", "scipy"]) - - See :ref:`options` for more details on configuration options. - -The following tables show which backend will be selected for different types of URLs and files. - -.. important:: - ✅ means the backend will **guess it can open** the URL or file based on its path, extension, - or magic number, but this doesn't guarantee success. For example, not all Zarr stores are - xarray-compatible. - - ❌ means the backend will not attempt to open it. - -Remote URL Resolution -~~~~~~~~~~~~~~~~~~~~~ - -.. 
list-table:: - :header-rows: 1 - :widths: 50 10 10 10 10 10 - - * - URL - - :ref:`netcdf4 ` - - :ref:`h5netcdf ` - - :ref:`scipy ` - - :ref:`pydap ` - - :ref:`zarr ` - * - ``https://example.com/store.zarr`` - - ❌ - - ❌ - - ❌ - - ❌ - - ✅ - * - ``https://example.com/data.nc`` - - ✅ - - ✅ - - ❌ - - ❌ - - ❌ - * - ``http://example.com/data.nc?var=temp`` - - ✅ - - ❌ - - ❌ - - ❌ - - ❌ - * - ``http://example.com/dap4/data.nc?var=x`` - - ✅ - - ❌ - - ❌ - - ✅ - - ❌ - * - ``dap2://opendap.nasa.gov/dataset`` - - ❌ - - ❌ - - ❌ - - ✅ - - ❌ - * - ``https://example.com/DAP4/data`` - - ❌ - - ❌ - - ❌ - - ✅ - - ❌ - * - ``http://test.opendap.org/dap4/file.nc4`` - - ✅ - - ✅ - - ❌ - - ✅ - - ❌ - * - ``https://example.com/DAP4/data.nc`` - - ✅ - - ✅ - - ❌ - - ✅ - - ❌ - -Local File Resolution -~~~~~~~~~~~~~~~~~~~~~ - -For local files, backends first try to read the file's **magic number** (first few bytes). -If the magic number **cannot be read** (e.g., file doesn't exist, no permissions), they fall -back to checking the file **extension**. If the magic number is readable but invalid, the -backend returns False (does not fall back to extension). - -.. list-table:: - :header-rows: 1 - :widths: 40 20 10 10 10 10 - - * - File Path - - Magic Number - - :ref:`netcdf4 ` - - :ref:`h5netcdf ` - - :ref:`scipy ` - - :ref:`zarr ` - * - ``/path/to/file.nc`` - - ``CDF\x01`` (netCDF3) - - ✅ - - ❌ - - ✅ - - ❌ - * - ``/path/to/file.nc4`` - - ``\x89HDF\r\n\x1a\n`` (HDF5/netCDF4) - - ✅ - - ✅ - - ❌ - - ❌ - * - ``/path/to/file.nc.gz`` - - ``\x1f\x8b`` + ``CDF`` inside - - ❌ - - ❌ - - ✅ - - ❌ - * - ``/path/to/store.zarr/`` - - (directory) - - ❌ - - ❌ - - ❌ - - ✅ - * - ``/path/to/file.nc`` - - *(no magic number)* - - ✅ - - ✅ - - ✅ - - ❌ - * - ``/path/to/file.xyz`` - - ``CDF\x01`` (netCDF3) - - ✅ - - ❌ - - ✅ - - ❌ - * - ``/path/to/file.xyz`` - - ``\x89HDF\r\n\x1a\n`` (HDF5/netCDF4) - - ✅ - - ✅ - - ❌ - - ❌ - * - ``/path/to/file.xyz`` - - *(no magic number)* - - ❌ - - ❌ - - ❌ - - ❌ - -.. note:: - Remote URLs ending in ``.nc`` are **ambiguous**: - - - They could be netCDF files stored on a remote HTTP server (readable by ``netcdf4`` or ``h5netcdf``) - - They could be OPeNDAP/DAP endpoints (readable by ``netcdf4`` with DAP support or ``pydap``) - - These interpretations are fundamentally incompatible. If xarray's automatic - selection chooses the wrong backend, you must explicitly specify the ``engine`` parameter: - - .. code-block:: python - - # Force interpretation as a DAP endpoint - ds = xr.open_dataset("http://example.com/data.nc", engine="pydap") - - # Force interpretation as a remote netCDF file - ds = xr.open_dataset("https://example.com/data.nc", engine="netcdf4") - - -.. _io.netcdf: - -netCDF ------- - -The recommended way to store xarray data structures is `netCDF`__, which -is a binary file format for self-described datasets that originated -in the geosciences. Xarray is based on the netCDF data model, so netCDF files -on disk directly correspond to :py:class:`Dataset` objects (more accurately, -a group in a netCDF file directly corresponds to a :py:class:`Dataset` object. -See :ref:`io.netcdf_groups` for more.) - -NetCDF is supported on almost all platforms, and parsers exist -for the vast majority of scientific programming languages. Recent versions of -netCDF are based on the even more widely used HDF5 file-format. - -__ https://www.unidata.ucar.edu/software/netcdf/ - -.. tip:: - - If you aren't familiar with this data format, the `netCDF FAQ`_ is a good - place to start. - -.. 
_netCDF FAQ: https://www.unidata.ucar.edu/software/netcdf/docs/faq.html#What-Is-netCDF - -Reading and writing netCDF files with xarray requires scipy, h5netcdf, or the -`netCDF4-Python`__ library to be installed. SciPy only supports reading and writing -of netCDF V3 files. - -__ https://github.com/Unidata/netcdf4-python - -We can save a Dataset to disk using the -:py:meth:`Dataset.to_netcdf` method: - -.. jupyter-execute:: - - nc_filename = "saved_on_disk.nc" - -.. jupyter-execute:: - :hide-code: - - # Ensure the file is located in a unique temporary directory - # so that it doesn't conflict with parallel builds of the - # documentation. - - import tempfile - import os.path - - tempdir = tempfile.TemporaryDirectory() - nc_filename = os.path.join(tempdir.name, nc_filename) - -.. jupyter-execute:: - - ds = xr.Dataset( - {"foo": (("x", "y"), np.random.rand(4, 5))}, - coords={ - "x": [10, 20, 30, 40], - "y": pd.date_range("2000-01-01", periods=5), - "z": ("x", list("abcd")), - }, - ) - - ds.to_netcdf(nc_filename) - -By default, the file is saved as netCDF4 (assuming netCDF4-Python is -installed). You can control the format and engine used to write the file with -the ``format`` and ``engine`` arguments. - -.. tip:: - - Using the `h5netcdf `_ package - by passing ``engine='h5netcdf'`` to :py:meth:`open_dataset` can - sometimes be quicker than the default ``engine='netcdf4'`` that uses the - `netCDF4 `_ package. - - -We can load netCDF files to create a new Dataset using -:py:func:`open_dataset`: - -.. jupyter-execute:: - - ds_disk = xr.open_dataset(nc_filename) - ds_disk - -.. jupyter-execute:: - :hide-code: - - # Close "saved_on_disk.nc", but retain the file until after closing or deleting other - # datasets that will refer to it. - ds_disk.close() - -Similarly, a DataArray can be saved to disk using the -:py:meth:`DataArray.to_netcdf` method, and loaded -from disk using the :py:func:`open_dataarray` function. As netCDF files -correspond to :py:class:`Dataset` objects, these functions internally -convert the ``DataArray`` to a ``Dataset`` before saving, and then convert back -when loading, ensuring that the ``DataArray`` that is loaded is always exactly -the same as the one that was saved. - -A dataset can also be loaded or written to a specific group within a netCDF -file. To load from a group, pass a ``group`` keyword argument to the -``open_dataset`` function. The group can be specified as a path-like -string, e.g., to access subgroup 'bar' within group 'foo' pass -'/foo/bar' as the ``group`` argument. When writing multiple groups in one file, -pass ``mode='a'`` to ``to_netcdf`` to ensure that each call does not delete the -file. - -.. tip:: - - It is recommended to use :py:class:`~xarray.DataTree` to represent - hierarchical data, and to use the :py:meth:`xarray.DataTree.to_netcdf` method - when writing hierarchical data to a netCDF file. - -Data is *always* loaded lazily from netCDF files. You can manipulate, slice and subset -Dataset and DataArray objects, and no array values are loaded into memory until -you try to perform some sort of actual computation. For an example of how these -lazy arrays work, see the OPeNDAP section below. - -There may be minor differences in the :py:class:`Dataset` object returned -when reading a NetCDF file with different engines. - -It is important to note that when you modify values of a Dataset, even one -linked to files on disk, only the in-memory copy you are manipulating in xarray -is modified: the original file on disk is never touched. - -.. 
tip::
-
-    Xarray's lazy loading of remote or on-disk datasets is often but not always
-    desirable. Before performing computationally intense operations, it is
-    often a good idea to load a Dataset (or DataArray) entirely into memory by
-    invoking the :py:meth:`Dataset.load` method.
-
-Datasets have a :py:meth:`Dataset.close` method to close the associated
-netCDF file. However, it's often cleaner to use a ``with`` statement:
-
-.. jupyter-execute::
-
-    # this automatically closes the dataset after use
-    with xr.open_dataset(nc_filename) as ds:
-        print(ds.keys())
-
-Although xarray provides reasonable support for incremental reads of files on
-disk, it does not support incremental writes, which can be a useful strategy
-for dealing with datasets too big to fit into memory. Instead, xarray integrates
-with dask.array (see :ref:`dask`), which provides a fully featured engine for
-streaming computation.
-
-It is possible to append to or overwrite netCDF variables using the ``mode='a'``
-argument. When using this option, all variables in the dataset will be written
-to the original netCDF file, regardless of whether they exist in the original
-dataset.
-
-
-.. _io.netcdf_groups:
-
-Groups
-~~~~~~
-
-Whilst netCDF groups can only be loaded individually as ``Dataset`` objects, a
-whole file of many nested groups can be loaded as a single
-:py:class:`xarray.DataTree` object. To open a whole netCDF file as a tree of groups
-use the :py:func:`xarray.open_datatree` function. To save a DataTree object as a
-netCDF file containing many groups, use the :py:meth:`xarray.DataTree.to_netcdf` method.
-
-
-.. _netcdf.root_group.note:
-
-.. note::
-    Due to file format specifications the on-disk root group name is always ``"/"``,
-    overriding any given ``DataTree`` root node name.
-
-.. _netcdf.group.warning:
-
-.. warning::
-    ``DataTree`` objects do not follow the exact same data model as netCDF
-    files, which means that perfect round-tripping is not always possible.
-
-    In particular, in the netCDF data model dimensions are entities that can
-    exist regardless of whether any variable possesses them. This is in contrast
-    to `xarray's data model `_
-    (and hence :ref:`DataTree's data model `) in which the
-    dimensions of a (Dataset/Tree) object are simply the set of dimensions
-    present across all variables in that dataset.
-
-    This means that if a netCDF file contains dimensions but no variables which
-    possess those dimensions, these dimensions will not be present when that
-    file is opened as a DataTree object.
-    Saving this DataTree object to file will therefore not preserve these
-    "unused" dimensions.
-
-.. _io.encoding:
-
-Reading encoded data
-~~~~~~~~~~~~~~~~~~~~
-
-NetCDF files follow some conventions for encoding datetime arrays (as numbers
-with a "units" attribute) and for packing and unpacking data (as
-described by the "scale_factor" and "add_offset" attributes). If the argument
-``decode_cf=True`` (default) is given to :py:func:`open_dataset`, xarray will attempt
-to automatically decode the values in the netCDF objects according to
-`CF conventions`_. Sometimes this will fail, for example, if a variable
-has an invalid "units" or "calendar" attribute. For these cases, you can
-turn this decoding off manually.
-
-.. _CF conventions: https://cfconventions.org/
-
-You can view this encoding information (among others) in the
-:py:attr:`DataArray.encoding` and
-:py:attr:`Dataset.encoding` attributes:
-
-.. jupyter-execute::
-
-    ds_disk["y"].encoding
-
-.. 
jupyter-execute:: - - ds_disk.encoding - -Note that all operations that manipulate variables other than indexing -will remove encoding information. - -In some cases it is useful to intentionally reset a dataset's original encoding values. -This can be done with either the :py:meth:`Dataset.drop_encoding` or -:py:meth:`DataArray.drop_encoding` methods. - -.. jupyter-execute:: - - ds_no_encoding = ds_disk.drop_encoding() - ds_no_encoding.encoding - -.. _combining multiple files: - -Reading multi-file datasets -........................... - -NetCDF files are often encountered in collections, e.g., with different files -corresponding to different model runs or one file per timestamp. -Xarray can straightforwardly combine such files into a single Dataset by making use of -:py:func:`concat`, :py:func:`merge`, :py:func:`combine_nested` and -:py:func:`combine_by_coords`. For details on the difference between these -functions see :ref:`combining data`. - -Xarray includes support for manipulating datasets that don't fit into memory -with dask_. If you have dask installed, you can open multiple files -simultaneously in parallel using :py:func:`open_mfdataset`:: - - xr.open_mfdataset('my/files/*.nc', parallel=True) - -This function automatically concatenates and merges multiple files into a -single xarray dataset. -It is the recommended way to open multiple files with xarray. -For more details on parallel reading, see :ref:`combining.multi`, :ref:`dask.io` and a -`blog post`_ by Stephan Hoyer. -:py:func:`open_mfdataset` takes many kwargs that allow you to -control its behaviour (for e.g. ``parallel``, ``combine``, ``compat``, ``join``, ``concat_dim``). -See its docstring for more details. - - -.. note:: - - A common use-case involves a dataset distributed across a large number of files with - each file containing a large number of variables. Commonly, a few of these variables - need to be concatenated along a dimension (say ``"time"``), while the rest are equal - across the datasets (ignoring floating point differences). The following command - with suitable modifications (such as ``parallel=True``) works well with such datasets:: - - xr.open_mfdataset('my/files/*.nc', concat_dim="time", combine="nested", - data_vars='minimal', coords='minimal', compat='override') - - This command concatenates variables along the ``"time"`` dimension, but only those that - already contain the ``"time"`` dimension (``data_vars='minimal', coords='minimal'``). - Variables that lack the ``"time"`` dimension are taken from the first dataset - (``compat='override'``). - - -.. _dask: https://www.dask.org -.. _blog post: https://stephanhoyer.com/2015/06/11/xray-dask-out-of-core-labeled-arrays/ - -Sometimes multi-file datasets are not conveniently organized for easy use of :py:func:`open_mfdataset`. -One can use the ``preprocess`` argument to provide a function that takes a dataset -and returns a modified Dataset. -:py:func:`open_mfdataset` will call ``preprocess`` on every dataset -(corresponding to each file) prior to combining them. - - -If :py:func:`open_mfdataset` does not meet your needs, other approaches are possible. 
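-
-Before turning to those, here is a minimal sketch of the ``preprocess`` hook
-mentioned above. The file pattern, coordinate names and selected window are
-hypothetical; they only illustrate the shape of such a callback::
-
-    def trim_to_region(ds):
-        # runs once per file, before the datasets are combined
-        return ds.sel(lat=slice(60, 20), lon=slice(200, 300))
-
-    ds = xr.open_mfdataset("my/files/*.nc", preprocess=trim_to_region, parallel=True)
-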
-The general pattern for parallel reading of multiple files -using dask, modifying those datasets and then combining into a single ``Dataset`` is:: - - def modify(ds): - # modify ds here - return ds - - - # this is basically what open_mfdataset does - open_kwargs = dict(decode_cf=True, decode_times=False) - open_tasks = [dask.delayed(xr.open_dataset)(f, **open_kwargs) for f in file_names] - tasks = [dask.delayed(modify)(task) for task in open_tasks] - datasets = dask.compute(tasks) # get a list of xarray.Datasets - combined = xr.combine_nested(datasets) # or some combination of concat, merge - - -As an example, here's how we could approximate ``MFDataset`` from the netCDF4 -library:: - - from glob import glob - import xarray as xr - - def read_netcdfs(files, dim): - # glob expands paths with * to a list of files, like the unix shell - paths = sorted(glob(files)) - datasets = [xr.open_dataset(p) for p in paths] - combined = xr.concat(datasets, dim) - return combined - - combined = read_netcdfs('/all/my/files/*.nc', dim='time') - -This function will work in many cases, but it's not very robust. First, it -never closes files, which means it will fail if you need to load more than -a few thousand files. Second, it assumes that you want all the data from each -file and that it can all fit into memory. In many situations, you only need -a small subset or an aggregated summary of the data from each file. - -Here's a slightly more sophisticated example of how to remedy these -deficiencies:: - - def read_netcdfs(files, dim, transform_func=None): - def process_one_path(path): - # use a context manager, to ensure the file gets closed after use - with xr.open_dataset(path) as ds: - # transform_func should do some sort of selection or - # aggregation - if transform_func is not None: - ds = transform_func(ds) - # load all data from the transformed dataset, to ensure we can - # use it after closing each original file - ds.load() - return ds - - paths = sorted(glob(files)) - datasets = [process_one_path(p) for p in paths] - combined = xr.concat(datasets, dim) - return combined - - # here we suppose we only care about the combined mean of each file; - # you might also use indexing operations like .sel to subset datasets - combined = read_netcdfs('/all/my/files/*.nc', dim='time', - transform_func=lambda ds: ds.mean()) - -This pattern works well and is very robust. We've used similar code to process -tens of thousands of files constituting 100s of GB of data. - - -.. _io.netcdf.writing_encoded: - -Writing encoded data -~~~~~~~~~~~~~~~~~~~~ - -Conversely, you can customize how xarray writes netCDF files on disk by -providing explicit encodings for each dataset variable. The ``encoding`` -argument takes a dictionary with variable names as keys and variable specific -encodings as values. These encodings are saved as attributes on the netCDF -variables on disk, which allows xarray to faithfully read encoded data back into -memory. - -It is important to note that using encodings is entirely optional: if you do not -supply any of these encoding options, xarray will write data to disk using a -default encoding, or the options in the ``encoding`` attribute, if set. -This works perfectly fine in most cases, but encoding can be useful for -additional control, especially for enabling compression. - -In the file on disk, these encodings are saved as attributes on each variable, which -allow xarray and other CF-compliant tools for working with netCDF files to correctly -read the data. 
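-
-As a minimal illustration (reusing the ``foo`` variable from the dataset created
-above; the file name and compression settings are arbitrary), compression can be
-requested per variable through ``encoding``::
-
-    # zlib compression requires a netCDF4-capable engine
-    ds.to_netcdf(
-        "compressed.nc",
-        encoding={"foo": {"zlib": True, "complevel": 4}},
-    )
-
-The individual options are described in the sections below.
-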
-Scaling and type conversions
-............................
-
-These encoding options (based on `CF Conventions on packed data`_) work on any
-version of the netCDF file format:
-
-- ``dtype``: Any valid NumPy dtype or string convertible to a dtype, e.g., ``'int16'``
-  or ``'float32'``. This controls the type of the data written on disk.
-- ``_FillValue``: Values of ``NaN`` in xarray variables are remapped to this value when
-  saved on disk. This is important when converting floating point with missing values
-  to integers on disk, because ``NaN`` is not a valid value for integer dtypes. By
-  default, variables with float types are attributed a ``_FillValue`` of ``NaN`` in the
-  output file, unless explicitly disabled with an encoding ``{'_FillValue': None}``.
-- ``scale_factor`` and ``add_offset``: Used to convert from encoded data on disk to
-  the decoded data in memory, according to the formula
-  ``decoded = scale_factor * encoded + add_offset``. Please note that ``scale_factor``
-  and ``add_offset`` must be of the same type and determine the type of the decoded data.
-
-These parameters can be fruitfully combined to compress discretized data on disk. For
-example, to save the variable ``foo`` with a precision of 0.1 in 16-bit integers while
-converting ``NaN`` to ``-9999``, we would use
-``encoding={'foo': {'dtype': 'int16', 'scale_factor': 0.1, '_FillValue': -9999}}``.
-Compression and decompression with such discretization is extremely fast.
-
-.. _CF Conventions on packed data: https://cfconventions.org/cf-conventions/cf-conventions.html#packed-data
-
-.. _io.string-encoding:
-
-String encoding
-...............
-
-Xarray can write unicode strings to netCDF files in two ways:
-
-- As variable length strings. This is only supported on netCDF4 (HDF5) files.
-- By encoding strings into bytes, and writing encoded bytes as a character
-  array. The default encoding is UTF-8.
-
-By default, we use variable length strings for compatible files and fall back
-to using encoded character arrays. Character arrays can be selected even for
-netCDF4 files by setting the ``dtype`` field in ``encoding`` to ``S1``
-(corresponding to NumPy's single-character bytes dtype).
-
-If character arrays are used:
-
-- The string encoding that was used is stored on
-  disk in the ``_Encoding`` attribute, which matches an ad-hoc convention
-  `adopted by the netCDF4-Python library `_.
-  At the time of this writing (October 2017), a standard convention for indicating
-  string encoding for character arrays in netCDF files was
-  `still under discussion `_.
-  Technically, you can use
-  `any string encoding recognized by Python `_ if you feel the need to deviate from UTF-8,
-  by setting the ``_Encoding`` field in ``encoding``. But
-  `we don't recommend it `_.
-- The character dimension name can be specified by the ``char_dim_name`` field of a variable's
-  ``encoding``. If the name of the character dimension is not specified, the default is
-  ``f'string{data.shape[-1]}'``. When decoding character arrays from existing files, the
-  ``char_dim_name`` is added to the variable's ``encoding`` so that it is preserved if the
-  data is encoded again, but the field can be edited by the user.
-
-.. warning::
-
-    Missing values in bytes or unicode string arrays (represented by ``NaN`` in
-    xarray) are currently written to disk as empty strings ``''``. This means
-    missing values will not be restored when data is loaded from disk.
-    This behavior is likely to change in the future (:issue:`1647`).
- Unfortunately, explicitly setting a ``_FillValue`` for string arrays to handle - missing values doesn't work yet either, though we also hope to fix this in the - future. - -Chunk based compression -....................... - -``zlib``, ``complevel``, ``fletcher32``, ``contiguous`` and ``chunksizes`` -can be used for enabling netCDF4/HDF5's chunk based compression, as described -in the `documentation for createVariable`_ for netCDF4-Python. This only works -for netCDF4 files and thus requires using ``format='netCDF4'`` and either -``engine='netcdf4'`` or ``engine='h5netcdf'``. - -.. _documentation for createVariable: https://unidata.github.io/netcdf4-python/#netCDF4.Dataset.createVariable - -Chunk based gzip compression can yield impressive space savings, especially -for sparse data, but it comes with significant performance overhead. HDF5 -libraries can only read complete chunks back into memory, and maximum -decompression speed is in the range of 50-100 MB/s. Worse, HDF5's compression -and decompression currently cannot be parallelized with dask. For these reasons, we -recommend trying discretization based compression (described above) first. - -Time units -.......... - -The ``units`` and ``calendar`` attributes control how xarray serializes ``datetime64`` and -``timedelta64`` arrays to datasets on disk as numeric values. The ``units`` encoding -should be a string like ``'days since 1900-01-01'`` for ``datetime64`` data or a string -like ``'days'`` for ``timedelta64`` data. ``calendar`` should be one of the calendar types -supported by netCDF4-python: ``'standard'``, ``'gregorian'``, ``'proleptic_gregorian'``, ``'noleap'``, -``'365_day'``, ``'360_day'``, ``'julian'``, ``'all_leap'``, ``'366_day'``. - -By default, xarray uses the ``'proleptic_gregorian'`` calendar and units of the smallest time -difference between values, with a reference time of the first time value. - - -.. _io.coordinates: - -Coordinates -........... - -You can control the ``coordinates`` attribute written to disk by specifying ``DataArray.encoding["coordinates"]``. -If not specified, xarray automatically sets ``DataArray.encoding["coordinates"]`` to a space-delimited list -of names of coordinate variables that share dimensions with the ``DataArray`` being written. -This allows perfect roundtripping of xarray datasets but may not be desirable. -When an xarray ``Dataset`` contains non-dimensional coordinates that do not share dimensions with any of -the variables, these coordinate variable names are saved under a "global" ``"coordinates"`` attribute. -This is not CF-compliant but again facilitates roundtripping of xarray datasets. - -Invalid netCDF files -~~~~~~~~~~~~~~~~~~~~ - -The library ``h5netcdf`` allows writing some dtypes that aren't -allowed in netCDF4 (see -`h5netcdf documentation `_). -This feature is available through :py:meth:`DataArray.to_netcdf` and -:py:meth:`Dataset.to_netcdf` when used with ``engine="h5netcdf"``, only if -``invalid_netcdf=True`` is explicitly set. - -.. warning:: - - Note that this produces a file that is likely to be not readable by other netCDF - libraries! - -.. _io.hdf5: - -HDF5 ----- -`HDF5`_ is both a file format and a data model for storing information. HDF5 stores -data hierarchically, using groups to create a nested structure. HDF5 is a more -general version of the netCDF4 data model, so the nested structure is one of many -similarities between the two data formats. 
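-
-One practical consequence of this overlap is that generic HDF5 tools can inspect
-netCDF4 files directly. A small sketch with `h5py`_ (assuming a netCDF4 file like
-the one written earlier on this page)::
-
-    import h5py
-
-    # a netCDF4 file is a valid HDF5 file, so h5py opens it directly
-    with h5py.File("saved_on_disk.nc", "r") as f:
-        print(list(f.keys()))  # datasets and groups at the root
-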
-Reading HDF5 files in xarray requires the ``h5netcdf`` engine, which can be installed
-with ``conda install h5netcdf``. Once installed, we can use xarray to open HDF5 files:
-
-.. code:: python
-
-    xr.open_dataset("/path/to/my/file.h5")
-
-The similarities between HDF5 and netCDF4 mean that HDF5 data can be written with the
-same :py:meth:`Dataset.to_netcdf` method as used for netCDF4 data:
-
-.. jupyter-execute::
-
-    ds = xr.Dataset(
-        {"foo": (("x", "y"), np.random.rand(4, 5))},
-        coords={
-            "x": [10, 20, 30, 40],
-            "y": pd.date_range("2000-01-01", periods=5),
-            "z": ("x", list("abcd")),
-        },
-    )
-
-.. jupyter-execute::
-    :hide-code:
-
-    # Check if the file exists and if not, create it
-    if not os.path.exists("saved_on_disk.h5"):
-        ds.to_netcdf("saved_on_disk.h5")
-
-.. code:: python
-
-    ds.to_netcdf("saved_on_disk.h5")
-
-Groups
-~~~~~~
-
-If you have multiple or highly nested groups, xarray by default may not read the group
-that you want. A particular group of an HDF5 file can be specified using the ``group``
-argument:
-
-.. code:: python
-
-    xr.open_dataset("/path/to/my/file.h5", group="/my/group")
-
-While xarray cannot interrogate an HDF5 file to determine which groups are available,
-the HDF5 Python reader `h5py`_ can be used instead.
-
-Natively the xarray data structures can only handle one level of nesting, organized as
-DataArrays inside of Datasets. If your HDF5 file has additional levels of hierarchy you
-can only access one group at a time and will need to specify group names.
-
-.. _HDF5: https://hdfgroup.github.io/hdf5/index.html
-.. _h5py: https://www.h5py.org/
-
-
-.. _io.zarr:
-
-Zarr
-----
-
-`Zarr`_ is a Python package that provides an implementation of chunked, compressed,
-N-dimensional arrays.
-Zarr has the ability to store arrays in a range of ways, including in memory,
-in files, and in cloud-based object storage such as `Amazon S3`_ and
-`Google Cloud Storage`_.
-Xarray's Zarr backend allows xarray to leverage these capabilities, including
-the ability to store and analyze datasets far too large to fit onto disk
-(particularly :ref:`in combination with dask `).
-
-Xarray can't open just any zarr dataset, because xarray requires special
-metadata (attributes) describing the dataset dimensions and coordinates.
-At this time, xarray can only open zarr datasets with these special attributes,
-such as zarr datasets written by xarray,
-`netCDF `_,
-or `GDAL `_.
-For implementation details, see :ref:`zarr_encoding`.
-
-To write a dataset with zarr, we use the :py:meth:`Dataset.to_zarr` method.
-
-To write to a local directory, we pass a path to a directory:
-
-.. jupyter-execute::
-
-    zarr_filename = "example.zarr"
-
-.. jupyter-execute::
-    :hide-code:
-
-    import os.path
-    import tempfile
-
-    tempdir = tempfile.TemporaryDirectory()
-    zarr_filename = os.path.join(tempdir.name, zarr_filename)
-
-.. jupyter-execute::
-    :stderr:
-
-    ds = xr.Dataset(
-        {"foo": (("x", "y"), np.random.rand(4, 5))},
-        coords={
-            "x": [10, 20, 30, 40],
-            "y": pd.date_range("2000-01-01", periods=5),
-            "z": ("x", list("abcd")),
-        },
-    )
-    ds.to_zarr(zarr_filename, zarr_format=2, consolidated=False)
-
-(The suffix ``.zarr`` is optional--just a reminder that a zarr store lives
-there.) If the directory does not exist, it will be created. If a zarr
-store is already present at that path, an error will be raised, preventing it
-from being overwritten. To override this behavior and overwrite an existing
-store, add ``mode='w'`` when invoking :py:meth:`~Dataset.to_zarr`.
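-
-For example (a sketch only -- ``mode='w'`` is destructive, so use it when the
-existing store is disposable)::
-
-    # replace whatever store already exists at this path
-    ds.to_zarr(zarr_filename, mode="w", zarr_format=2, consolidated=False)
-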
-DataArrays can also be saved to disk using the :py:meth:`DataArray.to_zarr` method,
-and loaded from disk using the :py:func:`open_dataarray` function with ``engine='zarr'``.
-Similar to :py:meth:`DataArray.to_netcdf`, :py:meth:`DataArray.to_zarr` will
-convert the ``DataArray`` to a ``Dataset`` before saving, and then convert back
-when loading, ensuring that the ``DataArray`` that is loaded is always exactly
-the same as the one that was saved.
-
-.. note::
-
-    xarray does not write `NCZarr `_ attributes.
-    Therefore, NCZarr data must be opened in read-only mode.
-
-To store variable length strings, convert them to object arrays first with
-``dtype=object``.
-
-To read back a zarr dataset that has been created this way, we use the
-:py:func:`open_zarr` method:
-
-.. jupyter-execute::
-
-    ds_zarr = xr.open_zarr(zarr_filename, consolidated=False)
-    ds_zarr
-
-Cloud Storage Buckets
-~~~~~~~~~~~~~~~~~~~~~
-
-It is possible to read and write xarray datasets directly from / to cloud
-storage buckets using zarr. This example uses the `gcsfs`_ package to provide
-an interface to `Google Cloud Storage`_.
-
-General `fsspec`_ URLs, those that begin with ``s3://`` or ``gcs://`` for example,
-are parsed and the store set up for you automatically when reading.
-You should include any arguments to the storage backend as the
-``storage_options`` key within ``backend_kwargs``.
-
-.. code:: python
-
-    ds_gcs = xr.open_dataset(
-        "gcs:///path.zarr",
-        backend_kwargs={
-            "storage_options": {"project": "", "token": None}
-        },
-        engine="zarr",
-    )
-
-
-This also works with ``open_mfdataset``, allowing you to pass a list of paths or
-a URL to be interpreted as a glob string.
-
-For writing, you may either specify a bucket URL or explicitly set up a
-``zarr.abc.store.Store`` instance, as follows:
-
-.. tab:: URL
-
-    .. code:: python
-
-        # write to the bucket via GCS URL
-        ds.to_zarr("gs://")
-        # read it back
-        ds_gcs = xr.open_zarr("gs://")
-
-.. tab:: fsspec
-
-    .. code:: python
-
-        import gcsfs
-        import zarr
-
-        # manually manage the cloud filesystem connection -- useful, for example,
-        # when you need to manage permissions to cloud resources
-        fs = gcsfs.GCSFileSystem(project="", token=None)
-        zstore = zarr.storage.FsspecStore(fs, path="")
-
-        # write to the bucket
-        ds.to_zarr(store=zstore)
-        # read it back
-        ds_gcs = xr.open_zarr(zstore)
-
-.. tab:: obstore
-
-    .. code:: python
-
-        import obstore
-        import zarr
-
-        # alternatively, obstore offers a modern, performant interface for
-        # cloud buckets
-        gcsstore = obstore.store.GCSStore(
-            "", prefix="", skip_signature=True
-        )
-        zstore = zarr.storage.ObjectStore(gcsstore)
-
-        # write to the bucket
-        ds.to_zarr(store=zstore)
-        # read it back
-        ds_gcs = xr.open_zarr(zstore)
-
-
-.. _fsspec: https://filesystem-spec.readthedocs.io/en/latest/
-.. _obstore: https://developmentseed.org/obstore/latest/
-.. _Zarr: https://zarr.readthedocs.io/
-.. _Amazon S3: https://aws.amazon.com/s3/
-.. _Google Cloud Storage: https://cloud.google.com/storage/
-.. _gcsfs: https://github.com/fsspec/gcsfs
-
-.. _io.zarr.distributed_writes:
-
-Distributed writes
-~~~~~~~~~~~~~~~~~~
-
-Xarray will natively use dask to write in parallel to a zarr store, which should
-satisfy most moderately sized datasets. For more flexible parallelization, we
-can use ``region`` to write to limited regions of arrays in an existing Zarr
-store.
-
-To scale this up to writing large datasets, first create an initial Zarr store
-without writing all of its array data. 
This can be done by first creating a
-``Dataset`` with dummy values stored in :ref:`dask `, and then calling
-``to_zarr`` with ``compute=False`` to write only metadata (including ``attrs``)
-to Zarr:
-
-.. jupyter-execute::
-    :hide-code:
-
-    tempdir.cleanup()
-
-.. jupyter-execute::
-
-    import dask.array
-
-    # The values of this dask array are entirely irrelevant; only the dtype,
-    # shape and chunks are used
-    dummies = dask.array.zeros(30, chunks=10)
-    ds = xr.Dataset({"foo": ("x", dummies)}, coords={"x": np.arange(30)})
-    # Now we write the metadata without computing any array values
-    ds.to_zarr(zarr_filename, compute=False, consolidated=False)
-
-Now, a Zarr store with the correct variable shapes and attributes exists that
-can be filled out by subsequent calls to ``to_zarr``.
-Setting ``region="auto"`` will open the existing store and determine the
-correct alignment of the new data with the existing dimensions. Alternatively,
-``region`` accepts an explicit mapping from dimension names to Python ``slice``
-objects indicating where the data should be written (in index space, not label
-space), e.g.,
-
-.. jupyter-execute::
-
-    # For convenience, we'll slice a single dataset, but in the real use-case
-    # we would create them separately possibly even from separate processes.
-    ds = xr.Dataset({"foo": ("x", np.arange(30))}, coords={"x": np.arange(30)})
-    # Any of the following region specifications are valid
-    ds.isel(x=slice(0, 10)).to_zarr(zarr_filename, region="auto", consolidated=False)
-    ds.isel(x=slice(10, 20)).to_zarr(zarr_filename, region={"x": "auto"}, consolidated=False)
-    ds.isel(x=slice(20, 30)).to_zarr(zarr_filename, region={"x": slice(20, 30)}, consolidated=False)
-
-Concurrent writes with ``region`` are safe as long as they modify distinct
-chunks in the underlying Zarr arrays (or use an appropriate ``lock``).
-
-As a safety check to make it harder to inadvertently override existing values,
-if you set ``region`` then *all* variables included in a Dataset must have
-dimensions included in ``region``. Other variables (typically coordinates)
-need to be explicitly dropped and/or written in separate calls to ``to_zarr``
-with ``mode='a'``.
-
-Zarr Compressors and Filters
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-There are many different `options for compression and filtering possible with
-zarr `_.
-
-These options can be passed to the ``to_zarr`` method as variable encoding.
-For example:
-
-.. jupyter-execute::
-
-    zarr_filename = "foo.zarr"
-
-.. jupyter-execute::
-    :hide-code:
-
-    import os.path
-    import tempfile
-
-    tempdir = tempfile.TemporaryDirectory()
-    zarr_filename = os.path.join(tempdir.name, zarr_filename)
-
-.. jupyter-execute::
-
-    import zarr
-    from zarr.codecs import BloscCodec
-
-    compressor = BloscCodec(cname="zstd", clevel=3, shuffle="shuffle")
-    ds.to_zarr(zarr_filename, consolidated=False, encoding={"foo": {"compressors": [compressor]}})
-
-.. note::
-
-    Not all native zarr compression and filtering options have been tested with
-    xarray.
-
-.. _io.zarr.appending:
-
-Modifying existing Zarr stores
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Xarray supports several ways of incrementally writing variables to a Zarr
-store. These options are useful for scenarios when it is infeasible or
-undesirable to write your entire dataset at once.
-
-1. Use ``mode='a'`` to add or overwrite entire variables,
-2. Use ``append_dim`` to resize and append to existing variables, and
-3. Use ``region`` to write to limited regions of existing arrays.
-
-.. 
tip:: - - For ``Dataset`` objects containing dask arrays, a - single call to ``to_zarr()`` will write all of your data in parallel. - -.. warning:: - - Alignment of coordinates is currently not checked when modifying an - existing Zarr store. It is up to the user to ensure that coordinates are - consistent. - -To add or overwrite entire variables, simply call :py:meth:`~Dataset.to_zarr` -with ``mode='a'`` on a Dataset containing the new variables, passing in an -existing Zarr store or path to a Zarr store. - -To resize and then append values along an existing dimension in a store, set -``append_dim``. This is a good option if data always arrives in a particular -order, e.g., for time-stepping a simulation: - -.. jupyter-execute:: - :hide-code: - - tempdir.cleanup() - -.. jupyter-execute:: - - ds1 = xr.Dataset( - {"foo": (("x", "y", "t"), np.random.rand(4, 5, 2))}, - coords={ - "x": [10, 20, 30, 40], - "y": [1, 2, 3, 4, 5], - "t": pd.date_range("2001-01-01", periods=2), - }, - ) - ds1.to_zarr(zarr_filename, consolidated=False) - -.. jupyter-execute:: - - ds2 = xr.Dataset( - {"foo": (("x", "y", "t"), np.random.rand(4, 5, 2))}, - coords={ - "x": [10, 20, 30, 40], - "y": [1, 2, 3, 4, 5], - "t": pd.date_range("2001-01-03", periods=2), - }, - ) - ds2.to_zarr(zarr_filename, append_dim="t", consolidated=False) - -.. _io.zarr.writing_chunks: - -Specifying chunks in a zarr store -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Chunk sizes may be specified in one of three ways when writing to a zarr store: - -1. Manual chunk sizing through the use of the ``encoding`` argument in :py:meth:`Dataset.to_zarr`: -2. Automatic chunking based on chunks in dask arrays -3. Default chunk behavior determined by the zarr library - -The resulting chunks will be determined based on the order of the above list; dask -chunks will be overridden by manually-specified chunks in the encoding argument, -and the presence of either dask chunks or chunks in the ``encoding`` attribute will -supersede the default chunking heuristics in zarr. - -Importantly, this logic applies to every array in the zarr store individually, -including coordinate arrays. Therefore, if a dataset contains one or more dask -arrays, it may still be desirable to specify a chunk size for the coordinate arrays -(for example, with a chunk size of ``-1`` to include the full coordinate). - -To specify chunks manually using the ``encoding`` argument, provide a nested -dictionary with the structure ``{'variable_or_coord_name': {'chunks': chunks_tuple}}``. - -.. note:: - - The positional ordering of the chunks in the encoding argument must match the - positional ordering of the dimensions in each array. Watch out for arrays with - differently-ordered dimensions within a single Dataset. - -For example, let's say we're working with a dataset with dimensions -``('time', 'x', 'y')``, a variable ``Tair`` which is chunked in ``x`` and ``y``, -and two multi-dimensional coordinates ``xc`` and ``yc``: - -.. jupyter-execute:: - - ds = xr.tutorial.open_dataset("rasm") - - ds["Tair"] = ds["Tair"].chunk({"x": 100, "y": 100}) - - ds - -These multi-dimensional coordinates are only two-dimensional and take up very little -space on disk or in memory, yet when writing to disk the default zarr behavior is to -split them into chunks: - -.. jupyter-execute:: - - ds.to_zarr(zarr_filename, consolidated=False, mode="w") - !tree -I zarr.json $zarr_filename - - -This may cause unwanted overhead on some systems, such as when reading from a cloud -storage provider. 
To disable this chunking, we can specify a chunk size equal to the
-shape of each coordinate array in the ``encoding`` argument:
-
-.. jupyter-execute::
-
-    ds.to_zarr(
-        zarr_filename,
-        encoding={"xc": {"chunks": ds.xc.shape}, "yc": {"chunks": ds.yc.shape}},
-        consolidated=False,
-        mode="w",
-    )
-    !tree -I zarr.json $zarr_filename
-
-
-The number of chunks on ``Tair`` matches our dask chunks, while there is now only a single
-chunk in the directory stores of each coordinate.
-
-Groups
-~~~~~~
-
-Nested groups in zarr stores can be represented by loading the store as a
-:py:class:`xarray.DataTree` object, similarly to netCDF. To open a whole zarr store as
-a tree of groups use the :py:func:`open_datatree` function. To save a
-``DataTree`` object as a zarr store containing many groups, use the
-:py:meth:`xarray.DataTree.to_zarr()` method.
-
-.. note::
-    Note that perfect round-tripping should always be possible with a zarr
-    store (:ref:`unlike for netCDF files `), as zarr does
-    not support "unused" dimensions.
-
-    For the root group the same restrictions (:ref:`as for netCDF files `) apply.
-    Due to file format specifications the on-disk root group name is always ``"/"``,
-    overriding any given ``DataTree`` root node name.
-
-
-.. _io.zarr.consolidated_metadata:
-
-Consolidated Metadata
-~~~~~~~~~~~~~~~~~~~~~
-
-
-Xarray needs to read all of the zarr metadata when it opens a dataset.
-In some storage mediums, such as with cloud object storage (e.g. `Amazon S3`_),
-this can introduce significant overhead, because two separate HTTP calls to the
-object store must be made for each variable in the dataset.
-By default Xarray uses a feature called
-*consolidated metadata*, storing all metadata for the entire dataset with a
-single key (by default called ``.zmetadata``). This typically drastically speeds
-up opening the store. (For more information on this feature, consult the
-`zarr docs on consolidating metadata `_.)
-
-By default, xarray writes consolidated metadata and attempts to read stores
-with consolidated metadata, falling back to non-consolidated metadata for
-reads. Because this fall-back option is so much slower, xarray issues a
-``RuntimeWarning`` with guidance when reading with consolidated metadata fails:
-
-    Failed to open Zarr store with consolidated metadata, falling back to try
-    reading non-consolidated metadata. This is typically much slower for
-    opening a dataset. To silence this warning, consider:
-
-    1. Consolidating metadata in this existing store with
-       :py:func:`zarr.consolidate_metadata`.
-    2. Explicitly setting ``consolidated=False``, to avoid trying to read
-       consolidated metadata.
-    3. Explicitly setting ``consolidated=True``, to raise an error in this case
-       instead of falling back to try reading non-consolidated metadata.
-
-
-Fill Values
-~~~~~~~~~~~
-
-Zarr arrays have a ``fill_value`` that is used for chunks that were never written to disk.
-For the Zarr version 2 format, Xarray will set ``fill_value`` to be equal to the CF/NetCDF ``"_FillValue"``.
-This is ``np.nan`` by default for floats, and unset otherwise. Note that the Zarr library will set a
-default ``fill_value`` if not specified (usually ``0``).
-
-For the Zarr version 3 format, ``_FillValue`` and ``fill_value`` are decoupled.
-So you can set ``fill_value`` in ``encoding`` as usual.
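-
-A short sketch of what that can look like; the store name, variable name and
-values here are purely illustrative::
-
-    # with Zarr format 3, fill_value no longer needs to mirror _FillValue
-    ds.to_zarr(
-        "example-v3.zarr",
-        zarr_format=3,
-        encoding={"foo": {"fill_value": -1, "_FillValue": -9999}},
-    )
-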
-Note that at read-time, you can control whether ``_FillValue`` is masked using the
-``mask_and_scale`` kwarg; and whether Zarr's ``fill_value`` is treated as synonymous
-with ``_FillValue`` using the ``use_zarr_fill_value_as_mask`` kwarg to :py:func:`xarray.open_zarr`.
-
-
-.. _io.kerchunk:
-
-Kerchunk
---------
-
-`Kerchunk `_ is a Python library
-that allows you to access chunked and compressed data formats (such as NetCDF3, NetCDF4, HDF5, GRIB2, TIFF & FITS),
-many of which are primary formats for major data archives, by viewing the
-whole archive as an ephemeral `Zarr`_ dataset which allows for parallel, chunk-specific access.
-
-Instead of creating a new copy of the dataset in the Zarr spec/format or
-downloading the files locally, Kerchunk reads through the data archive, extracts the
-byte range and compression information of each chunk, and saves it as a ``reference``.
-These references are then saved as ``json`` files or ``parquet`` (more efficient)
-for later use. You can view some of these stored in the ``references``
-directory `here `_.
-
-
-.. note::
-    These references follow this `specification `_.
-    Packages like `kerchunk`_ and `virtualizarr `_
-    help in creating and reading these references.
-
-
-Reading these data archives becomes straightforward with ``kerchunk`` in combination
-with ``xarray``, especially when these archives are large in size. A single combined
-reference can refer to thousands of the original data files present in these archives.
-You can view the whole dataset from this combined reference using the above packages.
-
-The following example shows opening a single ``json`` reference to the ``saved_on_disk.h5`` file created above.
-If the file were instead stored remotely (e.g. ``s3://saved_on_disk.h5``) you can use ``storage_options``
-that are used to `configure fsspec `_:
-
-.. jupyter-execute::
-
-    ds_kerchunked = xr.open_dataset(
-        "./combined.json",
-        engine="kerchunk",
-        storage_options={},
-    )
-
-    ds_kerchunked
-
-.. note::
-
-    You can refer to the `project pythia kerchunk cookbook `_
-    and the `pangeo guide on kerchunk `_ for more information.
-
-
-.. _io.iris:
-
-Iris
-----
-
-The Iris_ tool allows easy reading of common meteorological and climate model formats
-(including GRIB and UK MetOffice PP files) into ``Cube`` objects, which are in many ways very
-similar to ``DataArray`` objects, while enforcing a CF-compliant data model.
-
-DataArray ``to_iris`` and ``from_iris``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-If iris is installed, xarray can convert a ``DataArray`` into a ``Cube`` using
-:py:meth:`DataArray.to_iris`:
-
-.. jupyter-execute::
-
-    da = xr.DataArray(
-        np.random.rand(4, 5),
-        dims=["x", "y"],
-        coords=dict(x=[10, 20, 30, 40], y=pd.date_range("2000-01-01", periods=5)),
-    )
-
-    cube = da.to_iris()
-    print(cube)
-
-Conversely, we can create a new ``DataArray`` object from a ``Cube`` using
-:py:meth:`DataArray.from_iris`:
-
-.. jupyter-execute::
-
-    da_cube = xr.DataArray.from_iris(cube)
-    da_cube
-
-Ncdata
-~~~~~~
-
-Ncdata_ provides more sophisticated means of transferring data, including entire
-datasets. It uses the file saving and loading functions in both projects to provide a
-more "correct" translation between them, but still with very low overhead and not
-using actual disk files.
-
-Here we load an xarray dataset and convert it to Iris cubes:
-
-.. jupyter-execute::
-    :stderr:
-
-    ds = xr.tutorial.open_dataset("air_temperature_gradient")
-    cubes = ncdata.iris_xarray.cubes_from_xarray(ds)
-    print(cubes)
-
-.. 
jupyter-execute::
-
-    print(cubes[1])
-
-And we can convert the cubes back to an xarray dataset:
-
-.. jupyter-execute::
-
-    # ensure dataset-level and variable-level attributes loaded correctly
-    iris.FUTURE.save_split_attrs = True
-
-    ds = ncdata.iris_xarray.cubes_to_xarray(cubes)
-    ds
-
-Ncdata can also adjust file data within load and save operations, to fix data loading
-problems or provide exact save formatting without needing to modify files on disk.
-See, for example: `ncdata usage examples`_
-
-.. _Iris: https://scitools-iris.readthedocs.io
-.. _Ncdata: https://ncdata.readthedocs.io/en/latest/index.html
-.. _ncdata usage examples: https://github.com/pp-mo/ncdata/tree/v0.1.2?tab=readme-ov-file#correct-a-miscoded-attribute-in-iris-input
-
-.. _io.opendap:
-
-OPeNDAP
--------
-
-Xarray includes support for `OPeNDAP`__ (via the netCDF4 library or Pydap), which
-lets us access large datasets over HTTP.
-
-__ https://www.opendap.org/
-
-For example, we can open a connection to GBs of weather data produced by the
-`PRISM`__ project, and hosted by `IRI`__ at Columbia:
-
-__ https://www.prism.oregonstate.edu/
-__ https://iri.columbia.edu/
-
-
-.. jupyter-input::
-
-    remote_data = xr.open_dataset(
-        "http://iridl.ldeo.columbia.edu/SOURCES/.OSU/.PRISM/.monthly/dods",
-        decode_times=False,
-    )
-    remote_data
-
-.. jupyter-output::
-
-    <xarray.Dataset>
-    Dimensions:  (T: 1422, X: 1405, Y: 621)
-    Coordinates:
-      * X        (X) float32 -125.0 -124.958 -124.917 -124.875 -124.833 -124.792 -124.75 ...
-      * T        (T) float32 -779.5 -778.5 -777.5 -776.5 -775.5 -774.5 -773.5 -772.5 -771.5 ...
-      * Y        (Y) float32 49.9167 49.875 49.8333 49.7917 49.75 49.7083 49.6667 49.625 ...
-    Data variables:
-        ppt      (T, Y, X) float64 ...
-        tdmean   (T, Y, X) float64 ...
-        tmax     (T, Y, X) float64 ...
-        tmin     (T, Y, X) float64 ...
-    Attributes:
-        Conventions:  IRIDL
-        expires:      1375315200
-
-.. TODO: update this example to show off decode_cf?
-
-.. note::
-
-    Like many real-world datasets, this dataset does not entirely follow
-    `CF conventions`_. Unexpected formats will usually cause xarray's automatic
-    decoding to fail. The way to work around this is either to set
-    ``decode_cf=False`` in ``open_dataset`` to turn off all use of CF
-    conventions, or to only disable the troublesome parser.
-    In this case, we set ``decode_times=False`` because the time axis here
-    provides the calendar attribute in a format that xarray does not expect
-    (the integer ``360`` instead of a string like ``'360_day'``).
-
-We can select and slice this data any number of times, and nothing is loaded
-over the network until we look at particular values:
-
-.. jupyter-input::
-
-    tmax = remote_data["tmax"][:500, ::3, ::3]
-    tmax
-
-.. jupyter-output::
-
-    <xarray.DataArray 'tmax' (T: 500, Y: 207, X: 469)>
-    [48541500 values with dtype=float64]
-    Coordinates:
-      * Y        (Y) float32 49.9167 49.7917 49.6667 49.5417 49.4167 49.2917 ...
-      * X        (X) float32 -125.0 -124.875 -124.75 -124.625 -124.5 -124.375 ...
-      * T        (T) float32 -779.5 -778.5 -777.5 -776.5 -775.5 -774.5 -773.5 ...
-    Attributes:
-        pointwidth:     120
-        standard_name:  air_temperature
-        units:          Celsius_scale
-        expires:        1443657600
-
-.. jupyter-input::
-
-    # the data is downloaded automatically when we make the plot
-    tmax[0].plot()
-
-.. image:: ../_static/opendap-prism-tmax.png
-
-Some servers require authentication before we can access the data. Pydap uses
-a `Requests`__ session object (which the user can pre-define), and this
-session object can recover `authentication`__ credentials from a locally stored
-``.netrc`` file. 
For example, to connect to a server that requires NASA's
-URS authentication, with the username/password credentials stored on a locally
-accessible ``.netrc``, access to OPeNDAP data should be as simple as this::
-
-    import xarray as xr
-    import requests
-
-    my_session = requests.Session()
-
-    ds_url = 'https://gpm1.gesdisc.eosdis.nasa.gov/opendap/hyrax/example.nc'
-
-    ds = xr.open_dataset(ds_url, session=my_session, engine="pydap")
-
-Moreover, a bearer token header can be included in a ``requests`` session
-object, allowing for token-based authentication which OPeNDAP servers can use
-to avoid some redirects.
-
-
-Lastly, OPeNDAP servers may provide endpoint URLs for different OPeNDAP protocols,
-DAP2 and DAP4. To specify which of the two protocols to use, you can
-replace the scheme of the URL with the name of the protocol. For example::
-
-    # dap2 url
-    ds_url = 'dap2://gpm1.gesdisc.eosdis.nasa.gov/opendap/hyrax/example.nc'
-
-    # dap4 url
-    ds_url = 'dap4://gpm1.gesdisc.eosdis.nasa.gov/opendap/hyrax/example.nc'
-
-While most OPeNDAP servers implement DAP2, not all servers implement DAP4. It
-is recommended to check whether the URL you are using `supports DAP4`__ by opening
-it in a browser.
-
-__ https://docs.python-requests.org
-__ https://pydap.github.io/pydap/en/notebooks/Authentication.html
-__ https://pydap.github.io/pydap/en/faqs/dap2_or_dap4_url.html
-
-.. _io.pickle:
-
-Pickle
-------
-
-The simplest way to serialize an xarray object is to use Python's built-in pickle
-module:
-
-.. jupyter-execute::
-
-    import pickle
-
-    # use the highest protocol (-1) because it is way faster than the default
-    # text based pickle format
-    pkl = pickle.dumps(ds, protocol=-1)
-
-    pickle.loads(pkl)
-
-Pickling is important because it doesn't require any external libraries
-and lets you use xarray objects with Python modules like
-:py:mod:`multiprocessing` or :ref:`Dask `. However, pickling is
-**not recommended for long-term storage**.
-
-Restoring a pickle requires that the internal structure of the types for the
-pickled data remain unchanged. Because the internal design of xarray is still
-being refined, we make no guarantees (at this point) that objects pickled with
-this version of xarray will work in future versions.
-
-.. note::
-
-    When pickling an object opened from a NetCDF file, the pickle file will
-    contain a reference to the file on disk. If you want to store the actual
-    array values, load it into memory first with :py:meth:`Dataset.load`
-    or :py:meth:`Dataset.compute`.
-
-.. _dictionary io:
-
-Dictionary
-----------
-
-We can convert a ``Dataset`` (or a ``DataArray``) to a dict using
-:py:meth:`Dataset.to_dict`:
-
-.. jupyter-execute::
-
-    ds = xr.Dataset({"foo": ("x", np.arange(30))})
-    d = ds.to_dict()
-    d
-
-We can create a new xarray object from a dict using
-:py:meth:`Dataset.from_dict`:
-
-.. jupyter-execute::
-
-    ds_dict = xr.Dataset.from_dict(d)
-    ds_dict
-
-Dictionary support allows for flexible use of xarray objects. It doesn't
-require external libraries and dicts can easily be pickled, or converted to
-JSON or GeoJSON. All the values are converted to lists, so dicts might
-be quite large.
-
-To export just the dataset schema without the data itself, use the
-``data=False`` option:
-
-.. jupyter-execute::
-
-    ds.to_dict(data=False)
-
-.. jupyter-execute::
-    :hide-code:
-
-    # We're now done with the dataset named `ds`. Although the `with` statement closed
-    # the dataset, displaying the unpickled pickle of `ds` re-opened "saved_on_disk.nc".
-
-This can be useful for generating indices of dataset contents to expose to
-search indices or other automated data discovery tools.
-
-.. jupyter-execute::
-    :hide-code:
-
-    # We're now done with the dataset named `ds`. Although the `with` statement closed
-    # the dataset, displaying the unpickled pickle of `ds` re-opened "saved_on_disk.nc".
-    # However, `ds` (rather than the unpickled dataset) refers to the open file. Delete
-    # `ds` to close the file.
-    del ds
-
-    tempdir.cleanup()
-
-.. _io.rasterio:
-
-Rasterio
---------
-
-Raster data readable by GDAL, such as GeoTIFFs, can be opened via `rasterio`_ using the
-`rioxarray`_ extension. `rioxarray`_ can also handle geospatial tasks such as
-reprojecting and clipping.
-
-.. jupyter-input::
-
-    import rioxarray
-
-    rds = rioxarray.open_rasterio("RGB.byte.tif")
-    rds
-
-.. jupyter-output::
-
-    <xarray.DataArray (band: 3, y: 718, x: 791)>
-    [1703814 values with dtype=uint8]
-    Coordinates:
-      * band         (band) int64 1 2 3
-      * y            (y) float64 2.827e+06 2.826e+06 ... 2.612e+06 2.612e+06
-      * x            (x) float64 1.021e+05 1.024e+05 ... 3.389e+05 3.392e+05
-        spatial_ref  int64 0
-    Attributes:
-        STATISTICS_MAXIMUM:  255
-        STATISTICS_MEAN:     29.947726688477
-        STATISTICS_MINIMUM:  0
-        STATISTICS_STDDEV:   52.340921626611
-        transform:           (300.0379266750948, 0.0, 101985.0, 0.0, -300.0417827...
-        _FillValue:          0.0
-        scale_factor:        1.0
-        add_offset:          0.0
-        grid_mapping:        spatial_ref
-
-.. jupyter-input::
-
-    rds.rio.crs
-    # CRS.from_epsg(32618)
-
-    rds4326 = rds.rio.reproject("epsg:4326")
-
-    rds4326.rio.crs
-    # CRS.from_epsg(4326)
-
-    rds4326.rio.to_raster("RGB.byte.4326.tif")
-
-
-.. _rasterio: https://rasterio.readthedocs.io/en/latest/
-.. _rioxarray: https://corteva.github.io/rioxarray/stable/
-.. _test files: https://github.com/rasterio/rasterio/blob/master/tests/data/RGB.byte.tif
-.. _pyproj: https://github.com/pyproj4/pyproj
-
-.. _io.cfgrib:
-
-.. jupyter-execute::
-    :hide-code:
-
-    tempdir.cleanup()
-
-GRIB format via cfgrib
-----------------------
-
-Xarray supports reading GRIB files via the ECMWF cfgrib_ Python driver,
-if it is installed. To open a GRIB file, supply ``engine='cfgrib'``
-to :py:func:`open_dataset` after installing cfgrib_:
-
-.. jupyter-input::
-
-    ds_grib = xr.open_dataset("example.grib", engine="cfgrib")
-
-We recommend installing cfgrib via conda::
-
-    conda install -c conda-forge cfgrib
-
-.. _cfgrib: https://github.com/ecmwf/cfgrib
-
-
-CSV and other formats supported by pandas
------------------------------------------
-
-For more options (tabular formats and CSV files in particular), consider
-exporting your objects to pandas and using its broad range of `IO tools`_.
-For CSV files, one might also consider `xarray_extras`_.
-
-.. _xarray_extras: https://xarray-extras.readthedocs.io/en/latest/api/csv.html
-
-.. _IO tools: https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html
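-
-As a minimal sketch of the pandas route (the file name is hypothetical), a
-one-dimensional dataset can be round-tripped through a CSV file:
-
-.. jupyter-input::
-
-    import pandas as pd
-
-    # write via pandas, then rebuild the dataset from the index column
-    ds = xr.Dataset({"foo": ("x", np.arange(30))})
-    ds.to_dataframe().to_csv("example.csv")
-    ds_csv = pd.read_csv("example.csv", index_col="x").to_xarray()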
-
-Third party libraries
----------------------
-
-More formats are supported by extension libraries:
-
+.. toctree::
+   :maxdepth: 2
-- `xarray-mongodb <https://xarray-mongodb.readthedocs.io>`_: Store xarray objects on MongoDB
+
+   io-netcdf
+   io-zarr
+   io-other
diff --git a/doc/user-guide/plotting-2d.rst b/doc/user-guide/plotting-2d.rst
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/doc/user-guide/plotting-faceting.rst b/doc/user-guide/plotting-faceting.rst
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/doc/user-guide/plotting-lines.rst b/doc/user-guide/plotting-lines.rst
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/doc/user-guide/plotting-scatter.rst b/doc/user-guide/plotting-scatter.rst
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/doc/user-guide/plotting.rst b/doc/user-guide/plotting.rst
index 6d45130611e..14b0f6b3547 100644
--- a/doc/user-guide/plotting.rst
+++ b/doc/user-guide/plotting.rst
@@ -1,950 +1,10 @@
-.. currentmodule:: xarray
-.. _plotting:
-
 Plotting
 ========
 
-Introduction
-------------
-
-Labeled data enables expressive computations. These same
-labels can also be used to easily create informative plots.
-
-Xarray's plotting capabilities are centered around
-:py:class:`DataArray` objects.
-To plot :py:class:`Dataset` objects
-simply access the relevant DataArrays, e.g. ``dset['var1']``.
-Dataset-specific plotting routines are also available (see :ref:`plot-dataset`).
-Here we focus mostly on arrays 2D or larger. If your data fits
-nicely into a pandas DataFrame then you're better off using one of pandas'
-more developed plotting tools.
-
-Xarray plotting functionality is a thin wrapper around the popular
-`matplotlib <https://matplotlib.org/>`_ library.
-Matplotlib syntax and function names were copied as much as possible, which
-makes for an easy transition between the two.
-Matplotlib must be installed before xarray can plot.
-
-To use xarray's plotting capabilities with time coordinates containing
-``cftime.datetime`` objects,
-`nc-time-axis <https://github.com/SciTools/nc-time-axis>`_ v1.3.0 or later
-needs to be installed.
-
-For more extensive plotting applications consider the following projects:
-
-- `Seaborn <https://seaborn.pydata.org/>`_: "provides
-  a high-level interface for drawing attractive statistical graphics."
-  Integrates well with pandas.
-
-- `HoloViews <https://holoviews.org/>`_
-  and `GeoViews <https://geoviews.org/>`_: "Composable, declarative
-  data structures for building even complex visualizations easily." Includes
-  native support for xarray objects.
-
-- `hvplot <https://hvplot.holoviz.org/>`_: ``hvplot`` makes it very easy to produce
-  dynamic plots (backed by ``Holoviews`` or ``Geoviews``) by adding a ``hvplot``
-  accessor to DataArrays.
-
-- `Cartopy <https://scitools.org.uk/cartopy/docs/latest/>`_: Provides cartographic
-  tools.
-
-Imports
-~~~~~~~
-
-.. jupyter-execute::
-    :hide-code:
-
-    # Use defaults so we don't get gridlines in generated docs
-    import matplotlib as mpl
-
-    mpl.rcdefaults()
-
-The following imports are necessary for all of the examples.
-
-.. jupyter-execute::
-
-    import cartopy.crs as ccrs
-    import matplotlib.pyplot as plt
-    import numpy as np
-    import pandas as pd
-    import xarray as xr
-
-For these examples we'll use the North American air temperature dataset.
-
-.. jupyter-execute::
-
-    airtemps = xr.tutorial.open_dataset("air_temperature")
-    airtemps
-
-.. jupyter-execute::
-
-    # Convert to celsius
-    air = airtemps.air - 273.15
-
-    # copy attributes to get nice figure labels and change Kelvin to Celsius
-    air.attrs = airtemps.air.attrs
-    air.attrs["units"] = "deg C"
-
-.. note::
-    Until :issue:`1614` is solved, you might need to copy over the metadata in ``attrs`` to get informative figure labels (as was done above).
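-
-As an alternative to copying ``attrs`` by hand, you can ask xarray to propagate
-attributes through arithmetic with the global ``keep_attrs`` option (a minimal
-sketch; the units then still read Kelvin and must be updated manually):
-
-.. jupyter-execute::
-
-    # keep the original attributes through the arithmetic
-    with xr.set_options(keep_attrs=True):
-        air_c = airtemps.air - 273.15
-    air_c.attrs["units"] = "deg C"
-    air_c.attrs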
-
-
-DataArrays
-----------
-
-One Dimension
-~~~~~~~~~~~~~
-
-================
- Simple Example
-================
-
-The simplest way to make a plot is to call the :py:func:`DataArray.plot()` method.
-
-.. jupyter-execute::
-
-    air1d = air.isel(lat=10, lon=10)
-    air1d.plot();
-
-Xarray uses the coordinate name along with metadata ``attrs.long_name``,
-``attrs.standard_name``, ``DataArray.name`` and ``attrs.units`` (if available)
-to label the axes.
-The names ``long_name``, ``standard_name`` and ``units`` are copied from the
-`CF-conventions spec <https://cfconventions.org/>`_.
-When choosing names, the order of precedence is ``long_name``, ``standard_name`` and finally ``DataArray.name``.
-The y-axis label in the above plot was constructed from the ``long_name`` and ``units`` attributes of ``air1d``.
-
-.. jupyter-execute::
-
-    air1d.attrs
-
-======================
- Additional Arguments
-======================
-
-Additional arguments are passed directly to the matplotlib function which
-does the work.
-For example, :py:func:`xarray.plot.line` calls
-matplotlib.pyplot.plot_ passing in the index and the array values as x and y, respectively.
-So to make a line plot with blue triangles a matplotlib format string
-can be used:
-
-.. _matplotlib.pyplot.plot: https://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.plot
-
-.. jupyter-execute::
-
-    air1d[:200].plot.line("b-^");
-
-.. note::
-    Not all xarray plotting methods support passing positional arguments
-    to the wrapped matplotlib functions, but they do all
-    support keyword arguments.
-
-Keyword arguments work the same way, and are more explicit.
-
-.. jupyter-execute::
-
-    air1d[:200].plot.line(color="purple", marker="o");
-
-=========================
- Adding to Existing Axis
-=========================
-
-To add the plot to an existing axis pass in the axis as a keyword argument
-``ax``. This works for all xarray plotting methods.
-In this example ``axs`` is an array consisting of the left and right
-axes created by ``plt.subplots``.
-
-.. jupyter-execute::
-
-    fig, axs = plt.subplots(ncols=2)
-
-    print(axs)
-
-    air1d.plot(ax=axs[0])
-    air1d.plot.hist(ax=axs[1]);
-
-On the right is a histogram created by :py:func:`xarray.plot.hist`.
-
-.. _plotting.figsize:
-
-=============================
- Controlling the figure size
-=============================
-
-You can pass a ``figsize`` argument to all xarray's plotting methods to
-control the figure size. For convenience, xarray's plotting methods also
-support the ``aspect`` and ``size`` arguments which control the size of the
-resulting image via the formula ``figsize = (aspect * size, size)``:
-
-.. jupyter-execute::
-
-    air1d.plot(aspect=2, size=3);
-
-This feature also works with :ref:`plotting.faceting`. For facet plots,
-``size`` and ``aspect`` refer to a single panel (so that ``aspect * size``
-gives the width of each facet in inches), while ``figsize`` refers to the
-entire figure (as for matplotlib's ``figsize`` argument).
-
-.. note::
-
-    If ``figsize`` or ``size`` are used, a new figure is created,
-    so this is mutually exclusive with the ``ax`` argument.
-
-.. note::
-
-    The convention used by xarray (``figsize = (aspect * size, size)``) is
-    borrowed from seaborn: it is therefore `not equivalent to matplotlib's`_.
-
-.. _not equivalent to matplotlib's: https://github.com/mwaskom/seaborn/issues/746
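-
-Under that formula, ``aspect=2, size=3`` is simply a 6-by-3-inch figure, so the
-two spellings below are equivalent (a minimal sketch):
-
-.. jupyter-execute::
-
-    # equivalent to air1d.plot(aspect=2, size=3)
-    air1d.plot(figsize=(6, 3));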
-
-.. _plotting.multiplelines:
-
-=========================
- Determine x-axis values
-=========================
-
-By default, dimension coordinates are used for the x-axis (here the time coordinates).
-However, you can also use non-dimension coordinates, MultiIndex levels, and dimensions
-without coordinates along the x-axis. To illustrate this, let's calculate a 'decimal day' (epoch)
-from the time and assign it as a non-dimension coordinate:
-
-.. jupyter-execute::
-
-    decimal_day = (air1d.time - air1d.time[0]) / pd.Timedelta("1d")
-    air1d_multi = air1d.assign_coords(decimal_day=("time", decimal_day.data))
-    air1d_multi
-
-To use ``'decimal_day'`` as x coordinate it must be explicitly specified:
-
-.. jupyter-execute::
-
-    air1d_multi.plot(x="decimal_day");
-
-Creating a new MultiIndex named ``'date'`` from ``'time'`` and ``'decimal_day'``,
-it is also possible to use a MultiIndex level as x-axis:
-
-.. jupyter-execute::
-
-    air1d_multi = air1d_multi.set_index(date=("time", "decimal_day"))
-    air1d_multi.plot(x="decimal_day");
-
-Finally, if a dataset does not have any coordinates, it enumerates all data points:
-
-.. jupyter-execute::
-
-    air1d_multi = air1d_multi.drop_vars(["date", "time", "decimal_day"])
-    air1d_multi.plot();
-
-The same applies to 2D plots below.
-
-====================================================
- Multiple lines showing variation along a dimension
-====================================================
-
-It is possible to make line plots of two-dimensional data by calling :py:func:`xarray.plot.line`
-with appropriate arguments. Consider the 3D variable ``air`` defined above. We can use line
-plots to check the variation of air temperature at three different latitudes along a longitude line:
-
-.. jupyter-execute::
-
-    air.isel(lon=10, lat=[19, 21, 22]).plot.line(x="time");
-
-It is required to explicitly specify either
-
-1. ``x``: the dimension to be used for the x-axis, or
-2. ``hue``: the dimension you want to represent by multiple lines.
-
-Thus, we could have made the previous plot by specifying ``hue='lat'`` instead of ``x='time'``.
-If required, the automatic legend can be turned off using ``add_legend=False``. Alternatively,
-``hue`` can be passed directly to :py:func:`xarray.plot.line` as ``air.isel(lon=10, lat=[19,21,22]).plot.line(hue='lat')``.
-
-
-========================
- Dimension along y-axis
-========================
-
-It is also possible to make line plots such that the data are on the x-axis and a dimension is on the y-axis. This can be done by specifying the appropriate ``y`` keyword argument.
-
-.. jupyter-execute::
-
-    air.isel(time=10, lon=[10, 11]).plot(y="lat", hue="lon");
-
-============
- Step plots
-============
-
-Alternatively, a step plot similar to matplotlib's ``plt.step`` can be
-made using 1D data.
-
-.. jupyter-execute::
-
-    air1d[:20].plot.step(where="mid");
-
-The argument ``where`` defines where the steps should be placed, options are
-``'pre'`` (default), ``'post'``, and ``'mid'``. This is particularly handy
-when plotting data grouped with :py:meth:`Dataset.groupby_bins`.
-
-.. jupyter-execute::
-
-    air_grp = air.mean(["time", "lon"]).groupby_bins("lat", [0, 23.5, 66.5, 90])
-    air_mean = air_grp.mean()
-    air_std = air_grp.std()
-    air_mean.plot.step()
-    (air_mean + air_std).plot.step(ls=":")
-    (air_mean - air_std).plot.step(ls=":")
-    plt.ylim(-20, 30)
-    plt.title("Zonal mean temperature");
-
-In this case, the actual boundaries of the bins are used and the ``where`` argument
-is ignored.
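-
-For unbinned data, you can see how the three ``where`` options shift the steps
-relative to the data points by overlaying them (a minimal sketch; extra keyword
-arguments such as ``label`` are forwarded to matplotlib):
-
-.. jupyter-execute::
-
-    # compare the three step placements on a short slice
-    for where in ["pre", "mid", "post"]:
-        air1d[:10].plot.step(where=where, label=where)
-    plt.legend();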
-
-Other axes kwargs
-~~~~~~~~~~~~~~~~~
-
-
-The keyword arguments ``xincrease`` and ``yincrease`` let you control the axes direction.
-
-.. jupyter-execute::
-
-    air.isel(time=10, lon=[10, 11]).plot.line(
-        y="lat", hue="lon", xincrease=False, yincrease=False
-    );
-
-In addition, one can use ``xscale``, ``yscale`` to set axes scaling;
-``xticks``, ``yticks`` to set axes ticks; and ``xlim``, ``ylim`` to set axes limits.
-These accept the same values as the matplotlib methods ``ax.set_(x,y)scale()``,
-``ax.set_(x,y)ticks()``, ``ax.set_(x,y)lim()``, respectively.
-
-
-Two Dimensions
-~~~~~~~~~~~~~~
-
-================
- Simple Example
-================
-
-The default method :py:meth:`DataArray.plot` calls :py:func:`xarray.plot.pcolormesh`
-when the data is two-dimensional.
-
-.. jupyter-execute::
-
-    air2d = air.isel(time=500)
-    air2d.plot();
-
-All 2d plots in xarray allow the use of the keyword arguments ``yincrease``
-and ``xincrease``.
-
-.. jupyter-execute::
-
-    air2d.plot(yincrease=False);
-
-.. note::
-
-    We use :py:func:`xarray.plot.pcolormesh` as the default two-dimensional plot
-    method because it is more flexible than :py:func:`xarray.plot.imshow`.
-    However, for large arrays, ``imshow`` can be much faster than ``pcolormesh``.
-    If speed is important to you and you are plotting a regular mesh, consider
-    using ``imshow``.
-
-================
- Missing Values
-================
-
-Xarray plots data with :ref:`missing_values`.
-
-.. jupyter-execute::
-
-    bad_air2d = air2d.copy()
-    bad_air2d[dict(lat=slice(0, 10), lon=slice(0, 25))] = np.nan
-    bad_air2d.plot();
-
-========================
- Nonuniform Coordinates
-========================
-
-It's not necessary for the coordinates to be evenly spaced. Both
-:py:func:`xarray.plot.pcolormesh` (default) and :py:func:`xarray.plot.contourf` can
-produce plots with nonuniform coordinates.
-
-.. jupyter-execute::
-
-    b = air2d.copy()
-    # Apply a nonlinear transformation to one of the coords
-    b.coords["lat"] = np.log(b.coords["lat"])
-
-    b.plot();
-
-====================
- Other types of plot
-====================
-
-There are several other options for plotting 2D data.
-
-Contour plot using :py:meth:`DataArray.plot.contour()`
-
-.. jupyter-execute::
-
-    air2d.plot.contour();
-
-Filled contour plot using :py:meth:`DataArray.plot.contourf()`
-
-.. jupyter-execute::
-
-    air2d.plot.contourf();
-
-Surface plot using :py:meth:`DataArray.plot.surface()`
-
-.. jupyter-execute::
-
-    # transpose just to make the example look a bit nicer
-    air2d.T.plot.surface();
-
-====================
- Calling Matplotlib
-====================
-
-Since this is a thin wrapper around matplotlib, all the functionality of
-matplotlib is available.
-
-.. jupyter-execute::
-
-    air2d.plot(cmap=plt.cm.Blues)
-    plt.title("These colors prove North America\nhas fallen in the ocean")
-    plt.ylabel("latitude")
-    plt.xlabel("longitude");
-
-.. note::
-
-    Xarray methods update label information and generally play around with the
-    axes. So any kind of updates to the plot
-    should be done *after* the call to the xarray's plot.
-    In the example below, ``plt.xlabel`` effectively does nothing, since
-    ``air2d.plot()`` updates the xlabel.
-
-    .. jupyter-execute::
-
-        plt.xlabel("Never gonna see this.")
-        air2d.plot();
-
-===========
- Colormaps
-===========
-
-Xarray borrows logic from Seaborn to infer what kind of color map to use. For
-example, consider the original data in Kelvins rather than Celsius:
-
-.. jupyter-execute::
-
-    airtemps.air.isel(time=0).plot();
-
-The Celsius data contain 0, so a diverging color map was used. The
-Kelvins do not have 0, so the default color map was used.
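-
-If the inferred choice is not what you want, you can override it explicitly,
-for example by forcing a diverging colormap around a chosen midpoint with the
-``center`` keyword (a minimal sketch; the midpoint is illustrative):
-
-.. jupyter-execute::
-
-    # force a diverging colormap centered on 0 deg C (273.15 K)
-    airtemps.air.isel(time=0).plot(cmap="RdBu_r", center=273.15);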
-
-.. _robust-plotting:
-
-========
- Robust
-========
-
-Outliers often have an extreme effect on the output of the plot.
-Here we add two bad data points. This affects the color scale,
-washing out the plot.
-
-.. jupyter-execute::
-
-    air_outliers = airtemps.air.isel(time=0).copy()
-    air_outliers[0, 0] = 100
-    air_outliers[-1, -1] = 400
-
-    air_outliers.plot();
-
-This plot shows that we have outliers. The easy way to visualize
-the data without the outliers is to pass the parameter
-``robust=True``.
-This will use the 2nd and 98th
-percentiles of the data to compute the color limits.
-
-.. jupyter-execute::
-
-    air_outliers.plot(robust=True);
-
-Observe that the ranges of the color bar have changed. The arrows on the
-color bar indicate
-that the colors include data points outside the bounds.
-
-====================
- Discrete Colormaps
-====================
-
-It is often useful, when visualizing 2d data, to use a discrete colormap,
-rather than the default continuous colormaps that matplotlib uses. The
-``levels`` keyword argument can be used to generate plots with discrete
-colormaps. For example, to make a plot with 8 discrete color intervals:
-
-.. jupyter-execute::
-
-    air2d.plot(levels=8);
-
-It is also possible to use a list of levels to specify the boundaries of the
-discrete colormap:
-
-.. jupyter-execute::
-
-    air2d.plot(levels=[0, 12, 18, 30]);
-
-You can also specify a list of discrete colors through the ``colors`` argument:
-
-.. jupyter-execute::
-
-    flatui = ["#9b59b6", "#3498db", "#95a5a6", "#e74c3c", "#34495e", "#2ecc71"]
-    air2d.plot(levels=[0, 12, 18, 30], colors=flatui);
-
-Finally, if you have `Seaborn <https://seaborn.pydata.org/>`_
-installed, you can also specify a seaborn color palette to the ``cmap``
-argument. Note that ``levels`` *must* be specified with seaborn color palettes
-if using ``imshow`` or ``pcolormesh`` (but not with ``contour`` or ``contourf``,
-since levels are chosen automatically).
-
-.. jupyter-execute::
-
-    air2d.plot(levels=10, cmap="husl");
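-
-Whether the colormap is continuous or discrete, you can also pin the color
-limits yourself with ``vmin``/``vmax`` instead of relying on ``robust=True``
-(a minimal sketch using the outlier data from above; the limits are illustrative):
-
-.. jupyter-execute::
-
-    # explicit color limits instead of the robust percentiles
-    air_outliers.plot(vmin=240, vmax=310);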
-
-.. _plotting.faceting:
-
-Faceting
-~~~~~~~~
-
-Faceting here refers to splitting an array along one or two dimensions and
-plotting each group.
-Xarray's basic plotting is useful for plotting two dimensional arrays. What
-about three or four dimensional arrays? That's where facets become helpful.
-The general approach to plotting here is called “small multiples”, where the
-same kind of plot is repeated multiple times, and the specific use of small
-multiples to display the same relationship conditioned on one or more other
-variables is often called a “trellis plot”.
-
-Consider the temperature data set. There are 4 observations per day for two
-years which makes for 2920 values along the time dimension.
-One way to visualize this data is to make a
-separate plot for each time period.
-
-The faceted dimension should not have too many values;
-faceting on the time dimension will produce 2920 plots. That's
-too much to be helpful. To handle this situation try performing
-an operation that reduces the size of the data in some way. For example, we
-could compute the average air temperature for each month and reduce the
-size of this dimension from 2920 -> 12. A simpler way is
-to just take a slice on that dimension.
-So let's use a slice to pick 6 times throughout the first year.
-
-.. jupyter-execute::
-
-    t = air.isel(time=slice(0, 365 * 4, 250))
-    t.coords
-
-================
- Simple Example
-================
-
-The easiest way to create faceted plots is to pass in ``row`` or ``col``
-arguments to the xarray plotting methods/functions. This returns a
-:py:class:`xarray.plot.FacetGrid` object.
-
-.. jupyter-execute::
-
-    g_simple = t.plot(x="lon", y="lat", col="time", col_wrap=3);
-
-Faceting also works for line plots.
-
-.. jupyter-execute::
-
-    g_simple_line = t.isel(lat=slice(0, None, 4)).plot(
-        x="lon", hue="lat", col="time", col_wrap=3
-    );
-
-===============
- 4 dimensional
-===============
-
-For 4 dimensional arrays we can use the rows and columns of the grids.
-Here we create a 4 dimensional array by taking the original data and adding
-a fixed amount. Now we can see how the temperature maps would compare if
-one were much hotter.
-
-.. jupyter-execute::
-
-    t2 = t.isel(time=slice(0, 2))
-    t4d = xr.concat([t2, t2 + 40], pd.Index(["normal", "hot"], name="fourth_dim"))
-    # This is a 4d array
-    t4d.coords
-
-    t4d.plot(x="lon", y="lat", col="time", row="fourth_dim");
-
-================
- Other features
-================
-
-Faceted plotting supports other arguments common to xarray 2d plots.
-
-.. jupyter-execute::
-
-    hasoutliers = t.isel(time=slice(0, 5)).copy()
-    hasoutliers[0, 0, 0] = -100
-    hasoutliers[-1, -1, -1] = 400
-
-    g = hasoutliers.plot.pcolormesh(
-        x="lon",
-        y="lat",
-        col="time",
-        col_wrap=3,
-        robust=True,
-        cmap="viridis",
-        cbar_kwargs={"label": "this has outliers"},
-    )
-
-===================
- FacetGrid Objects
-===================
-
-The object returned, ``g`` in the above examples, is a :py:class:`~xarray.plot.FacetGrid` object
-that links a :py:class:`DataArray` to a matplotlib figure with a particular structure.
-This object can be used to control the behavior of the multiple plots.
-It borrows an API and code from `Seaborn's FacetGrid
-<https://seaborn.pydata.org/tutorial/axis_grids.html>`_.
-The structure is contained within the ``axs`` and ``name_dicts``
-attributes, both 2d NumPy object arrays.
-
-.. jupyter-execute::
-
-    g.axs
-
-.. jupyter-execute::
-
-    g.name_dicts
-
-It's possible to select the :py:class:`xarray.DataArray` or
-:py:class:`xarray.Dataset` corresponding to the FacetGrid through the
-``name_dicts``.
-
-.. jupyter-execute::
-
-    g.data.loc[g.name_dicts[0, 0]]
-
-Here is an example of using the lower level API and then modifying the axes after
-they have been plotted.
-
-.. jupyter-execute::
-
-    g = t.plot.imshow(x="lon", y="lat", col="time", col_wrap=3, robust=True)
-
-    for i, ax in enumerate(g.axs.flat):
-        ax.set_title("Air Temperature %d" % i)
-
-    bottomright = g.axs[-1, -1]
-    bottomright.annotate("bottom right", (240, 40));
-
-
-:py:class:`~xarray.plot.FacetGrid` objects have methods that let you customize the automatically generated
-axis labels, axis ticks and plot titles. See :py:meth:`~xarray.plot.FacetGrid.set_titles`,
-:py:meth:`~xarray.plot.FacetGrid.set_xlabels`, :py:meth:`~xarray.plot.FacetGrid.set_ylabels` and
-:py:meth:`~xarray.plot.FacetGrid.set_ticks` for more information.
-Plotting functions can be applied to each subset of the data by calling
-:py:meth:`~xarray.plot.FacetGrid.map_dataarray` or to each subplot by calling :py:meth:`~xarray.plot.FacetGrid.map`.
-
-TODO: add an example of using the ``map`` method to plot dataset variables
-(e.g., with ``plt.quiver``).
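-
-While that TODO stands, here is a small sketch of the title and label helpers
-mentioned above (the template string is illustrative):
-
-.. jupyter-execute::
-
-    g = t.plot(x="lon", y="lat", col="time", col_wrap=3)
-    # show only the coordinate value in each panel title
-    g.set_titles(template="{value}")
-    g.set_xlabels("longitude");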
-
-.. _plot-dataset:
-
-Datasets
---------
-
-Xarray has limited support for plotting Dataset variables against each other.
-Consider this dataset:
-
-.. jupyter-execute::
-
-    ds = xr.tutorial.scatter_example_dataset(seed=42)
-    ds
-
-
-Scatter
-~~~~~~~
-
-Let's plot the ``A`` DataArray as a function of the ``y`` coord:
-
-.. jupyter-execute::
-
-    with xr.set_options(display_expand_data=False):
-        display(ds.A)
-
-.. jupyter-execute::
-
-    ds.A.plot.scatter(x="y");
-
-The same plot can be displayed using the dataset:
-
-.. jupyter-execute::
-
-    ds.plot.scatter(x="y", y="A");
-
-Now suppose we want to scatter the ``A`` DataArray against the ``B`` DataArray:
-
-.. jupyter-execute::
-
-    ds.plot.scatter(x="A", y="B");
-
-The ``hue`` kwarg lets you vary the color by variable value:
-
-.. jupyter-execute::
-
-    ds.plot.scatter(x="A", y="B", hue="w");
-
-You can force a legend instead of a colorbar by setting ``add_legend=True, add_colorbar=False``.
-
-.. jupyter-execute::
-
-    ds.plot.scatter(x="A", y="B", hue="w", add_legend=True, add_colorbar=False);
-
-.. jupyter-execute::
-
-    ds.plot.scatter(x="A", y="B", hue="w", add_legend=False, add_colorbar=True);
-
-The ``markersize`` kwarg lets you vary the point size by variable value.
-You can additionally pass ``size_norm`` to control how the variable's values are mapped to point sizes.
-
-.. jupyter-execute::
-
-    ds.plot.scatter(x="A", y="B", hue="y", markersize="z");
-
-The ``z`` kwarg lets you plot the data along the z-axis as well.
-
-.. jupyter-execute::
-
-    ds.plot.scatter(x="A", y="B", z="z", hue="y", markersize="x");
-
-Faceting is also possible:
-
-.. jupyter-execute::
-
-    ds.plot.scatter(x="A", y="B", hue="y", markersize="x", row="x", col="w");
-
-And adding the z-axis:
-
-.. jupyter-execute::
-
-    ds.plot.scatter(x="A", y="B", z="z", hue="y", markersize="x", row="x", col="w");
-
-For more advanced scatter plots, we recommend converting the relevant data variables
-to a pandas DataFrame and using the extensive plotting capabilities of ``seaborn``.
-
-Quiver
-~~~~~~
-
-Visualizing vector fields is supported with quiver plots:
-
-.. jupyter-execute::
-
-    ds.isel(w=1, z=1).plot.quiver(x="x", y="y", u="A", v="B");
-
-
-where ``u`` and ``v`` denote the x and y direction components of the arrow vectors. Again, faceting is also possible:
-
-.. jupyter-execute::
-
-    ds.plot.quiver(x="x", y="y", u="A", v="B", col="w", row="z", scale=4);
-
-``scale`` is required for faceted quiver plots.
-The scale determines the number of data units per arrow length unit, i.e. a smaller scale parameter makes the arrow longer.
-
-Streamplot
-~~~~~~~~~~
-
-Visualizing vector fields is also supported with streamline plots:
-
-.. jupyter-execute::
-
-    ds.isel(w=1, z=1).plot.streamplot(x="x", y="y", u="A", v="B");
-
-
-where ``u`` and ``v`` denote the x and y direction components of the vectors tangent to the streamlines.
-Again, faceting is also possible:
-
-.. jupyter-execute::
-
-    ds.plot.streamplot(x="x", y="y", u="A", v="B", col="w", row="z");
-
-.. _plot-maps:
-
-Maps
-----
-
-To follow this section you'll need to have Cartopy installed and working.
-
-This script will plot the air temperature on a map.
-
-.. jupyter-execute::
-    :stderr:
-
-    air = xr.tutorial.open_dataset("air_temperature").air
-
-    p = air.isel(time=0).plot(
-        subplot_kws=dict(projection=ccrs.Orthographic(-80, 35), facecolor="gray"),
-        transform=ccrs.PlateCarree(),
-    )
-    p.axes.set_global()
-
-    p.axes.coastlines();
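-
-Additional Cartopy features can be layered onto the returned axes in the same
-way (a minimal sketch; the choice of feature is illustrative):
-
-.. jupyter-execute::
-
-    import cartopy.feature as cfeature
-
-    p = air.isel(time=0).plot(
-        subplot_kws=dict(projection=ccrs.Orthographic(-80, 35)),
-        transform=ccrs.PlateCarree(),
-    )
-    p.axes.set_global()
-    p.axes.coastlines()
-    # add country borders on top of the coastlines
-    p.axes.add_feature(cfeature.BORDERS);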
-
-When faceting on maps, the projection can be transferred to the ``plot``
-function using the ``subplot_kws`` keyword. The axes for the subplots created
-by faceting are accessible in the object returned by ``plot``:
-
-.. jupyter-execute::
-
-    p = air.isel(time=[0, 4]).plot(
-        transform=ccrs.PlateCarree(),
-        col="time",
-        subplot_kws={"projection": ccrs.Orthographic(-80, 35)},
-    )
-    for ax in p.axs.flat:
-        ax.coastlines()
-        ax.gridlines()
-
-
-Details
--------
-
-Ways to Use
-~~~~~~~~~~~
-
-There are three ways to use the xarray plotting functionality:
-
-1. Use ``plot`` as a convenience method for a DataArray.
-
-2. Access a specific plotting method from the ``plot`` attribute of a
-   DataArray.
-
-3. Directly from the xarray plot submodule.
-
-These are provided for user convenience; they all call the same code.
-
-.. jupyter-execute::
-
-    da = xr.DataArray(range(5))
-    fig, axs = plt.subplots(ncols=2, nrows=2)
-    da.plot(ax=axs[0, 0])
-    da.plot.line(ax=axs[0, 1])
-    xr.plot.plot(da, ax=axs[1, 0])
-    xr.plot.line(da, ax=axs[1, 1]);
-
-Here the output is the same. Since the data is 1 dimensional the line plot
-was used.
-
-The convenience method :py:meth:`xarray.DataArray.plot` dispatches to an appropriate
-plotting function based on the dimensions of the ``DataArray`` and whether
-the coordinates are sorted and uniformly spaced. This table
-describes what gets plotted:
-
-=============== ===========================
-Dimensions      Plotting function
-=============== ===========================
-1               :py:func:`xarray.plot.line`
-2               :py:func:`xarray.plot.pcolormesh`
-Anything else   :py:func:`xarray.plot.hist`
-=============== ===========================
-
-Coordinates
-~~~~~~~~~~~
-
-If you'd like to find out what's really going on in the coordinate system,
-read on.
-
-.. jupyter-execute::
-
-    a0 = xr.DataArray(np.zeros((4, 3, 2)), dims=("y", "x", "z"), name="temperature")
-    a0[0, 0, 0] = 1
-    a = a0.isel(z=0)
-    a
-
-The plot will produce an image corresponding to the values of the array.
-Hence the top left pixel will be a different color than the others.
-Before reading on, you may want to look at the coordinates and
-think carefully about what the limits, labels, and orientation for
-each of the axes should be.
-
-.. jupyter-execute::
-
-    a.plot();
-
-It may seem strange that
-the values on the y axis are decreasing with -0.5 on the top. This is because
-the pixels are centered over their coordinates, and the
-axis labels and ranges correspond to the values of the
-coordinates.
-
-Multidimensional coordinates
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-See also: :ref:`/examples/multidimensional-coords.ipynb`.
-
-You can plot irregular grids defined by multidimensional coordinates with
-xarray, but you'll have to tell the plot function to use these coordinates
-instead of the default ones:
-
-.. jupyter-execute::
-
-    lon, lat = np.meshgrid(np.linspace(-20, 20, 5), np.linspace(0, 30, 4))
-    lon += lat / 10
-    lat += lon / 10
-    da = xr.DataArray(
-        np.arange(20).reshape(4, 5),
-        dims=["y", "x"],
-        coords={"lat": (("y", "x"), lat), "lon": (("y", "x"), lon)},
-    )
-
-    da.plot.pcolormesh(x="lon", y="lat");
-
-Note that in this case, xarray still follows the pixel centered convention.
-This might be undesirable in some cases, for example when your data is defined
-on a polar projection (:issue:`781`). This is why the default is to not follow
-this convention when plotting on a map:
-
-.. jupyter-execute::
-    :stderr:
-
-    ax = plt.subplot(projection=ccrs.PlateCarree())
-    da.plot.pcolormesh(x="lon", y="lat", ax=ax)
-    ax.scatter(lon, lat, transform=ccrs.PlateCarree())
-    ax.coastlines()
-    ax.gridlines(draw_labels=True);
-
-You can however decide to infer the cell boundaries and use the
-``infer_intervals`` keyword:
-
-.. jupyter-execute::
-
-    ax = plt.subplot(projection=ccrs.PlateCarree())
-    da.plot.pcolormesh(x="lon", y="lat", ax=ax, infer_intervals=True)
-    ax.scatter(lon, lat, transform=ccrs.PlateCarree())
-    ax.coastlines()
-    ax.gridlines(draw_labels=True);
-
-.. note::
-    The data model of xarray does not support datasets with `cell boundaries`_
-    yet. If you want to use these coordinates, you'll have to make the plots
-    outside the xarray framework.
-
-.. _cell boundaries: https://cfconventions.org/cf-conventions/v1.6.0/cf-conventions.html#cell-boundaries
-
-One can also make line plots with multidimensional coordinates. In this case, ``hue`` must be a dimension name, not a coordinate name.
-
-.. jupyter-execute::
+.. toctree::
+   :maxdepth: 2
-
-    f, ax = plt.subplots(2, 1)
-    da.plot.line(x="lon", hue="y", ax=ax[0])
-    da.plot.line(x="lon", hue="x", ax=ax[1]);
+
+   plotting-lines
+   plotting-2d
+   plotting-scatter
+   plotting-faceting
diff --git a/xarray/static/html/icons-svg-inline.html b/xarray/static/html/icons-svg-inline.html
index b0e837a26cd..7c0d3f552d4 100644
--- a/xarray/static/html/icons-svg-inline.html
+++ b/xarray/static/html/icons-svg-inline.html
@@ -1,15 +1,29 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+