
Commit ea9f02b

mjwillson, pre-commit-ci[bot], shoyer and kmuehlbauer authored
Support for DataTree.to_netcdf to write to a file-like object or bytes (#10571)
* Improve reading and writing of netCDF files to/from bytes or file-like objects.
* Allow use of the h5netcdf engine when writing to file-like objects (such as BytesIO), and stop forcing use of the scipy backend in this case (which is incompatible with groups and DataTree). Make h5netcdf the default engine for DataTree.to_netcdf rather than leaving the choice of default up to Dataset.to_netcdf.
* Allow use of the h5netcdf engine to read from a bytes object.
* Allow DataTree.to_netcdf to return bytes when the filepath argument is omitted (similar to Dataset.to_netcdf).
* Add a test for Dataset.to_netcdf(engine='h5netcdf') and fix a bug where bytes were returned before the h5py.File had been closed, which appears to be needed for it to finish writing a valid file. This required a further workaround to prevent the BytesIO being closed by the scipy backend when it is used in a similar way.
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
* Move close() fixes into the scipy backends. Also update the h5netcdf backend to silence warnings from not closing files that were already open (which are issued from CachingFileManager).
* Fix type annotations.
* Fix error from ArviZ.
* Better typing and assorted fixes.
* Fixes per review; also use memoryview for the return value.
* One more test.
* Remove unnecessary use of BytesIO.
* Remove inadvertent print().
* Fix typing.
* Don't silently override engine in to_netcdf.
* Use a type alias instead of refining the filename_or_obj type everywhere.
* Fix grammar.

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Stephan Hoyer <[email protected]>
Co-authored-by: Kai Mühlbauer <[email protected]>
1 parent 938e186 commit ea9f02b

19 files changed: +464 additions, -162 deletions
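
The headline change from the commit message, in code form. This is a hedged sketch: the exact return type depends on the engine (the typing changes below allow bytes or memoryview), and the h5netcdf calls assume that optional dependency is installed.

    import xarray as xr

    ds = xr.Dataset({"a": ("x", [1, 2, 3])})

    # Writing without a target via h5netcdf now returns an in-memory buffer;
    # per the commit, scipy still returns bytes while h5netcdf uses memoryview.
    mem = ds.to_netcdf(engine="h5netcdf")

    # Bytes and memoryview inputs are now accepted directly when reading back.
    reopened = xr.open_dataset(mem)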

doc/whats-new.rst

Lines changed: 2 additions & 0 deletions
@@ -13,6 +13,8 @@ v2025.07.2 (unreleased)
 New Features
 ~~~~~~~~~~~~
 
+- :py:meth:`DataTree.to_netcdf` can now write to a file-like object, or return bytes if called without a filepath. (:issue:`10570`)
+  By `Matthew Willson <https://github.com/mjwillson>`_.
 
 Breaking changes
 ~~~~~~~~~~~~~~~~
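
A minimal sketch of the new entry in practice (dataset contents invented for illustration; writing netCDF4 with groups assumes h5netcdf is installed):

    import io

    import xarray as xr

    tree = xr.DataTree.from_dict(
        {
            "/": xr.Dataset({"root_var": ("x", [1, 2, 3])}),
            "/child": xr.Dataset({"child_var": ("y", [4.0, 5.0])}),
        }
    )

    # New: write the whole tree (groups included) to a file-like object.
    buffer = io.BytesIO()
    tree.to_netcdf(buffer, engine="h5netcdf")

    # New: omit the filepath and get the serialized file back directly.
    data = tree.to_netcdf()

    # The returned buffer can be read back without touching disk.
    roundtripped = xr.open_datatree(data)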

xarray/backends/api.py

Lines changed: 52 additions & 37 deletions
@@ -11,7 +11,7 @@
     Sequence,
 )
 from functools import partial
-from io import BytesIO
+from io import IOBase
 from itertools import starmap
 from numbers import Number
 from typing import (
@@ -31,6 +31,8 @@
 from xarray.backends.common import (
     AbstractDataStore,
     ArrayWriter,
+    BytesIOProxy,
+    T_PathFileOrDataStore,
     _find_absolute_paths,
     _normalize_path,
 )
@@ -503,7 +505,7 @@ def _datatree_from_backend_datatree(
 
 
 def open_dataset(
-    filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore,
+    filename_or_obj: T_PathFileOrDataStore,
     *,
     engine: T_Engine = None,
     chunks: T_Chunks = None,
@@ -533,12 +535,13 @@
 
     Parameters
     ----------
-    filename_or_obj : str, Path, file-like or DataStore
+    filename_or_obj : str, Path, file-like, bytes, memoryview or DataStore
         Strings and Path objects are interpreted as a path to a netCDF file
         or an OpenDAP URL and opened with python-netCDF4, unless the filename
         ends with .gz, in which case the file is gunzipped and opened with
-        scipy.io.netcdf (only netCDF3 supported). Byte-strings or file-like
-        objects are opened by scipy.io.netcdf (netCDF3) or h5py (netCDF4/HDF).
+        scipy.io.netcdf (only netCDF3 supported). Bytes, memoryview and
+        file-like objects are opened by scipy.io.netcdf (netCDF3) or h5netcdf
+        (netCDF4).
     engine : {"netcdf4", "scipy", "pydap", "h5netcdf", "zarr", None}\
         , installed backend \
         or subclass of xarray.backends.BackendEntrypoint, optional
@@ -743,7 +746,7 @@ def open_dataset(
 
 
 def open_dataarray(
-    filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore,
+    filename_or_obj: T_PathFileOrDataStore,
     *,
     engine: T_Engine = None,
     chunks: T_Chunks = None,
@@ -774,12 +777,13 @@
 
     Parameters
     ----------
-    filename_or_obj : str, Path, file-like or DataStore
+    filename_or_obj : str, Path, file-like, bytes, memoryview or DataStore
         Strings and Path objects are interpreted as a path to a netCDF file
         or an OpenDAP URL and opened with python-netCDF4, unless the filename
         ends with .gz, in which case the file is gunzipped and opened with
-        scipy.io.netcdf (only netCDF3 supported). Byte-strings or file-like
-        objects are opened by scipy.io.netcdf (netCDF3) or h5py (netCDF4/HDF).
+        scipy.io.netcdf (only netCDF3 supported). Bytes, memoryview and
+        file-like objects are opened by scipy.io.netcdf (netCDF3) or h5netcdf
+        (netCDF4).
     engine : {"netcdf4", "scipy", "pydap", "h5netcdf", "zarr", None}\
         , installed backend \
         or subclass of xarray.backends.BackendEntrypoint, optional
@@ -970,7 +974,7 @@ def open_dataarray(
 
 
 def open_datatree(
-    filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore,
+    filename_or_obj: T_PathFileOrDataStore,
     *,
     engine: T_Engine = None,
     chunks: T_Chunks = None,
@@ -1001,8 +1005,10 @@
 
     Parameters
     ----------
-    filename_or_obj : str, Path, file-like, or DataStore
-        Strings and Path objects are interpreted as a path to a netCDF file or Zarr store.
+    filename_or_obj : str, Path, file-like, bytes or DataStore
+        Strings and Path objects are interpreted as a path to a netCDF file or
+        Zarr store. Bytes and memoryview objects are interpreted as file
+        contents.
     engine : {"netcdf4", "h5netcdf", "zarr", None}, \
         installed backend or xarray.backends.BackendEntrypoint, optional
         Engine to use when reading files. If not provided, the default engine
@@ -1208,7 +1214,7 @@ def open_datatree(
 
 
 def open_groups(
-    filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore,
+    filename_or_obj: T_PathFileOrDataStore,
     *,
     engine: T_Engine = None,
     chunks: T_Chunks = None,
@@ -1243,8 +1249,10 @@
 
     Parameters
     ----------
-    filename_or_obj : str, Path, file-like, or DataStore
-        Strings and Path objects are interpreted as a path to a netCDF file or Zarr store.
+    filename_or_obj : str, Path, file-like, bytes, memoryview or DataStore
+        Strings and Path objects are interpreted as a path to a netCDF file or
+        Zarr store. Bytes and memoryview objects are interpreted as file
+        contents.
     engine : {"netcdf4", "h5netcdf", "zarr", None}, \
         installed backend or xarray.backends.BackendEntrypoint, optional
         Engine to use when reading files. If not provided, the default engine
@@ -1780,7 +1788,7 @@ def to_netcdf(
 ) -> tuple[ArrayWriter, AbstractDataStore]: ...
 
 
-# path=None writes to bytes
+# path=None writes to bytes or memoryview, depending on store
 @overload
 def to_netcdf(
     dataset: Dataset,
@@ -1795,7 +1803,7 @@
     multifile: Literal[False] = False,
     invalid_netcdf: bool = False,
     auto_complex: bool | None = None,
-) -> bytes: ...
+) -> bytes | memoryview: ...
 
 
 # compute=False returns dask.Delayed
@@ -1821,7 +1829,7 @@
 @overload
 def to_netcdf(
     dataset: Dataset,
-    path_or_file: str | os.PathLike,
+    path_or_file: str | os.PathLike | IOBase,
     mode: NetcdfWriteModes = "w",
     format: T_NetcdfTypes | None = None,
     group: str | None = None,
@@ -1877,7 +1885,7 @@
 @overload
 def to_netcdf(
     dataset: Dataset,
-    path_or_file: str | os.PathLike | None,
+    path_or_file: str | os.PathLike | IOBase | None,
     mode: NetcdfWriteModes = "w",
     format: T_NetcdfTypes | None = None,
     group: str | None = None,
@@ -1888,12 +1896,12 @@
     multifile: bool = False,
     invalid_netcdf: bool = False,
    auto_complex: bool | None = None,
-) -> tuple[ArrayWriter, AbstractDataStore] | bytes | Delayed | None: ...
+) -> tuple[ArrayWriter, AbstractDataStore] | bytes | memoryview | Delayed | None: ...
 
 
 def to_netcdf(
     dataset: Dataset,
-    path_or_file: str | os.PathLike | None = None,
+    path_or_file: str | os.PathLike | IOBase | None = None,
     mode: NetcdfWriteModes = "w",
     format: T_NetcdfTypes | None = None,
     group: str | None = None,
@@ -1904,7 +1912,7 @@
     multifile: bool = False,
     invalid_netcdf: bool = False,
     auto_complex: bool | None = None,
-) -> tuple[ArrayWriter, AbstractDataStore] | bytes | Delayed | None:
+) -> tuple[ArrayWriter, AbstractDataStore] | bytes | memoryview | Delayed | None:
     """This function creates an appropriate datastore for writing a dataset to
     disk as a netCDF file
 
@@ -1918,26 +1926,27 @@
     if encoding is None:
         encoding = {}
 
-    if path_or_file is None:
+    if isinstance(path_or_file, str):
+        if engine is None:
+            engine = _get_default_engine(path_or_file)
+        path_or_file = _normalize_path(path_or_file)
+    else:
+        # writing to bytes/memoryview or a file-like object
         if engine is None:
+            # TODO: only use 'scipy' if format is None or a netCDF3 format
             engine = "scipy"
-        elif engine != "scipy":
+        elif engine not in ("scipy", "h5netcdf"):
             raise ValueError(
-                "invalid engine for creating bytes with "
-                f"to_netcdf: {engine!r}. Only the default engine "
-                "or engine='scipy' is supported"
+                "invalid engine for creating bytes/memoryview or writing to a "
+                f"file-like object with to_netcdf: {engine!r}. Only "
+                "engine=None, engine='scipy' and engine='h5netcdf' is "
+                "supported."
             )
         if not compute:
            raise NotImplementedError(
                 "to_netcdf() with compute=False is not yet implemented when "
                 "returning bytes"
             )
-    elif isinstance(path_or_file, str):
-        if engine is None:
-            engine = _get_default_engine(path_or_file)
-        path_or_file = _normalize_path(path_or_file)
-    else:  # file-like object
-        engine = "scipy"
 
     # validate Dataset keys, DataArray names, and attr keys/values
     _validate_dataset_names(dataset)
@@ -1962,7 +1971,11 @@
             f"is not currently supported with dask's {scheduler} scheduler"
         )
 
-    target = path_or_file if path_or_file is not None else BytesIO()
+    if path_or_file is None:
+        target = BytesIOProxy()
+    else:
+        target = path_or_file  # type: ignore[assignment]
+
     kwargs = dict(autoclose=True) if autoclose else {}
     if invalid_netcdf:
         if engine == "h5netcdf":
@@ -2002,17 +2015,19 @@
 
         writes = writer.sync(compute=compute)
 
-        if isinstance(target, BytesIO):
-            store.sync()
-            return target.getvalue()
     finally:
         if not multifile and compute:  # type: ignore[redundant-expr]
             store.close()
 
+    if path_or_file is None:
+        assert isinstance(target, BytesIOProxy)  # created in this function
+        return target.getvalue_or_getbuffer()
+
     if not compute:
         import dask
 
         return dask.delayed(_finalize_store)(writes, store)
+
     return None
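The practical effect of the revised dispatch above, sketched with a toy Dataset (a hedged illustration; the h5netcdf call assumes that engine is installed):

    import io

    import xarray as xr

    ds = xr.Dataset({"a": ("x", [1, 2, 3])})

    # path=None still defaults to scipy (netCDF3) and returns the buffer.
    payload = ds.to_netcdf()

    # New: h5netcdf may be requested for file-like targets, enabling
    # netCDF4 features such as groups.
    buf = io.BytesIO()
    ds.to_netcdf(buf, engine="h5netcdf")

    # Engines other than scipy/h5netcdf are still rejected for in-memory
    # writes, per the ValueError raised above.
    try:
        ds.to_netcdf(io.BytesIO(), engine="netcdf4")
    except ValueError as err:
        print(err)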

xarray/backends/common.py

Lines changed: 58 additions & 6 deletions
@@ -4,9 +4,18 @@
 import os
 import time
 import traceback
-from collections.abc import Hashable, Iterable, Mapping, Sequence
+from collections.abc import Callable, Hashable, Iterable, Mapping, Sequence
+from dataclasses import dataclass
 from glob import glob
-from typing import TYPE_CHECKING, Any, ClassVar, TypeVar, Union, overload
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    ClassVar,
+    Generic,
+    TypeVar,
+    Union,
+    overload,
+)
 
 import numpy as np
 import pandas as pd
@@ -188,6 +197,24 @@ def _normalize_path_list(
         return _normalize_path_list(paths)
 
 
+BytesOrMemory = TypeVar("BytesOrMemory", bytes, memoryview)
+
+
+@dataclass
+class BytesIOProxy(Generic[BytesOrMemory]):
+    """Proxy object for a write that returns either bytes or a memoryview."""
+
+    # TODO: remove this in favor of BytesIO when Dataset.to_netcdf() stops
+    # returning bytes from the scipy engine
+    getvalue: Callable[[], BytesOrMemory] | None = None
+
+    def getvalue_or_getbuffer(self) -> BytesOrMemory:
+        """Get the value of this write as bytes or memory."""
+        if self.getvalue is None:
+            raise ValueError("must set getvalue before fetching value")
+        return self.getvalue()
+
+
 def _open_remote_file(file, mode, storage_options=None):
     import fsspec
 
@@ -324,6 +351,11 @@ def __exit__(self, exception_type, exception_value, traceback):
         self.close()
 
 
+T_PathFileOrDataStore = (
+    str | os.PathLike[Any] | ReadBuffer | bytes | memoryview | AbstractDataStore
+)
+
+
 class ArrayWriter:
     __slots__ = ("lock", "regions", "sources", "targets")
 
@@ -705,7 +737,12 @@ def __repr__(self) -> str:
 
     def open_dataset(
         self,
-        filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore,
+        filename_or_obj: str
+        | os.PathLike[Any]
+        | ReadBuffer
+        | bytes
+        | memoryview
+        | AbstractDataStore,
         *,
         drop_variables: str | Iterable[str] | None = None,
     ) -> Dataset:
@@ -717,7 +754,12 @@
 
     def guess_can_open(
         self,
-        filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore,
+        filename_or_obj: str
+        | os.PathLike[Any]
+        | ReadBuffer
+        | bytes
+        | memoryview
+        | AbstractDataStore,
     ) -> bool:
         """
         Backend open_dataset method used by Xarray in :py:func:`~xarray.open_dataset`.
@@ -727,7 +769,12 @@
 
     def open_datatree(
         self,
-        filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore,
+        filename_or_obj: str
+        | os.PathLike[Any]
+        | ReadBuffer
+        | bytes
+        | memoryview
+        | AbstractDataStore,
         *,
         drop_variables: str | Iterable[str] | None = None,
     ) -> DataTree:
@@ -739,7 +786,12 @@
 
     def open_groups_as_dict(
         self,
-        filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore,
+        filename_or_obj: str
+        | os.PathLike[Any]
+        | ReadBuffer
+        | bytes
+        | memoryview
+        | AbstractDataStore,
         *,
         drop_variables: str | Iterable[str] | None = None,
     ) -> dict[str, Dataset]:
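
The diff does not show how backends populate the proxy, so the wiring below is illustrative only: a store assigns a zero-argument callable to `getvalue`, and `to_netcdf` collects the result after closing the store.

    from io import BytesIO

    from xarray.backends.common import BytesIOProxy

    target = BytesIOProxy()

    # A store writing into its own in-memory buffer exposes the finished
    # contents by assigning its buffer's getvalue method...
    buffer = BytesIO()
    buffer.write(b"CDF\x01...")  # stand-in for real netCDF bytes
    target.getvalue = buffer.getvalue

    # ...and the caller fetches the value once writing has finished. Calling
    # this before `getvalue` is assigned raises ValueError, per the class.
    print(target.getvalue_or_getbuffer())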

xarray/backends/file_manager.py

Lines changed: 5 additions & 2 deletions
@@ -339,8 +339,11 @@
 class DummyFileManager(FileManager):
     """FileManager that simply wraps an open file in the FileManager interface."""
 
-    def __init__(self, value):
+    def __init__(self, value, *, close=None):
+        if close is None:
+            close = value.close
         self._value = value
+        self._close = close
 
     def acquire(self, needs_lock=True):
         del needs_lock  # ignored
@@ -353,4 +356,4 @@ def acquire_context(self, needs_lock=True):
 
     def close(self, needs_lock=True):
         del needs_lock  # ignored
-        self._value.close()
+        self._close()
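
A sketch of what the new `close` hook enables; this usage is inferred from the commit message's workaround for scipy closing the caller's BytesIO, not copied from the diff:

    from io import BytesIO

    from xarray.backends.file_manager import DummyFileManager

    buffer = BytesIO()

    # Supplying a no-op close keeps the wrapped buffer usable after the
    # store is closed; by default DummyFileManager would call buffer.close().
    manager = DummyFileManager(buffer, close=lambda: None)

    f = manager.acquire()
    f.write(b"header")
    manager.close()  # runs the injected callback; the buffer stays open

    print(buffer.getvalue())  # b'header'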
