-
Couldn't load subscription status.
- Fork 43
Zarr support (backend, in esmvalcore.preprocessor._io.py)
#2785
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 68 commits
e347f40
5c32b55
682f46d
81c254c
6a02757
84412ab
1f8e127
5b97169
8bcc15f
c0b049c
e5f8c4e
9265b0d
4be6152
28f647f
6da4183
fb7712a
95a92c9
872be18
971cf34
0eeeb50
f5d13c8
fa8b90a
cccdb39
1618076
e2ed41c
fe7326e
39df34e
2b44ac9
caff216
d48418c
0909770
e87b12b
37d8a31
0d446af
37fcfff
94d8677
7ac7b45
caa3657
b4c6b6f
48db5f3
0afcec7
0c4a16f
72d79c2
8e54f1e
1572fff
b1fe4b8
f5c5979
8cddb55
0d71de7
2ab8fc0
b01b578
ea9377a
72d87bc
76b32b4
90c8963
7af2ec4
c151b57
ab78052
b5c3301
d514b67
2852381
49fb643
6a554d8
683b6e8
f2923e6
8c49e20
8b6f221
63411cb
84a33f2
8909b7d
eff8956
a2e31ab
a387558
37266da
e13a19e
cef79ce
63b817f
71ebe4e
171ea74
464c9f3
66f9811
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -9,7 +9,9 @@ | |
| from itertools import groupby | ||
| from pathlib import Path | ||
| from typing import TYPE_CHECKING, Any | ||
| from urllib.parse import urlparse | ||
|
|
||
| import fsspec | ||
| import iris | ||
| import ncdata | ||
| import xarray as xr | ||
|
|
@@ -75,6 +77,7 @@ | |
| def load( | ||
| file: str | Path | Cube | CubeList | xr.Dataset | ncdata.NcData, | ||
| ignore_warnings: list[dict[str, Any]] | None = None, | ||
| backend_kwargs: dict[str, Any] | None = None, | ||
| ) -> CubeList: | ||
| """Load Iris cubes. | ||
|
|
@@ -83,10 +86,19 @@ | |
| file: | ||
| File to be loaded. If ``file`` is already a loaded dataset, return it | ||
| as a :class:`~iris.cube.CubeList`. | ||
| A ``str`` or ``Path`` may also point to a Zarr store. | ||
| ignore_warnings: | ||
| Keyword arguments passed to :func:`warnings.filterwarnings` used to | ||
| ignore warnings issued by :func:`iris.load_raw`. Each list element | ||
| corresponds to one call to :func:`warnings.filterwarnings`. | ||
| backend_kwargs: | ||
| Dict holding information needed by the storage backend, e.g. to access | ||
| a PRIVATE S3 bucket containing object stores (e.g. netCDF4 files); | ||
| it is needed by ``fsspec`` and its extensions, e.g. ``s3fs``, so | ||
| most of the time this will include ``storage_options``. Note that Zarr | ||
| files are opened via the ``http`` extension of ``fsspec``, so there is no | ||
| need for ``storage_options`` in that case (i.e. anon/anon). Currently only | ||
| used as an empty dict when opening Zarr files. | ||
valeriupredoi marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| Returns | ||
| ------- | ||
|
|
@@ -102,7 +114,19 @@ | |
| """ | ||
| if isinstance(file, (str, Path)): | ||
| cubes = _load_from_file(file, ignore_warnings=ignore_warnings) | ||
| extension = ( | ||
| file.suffix | ||
| if isinstance(file, Path) | ||
| else os.path.splitext(file)[1] | ||
| ) | ||
| if "zarr" not in extension: | ||
| cubes = _load_from_file(file, ignore_warnings=ignore_warnings) | ||
schlunma marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| else: | ||
| cubes = _load_zarr( | ||
| file, | ||
| ignore_warnings=ignore_warnings, | ||
| backend_kwargs=backend_kwargs, | ||
| ) | ||
| elif isinstance(file, Cube): | ||
| cubes = CubeList([file]) | ||
| elif isinstance(file, CubeList): | ||
|
|
@@ -134,6 +158,52 @@ | |
| return cubes | ||
|
|
||
|
|
||
| def _load_zarr( | ||
| file: str | Path | Cube | CubeList | xr.Dataset | ncdata.NcData, | ||
| ignore_warnings: list[dict[str, Any]] | None = None, | ||
| backend_kwargs: dict[str, Any] | None = None, | ||
| ) -> CubeList: | ||
| # case 1: Zarr store is on remote object store | ||
| # file's URI will always be either http or https | ||
| if urlparse(str(file)).scheme in ["http", "https"]: | ||
| # basic test that opens the Zarr/.zmetadata file for Zarr2 | ||
| # or Zarr/zarr.json for Zarr3 | ||
| fs = fsspec.filesystem("http") | ||
| zarr2 = zarr3 = True | ||
valeriupredoi marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| try: | ||
| fs.open(str(file) + "/.zmetadata", "rb") # Zarr2 | ||
| except Exception: # noqa: BLE001 | ||
| zarr2 = False | ||
valeriupredoi marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| try: | ||
| fs.open(str(file) + "/zarr.json", "rb") # Zarr3 | ||
| except Exception: # noqa: BLE001 | ||
| zarr3 = False | ||
valeriupredoi marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| # we don't want to catch any specific aiohttp/fsspec exception | ||
| # the bottom line is that the file has issues, so raise | ||
| if not zarr2 and not zarr3: | ||
valeriupredoi marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| msg = f"File '{file}' can not be open as Zarr file at the moment." | ||
valeriupredoi marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| raise ValueError(msg) from None | ||
|
||
|
|
||
| time_coder = xr.coders.CFDatetimeCoder(use_cftime=True) | ||
| zarr_xr = xr.open_dataset( | ||
| file, | ||
| consolidated=True, | ||
| decode_times=time_coder, | ||
| engine="zarr", | ||
| backend_kwargs=backend_kwargs, | ||
| ) | ||
| # case 2: Zarr store is local to the file system | ||
| else: | ||
| zarr_xr = xr.open_dataset( | ||
| file, | ||
| consolidated=False, | ||
| engine="zarr", | ||
| backend_kwargs=backend_kwargs, | ||
| ) | ||
|
|
||
| return dataset_to_iris(zarr_xr, ignore_warnings=ignore_warnings) | ||
|
|
||
|
|
||
| def _load_from_file( | ||
| file: str | Path, | ||
| ignore_warnings: list[dict[str, Any]] | None = None, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,223 @@ | ||
| """ | ||
| Integration tests for :func:`esmvalcore.preprocessor._io._load_zarr`. | ||
|
|
||
| This is a dedicated test module for Zarr files IO; we have identified | ||
| a number of issues with Zarr IO so it deserves its own test module. | ||
|
|
||
| We have a permanent bucket: esmvaltool-zarr at CEDA's object store | ||
| "url": "https://uor-aces-o.s3-ext.jc.rl.ac.uk/esmvaltool-zarr", | ||
| where we will host a number of test files. Bucket is anon/anon | ||
| (read/GET-only, but PUT can be allowed). Bucket operations are done | ||
| via usual MinIO client (mc command) e.g. ``mc list``, ``mc du`` etc. | ||
|
|
||
| Further performance investigations are being run with a number of tests | ||
| that look at ncdata at https://github.com/valeriupredoi/esmvaltool_zarr_tests | ||
| also see https://github.com/pp-mo/ncdata/issues/139 | ||
| """ | ||
|
|
||
| from importlib.resources import files as importlib_files | ||
| from pathlib import Path | ||
|
|
||
| import cf_units | ||
| import pytest | ||
|
|
||
| from esmvalcore.preprocessor._io import load | ||
|
|
||
|
|
||
| def test_load_zarr2_local(): | ||
| """Test loading a Zarr2 store from local FS.""" | ||
| zarr_path = ( | ||
| Path(importlib_files("tests")) | ||
| / "sample_data" | ||
| / "zarr-sample-data" | ||
| / "example_field_0.zarr2" | ||
| ) | ||
|
|
||
| cubes = load(zarr_path) | ||
valeriupredoi marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| assert len(cubes) == 1 | ||
| cube = cubes[0] | ||
| assert cube.var_name == "q" | ||
| assert cube.standard_name == "specific_humidity" | ||
| assert cube.long_name is None | ||
| assert cube.units == cf_units.Unit("1") | ||
| coords = cube.coords() | ||
| coord_names = [coord.standard_name for coord in coords] | ||
| assert "longitude" in coord_names | ||
| assert "latitude" in coord_names | ||
|
|
||
|
|
||
| def test_load_zarr2_remote(): | ||
| """Test loading a Zarr2 store from a https Object Store.""" | ||
| zarr_path = ( | ||
| "https://uor-aces-o.s3-ext.jc.rl.ac.uk/" | ||
| "esmvaltool-zarr/example_field_0.zarr2" | ||
| ) | ||
|
|
||
| # with "dummy" storage options | ||
| cubes = load( | ||
| zarr_path, | ||
| ignore_warnings=None, | ||
| backend_kwargs={"storage_options": {}}, | ||
| ) | ||
|
|
||
| assert len(cubes) == 1 | ||
| cube = cubes[0] | ||
| assert cube.var_name == "q" | ||
| assert cube.standard_name == "specific_humidity" | ||
| assert cube.long_name is None | ||
| assert cube.units == cf_units.Unit("1") | ||
| coords = cube.coords() | ||
| coord_names = [coord.standard_name for coord in coords] | ||
| assert "longitude" in coord_names | ||
| assert "latitude" in coord_names | ||
|
|
||
| # without storage_options | ||
| cubes = load(zarr_path) | ||
|
|
||
| assert len(cubes) == 1 | ||
| cube = cubes[0] | ||
| assert cube.var_name == "q" | ||
| assert cube.standard_name == "specific_humidity" | ||
| assert cube.long_name is None | ||
| assert cube.units == cf_units.Unit("1") | ||
| coords = cube.coords() | ||
| coord_names = [coord.standard_name for coord in coords] | ||
| assert "longitude" in coord_names | ||
| assert "latitude" in coord_names | ||
|
|
||
|
|
||
| def test_load_zarr3_remote(): | ||
| """Test loading a Zarr3 store from a https Object Store.""" | ||
| zarr_path = ( | ||
| "https://uor-aces-o.s3-ext.jc.rl.ac.uk/" | ||
| "esmvaltool-zarr/example_field_0.zarr3" | ||
| ) | ||
|
|
||
| # with "dummy" storage options | ||
| cubes = load( | ||
| zarr_path, | ||
| ignore_warnings=None, | ||
| backend_kwargs={"storage_options": {}}, | ||
| ) | ||
|
|
||
| assert len(cubes) == 1 | ||
| cube = cubes[0] | ||
| assert cube.var_name == "q" | ||
| assert cube.standard_name == "specific_humidity" | ||
| assert cube.long_name is None | ||
| assert cube.units == cf_units.Unit("1") | ||
| coords = cube.coords() | ||
| coord_names = [coord.standard_name for coord in coords] | ||
| assert "longitude" in coord_names | ||
| assert "latitude" in coord_names | ||
|
|
||
|
|
||
| def test_load_zarr3_cmip6_metadata(): | ||
| """ | ||
| Test loading a Zarr3 store from a https Object Store. | ||
|
|
||
| This test loads just the metadata, no computations. | ||
|
|
||
| This is an actual CMIP6 dataset (Zarr built from netCDF4 via Xarray) | ||
| - Zarr store on disk: 243 MiB | ||
| - compression: Blosc | ||
| - Dimensions: (lat: 128, lon: 256, time: 2352, axis_nbounds: 2) | ||
| - chunking: time-slices; netCDF4.Dataset.chunking() = [1, 128, 256] | ||
|
|
||
| Test takes 8-9s (median: 8.5s) and needs max Res mem: 1GB | ||
| """ | ||
| zarr_path = ( | ||
| "https://uor-aces-o.s3-ext.jc.rl.ac.uk/" | ||
| "esmvaltool-zarr/pr_Amon_CNRM-ESM2-1_02Kpd-11_r1i1p2f2_gr_200601-220112.zarr3" | ||
| ) | ||
|
|
||
| # with "dummy" storage options | ||
| cubes = load( | ||
| zarr_path, | ||
| ignore_warnings=None, | ||
| backend_kwargs={"storage_options": {}}, | ||
| ) | ||
|
|
||
| assert len(cubes) == 1 | ||
| cube = cubes[0] | ||
| assert cube.var_name == "pr" | ||
| assert cube.standard_name == "precipitation_flux" | ||
| assert cube.long_name == "Precipitation" | ||
| assert cube.units == cf_units.Unit("kg m-2 s-1") | ||
| assert cube.has_lazy_data() | ||
|
|
||
|
|
||
| def test_load_zarr_remote_not_zarr_file(): | ||
| """ | ||
| Test loading a Zarr store from a https Object Store. | ||
|
|
||
| This fails because the file being loaded is not a Zarr file. | ||
| """ | ||
| zarr_path = ( | ||
| "https://uor-aces-o.s3-ext.jc.rl.ac.uk/" | ||
| "esmvaltool-zarr/example_field_0.zarr17" | ||
| ) | ||
|
|
||
| msg = ( | ||
| "File 'https://uor-aces-o.s3-ext.jc.rl.ac.uk/" | ||
| "esmvaltool-zarr/example_field_0.zarr17' can not " | ||
| "be open as Zarr file at the moment." | ||
| ) | ||
| with pytest.raises(ValueError, match=msg): | ||
| load(zarr_path) | ||
|
|
||
|
|
||
| def test_load_zarr_remote_not_file(): | ||
| """ | ||
| Test loading a Zarr store from a https Object Store. | ||
|
|
||
| This fails because the file does not exist. | ||
| """ | ||
| zarr_path = ( | ||
| "https://uor-aces-o.s3-ext.jc.rl.ac.uk/" | ||
| "esmvaltool-zarr/example_field_0.zarr22" | ||
| ) | ||
|
|
||
| msg = ( | ||
| "File 'https://uor-aces-o.s3-ext.jc.rl.ac.uk/" | ||
| "esmvaltool-zarr/example_field_0.zarr22' can not " | ||
| "be open as Zarr file at the moment." | ||
| ) | ||
| with pytest.raises(ValueError, match=msg): | ||
| load(zarr_path) | ||
|
|
||
|
|
||
| def test_load_zarr_local_not_file(): | ||
| """ | ||
| Test loading something that has a zarr extension. | ||
|
|
||
| But file doesn't exist (on local FS). | ||
| """ | ||
| zarr_path = "esmvaltool-zarr/example_field_0.zarr22" | ||
|
|
||
| # "Unable to find group" or "No group found" | ||
| # Zarr keeps changing the exception string so matching | ||
| # is bound to fail the test | ||
| with pytest.raises(FileNotFoundError): | ||
| load(zarr_path) | ||
|
|
||
|
|
||
| def test_load_zarr_local_not_zarr_file(): | ||
| """ | ||
| Test loading something that has a zarr extension. | ||
|
|
||
| But file is plaintext (on local FS). | ||
| """ | ||
| zarr_path = ( | ||
| Path(importlib_files("tests")) | ||
| / "sample_data" | ||
| / "zarr-sample-data" | ||
| / "example_field_0.zarr17" | ||
| ) | ||
|
|
||
| # "Unable to find group" or "No group found" | ||
| # Zarr keeps changing the exception string so matching | ||
| # is bound to fail the test | ||
| with pytest.raises(FileNotFoundError): | ||
| load(zarr_path) | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| This is not a Zarr file. Go grab lunch! |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,3 @@ | ||
| { | ||
| "Conventions": "CF-1.12" | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,3 @@ | ||
| { | ||
| "zarr_format": 2 | ||
| } |
Uh oh!
There was an error while loading. Please reload this page.