diff --git a/environment.yml b/environment.yml index 9c4a768d6d..7dca43d1b8 100644 --- a/environment.yml +++ b/environment.yml @@ -5,6 +5,7 @@ channels: - nodefaults dependencies: + - aiohttp - cartopy - cf-units - cftime @@ -18,6 +19,7 @@ dependencies: - fire - geopy - humanfriendly + - intake-esm - iris >=3.12.2 # https://github.com/SciTools/iris/issues/6417 - iris-esmf-regrid >=0.11.0 - iris-grib >=0.20.0 # github.com/ESMValGroup/ESMValCore/issues/2535 @@ -46,6 +48,7 @@ dependencies: - shapely >=2.0.0 - xarray - yamale + - zarr >3 # Python packages needed for building docs - autodocsumm >=0.2.2 - ipython <9.0 # github.com/ESMValGroup/ESMValCore/issues/2680 diff --git a/esmvalcore/preprocessor/_io.py b/esmvalcore/preprocessor/_io.py index 3c4aa3566e..dfee69136e 100644 --- a/esmvalcore/preprocessor/_io.py +++ b/esmvalcore/preprocessor/_io.py @@ -9,7 +9,9 @@ from itertools import groupby from pathlib import Path from typing import TYPE_CHECKING, Any +from urllib.parse import urlparse +import fsspec import iris import ncdata import xarray as xr @@ -75,6 +77,7 @@ def _restore_lat_lon_units( def load( file: str | Path | Cube | CubeList | xr.Dataset | ncdata.NcData, ignore_warnings: list[dict[str, Any]] | None = None, + backend_kwargs: dict[str, Any] | None = None, ) -> CubeList: """Load Iris cubes. @@ -83,10 +86,19 @@ def load( file: File to be loaded. If ``file`` is already a loaded dataset, return it as a :class:`~iris.cube.CubeList`. + File as ``Path`` object could be a Zarr store. ignore_warnings: Keyword arguments passed to :func:`warnings.filterwarnings` used to ignore warnings issued by :func:`iris.load_raw`. Each list element corresponds to one call to :func:`warnings.filterwarnings`. + backend_kwargs: + Dict to hold info needed by storage backend e.g. to access + a PRIVATE S3 bucket containing object stores (e.g. netCDF4 files); + needed by ``fsspec`` and its extensions e.g. ``s3fs``, so + most of the times this will include ``storage_options``. 
def _load_zarr(
    file: str | Path,
    ignore_warnings: list[dict[str, Any]] | None = None,
    backend_kwargs: dict[str, Any] | None = None,
) -> CubeList:
    """Load a Zarr store and convert it to Iris cubes.

    Parameters
    ----------
    file:
        Location of the Zarr store. If its string form has an ``http`` or
        ``https`` scheme it is treated as a remote store; anything else is
        opened as a store on the local file system.
    ignore_warnings:
        Passed on unchanged to ``dataset_to_iris``.
    backend_kwargs:
        Passed on unchanged to :func:`xarray.open_dataset`.

    Returns
    -------
    iris.cube.CubeList
        The cubes produced by ``dataset_to_iris`` from the opened dataset.

    Raises
    ------
    ValueError
        If, for a remote store, neither ``zarr.json`` (Zarr3) nor
        ``.zmetadata`` (Zarr2) can be opened below the given URL. The
        exception raised by the last probe is chained as the cause.
    """
    url = str(file)

    # Case 1: the Zarr store lives on a remote object store (http/https URI).
    if urlparse(url).scheme in ("http", "https"):
        fs = fsspec.filesystem("http")
        # Drop trailing slashes so the probe path never contains "//".
        base_url = url.rstrip("/")

        # Basic sanity check: a Zarr3 store exposes ``zarr.json`` and a
        # consolidated Zarr2 store exposes ``.zmetadata``.
        probe_error: Exception | None = None
        for metadata_file in ("zarr.json", ".zmetadata"):
            try:
                # The context manager closes the probe handle right away;
                # we only care whether the object can be opened.
                with fs.open(f"{base_url}/{metadata_file}", "rb"):
                    pass
            # Deliberately broad: whatever aiohttp/fsspec raises, the bottom
            # line is that the store cannot be used.
            except Exception as exc:  # noqa: BLE001
                probe_error = exc
            else:
                probe_error = None
                break

        if probe_error is not None:
            msg = (
                f"File '{file}' can not be opened as Zarr file at the moment."
            )
            # Keep the original failure visible in the traceback.
            raise ValueError(msg) from probe_error

        time_coder = xr.coders.CFDatetimeCoder(use_cftime=True)
        zarr_xr = xr.open_dataset(
            file,
            consolidated=True,
            decode_times=time_coder,
            engine="zarr",
            backend_kwargs=backend_kwargs,
        )
    # Case 2: the Zarr store is local to the file system.
    else:
        # NOTE(review): unlike the remote branch, no cftime-based
        # ``decode_times`` coder is passed here -- confirm this asymmetry is
        # intentional.
        zarr_xr = xr.open_dataset(
            file,
            consolidated=False,
            engine="zarr",
            backend_kwargs=backend_kwargs,
        )

    return dataset_to_iris(zarr_xr, ignore_warnings=ignore_warnings)
@pytest.mark.parametrize("input_type", [str, Path])
def test_load_zarr2_local(input_type):
    """Load the bundled Zarr2 sample store from the local file system.

    The store location is handed to ``load`` once as ``str`` and once as
    ``Path``.
    """
    sample_dir = Path(importlib_files("tests")).joinpath(
        "sample_data",
        "zarr-sample-data",
    )
    store = sample_dir / "example_field_0.zarr2"

    loaded = load(input_type(store))

    # Exactly one cube is expected, carrying the humidity field metadata.
    assert len(loaded) == 1
    humidity = loaded[0]
    assert humidity.var_name == "q"
    assert humidity.standard_name == "specific_humidity"
    assert humidity.long_name is None
    assert humidity.units == cf_units.Unit("1")

    # Both horizontal coordinates must be present on the cube.
    standard_names = [coord.standard_name for coord in humidity.coords()]
    assert "longitude" in standard_names
    assert "latitude" in standard_names
def test_load_zarr3_remote():
    """Load the Zarr3 sample store from the https object store."""
    bucket_url = "https://uor-aces-o.s3-ext.jc.rl.ac.uk/esmvaltool-zarr"
    store_url = bucket_url + "/example_field_0.zarr3"

    # "Dummy" (empty) storage options are passed through to the backend.
    dummy_backend_kwargs = {"storage_options": {}}
    loaded = load(
        store_url,
        ignore_warnings=None,
        backend_kwargs=dummy_backend_kwargs,
    )

    # Exactly one cube is expected, carrying the humidity field metadata.
    assert len(loaded) == 1
    humidity = loaded[0]
    assert humidity.var_name == "q"
    assert humidity.standard_name == "specific_humidity"
    assert humidity.long_name is None
    assert humidity.units == cf_units.Unit("1")

    # Both horizontal coordinates must be present on the cube.
    standard_names = [coord.standard_name for coord in humidity.coords()]
    assert "longitude" in standard_names
    assert "latitude" in standard_names
def test_load_zarr_remote_not_zarr_file():
    """
    Test loading a Zarr store from a https Object Store.

    This fails because the object being loaded is not a Zarr store.
    """
    import re

    zarr_path = (
        "https://uor-aces-o.s3-ext.jc.rl.ac.uk/"
        "esmvaltool-zarr/example_field_0.zarr17"
    )

    msg = (
        "File 'https://uor-aces-o.s3-ext.jc.rl.ac.uk/"
        "esmvaltool-zarr/example_field_0.zarr17' can not "
        "be opened as Zarr file at the moment."
    )
    # ``match`` is a regular expression: escape the literal message so that
    # the dots in the URL and the final full stop are matched verbatim.
    with pytest.raises(ValueError, match=re.escape(msg)):
        load(zarr_path)


def test_load_zarr_remote_not_file():
    """
    Test loading a Zarr store from a https Object Store.

    This fails because the requested object does not exist.
    """
    import re

    zarr_path = (
        "https://uor-aces-o.s3-ext.jc.rl.ac.uk/"
        "esmvaltool-zarr/example_field_0.zarr22"
    )

    msg = (
        "File 'https://uor-aces-o.s3-ext.jc.rl.ac.uk/"
        "esmvaltool-zarr/example_field_0.zarr22' can not "
        "be opened as Zarr file at the moment."
    )
    # ``match`` is a regular expression: escape the literal message so that
    # the dots in the URL and the final full stop are matched verbatim.
    with pytest.raises(ValueError, match=re.escape(msg)):
        load(zarr_path)
def test_load_zarr_local_not_zarr_file():
    """
    Test loading a local path that merely carries a zarr-like extension.

    The target is a plain-text file on the local file system.
    """
    sample_dir = Path(importlib_files("tests")).joinpath(
        "sample_data",
        "zarr-sample-data",
    )
    not_a_store = sample_dir / "example_field_0.zarr17"

    # Only the exception type is checked: the message is either
    # "Unable to find group" or "No group found", and Zarr keeps changing
    # that string, so matching on it would make the test brittle.
    with pytest.raises(FileNotFoundError):
        load(not_a_store)
diff --git a/tests/sample_data/zarr-sample-data/example_field_0.zarr2/.zattrs b/tests/sample_data/zarr-sample-data/example_field_0.zarr2/.zattrs new file mode 100644 index 0000000000..bb815deaf3 --- /dev/null +++ b/tests/sample_data/zarr-sample-data/example_field_0.zarr2/.zattrs @@ -0,0 +1,3 @@ +{ + "Conventions": "CF-1.12" +} diff --git a/tests/sample_data/zarr-sample-data/example_field_0.zarr2/.zgroup b/tests/sample_data/zarr-sample-data/example_field_0.zarr2/.zgroup new file mode 100644 index 0000000000..3f3fad2d17 --- /dev/null +++ b/tests/sample_data/zarr-sample-data/example_field_0.zarr2/.zgroup @@ -0,0 +1,3 @@ +{ + "zarr_format": 2 +} diff --git a/tests/sample_data/zarr-sample-data/example_field_0.zarr2/.zmetadata b/tests/sample_data/zarr-sample-data/example_field_0.zarr2/.zmetadata new file mode 100644 index 0000000000..ab417b346a --- /dev/null +++ b/tests/sample_data/zarr-sample-data/example_field_0.zarr2/.zmetadata @@ -0,0 +1,171 @@ +{ + "metadata": { + ".zattrs": { + "Conventions": "CF-1.12" + }, + ".zgroup": { + "zarr_format": 2 + }, + "lat/.zarray": { + "chunks": [ + 5 + ], + "compressor": { + "blocksize": 0, + "clevel": 5, + "cname": "lz4", + "id": "blosc", + "shuffle": 1 + }, + "dtype": "