Skip to content

Commit e508bec

Browse files
Zarr support (backend, in esmvalcore.preprocessor._io.py) (#2785)
Co-authored-by: Manuel Schlund <[email protected]>
1 parent 05f8e4d commit e508bec

File tree

31 files changed

+640
-1
lines changed

31 files changed

+640
-1
lines changed

environment.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ channels:
55
- nodefaults
66

77
dependencies:
8+
- aiohttp
89
- cartopy
910
- cf-units
1011
- cftime
@@ -18,6 +19,7 @@ dependencies:
1819
- fire
1920
- geopy
2021
- humanfriendly
22+
- intake-esm
2123
- iris >=3.12.2 # https://github.com/SciTools/iris/issues/6417
2224
- iris-esmf-regrid >=0.11.0
2325
- iris-grib >=0.20.0 # github.com/ESMValGroup/ESMValCore/issues/2535
@@ -46,6 +48,7 @@ dependencies:
4648
- shapely >=2.0.0
4749
- xarray
4850
- yamale
51+
- zarr >3
4952
# Python packages needed for building docs
5053
- autodocsumm >=0.2.2
5154
- ipython <9.0 # github.com/ESMValGroup/ESMValCore/issues/2680

esmvalcore/preprocessor/_io.py

Lines changed: 72 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@
99
from itertools import groupby
1010
from pathlib import Path
1111
from typing import TYPE_CHECKING, Any
12+
from urllib.parse import urlparse
1213

14+
import fsspec
1315
import iris
1416
import ncdata
1517
import xarray as xr
@@ -75,6 +77,7 @@ def _restore_lat_lon_units(
7577
def load(
7678
file: str | Path | Cube | CubeList | xr.Dataset | ncdata.NcData,
7779
ignore_warnings: list[dict[str, Any]] | None = None,
80+
backend_kwargs: dict[str, Any] | None = None,
7881
) -> CubeList:
7982
"""Load Iris cubes.
8083
@@ -83,10 +86,19 @@ def load(
8386
file:
8487
File to be loaded. If ``file`` is already a loaded dataset, return it
8588
as a :class:`~iris.cube.CubeList`.
89+
A ``str`` or ``Path`` may also point to a Zarr store.
8690
ignore_warnings:
8791
Keyword arguments passed to :func:`warnings.filterwarnings` used to
8892
ignore warnings issued by :func:`iris.load_raw`. Each list element
8993
corresponds to one call to :func:`warnings.filterwarnings`.
94+
backend_kwargs:
95+
Dict to hold info needed by storage backend e.g. to access
96+
a PRIVATE S3 bucket containing object stores (e.g. netCDF4 files);
97+
needed by ``fsspec`` and its extensions e.g. ``s3fs``, so
98+
most of the time this will include ``storage_options``. Note that Zarr
99+
files are opened via ``http`` extension of ``fsspec``, so no need
100+
for ``storage_options`` in that case (i.e. anon/anon). Currently only used
101+
in Zarr file opening.
90102
91103
Returns
92104
-------
@@ -102,7 +114,19 @@ def load(
102114
103115
"""
104116
if isinstance(file, (str, Path)):
105-
cubes = _load_from_file(file, ignore_warnings=ignore_warnings)
117+
extension = (
118+
file.suffix
119+
if isinstance(file, Path)
120+
else os.path.splitext(file)[1]
121+
)
122+
if "zarr" not in extension:
123+
cubes = _load_from_file(file, ignore_warnings=ignore_warnings)
124+
else:
125+
cubes = _load_zarr(
126+
file,
127+
ignore_warnings=ignore_warnings,
128+
backend_kwargs=backend_kwargs,
129+
)
106130
elif isinstance(file, Cube):
107131
cubes = CubeList([file])
108132
elif isinstance(file, CubeList):
@@ -134,6 +158,53 @@ def load(
134158
return cubes
135159

136160

161+
def _load_zarr(
162+
file: str | Path | Cube | CubeList | xr.Dataset | ncdata.NcData,
163+
ignore_warnings: list[dict[str, Any]] | None = None,
164+
backend_kwargs: dict[str, Any] | None = None,
165+
) -> CubeList:
166+
# case 1: Zarr store is on remote object store
167+
# file's URI will always be either http or https
168+
if urlparse(str(file)).scheme in ["http", "https"]:
169+
# basic test that opens the Zarr/.zmetadata file for Zarr2
170+
# or Zarr/zarr.json for Zarr3
171+
fs = fsspec.filesystem("http")
172+
valid_zarr = True
173+
try:
174+
fs.open(str(file) + "/zarr.json", "rb") # Zarr3
175+
except Exception: # noqa: BLE001
176+
try:
177+
fs.open(str(file) + "/.zmetadata", "rb") # Zarr2
178+
except Exception: # noqa: BLE001
179+
valid_zarr = False
180+
# we don't want to catch any specific aiohttp/fsspec exception
181+
# bottom line is that that file has issues, so raise
182+
if not valid_zarr:
183+
msg = (
184+
f"File '{file}' can not be opened as Zarr file at the moment."
185+
)
186+
raise ValueError(msg)
187+
188+
time_coder = xr.coders.CFDatetimeCoder(use_cftime=True)
189+
zarr_xr = xr.open_dataset(
190+
file,
191+
consolidated=True,
192+
decode_times=time_coder,
193+
engine="zarr",
194+
backend_kwargs=backend_kwargs,
195+
)
196+
# case 2: Zarr store is local to the file system
197+
else:
198+
zarr_xr = xr.open_dataset(
199+
file,
200+
consolidated=False,
201+
engine="zarr",
202+
backend_kwargs=backend_kwargs,
203+
)
204+
205+
return dataset_to_iris(zarr_xr, ignore_warnings=ignore_warnings)
206+
207+
137208
def _load_from_file(
138209
file: str | Path,
139210
ignore_warnings: list[dict[str, Any]] | None = None,

pyproject.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ dynamic = [
3232
"version",
3333
]
3434
dependencies = [
35+
"aiohttp",
3536
"cartopy",
3637
"cf-units",
3738
"dask[array,distributed]>=2025", # Core/issues/2503
@@ -44,6 +45,7 @@ dependencies = [
4445
"fire",
4546
"geopy",
4647
"humanfriendly",
48+
"intake-esm",
4749
"iris-grib>=0.20.0", # github.com/ESMValGroup/ESMValCore/issues/2535
4850
"isodate>=0.7.0",
4951
"jinja2",
@@ -68,6 +70,7 @@ dependencies = [
6870
"stratify>=0.3",
6971
"xarray",
7072
"yamale",
73+
"zarr>3",
7174
]
7275
description = "A community tool for pre-processing data from Earth system models in CMIP and running analysis scripts"
7376
license = {text = "Apache License, Version 2.0"}
Lines changed: 224 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,224 @@
1+
"""
2+
Integration tests for :func:`esmvalcore.preprocessor._io._load_zarr`.
3+
4+
This is a dedicated test module for Zarr files IO; we have identified
5+
a number of issues with Zarr IO so it deserves its own test module.
6+
7+
We have a permanent bucket: esmvaltool-zarr at CEDA's object store
8+
"url": "https://uor-aces-o.s3-ext.jc.rl.ac.uk/esmvaltool-zarr",
9+
where we will host a number of test files. The bucket is anon/anon
10+
(read/GET-only, but PUT can be allowed). Bucket operations are done
11+
via usual MinIO client (mc command) e.g. ``mc list``, ``mc du`` etc.
12+
13+
Further performance investigations are being run with a number of tests
14+
that look at ncdata at https://github.com/valeriupredoi/esmvaltool_zarr_tests
15+
also see https://github.com/pp-mo/ncdata/issues/139
16+
"""
17+
18+
from importlib.resources import files as importlib_files
19+
from pathlib import Path
20+
21+
import cf_units
22+
import pytest
23+
24+
from esmvalcore.preprocessor._io import load
25+
26+
27+
@pytest.mark.parametrize("input_type", [str, Path])
def test_load_zarr2_local(input_type):
    """Load the Zarr2 sample store from the local file system.

    The store location is passed to ``load`` once as ``str`` and once as
    ``Path``.
    """
    sample_dir = Path(importlib_files("tests")) / "sample_data"
    store = sample_dir / "zarr-sample-data" / "example_field_0.zarr2"

    loaded = load(input_type(store))

    assert len(loaded) == 1
    humidity = loaded[0]
    assert humidity.var_name == "q"
    assert humidity.standard_name == "specific_humidity"
    assert humidity.long_name is None
    assert humidity.units == cf_units.Unit("1")

    standard_names = [crd.standard_name for crd in humidity.coords()]
    for expected in ("longitude", "latitude"):
        assert expected in standard_names
49+
50+
51+
def test_load_zarr2_remote():
    """Load a Zarr2 store from an https object store.

    The same store is loaded twice: once with explicit ("dummy") storage
    options and once relying on the defaults of ``load``.
    """
    store_url = (
        "https://uor-aces-o.s3-ext.jc.rl.ac.uk/"
        "esmvaltool-zarr/example_field_0.zarr2"
    )
    call_variants = (
        # with "dummy" storage options
        {
            "ignore_warnings": None,
            "backend_kwargs": {"storage_options": {}},
        },
        # without storage_options
        {},
    )

    for extra_kwargs in call_variants:
        loaded = load(store_url, **extra_kwargs)

        assert len(loaded) == 1
        humidity = loaded[0]
        assert humidity.var_name == "q"
        assert humidity.standard_name == "specific_humidity"
        assert humidity.long_name is None
        assert humidity.units == cf_units.Unit("1")

        standard_names = [crd.standard_name for crd in humidity.coords()]
        for expected in ("longitude", "latitude"):
            assert expected in standard_names
89+
90+
91+
def test_load_zarr3_remote():
    """Load a Zarr3 store from an https object store."""
    store_url = (
        "https://uor-aces-o.s3-ext.jc.rl.ac.uk/"
        "esmvaltool-zarr/example_field_0.zarr3"
    )
    # "dummy" storage options: nothing is needed for an anonymous store
    dummy_backend_kwargs = {"storage_options": {}}

    loaded = load(
        store_url,
        ignore_warnings=None,
        backend_kwargs=dummy_backend_kwargs,
    )

    assert len(loaded) == 1
    humidity = loaded[0]
    assert humidity.var_name == "q"
    assert humidity.standard_name == "specific_humidity"
    assert humidity.long_name is None
    assert humidity.units == cf_units.Unit("1")

    standard_names = [crd.standard_name for crd in humidity.coords()]
    for expected in ("longitude", "latitude"):
        assert expected in standard_names
115+
116+
117+
def test_load_zarr3_cmip6_metadata():
    """
    Load the metadata of a CMIP6 Zarr3 store from an https object store.

    Only metadata is inspected; no computations are triggered.

    The store is an actual CMIP6 dataset (Zarr built from netCDF4 via
    Xarray):
    - Zarr store on disk: 243 MiB
    - compression: Blosc
    - Dimensions: (lat: 128, lon: 256, time: 2352, axis_nbounds: 2)
    - chunking: time-slices; netCDF4.Dataset.chunking() = [1, 128, 256]

    Test takes 8-9s (median: 8.5s) and needs max Res mem: 1GB
    """
    store_url = (
        "https://uor-aces-o.s3-ext.jc.rl.ac.uk/"
        "esmvaltool-zarr/pr_Amon_CNRM-ESM2-1_02Kpd-11_r1i1p2f2_gr_200601-220112.zarr3"
    )
    # "dummy" storage options
    dummy_backend_kwargs = {"storage_options": {}}

    loaded = load(
        store_url,
        ignore_warnings=None,
        backend_kwargs=dummy_backend_kwargs,
    )

    assert len(loaded) == 1
    precipitation = loaded[0]
    assert precipitation.var_name == "pr"
    assert precipitation.standard_name == "precipitation_flux"
    assert precipitation.long_name == "Precipitation"
    assert precipitation.units == cf_units.Unit("kg m-2 s-1")
    assert precipitation.has_lazy_data()
150+
151+
152+
def test_load_zarr_remote_not_zarr_file():
    """
    Test loading a Zarr store from a https Object Store.

    This fails because the file being loaded is not a Zarr file.
    """
    import re

    zarr_path = (
        "https://uor-aces-o.s3-ext.jc.rl.ac.uk/"
        "esmvaltool-zarr/example_field_0.zarr17"
    )

    msg = (
        "File 'https://uor-aces-o.s3-ext.jc.rl.ac.uk/"
        "esmvaltool-zarr/example_field_0.zarr17' can not "
        "be opened as Zarr file at the moment."
    )
    # ``match`` is interpreted as a regular expression; escape the literal
    # message so that characters such as "." only match themselves.
    with pytest.raises(ValueError, match=re.escape(msg)):
        load(zarr_path)
170+
171+
172+
def test_load_zarr_remote_not_file():
    """
    Test loading a Zarr store from a https Object Store.

    This fails because the file does not exist.
    """
    import re

    zarr_path = (
        "https://uor-aces-o.s3-ext.jc.rl.ac.uk/"
        "esmvaltool-zarr/example_field_0.zarr22"
    )

    msg = (
        "File 'https://uor-aces-o.s3-ext.jc.rl.ac.uk/"
        "esmvaltool-zarr/example_field_0.zarr22' can not "
        "be opened as Zarr file at the moment."
    )
    # ``match`` is interpreted as a regular expression; escape the literal
    # message so that characters such as "." only match themselves.
    with pytest.raises(ValueError, match=re.escape(msg)):
        load(zarr_path)
190+
191+
192+
def test_load_zarr_local_not_file():
    """
    Load a local path with a Zarr-like extension that does not exist.

    Only the exception type is checked.
    """
    missing_store = "esmvaltool-zarr/example_field_0.zarr22"

    # The message is "Unable to find group" or "No group found"; Zarr keeps
    # changing the exception string, so matching on it is bound to fail
    # the test.
    with pytest.raises(FileNotFoundError):
        load(missing_store)
205+
206+
207+
def test_load_zarr_local_not_zarr_file():
    """
    Load a local path with a Zarr-like extension that is not a Zarr store.

    The file is plaintext (on local FS); only the exception type is checked.
    """
    sample_dir = Path(importlib_files("tests")) / "sample_data"
    not_a_store = sample_dir / "zarr-sample-data" / "example_field_0.zarr17"

    # The message is "Unable to find group" or "No group found"; Zarr keeps
    # changing the exception string, so matching on it is bound to fail
    # the test.
    with pytest.raises(FileNotFoundError):
        load(not_a_store)
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
This is not a Zarr file. Go grab lunch!
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
{
2+
"Conventions": "CF-1.12"
3+
}
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
{
2+
"zarr_format": 2
3+
}

0 commit comments

Comments
 (0)