Commit 34827af

Merge pull request #62 from bcdev/forman-57-ml_dataset_as_datatree
refactor towards datatree
2 parents 8c0f848 + 04f1873 commit 34827af

File tree: 6 files changed, +92 −35 lines

tests/plugins/xcube/processors/test_mldataset.py

Lines changed: 0 additions & 4 deletions
@@ -16,10 +16,6 @@
 from xrlint.plugins.xcube.util import LevelInfo, LevelsMeta
 from xrlint.result import Message
 
-# TODO: This tests requires zarr >=2, <3, because the test used fsspec's
-# memory filesystem, which is not async but zarr wants all filesystems
-# to be async now.
-
 
 class MultiLevelDatasetProcessorTest(TestCase):
     levels_name = "xrlint-test"

tests/plugins/xcube/test_util.py

Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
+# Copyright © 2025 Brockmann Consult GmbH.
+# This software is distributed under the terms and conditions of the
+# MIT license (https://mit-license.org/).
+
+from unittest import TestCase
+
+from xrlint.plugins.xcube.util import is_absolute_path
+from xrlint.plugins.xcube.util import resolve_path
+
+
+class UtilTest(TestCase):
+    def test_is_absolute_path(self):
+        self.assertTrue(is_absolute_path("/home/forman"))
+        self.assertTrue(is_absolute_path("//bcserver2/fs1"))
+        self.assertTrue(is_absolute_path("file://home/forman"))
+        self.assertTrue(is_absolute_path("s3://xcube-data"))
+        self.assertTrue(is_absolute_path(r"C:\Users\Norman"))
+        self.assertTrue(is_absolute_path(r"C:/Users/Norman"))
+        self.assertTrue(is_absolute_path(r"C:/Users/Norman"))
+        self.assertTrue(is_absolute_path(r"\\bcserver2\fs1"))
+
+        self.assertFalse(is_absolute_path(r"data"))
+        self.assertFalse(is_absolute_path(r"./data"))
+        self.assertFalse(is_absolute_path(r"../data"))
+
+    def test_resolve_path(self):
+        self.assertEqual(
+            "/home/forman/data", resolve_path("data", root_path="/home/forman")
+        )
+        self.assertEqual(
+            "/home/forman/data", resolve_path("./data", root_path="/home/forman")
+        )
+        self.assertEqual(
+            "/home/data", resolve_path("../data", root_path="/home/forman")
+        )
+        self.assertEqual("s3://opensr/test.zarr", resolve_path("s3://opensr/test.zarr"))

xrlint/_linter/validate.py

Lines changed: 5 additions & 7 deletions
@@ -62,13 +62,11 @@ def _open_and_validate_dataset(
         except (OSError, ValueError, TypeError) as e:
             return [new_fatal_message(str(e))]
         access_latency = time.time() - t0
-        return processor_op.postprocess(
-            [
-                _validate_dataset(config_obj, ds, path, i, access_latency)
-                for i, (ds, path) in enumerate(ds_path_list)
-            ],
-            file_path,
-        )
+        messages = [
+            _validate_dataset(config_obj, ds, path, i, access_latency)
+            for i, (ds, path) in enumerate(ds_path_list)
+        ]
+        return processor_op.postprocess(messages, file_path)
     else:
         try:
             dataset, access_latency = _open_dataset(
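(The change here is behavior-preserving: the per-dataset message lists are gathered into a local messages variable before being handed to processor_op.postprocess, which reads more clearly than the previous inline list comprehension.)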

xrlint/plugins/xcube/processors/mldataset.py

Lines changed: 30 additions & 18 deletions
@@ -12,11 +12,16 @@
 
 from xrlint.plugins.xcube.constants import ML_FILE_PATTERN, ML_META_FILENAME
 from xrlint.plugins.xcube.plugin import plugin
-from xrlint.plugins.xcube.util import LevelsMeta, attach_dataset_level_infos, norm_path
+from xrlint.plugins.xcube.util import (
+    LevelsMeta,
+    attach_dataset_level_infos,
+    resolve_path,
+)
 from xrlint.processor import ProcessorOp
 from xrlint.result import Message
 
 level_pattern = re.compile(r"^(\d+)(?:\.zarr)?$")
+link_pattern = re.compile(r"^(\d+)(?:\.link)?$")
 
 
 @plugin.define_processor("multi-level-dataset")
@@ -25,7 +30,7 @@ class MultiLevelDatasetProcessor(ProcessorOp):
 
     def preprocess(
         self, file_path: str, opener_options: dict[str, Any]
-    ) -> list[tuple[xr.Dataset, str]]:
+    ) -> list[tuple[xr.Dataset | xr.DataTree, str]]:
         fs, fs_path = get_filesystem(file_path, opener_options)
 
         file_names = [
@@ -40,18 +45,17 @@ def preprocess(
         with fs.open(f"{fs_path}/{ML_META_FILENAME}") as stream:
             meta = LevelsMeta.from_value(json.load(stream))
 
-        # check for optional ".0.link" that locates level 0 somewhere else
-        level_0_path = None
-        if "0.link" in file_names:
-            level_0_path = fs.read_text(f"{fs_path}/0.link")
+        # check for optional ".zgroup"
+        # if ".zgroup" in file_names:
+        #     with fs.open(f"{fs_path}/.zgroup") as stream:
+        #         group_props = json.load(stream)
 
-        level_names, num_levels = parse_levels(file_names, level_0_path)
+        level_paths, num_levels = parse_levels(fs, file_path, file_names)
 
         engine = opener_options.pop("engine", "zarr")
 
         level_datasets: list[xr.Dataset | None] = []
-        for level, level_name in level_names.items():
-            level_path = norm_path(f"{file_path}/{level_name}")
+        for level, level_path in level_paths.items():
             level_dataset = xr.open_dataset(level_path, engine=engine, **opener_options)
             level_datasets.append((level_dataset, level_path))
 
@@ -80,22 +84,30 @@ def get_filesystem(file_path: str, opener_options: dict[str, Any]):
 
 
 def parse_levels(
-    file_names: list[str], level_0_path: str | None
+    fs: fsspec.AbstractFileSystem, dataset_path: str, file_names: list[str]
 ) -> tuple[dict[int, str], int]:
-    level_names: dict[int, str] = {0: level_0_path} if level_0_path else {}
-    num_levels = 0
+    level_paths: dict[int, str] = {}
     for file_name in file_names:
+        # check for optional "<level>.link" that locates a level somewhere else
+        m = link_pattern.match(file_name)
+        if m is not None:
+            level = int(m.group(1))
+            link_path = fs.read_text(f"{dataset_path}/{file_name}")
+            level_paths[level] = resolve_path(link_path, root_path=dataset_path)
+        # check for regular "<level>.zarr"
         m = level_pattern.match(file_name)
         if m is not None:
             level = int(m.group(1))
-            level_names[level] = file_name
-            num_levels = max(num_levels, level + 1)
-    if not level_names:
+            level_paths[level] = f"{dataset_path}/{file_name}"
+
+    if not level_paths:
         raise ValueError("empty multi-level dataset")
-    num_levels = max(level_names.keys()) + 1
+
+    num_levels = max(level_paths.keys()) + 1
     for level in range(num_levels):
-        if level not in level_names:
+        if level not in level_paths:
             raise ValueError(
                 f"missing dataset for level {level} in multi-level dataset"
             )
-    return level_names, num_levels
+
+    return level_paths, num_levels
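For context: an xcube multi-level dataset is a *.levels directory holding one Zarr dataset per resolution level (0.zarr, 1.zarr, ...), and a <level>.link text file may stand in for any level and point to a dataset stored elsewhere; relative link targets are resolved against the .levels directory via resolve_path. The standalone sketch below mirrors the discovery logic of parse_levels above. The function and path names are illustrative assumptions, not part of this commit, and the link pattern here requires the .link suffix for clarity:

# Standalone sketch of the level/link discovery in parse_levels() above.
# Names, layout, and paths are illustrative; not part of this commit.
import re

level_pattern = re.compile(r"^(\d+)(?:\.zarr)?$")  # "0" or "0.zarr"
link_pattern = re.compile(r"^(\d+)\.link$")        # "0.link"


def discover_levels(
    dataset_path: str, entries: dict[str, str | None]
) -> dict[int, str]:
    """Map level number -> level dataset path.
    `entries` maps a file name to its link text, or None for directories."""
    level_paths: dict[int, str] = {}
    for name, link_text in entries.items():
        m = link_pattern.match(name)
        if m is not None and link_text is not None:
            # "<level>.link" redirects this level somewhere else
            level_paths[int(m.group(1))] = link_text
            continue
        m = level_pattern.match(name)
        if m is not None:
            level_paths[int(m.group(1))] = f"{dataset_path}/{name}"
    return level_paths


# Level 0 redirected to another bucket, levels 1 and 2 stored locally:
print(
    discover_levels(
        "s3://cubes/my-cube.levels",
        {"0.link": "s3://archive/base.zarr", "1.zarr": None, "2.zarr": None},
    )
)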

xrlint/plugins/xcube/util.py

Lines changed: 19 additions & 4 deletions
@@ -95,11 +95,26 @@ def get_spatial_size(
     return None
 
 
-def norm_path(level_path: str) -> str:
-    parts = level_path.replace("\\", "/").split("/")
-    level_path = "/".join(
+def resolve_path(path: str, root_path: str | None = None) -> str:
+    abs_level_path = path
+    if root_path is not None and not is_absolute_path(path):
+        abs_level_path = f"{root_path}/{path}"
+    parts = abs_level_path.rstrip("/").replace("\\", "/").split("/")
+    return "/".join(
         p
         for i, p in enumerate(parts)
         if p not in (".", "..") and (i == len(parts) - 1 or parts[i + 1] != "..")
     )
-    return level_path
+
+
+def is_absolute_path(path: str) -> bool:
+    return (
+        # Unix abs path
+        path.startswith("/")
+        # URL
+        or "://" in path
+        # Windows abs paths
+        or path.startswith("\\\\")
+        or path.find(":\\", 1) == 1
+        or path.find(":/", 1) == 1
+    )
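Two details of these helpers may be worth spelling out. In resolve_path, a relative path is first anchored at root_path, then the joined path is normalized by dropping "." segments and collapsing "x/.." pairs, which is how "../data" under "/home/forman" becomes "/home/data" in the tests above. In is_absolute_path, the check path.find(":\\", 1) == 1 is a compact way of testing path[1:3] == ":\\", i.e. a one-letter drive followed by a separator. A minimal equivalent spelling, for illustration only (not part of this commit):

# Equivalent spelling of the Windows drive-letter check used above.
def has_drive_prefix(path: str) -> bool:
    return path[1:3] in (":\\", ":/")  # e.g. r"C:\Users" or "C:/Users"


assert has_drive_prefix(r"C:\Users\Norman")
assert has_drive_prefix("C:/Users/Norman")
assert not has_drive_prefix("/home/forman")
assert not has_drive_prefix("data")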

xrlint/processor.py

Lines changed: 2 additions & 2 deletions
@@ -18,7 +18,7 @@ class ProcessorOp(ABC):
     @abstractmethod
     def preprocess(
         self, file_path: str, opener_options: dict[str, Any]
-    ) -> list[tuple[xr.Dataset, str]]:
+    ) -> list[tuple[xr.Dataset | xr.DataTree, str]]:
         """Pre-process a dataset given by its `file_path` and `opener_options`.
         In this method you use the `file_path` to read zero, one, or more
         datasets to lint.
@@ -28,7 +28,7 @@ def preprocess(
             opener_options: The configuration's `opener_options`.
 
         Returns:
-            A list of (dataset or datatree, file_path) pairs
+            A list of (dataset or datatree, file_path) pairs
         """
 
     @abstractmethod
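To see the widened preprocess contract in use, here is a minimal, hypothetical processor; it is not part of this commit, xr.open_datatree requires a recent xarray, and the postprocess signature is inferred from the validate.py hunk above rather than verified:

# Hypothetical processor illustrating the Dataset-or-DataTree contract.
from typing import Any

import xarray as xr

from xrlint.processor import ProcessorOp
from xrlint.result import Message


class IdentityProcessor(ProcessorOp):
    def preprocess(
        self, file_path: str, opener_options: dict[str, Any]
    ) -> list[tuple[xr.Dataset | xr.DataTree, str]]:
        # Open the file as a datatree; a plain dataset would do as well.
        return [(xr.open_datatree(file_path, **opener_options), file_path)]

    def postprocess(
        self, messages: list[list[Message]], file_path: str
    ) -> list[Message]:
        # Flatten the per-dataset message lists into a single result list.
        return [m for msgs in messages for m in msgs]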
