diff --git a/CHANGES.md b/CHANGES.md
index 59c884d..a4c104e 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -2,7 +2,40 @@
 
 ## Version 0.4.0 (in development)
 
-
+- New xcube multi-level dataset rules:
+  - `ml-dataset-meta`: verifies that a meta info file exists and is consistent
+  - `ml-dataset-xy`: verifies that the levels have the expected spatial resolutions
+  - `ml-dataset-time`: verifies that the levels have the expected time dimension, if any
+- Now supporting xcube multi-level datasets `*.levels`:
+  - Added xcube plugin processor `"xcube/multi-level-dataset"` that is used
+    inside the predefined xcube configurations "all" and "recommended".
+- Directories that are recognized by file patterns associated with a non-empty
+  configuration object are no longer recursively traversed.
+- Introduced the method `Plugin.define_config`, which defines a named plugin
+  configuration. It takes a name and a configuration object or a list of
+  configuration objects.
+- Changed how configuration is defined and exported from
+  Python configuration files:
+  - Renamed the function that exports the configuration from `export_configs`
+    to `export_config`.
+  - The returned value should be a list of values that can be
+    converted into configuration objects: mixed `Config` instances,
+    dictionaries, or names that refer to a named configuration of a plugin.
+- Node path names now contain the dataset index if a file path
+  has been opened by a processor that produced multiple
+  datasets to validate.
+
+- Other changes:
+  - Changed the type of `Plugin.configs` from `dict[str, Config]` to
+    `dict[str, list[Config]]`.
+  - Built-in plugin rules now import their `plugin` instance from the
+    `xrlint.plugins.<plugin>.plugin` module.
+  - `JsonSerializable` now recognizes `dataclass` instances and no longer
+    serializes property values that are also default values.
+  - Pinned the zarr dependency to >=2.18, <3 until the test
+    `tests.plugins.xcube.processors.test_mldataset.MultiLevelDatasetProcessorTest`
+    is adjusted or fsspec's memory filesystem is updated.
 
 ## Version 0.3.0 (from 2025-01-20)
 
diff --git a/docs/config.md b/docs/config.md
index c3819fb..3061703 100644
--- a/docs/config.md
+++ b/docs/config.md
@@ -43,7 +43,7 @@ Same using JSON:
 And as Python script:
 
 ```python
-def export_configs():
+def export_config():
     return [
         {"files": ["**/*.zarr", "**/*.nc"]},
         {
diff --git a/docs/rule-ref.md b/docs/rule-ref.md
index d9aaf0b..bb4219d 100644
--- a/docs/rule-ref.md
+++ b/docs/rule-ref.md
@@ -114,6 +114,27 @@ Latitude and longitude coordinates and dimensions should be called 'lat' and 'lon'.
 
 Contained in: `all`-:material-lightning-bolt: `recommended`-:material-lightning-bolt:
 
+### :material-lightbulb: `ml-dataset-meta`
+
+Multi-level datasets should provide a '.zlevels' meta information file and, if so, it should be consistent.
+[:material-information-variant:](https://xcube.readthedocs.io/en/latest/mldatasets.html#the-xcube-levels-format)
+
+Contained in: `all`-:material-lightning-bolt: `recommended`-:material-lightning-bolt:
+
+### :material-bug: `ml-dataset-time`
+
+The `time` dimension of multi-level datasets should use a chunk size of 1. This allows for faster image tile generation for visualisation.
+[:material-information-variant:](https://xcube.readthedocs.io/en/latest/mldatasets.html#definition)
+
+Contained in: `all`-:material-lightning-bolt: `recommended`-:material-alert:
+
+### :material-bug: `ml-dataset-xy`
+
+Multi-level dataset levels should provide spatial resolutions decreasing by powers of two.
+[:material-information-variant:](https://xcube.readthedocs.io/en/latest/mldatasets.html#definition) + +Contained in: `all`-:material-lightning-bolt: `recommended`-:material-lightning-bolt: + ### :material-bug: `single-grid-mapping` A single grid mapping shall be used for all spatial data variables of a datacube. diff --git a/docs/todo.md b/docs/todo.md index b6e1afa..8931dfd 100644 --- a/docs/todo.md +++ b/docs/todo.md @@ -14,10 +14,6 @@ ## Desired - project logo -- support validating xcube 'levels' format. Options: - - implement xarray backend so we can open them using `xr.open_dataset` - with `opener_options: {"engine": "xc-levels"}`. - - implement a `xrlint.processor.Processor` for that purpose. - add some more tests so we reach 99% coverage - support rule op args/kwargs schema validation - Support `RuleTest.expected`, it is currently unused diff --git a/environment.yml b/environment.yml index 5b3fd1b..e5334fc 100644 --- a/environment.yml +++ b/environment.yml @@ -20,7 +20,8 @@ dependencies: - requests-mock - ruff # Testing Datasets + - dask - pandas - netcdf4 - numpy - - zarr + - zarr >=2.18,<3 # tests fail with zarr 3+ diff --git a/examples/plugin_config.py b/examples/plugin_config.py index b08139e..e98e8b1 100644 --- a/examples/plugin_config.py +++ b/examples/plugin_config.py @@ -3,27 +3,11 @@ using the `Plugin` class and its `define_rule()` decorator method. """ -from xrlint.config import Config from xrlint.node import DatasetNode from xrlint.plugin import new_plugin from xrlint.rule import RuleContext, RuleOp -plugin = new_plugin( - name="hello-plugin", - version="1.0.0", - configs={ - # "configs" entries must be `Config` objects! - "recommended": Config.from_value( - { - "rules": { - "hello/good-title": "warn", - # Configure more rules here... - }, - } - ), - # Add more configurations here... - }, -) +plugin = new_plugin(name="hello-plugin", version="1.0.0") @plugin.define_rule("good-title") @@ -42,7 +26,22 @@ def dataset(self, ctx: RuleContext, node: DatasetNode): # Define more rules here... -def export_configs(): +plugin.define_config( + "recommended", + [ + { + "rules": { + "hello/good-title": "warn", + # Configure more rules here... + }, + } + ], +) + +# Add more configurations here... + + +def export_config(): return [ # Use "hello" plugin { diff --git a/examples/virtual_plugin_config.py b/examples/virtual_plugin_config.py index 858d878..da9a3a8 100644 --- a/examples/virtual_plugin_config.py +++ b/examples/virtual_plugin_config.py @@ -22,7 +22,7 @@ def dataset(self, ctx: RuleContext, node: DatasetNode): # Define more rules here... -def export_configs(): +def export_config(): return [ # Define and use "hello" plugin { @@ -37,12 +37,14 @@ def export_configs(): # Add more rules here... }, "configs": { - "recommended": { - "rules": { - "hello/good-title": "warn", - # Configure more rules here... - }, - }, + "recommended": [ + { + "rules": { + "hello/good-title": "warn", + # Configure more rules here... + }, + } + ], # Add more configurations here... 
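            # Note: each named config entry is now a list of config objects (see CHANGES.md)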
}, }, diff --git a/mkruleref.py b/mkruleref.py index ebd1a31..6e2d396 100644 --- a/mkruleref.py +++ b/mkruleref.py @@ -1,4 +1,5 @@ from xrlint.plugin import Plugin +from xrlint.rule import RuleConfig # for icons, see # https://squidfunk.github.io/mkdocs-material/reference/icons-emojis/ @@ -39,7 +40,7 @@ def write_rule_ref_page(): def write_plugin_rules(stream, plugin: Plugin): - configs = plugin.configs + config_rules = get_plugin_rule_configs(plugin) for rule_id in sorted(plugin.rules.keys()): rule_meta = plugin.rules[rule_id].meta stream.write( @@ -51,9 +52,8 @@ def write_plugin_rules(stream, plugin: Plugin): stream.write("\n\n") # List the predefined configurations that contain the rule stream.write("Contained in: ") - for config_id in sorted(configs.keys()): - config = configs[config_id] - rule_configs = config.rules or {} + for config_id in sorted(config_rules.keys()): + rule_configs = config_rules[config_id] rule_config = rule_configs.get(rule_id) or rule_configs.get( f"{plugin.meta.name}/{rule_id}" ) @@ -62,5 +62,21 @@ def write_plugin_rules(stream, plugin: Plugin): stream.write("\n\n") +def get_plugin_rule_configs(plugin): + configs = plugin.configs + config_rules: dict[str, dict[str, RuleConfig]] = {} + for config_name, config_list in configs.items(): + # note, here we assume most plugins configure their rules + # in one dedicated config object only. However, this is not + # the general case as file patterns may be used to make the + # rules configurations specific. + rule_configs = {} + for config in config_list: + if config.rules: + rule_configs.update(config.rules) + config_rules[config_name] = rule_configs + return config_rules + + if __name__ == "__main__": write_rule_ref_page() diff --git a/pyproject.toml b/pyproject.toml index 8981b1d..5b03f43 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -73,10 +73,11 @@ dev = [ "ruff", "twine", # Dataset testing + "dask", "netcdf4", "numpy", "pandas", - "zarr", + "zarr >=2.18,<3", ] doc = [ "mkdocs", diff --git a/tests/_linter/test_rulectx.py b/tests/_linter/test_rulectx.py index a74600b..d7ce5fc 100644 --- a/tests/_linter/test_rulectx.py +++ b/tests/_linter/test_rulectx.py @@ -12,14 +12,15 @@ class RuleContextImplTest(TestCase): def test_defaults(self): config = Config() dataset = xr.Dataset() - context = RuleContextImpl(config, dataset, "./ds.zarr") + context = RuleContextImpl(config, dataset, "./ds.zarr", None) self.assertIs(config, context.config) self.assertIs(dataset, context.dataset) self.assertEqual({}, context.settings) self.assertEqual("./ds.zarr", context.file_path) + self.assertEqual(None, context.file_index) def test_report(self): - context = RuleContextImpl(Config(), xr.Dataset(), "./ds.zarr") + context = RuleContextImpl(Config(), xr.Dataset(), "./ds.zarr", None) with context.use_state(rule_id="no-xxx"): context.report( "What the heck do you mean?", diff --git a/tests/cli/configs/recommended.py b/tests/cli/configs/recommended.py index 23bc763..f998c12 100644 --- a/tests/cli/configs/recommended.py +++ b/tests/cli/configs/recommended.py @@ -1,4 +1,4 @@ -def export_configs(): +def export_config(): import xrlint.plugins.core import xrlint.plugins.xcube @@ -10,8 +10,8 @@ def export_configs(): "xcube": xcube, } }, - core.configs["recommended"], - xcube.configs["recommended"], + *core.configs["recommended"], + *xcube.configs["recommended"], { "rules": { "dataset-title-attr": "error", diff --git a/tests/cli/test_config.py b/tests/cli/test_config.py index ce70c6b..6bd49a5 100644 --- a/tests/cli/test_config.py +++ 
b/tests/cli/test_config.py
@@ -34,7 +34,7 @@
 """
 
 py_text = """
-def export_configs():
+def export_config():
     return [
         {
             "name": "py-test",
@@ -117,13 +117,13 @@ def test_read_config_yaml_with_format_error(self):
                 read_config_list(config_path)
 
     def test_read_config_yaml_with_type_error(self):
-        with text_file("config.yaml", "prime: 97") as config_path:
+        with text_file("config.yaml", "97") as config_path:
             with pytest.raises(
                 ConfigError,
                 match=(
-                    "'config.yaml: configuration list must be of"
-                    " type ConfigList|list\\[Config|dict|str\\],"
-                    " but got dict'"
+                    r"config\.yaml\: config_list must be of"
+                    r" type ConfigList \| list\[Config \| dict \| str\],"
+                    r" but got int"
                 ),
             ):
                 read_config_list(config_path)
@@ -141,14 +141,14 @@ def test_read_config_py_no_export(self):
         with pytest.raises(
             ConfigError,
             match=(
-                "config_1002.py: attribute 'export_configs'"
+                "config_1002.py: attribute 'export_config'"
                 " not found in module 'config_1002'"
             ),
         ):
             read_config_list(config_path)
 
     def test_read_config_py_with_value_error(self):
-        py_code = "def export_configs():\n    raise ValueError('value is useless!')\n"
+        py_code = "def export_config():\n    raise ValueError('value is useless!')\n"
         with text_file(self.new_config_py(), py_code) as config_path:
             with pytest.raises(
                 ValueError,
@@ -157,7 +157,7 @@
             read_config_list(config_path)
 
     def test_read_config_py_with_os_error(self):
-        py_code = "def export_configs():\n    raise OSError('where is my hat?')\n"
+        py_code = "def export_config():\n    raise OSError('where is my hat?')\n"
         with text_file(self.new_config_py(), py_code) as config_path:
             with pytest.raises(
                 ConfigError,
@@ -166,15 +166,15 @@
             read_config_list(config_path)
 
     def test_read_config_py_with_invalid_config_list(self):
-        py_code = "def export_configs():\n    return 42\n"
+        py_code = "def export_config():\n    return 42\n"
         with text_file(self.new_config_py(), py_code) as config_path:
             with pytest.raises(
                 ConfigError,
                 match=(
-                    ".py: return value of export_configs\\(\\):"
-                    " configuration list must be of type"
-                    " ConfigList|list\\[Config|dict|str\\],"
-                    " but got int"
+                    r"\.py: return value of export_config\(\):"
+                    r" config_list must be of type"
+                    r" ConfigList \| list\[Config \| dict \| str\],"
+                    r" but got int"
                 ),
             ):
                 read_config_list(config_path)
@@ -198,10 +198,10 @@ def test_read_config_yaml(self):
 
     def assert_ok(self, config_list: ConfigList):
         self.assertIsInstance(config_list, ConfigList)
-        self.assertEqual(4, len(config_list.configs))
+        self.assertEqual(7, len(config_list.configs))
         config = config_list.compute_config("test.zarr")
         self.assertIsInstance(config, Config)
-        self.assertEqual("", config.name)
+        self.assertEqual(None, config.name)
         self.assertIsInstance(config.plugins, dict)
         self.assertEqual({"xcube"}, set(config.plugins.keys()))
         self.assertIsInstance(config.rules, dict)
diff --git a/tests/cli/test_main.py b/tests/cli/test_main.py
index bf0fb53..5aab920 100644
--- a/tests/cli/test_main.py
+++ b/tests/cli/test_main.py
@@ -191,7 +191,6 @@ def test_print_config_option(self):
         self.assertEqual(
             (
                 "{\n"
-                '  "name": "",\n'
                 '  "plugins": {\n'
                 '    "__core__": "xrlint.plugins.core:export_plugin"\n'
                 "  },\n"
diff --git a/tests/plugins/core/test_plugin.py b/tests/plugins/core/test_plugin.py
index 0c7baed..057ef4d 100644
--- a/tests/plugins/core/test_plugin.py
+++ b/tests/plugins/core/test_plugin.py
@@ -34,9 +34,9 @@ def test_configs_complete(self):
         all_rule_names = set(plugin.rules.keys())
         self.assertEqual(
             all_rule_names,
set(plugin.configs["all"].rules.keys()), + set(plugin.configs["all"][-1].rules.keys()), ) self.assertEqual( all_rule_names, - set(plugin.configs["recommended"].rules.keys()), + set(plugin.configs["recommended"][-1].rules.keys()), ) diff --git a/tests/plugins/xcube/helpers.py b/tests/plugins/xcube/helpers.py new file mode 100644 index 0000000..181f0e2 --- /dev/null +++ b/tests/plugins/xcube/helpers.py @@ -0,0 +1,97 @@ +import math + +import numpy as np +import xarray as xr + +from xrlint.plugins.xcube.util import LevelsMeta, attach_dataset_level_infos + + +def make_cube_levels( + nl: int, + nx: int, + ny: int, + nt: int | None = None, + meta: LevelsMeta | None = None, + force_infos: bool = False, +) -> list[xr.Dataset]: + levels = [ + make_cube(math.ceil(nx >> level), math.ceil(ny >> level), nt) + for level in range(nl) + ] + if meta is not None or force_infos: + attach_dataset_level_infos( + [(ds, f"{i}.zarr") for i, ds in enumerate(levels)], meta=meta + ) + return levels + + +def make_cube(nx: int, ny: int, nt: int | None = None) -> xr.Dataset: + """Make an in-memory dataset that should pass all xcube rules. + + Args: + nx: length of the lon-dimension + ny: length of the lat-dimension + nt: length of the time-dimension, optional + + Returns: + an in-memory dataset with one 3-d data variable "chl" + with dimensions ["time",] "lat", "lon". + """ + x_attrs = dict( + long_name="longitude", + standard_name="longitude", + units="degrees_east", + ) + y_attrs = dict( + long_name="latitude", + standard_name="latitude", + units="degrees_north", + ) + + dx = 180.0 / nx + dy = 90.0 / ny + x_data = np.linspace(-180 + dx, 180 - dx, nx) + y_data = np.linspace(-90 + dy, 90 - dy, ny) + + chl_attrs = dict( + long_name="chlorophyll concentration", + standard_name="chlorophyll_concentration", + units="mg/m^3", + _FillValue=0, + ) + chl_chunks = dict(lat=min(ny, 90), lon=min(nx, 90)) + + ds_attrs = dict(title="Chlorophyll") + + coords = dict( + lon=xr.DataArray(x_data, dims="lon", attrs=x_attrs), + lat=xr.DataArray(y_data, dims="lat", attrs=y_attrs), + ) + + if nt is None: + return xr.Dataset( + data_vars=dict( + chl=xr.DataArray( + np.zeros((ny, nx)), dims=["lat", "lon"], attrs=chl_attrs + ).chunk(**chl_chunks), + ), + coords=coords, + attrs=ds_attrs, + ) + else: + time_attrs = dict( + long_name="time", + standard_name="time", + units="days since 2024-06-10:12:00:00 utc", + calendar="gregorian", + ) + coords.update(time=xr.DataArray(range(nt), dims="time", attrs=time_attrs)) + return xr.Dataset( + data_vars=dict( + chl=xr.DataArray( + np.zeros((nt, ny, nx)), dims=["time", "lat", "lon"], attrs=chl_attrs + ).chunk(time=1, **chl_chunks), + ), + coords=coords, + attrs=ds_attrs, + ) diff --git a/tests/plugins/xcube/processors/__init__.py b/tests/plugins/xcube/processors/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/plugins/xcube/processors/test_mldataset.py b/tests/plugins/xcube/processors/test_mldataset.py new file mode 100644 index 0000000..69005e4 --- /dev/null +++ b/tests/plugins/xcube/processors/test_mldataset.py @@ -0,0 +1,145 @@ +import json +from typing import Any +from unittest import TestCase + +import fsspec +import pytest +import xarray as xr + +from tests.plugins.xcube.helpers import make_cube_levels +from xrlint.plugins.xcube.constants import ML_INFO_ATTR +from xrlint.plugins.xcube.processors.mldataset import MultiLevelDatasetProcessor +from xrlint.plugins.xcube.util import ( + LevelInfo, + LevelsMeta, +) +from xrlint.result import Message + +# TODO: This tests requires 
zarr >=2, <3, because the test used fsspec's +# memory filesystem, which is not async but zarr wants all filesystems +# to be async now. + + +class MultiLevelDatasetProcessorTest(TestCase): + levels_name = "xrlint-test" + levels_dir = f"memory://{levels_name}.levels" + + x_size = nx = 720 + y_size = ny = 360 + time_size = 3 + num_levels = 4 + + meta_path = f"{levels_dir}/.zlevels" + meta_content = { + "version": "1.0", + "num_levels": num_levels, + "use_saved_levels": False, + # tile_size is optional + "agg_methods": { + "chl": "mean", + }, + } + + @classmethod + def setUpClass(cls): + fs, _ = fsspec.core.url_to_fs(cls.levels_dir) + cls.fs: fsspec.AbstractFileSystem = fs + + def setUp(self): + self._delete_files() + self.fs.mkdir(self.levels_dir) + + level_datasets = make_cube_levels( + self.num_levels, self.x_size, self.y_size, self.time_size + ) + for level, dataset in enumerate(level_datasets): + dataset.to_zarr(f"{self.levels_dir}/{level}.zarr", write_empty_chunks=False) + + with self.fs.open(self.meta_path, mode="wt") as stream: + json.dump(self.meta_content, stream, indent=2) + + def tearDown(self): + self._delete_files() + + def _delete_files(self): + if self.fs.exists(self.levels_dir): + self.fs.delete(self.levels_dir, recursive=True) + + def assert_levels_ok( + self, datasets: Any, expect_meta: bool = False, expect_link: bool = False + ): + self.assertIsInstance(datasets, list) + self.assertEqual(self.num_levels, len(datasets)) + for i, (dataset, file_path) in enumerate(datasets): + if expect_link and i == 0: + self.assertEqual(f"memory://{self.levels_name}.zarr", file_path) + else: + self.assertEqual( + f"memory://{self.levels_name}.levels/{i}.zarr", file_path + ) + level_info = dataset.attrs.get(ML_INFO_ATTR) + self.assertIsInstance(level_info, LevelInfo) + self.assertEqual(i, level_info.level) + self.assertEqual(self.num_levels, level_info.num_levels) + self.assertIsInstance(level_info.datasets, list) + self.assertEqual(self.num_levels, len(level_info.datasets)) + meta = level_info.meta + if expect_meta: + self.assertIsInstance(meta, LevelsMeta) + self.assertEqual("1.0", meta.version) + self.assertEqual(self.num_levels, meta.num_levels) + self.assertEqual(False, meta.use_saved_levels) + self.assertEqual(None, meta.tile_size) + self.assertEqual({"chl": "mean"}, meta.agg_methods) + else: + self.assertIsNone(meta) + + def preprocess(self) -> list[tuple[xr.Dataset, str]]: + processor = MultiLevelDatasetProcessor() + return processor.preprocess(self.levels_dir, {}) + + def test_preprocess(self): + datasets = self.preprocess() + self.assert_levels_ok(datasets, expect_meta=True) + + def test_preprocess_no_meta(self): + self.fs.delete(self.meta_path) + datasets = self.preprocess() + self.assert_levels_ok(datasets, expect_meta=False) + + def test_preprocess_with_link(self): + self.fs.copy( + f"{self.levels_dir}/0.zarr", + f"memory://{self.levels_name}.zarr", + recursive=True, + ) + self.fs.delete(f"{self.levels_dir}/0.zarr", recursive=True) + self.fs.write_text(f"{self.levels_dir}/0.link", f"../{self.levels_name}.zarr") + datasets = self.preprocess() + self.assert_levels_ok(datasets, expect_meta=True, expect_link=True) + + def test_preprocess_fail_empty(self): + for i in range(self.num_levels): + self.fs.delete(f"{self.levels_dir}/{i}.zarr", recursive=True) + with pytest.raises(ValueError, match="empty multi-level dataset"): + self.preprocess() + + def test_preprocess_fail_missing(self): + self.fs.delete(f"{self.levels_dir}/1.zarr", recursive=True) + with pytest.raises( + ValueError, 
match="missing dataset for level 1 in multi-level dataset" + ): + self.preprocess() + + def test_postprocess(self): + processor = MultiLevelDatasetProcessor() + ml0 = [Message("m00"), Message("m01")] + ml1 = [Message("10"), Message("m11"), Message("m12")] + messages = processor.postprocess( + [ + ml0, + ml1, + ], + self.levels_dir, + ) + self.assertEqual([*ml0, *ml1], messages) diff --git a/tests/plugins/xcube/rules/test_ml_dataset_meta.py b/tests/plugins/xcube/rules/test_ml_dataset_meta.py new file mode 100644 index 0000000..5ad27a0 --- /dev/null +++ b/tests/plugins/xcube/rules/test_ml_dataset_meta.py @@ -0,0 +1,87 @@ +import xarray as xr +from tests.plugins.xcube.helpers import make_cube_levels +from xrlint.plugins.xcube.rules.ml_dataset_meta import MLDatasetMeta +from xrlint.plugins.xcube.util import ( + LevelsMeta, + set_dataset_level_info, + LevelInfo, + get_dataset_level_info, +) +from xrlint.testing import RuleTest, RuleTester + + +def _replace_meta(dataset: xr.Dataset, meta: LevelsMeta) -> xr.Dataset: + dataset = dataset.copy() + old_level_info = get_dataset_level_info(dataset) + new_level_info = LevelInfo( + level=old_level_info.level, + num_levels=old_level_info.num_levels, + datasets=old_level_info.datasets, + meta=meta, + ) + set_dataset_level_info(dataset, new_level_info) + return dataset + + +levels_with_meta = make_cube_levels( + 4, + 720, + 360, + meta=LevelsMeta( + version="1.0", + num_levels=4, + use_saved_levels=True, + agg_methods={"chl": "mean"}, + ), +) + +valid_dataset_0 = levels_with_meta[0] +valid_dataset_1 = levels_with_meta[1] +valid_dataset_2 = levels_with_meta[2] +valid_dataset_3 = xr.Dataset() + +levels_wo_meta = make_cube_levels(4, 720, 360, force_infos=True) +invalid_dataset_0 = levels_wo_meta[0] +invalid_dataset_1 = _replace_meta( + levels_wo_meta[0].copy(), + meta=LevelsMeta( + version="2.0", # error: != "1.x" + num_levels=0, # error: < 1 + # error: missing use_saved_levels=False + # error: missing agg_methods={"chl": "mean"} + ), +) +invalid_dataset_2 = _replace_meta( + levels_wo_meta[0], + meta=LevelsMeta( + version="1.0", # ok + num_levels=3, # error: != level_info.num_levels + ), +) + +invalid_dataset_3 = _replace_meta( + levels_wo_meta[0], + meta=LevelsMeta( + version="1.0", # ok + num_levels=4, # ok + use_saved_levels=False, # ok + agg_methods={"tsm": "median"}, # error: where is "chl"? 
+ ), +) + +MLDatasetMetaTest = RuleTester.define_test( + "ml-dataset-meta", + MLDatasetMeta, + valid=[ + RuleTest(dataset=valid_dataset_0), + RuleTest(dataset=valid_dataset_1), + RuleTest(dataset=valid_dataset_2), + RuleTest(dataset=valid_dataset_3), + ], + invalid=[ + RuleTest(dataset=invalid_dataset_0), + RuleTest(dataset=invalid_dataset_1), + RuleTest(dataset=invalid_dataset_2), + RuleTest(dataset=invalid_dataset_3), + ], +) diff --git a/tests/plugins/xcube/rules/test_ml_dataset_time.py b/tests/plugins/xcube/rules/test_ml_dataset_time.py new file mode 100644 index 0000000..8d3d6f4 --- /dev/null +++ b/tests/plugins/xcube/rules/test_ml_dataset_time.py @@ -0,0 +1,39 @@ +import xarray as xr +from tests.plugins.xcube.helpers import make_cube_levels +from xrlint.plugins.xcube.rules.ml_dataset_time import MLDatasetTime +from xrlint.plugins.xcube.util import ( + LevelsMeta, +) +from xrlint.testing import RuleTest, RuleTester + + +meta = LevelsMeta( + version="1.0", + num_levels=4, + use_saved_levels=True, + agg_methods={"chl": "mean"}, +) +levels_with_time = make_cube_levels(4, 720, 360, nt=6, meta=meta) +levels_wo_time = make_cube_levels(4, 720, 360, meta=meta) + +valid_dataset_0 = levels_with_time[0] +valid_dataset_1 = levels_with_time[1] +valid_dataset_2 = levels_wo_time[0] +valid_dataset_3 = xr.Dataset() + +invalid_dataset_0 = levels_with_time[0].copy() +invalid_dataset_0["chl"] = invalid_dataset_0["chl"].chunk(time=3) + +MLDatasetTimeTest = RuleTester.define_test( + "ml-dataset-time", + MLDatasetTime, + valid=[ + RuleTest(dataset=valid_dataset_0), + RuleTest(dataset=valid_dataset_1), + RuleTest(dataset=valid_dataset_2), + RuleTest(dataset=valid_dataset_3), + ], + invalid=[ + RuleTest(dataset=invalid_dataset_0), + ], +) diff --git a/tests/plugins/xcube/rules/test_ml_dataset_xy.py b/tests/plugins/xcube/rules/test_ml_dataset_xy.py new file mode 100644 index 0000000..6faa58a --- /dev/null +++ b/tests/plugins/xcube/rules/test_ml_dataset_xy.py @@ -0,0 +1,57 @@ +import xarray as xr +from tests.plugins.xcube.helpers import make_cube_levels +from xrlint.plugins.xcube.rules.ml_dataset_xy import MLDatasetXY +from xrlint.plugins.xcube.util import ( + LevelsMeta, + get_dataset_level_info, +) +from xrlint.testing import RuleTest, RuleTester + + +meta = LevelsMeta( + version="1.0", + num_levels=4, + use_saved_levels=True, + agg_methods={"chl": "mean"}, +) +levels = make_cube_levels(4, 720, 360, nt=3, meta=meta) + +valid_dataset_0 = levels[0] +valid_dataset_1 = levels[1] +valid_dataset_2 = levels[2] +valid_dataset_3 = levels[3] +valid_dataset_4 = xr.Dataset() + +levels = make_cube_levels(4, 720, 360, meta=meta) +valid_dataset_5 = levels[2] +level_info = get_dataset_level_info(valid_dataset_5) +for ds, _ in level_info.datasets: + # remove spatial vars + del ds["chl"] + +levels = make_cube_levels(4, 720, 360, meta=meta) +invalid_dataset_0 = levels[2].copy() +# simulate resolution mismatch by exchanging 2 levels +level_info = get_dataset_level_info(invalid_dataset_0) +level_datasets = level_info.datasets +level_0_dataset = level_datasets[0] +level_1_dataset = level_datasets[1] +level_info.datasets[0] = level_1_dataset +level_info.datasets[1] = level_0_dataset + + +MLDatasetXYTest = RuleTester.define_test( + "ml-dataset-xy", + MLDatasetXY, + valid=[ + RuleTest(dataset=valid_dataset_0), + RuleTest(dataset=valid_dataset_1), + RuleTest(dataset=valid_dataset_2), + RuleTest(dataset=valid_dataset_3), + RuleTest(dataset=valid_dataset_4), + RuleTest(dataset=valid_dataset_5), + ], + invalid=[ + 
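        # invalid_dataset_0 carries the swapped levels from above, so the
        # spatial resolutions no longer decrease by powers of two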
RuleTest(dataset=invalid_dataset_0), + ], +) diff --git a/tests/plugins/xcube/test_plugin.py b/tests/plugins/xcube/test_plugin.py index e0f4654..e469d74 100644 --- a/tests/plugins/xcube/test_plugin.py +++ b/tests/plugins/xcube/test_plugin.py @@ -14,6 +14,9 @@ def test_rules_complete(self): "grid-mapping-naming", "increasing-time", "lat-lon-naming", + "ml-dataset-meta", + "ml-dataset-time", + "ml-dataset-xy", "single-grid-mapping", "time-naming", }, @@ -32,9 +35,9 @@ def test_configs_complete(self): all_rule_names = set(f"xcube/{k}" for k in plugin.rules.keys()) self.assertEqual( all_rule_names, - set(plugin.configs["all"].rules.keys()), + set(plugin.configs["all"][-1].rules.keys()), ) self.assertEqual( all_rule_names, - set(plugin.configs["recommended"].rules.keys()), + set(plugin.configs["recommended"][-1].rules.keys()), ) diff --git a/tests/test_config.py b/tests/test_config.py index dcdfd5b..8fab8c3 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -140,23 +140,30 @@ def test_from_value_ok(self): self.assertIsInstance(config_list, ConfigList) self.assertEqual([Config()], config_list.configs) + config_list = ConfigList.from_value({}) + self.assertIsInstance(config_list, ConfigList) + self.assertEqual([Config()], config_list.configs) + + config = Config.from_value({}) + config_list = ConfigList.from_value(config) + self.assertIsInstance(config_list, ConfigList) + self.assertIs(config, config_list.configs[0]) + # noinspection PyMethodMayBeStatic def test_from_value_fail(self): with pytest.raises( TypeError, match=( r"config_list must be of type" - r" ConfigList \| list\[Config \| dict\], but got dict" + r" ConfigList \| list\[Config \| dict \| str\], but got int" ), ): - ConfigList.from_value({}) + ConfigList.from_value(264) def test_compute_config(self): config_list = ConfigList([Config()]) file_path = "s3://wq-services/datacubes/chl-2.zarr" - self.assertEqual( - Config(name=""), config_list.compute_config(file_path) - ) + self.assertEqual(Config(), config_list.compute_config(file_path)) config_list = ConfigList( [ @@ -167,18 +174,18 @@ def test_compute_config(self): ) file_path = "s3://wq-services/datacubes/chl-2.zarr" self.assertEqual( - Config(name="", settings={"a": 1, "b": 2}), + Config(settings={"a": 1, "b": 2}), config_list.compute_config(file_path), ) # global ignores file_path = "s3://wq-services/datacubes/chl-2.txt" self.assertEqual( - Config(name="", settings={"a": 2, "b": 1}), + Config(settings={"a": 2, "b": 1}), config_list.compute_config(file_path), ) - def test_get_global_filter(self): + def test_split_global_filter(self): config_list = ConfigList( [ Config(files=["**/*.hdf"]), # global file @@ -189,16 +196,18 @@ def test_get_global_filter(self): ] ) - file_filter = config_list.get_global_filter() + new_config_list, file_filter = config_list.split_global_filter() self.assertEqual( FileFilter.from_patterns(["**/*.hdf"], ["**/chl-?.txt"]), file_filter, ) + self.assertEqual(3, len(new_config_list.configs)) - file_filter = config_list.get_global_filter( + new_config_list, file_filter = config_list.split_global_filter( default=FileFilter.from_patterns(["**/*.h5"], None) ) self.assertEqual( FileFilter.from_patterns(["**/*.h5", "**/*.hdf"], ["**/chl-?.txt"]), file_filter, ) + self.assertEqual(3, len(new_config_list.configs)) diff --git a/tests/test_examples.py b/tests/test_examples.py index bf1650c..feb684e 100644 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -9,7 +9,7 @@ class ExamplesTest(TestCase): def test_plugin_config(self): config_list, _ = 
import_value( - "examples.plugin_config", "export_configs", factory=ConfigList.from_value + "examples.plugin_config", "export_config", factory=ConfigList.from_value ) self.assertIsInstance(config_list, ConfigList) self.assertEqual(3, len(config_list.configs)) @@ -17,7 +17,7 @@ def test_plugin_config(self): def test_virtual_plugin_config(self): config_list, _ = import_value( "examples.virtual_plugin_config", - "export_configs", + "export_config", factory=ConfigList.from_value, ) self.assertIsInstance(config_list, ConfigList) diff --git a/tests/test_linter.py b/tests/test_linter.py index 1dfd196..f3b5a29 100644 --- a/tests/test_linter.py +++ b/tests/test_linter.py @@ -277,13 +277,13 @@ def test_processor(self): [ Message( message="Dataset does not have data variables", - node_path="dataset", + node_path="dataset[0]", rule_id="test/dataset-without-data-vars", severity=1, ), Message( message="Dataset does not have data variables", - node_path="dataset", + node_path="dataset[1]", rule_id="test/dataset-without-data-vars", severity=1, ), diff --git a/tests/test_operation.py b/tests/test_operation.py index f0abc79..43c5fc5 100644 --- a/tests/test_operation.py +++ b/tests/test_operation.py @@ -146,14 +146,9 @@ class MyThingOp3(ThingOp): ) self.assertEqual( { - "meta": { - "name": "t3", - "version": "0.0.0", - "description": "What a thing.", - }, - "op_class": ".MyThingOp3'>", + "meta": {"description": "What a thing.", "name": "t3"}, + "op_class": ".MyThingOp3'>", }, rule.to_json(), ) diff --git a/tests/test_plugin.py b/tests/test_plugin.py index 3c3bafb..e909050 100644 --- a/tests/test_plugin.py +++ b/tests/test_plugin.py @@ -38,12 +38,14 @@ class MyRule2(RuleOp): "r2": MyRule2, }, "configs": { - "recommended": { - "rules": { - "hello/r1": "warn", - "hello/r2": "error", - }, - }, + "recommended": [ + { + "rules": { + "hello/r1": "warn", + "hello/r2": "error", + }, + } + ], }, } ) diff --git a/tests/test_rule.py b/tests/test_rule.py index fc20fee..e794f6c 100644 --- a/tests/test_rule.py +++ b/tests/test_rule.py @@ -64,9 +64,8 @@ class MyRule3(RuleOp): rule = Rule(meta=RuleMeta(name="r3"), op_class=MyRule3) self.assertEqual( { - "meta": {"name": "r3", "version": "0.0.0", "type": "problem"}, - "op_class": ".MyRule3'>", + "meta": {"name": "r3"}, + "op_class": ".MyRule3'>", }, rule.to_json(), ) @@ -95,12 +94,7 @@ def test_from_value(self): def test_to_json(self): rule_meta = RuleMeta(name="r1", version="0.1.2", description="Nice one.") self.assertEqual( - { - "name": "r1", - "version": "0.1.2", - "type": "problem", - "description": "Nice one.", - }, + {"description": "Nice one.", "name": "r1", "version": "0.1.2"}, rule_meta.to_json(), ) diff --git a/tests/util/test_codec.py b/tests/util/test_constructible.py similarity index 79% rename from tests/util/test_codec.py rename to tests/util/test_constructible.py index a0cdec1..f5dac61 100644 --- a/tests/util/test_codec.py +++ b/tests/util/test_constructible.py @@ -121,118 +121,6 @@ def test_assumptions(self): # noinspection PyMethodMayBeStatic -class JsonSerializableTest(TestCase): - def test_simple_ok(self): - self.assertEqual( - { - "a": None, - "b": False, - "c": 0, - "d": 0.0, - "e": "abc", - "f": "", - "g": 0, - }, - SimpleTypesContainer().to_json(), - ) - self.assertEqual( - { - "a": "?", - "b": True, - "c": 12, - "d": 34.56, - "e": "uvw", - "f": "", - "g": "off", - }, - SimpleTypesContainer( - a="?", b=True, c=12, d=34.56, e="uvw", f=bool, g="off" - ).to_json(), - ) - - def test_complex_ok(self): - container = ComplexTypesContainer( - q=dict(p=True, 
q=False), - r=dict(u=SimpleTypesContainer(), v=SimpleTypesContainer()), - s=[1, 2, 3], - t=[ - SimpleTypesContainer(c=5, d=6.7), - SimpleTypesContainer(c=8, d=9.1, f=SimpleTypesContainer), - ], - ) - self.assertEqual( - { - "p": { - "a": None, - "b": False, - "c": 0, - "d": 0.0, - "e": "abc", - "f": "", - "g": 0, - }, - "q": {"p": True, "q": False}, - "r": { - "u": { - "a": None, - "b": False, - "c": 0, - "d": 0.0, - "e": "abc", - "f": "", - "g": 0, - }, - "v": { - "a": None, - "b": False, - "c": 0, - "d": 0.0, - "e": "abc", - "f": "", - "g": 0, - }, - }, - "s": [1, 2, 3], - "t": [ - { - "a": None, - "b": False, - "c": 5, - "d": 6.7, - "e": "abc", - "f": "", - "g": 0, - }, - { - "a": None, - "b": False, - "c": 8, - "d": 9.1, - "e": "abc", - "f": "", - "g": 0, - }, - ], - "u": None, - }, - container.to_json(), - ) - - def test_fail(self): - @dataclass() - class Problematic(JsonSerializable): - data: Any - - with pytest.raises( - TypeError, - match=( - "problematic.data must be of type" - " None|bool|int|float|str|dict|list|tuple, but got object" - ), - ): - Problematic(data=object()).to_json(value_name="problematic") - - class ValueConstructibleTest(TestCase): def test_useless_ok(self): container = UselessContainer() diff --git a/tests/util/test_serializable.py b/tests/util/test_serializable.py new file mode 100644 index 0000000..4206af9 --- /dev/null +++ b/tests/util/test_serializable.py @@ -0,0 +1,237 @@ +from dataclasses import dataclass, field +from typing import Any, Literal + +from unittest import TestCase + +import pytest + +from xrlint.util.serializable import JsonSerializable + + +class PlainSimpleTypesContainer(JsonSerializable): + def __init__( + self, + a: Any = None, + b: bool = False, + c: int = 0, + d: float = 0.0, + e: str = "abc", + f: type = int, + g: Literal[0, 1, True, False, "on", "off"] = 0, + ): + self.a = a + self.b = b + self.c = c + self.d = d + self.e = e + self.f = f + self.g = g + + +class PlainComplexTypesContainer(JsonSerializable): + def __init__( + self, + p: PlainSimpleTypesContainer = PlainSimpleTypesContainer(), + q: dict[str, bool] = None, + r: dict[str, PlainSimpleTypesContainer] = None, + s: list[int] = None, + t: list[PlainSimpleTypesContainer] = None, + u: int | float | None = None, + ): + self.p = p + self.q = q or {} + self.r = r or {} + self.s = s or [] + self.t = t or [] + self.u = u + + +@dataclass() +class DataclassSimpleTypesContainer(JsonSerializable): + a: Any = None + b: bool = False + c: int = 0 + d: float = 0.0 + e: str = "abc" + f: type = int + g: Literal[0, 1, True, False, "on", "off"] = 0 + + +@dataclass() +class DataclassComplexTypesContainer(JsonSerializable): + p: DataclassSimpleTypesContainer = field( + default_factory=DataclassSimpleTypesContainer + ) + q: dict[str, bool] = field(default_factory=dict) + r: dict[str, DataclassSimpleTypesContainer] = field(default_factory=dict) + s: list[int] = field(default_factory=list) + t: list[DataclassSimpleTypesContainer] = field(default_factory=list) + u: int | float | None = None + + +# noinspection PyMethodMayBeStatic +class JsonSerializableTest(TestCase): + def test_plain_simple_ok(self): + self.assertEqual( + { + "a": None, + "b": False, + "c": 0, + "d": 0.0, + "e": "abc", + "f": "", + "g": 0, + }, + PlainSimpleTypesContainer().to_json(), + ) + self.assertEqual( + { + "a": "?", + "b": True, + "c": 12, + "d": 34.56, + "e": "uvw", + "f": "", + "g": "off", + }, + PlainSimpleTypesContainer( + a="?", b=True, c=12, d=34.56, e="uvw", f=bool, g="off" + ).to_json(), + ) + + def 
test_plain_complex_ok(self): + container = PlainComplexTypesContainer( + q=dict(p=True, q=False), + r=dict(u=PlainSimpleTypesContainer(), v=PlainSimpleTypesContainer()), + s=[1, 2, 3], + t=[ + PlainSimpleTypesContainer(c=5, d=6.7), + PlainSimpleTypesContainer(c=8, d=9.1, f=PlainSimpleTypesContainer), + ], + ) + self.assertEqual( + { + "p": { + "a": None, + "b": False, + "c": 0, + "d": 0.0, + "e": "abc", + "f": "", + "g": 0, + }, + "q": {"p": True, "q": False}, + "r": { + "u": { + "a": None, + "b": False, + "c": 0, + "d": 0.0, + "e": "abc", + "f": "", + "g": 0, + }, + "v": { + "a": None, + "b": False, + "c": 0, + "d": 0.0, + "e": "abc", + "f": "", + "g": 0, + }, + }, + "s": [1, 2, 3], + "t": [ + { + "a": None, + "b": False, + "c": 5, + "d": 6.7, + "e": "abc", + "f": "", + "g": 0, + }, + { + "a": None, + "b": False, + "c": 8, + "d": 9.1, + "e": "abc", + "f": "", + "g": 0, + }, + ], + "u": None, + }, + container.to_json(), + ) + + def test_dataclass_simple_ok(self): + self.assertEqual( + {}, + DataclassSimpleTypesContainer().to_json(), + ) + self.assertEqual( + { + "a": "?", + "b": True, + "c": 12, + "d": 34.56, + "e": "uvw", + "f": "", + "g": "off", + }, + DataclassSimpleTypesContainer( + a="?", b=True, c=12, d=34.56, e="uvw", f=bool, g="off" + ).to_json(), + ) + + def test_dataclass_complex_ok(self): + container = DataclassComplexTypesContainer( + q=dict(p=True, q=False), + r=dict( + u=DataclassSimpleTypesContainer(), v=DataclassSimpleTypesContainer() + ), + s=[1, 2, 3], + t=[ + DataclassSimpleTypesContainer(c=5, d=6.7), + DataclassSimpleTypesContainer( + c=8, d=9.1, f=DataclassSimpleTypesContainer + ), + ], + ) + self.assertEqual( + { + "p": {}, + "q": {"p": True, "q": False}, + "r": {"u": {}, "v": {}}, + "s": [1, 2, 3], + "t": [ + {"c": 5, "d": 6.7}, + { + "c": 8, + "d": 9.1, + "f": ( + "" + ), + }, + ], + }, + container.to_json(), + ) + + def test_fail(self): + @dataclass() + class Problematic(JsonSerializable): + data: Any + + with pytest.raises( + TypeError, + match=( + "problematic.data must be of type" + " None|bool|int|float|str|dict|list|tuple, but got object" + ), + ): + Problematic(data=object()).to_json(value_name="problematic") diff --git a/xrlint/_linter/apply.py b/xrlint/_linter/apply.py index 7c0993c..0080eb8 100644 --- a/xrlint/_linter/apply.py +++ b/xrlint/_linter/apply.py @@ -30,7 +30,15 @@ def apply_rule( _visit_dataset_node( rule_op, context, - DatasetNode(parent=None, path="dataset", dataset=context.dataset), + DatasetNode( + parent=None, + path=( + "dataset" + if context.file_index is None + else f"dataset[{context.file_index}]" + ), + dataset=context.dataset, + ), ) except RuleExit: # This is ok, the rule requested it. 
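(Aside: the node-path convention introduced above, condensed into a tiny standalone sketch; `dataset_node_path` is a made-up helper name, not code from this diff.)

```python
# Sketch of the new node-path convention: when a processor such as
# "xcube/multi-level-dataset" expands one file path into several datasets,
# each dataset's root node path carries the dataset index.
def dataset_node_path(file_index: int | None) -> str:
    # file_index is None when a file path yields a single dataset
    return "dataset" if file_index is None else f"dataset[{file_index}]"

assert dataset_node_path(None) == "dataset"    # ordinary files
assert dataset_node_path(0) == "dataset[0]"    # first dataset from a processor
assert dataset_node_path(1) == "dataset[1]"    # second dataset, and so on
```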
diff --git a/xrlint/_linter/rulectx.py b/xrlint/_linter/rulectx.py index d86e8e2..db3252f 100644 --- a/xrlint/_linter/rulectx.py +++ b/xrlint/_linter/rulectx.py @@ -16,13 +16,16 @@ def __init__( config: Config, dataset: xr.Dataset, file_path: str, + file_index: int | None, ): assert config is not None assert dataset is not None assert file_path is not None + assert file_index is None or isinstance(file_index, int) self._config = config self._dataset = dataset self._file_path = file_path + self._file_index = file_index self.messages: list[Message] = [] self.rule_id: str | None = None self.severity: Literal[1, 2] = SEVERITY_ERROR @@ -34,7 +37,6 @@ def config(self) -> Config: @property def settings(self) -> dict[str, Any]: - assert self._config is not None return self._config.settings or {} @property @@ -43,9 +45,12 @@ def dataset(self) -> xr.Dataset: @property def file_path(self) -> str: - assert self._file_path is not None return self._file_path + @property + def file_index(self) -> int | None: + return self._file_index + def report( self, message: str, diff --git a/xrlint/_linter/verify.py b/xrlint/_linter/verify.py index fa95148..328bb9c 100644 --- a/xrlint/_linter/verify.py +++ b/xrlint/_linter/verify.py @@ -18,7 +18,7 @@ def verify_dataset(config: Config, dataset: Any, file_path: str | None): assert isinstance(file_path, (str, type(None))) if isinstance(dataset, xr.Dataset): file_path = file_path or _get_file_path_for_dataset(dataset) - messages = _verify_dataset(config, dataset, file_path) + messages = _verify_dataset(config, dataset, file_path, None) else: file_path = file_path or _get_file_path_for_source(dataset) messages = _open_and_verify_dataset(config, dataset, file_path) @@ -29,12 +29,13 @@ def _verify_dataset( config: Config, dataset: xr.Dataset, file_path: str, + file_index: int | None, ) -> list[Message]: assert isinstance(config, Config) assert isinstance(dataset, xr.Dataset) assert isinstance(file_path, str) - context = RuleContextImpl(config, dataset, file_path) + context = RuleContextImpl(config, dataset, file_path, file_index) if not config.rules: context.report("No rules configured or applicable.", fatal=True) @@ -61,7 +62,10 @@ def _open_and_verify_dataset( except (OSError, ValueError, TypeError) as e: return [Message(message=str(e), fatal=True, severity=2)] return processor_op.postprocess( - [_verify_dataset(config, ds, path) for ds, path in ds_path_list], + [ + _verify_dataset(config, ds, path, i) + for i, (ds, path) in enumerate(ds_path_list) + ], file_path, ) else: @@ -70,7 +74,7 @@ def _open_and_verify_dataset( except (OSError, ValueError, TypeError) as e: return [Message(message=str(e), fatal=True, severity=2)] with dataset: - return _verify_dataset(config, dataset, file_path) + return _verify_dataset(config, dataset, file_path, None) def _open_dataset( diff --git a/xrlint/cli/config.py b/xrlint/cli/config.py index 25c2aaf..f9cf0b5 100644 --- a/xrlint/cli/config.py +++ b/xrlint/cli/config.py @@ -87,7 +87,7 @@ def _read_config_python(config_path: str) -> Any: try: return import_value( module_name, - "export_configs", + "export_config", factory=ConfigList.from_value, )[0] except ValueImportError as e: diff --git a/xrlint/cli/engine.py b/xrlint/cli/engine.py index 71d44bd..b40d99c 100644 --- a/xrlint/cli/engine.py +++ b/xrlint/cli/engine.py @@ -21,7 +21,7 @@ from xrlint.formatters import export_formatters from xrlint.linter import Linter from xrlint.plugin import Plugin -from xrlint.result import Message, Result, ResultStats +from xrlint.result import Result, 
ResultStats
 from xrlint.util.filefilter import FileFilter
 
 DEFAULT_GLOBAL_FILTER = FileFilter.from_patterns(
@@ -147,25 +147,52 @@ def verify_datasets(self, files: Iterable[str]) -> Iterator[Result]:
 
         Returns:
             Iterator of reports.
         """
-        global_filter = self.config_list.get_global_filter(
+        linter = Linter()
+        for file_path, config in self.get_files(files):
+            yield linter.verify_dataset(file_path, config=config)
+
+    def get_files(self, file_paths: Iterable[str]) -> Iterator[tuple[str, Config]]:
+        """Provide an iterator over the given files or directories.
+
+        Directories in `file_paths` that are not filtered out will be
+        recursively traversed.
+
+        Args:
+            file_paths: Iterable of files or directories.
+
+        Returns:
+            An iterator of (file path, configuration) pairs for the
+            files and directories that are not filtered out.
+        """
+        config_list, global_filter = self.config_list.split_global_filter(
             default=DEFAULT_GLOBAL_FILTER
         )
-        linter = Linter()
-        for file_path, is_dir in get_files(files, global_filter):
-            config = self.get_config_for_file(file_path)
+
+        def compute_config(p: str):
+            return config_list.compute_config(p) if global_filter.accept(p) else None
+
+        for file_path in file_paths:
+            _fs, root = fsspec.url_to_fs(file_path)
+            fs: fsspec.AbstractFileSystem = _fs
+
+            config = compute_config(file_path)
             if config is not None:
-                yield linter.verify_dataset(file_path, config=config)
-            else:
-                yield Result.new(
-                    config=config,
-                    file_path=file_path,
-                    messages=[
-                        Message(
-                            message="No configuration matches this file.",
-                            severity=2,
-                        )
-                    ],
-                )
+                yield file_path, config
+                continue
+
+            if fs.isdir(root):
+                for path, dirs, files in fs.walk(root, topdown=True):
+                    for d in list(dirs):
+                        d_path = f"{path}/{d}"
+                        c = compute_config(d_path)
+                        if c is not None:
+                            dirs.remove(d)
+                            yield d_path, c
+
+                    for f in files:
+                        f_path = f"{path}/{f}"
+                        c = compute_config(f_path)
+                        if c is not None:
+                            yield f_path, c
 
     def format_results(self, results: Iterable[Result]) -> str:
         """Format the given results.
@@ -220,38 +247,3 @@ def init_config_file(cls) -> None:
         with open(file_path, "w") as f:
             f.write(INIT_CONFIG_YAML)
         click.echo(f"Configuration template written to {file_path}")
-
-
-def get_files(
-    file_paths: Iterable[str], global_filter: FileFilter
-) -> Iterator[tuple[str, bool | None]]:
-    """Provide an iterator for the list of files or directories.
-
-    Directories in `files` that are not filtered out will be
-    recursively traversed.
-
-    Args:
-        file_paths: Iterable of files or directory.
-        global_filter: A file filter that includes files that
-            covered by global file patterns and not excluded
-            by global ignore patterns.
-
-    Returns:
-        An iterator of filtered files or directories.
- """ - for file_path in file_paths: - _fs, root = fsspec.url_to_fs(file_path) - fs: fsspec.AbstractFileSystem = _fs - _dir = fs.isdir(root) - if global_filter.accept(file_path): - yield file_path, _dir - elif _dir: - for path, dirs, files in fs.walk(root): - for d in dirs: - d_path = f"{path}/{d}" - if global_filter.accept(d_path): - yield d_path, True - for f in files: - f_path = f"{path}/{f}" - if global_filter.accept(f_path): - yield f_path, False diff --git a/xrlint/config.py b/xrlint/config.py index 6d26d72..ee294d4 100644 --- a/xrlint/config.py +++ b/xrlint/config.py @@ -6,7 +6,7 @@ from xrlint.util.constructible import MappingConstructible, ValueConstructible from xrlint.util.filefilter import FileFilter from xrlint.util.merge import merge_arrays, merge_dicts, merge_set_lists, merge_values -from xrlint.util.serializable import JsonSerializable, JsonValue +from xrlint.util.serializable import JsonSerializable if TYPE_CHECKING: # pragma: no cover # make IDEs and flake8 happy @@ -45,10 +45,11 @@ def get_core_config( CORE_PLUGIN_NAME: core_plugin, }, ) - if config_name: - return config.merge(core_plugin.configs[config_name]) - else: + if config_name is None: return config + config_list = core_plugin.configs[config_name] + assert len(config_list) == 1 + return config.merge(config_list[0]) def split_config_spec(config_spec: str) -> tuple[str, str]: @@ -154,6 +155,7 @@ def empty(self) -> bool: return not ( self.linter_options or self.opener_options + or self.processor or self.plugins or self.rules or self.settings @@ -292,9 +294,6 @@ def value_name(cls) -> str: def value_type_name(cls) -> str: return "Config | dict | None" - def to_dict(self, value_name: str | None = None) -> dict[str, JsonValue]: - return {k: v for k, v in super().to_dict().items() if v is not None} - @dataclass(frozen=True) class ConfigList(ValueConstructible, JsonSerializable): @@ -308,16 +307,21 @@ class ConfigList(ValueConstructible, JsonSerializable): configs: list[Config] = field(default_factory=list) """The list of configuration objects.""" - def get_global_filter(self, default: FileFilter | None = None) -> FileFilter: + def split_global_filter( + self, default: FileFilter | None = None + ) -> tuple["ConfigList", FileFilter]: """Get a global file filter for this configuration list.""" global_filter = FileFilter( default.files if default else (), default.ignores if default else (), ) + configs = [] for c in self.configs: if c.empty and not c.file_filter.empty: global_filter = global_filter.merge(c.file_filter) - return global_filter + else: + configs.append(c) + return ConfigList(configs=configs), global_filter def compute_config(self, file_path: str) -> Config | None: """Compute the configuration object for the given file path. @@ -341,7 +345,6 @@ def compute_config(self, file_path: str) -> Config | None: # Note, computed configurations do not have "files" and "ignores" return Config( - name="", linter_options=config.linter_options, opener_options=config.opener_options, processor=config.processor, @@ -359,12 +362,14 @@ def from_value(cls, value: Any, value_name: str | None = None) -> "ConfigList": Args: value: A `ConfigList` object or `list` of items which can be converted into `Config` objects including configuration - names of tyype `str`. The latter are resolved against + names of type `str`. The latter are resolved against the plugin configurations seen so far in the list. value_name: A value's name. Returns: A `ConfigList` object. 
""" + if isinstance(value, (Config, dict)): + return ConfigList(configs=[Config.from_value(value)]) return super().from_value(value, value_name=value_name) @classmethod @@ -375,12 +380,13 @@ def _from_sequence(cls, value: Sequence, value_name: str) -> "ConfigList": if isinstance(item, str): if CORE_PLUGIN_NAME not in plugins: plugins.update({CORE_PLUGIN_NAME: get_core_plugin()}) - config = cls._get_named_config(item, plugins) + new_configs = cls._get_named_config_list(item, plugins) else: - config = Config.from_value(item) - configs.append(config) - plugins.update(config.plugins if config.plugins else {}) - return ConfigList(configs) + new_configs = [Config.from_value(item)] + for config in new_configs: + configs.append(config) + plugins.update(config.plugins if config.plugins else {}) + return ConfigList(configs=configs) @classmethod def value_name(cls) -> str: @@ -388,10 +394,12 @@ def value_name(cls) -> str: @classmethod def value_type_name(cls) -> str: - return "ConfigList | list[Config | dict]" + return "ConfigList | list[Config | dict | str]" @classmethod - def _get_named_config(cls, config_spec: str, plugins: dict[str, "Plugin"]): + def _get_named_config_list( + cls, config_spec: str, plugins: dict[str, "Plugin"] + ) -> list[Config]: plugin_name, config_name = ( config_spec.split("/", maxsplit=1) if "/" in config_spec @@ -400,5 +408,4 @@ def _get_named_config(cls, config_spec: str, plugins: dict[str, "Plugin"]): plugin: Plugin | None = plugins.get(plugin_name) if plugin is None or not plugin.configs or config_name not in plugin.configs: raise ValueError(f"configuration {config_spec!r} not found") - config_value = plugin.configs[config_name] - return config_value + return ConfigList.from_value(plugin.configs[config_name]).configs diff --git a/xrlint/operation.py b/xrlint/operation.py index 9f1c35f..87332bd 100644 --- a/xrlint/operation.py +++ b/xrlint/operation.py @@ -46,13 +46,6 @@ class OperationMeta(MappingConstructible["OpMetadata"], JsonSerializable): Must have the form ":", if given. """ - def to_dict(self, value_name: str | None = None) -> dict[str, JsonValue]: - return { - k: v - for k, v in super().to_dict(value_name=value_name).items() - if v is not None - } - class Operation(MappingConstructible["Operation"], JsonSerializable): """A mixin class that is used by operation classes. diff --git a/xrlint/plugin.py b/xrlint/plugin.py index 1900d79..a7cab18 100644 --- a/xrlint/plugin.py +++ b/xrlint/plugin.py @@ -1,7 +1,7 @@ from dataclasses import dataclass, field from typing import Any, Callable, Literal, Type -from xrlint.config import Config +from xrlint.config import Config, ConfigList from xrlint.processor import Processor, ProcessorOp, define_processor from xrlint.rule import Rule, RuleOp, define_rule from xrlint.util.constructible import MappingConstructible @@ -50,8 +50,8 @@ class Plugin(MappingConstructible, JsonSerializable): """A dictionary containing named processors. """ - configs: dict[str, Config] = field(default_factory=dict) - """A dictionary containing named configurations.""" + configs: dict[str, list[Config]] = field(default_factory=dict) + """A dictionary containing named configuration lists.""" def define_rule( self, @@ -99,6 +99,27 @@ def define_processor( registry=self.processors, ) + def define_config( + self, + name: str, + value: list[Config | dict[str, Any]] | Config | dict[str, Any], + ) -> list[Config]: + """Define a named configuration. + + Args: + name: The name of the configuration. + value: The configuration-like object or list. 
+ A configuration-like object is either a + [Config][xrlint.config.Config] or a `dict` that + represents a configuration. + + Returns: + A list of `Config` objects. + """ + configs = ConfigList.from_value(value).configs + self.configs[name] = configs + return configs + @classmethod def _from_str(cls, value: str, value_name: str) -> "Plugin": plugin, plugin_ref = import_value( diff --git a/xrlint/plugins/core/__init__.py b/xrlint/plugins/core/__init__.py index 2457614..5a69302 100644 --- a/xrlint/plugins/core/__init__.py +++ b/xrlint/plugins/core/__init__.py @@ -1,14 +1,14 @@ -from xrlint.config import Config from xrlint.plugin import Plugin from xrlint.util.importutil import import_submodules def export_plugin() -> Plugin: - from .rules import plugin + from .plugin import plugin import_submodules("xrlint.plugins.core.rules") - plugin.configs["recommended"] = Config.from_value( + plugin.define_config( + "recommended", { "name": "recommended", "rules": { @@ -23,13 +23,15 @@ def export_plugin() -> Plugin: "time-coordinate": "error", "var-units-attr": "warn", }, - } + }, ) - plugin.configs["all"] = Config.from_value( + + plugin.define_config( + "all", { "name": "all", "rules": {rule_id: "error" for rule_id in plugin.rules.keys()}, - } + }, ) return plugin diff --git a/xrlint/plugins/core/plugin.py b/xrlint/plugins/core/plugin.py new file mode 100644 index 0000000..ab64c5d --- /dev/null +++ b/xrlint/plugins/core/plugin.py @@ -0,0 +1,9 @@ +from xrlint.constants import CORE_PLUGIN_NAME +from xrlint.plugin import new_plugin +from xrlint.version import version + +plugin = new_plugin( + name=CORE_PLUGIN_NAME, + version=version, + ref="xrlint.plugins.core:export_plugin", +) diff --git a/xrlint/plugins/core/rules/__init__.py b/xrlint/plugins/core/rules/__init__.py index ab64c5d..e69de29 100644 --- a/xrlint/plugins/core/rules/__init__.py +++ b/xrlint/plugins/core/rules/__init__.py @@ -1,9 +0,0 @@ -from xrlint.constants import CORE_PLUGIN_NAME -from xrlint.plugin import new_plugin -from xrlint.version import version - -plugin = new_plugin( - name=CORE_PLUGIN_NAME, - version=version, - ref="xrlint.plugins.core:export_plugin", -) diff --git a/xrlint/plugins/core/rules/coords_for_dims.py b/xrlint/plugins/core/rules/coords_for_dims.py index 8a4a056..b6c5f44 100644 --- a/xrlint/plugins/core/rules/coords_for_dims.py +++ b/xrlint/plugins/core/rules/coords_for_dims.py @@ -1,5 +1,5 @@ from xrlint.node import DatasetNode -from xrlint.plugins.core.rules import plugin +from xrlint.plugins.core.plugin import plugin from xrlint.rule import RuleContext, RuleOp from xrlint.util.formatting import format_item diff --git a/xrlint/plugins/core/rules/dataset_title_attr.py b/xrlint/plugins/core/rules/dataset_title_attr.py index cd27ddc..4abb5e2 100644 --- a/xrlint/plugins/core/rules/dataset_title_attr.py +++ b/xrlint/plugins/core/rules/dataset_title_attr.py @@ -1,5 +1,5 @@ from xrlint.node import DatasetNode -from xrlint.plugins.core.rules import plugin +from xrlint.plugins.core.plugin import plugin from xrlint.rule import RuleContext, RuleOp diff --git a/xrlint/plugins/core/rules/flags.py b/xrlint/plugins/core/rules/flags.py index 0138680..7d88213 100644 --- a/xrlint/plugins/core/rules/flags.py +++ b/xrlint/plugins/core/rules/flags.py @@ -3,7 +3,7 @@ import numpy as np from xrlint.node import DataArrayNode -from xrlint.plugins.core.rules import plugin +from xrlint.plugins.core.plugin import plugin from xrlint.rule import RuleOp, RuleContext diff --git a/xrlint/plugins/core/rules/grid_mappings.py 
b/xrlint/plugins/core/rules/grid_mappings.py index 0f85c76..2050c3e 100644 --- a/xrlint/plugins/core/rules/grid_mappings.py +++ b/xrlint/plugins/core/rules/grid_mappings.py @@ -1,5 +1,5 @@ from xrlint.node import DatasetNode -from xrlint.plugins.core.rules import plugin +from xrlint.plugins.core.plugin import plugin from xrlint.rule import RuleContext, RuleOp diff --git a/xrlint/plugins/core/rules/lat_lon_coordinate.py b/xrlint/plugins/core/rules/lat_lon_coordinate.py index bdcb6e1..903cf15 100644 --- a/xrlint/plugins/core/rules/lat_lon_coordinate.py +++ b/xrlint/plugins/core/rules/lat_lon_coordinate.py @@ -3,7 +3,7 @@ import xarray as xr from xrlint.node import DataArrayNode -from xrlint.plugins.core.rules import plugin +from xrlint.plugins.core.plugin import plugin from xrlint.rule import RuleContext from xrlint.rule import RuleOp diff --git a/xrlint/plugins/core/rules/no_empty_attrs.py b/xrlint/plugins/core/rules/no_empty_attrs.py index 22c3ad3..7405c8e 100644 --- a/xrlint/plugins/core/rules/no_empty_attrs.py +++ b/xrlint/plugins/core/rules/no_empty_attrs.py @@ -1,5 +1,5 @@ from xrlint.node import AttrsNode -from xrlint.plugins.core.rules import plugin +from xrlint.plugins.core.plugin import plugin from xrlint.result import Suggestion from xrlint.rule import RuleContext, RuleOp diff --git a/xrlint/plugins/core/rules/no_empty_chunks.py b/xrlint/plugins/core/rules/no_empty_chunks.py index a4b7186..0e842f9 100644 --- a/xrlint/plugins/core/rules/no_empty_chunks.py +++ b/xrlint/plugins/core/rules/no_empty_chunks.py @@ -1,5 +1,5 @@ from xrlint.node import DataArrayNode -from xrlint.plugins.core.rules import plugin +from xrlint.plugins.core.plugin import plugin from xrlint.rule import RuleContext, RuleExit, RuleOp diff --git a/xrlint/plugins/core/rules/time_coordinate.py b/xrlint/plugins/core/rules/time_coordinate.py index 6842ca9..c2156c3 100644 --- a/xrlint/plugins/core/rules/time_coordinate.py +++ b/xrlint/plugins/core/rules/time_coordinate.py @@ -1,5 +1,5 @@ from xrlint.node import DataArrayNode -from xrlint.plugins.core.rules import plugin +from xrlint.plugins.core.plugin import plugin from xrlint.rule import RuleContext, RuleOp diff --git a/xrlint/plugins/core/rules/var_units_attr.py b/xrlint/plugins/core/rules/var_units_attr.py index cec977f..81d2ba7 100644 --- a/xrlint/plugins/core/rules/var_units_attr.py +++ b/xrlint/plugins/core/rules/var_units_attr.py @@ -1,5 +1,5 @@ from xrlint.node import DataArrayNode -from xrlint.plugins.core.rules import plugin +from xrlint.plugins.core.plugin import plugin from xrlint.rule import RuleContext, RuleOp diff --git a/xrlint/plugins/xcube/__init__.py b/xrlint/plugins/xcube/__init__.py index ee60293..0c008a9 100644 --- a/xrlint/plugins/xcube/__init__.py +++ b/xrlint/plugins/xcube/__init__.py @@ -1,39 +1,63 @@ -from xrlint.config import Config from xrlint.plugin import Plugin +from xrlint.plugins.xcube.constants import ML_FILE_PATTERN from xrlint.util.importutil import import_submodules def export_plugin() -> Plugin: - from .rules import plugin + from .plugin import plugin import_submodules("xrlint.plugins.xcube.rules") + import_submodules("xrlint.plugins.xcube.processors") - plugin.configs["recommended"] = Config.from_value( + common_configs = [ { - "name": "xcube-recommended", "plugins": { "xcube": plugin, }, - "rules": { - "xcube/any-spatial-data-var": "error", - "xcube/cube-dims-order": "error", - "xcube/data-var-colors": "warn", - "xcube/grid-mapping-naming": "warn", - "xcube/increasing-time": "error", - "xcube/lat-lon-naming": "error", - 
"xcube/single-grid-mapping": "error", - "xcube/time-naming": "error", + }, + { + # Add *.levels to globally included list of file types + "files": [ML_FILE_PATTERN], + }, + { + # Specify a processor for *.levels files + "files": [ML_FILE_PATTERN], + "processor": "xcube/multi-level-dataset", + }, + ] + + plugin.define_config( + "recommended", + [ + *common_configs, + { + "rules": { + "xcube/any-spatial-data-var": "error", + "xcube/cube-dims-order": "error", + "xcube/data-var-colors": "warn", + "xcube/grid-mapping-naming": "warn", + "xcube/increasing-time": "error", + "xcube/lat-lon-naming": "error", + "xcube/ml-dataset-meta": "error", + "xcube/ml-dataset-time": "warn", + "xcube/ml-dataset-xy": "error", + "xcube/single-grid-mapping": "error", + "xcube/time-naming": "error", + }, }, - } + ], ) - plugin.configs["all"] = Config.from_value( - { - "name": "xcube-all", - "plugins": { - "xcube": plugin, + + plugin.define_config( + "all", + [ + *common_configs, + { + "rules": { + f"xcube/{rule_id}": "error" for rule_id in plugin.rules.keys() + }, }, - "rules": {f"xcube/{rule_id}": "error" for rule_id in plugin.rules.keys()}, - } + ], ) return plugin diff --git a/xrlint/plugins/xcube/constants.py b/xrlint/plugins/xcube/constants.py index d3727d7..b10b532 100644 --- a/xrlint/plugins/xcube/constants.py +++ b/xrlint/plugins/xcube/constants.py @@ -1,8 +1,14 @@ -LON_NAME = "lon" -LAT_NAME = "lat" -X_NAME = "x" -Y_NAME = "y" -TIME_NAME = "time" +from typing import Final -GM_NAMES = "spatial_ref", "crs" -GM_NAMES_TEXT = " or ".join(repr(gm_name) for gm_name in GM_NAMES) +LON_NAME: Final = "lon" +LAT_NAME: Final = "lat" +X_NAME: Final = "x" +Y_NAME: Final = "y" +TIME_NAME: Final = "time" + +GM_NAMES: Final = "spatial_ref", "crs" +GM_NAMES_TEXT: Final = " or ".join(repr(gm_name) for gm_name in GM_NAMES) + +ML_FILE_PATTERN: Final = "**/*.levels" +ML_META_FILENAME: Final = ".zlevels" +ML_INFO_ATTR: Final = "_LEVEL_INFO" diff --git a/xrlint/plugins/xcube/plugin.py b/xrlint/plugins/xcube/plugin.py new file mode 100644 index 0000000..1de8247 --- /dev/null +++ b/xrlint/plugins/xcube/plugin.py @@ -0,0 +1,8 @@ +from xrlint.plugin import new_plugin +from xrlint.version import version + +plugin = new_plugin( + name="xcube", + version=version, + ref="xrlint.plugins.xcube:export_plugin", +) diff --git a/xrlint/plugins/xcube/processors/__init__.py b/xrlint/plugins/xcube/processors/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/xrlint/plugins/xcube/processors/mldataset.py b/xrlint/plugins/xcube/processors/mldataset.py new file mode 100644 index 0000000..fc44eaf --- /dev/null +++ b/xrlint/plugins/xcube/processors/mldataset.py @@ -0,0 +1,104 @@ +import itertools +import json +import re +from typing import Any + +import fsspec +import xarray as xr + +from xrlint.plugins.xcube.constants import ( + ML_FILE_PATTERN, + ML_META_FILENAME, +) +from xrlint.plugins.xcube.plugin import plugin +from xrlint.plugins.xcube.util import ( + LevelsMeta, + norm_path, + attach_dataset_level_infos, +) +from xrlint.processor import ProcessorOp +from xrlint.result import Message + +level_pattern = re.compile(r"^(\d+)(?:\.zarr)?$") + + +@plugin.define_processor("multi-level-dataset") +class MultiLevelDatasetProcessor(ProcessorOp): + f"""This processor should be used with `files: [{ML_FILE_PATTERN}"]`.""" + + def preprocess( + self, file_path: str, opener_options: dict[str, Any] + ) -> list[tuple[xr.Dataset, str]]: + fs, fs_path = get_filesystem(file_path, opener_options) + + file_names = [ + # extracting the filename could be 
diff --git a/xrlint/plugins/xcube/processors/mldataset.py b/xrlint/plugins/xcube/processors/mldataset.py
new file mode 100644
index 0000000..fc44eaf
--- /dev/null
+++ b/xrlint/plugins/xcube/processors/mldataset.py
@@ -0,0 +1,104 @@
+import itertools
+import json
+import re
+from typing import Any
+
+import fsspec
+import xarray as xr
+
+from xrlint.plugins.xcube.constants import (
+    ML_FILE_PATTERN,
+    ML_META_FILENAME,
+)
+from xrlint.plugins.xcube.plugin import plugin
+from xrlint.plugins.xcube.util import (
+    LevelsMeta,
+    norm_path,
+    attach_dataset_level_infos,
+)
+from xrlint.processor import ProcessorOp
+from xrlint.result import Message
+
+level_pattern = re.compile(r"^(\d+)(?:\.zarr)?$")
+
+
+@plugin.define_processor("multi-level-dataset")
+class MultiLevelDatasetProcessor(ProcessorOp):
+    f"""This processor should be used with `files: ["{ML_FILE_PATTERN}"]`."""
+
+    def preprocess(
+        self, file_path: str, opener_options: dict[str, Any]
+    ) -> list[tuple[xr.Dataset, str]]:
+        fs, fs_path = get_filesystem(file_path, opener_options)
+
+        file_names = [
+            # extracting the filename could be done more robustly
+            f.replace(fs_path, "").strip("/")
+            for f in fs.listdir(fs_path, detail=False)
+        ]
+
+        # check for optional ".zlevels" that provides meta-info
+        meta = None
+        if ML_META_FILENAME in file_names:
+            with fs.open(f"{fs_path}/{ML_META_FILENAME}") as stream:
+                meta = LevelsMeta.from_value(json.load(stream))
+
+        # check for optional "0.link" that locates level 0 somewhere else
+        level_0_path = None
+        if "0.link" in file_names:
+            level_0_path = fs.read_text(f"{fs_path}/0.link")
+
+        level_names, num_levels = parse_levels(file_names, level_0_path)
+
+        engine = opener_options.pop("engine", "zarr")
+
+        level_datasets: list[tuple[xr.Dataset, str]] = []
+        # iterate in level order; listdir() gives no ordering guarantee
+        for _level, level_name in sorted(level_names.items()):
+            level_path = norm_path(f"{file_path}/{level_name}")
+            level_dataset = xr.open_dataset(
+                level_path, engine=engine, **opener_options
+            )
+            level_datasets.append((level_dataset, level_path))
+
+        attach_dataset_level_infos(level_datasets, meta=meta)
+
+        return level_datasets
+
+    def postprocess(
+        self, messages: list[list[Message]], file_path: str
+    ) -> list[Message]:
+        return list(itertools.chain(*messages))
+
+
+def get_filesystem(file_path: str, opener_options: dict[str, Any]):
+    storage_options = (
+        opener_options.get(
+            "storage_options",
+            opener_options.get("backend_kwargs", {}).get("storage_options"),
+        )
+        or {}
+    )
+    _fs, fs_path = fsspec.core.url_to_fs(file_path, **storage_options)
+    fs: fsspec.AbstractFileSystem = _fs
+    fs_path: str = fs_path.replace("\\", "/")
+    return fs, fs_path
+
+
+def parse_levels(
+    file_names: list[str], level_0_path: str | None
+) -> tuple[dict[int, str], int]:
+    level_names: dict[int, str] = {0: level_0_path} if level_0_path else {}
+    for file_name in file_names:
+        m = level_pattern.match(file_name)
+        if m is not None:
+            level = int(m.group(1))
+            level_names[level] = file_name
+    if not level_names:
+        raise ValueError("empty multi-level dataset")
+    num_levels = max(level_names.keys()) + 1
+    for level in range(num_levels):
+        if level not in level_names:
+            raise ValueError(
+                f"missing dataset for level {level} in multi-level dataset"
+            )
+    return level_names, num_levels
diff --git a/xrlint/plugins/xcube/rules/__init__.py b/xrlint/plugins/xcube/rules/__init__.py
index 1de8247..e69de29 100644
--- a/xrlint/plugins/xcube/rules/__init__.py
+++ b/xrlint/plugins/xcube/rules/__init__.py
@@ -1,8 +0,0 @@
-from xrlint.plugin import new_plugin
-from xrlint.version import version
-
-plugin = new_plugin(
-    name="xcube",
-    version=version,
-    ref="xrlint.plugins.xcube:export_plugin",
-)
diff --git a/xrlint/plugins/xcube/rules/any_spatial_data_var.py b/xrlint/plugins/xcube/rules/any_spatial_data_var.py
index 697d2df..26f9375 100644
--- a/xrlint/plugins/xcube/rules/any_spatial_data_var.py
+++ b/xrlint/plugins/xcube/rules/any_spatial_data_var.py
@@ -1,5 +1,5 @@
 from xrlint.node import DatasetNode
-from xrlint.plugins.xcube.rules import plugin
+from xrlint.plugins.xcube.plugin import plugin
 from xrlint.plugins.xcube.util import is_spatial_var
 from xrlint.rule import RuleContext, RuleOp
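To make the `parse_levels()` contract in `mldataset.py` above concrete, a small
usage sketch (file names illustrative):

```python
from xrlint.plugins.xcube.processors.mldataset import parse_levels

# ".zlevels" is ignored; "0.zarr".."2.zarr" match level_pattern
level_names, num_levels = parse_levels(
    [".zlevels", "0.zarr", "1.zarr", "2.zarr"], None
)
assert level_names == {0: "0.zarr", 1: "1.zarr", 2: "2.zarr"}
assert num_levels == 3

# with a "0.link" redirection, level 0 may live outside the directory
level_names, _ = parse_levels(["1.zarr", "2.zarr"], "../base.zarr")
assert level_names[0] == "../base.zarr"

# a gap in the level sequence raises ValueError:
#   parse_levels(["0.zarr", "2.zarr"], None)
```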
xrlint.plugins.xcube.plugin import plugin
 from xrlint.rule import RuleContext, RuleOp
diff --git a/xrlint/plugins/xcube/rules/data_var_colors.py b/xrlint/plugins/xcube/rules/data_var_colors.py
index 39225d2..48d8fc1 100644
--- a/xrlint/plugins/xcube/rules/data_var_colors.py
+++ b/xrlint/plugins/xcube/rules/data_var_colors.py
@@ -1,5 +1,5 @@
 from xrlint.node import DataArrayNode
-from xrlint.plugins.xcube.rules import plugin
+from xrlint.plugins.xcube.plugin import plugin
 from xrlint.plugins.xcube.util import is_spatial_var
 from xrlint.rule import RuleContext, RuleOp
diff --git a/xrlint/plugins/xcube/rules/grid_mapping_naming.py b/xrlint/plugins/xcube/rules/grid_mapping_naming.py
index dc079a3..cca1f8d 100644
--- a/xrlint/plugins/xcube/rules/grid_mapping_naming.py
+++ b/xrlint/plugins/xcube/rules/grid_mapping_naming.py
@@ -1,6 +1,6 @@
 from xrlint.node import DatasetNode
 from xrlint.plugins.xcube.constants import GM_NAMES, GM_NAMES_TEXT
-from xrlint.plugins.xcube.rules import plugin
+from xrlint.plugins.xcube.plugin import plugin
 from xrlint.rule import RuleContext, RuleOp
diff --git a/xrlint/plugins/xcube/rules/increasing_time.py b/xrlint/plugins/xcube/rules/increasing_time.py
index 331fb05..ee38997 100644
--- a/xrlint/plugins/xcube/rules/increasing_time.py
+++ b/xrlint/plugins/xcube/rules/increasing_time.py
@@ -1,7 +1,7 @@
 import numpy as np
 
 from xrlint.node import DataArrayNode
-from xrlint.plugins.xcube.rules import plugin
+from xrlint.plugins.xcube.plugin import plugin
 from xrlint.rule import RuleContext, RuleExit, RuleOp
 from xrlint.util.formatting import format_count, format_seq
diff --git a/xrlint/plugins/xcube/rules/lat_lon_naming.py b/xrlint/plugins/xcube/rules/lat_lon_naming.py
index 9ae2c98..e3d4a14 100644
--- a/xrlint/plugins/xcube/rules/lat_lon_naming.py
+++ b/xrlint/plugins/xcube/rules/lat_lon_naming.py
@@ -1,6 +1,6 @@
 from xrlint.node import DatasetNode
 from xrlint.plugins.xcube.constants import LAT_NAME, LON_NAME
-from xrlint.plugins.xcube.rules import plugin
+from xrlint.plugins.xcube.plugin import plugin
 from xrlint.rule import RuleContext, RuleOp
 
 INVALID_LAT_NAMES = {"ltd", "latitude"}
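For orientation before the new rules below: a `.zlevels` meta-info file that
`ml-dataset-meta` would accept might look as follows. It is shown as a Python
literal of the JSON content; the variable names and values are made up, and the
exact schema is defined by xcube, not by this rule:

```python
# JSON content of ".zlevels", rendered as a Python literal
zlevels_meta = {
    "version": "1.0",
    "num_levels": 3,
    "tile_size": [512, 512],  # optional
    "use_saved_levels": True,
    "agg_methods": {  # one entry per spatial data variable
        "chl": "mean",
        "tsm": "mean",
    },
}
```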
diff --git a/xrlint/plugins/xcube/rules/ml_dataset_meta.py b/xrlint/plugins/xcube/rules/ml_dataset_meta.py
new file mode 100644
index 0000000..7d19ab3
--- /dev/null
+++ b/xrlint/plugins/xcube/rules/ml_dataset_meta.py
@@ -0,0 +1,80 @@
+from xrlint.node import DatasetNode
+from xrlint.plugins.xcube.constants import ML_META_FILENAME
+from xrlint.plugins.xcube.plugin import plugin
+from xrlint.plugins.xcube.util import get_dataset_level_info, is_spatial_var
+from xrlint.rule import RuleContext, RuleOp
+from xrlint.util.formatting import format_item
+
+
+@plugin.define_rule(
+    "ml-dataset-meta",
+    version="1.0.0",
+    type="suggestion",
+    description=(
+        f"Multi-level datasets should provide {ML_META_FILENAME!r}"
+        f" meta information file and if so, it should be consistent."
+    ),
+    docs_url=(
+        "https://xcube.readthedocs.io/en/latest/mldatasets.html#the-xcube-levels-format"
+    ),
+)
+class MLDatasetMeta(RuleOp):
+    def dataset(self, ctx: RuleContext, node: DatasetNode):
+        level_info = get_dataset_level_info(node.dataset)
+        if level_info is None:
+            # ok, this rule applies only to level datasets opened
+            # by the xcube multi-level processor
+            return
+
+        level = level_info.level
+        if level > 0:
+            # ok, this rule only applies to level 0
+            return
+
+        meta = level_info.meta
+        if meta is None:
+            ctx.report(
+                f"Missing {ML_META_FILENAME!r} meta-info file,"
+                f" therefore the dataset cannot be extended."
+            )
+            return
+
+        if not meta.version.startswith("1."):
+            ctx.report(f"Unsupported {ML_META_FILENAME!r} meta-info version")
+
+        if meta.num_levels <= 0:
+            ctx.report(
+                f"Invalid 'num_levels' in {ML_META_FILENAME!r} meta-info:"
+                f" {meta.num_levels}"
+            )
+        elif meta.num_levels != level_info.num_levels:
+            ctx.report(
+                f"Expected {format_item(meta.num_levels, 'level')},"
+                f" but found {level_info.num_levels}"
+            )
+
+        if meta.use_saved_levels is None:
+            ctx.report(
+                f"Missing value for 'use_saved_levels'"
+                f" in {ML_META_FILENAME!r} meta-info."
+            )
+
+        if not meta.agg_methods:
+            ctx.report(
+                f"Missing value for 'agg_methods' in {ML_META_FILENAME!r} meta-info."
+            )
+        else:
+            for var_name, var in node.dataset.data_vars.items():
+                if is_spatial_var(var) and not meta.agg_methods.get(var_name):
+                    ctx.report(
+                        f"Missing value for variable {var_name!r}"
+                        f" in 'agg_methods' of {ML_META_FILENAME!r} meta-info."
+                    )
+            for var_name in meta.agg_methods.keys():
+                if var_name not in node.dataset:
+                    ctx.report(
+                        f"Variable {var_name!r} not found in dataset, but specified"
+                        f" in 'agg_methods' of {ML_META_FILENAME!r} meta-info."
+                    )
+
+        # Later: check meta.tile_size as well...
diff --git a/xrlint/plugins/xcube/rules/ml_dataset_time.py b/xrlint/plugins/xcube/rules/ml_dataset_time.py
new file mode 100644
index 0000000..0273af4
--- /dev/null
+++ b/xrlint/plugins/xcube/rules/ml_dataset_time.py
@@ -0,0 +1,39 @@
+from xrlint.node import DatasetNode
+from xrlint.plugins.xcube.constants import TIME_NAME
+from xrlint.plugins.xcube.plugin import plugin
+from xrlint.plugins.xcube.util import get_dataset_level_info, is_spatial_var
+from xrlint.rule import RuleContext, RuleOp
+from xrlint.util.formatting import format_seq
+
+
+@plugin.define_rule(
+    "ml-dataset-time",
+    version="1.0.0",
+    type="problem",
+    description=(
+        "The `time` dimension of multi-level datasets should use a chunk size of 1."
+        " This allows for faster image tile generation for visualisation."
+    ),
+    docs_url="https://xcube.readthedocs.io/en/latest/mldatasets.html#definition",
+)
+class MLDatasetTime(RuleOp):
+    def dataset(self, ctx: RuleContext, node: DatasetNode):
+        level_info = get_dataset_level_info(node.dataset)
+        if level_info is None:
+            # ok, this rule applies only to level datasets opened
+            # by the xcube multi-level processor
+            return
+
+        if TIME_NAME not in node.dataset.sizes or node.dataset.sizes[TIME_NAME] <= 1:
+            # ok, no time dimension used or no time extent
+            return
+
+        for var_name, var in node.dataset.data_vars.items():
+            if is_spatial_var(var) and TIME_NAME in var.dims and var.chunks is not None:
+                time_index = var.dims.index(TIME_NAME)
+                time_chunks = var.chunks[time_index]
+                if not all(c == 1 for c in time_chunks):
+                    ctx.report(
+                        f"Variable {var_name!r} uses chunking for {TIME_NAME!r}"
+                        f" that differs from one: {format_seq(time_chunks)}"
+                    )
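A quick illustration of the chunking that `ml-dataset-time` above accepts and
reports (synthetic data, names made up; chunking requires dask, which this
change adds to the test environment):

```python
import numpy as np
import xarray as xr

cube = xr.Dataset({"chl": (("time", "lat", "lon"), np.zeros((4, 8, 8)))})

ok = cube.chunk({"time": 1, "lat": 8, "lon": 8})   # time chunk size 1: passes
bad = cube.chunk({"time": 4, "lat": 8, "lon": 8})  # one chunk over time: reported
```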
diff --git a/xrlint/plugins/xcube/rules/ml_dataset_xy.py b/xrlint/plugins/xcube/rules/ml_dataset_xy.py
new file mode 100644
index 0000000..863f9a7
--- /dev/null
+++ b/xrlint/plugins/xcube/rules/ml_dataset_xy.py
@@ -0,0 +1,57 @@
+import math
+
+from xrlint.node import DatasetNode
+from xrlint.plugins.xcube.plugin import plugin
+from xrlint.plugins.xcube.util import get_spatial_size, get_dataset_level_info
+from xrlint.rule import RuleContext, RuleOp
+
+
+@plugin.define_rule(
+    "ml-dataset-xy",
+    version="1.0.0",
+    type="problem",
+    description=(
+        "Multi-level dataset levels should provide spatial resolutions"
+        " decreasing by powers of two."
+    ),
+    docs_url="https://xcube.readthedocs.io/en/latest/mldatasets.html#definition",
+)
+class MLDatasetXY(RuleOp):
+    def dataset(self, ctx: RuleContext, node: DatasetNode):
+        level_info = get_dataset_level_info(node.dataset)
+        if level_info is None:
+            # ok, this rule applies only to level datasets opened
+            # by the xcube multi-level processor
+            return
+
+        level = level_info.level
+        if level == 0:
+            # ok, this rule only applies to levels > 0
+            return
+
+        datasets = level_info.datasets
+        level_0_dataset, _ = datasets[0]
+        l0_size = get_spatial_size(level_0_dataset)
+        if l0_size is None:
+            # ok, maybe no spatial data vars?
+            return
+
+        (x_name, level_0_width), (y_name, level_0_height) = l0_size
+        level_width = node.dataset.sizes.get(x_name)
+        level_height = node.dataset.sizes.get(y_name)
+        expected_level_width = math.ceil(level_0_width / (1 << level))
+        expected_level_height = math.ceil(level_0_height / (1 << level))
+
+        if level_width != expected_level_width:
+            ctx.report(
+                f"Expected size of dimension {x_name!r} in level {level}"
+                f" to be {expected_level_width}, but was {level_width}"
+            )
+
+        if level_height != expected_level_height:
+            ctx.report(
+                f"Expected size of dimension {y_name!r} in level {level}"
+                f" to be {expected_level_height}, but was {level_height}"
+            )
+
+        # Later: check spatial coordinates as well...
diff --git a/xrlint/plugins/xcube/rules/single_grid_mapping.py b/xrlint/plugins/xcube/rules/single_grid_mapping.py
index 0e75e5c..cabb7c8 100644
--- a/xrlint/plugins/xcube/rules/single_grid_mapping.py
+++ b/xrlint/plugins/xcube/rules/single_grid_mapping.py
@@ -1,6 +1,6 @@
 from xrlint.node import DatasetNode
 from xrlint.plugins.xcube.constants import GM_NAMES_TEXT, LAT_NAME, LON_NAME
-from xrlint.plugins.xcube.rules import plugin
+from xrlint.plugins.xcube.plugin import plugin
 from xrlint.plugins.xcube.util import is_spatial_var
 from xrlint.rule import RuleContext, RuleOp
diff --git a/xrlint/plugins/xcube/rules/time_naming.py b/xrlint/plugins/xcube/rules/time_naming.py
index 5c2db10..08d3f5e 100644
--- a/xrlint/plugins/xcube/rules/time_naming.py
+++ b/xrlint/plugins/xcube/rules/time_naming.py
@@ -5,7 +5,7 @@
 from xrlint.node import DatasetNode
 from xrlint.plugins.xcube.constants import TIME_NAME
-from xrlint.plugins.xcube.rules import plugin
+from xrlint.plugins.xcube.plugin import plugin
 from xrlint.rule import RuleOp, RuleContext
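As a sanity check of the expected-size formula in `ml-dataset-xy` above:
`math.ceil(size / 2**level)` rounds up, so odd sizes keep one extra pixel per level:

```python
import math

level_0_width = 2049
for level in range(4):
    print(level, math.ceil(level_0_width / (1 << level)))
# prints: 0 2049, 1 1025, 2 513, 3 257
```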
diff --git a/xrlint/plugins/xcube/util.py b/xrlint/plugins/xcube/util.py
index 09a4e20..2d56d18 100644
--- a/xrlint/plugins/xcube/util.py
+++ b/xrlint/plugins/xcube/util.py
@@ -1,11 +1,100 @@
+from collections.abc import Hashable
+from dataclasses import dataclass
+
 import xarray as xr
 
-from .constants import LAT_NAME, LON_NAME, X_NAME, Y_NAME
+from xrlint.util.constructible import MappingConstructible
+
+from .constants import LAT_NAME, LON_NAME, X_NAME, Y_NAME, ML_INFO_ATTR
+
+
+@dataclass(frozen=True, kw_only=True)
+class LevelsMeta(MappingConstructible):
+    """The contents of an xcube `.zlevels` meta-info file."""
+
+    version: str
+    num_levels: int
+    tile_size: tuple[int, int] | None = None
+    use_saved_levels: bool | None = None
+    agg_methods: dict[str, str] | None = None
+
+
+@dataclass(frozen=True, kw_only=True)
+class LevelInfo:
+    """Level info added to each level dataset to allow for validation
+    of multi-level datasets.
+
+    Note, this needs to be aligned with
+
+    """
+
+    level: int
+    """Current level index."""
+
+    num_levels: int
+    """Actual number of levels found."""
+
+    datasets: list[tuple[xr.Dataset, str]]
+    """A list of `num_levels` pairs, each comprising
+    a level dataset and its file path.
+    """
+
+    meta: LevelsMeta | None = None
+    """Content of a `.zlevels` file, if such a file was found."""
+
+
+def get_dataset_level_info(dataset: xr.Dataset) -> LevelInfo | None:
+    return dataset.attrs.get(ML_INFO_ATTR)
+
+
+def set_dataset_level_info(dataset: xr.Dataset, level_info: LevelInfo):
+    dataset.attrs[ML_INFO_ATTR] = level_info
+
+
+def attach_dataset_level_infos(
+    level_datasets: list[tuple[xr.Dataset, str]],
+    meta: LevelsMeta | None = None,
+):
+    for level, (level_dataset, _) in enumerate(level_datasets):
+        set_dataset_level_info(
+            level_dataset,
+            LevelInfo(
+                level=level,
+                num_levels=len(level_datasets),
+                meta=meta,
+                datasets=level_datasets,
+            ),
+        )
 
 
 def is_spatial_var(var: xr.DataArray) -> bool:
-    """Return 'True' if `var` looks like a spatial variable."""
-    dims = var.dims
-    return (X_NAME in dims and Y_NAME in dims) or (
-        LON_NAME in dims and LAT_NAME in dims
-    )
+    """Return 'True' if `var` looks like a spatial 2+d variable."""
+    if var.ndim < 2:
+        return False
+    y_name, x_name = var.dims[-2:]
+    return (x_name == X_NAME and y_name == Y_NAME) or (
+        x_name == LON_NAME and y_name == LAT_NAME
+    )
+
+
+def get_spatial_size(
+    dataset: xr.Dataset,
+) -> tuple[tuple[Hashable, int], tuple[Hashable, int]] | None:
+    """Return `((x_name, x_size), (y_name, y_size))` for the first
+    spatial variable found in `dataset`, or `None` if there is none.
+    """
+    for var in dataset.data_vars.values():
+        if is_spatial_var(var):
+            y_name, x_name = var.dims[-2:]
+            x_size = dataset.sizes[x_name]
+            y_size = dataset.sizes[y_name]
+            if x_size and y_size:
+                return (x_name, x_size), (y_name, y_size)
+    return None
+
+
+def norm_path(level_path: str) -> str:
+    """Normalize `level_path`: unify path separators and
+    resolve "." and ".." components.
+    """
+    parts: list[str] = []
+    for part in level_path.replace("\\", "/").split("/"):
+        if part == ".":
+            continue
+        if part == ".." and parts and parts[-1] != "..":
+            parts.pop()
+        else:
+            parts.append(part)
+    return "/".join(parts)
diff --git a/xrlint/util/importutil.py b/xrlint/util/importutil.py
index 7430ab0..48ccd39 100644
--- a/xrlint/util/importutil.py
+++ b/xrlint/util/importutil.py
@@ -57,7 +57,7 @@ def import_value(
             If it is not given, the module itself will be the exported value.
         attr_ref: Attribute reference. Should be given in the case where
             `module_ref` does not contain an attribute reference.
-            Example values are "export_plugin", "export_configs".
+            Example values are "export_plugin", "export_config".
         constant: If `True` the value is expected to be a constant.
             If `False`, the default, the referenced attribute can be
             a no-arg callable that yields the actual exported value.
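A few examples of what `norm_path()` above yields (inputs illustrative):

```python
from xrlint.plugins.xcube.util import norm_path

assert norm_path("cube.levels/./1.zarr") == "cube.levels/1.zarr"
assert norm_path("cube.levels/../base.levels/0.zarr") == "base.levels/0.zarr"
assert norm_path("a\\b\\..\\c") == "a/c"
```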
diff --git a/xrlint/util/serializable.py b/xrlint/util/serializable.py index b8b7415..51e305e 100644 --- a/xrlint/util/serializable.py +++ b/xrlint/util/serializable.py @@ -1,3 +1,4 @@ +from dataclasses import is_dataclass, fields from typing import Any, Final, Mapping, Sequence, TypeAlias from xrlint.util.formatting import format_message_type_of @@ -61,11 +62,16 @@ def _value_to_json(cls, value: Any, value_name: str) -> JsonValue: @classmethod def _object_to_json(cls, value: Any, value_name: str) -> dict[str, JsonValue]: - return { - k: cls._value_to_json(v, f"{value_name}.{k}") - for k, v in vars(value).items() - if cls._is_non_protected_property_name(k) - } + if is_dataclass(value): + _d = {f.name: (f, getattr(value, f.name)) for f in fields(value)} + d = {k: v for k, (f, v) in _d.items() if v != f.default} + else: + d = { + k: v + for k, v in vars(value).items() + if cls._is_non_protected_property_name(k) + } + return {k: cls._value_to_json(v, f"{value_name}.{k}") for k, v in d.items()} @classmethod def _mapping_to_json(