diff --git a/CHANGES.md b/CHANGES.md index a4c104e..682500d 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -2,10 +2,11 @@ ## Version 0.4.0 (in development) +- New xcube rule `no-chunked-coords`. (#29) - New xcube multi-level dataset rules: - - `ml-dataset-meta`: verifies that a meta info file exists and is consistent - - `ml-dataset-xy`: verifies that the levels have expected spatial resolutions - - `ml-dataset-time`: verifies that the levels have expected time dimension, if any + - `ml-dataset-meta`: verifies that a meta info file exists and is consistent; + - `ml-dataset-xy`: verifies that the levels have expected spatial resolutions; + - `ml-dataset-time`: verifies that the levels have expected time dimension, if any. - Now supporting xcube multi-level datasets `*.levels`: - Added xcube plugin processor `"xcube/multi-level-dataset"` that is used inside the predefined xcube configurations "all" and "recommended". diff --git a/docs/rule-ref.md b/docs/rule-ref.md index bb4219d..9cb8076 100644 --- a/docs/rule-ref.md +++ b/docs/rule-ref.md @@ -135,6 +135,12 @@ Multi-level dataset levels should provide spatial resolutions decreasing by powe Contained in: `all`-:material-lightning-bolt: `recommended`-:material-lightning-bolt: +### :material-bug: `no-chunked-coords` + +Coordinate variables should not be chunked. Can be used to identify performance issues, where chunked coordinates can cause slow opening of datasets due to the many chunk-fetching requests made to (remote) filesystems with low bandwidth. You can use the `limit` parameter to specify an acceptable number of chunks. Its default is 5. + +Contained in: `all`-:material-lightning-bolt: `recommended`-:material-alert: + ### :material-bug: `single-grid-mapping` A single grid mapping shall be used for all spatial data variables of a datacube. 
diff --git a/tests/plugins/xcube/rules/test_no_chunked_coords.py b/tests/plugins/xcube/rules/test_no_chunked_coords.py new file mode 100644 index 0000000..51bd7ba --- /dev/null +++ b/tests/plugins/xcube/rules/test_no_chunked_coords.py @@ -0,0 +1,28 @@ +import xarray as xr + +from xrlint.plugins.xcube.rules.no_chunked_coords import NoChunkedCoords +from xrlint.testing import RuleTest, RuleTester +from tests.plugins.xcube.helpers import make_cube + +valid_dataset_0 = xr.Dataset(attrs=dict(title="Empty")) +valid_dataset_1 = make_cube(360, 180, 3) +valid_dataset_2 = make_cube(90, 45, 20) +# ok, does not exceed default limit 5: ceil(20 / 4) = 5 +valid_dataset_2.time.encoding["chunks"] = [4] + +invalid_dataset_0 = make_cube(90, 45, 10) +# exceed default limit 5: ceil(10 / 1) = 10 +invalid_dataset_0.time.encoding["chunks"] = [1] + +NoChunkedCoordsTest = RuleTester.define_test( + "no-chunked-coords", + NoChunkedCoords, + valid=[ + RuleTest(dataset=valid_dataset_0), + RuleTest(dataset=valid_dataset_1), + RuleTest(dataset=valid_dataset_2), + ], + invalid=[ + RuleTest(dataset=invalid_dataset_0), + ], +) diff --git a/tests/plugins/xcube/test_plugin.py b/tests/plugins/xcube/test_plugin.py index e469d74..12a8e42 100644 --- a/tests/plugins/xcube/test_plugin.py +++ b/tests/plugins/xcube/test_plugin.py @@ -17,6 +17,7 @@ def test_rules_complete(self): "ml-dataset-meta", "ml-dataset-time", "ml-dataset-xy", + "no-chunked-coords", "single-grid-mapping", "time-naming", }, diff --git a/xrlint/plugins/xcube/__init__.py b/xrlint/plugins/xcube/__init__.py index 0c008a9..10c974e 100644 --- a/xrlint/plugins/xcube/__init__.py +++ b/xrlint/plugins/xcube/__init__.py @@ -41,6 +41,7 @@ def export_plugin() -> Plugin: "xcube/ml-dataset-meta": "error", "xcube/ml-dataset-time": "warn", "xcube/ml-dataset-xy": "error", + "xcube/no-chunked-coords": "warn", "xcube/single-grid-mapping": "error", "xcube/time-naming": "error", }, diff --git a/xrlint/plugins/xcube/rules/no_chunked_coords.py 
b/xrlint/plugins/xcube/rules/no_chunked_coords.py
new file mode 100644
index 0000000..e74fc9e
--- /dev/null
+++ b/xrlint/plugins/xcube/rules/no_chunked_coords.py
@@ -0,0 +1,66 @@
+import math
+
+from xrlint.node import DataArrayNode
+from xrlint.plugins.xcube.plugin import plugin
+from xrlint.rule import RuleContext, RuleOp
+from xrlint.util.schema import schema
+
+DEFAULT_LIMIT = 5
+
+
+@plugin.define_rule(
+    "no-chunked-coords",
+    version="1.0.0",
+    type="problem",
+    description=(
+        "Coordinate variables should not be chunked."
+        " Can be used to identify performance issues, where chunked coordinates"
+        " can cause slow opening of datasets due to the many chunk-fetching"
+        " requests made to (remote) filesystems with low bandwidth."
+        " You can use the `limit` parameter to specify an acceptable number"
+        f" of chunks. Its default is {DEFAULT_LIMIT}."
+    ),
+    schema=schema(
+        "object",
+        properties=dict(
+            limit=schema(
+                "integer",
+                minimum=0,
+                default=DEFAULT_LIMIT,
+                title="Acceptable number of chunks",
+            )
+        ),
+    ),
+)
+class NoChunkedCoords(RuleOp):
+    """Report 1-D coordinate variables that are split into too many chunks.
+
+    Args:
+        limit: Acceptable number of chunks; a larger chunk count is reported.
+    """
+
+    def __init__(self, limit: int = DEFAULT_LIMIT):
+        self.limit = limit
+
+    def data_array(self, ctx: RuleContext, node: DataArrayNode):
+        """Check the chunk count of a 1-D coordinate variable."""
+        # Only 1-D coordinate variables are subject to this rule.
+        if node.name not in ctx.dataset.coords or node.data_array.ndim != 1:
+            return
+
+        # The chunk size is taken from the variable's encoding.
+        chunks = node.data_array.encoding.get("chunks")
+        if not isinstance(chunks, (list, tuple)) or len(chunks) != 1:
+            return
+
+        chunk_size = chunks[0]
+        if chunk_size <= 0:
+            # A non-positive chunk size cannot be turned into a chunk count.
+            return
+
+        num_chunks = math.ceil(node.data_array.size / chunk_size)
+        if num_chunks > self.limit:
+            ctx.report(
+                f"Number of chunks exceeds limit: {num_chunks} > {self.limit}.",
+                suggestions=["Combine chunks into one or more larger ones."],
+            )
diff --git a/xrlint/util/schema.py b/xrlint/util/schema.py
index 872fd1c..dc7f8ab 100644
--- a/xrlint/util/schema.py
+++ b/xrlint/util/schema.py
@@ -33,6 +33,8 @@ def schema(
     default: Any | None = None,
     const: Any | None = None,
     enum: list[Any] | None = None,
+    title: str | None = None,
+    description: str | None = None,
     # "integer", "number"
     minimum: int
| float | None = None, maximum: int | float | None = None, @@ -43,6 +45,7 @@ def schema( additionalProperties: bool | None = None, required: list[str] | None = None, ) -> JsonSchema: + """Helper function so you have keyword-arguments for creating schemas.""" return { k: v for k, v in dict( @@ -56,6 +59,8 @@ def schema( properties=properties, additionalProperties=False if additionalProperties is False else None, required=required, + title=title, + description=description, ).items() if v is not None }