Skip to content

Commit 65b6b37

Browse files
authored
Merge pull request #32 from bcdev/forman-29-no_chunked_coords
New xcube rule `no-chunked-coords`
2 parents 4a79e95 + a1a67f8 commit 65b6b37

File tree

7 files changed

+95
-3
lines changed

7 files changed

+95
-3
lines changed

CHANGES.md

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,11 @@
22

33
## Version 0.4.0 (in development)
44

5+
- New xcube rule `no-chunked-coords`. (#29)
56
- New xcube multi-level dataset rules:
6-
- `ml-dataset-meta`: verifies that a meta info file exists and is consistent
7-
- `ml-dataset-xy`: verifies that the levels have expected spatial resolutions
8-
- `ml-dataset-time`: verifies that the levels have expected time dimension, if any
7+
- `ml-dataset-meta`: verifies that a meta info file exists and is consistent;
8+
- `ml-dataset-xy`: verifies that the levels have expected spatial resolutions;
9+
- `ml-dataset-time`: verifies that the levels have expected time dimension, if any.
910
- Now supporting xcube multi-level datasets `*.levels`:
1011
- Added xcube plugin processor `"xcube/multi-level-dataset"` that is used
1112
inside the predefined xcube configurations "all" and "recommended".

docs/rule-ref.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,12 @@ Multi-level dataset levels should provide spatial resolutions decreasing by powe
135135

136136
Contained in: `all`-:material-lightning-bolt: `recommended`-:material-lightning-bolt:
137137

138+
### :material-bug: `no-chunked-coords`
139+
140+
Coordinate variables should not be chunked. Can be used to identify performance issues, where chunked coordinates can cause slow opening of datasets due to the many chunk-fetching requests made to (remote) filesystems with low bandwidth. You can use the `limit` parameter to specify an acceptable number of chunks. Its default is 5.
141+
142+
Contained in: `all`-:material-lightning-bolt: `recommended`-:material-alert:
143+
138144
### :material-bug: `single-grid-mapping`
139145

140146
A single grid mapping shall be used for all spatial data variables of a datacube.
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
import xarray as xr
2+
3+
from xrlint.plugins.xcube.rules.no_chunked_coords import NoChunkedCoords
4+
from xrlint.testing import RuleTest, RuleTester
5+
from tests.plugins.xcube.helpers import make_cube
6+
7+
valid_dataset_0 = xr.Dataset(attrs=dict(title="Empty"))
8+
valid_dataset_1 = make_cube(360, 180, 3)
9+
valid_dataset_2 = make_cube(90, 45, 20)
10+
# ok, does not exceed default limit 5: ceil(20 / 4) = 5
11+
valid_dataset_2.time.encoding["chunks"] = [4]
12+
13+
invalid_dataset_0 = make_cube(90, 45, 10)
14+
# exceed default limit 5: ceil(10 / 1) = 10
15+
invalid_dataset_0.time.encoding["chunks"] = [1]
16+
17+
NoChunkedCoordsTest = RuleTester.define_test(
18+
"no-chunked-coords",
19+
NoChunkedCoords,
20+
valid=[
21+
RuleTest(dataset=valid_dataset_0),
22+
RuleTest(dataset=valid_dataset_1),
23+
RuleTest(dataset=valid_dataset_2),
24+
],
25+
invalid=[
26+
RuleTest(dataset=invalid_dataset_0),
27+
],
28+
)

tests/plugins/xcube/test_plugin.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ def test_rules_complete(self):
1717
"ml-dataset-meta",
1818
"ml-dataset-time",
1919
"ml-dataset-xy",
20+
"no-chunked-coords",
2021
"single-grid-mapping",
2122
"time-naming",
2223
},

xrlint/plugins/xcube/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ def export_plugin() -> Plugin:
4141
"xcube/ml-dataset-meta": "error",
4242
"xcube/ml-dataset-time": "warn",
4343
"xcube/ml-dataset-xy": "error",
44+
"xcube/no-chunked-coords": "warn",
4445
"xcube/single-grid-mapping": "error",
4546
"xcube/time-naming": "error",
4647
},
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
import math
2+
3+
from xrlint.node import DataArrayNode
4+
from xrlint.plugins.xcube.plugin import plugin
5+
from xrlint.rule import RuleContext, RuleOp
6+
from xrlint.util.schema import schema
7+
8+
DEFAULT_LIMIT = 5
9+
10+
11+
@plugin.define_rule(
    "no-chunked-coords",
    version="1.0.0",
    type="problem",
    description=(
        "Coordinate variables should not be chunked."
        " Can be used to identify performance issues, where chunked coordinates"
        " can cause slow opening of datasets due to the many chunk-fetching"
        " requests made to (remote) filesystems with low bandwidth."
        " You can use the `limit` parameter to specify an acceptable number"
        f" of chunks. Its default is {DEFAULT_LIMIT}."
    ),
    schema=schema(
        "object",
        properties=dict(
            limit=schema(
                "integer",
                minimum=0,
                default=DEFAULT_LIMIT,
                title="Acceptable number of chunks",
            )
        ),
    ),
)
class NoChunkedCoords(RuleOp):
    """Report 1-D coordinate variables that are split into too many chunks.

    The number of chunks is derived from the variable's size and the chunk
    size found under the ``"chunks"`` key of its ``encoding`` mapping.

    Args:
        limit: Largest number of chunks that is still accepted. A problem
            is reported only if the computed number is strictly greater.
    """

    def __init__(self, limit: int = DEFAULT_LIMIT):
        self.limit = limit

    def data_array(self, ctx: RuleContext, node: DataArrayNode):
        """Check one data array node and report it if it has too many chunks.

        Nodes that are not coordinates of ``ctx.dataset``, or that are not
        one-dimensional, are ignored. So are nodes whose encoding has no
        usable single-element ``"chunks"`` entry.
        """
        if node.name not in ctx.dataset.coords or node.data_array.ndim != 1:
            return

        chunks = node.data_array.encoding.get("chunks")
        if not isinstance(chunks, (list, tuple)) or len(chunks) != 1:
            return

        try:
            num_chunks = math.ceil(node.data_array.size / chunks[0])
        except (TypeError, ZeroDivisionError):
            # A zero or non-numeric chunk size is malformed encoding; it
            # cannot be evaluated here and must not crash the linter.
            return

        if num_chunks > self.limit:
            ctx.report(
                f"Number of chunks exceeds limit: {num_chunks} > {self.limit}.",
                suggestions=["Combine chunks into one or more larger ones."],
            )

xrlint/util/schema.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@ def schema(
3333
default: Any | None = None,
3434
const: Any | None = None,
3535
enum: list[Any] | None = None,
36+
title: str | None = None,
37+
description: str | None = None,
3638
# "integer", "number"
3739
minimum: int | float | None = None,
3840
maximum: int | float | None = None,
@@ -43,6 +45,7 @@ def schema(
4345
additionalProperties: bool | None = None,
4446
required: list[str] | None = None,
4547
) -> JsonSchema:
48+
"""Helper function so you have keyword-arguments for creating schemas."""
4649
return {
4750
k: v
4851
for k, v in dict(
@@ -56,6 +59,8 @@ def schema(
5659
properties=properties,
5760
additionalProperties=False if additionalProperties is False else None,
5861
required=required,
62+
title=title,
63+
description=description,
5964
).items()
6065
if v is not None
6166
}

0 commit comments

Comments
 (0)