Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

## Version 0.4.0 (in development)

- Fixed and enhanced core rule `time-coordinate`. (#33)
- New xcube rule `no-chunked-coords`. (#29)
- New xcube multi-level dataset rules:
- `ml-dataset-meta`: verifies that a meta info file exists and is consistent;
Expand Down
2 changes: 1 addition & 1 deletion docs/rule-ref.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ Contained in: `all`-:material-lightning-bolt: `recommended`-:material-alert:

### :material-bug: `time-coordinate`

Time coordinate (standard_name='time') should have unambiguous time units encoding.
Time coordinates should have valid and unambiguous time units encoding.
[:material-information-variant:](https://cfconventions.org/cf-conventions/cf-conventions.html#time-coordinate)

Contained in: `all`-:material-lightning-bolt: `recommended`-:material-lightning-bolt:
Expand Down
65 changes: 53 additions & 12 deletions tests/plugins/core/rules/test_time_coordinate.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
dims="time",
attrs={
"standard_name": "time",
"long_name": "time",
},
),
},
Expand All @@ -27,36 +26,70 @@
valid_dataset_2 = valid_dataset_1.copy()
del valid_dataset_2.time.encoding["units"]
del valid_dataset_2.time.encoding["calendar"]
valid_dataset_2.time.attrs["units"] = "seconds since 2000-01-01 UTC"
valid_dataset_2.time.attrs["units"] = "seconds since 2000-1-1 +2:00"
valid_dataset_2.time.attrs["calendar"] = "gregorian"

# OK, because not identified as time
valid_dataset_3 = valid_dataset_1.copy()
del valid_dataset_3.time.encoding["units"]
del valid_dataset_3.time.attrs["standard_name"]

# OK, because we only look for standard_name
# OK, because we only look for time units
valid_dataset_4 = valid_dataset_1.rename_vars({"time": "tm"})
del valid_dataset_4.tm.attrs["standard_name"]

# Invalid, because long_name is missing
# OK, because not recognized as time coord
valid_dataset_5 = valid_dataset_1.copy()
valid_dataset_5.time.encoding["units"] = 1
del valid_dataset_5.time.attrs["standard_name"]

# Invalid, because units is invalid but standard_name given
invalid_dataset_0 = valid_dataset_1.copy()
del invalid_dataset_0.time.attrs["long_name"]
invalid_dataset_0.time.encoding["units"] = 1

# Invalid, because we require units
# Invalid, because we require units but standard_name given
invalid_dataset_1 = valid_dataset_1.copy(deep=True)
del invalid_dataset_1.time.encoding["units"]

# Invalid, because we require calendar
# Invalid, because we have no time units although standard_name given
invalid_dataset_2 = valid_dataset_1.copy(deep=True)
del invalid_dataset_2.time.encoding["calendar"]
invalid_dataset_2.time.encoding["units"] = "years from 2000-1-1 +0:0"

# Invalid, because we require TZ units part
# Invalid, because we require calendar
invalid_dataset_3 = valid_dataset_1.copy(deep=True)
invalid_dataset_3.time.encoding["units"] = "seconds since 2000-01-01 00:00:00"
del invalid_dataset_3.time.encoding["calendar"]

# Invalid, because we require units format wrong
# Invalid, because we use invalid UOT
invalid_dataset_4 = valid_dataset_1.copy(deep=True)
invalid_dataset_4.time.encoding["units"] = "2000-01-01 00:00:00 UTC"
invalid_dataset_4.time.encoding["units"] = "millis since 2000-1-1 +0:0"

# Invalid, because we use ambiguous UOT
invalid_dataset_5 = valid_dataset_1.copy(deep=True)
invalid_dataset_5.time.encoding["units"] = "years since 2000-1-1 +0:0"

# Invalid, because we require timezone
invalid_dataset_6 = valid_dataset_1.copy(deep=True)
invalid_dataset_6.time.encoding["units"] = "seconds since 2000-01-01 00:00:00"

# Invalid, because we require timezone
invalid_dataset_7 = valid_dataset_1.copy(deep=True)
invalid_dataset_7.time.encoding["units"] = "seconds since 2000-01-01"

# Invalid, because we have 6 units parts
invalid_dataset_8 = valid_dataset_1.copy(deep=True)
invalid_dataset_8.time.encoding["units"] = "days since 2000-01-01 12:00:00 +0:00 utc"

# Invalid, because the date part is invalid
invalid_dataset_9 = valid_dataset_1.copy(deep=True)
invalid_dataset_9.time.encoding["units"] = "days since 00-01-01 12:00:00 +0:00"

# Invalid, because the time part is invalid
invalid_dataset_10 = valid_dataset_1.copy(deep=True)
invalid_dataset_10.time.encoding["units"] = "days since 2000-01-01 12:00 +0:00"

# Invalid, because the tz part is invalid
invalid_dataset_11 = valid_dataset_1.copy(deep=True)
invalid_dataset_11.time.encoding["units"] = "days since 2000-01-01 12:00:00 utc"

TimeCoordinateTest = RuleTester.define_test(
"time-coordinate",
Expand All @@ -67,12 +100,20 @@
RuleTest(dataset=valid_dataset_2),
RuleTest(dataset=valid_dataset_3),
RuleTest(dataset=valid_dataset_4),
RuleTest(dataset=valid_dataset_5),
],
invalid=[
RuleTest(dataset=invalid_dataset_0),
RuleTest(dataset=invalid_dataset_1),
RuleTest(dataset=invalid_dataset_2),
RuleTest(dataset=invalid_dataset_3),
RuleTest(dataset=invalid_dataset_4),
RuleTest(dataset=invalid_dataset_5),
RuleTest(dataset=invalid_dataset_6),
RuleTest(dataset=invalid_dataset_7),
RuleTest(dataset=invalid_dataset_8),
RuleTest(dataset=invalid_dataset_9),
RuleTest(dataset=invalid_dataset_10),
RuleTest(dataset=invalid_dataset_11),
],
)
168 changes: 133 additions & 35 deletions xrlint/plugins/core/rules/time_coordinate.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,53 @@
import re


from xrlint.node import DataArrayNode
from xrlint.plugins.core.plugin import plugin
from xrlint.rule import RuleContext, RuleOp


_EXPECTED_UNITY_FORMAT = "<unit> since <date> <time> <timezone>"
_EXAMPLE_UNIT_1 = "seconds since 2010-10-8 15:15:42.5 -6:00"
_EXAMPLE_UNIT_2 = "days since 2000-01-01 +0:00"

_AMBIGUOUS_UNITS_OF_TIME = (
"years",
"year",
"y",
"months",
"month",
"m",
)

_UNAMBIGUOUS_UNITS_OF_TIME = (
"days",
"day",
"d",
"hours",
"hour",
"hr",
"h",
"minutes",
"minute",
"min",
"seconds",
"second",
"sec",
"s",
)

_ALL_UNITS_OF_TIME = (*_AMBIGUOUS_UNITS_OF_TIME, *_UNAMBIGUOUS_UNITS_OF_TIME)

_RE_DATE = re.compile(r"^\d{4}-\d{1,2}-\d{1,2}$")
_RE_TIME = re.compile(r"^\d{1,2}:\d{1,2}:\d{1,2}(\.\d{1,6})?$")
_RE_TZ = re.compile(r"^[+-]\d{1,2}:\d{1,2}$")


@plugin.define_rule(
"time-coordinate",
version="1.0.0",
type="problem",
description=(
"Time coordinate (standard_name='time') should have"
" unambiguous time units encoding."
"Time coordinates should have valid and unambiguous time units encoding."
),
docs_url=(
"https://cfconventions.org/cf-conventions/cf-conventions.html#time-coordinate"
Expand All @@ -24,50 +59,113 @@ def data_array(self, ctx: RuleContext, node: DataArrayNode):
attrs = array.attrs
encoding = array.encoding

if node.name not in ctx.dataset.coords or attrs.get("standard_name") != "time":
units: str | None = encoding.get("units", attrs.get("units"))
if units is None:
if _is_time_by_name(attrs):
ctx.report("Missing 'units' attribute for time coordinate.")
# No more to check w.o. time units
return
elif not isinstance(units, str):
if _is_time_by_name(attrs):
ctx.report(
f"Invalid 'units' attribute for time coordinate,"
f" expected type str, got {type(units).__name__}."
)
# No more to check w.o. time units
return

if attrs.get("long_name") != "time":
ctx.report("Attribute 'long_name' should be 'time'.")
units_ok = True
units_parts = units.split(" ")
num_unit_parts = len(units_parts)
is_time_by_units = num_unit_parts >= 3 and units_parts[1] == "since"
if not is_time_by_units:
if not _is_time_by_name(attrs):
# Not a time coordinate
return
units_ok = False
else:
# We have time units

use_units_format_msg = (
f"Specify 'units' attribute using format {_EXPECTED_UNITY_FORMAT!r}."
)
if not encoding.get("calendar", attrs.get("calendar")):
ctx.report(
"Attribute/encoding 'calendar' should be specified.",
)

calendar: str | None = encoding.get("calendar", attrs.get("calendar"))
units: str | None = encoding.get("units", attrs.get("units"))
if not units or not calendar:
if not calendar:
uot_part = units_parts[0]
date_part = units_parts[2]
time_part = None
tz_part = None

if uot_part not in _ALL_UNITS_OF_TIME:
ctx.report(
"Attribute 'calendar' should be specified.",
f"Unrecognized units of measure for time"
f" in 'units' attribute: {units!r}.",
suggestions=[
_units_format_suggestion(),
_units_of_time_suggestion(),
],
)
if not units:
elif uot_part in _AMBIGUOUS_UNITS_OF_TIME:
ctx.report(
"Attribute 'units' should be specified.",
suggestions=[use_units_format_msg],
f"Ambiguous units of measure for time in"
f" 'units' attribute: {units!r}.",
suggestions=[
_units_format_suggestion(),
_units_of_time_suggestion(),
],
)
# next checks concern units only
return

units_parts = units.split(" ")
# note, may use regex here
if len(units_parts) >= 4 and units_parts[1] == "since":
# format seems ok, check timezone part
last_part = units_parts[-1]
has_tz = last_part.lower() == "utc" or last_part[0] in ("+", "-")
if not has_tz:
if num_unit_parts == 3:
pass
elif num_unit_parts == 4:
time_or_tz_part = units_parts[3]
if _RE_TIME.match(time_or_tz_part):
time_part = time_or_tz_part
else:
tz_part = time_or_tz_part
elif num_unit_parts == 5:
time_part = units_parts[3]
tz_part = units_parts[4]
else:
time_part = units_parts[-2]
tz_part = units_parts[-1]
units_ok = False

if units_ok and not _RE_DATE.match(date_part):
units_ok = False
if units_ok and time_part and not _RE_TIME.match(time_part):
units_ok = False
if units_ok and tz_part and not _RE_TZ.match(tz_part):
units_ok = False

if not tz_part:
ctx.report(
f"Missing timezone in 'units' attribute: {units!r}.",
f"Missing timezone '+H:MM' or '-H:MM' in 'units' attribute: {units!r}.",
suggestions=[
use_units_format_msg,
_units_format_suggestion(),
f"Append timezone specification, e.g., use"
f" {' '.join(units_parts[:-1] + ['utc'])!r}.",
f" {' '.join(units_parts[:-1] + ['+0:00'])!r}.",
],
)
# units ok
return

ctx.report(
f"Invalid 'units' attribute: {units!r}.",
suggestions=[use_units_format_msg],
)
if not units_ok:
ctx.report(
f"Invalid 'units' attribute: {units!r}.",
suggestions=[_units_format_suggestion()],
)


def _units_format_suggestion() -> str:
    """Return the suggestion text that shows how to write the 'units' attribute.

    The text names the UDUNITS format and embeds the two module-level
    example strings, each rendered with ``repr``.
    """
    return (
        "Specify 'units' attribute using the UDUNITS format,"
        f" e.g., {_EXAMPLE_UNIT_1!r} or {_EXAMPLE_UNIT_2!r}."
    )


def _units_of_time_suggestion() -> str:
    """Return the suggestion text listing the unambiguous units of time.

    Each unit is rendered with ``repr`` and the units are comma-separated.
    """
    quoted_units = ", ".join(repr(unit) for unit in _UNAMBIGUOUS_UNITS_OF_TIME)
    return f"Use one of {quoted_units}"


def _is_time_by_name(attrs):
return attrs.get("standard_name") == "time" or attrs.get("axis") == "T"
Loading