Skip to content

Commit bd4f1e7

Browse files
authored
implement more rigorous checks of compressed payloads (#686)
1 parent 4891020 commit bd4f1e7

File tree

3 files changed

+165
-63
lines changed

3 files changed

+165
-63
lines changed

.github/workflows/build_docs.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@ on:
44
push:
55
branches:
66
- master # Triggers deployment on push to the master branch
7-
- restructure-docs
87

98
env:
109
python-version: 3.12

src/pynxtools/dataconverter/helpers.py

Lines changed: 98 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,12 @@
4646
logger = logging.getLogger("pynxtools")
4747

4848

49+
ISO8601 = re.compile(
50+
r"^(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):(\d{2}(?:"
51+
r"\.\d*)?)(((?!-00:00)(\+|-)(\d{2}):(\d{2})|Z){1})$"
52+
)
53+
54+
4955
class ValidationProblem(Enum):
5056
DifferentVariadicNodesWithTheSameName = auto()
5157
UnitWithoutDocumentation = auto()
@@ -76,6 +82,9 @@ class ValidationProblem(Enum):
7682
ReservedPrefixInWrongContext = auto()
7783
InvalidNexusTypeForNamedConcept = auto()
7884
KeysWithAndWithoutConcept = auto()
85+
InvalidCompressionStrength = auto()
86+
CompressionStrengthZero = auto()
87+
# DoNotCompressStringsBoolean = auto()
7988

8089

8190
class Collector:
@@ -200,6 +209,28 @@ def _log(self, path: str, log_type: ValidationProblem, value: Optional[Any], *ar
200209
logger.warning(
201210
f"The key '{path}' uses the valid concept name '{args[0]}', but there is another valid key {value} that uses the non-variadic name of the node.'"
202211
)
212+
elif log_type == ValidationProblem.CompressionStrengthZero:
213+
value = cast(dict, value)
214+
logger.info(
215+
f"Compression strength for {path} is 0. The value '{value['compress']}' will be written uncompressed."
216+
)
217+
elif log_type == ValidationProblem.InvalidCompressionStrength:
218+
value = cast(dict, value)
219+
logger.warning(
220+
f"Compression strength for {path} = {value} should be between 0 and 9."
221+
)
222+
# elif log_type == ValidationProblem.DoNotCompressStringsBoolean:
223+
# value = cast(dict, value)
224+
# dtype = type(value["compress"]).__name__
225+
# dtype_map = {
226+
# "str": "string",
227+
# "bool": "boolean",
228+
# }
229+
# dtype_str = dtype_map.get(dtype, dtype)
230+
231+
# logger.info(
232+
# f"Compression for {path} = {value} should not be used for {dtype_str} values."
233+
# )
203234

204235
def collect_and_log(
205236
self,
@@ -222,6 +253,7 @@ def collect_and_log(
222253
if log_type not in (
223254
ValidationProblem.UnitWithoutDocumentation,
224255
ValidationProblem.OpenEnumWithNewItem,
256+
ValidationProblem.CompressionStrengthZero,
225257
):
226258
self.data.add(path + str(log_type) + str(value))
227259

@@ -723,78 +755,94 @@ def convert_int_to_float(value):
723755
return {convert_int_to_float(v) for v in value}
724756
elif isinstance(value, np.ndarray) and np.issubdtype(value.dtype, np.integer):
725757
return value.astype(float)
758+
elif isinstance(value, np.generic) and np.issubdtype(type(value), np.integer):
759+
return float(value)
726760
else:
727761
return value
728762

729763

730764
def is_valid_data_field(
731765
value: Any, nxdl_type: str, nxdl_enum: list, nxdl_enum_open: bool, path: str
732766
) -> Any:
733-
# todo: Check this function and write test for it. It seems the function is not
734-
# working as expected.
735-
"""Checks whether a given value is valid according to the type defined in the NXDL.
767+
"""Checks whether a given value is valid according to the type defined in the NXDL."""
768+
769+
def validate_data_value(
770+
value: Any, nxdl_type: str, nxdl_enum: list, nxdl_enum_open: bool, path: str
771+
) -> Any:
772+
"""Validate and possibly convert a primitive value according to NXDL type/enum rules."""
773+
accepted_types = NEXUS_TO_PYTHON_DATA_TYPES[nxdl_type]
774+
original_value = value
775+
776+
# Do not count other dicts as they represent a link value
777+
if not isinstance(value, dict):
778+
# Attempt type conversion
779+
if accepted_types[0] is bool and isinstance(value, str):
780+
try:
781+
value = convert_str_to_bool_safe(value)
782+
except (ValueError, TypeError):
783+
value = original_value
784+
elif accepted_types[0] is float:
785+
value = convert_int_to_float(value)
736786

737-
This function only tries to convert boolean value in str format (e.g. "true" ) to
738-
python Boolean (True). In case, it fails to convert, it raises an Exception.
787+
if not is_valid_data_type(value, accepted_types):
788+
collector.collect_and_log(
789+
path, ValidationProblem.InvalidType, accepted_types, nxdl_type
790+
)
739791

740-
Return:
741-
value: the possibly converted data value
742-
"""
792+
# Type-specific validation
793+
if nxdl_type == "NX_POSINT" and not is_positive_int(value):
794+
collector.collect_and_log(path, ValidationProblem.IsNotPosInt, value)
795+
796+
if nxdl_type in ("ISO8601", "NX_DATE_TIME"):
797+
results = ISO8601.search(value)
798+
if results is None:
799+
collector.collect_and_log(
800+
path, ValidationProblem.InvalidDatetime, value
801+
)
802+
803+
if nxdl_enum is not None and value not in nxdl_enum:
804+
if nxdl_enum_open:
805+
collector.collect_and_log(
806+
path, ValidationProblem.OpenEnumWithNewItem, nxdl_enum
807+
)
808+
else:
809+
collector.collect_and_log(
810+
path, ValidationProblem.InvalidEnum, nxdl_enum
811+
)
743812

744-
accepted_types = NEXUS_TO_PYTHON_DATA_TYPES[nxdl_type]
813+
return value
745814

746815
if isinstance(value, dict) and set(value.keys()) == {"compress", "strength"}:
747-
value = value["compress"]
816+
compressed_value = value["compress"]
748817

749-
# Do not count other dicts as they represent a link value
750-
if not isinstance(value, dict) and not is_valid_data_type(value, accepted_types):
751-
# try to convert string to bool
752-
if accepted_types[0] is bool and isinstance(value, str):
753-
try:
754-
value = convert_str_to_bool_safe(value)
755-
except (ValueError, TypeError):
818+
if not (1 <= value["strength"] <= 9):
819+
if value["strength"] == 0:
756820
collector.collect_and_log(
757-
path, ValidationProblem.InvalidType, accepted_types, nxdl_type
821+
path, ValidationProblem.CompressionStrengthZero, value
758822
)
759-
elif accepted_types[0] is float:
760-
value = convert_int_to_float(value)
761-
if not is_valid_data_type(value, accepted_types):
823+
else:
762824
collector.collect_and_log(
763-
path, ValidationProblem.InvalidType, accepted_types, nxdl_type
825+
path, ValidationProblem.InvalidCompressionStrength, value
764826
)
765-
else:
766-
collector.collect_and_log(
767-
path, ValidationProblem.InvalidType, accepted_types, nxdl_type
827+
# In this case, we remove the compression.
828+
return validate_data_value(
829+
value["compress"], nxdl_type, nxdl_enum, nxdl_enum_open, path
768830
)
769831

770-
if nxdl_type == "NX_POSINT" and not is_positive_int(value):
771-
collector.collect_and_log(path, ValidationProblem.IsNotPosInt, value)
832+
# TODO: Do we need to issue a warning if string/bool compression is used
833+
# # elif isinstance(compressed_value, (str, bool)):
834+
# collector.collect_and_log(
835+
# path, ValidationProblem.DoNotCompressStringsBoolean, value
836+
# )
772837

773-
if nxdl_type in ("ISO8601", "NX_DATE_TIME"):
774-
iso8601 = re.compile(
775-
r"^(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):(\d{2}(?:"
776-
r"\.\d*)?)(((?!-00:00)(\+|-)(\d{2}):(\d{2})|Z){1})$"
838+
# Apply standard validation to compressed value
839+
value["compress"] = validate_data_value(
840+
compressed_value, nxdl_type, nxdl_enum, nxdl_enum_open, path
777841
)
778-
results = iso8601.search(value)
779-
if results is None:
780-
collector.collect_and_log(path, ValidationProblem.InvalidDatetime, value)
781-
782-
# Check enumeration
783-
if nxdl_enum is not None and value not in nxdl_enum:
784-
if nxdl_enum_open:
785-
collector.collect_and_log(
786-
path,
787-
ValidationProblem.OpenEnumWithNewItem,
788-
nxdl_enum,
789-
)
790-
else:
791-
collector.collect_and_log(
792-
path,
793-
ValidationProblem.InvalidEnum,
794-
nxdl_enum,
795-
)
796842

797-
return value
843+
return value
844+
845+
return validate_data_value(value, nxdl_type, nxdl_enum, nxdl_enum_open, path)
798846

799847

800848
@cache

tests/dataconverter/test_validation.py

Lines changed: 67 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,24 @@ def set_whole_group_to_none(
5252
return internal_dict
5353

5454

55+
def compress_paths_in_dict(data_dict: Template, paths=list[str]):
56+
"""For each path, compress the value in data_dict using a strength of 3."""
57+
types = {
58+
"int": np.int64,
59+
"float": np.float32,
60+
}
61+
if data_dict is not None:
62+
internal_dict = Template(data_dict)
63+
for path in paths:
64+
if (value := internal_dict.get(path)) is not None:
65+
if np_type := types.get(type(value).__name__):
66+
value = np_type(value)
67+
internal_dict[path] = {"compress": value, "strength": 3}
68+
return internal_dict
69+
70+
return None
71+
72+
5573
def remove_from_dict(data_dict: Template, key: str, optionality: str = "optional"):
5674
"""Helper function to remove a key from dict"""
5775
if data_dict is not None and key in data_dict[optionality]:
@@ -1694,27 +1712,63 @@ def listify_template(data_dict: Template):
16941712
id="reserved-prefix",
16951713
),
16961714
pytest.param(
1697-
alter_dict(
1715+
compress_paths_in_dict(
16981716
TEMPLATE,
1699-
"/ENTRY[my_entry]/NXODD_name[nxodd_name]/float_value",
1700-
{"compress": np.float32(2.0), "strength": 1},
1717+
[
1718+
"/ENTRY[my_entry]/NXODD_name[nxodd_name]/float_value"
1719+
"/ENTRY[my_entry]/NXODD_name[nxodd_name]/number_value",
1720+
"/ENTRY[my_entry]/NXODD_name[nxodd_name]/bool_value",
1721+
"/ENTRY[my_entry]/NXODD_name[nxodd_name]/int_value",
1722+
"/ENTRY[my_entry]/NXODD_name[nxodd_name]/posint_value",
1723+
"/ENTRY[my_entry]/NXODD_name[nxodd_name]/char_value",
1724+
"/ENTRY[my_entry]/NXODD_name[nxodd_name]/date_value",
1725+
"/ENTRY[my_entry]/NXODD_name[nxodd_name]/type",
1726+
],
17011727
),
17021728
[],
1703-
id="appdef-compressed-payload",
1729+
id="appdef-compressed",
1730+
),
1731+
pytest.param(
1732+
alter_dict(
1733+
alter_dict(
1734+
TEMPLATE,
1735+
"/ENTRY[my_entry]/NXODD_name[nxodd_name]/float_value",
1736+
{"compress": np.int64(2.0), "strength": 1},
1737+
),
1738+
"/ENTRY[my_entry]/NXODD_name[nxodd_name]/int_value",
1739+
{"compress": np.float32(2.0), "strength": 1},
1740+
),
1741+
[
1742+
"The value at /ENTRY[my_entry]/NXODD_name[nxodd_name]/int_value "
1743+
"should be one of the following Python types: "
1744+
"(<class 'int'>, <class 'numpy.integer'>), as defined in the "
1745+
"NXDL as NX_INT."
1746+
],
1747+
id="appdef-compressed-wrong-type",
1748+
),
1749+
pytest.param(
1750+
alter_dict(
1751+
TEMPLATE,
1752+
"/ENTRY[my_entry]/NXODD_name[nxodd_name]/int_value",
1753+
{"compress": np.int64(2), "strength": 11},
1754+
),
1755+
[
1756+
"Compression strength for /ENTRY[my_entry]/NXODD_name[nxodd_name]/int_value = "
1757+
"{'compress': 2, 'strength': 11} should be between 0 and 9.",
1758+
],
1759+
id="appdef-compressed-invalid-strength",
17041760
),
17051761
pytest.param(
17061762
alter_dict(
17071763
TEMPLATE,
17081764
"/ENTRY[my_entry]/NXODD_name[nxodd_name]/float_value",
1709-
{"compress": np.int64(2.0), "strength": 1},
1765+
{"compress": np.float32(2.0), "strength": 0},
17101766
),
17111767
[
1712-
"The value at /ENTRY[my_entry]/NXODD_name[nxodd_name]/float_value "
1713-
"should be one of the following Python types: "
1714-
"(<class 'float'>, <class 'numpy.floating'>), as defined in the "
1715-
"NXDL as NX_FLOAT."
1768+
"Compression strength for /ENTRY[my_entry]/NXODD_name[nxodd_name]/float_value "
1769+
"is 0. The value '2.0' will be written uncompressed.",
17161770
],
1717-
id="appdef-compressed-payload-wrong-type",
1771+
id="appdef-compressed-strength-0",
17181772
),
17191773
pytest.param(
17201774
alter_dict(
@@ -1723,7 +1777,7 @@ def listify_template(data_dict: Template):
17231777
{"compress": np.int64(2), "strength": 1},
17241778
),
17251779
[],
1726-
id="baseclass-compressed-payload",
1780+
id="baseclass-compressed",
17271781
),
17281782
pytest.param(
17291783
alter_dict(
@@ -1737,7 +1791,7 @@ def listify_template(data_dict: Template):
17371791
"(<class 'int'>, <class 'numpy.integer'>), as defined in the "
17381792
"NXDL as NX_INT."
17391793
],
1740-
id="baseclass-compressed-payload-wrong-type",
1794+
id="baseclass-compressed-wrong-type",
17411795
),
17421796
],
17431797
)
@@ -1760,6 +1814,7 @@ def format_error_message(msg: str) -> str:
17601814
"baseclass-field-with-illegal-unit",
17611815
"open-enum-with-new-item",
17621816
"baseclass-open-enum-with-new-item",
1817+
"appdef-compressed-strength-0",
17631818
):
17641819
with caplog.at_level(logging.INFO):
17651820
assert validate_dict_against("NXtest", data_dict)

0 commit comments

Comments
 (0)