Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .github/workflows/build_docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ on:
push:
branches:
- master # Triggers deployment on push to the master branch
- restructure-docs

env:
python-version: 3.12
Expand Down
148 changes: 98 additions & 50 deletions src/pynxtools/dataconverter/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,12 @@
logger = logging.getLogger("pynxtools")


# Pre-compiled pattern for the timestamp format accepted by the validator.
# The whole string must match (anchored with ^ and $):
#   * a date written as YYYY-MM-DD,
#   * the literal separator "T",
#   * a time written as hh:mm:ss with an optional fractional part
#     (a dot followed by zero or more digits),
#   * a mandatory offset: either "Z" or a signed hh:mm offset, where the
#     negative-zero offset "-00:00" is explicitly rejected.
ISO8601 = re.compile(
    r"^(\d{4})-(\d{2})-(\d{2})"  # date
    r"T(\d{2}):(\d{2}):(\d{2}(?:\.\d*)?)"  # time with optional fraction
    r"(((?!-00:00)(\+|-)(\d{2}):(\d{2})|Z){1})$"  # UTC offset or "Z"
)


class ValidationProblem(Enum):
DifferentVariadicNodesWithTheSameName = auto()
UnitWithoutDocumentation = auto()
Expand Down Expand Up @@ -76,6 +82,9 @@ class ValidationProblem(Enum):
ReservedPrefixInWrongContext = auto()
InvalidNexusTypeForNamedConcept = auto()
KeysWithAndWithoutConcept = auto()
InvalidCompressionStrength = auto()
CompressionStrengthZero = auto()
# DoNotCompressStringsBoolean = auto()


class Collector:
Expand Down Expand Up @@ -200,6 +209,28 @@ def _log(self, path: str, log_type: ValidationProblem, value: Optional[Any], *ar
logger.warning(
f"The key '{path}' uses the valid concept name '{args[0]}', but there is another valid key {value} that uses the non-variadic name of the node.'"
)
elif log_type == ValidationProblem.CompressionStrengthZero:
value = cast(dict, value)
logger.info(
f"Compression strength for {path} is 0. The value '{value['compress']}' will be written uncompressed."
)
elif log_type == ValidationProblem.InvalidCompressionStrength:
value = cast(dict, value)
logger.warning(
f"Compression strength for {path} = {value} should be between 0 and 9."
)
# elif log_type == ValidationProblem.DoNotCompressStringsBoolean:
# value = cast(dict, value)
# dtype = type(value["compress"]).__name__
# dtype_map = {
# "str": "string",
# "bool": "boolean",
# }
# dtype_str = dtype_map.get(dtype, dtype)

# logger.info(
# f"Compression for {path} = {value} should not be used for {dtype_str} values."
# )

def collect_and_log(
self,
Expand All @@ -222,6 +253,7 @@ def collect_and_log(
if log_type not in (
ValidationProblem.UnitWithoutDocumentation,
ValidationProblem.OpenEnumWithNewItem,
ValidationProblem.CompressionStrengthZero,
):
self.data.add(path + str(log_type) + str(value))

Expand Down Expand Up @@ -723,78 +755,94 @@ def convert_int_to_float(value):
return {convert_int_to_float(v) for v in value}
elif isinstance(value, np.ndarray) and np.issubdtype(value.dtype, np.integer):
return value.astype(float)
elif isinstance(value, np.generic) and np.issubdtype(type(value), np.integer):
return float(value)
else:
return value


def is_valid_data_field(
value: Any, nxdl_type: str, nxdl_enum: list, nxdl_enum_open: bool, path: str
) -> Any:
# todo: Check this function and write test for it. It seems the function is not
# working as expected.
"""Checks whether a given value is valid according to the type defined in the NXDL.
"""Checks whether a given value is valid according to the type defined in the NXDL."""

def validate_data_value(
value: Any, nxdl_type: str, nxdl_enum: list, nxdl_enum_open: bool, path: str
) -> Any:
"""Validate and possibly convert a primitive value according to NXDL type/enum rules."""
accepted_types = NEXUS_TO_PYTHON_DATA_TYPES[nxdl_type]
original_value = value

# Do not count other dicts as they represent a link value
if not isinstance(value, dict):
# Attempt type conversion
if accepted_types[0] is bool and isinstance(value, str):
try:
value = convert_str_to_bool_safe(value)
except (ValueError, TypeError):
value = original_value
elif accepted_types[0] is float:
value = convert_int_to_float(value)

This function only tries to convert boolean value in str format (e.g. "true" ) to
python Boolean (True). In case, it fails to convert, it raises an Exception.
if not is_valid_data_type(value, accepted_types):
collector.collect_and_log(
path, ValidationProblem.InvalidType, accepted_types, nxdl_type
)

Return:
value: the possibly converted data value
"""
# Type-specific validation
if nxdl_type == "NX_POSINT" and not is_positive_int(value):
collector.collect_and_log(path, ValidationProblem.IsNotPosInt, value)

if nxdl_type in ("ISO8601", "NX_DATE_TIME"):
results = ISO8601.search(value)
if results is None:
collector.collect_and_log(
path, ValidationProblem.InvalidDatetime, value
)

if nxdl_enum is not None and value not in nxdl_enum:
if nxdl_enum_open:
collector.collect_and_log(
path, ValidationProblem.OpenEnumWithNewItem, nxdl_enum
)
else:
collector.collect_and_log(
path, ValidationProblem.InvalidEnum, nxdl_enum
)

accepted_types = NEXUS_TO_PYTHON_DATA_TYPES[nxdl_type]
return value

if isinstance(value, dict) and set(value.keys()) == {"compress", "strength"}:
value = value["compress"]
compressed_value = value["compress"]

# Do not count other dicts as they represent a link value
if not isinstance(value, dict) and not is_valid_data_type(value, accepted_types):
# try to convert string to bool
if accepted_types[0] is bool and isinstance(value, str):
try:
value = convert_str_to_bool_safe(value)
except (ValueError, TypeError):
if not (1 <= value["strength"] <= 9):
if value["strength"] == 0:
collector.collect_and_log(
path, ValidationProblem.InvalidType, accepted_types, nxdl_type
path, ValidationProblem.CompressionStrengthZero, value
)
elif accepted_types[0] is float:
value = convert_int_to_float(value)
if not is_valid_data_type(value, accepted_types):
else:
collector.collect_and_log(
path, ValidationProblem.InvalidType, accepted_types, nxdl_type
path, ValidationProblem.InvalidCompressionStrength, value
)
else:
collector.collect_and_log(
path, ValidationProblem.InvalidType, accepted_types, nxdl_type
# In this case, we remove the compression.
return validate_data_value(
value["compress"], nxdl_type, nxdl_enum, nxdl_enum_open, path
)

if nxdl_type == "NX_POSINT" and not is_positive_int(value):
collector.collect_and_log(path, ValidationProblem.IsNotPosInt, value)
# TODO: Do we need to issue a warning if string/bool compression is used
# # elif isinstance(compressed_value, (str, bool)):
# collector.collect_and_log(
# path, ValidationProblem.DoNotCompressStringsBoolean, value
# )

if nxdl_type in ("ISO8601", "NX_DATE_TIME"):
iso8601 = re.compile(
r"^(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):(\d{2}(?:"
r"\.\d*)?)(((?!-00:00)(\+|-)(\d{2}):(\d{2})|Z){1})$"
# Apply standard validation to compressed value
value["compress"] = validate_data_value(
compressed_value, nxdl_type, nxdl_enum, nxdl_enum_open, path
)
results = iso8601.search(value)
if results is None:
collector.collect_and_log(path, ValidationProblem.InvalidDatetime, value)

# Check enumeration
if nxdl_enum is not None and value not in nxdl_enum:
if nxdl_enum_open:
collector.collect_and_log(
path,
ValidationProblem.OpenEnumWithNewItem,
nxdl_enum,
)
else:
collector.collect_and_log(
path,
ValidationProblem.InvalidEnum,
nxdl_enum,
)

return value
return value

return validate_data_value(value, nxdl_type, nxdl_enum, nxdl_enum_open, path)


@cache
Expand Down
79 changes: 67 additions & 12 deletions tests/dataconverter/test_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,24 @@ def set_whole_group_to_none(
return internal_dict


def compress_paths_in_dict(
    data_dict: Template, paths: "list[str] | None" = None
) -> "Template | None":
    """Wrap the values stored at ``paths`` in a compression dict of strength 3.

    A new ``Template`` is built from ``data_dict`` and, for every path whose
    value is not ``None``, the value is replaced by
    ``{"compress": value, "strength": 3}``.

    Args:
        data_dict: Template to take the values from. If ``None``, nothing is
            done and ``None`` is returned.
        paths: Keys of the entries to wrap. ``None`` (the default) or an empty
            list leaves every entry untouched. Paths that are missing from the
            template, or whose value is ``None``, are skipped silently.

    Returns:
        The new ``Template`` with the selected entries wrapped, or ``None``
        when ``data_dict`` is ``None``.
    """
    if data_dict is None:
        return None

    # Plain Python ints/floats are cast to NumPy scalars before wrapping;
    # every other type (bool, str, NumPy values, ...) is passed through as is.
    # NOTE(review): presumably the compression payload is expected to be a
    # NumPy value - confirm against the writer.
    numpy_types = {int: np.int64, float: np.float32}

    internal_dict = Template(data_dict)
    for path in paths or ():
        value = internal_dict.get(path)
        if value is None:
            continue
        # Exact-type lookup on purpose: bool is a subclass of int but must
        # not be turned into np.int64.
        np_type = numpy_types.get(type(value))
        if np_type is not None:
            value = np_type(value)
        internal_dict[path] = {"compress": value, "strength": 3}
    return internal_dict


def remove_from_dict(data_dict: Template, key: str, optionality: str = "optional"):
"""Helper function to remove a key from dict"""
if data_dict is not None and key in data_dict[optionality]:
Expand Down Expand Up @@ -1694,27 +1712,63 @@ def listify_template(data_dict: Template):
id="reserved-prefix",
),
pytest.param(
alter_dict(
compress_paths_in_dict(
TEMPLATE,
"/ENTRY[my_entry]/NXODD_name[nxodd_name]/float_value",
{"compress": np.float32(2.0), "strength": 1},
# One path per list element. Every entry needs its own trailing comma:
# adjacent string literals are concatenated by Python into a single path
# that does not exist in the template and would be skipped silently.
[
    "/ENTRY[my_entry]/NXODD_name[nxodd_name]/float_value",
    "/ENTRY[my_entry]/NXODD_name[nxodd_name]/number_value",
    "/ENTRY[my_entry]/NXODD_name[nxodd_name]/bool_value",
    "/ENTRY[my_entry]/NXODD_name[nxodd_name]/int_value",
    "/ENTRY[my_entry]/NXODD_name[nxodd_name]/posint_value",
    "/ENTRY[my_entry]/NXODD_name[nxodd_name]/char_value",
    "/ENTRY[my_entry]/NXODD_name[nxodd_name]/date_value",
    "/ENTRY[my_entry]/NXODD_name[nxodd_name]/type",
],
),
[],
id="appdef-compressed-payload",
id="appdef-compressed",
),
pytest.param(
alter_dict(
alter_dict(
TEMPLATE,
"/ENTRY[my_entry]/NXODD_name[nxodd_name]/float_value",
{"compress": np.int64(2.0), "strength": 1},
),
"/ENTRY[my_entry]/NXODD_name[nxodd_name]/int_value",
{"compress": np.float32(2.0), "strength": 1},
),
[
"The value at /ENTRY[my_entry]/NXODD_name[nxodd_name]/int_value "
"should be one of the following Python types: "
"(<class 'int'>, <class 'numpy.integer'>), as defined in the "
"NXDL as NX_INT."
],
id="appdef-compressed-wrong-type",
),
pytest.param(
alter_dict(
TEMPLATE,
"/ENTRY[my_entry]/NXODD_name[nxodd_name]/int_value",
{"compress": np.int64(2), "strength": 11},
),
[
"Compression strength for /ENTRY[my_entry]/NXODD_name[nxodd_name]/int_value = "
"{'compress': 2, 'strength': 11} should be between 0 and 9.",
],
id="appdef-compressed-invalid-strength",
),
pytest.param(
alter_dict(
TEMPLATE,
"/ENTRY[my_entry]/NXODD_name[nxodd_name]/float_value",
{"compress": np.int64(2.0), "strength": 1},
{"compress": np.float32(2.0), "strength": 0},
),
[
"The value at /ENTRY[my_entry]/NXODD_name[nxodd_name]/float_value "
"should be one of the following Python types: "
"(<class 'float'>, <class 'numpy.floating'>), as defined in the "
"NXDL as NX_FLOAT."
"Compression strength for /ENTRY[my_entry]/NXODD_name[nxodd_name]/float_value "
"is 0. The value '2.0' will be written uncompressed.",
],
id="appdef-compressed-payload-wrong-type",
id="appdef-compressed-strength-0",
),
pytest.param(
alter_dict(
Expand All @@ -1723,7 +1777,7 @@ def listify_template(data_dict: Template):
{"compress": np.int64(2), "strength": 1},
),
[],
id="baseclass-compressed-payload",
id="baseclass-compressed",
),
pytest.param(
alter_dict(
Expand All @@ -1737,7 +1791,7 @@ def listify_template(data_dict: Template):
"(<class 'int'>, <class 'numpy.integer'>), as defined in the "
"NXDL as NX_INT."
],
id="baseclass-compressed-payload-wrong-type",
id="baseclass-compressed-wrong-type",
),
],
)
Expand All @@ -1760,6 +1814,7 @@ def format_error_message(msg: str) -> str:
"baseclass-field-with-illegal-unit",
"open-enum-with-new-item",
"baseclass-open-enum-with-new-item",
"appdef-compressed-strength-0",
):
with caplog.at_level(logging.INFO):
assert validate_dict_against("NXtest", data_dict)
Expand Down
Loading