Skip to content

Commit 37343ba

Browse files
authored
Merge pull request #646 from FAIRmat-NFDI/643-bug-raise-error-if-there-are-two-variadic-concepts-with-the-same-name
Error for multiple different variadic concepts with the same name
2 parents 02e4d65 + 14f9562 commit 37343ba

File tree

3 files changed

+272
-3
lines changed

3 files changed

+272
-3
lines changed

src/pynxtools/dataconverter/helpers.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646

4747

4848
class ValidationProblem(Enum):
49+
DifferentVariadicNodesWithTheSameName = auto()
4950
UnitWithoutDocumentation = auto()
5051
InvalidEnum = auto()
5152
OpenEnumWithNewItem = auto()
@@ -72,6 +73,7 @@ class ValidationProblem(Enum):
7273
ReservedSuffixWithoutField = auto()
7374
ReservedPrefixInWrongContext = auto()
7475
InvalidNexusTypeForNamedConcept = auto()
76+
KeysWithAndWithoutConcept = auto()
7577

7678

7779
class Collector:
@@ -85,6 +87,13 @@ def _log(self, path: str, log_type: ValidationProblem, value: Optional[Any], *ar
8587
if value is None:
8688
value = "<unknown>"
8789

90+
if log_type == ValidationProblem.DifferentVariadicNodesWithTheSameName:
91+
value = cast(Any, value)
92+
logger.warning(
93+
f"Instance name '{path}' used for multiple different concepts: "
94+
f"{', '.join(sorted(set(c for c, _ in value)))}. "
95+
f"The following keys are affected: {', '.join(sorted(set(k for _, k in value)))}."
96+
)
8897
if log_type == ValidationProblem.UnitWithoutDocumentation:
8998
logger.info(
9099
f"The unit, {path} = {value}, is being written but has no documentation."
@@ -177,6 +186,11 @@ def _log(self, path: str, log_type: ValidationProblem, value: Optional[Any], *ar
177186
f"The type ('{args[0] if args else '<unknown>'}') of the given concept '{path}' "
178187
f"conflicts with another existing concept of the same name, which is of type '{value.type}'."
179188
)
189+
elif log_type == ValidationProblem.KeysWithAndWithoutConcept:
190+
value = cast(Any, value)
191+
logger.warning(
192+
f"The key '{path}' uses the valid concept name '{args[0]}', but there is another valid key {value} that uses the non-variadic name of the node.'"
193+
)
180194

181195
def collect_and_log(
182196
self,

src/pynxtools/dataconverter/validation.py

Lines changed: 123 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -850,6 +850,127 @@ def recurse_tree(
850850

851851
handling_map.get(child.type, handle_unknown_type)(child, keys, prev_path)
852852

853+
def find_instance_name_conflicts(mapping: MutableMapping[str, str]) -> None:
854+
"""
855+
Detect and log conflicts where the same variadic instance name is reused across
856+
different concept names.
857+
858+
This function ensures that a given instance name (e.g., 'my_name') is only used
859+
for a single concept (e.g., SAMPLE or USER, but not both). Reusing the same instance
860+
name for different concept names (e.g., SAMPLE[my_name] and USER[my_name]) is
861+
considered a conflict.
862+
863+
For example, this is a conflict:
864+
/ENTRY[entry1]/SAMPLE[my_name]/...
865+
/ENTRY[entry1]/USER[my_name]/...
866+
867+
But this is NOT a conflict:
868+
/ENTRY[entry1]/INSTRUMENT[instrument]/FIELD[my_name]/...
869+
/ENTRY[entry1]/INSTRUMENT[instrument]/DETECTOR[detector]/FIELD[my_name]/...
870+
871+
When such conflicts are found, an error is logged indicating the instance name
872+
and the conflicting concept names. Additionally, all keys involved in the conflict
873+
are logged and added to the `keys_to_remove` list.
874+
875+
Args:
876+
mapping (MutableMapping[str, str]):
877+
The mapping containing the data to validate.
878+
This should be a dict of `/` separated paths, such as
879+
"/ENTRY[entry1]/SAMPLE[sample1]/name".
880+
keys_to_remove (List[str]):
881+
List of keys that will be removed from the template. This is extended here
882+
in the case of conflicts.
883+
884+
"""
885+
pattern = re.compile(r"(?P<concept_name>[^\[\]/]+)\[(?P<instance>[^\]]+)\]")
886+
887+
# Tracks instance usage with respect to their parent group
888+
instance_usage: Dict[Tuple[str, str], List[Tuple[str, str]]] = defaultdict(list)
889+
890+
for key in mapping:
891+
matches = list(pattern.finditer(key))
892+
if not matches:
893+
# The keys contains no concepts with variable name, no need to further check
894+
continue
895+
for match in matches:
896+
concept_name = match.group("concept_name")
897+
instance_name = match.group("instance")
898+
899+
# Determine the parent path up to just before this match
900+
parent_path = key[: match.start()]
901+
child_path = key[match.start() :].split("/", 1)[-1]
902+
903+
# Here we check if for this key with a concept name, another valid key
904+
# with a non-concept name exists.
905+
non_concept_key = f"{parent_path}{instance_name}/{child_path}"
906+
907+
if non_concept_key in mapping:
908+
try:
909+
node = add_best_matches_for(non_concept_key, tree)
910+
if node is not None:
911+
collector.collect_and_log(
912+
key,
913+
ValidationProblem.KeysWithAndWithoutConcept,
914+
non_concept_key,
915+
concept_name,
916+
)
917+
collector.collect_and_log(
918+
key, ValidationProblem.KeyToBeRemoved, "key"
919+
)
920+
keys_to_remove.append(key)
921+
continue
922+
except TypeError:
923+
pass
924+
925+
instance_usage[(instance_name, parent_path)].append((concept_name, key))
926+
927+
for (instance_name, parent_path), entries in sorted(instance_usage.items()):
928+
concept_names = {c for c, _ in entries}
929+
if len(concept_names) > 1:
930+
keys = sorted(k for _, k in entries)
931+
collector.collect_and_log(
932+
instance_name,
933+
ValidationProblem.DifferentVariadicNodesWithTheSameName,
934+
entries,
935+
)
936+
# Now that we have name conflicts, we still need to check that there are
937+
# at least two valid keys in that conflict. Only then we remove these.
938+
# This takes care of the example with keys like
939+
# /ENTRY[my_entry]/USER[some_name]/name and /ENTRY[my_entry]/USERS[some_name]/name,
940+
# where we only want to keep the first one.
941+
valid_keys_with_name_conflicts = []
942+
943+
for key in keys:
944+
try:
945+
node = add_best_matches_for(key, tree)
946+
if node is not None:
947+
valid_keys_with_name_conflicts.append(key)
948+
continue
949+
except TypeError:
950+
pass
951+
collector.collect_and_log(
952+
key, ValidationProblem.KeyToBeRemoved, "key"
953+
)
954+
keys_to_remove.append(key)
955+
956+
if len(valid_keys_with_name_conflicts) >= 1:
957+
# At this point, all invalid keys have been removed.
958+
# If more than one valid concept still uses the same instance name under the same parent path,
959+
# this indicates a semantic ambiguity (e.g., USER[alex] and SAMPLE[alex]).
960+
# We remove these keys as well to avoid conflicts in the writer.
961+
remaining_concepts = {
962+
pattern.findall(k)[-1][0]
963+
for k in valid_keys_with_name_conflicts
964+
if pattern.findall(k)
965+
}
966+
# If multiple valid concept names reuse the same instance name, remove them too
967+
if len(remaining_concepts) > 1:
968+
for valid_key in valid_keys_with_name_conflicts:
969+
collector.collect_and_log(
970+
valid_key, ValidationProblem.KeyToBeRemoved, "key"
971+
)
972+
keys_to_remove.append(valid_key)
973+
853974
def check_attributes_of_nonexisting_field(
854975
node: NexusNode,
855976
):
@@ -1174,10 +1295,11 @@ def check_reserved_prefix(
11741295
"choice": handle_choice,
11751296
}
11761297

1177-
keys_to_remove = []
1298+
keys_to_remove: List[str] = []
11781299

11791300
tree = generate_tree_from(appdef)
11801301
collector.clear()
1302+
find_instance_name_conflicts(mapping)
11811303
nested_keys = build_nested_dict_from(mapping)
11821304
not_visited = list(mapping)
11831305
keys = _follow_link(nested_keys, "")

tests/dataconverter/test_validation.py

Lines changed: 135 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,139 @@ def listify_template(data_dict: Template):
234234
@pytest.mark.parametrize(
235235
"data_dict,error_messages",
236236
[
237+
pytest.param(
238+
alter_dict(
239+
TEMPLATE,
240+
"/ENTRY[my_entry]/NOTE[required_group2]/description",
241+
"an additional description",
242+
),
243+
[
244+
"The key '/ENTRY[my_entry]/NOTE[required_group2]/description' uses the valid concept name 'NOTE', "
245+
"but there is another valid key /ENTRY[my_entry]/required_group2/description that uses the non-variadic "
246+
"name of the node.'",
247+
"The key /ENTRY[my_entry]/NOTE[required_group2]/description will not be written.",
248+
],
249+
id="same-concept-with-and-without-concept-name",
250+
),
251+
pytest.param(
252+
alter_dict(
253+
alter_dict(
254+
alter_dict(
255+
alter_dict(
256+
alter_dict(
257+
TEMPLATE,
258+
"/ENTRY[my_entry]/SAMPLE[some_name]/name",
259+
"A sample name",
260+
),
261+
"/ENTRY[my_entry]/SAMPLE[some_name]/description",
262+
"A sample description",
263+
),
264+
"/ENTRY[my_entry]/USER[some_name]/name",
265+
"A user name",
266+
),
267+
"/ENTRY[my_entry]/MONITOR[some_name]/name",
268+
"A monitor name",
269+
),
270+
"/ENTRY[my_entry]/MONITOR[some_name]/description",
271+
"A monitor description",
272+
),
273+
[
274+
"Instance name 'some_name' used for multiple different concepts: MONITOR, SAMPLE, USER. "
275+
"The following keys are affected: /ENTRY[my_entry]/MONITOR[some_name]/description, "
276+
"/ENTRY[my_entry]/MONITOR[some_name]/name, /ENTRY[my_entry]/SAMPLE[some_name]/description, "
277+
"/ENTRY[my_entry]/SAMPLE[some_name]/name, /ENTRY[my_entry]/USER[some_name]/name.",
278+
"The key /ENTRY[my_entry]/MONITOR[some_name]/description will not be written.",
279+
"The key /ENTRY[my_entry]/MONITOR[some_name]/name will not be written.",
280+
"The key /ENTRY[my_entry]/SAMPLE[some_name]/description will not be written.",
281+
"The key /ENTRY[my_entry]/SAMPLE[some_name]/name will not be written.",
282+
"The key /ENTRY[my_entry]/USER[some_name]/name will not be written.",
283+
],
284+
id="variadic-groups-of-the-same-name",
285+
),
286+
pytest.param(
287+
alter_dict(
288+
alter_dict(
289+
alter_dict(
290+
TEMPLATE,
291+
"/ENTRY[my_entry]/INSTRUMENT[instrument]/APERTURE[another_name]/name",
292+
"An aperture within an instrument",
293+
),
294+
"/ENTRY[my_entry]/INSTRUMENT[instrument]/DETECTOR[another_name]/name",
295+
"A detector within an instrument",
296+
),
297+
"/ENTRY[my_entry]/INSTRUMENT[instrument]/SOURCE[my_source]/APERTURE[another_name]/name",
298+
"An aperture within a source inside an instrument",
299+
),
300+
[
301+
"Instance name 'another_name' used for multiple different concepts: APERTURE, DETECTOR. "
302+
"The following keys are affected: /ENTRY[my_entry]/INSTRUMENT[instrument]/APERTURE[another_name]/name, "
303+
"/ENTRY[my_entry]/INSTRUMENT[instrument]/DETECTOR[another_name]/name.",
304+
"The key /ENTRY[my_entry]/INSTRUMENT[instrument]/APERTURE[another_name]/name will not be written.",
305+
"The key /ENTRY[my_entry]/INSTRUMENT[instrument]/DETECTOR[another_name]/name will not be written.",
306+
],
307+
id="variadic-groups-of-the-same-name-but-at-different-levels",
308+
),
309+
pytest.param(
310+
alter_dict(
311+
alter_dict(
312+
alter_dict(
313+
alter_dict(
314+
alter_dict(
315+
alter_dict(
316+
TEMPLATE,
317+
"/ENTRY[my_entry]/USER[user]/name",
318+
"A user name",
319+
),
320+
"/ENTRY[my_entry]/USER[user]/role",
321+
"A user role",
322+
),
323+
"/ENTRY[my_entry]/USER[user]/affiliation",
324+
"A user affiliation",
325+
),
326+
"/ENTRY[my_entry]/ILLEGAL[user]/name",
327+
"An illegal user name",
328+
),
329+
"/ENTRY[my_entry]/ILLEGAL[user]/role",
330+
"An illegal user role",
331+
),
332+
"/ENTRY[my_entry]/ILLEGAL[user]/affiliation",
333+
"An illegal user affiliation",
334+
),
335+
[
336+
"Instance name 'user' used for multiple different concepts: ILLEGAL, USER. "
337+
"The following keys are affected: /ENTRY[my_entry]/ILLEGAL[user]/affiliation, /ENTRY[my_entry]/ILLEGAL[user]/name, "
338+
"/ENTRY[my_entry]/ILLEGAL[user]/role, /ENTRY[my_entry]/USER[user]/affiliation, /ENTRY[my_entry]/USER[user]/name, "
339+
"/ENTRY[my_entry]/USER[user]/role.",
340+
"The key /ENTRY[my_entry]/ILLEGAL[user]/affiliation will not be written.",
341+
"The key /ENTRY[my_entry]/ILLEGAL[user]/name will not be written.",
342+
"The key /ENTRY[my_entry]/ILLEGAL[user]/role will not be written.",
343+
],
344+
id="variadic-groups-of-the-same-name-illegal-concept-multiple-fields",
345+
),
346+
pytest.param(
347+
alter_dict(
348+
alter_dict(
349+
alter_dict(
350+
TEMPLATE,
351+
"/ENTRY[my_entry]/USER[user]/name",
352+
"A user name",
353+
),
354+
"/ENTRY[my_entry]/USERS[user]/name",
355+
"An invalid group of the same name",
356+
),
357+
"/ENTRY[my_entry]/SAMPLE[user]/name",
358+
"A sample group called user with a name",
359+
),
360+
[
361+
"Instance name 'user' used for multiple different concepts: SAMPLE, USER, USERS. "
362+
"The following keys are affected: /ENTRY[my_entry]/SAMPLE[user]/name, "
363+
"/ENTRY[my_entry]/USERS[user]/name, /ENTRY[my_entry]/USER[user]/name.",
364+
"The key /ENTRY[my_entry]/USERS[user]/name will not be written.",
365+
"The key /ENTRY[my_entry]/SAMPLE[user]/name will not be written.",
366+
"The key /ENTRY[my_entry]/USER[user]/name will not be written.",
367+
],
368+
id="variadic-groups-of-the-same-name-mix-of-valid-and-illegal-concepts",
369+
),
237370
pytest.param(
238371
alter_dict(
239372
alter_dict(
@@ -1079,11 +1212,11 @@ def listify_template(data_dict: Template):
10791212
pytest.param(
10801213
alter_dict(
10811214
TEMPLATE,
1082-
"/ENTRY[my_entry]/INSTRUMENT[my_instrument]/ILLEGAL[my_source]/type",
1215+
"/ENTRY[my_entry]/INSTRUMENT[my_instrument]/ILLEGAL[my_source2]/type",
10831216
1,
10841217
),
10851218
[
1086-
"Field /ENTRY[my_entry]/INSTRUMENT[my_instrument]/ILLEGAL[my_source]/type written without documentation."
1219+
"Field /ENTRY[my_entry]/INSTRUMENT[my_instrument]/ILLEGAL[my_source2]/type written without documentation."
10871220
],
10881221
id="bad-namefitting",
10891222
),

0 commit comments

Comments
 (0)