Skip to content

Commit 3483bd4

Browse files
committed
First ideas of validation based on hdf tree traversal
1 parent 91c0420 commit 3483bd4

File tree

3 files changed

+115
-6
lines changed

3 files changed

+115
-6
lines changed

dev-requirements.txt

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,21 @@
11
#
2-
# This file is autogenerated by pip-compile with Python 3.11
2+
# This file is autogenerated by pip-compile with Python 3.10
33
# by the following command:
44
#
55
# pip-compile --extra=dev --extra=docs --output-file=dev-requirements.txt pyproject.toml
66
#
7+
annotated-types==0.6.0
8+
# via pydantic
9+
anytree==2.12.1
10+
# via pynxtools (pyproject.toml)
711
ase==3.22.1
812
# via pynxtools (pyproject.toml)
913
babel==2.14.0
1014
# via mkdocs-material
1115
build==1.1.1
1216
# via pip-tools
17+
cachetools==5.3.3
18+
# via pynxtools (pyproject.toml)
1319
certifi==2024.2.2
1420
# via requests
1521
cfgv==3.4.0
@@ -34,6 +40,8 @@ cycler==0.12.1
3440
# via matplotlib
3541
distlib==0.3.8
3642
# via virtualenv
43+
exceptiongroup==1.2.1
44+
# via pytest
3745
filelock==3.13.3
3846
# via virtualenv
3947
fonttools==4.50.0
@@ -130,6 +138,10 @@ pluggy==1.4.0
130138
# via pytest
131139
pre-commit==3.7.0
132140
# via pynxtools (pyproject.toml)
141+
pydantic==2.7.1
142+
# via pynxtools (pyproject.toml)
143+
pydantic-core==2.18.2
144+
# via pydantic
133145
pygments==2.17.2
134146
# via mkdocs-material
135147
pymdown-extensions==10.7.1
@@ -176,19 +188,32 @@ ruff==0.3.4
176188
scipy==1.12.0
177189
# via ase
178190
six==1.16.0
179-
# via python-dateutil
191+
# via
192+
# anytree
193+
# python-dateutil
180194
structlog==24.1.0
181195
# via pynxtools (pyproject.toml)
182196
termcolor==2.4.0
183197
# via mkdocs-macros-plugin
198+
tomli==2.0.1
199+
# via
200+
# build
201+
# coverage
202+
# mypy
203+
# pip-tools
204+
# pyproject-hooks
205+
# pytest
184206
types-pytz==2024.1.0.20240203
185207
# via pynxtools (pyproject.toml)
186208
types-pyyaml==6.0.12.20240311
187209
# via pynxtools (pyproject.toml)
188210
types-requests==2.31.0.20240311
189211
# via pynxtools (pyproject.toml)
190212
typing-extensions==4.10.0
191-
# via mypy
213+
# via
214+
# mypy
215+
# pydantic
216+
# pydantic-core
192217
tzdata==2024.1
193218
# via pandas
194219
urllib3==2.2.1

pynxtools/dataconverter/validation.py

Lines changed: 86 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,14 @@
2020
from collections import defaultdict
2121
from functools import reduce
2222
from operator import getitem
23-
from typing import Any, Iterable, List, Mapping, Optional, Tuple, Union
23+
from typing import Any, Iterable, List, Mapping, Optional, Set, Tuple, Union
2424

2525
import h5py
2626
import lxml.etree as ET
2727
import numpy as np
2828
from anytree import Resolver
29+
from cachetools import LRUCache, cached
30+
from cachetools.keys import hashkey
2931

3032
from pynxtools.dataconverter.helpers import (
3133
Collector,
@@ -42,20 +44,101 @@
4244
from pynxtools.definitions.dev_tools.utils.nxdl_utils import get_nx_namefit
4345

4446

47+
def best_namefit_of_(
48+
name: str, concepts: Set[str], nx_class: Optional[str] = None
49+
) -> str:
50+
# TODO: Find the best namefit of name in concepts
51+
# Consider nx_class if it is not None
52+
...
53+
54+
4555
def validate_hdf_group_against(appdef: str, data: h5py.Group):
4656
"""
4757
Checks whether all the required paths from the template are returned in data dict.
4858
4959
THIS IS JUST A FUNCTION SKELETON AND IS NOT WORKING YET!
5060
"""
5161

52-
def validate(name: str, data: Union[h5py.Group, h5py.Dataset]):
62+
# Only cache based on path. That way we retain the nx_class information
63+
# in the tree
64+
# Allow for 10000 cache entries. This should be enough for most cases
65+
@cached(
66+
cache=LRUCache(maxsize=10000),
67+
key=lambda path, _=None: hashkey(path),
68+
)
69+
def find_node_for(path: str, nx_class: Optional[str] = None) -> Optional[NexusNode]:
70+
if path == "":
71+
return tree
72+
73+
prev_path, last_elem = path.rsplit("/", 1)
74+
node = find_node_for(prev_path)
75+
76+
best_child = best_namefit_of_(
77+
last_elem,
78+
# TODO: Consider renaming `get_all_children_names` to
79+
# `get_all_direct_children_names`. Because that's what it is.
80+
node.get_all_children_names(),
81+
nx_class,
82+
)
83+
if best_child is None:
84+
return None
85+
86+
return node.search_child_with_name(best_child)
87+
88+
def remove_from_req_fields(path: str):
89+
if path in required_fields:
90+
required_fields.remove(path)
91+
92+
def handle_group(path: str, data: h5py.Group):
93+
node = find_node_for(path, data.attrs.get("NX_class"))
94+
if node is None:
95+
# TODO: Log undocumented
96+
return
97+
98+
# TODO: Do actual group checks
99+
100+
def handle_field(path: str, data: h5py.Dataset):
101+
node = find_node_for(path)
102+
if node is None:
103+
# TODO: Log undocumented
104+
return
105+
remove_from_req_fields(f"{path}")
106+
107+
# TODO: Do actual field checks
108+
109+
def handle_attributes(path: str, attribute_names: h5py.AttributeManager):
110+
for attr_name in attribute_names:
111+
node = find_node_for(f"{path}/{attr_name}")
112+
if node is None:
113+
# TODO: Log undocumented
114+
continue
115+
remove_from_req_fields(f"{path}/@{attr_name}")
116+
117+
# TODO: Do actual attribute checks
118+
119+
def validate(path: str, data: Union[h5py.Group, h5py.Dataset]):
53120
# Namefit name against tree (use recursive caching)
54-
pass
121+
if isinstance(data, h5py.Group):
122+
handle_group(path, data)
123+
elif isinstance(data, h5py.Dataset):
124+
handle_field(path, data)
125+
126+
handle_attributes(path, data.attrs)
55127

56128
tree = generate_tree_from(appdef)
129+
required_fields = tree.required_fields_and_attrs_names()
57130
data.visititems(validate)
58131

132+
for req_field in required_fields:
133+
if "@" in req_field:
134+
collector.collect_and_log(
135+
req_field, ValidationProblem.MissingRequiredAttribute, None
136+
)
137+
continue
138+
collector.collect_and_log(
139+
req_field, ValidationProblem.MissingRequiredField, None
140+
)
141+
59142

60143
def build_nested_dict_from(
61144
mapping: Mapping[str, Any],

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ dependencies = [
3636
"lxml>=4.9.1",
3737
"anytree",
3838
"pydantic",
39+
"cachetools",
3940
]
4041

4142
[project.urls]

0 commit comments

Comments
 (0)