Skip to content

Commit c73bd79

Browse files
committed
New xdmf.XmlStream for iterative parsing of xdmf
The XML tree is no longer fully built in memory: the new parsing strategy progressively builds and discards the tree as it is parsed. This greatly reduces the memory footprint when parsing xmf files.
1 parent abf8aa0 commit c73bd79

File tree

2 files changed

+122
-46
lines changed

2 files changed

+122
-46
lines changed

stagpy/stagyyparsers.py

Lines changed: 50 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222

2323
from .error import ParsingError
2424
from .phyvars import FIELD_FILES_H5, SFIELD_FILES_H5
25+
from .xdmf import XmlStream
2526

2627
if typing.TYPE_CHECKING:
2728
from pathlib import Path
@@ -853,66 +854,69 @@ def _maybe_get(
853854

854855
@cached_property
855856
def _data(self) -> Mapping[int, XmfEntry]:
856-
# Geometry stuff from surface field is not useful
857+
xs = XmlStream(filepath=self.path)
857858
data = {}
858-
root = xmlET.parse(str(self.path)).getroot()
859-
for snap in root[0][0]:
860-
time = self._maybe_get(snap, "Time", "Value", float)
861-
mo_lambda = self._maybe_get(snap, "mo_lambda", "Value", float)
862-
mo_thick_sol = self._maybe_get(snap, "mo_thick_sol", "Value", float)
859+
for _ in xs.iter_tag("Time"):
860+
time = float(xs.current.attrib["Value"])
861+
xs.advance()
862+
extra: dict[str, float] = {}
863+
while xs.current.tag != "Grid":
864+
# mo_lambda, mo_thick_sol
865+
extra[xs.current.tag] = float(xs.current.attrib["Value"])
866+
xs.advance()
863867

864868
yin_yang = False
865869
twod = None
866870

867-
elt_subdomain = _try_find(self.path, snap, "Grid")
868-
elt_geom = _try_find(self.path, elt_subdomain, "Geometry")
869-
if elt_geom.get("Type") == "X_Y":
870-
twod = ""
871-
for data_item in elt_geom.findall("DataItem"):
872-
coord = _try_text(self.path, data_item).strip()[-1]
873-
if coord in "XYZ":
874-
twod += coord
875-
data_item = _try_find(self.path, elt_geom, "DataItem")
876-
data_text = _try_text(self.path, data_item)
877-
coord_shape = _get_dim(self.path, data_item)
878-
coord_filepattern = data_text.strip().split(":/", 1)[0]
879-
coord_file_chunks = coord_filepattern.split("_")
880-
coord_file_chunks[-2] = "{icore:05d}"
881-
coord_filepattern = "_".join(coord_file_chunks)
871+
xs.skip_to_tag("Geometry")
872+
with xs.load() as elt_geom:
873+
if elt_geom.get("Type") == "X_Y":
874+
twod = ""
875+
for data_item in elt_geom:
876+
coord = _try_text(xs.filepath, data_item).strip()[-1]
877+
if coord in "XYZ":
878+
twod += coord
879+
data_item = elt_geom[0]
880+
data_text = _try_text(xs.filepath, data_item)
881+
coord_shape = _get_dim(xs.filepath, data_item)
882+
coord_filepattern = data_text.strip().split(":/", 1)[0]
883+
coord_file_chunks = coord_filepattern.split("_")
884+
coord_file_chunks[-2] = "{icore:05d}"
885+
coord_filepattern = "_".join(coord_file_chunks)
882886

883887
fields_info = {}
884-
for elt_fvar in elt_subdomain.findall("Attribute"):
885-
name = _try_get(self.path, elt_fvar, "Name")
886-
elt_data = _try_find(self.path, elt_fvar, "DataItem")
887-
shape = _get_dim(self.path, elt_data)
888-
data_text = _try_text(self.path, elt_data)
889-
h5file, group = data_text.strip().split(":/", 1)
890-
isnap = int(group[-5:])
891-
i0_yin = int(group[-11:-6]) - 1
892-
ifile = int(h5file[-14:-9])
893-
fields_info[name] = (ifile, shape)
894-
895-
i1_yin = i0_yin
888+
while xs.current.tag == "Attribute":
889+
with xs.load() as elt_fvar:
890+
name = elt_fvar.attrib["Name"]
891+
elt_data = elt_fvar[0]
892+
shape = _get_dim(xs.filepath, elt_data)
893+
data_text = _try_text(xs.filepath, elt_data)
894+
h5file, group = data_text.strip().split(":/", 1)
895+
isnap = int(group[-5:])
896+
i0_yin = int(group[-11:-6]) - 1
897+
ifile = int(h5file[-14:-9])
898+
fields_info[name] = (ifile, shape)
899+
900+
i1_yin = i0_yin + 1
896901
i0_yang = 0
897902
i1_yang = 0
898-
for elt_subdomain in snap.findall("Grid"):
899-
elt_name = _try_get(self.path, elt_subdomain, "Name")
900-
if elt_name.startswith("meshYang"):
901-
yin_yang = True
902-
elt_fvar = _try_find(self.path, elt_subdomain, "Attribute")
903-
elt_data = _try_find(self.path, elt_fvar, "DataItem")
904-
data_text = _try_text(self.path, elt_data)
905-
_, group = data_text.strip().split(":/", 1)
906-
i0_yang = int(group[-11:-6]) - 1
907-
i1_yang = i0_yang + (i1_yin - i0_yin)
903+
for _ in xs.iter_tag("Grid"):
904+
if xs.current.attrib["GridType"] == "Collection":
908905
break
909-
i1_yin += 1
906+
if (name := xs.current.attrib["Name"]).startswith("meshYang"):
907+
if i1_yang == 0:
908+
yin_yang = True
909+
i0_yang = int(name[-5:]) - 1
910+
i1_yang = i0_yang + (i1_yin - i0_yin)
911+
else:
912+
i1_yin += 1
913+
xs.drop()
910914

911915
data[isnap] = XmfEntry(
912916
isnap=isnap,
913917
time=time,
914-
mo_lambda=mo_lambda,
915-
mo_thick_sol=mo_thick_sol,
918+
mo_lambda=extra.get("mo_lambda"),
919+
mo_thick_sol=extra.get("mo_thick_sol"),
916920
yin_yang=yin_yang,
917921
twod=twod,
918922
coord_filepattern=coord_filepattern,

stagpy/xdmf.py

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
from __future__ import annotations
2+
3+
from contextlib import contextmanager
4+
from functools import cached_property
5+
from pathlib import Path
6+
from typing import Iterator
7+
from xml.etree import ElementTree as ET
8+
9+
10+
class XmlStream:
    """Forward-only cursor over the elements of an XML file.

    The file is read with `xml.etree.ElementTree.iterparse`; elements are
    cleared once their "end" event has been consumed, so the whole tree is
    never held in memory at once.

    Args:
        filepath: path of the XML file to parse.
    """

    def __init__(self, filepath: Path):
        self.filepath = filepath
        # Pretend the previous event was an "end" so that the first access to
        # `current` pulls the first "start" event from the file.
        self._event = "end"
        # Only bound once a first event has been read by `_to_next_start`.
        self._elem: ET.Element

    @cached_property
    def _cursor(self) -> Iterator[tuple[str, ET.Element]]:
        """Lazily created iterator over ("start"/"end", element) events."""
        return ET.iterparse(str(self.filepath), events=("start", "end"))

    def _to_next_start(self) -> ET.Element:
        """Consume events up to the next "start" event and return its element.

        Elements whose "end" event is met on the way are cleared.

        Raises:
            RuntimeError: if the events are exhausted before a "start" event.
        """
        for self._event, self._elem in self._cursor:
            if self._event == "start":
                return self._elem
            self._elem.clear()
        raise RuntimeError("Reached end of file")

    def _finish_current(self, *, clear_children: bool) -> ET.Element:
        """Consume events up to the "end" event of the current element.

        Args:
            clear_children: whether descendants are cleared as soon as their
                own "end" event is met.

        Returns:
            The current element, at its "end" event.  It is not cleared.

        Raises:
            RuntimeError: if the events are exhausted before that "end" event.
        """
        self.current  # make sure to be at current "start" event
        depth = 0  # number of descendants currently opened
        for self._event, elem in self._cursor:
            if self._event == "start":
                depth += 1
            elif depth == 0:
                return elem
            else:
                depth -= 1
                if clear_children:
                    elem.clear()
        raise RuntimeError("Reached end of file")

    @property
    def current(self) -> ET.Element:
        """Element at "start" event.

        If the last consumed event is an "end" event, the stream first
        progresses to the next "start" event.

        Raises:
            RuntimeError: if there is no further "start" event in the file.
        """
        if self._event == "start":
            return self._elem
        return self._to_next_start()

    def advance(self) -> None:
        """Advance to next "start" event.

        Raises:
            RuntimeError: if there is no further "start" event in the file.
        """
        self.current  # make sure to be at current "start" event
        self._to_next_start()

    def skip_to_tag(self, tag: str) -> None:
        """Progress in file (both width and depth) until reaching the given tag.

        Nothing is consumed if the current element already has that tag.

        Raises:
            RuntimeError: if the end of file is reached before finding the tag.
        """
        while self.current.tag != tag:
            self.advance()

    def iter_tag(self, tag: str) -> Iterator[None]:
        """Yield each time the stream is positioned on an element with `tag`.

        The matching element is available as `current`.  The loop body has to
        move past it (`advance`, `drop`, or `load`), otherwise the same element
        is yielded again.  Iteration stops when the end of file is reached.
        """
        while True:
            try:
                self.skip_to_tag(tag)
            except RuntimeError:
                # end of file: no more element with this tag
                return
            yield None

    def drop(self) -> None:
        """Discard the current element and its children.

        Raises:
            RuntimeError: if the end of file is reached before the end of the
                current element.
        """
        self._finish_current(clear_children=True).clear()

    @contextmanager
    def load(self) -> Iterator[ET.Element]:
        """Fully read the current element and its children.

        The element is cleared when leaving the `with` block, including when
        the block exits with an exception.

        Raises:
            RuntimeError: if the end of file is reached before the end of the
                current element.
        """
        elem = self._finish_current(clear_children=False)
        try:
            yield elem
        finally:
            elem.clear()

0 commit comments

Comments
 (0)