group logic for step/snap correspondence in _caching.StepSnap

amorison · amorison · commit 684769005ab5 · 2024-11-29T01:32:34.000Z
diff --git a/src/stagpy/_caching.py b/src/stagpy/_caching.py
@@ -1,12 +1,19 @@
 from __future__ import annotations
 
+import re
 import typing
+from abc import ABC, abstractmethod
 from collections import deque
 from dataclasses import dataclass
 from functools import cached_property
 
+from . import phyvars, stagyyparsers
+
 if typing.TYPE_CHECKING:
+    from typing import Mapping
+
     from .datatypes import Field
+    from .stagyydata import StagyyData
 
 
 @dataclass(frozen=True)
@@ -57,3 +64,119 @@ def evict_istep(self, istep: int) -> None:
         self._stack.clear()
         self._stack.extend(to_keep)
         assert len(self._stack) == len(self._data)
+
+
+class StepSnap(ABC):
+    """Keep track of the step/snap correspondence."""
+
+    @abstractmethod
+    def istep(self, *, isnap: int) -> int | None: ...  # step of snap, or None
+
+    @abstractmethod
+    def isnap(self, *, istep: int) -> int | None: ...  # snap of step, or None
+
+    @abstractmethod
+    def len_snap(self) -> int: ...  # one past the largest snapshot index
+
+
+@dataclass(frozen=True)
+class StepSnapInfo:
+    step_to_snap: Mapping[int, int]  # istep -> isnap
+    snap_to_step: Mapping[int, int]  # isnap -> istep
+    isnap_max: int  # last isnap read, -1 when none was read
+
+
+@dataclass(frozen=True)
+class StepSnapH5(StepSnap):
+    """Step/snap correspondence built from `stagyyparsers.read_time_h5`."""
+
+    sdat: StagyyData
+
+    @cached_property
+    def _info(self) -> StepSnapInfo:
+        # Both lookup tables are filled in a single pass, then cached.
+        assert self.sdat.hdf5 is not None
+        last_snap = -1  # stays -1 when nothing is yielded
+        snap_of_step: dict[int, int] = {}
+        step_of_snap: dict[int, int] = {}
+        for last_snap, step in stagyyparsers.read_time_h5(self.sdat.hdf5):
+            snap_of_step[step] = last_snap
+            step_of_snap[last_snap] = step
+        return StepSnapInfo(snap_of_step, step_of_snap, last_snap)
+
+    def istep(self, *, isnap: int) -> int | None:
+        return self._info.snap_to_step.get(isnap)
+
+    def isnap(self, *, istep: int) -> int | None:
+        return self._info.step_to_snap.get(istep)
+
+    def len_snap(self) -> int:
+        # one past the last snapshot index yielded by the reader
+        return 1 + self._info.isnap_max
+
+
+@dataclass(frozen=True)
+class StepSnapLegacy(StepSnap):
+    """Step/snap correspondence probed lazily from legacy field files."""
+
+    sdat: StagyyData
+
+    @cached_property
+    def _step_to_snap(self) -> dict[int, int | None]:
+        # istep -> isnap cache; None marks a step known to have no snapshot
+        return {}
+
+    @cached_property
+    def _snap_to_step(self) -> dict[int, int | None]:
+        # isnap -> istep cache; None marks a snapshot whose istep is unknown
+        return {}
+
+    @cached_property
+    def isnap_max(self) -> int:
+        """Largest snapshot index among the field files, -1 if none."""
+        imax = -1
+        # field files are named <output stem>_<field stem><5-digit isnap>
+        out_stem = re.escape(self.sdat.par.legacy_output("_").name[:-1])
+        rgx = re.compile(f"^{out_stem}_([a-zA-Z]+)([0-9]{{5}})$")
+        fstems = set(phyvars.FIELD_FILES)
+        for fname in self.sdat._files:
+            match = rgx.match(fname.name)
+            if match is not None and match.group(1) in fstems:
+                imax = max(int(match.group(2)), imax)
+        return imax
+
+    def len_snap(self) -> int:
+        """Return isnap_max + 1, i.e. 0 when no field file was found."""
+        return self.isnap_max + 1
+
+    def istep(self, *, isnap: int) -> int | None:
+        """Step index of snapshot `isnap`, None if it cannot be determined."""
+        if isnap < 0 or isnap > self.isnap_max:
+            return None
+        istep = self._snap_to_step.get(isnap, -1)
+        if istep == -1:
+            # not cached yet: read it from one of the snapshot's field files
+            binfiles = self.sdat._binfiles_set(isnap)
+            istep = stagyyparsers.field_istep(binfiles.pop()) if binfiles else None
+            self._snap_to_step[isnap] = istep
+            if istep is not None:
+                self._step_to_snap[istep] = isnap
+        return istep
+
+    def isnap(self, *, istep: int) -> int | None:
+        """Snapshot index of step `istep`, None if it has no snapshot."""
+        if istep < 0:
+            return None
+        if istep not in self._step_to_snap:
+            # Scan snapshots in increasing order; `self.istep` caches every
+            # pair it resolves. A missing snapshot yields None and is skipped,
+            # so no stale istep is ever bound to it, and the scan stops at
+            # isnap_max. Assumes isteps increase with isnap, hence the early
+            # exit once `istep` is reached or passed.
+            for snap in range(self.len_snap()):
+                found = self.istep(isnap=snap)
+                if found is not None and found >= istep:
+                    break
+            # still unknown after the scan: remember there is no snapshot
+            self._step_to_snap.setdefault(istep, None)
+        return self._step_to_snap[istep]
diff --git a/src/stagpy/stagyydata.py b/src/stagpy/stagyydata.py
@@ -8,7 +8,6 @@
 
 from __future__ import annotations
 
-import re
 import typing
 from collections import abc
 from dataclasses import dataclass, field
@@ -20,7 +19,7 @@
 
 from . import _helpers, error, phyvars, stagyyparsers, step
 from . import datatypes as dt
-from ._caching import FieldCache
+from ._caching import FieldCache, StepSnap, StepSnapH5, StepSnapLegacy
 from .parfile import StagyyPar
 from .stagyyparsers import FieldXmf, TracersXmf
 from .step import Step
@@ -390,15 +389,10 @@ class Snaps:
 
     def __init__(self, sdat: StagyyData):
         self.sdat = sdat
-        self._all_isteps_known = False
 
     def __repr__(self) -> str:
         return f"{self.sdat!r}.snaps"
 
-    @cached_property
-    def _isteps(self) -> dict[int, int | None]:
-        return {}
-
     @typing.overload
     def __getitem__(self, isnap: int) -> Step: ...
 
@@ -412,53 +406,21 @@ def __getitem__(self, isnap: int | slice | Sequence[StepIndex]) -> Step | StepsV
         assert isinstance(isnap, int)
         if isnap < 0:
             isnap += len(self)
-        if isnap < 0 or isnap >= len(self):
-            istep = None
-        else:
-            istep = self._isteps.get(isnap, None if self._all_isteps_known else -1)
-        if istep == -1:
-            # isnap not in _isteps but not all isteps known, keep looking
-            binfiles = self.sdat._binfiles_set(isnap)
-            if binfiles:
-                istep = stagyyparsers.field_istep(binfiles.pop())
-            else:
-                istep = None
-            if istep is not None:
-                self._bind(isnap, istep)
-            else:
-                self._isteps[isnap] = None
+        istep = self.sdat._step_snap.istep(isnap=isnap)
         if istep is None:
             raise error.InvalidSnapshotError(self.sdat, isnap, "Invalid snapshot index")
         return self.sdat.steps[istep]
 
     def __delitem__(self, isnap: int | None) -> None:
         if isnap is not None:
-            istep = self._isteps.get(isnap)
+            istep = self.sdat._step_snap.istep(isnap=isnap)  # None if unknown
             del self.sdat.steps[istep]
 
-    @cached_property
-    def _len(self) -> int:
-        length = -1
-        if self.sdat.hdf5:
-            isnap = -1
-            for isnap, istep in stagyyparsers.read_time_h5(self.sdat.hdf5):
-                self._bind(isnap, istep)
-            length = isnap
-            self._all_isteps_known = True
-        if length < 0:
-            out_stem = re.escape(self.sdat.par.legacy_output("_").name[:-1])
-            rgx = re.compile(f"^{out_stem}_([a-zA-Z]+)([0-9]{{5}})$")
-            fstems = set(fstem for fstem in phyvars.FIELD_FILES)
-            for fname in self.sdat._files:
-                match = rgx.match(fname.name)
-                if match is not None and match.group(1) in fstems:
-                    length = max(int(match.group(2)), length)
-        if length < 0:
-            raise error.NoSnapshotError(self.sdat)
-        return length + 1
-
     def __len__(self) -> int:
-        return self._len
+        nsnaps = self.sdat._step_snap.len_snap()
+        if nsnaps > 0:
+            return nsnaps
+        raise error.NoSnapshotError(self.sdat)
 
     def __iter__(self) -> Iterator[Step]:
         return iter(self[:])
@@ -490,16 +452,6 @@ def at_time(self, time: float, after: bool = False) -> Step:
             igp -= 1
         return self[igp]
 
-    def _bind(self, isnap: int, istep: int) -> None:
-        """Register the isnap / istep correspondence.
-
-        Args:
-            isnap: snapshot index.
-            istep: time step index.
-        """
-        self._isteps[isnap] = istep
-        self.sdat.steps[istep]._isnap = isnap
-
     def filter(
         self,
         snap: bool = False,
@@ -824,3 +776,9 @@ def _binfiles_set(self, isnap: int) -> set[Path]:
     @cached_property
     def _field_cache(self) -> FieldCache:
         return FieldCache(maxsize=50)
+
+    @cached_property
+    def _step_snap(self) -> StepSnap:
+        # pick the step/snap tracker matching the output format of the run
+        tracker = StepSnapLegacy if self.hdf5 is None else StepSnapH5
+        return tracker(sdat=self)
diff --git a/src/stagpy/step.py b/src/stagpy/step.py
@@ -596,7 +596,6 @@ def __init__(self, istep: int, sdat: StagyyData):
         )
         self.tracers = Tracers(self)
         self.rprofs = Rprofs(self)
-        self._isnap: int | None = -1
 
     def __repr__(self) -> str:
         if self.isnap is not None:
@@ -642,19 +641,4 @@ def isnap(self) -> int | None:
 
         It is None if no snapshot exists for the time step.
         """
-        if self._isnap == -1:
-            istep = None
-            isnap = -1
-            # could be more efficient if do 0 and -1 then bisection
-            # (but loose intermediate <- would probably use too much
-            # memory for what it's worth if search algo is efficient)
-            while (istep is None or istep < self.istep) and isnap < 99999:
-                isnap += 1
-                try:
-                    istep = self.sdat.snaps[isnap].istep
-                except KeyError:
-                    pass
-                # all intermediate istep could have their ._isnap to None
-            if istep != self.istep:
-                self._isnap = None
-        return self._isnap
+        return self.sdat._step_snap.isnap(istep=self.istep)