Skip to content

Commit b9250ca

Browse files
authored
Performance improvements (#7)
* Use integer ids for iteration. * Small efficiency improvement. * Test instructions.
1 parent c41b4a9 commit b9250ca

File tree

4 files changed

+308
-52
lines changed

4 files changed

+308
-52
lines changed

README.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,12 @@ Note: Partition URIs use MD5 hashes of the original IRIs to ensure syntactically
140140
- ty for type checking
141141
- ruff for formatting and linting
142142

143+
### Running Tests
144+
145+
```bash
146+
uv run pytest -v
147+
```
148+
143149
### Type Checking
144150

145151
```bash

tests/conftest.py

Lines changed: 132 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,21 @@
77

88
type RDFTerm = URIRef | Literal | BNode
99
type Triple = tuple[RDFTerm, RDFTerm, RDFTerm]
10+
type TripleID = tuple[int, int, int]
1011
type PatternTerm = URIRef | Literal
1112

1213

1314
class MockHDTDocument:
1415
"""Mock HDTDocument that works with in-memory RDF graphs.
1516
1617
This allows testing the VOID processing logic without actual HDT files.
17-
Mimics the interface of rdflib_hdt.HDTDocument.
18+
Mimics the interface of rdflib_hdt.HDTDocument, including ID-based access.
19+
20+
ID assignment follows HDT dictionary structure:
21+
- Shared terms (both subject and object): IDs 1..S in both spaces
22+
- Subject-only terms: IDs S+1.. in subject space
23+
- Object-only terms: IDs S+1.. in object space
24+
- Predicates: separate ID space 1..P
1825
"""
1926

2027
def __init__(self, graph: Graph) -> None:
@@ -34,6 +41,47 @@ def __init__(self, graph: Graph) -> None:
3441
self._predicates.add(p)
3542
self._objects.add(o)
3643

44+
# HDT dictionary structure
45+
# Explicit annotations needed because set operations and sorted()
46+
# widen the element type to include Buffer (URIRef inherits str
47+
# which supports buffer protocol in Python 3.12+).
48+
shared: set[RDFTerm] = self._subjects & self._objects
49+
subject_only: set[RDFTerm] = self._subjects - shared
50+
object_only: set[RDFTerm] = self._objects - shared
51+
self._nb_shared = len(shared)
52+
53+
shared_sorted: list[RDFTerm] = sorted(shared, key=str) # type: ignore[assignment]
54+
subj_only_sorted: list[RDFTerm] = sorted(subject_only, key=str) # type: ignore[assignment]
55+
obj_only_sorted: list[RDFTerm] = sorted(object_only, key=str) # type: ignore[assignment]
56+
pred_sorted: list[RDFTerm] = sorted(self._predicates, key=str) # type: ignore[assignment]
57+
58+
# Subject ID space: shared (1..S), then subject-only (S+1..)
59+
self._subject_to_id: dict[RDFTerm, int] = {}
60+
self._id_to_subject: dict[int, RDFTerm] = {}
61+
for i, t in enumerate(shared_sorted, 1):
62+
self._subject_to_id[t] = i
63+
self._id_to_subject[i] = t
64+
for i, t in enumerate(subj_only_sorted, self._nb_shared + 1):
65+
self._subject_to_id[t] = i
66+
self._id_to_subject[i] = t
67+
68+
# Object ID space: shared (1..S), then object-only (S+1..)
69+
self._object_to_id: dict[RDFTerm, int] = {}
70+
self._id_to_object: dict[int, RDFTerm] = {}
71+
for i, t in enumerate(shared_sorted, 1):
72+
self._object_to_id[t] = i
73+
self._id_to_object[i] = t
74+
for i, t in enumerate(obj_only_sorted, self._nb_shared + 1):
75+
self._object_to_id[t] = i
76+
self._id_to_object[i] = t
77+
78+
# Predicate ID space: 1..P
79+
self._predicate_to_id: dict[RDFTerm, int] = {}
80+
self._id_to_predicate: dict[int, RDFTerm] = {}
81+
for i, t in enumerate(pred_sorted, 1):
82+
self._predicate_to_id[t] = i
83+
self._id_to_predicate[i] = t
84+
3785
def search(
3886
self, pattern: tuple[PatternTerm | None, PatternTerm | None, PatternTerm | None]
3987
) -> tuple[Iterator[Triple], int]:
@@ -57,6 +105,89 @@ def search(
57105
matches.append((s, p, o))
58106
return iter(matches), len(matches)
59107

108+
def search_ids(
    self,
    query: tuple[int | None, int | None, int | None],
    limit: int = 0,
    offset: int = 0,
) -> tuple[Iterator[TripleID], int]:
    """Search for triples matching the given ID pattern.

    Use 0 or None for wildcards.

    Args:
        query: (subject_id, predicate_id, object_id) pattern.
        limit: Maximum number of results to yield (0 = no limit).
        offset: Number of matching triples to skip before yielding.

    Returns:
        An iterator over matching ID triples and the total match count.
        The count reflects ALL matches, ignoring limit/offset, mirroring
        rdflib_hdt's cardinality semantics.
    """
    s_id = query[0] or 0
    p_id = query[1] or 0
    o_id = query[2] or 0

    # Convert non-zero IDs to terms for matching
    s_filter = self._id_to_subject.get(s_id) if s_id else None
    p_filter = self._id_to_predicate.get(p_id) if p_id else None
    o_filter = self._id_to_object.get(o_id) if o_id else None

    # Non-zero ID not found in dictionary → no matches
    if (
        (s_id and s_filter is None)
        or (p_id and p_filter is None)
        or (o_id and o_filter is None)
    ):
        return iter([]), 0

    matches: list[TripleID] = []
    for s, p, o in self._triples:
        if s_filter is not None and s != s_filter:
            continue
        if p_filter is not None and p != p_filter:
            continue
        if o_filter is not None and o != o_filter:
            continue
        matches.append(
            (
                self._subject_to_id[s],
                self._predicate_to_id[p],
                self._object_to_id[o],
            )
        )
    total = len(matches)
    # BUG FIX: limit/offset were previously accepted but silently ignored.
    # Apply them to the yielded window while still reporting the full
    # cardinality, matching rdflib_hdt.HDTDocument.search_ids behavior.
    # Defaults (limit=0, offset=0) preserve the old full-result behavior.
    window = matches[offset : offset + limit] if limit else matches[offset:]
    return iter(window), total
151+
152+
def term_to_id(self, term: RDFTerm, kind: int) -> int:
    """Convert an rdflib term to its HDT integer ID.

    Args:
        term: The rdflib term
        kind: 0=subject, 1=predicate, 2=object

    Returns:
        Integer ID, or 0 if not found
    """
    # Select the dictionary for the requested term position; an
    # unrecognized kind yields no dictionary and therefore ID 0.
    dictionaries = {
        0: self._subject_to_id,
        1: self._predicate_to_id,
        2: self._object_to_id,
    }
    mapping = dictionaries.get(kind)
    if mapping is None:
        return 0
    return mapping.get(term, 0)
169+
170+
def id_to_term(self, term_id: int, kind: int) -> RDFTerm:
    """Convert an HDT integer ID to its rdflib term.

    Args:
        term_id: The integer ID
        kind: 0=subject, 1=predicate, 2=object

    Raises:
        ValueError: If kind is not 0, 1, or 2.
        KeyError: If term_id is absent from the selected dictionary.
    """
    # Pick the reverse dictionary for the requested position, then index
    # into it; unknown kinds are rejected before any lookup happens.
    if kind == 0:
        mapping = self._id_to_subject
    elif kind == 1:
        mapping = self._id_to_predicate
    elif kind == 2:
        mapping = self._id_to_object
    else:
        msg = f"Invalid kind: {kind}"
        raise ValueError(msg)
    return mapping[term_id]
185+
186+
@property
def nb_shared(self) -> int:
    """Number of terms that occur as both subject and object."""
    return self._nb_shared
190+
60191
@property
61192
def total_triples(self) -> int:
62193
"""Get total number of triples."""

void_hdt/cli.py

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""Command-line interface for void-hdt."""
22

3+
import resource
34
import sys
45
from pathlib import Path
56

@@ -10,6 +11,14 @@
1011
from void_hdt.void_generator import VOIDGenerator
1112

1213

14+
def _get_rss_gb() -> float:
15+
"""Get peak RSS in GB."""
16+
usage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
17+
if sys.platform == "darwin":
18+
return usage / (1024**3) # macOS: bytes
19+
return usage / (1024**2) # Linux: KB
20+
21+
1322
@click.command()
1423
@click.argument("hdt_file", type=click.Path(exists=True, path_type=Path))
1524
@click.option(
@@ -30,7 +39,20 @@
3039
default=False,
3140
help="Use blank nodes for partition nodes instead of URI references",
3241
)
33-
def main(hdt_file: Path, output: Path, dataset_uri: str, use_blank_nodes: bool) -> None:
42+
@click.option(
43+
"--cache-size",
44+
type=int,
45+
default=2_000_000,
46+
show_default=True,
47+
help="Max entries in the type-lookup cache (trades memory for speed)",
48+
)
49+
def main(
50+
hdt_file: Path,
51+
output: Path,
52+
dataset_uri: str,
53+
use_blank_nodes: bool,
54+
cache_size: int,
55+
) -> None:
3456
"""Generate VOID vocabulary descriptions from HDT files.
3557
3658
Processes an HDT file to extract dataset statistics, class partitions,
@@ -57,8 +79,13 @@ def main(hdt_file: Path, output: Path, dataset_uri: str, use_blank_nodes: bool)
5779
click.echo(f" Distinct objects: {document.nb_objects}")
5880

5981
# Analyze class and property partitions (two passes through data)
82+
click.echo(f"Peak RSS before analysis: {_get_rss_gb():.1f} GB")
6083
click.echo("Analyzing class partitions...")
61-
analyzer.analyze(document)
84+
85+
def _progress(msg: str) -> None:
86+
click.echo(f"{msg} [RSS: {_get_rss_gb():.1f} GB]")
87+
88+
analyzer.analyze(document, cache_size=cache_size, progress_fn=_progress)
6289

6390
class_count = len(analyzer.class_partitions)
6491
click.echo(f" Found {class_count} classes")

0 commit comments

Comments
 (0)