Skip to content

Commit b9250ca

Browse files
authored
Performance improvements (#7)
* Use integer ids for iteration. * Small efficiency improvement. * Test instructions.
1 parent c41b4a9 commit b9250ca

File tree

4 files changed

+308
-52
lines changed

4 files changed

+308
-52
lines changed

README.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,12 @@ Note: Partition URIs use MD5 hashes of the original IRIs to ensure syntactically
140140
- ty for type checking
141141
- ruff for formatting and linting
142142

143+
### Running Tests
144+
145+
```bash
146+
uv run pytest -v
147+
```
148+
143149
### Type Checking
144150

145151
```bash

tests/conftest.py

Lines changed: 132 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,21 @@
77

88
type RDFTerm = URIRef | Literal | BNode
99
type Triple = tuple[RDFTerm, RDFTerm, RDFTerm]
10+
type TripleID = tuple[int, int, int]
1011
type PatternTerm = URIRef | Literal
1112

1213

1314
class MockHDTDocument:
1415
"""Mock HDTDocument that works with in-memory RDF graphs.
1516
1617
This allows testing the VOID processing logic without actual HDT files.
17-
Mimics the interface of rdflib_hdt.HDTDocument.
18+
Mimics the interface of rdflib_hdt.HDTDocument, including ID-based access.
19+
20+
ID assignment follows HDT dictionary structure:
21+
- Shared terms (both subject and object): IDs 1..S in both spaces
22+
- Subject-only terms: IDs S+1.. in subject space
23+
- Object-only terms: IDs S+1.. in object space
24+
- Predicates: separate ID space 1..P
1825
"""
1926

2027
def __init__(self, graph: Graph) -> None:
@@ -34,6 +41,47 @@ def __init__(self, graph: Graph) -> None:
3441
self._predicates.add(p)
3542
self._objects.add(o)
3643

44+
# HDT dictionary structure
45+
# Explicit annotations needed because set operations and sorted()
46+
# widen the element type to include Buffer (URIRef inherits str
47+
# which supports buffer protocol in Python 3.12+).
48+
shared: set[RDFTerm] = self._subjects & self._objects
49+
subject_only: set[RDFTerm] = self._subjects - shared
50+
object_only: set[RDFTerm] = self._objects - shared
51+
self._nb_shared = len(shared)
52+
53+
shared_sorted: list[RDFTerm] = sorted(shared, key=str) # type: ignore[assignment]
54+
subj_only_sorted: list[RDFTerm] = sorted(subject_only, key=str) # type: ignore[assignment]
55+
obj_only_sorted: list[RDFTerm] = sorted(object_only, key=str) # type: ignore[assignment]
56+
pred_sorted: list[RDFTerm] = sorted(self._predicates, key=str) # type: ignore[assignment]
57+
58+
# Subject ID space: shared (1..S), then subject-only (S+1..)
59+
self._subject_to_id: dict[RDFTerm, int] = {}
60+
self._id_to_subject: dict[int, RDFTerm] = {}
61+
for i, t in enumerate(shared_sorted, 1):
62+
self._subject_to_id[t] = i
63+
self._id_to_subject[i] = t
64+
for i, t in enumerate(subj_only_sorted, self._nb_shared + 1):
65+
self._subject_to_id[t] = i
66+
self._id_to_subject[i] = t
67+
68+
# Object ID space: shared (1..S), then object-only (S+1..)
69+
self._object_to_id: dict[RDFTerm, int] = {}
70+
self._id_to_object: dict[int, RDFTerm] = {}
71+
for i, t in enumerate(shared_sorted, 1):
72+
self._object_to_id[t] = i
73+
self._id_to_object[i] = t
74+
for i, t in enumerate(obj_only_sorted, self._nb_shared + 1):
75+
self._object_to_id[t] = i
76+
self._id_to_object[i] = t
77+
78+
# Predicate ID space: 1..P
79+
self._predicate_to_id: dict[RDFTerm, int] = {}
80+
self._id_to_predicate: dict[int, RDFTerm] = {}
81+
for i, t in enumerate(pred_sorted, 1):
82+
self._predicate_to_id[t] = i
83+
self._id_to_predicate[i] = t
84+
3785
def search(
3886
self, pattern: tuple[PatternTerm | None, PatternTerm | None, PatternTerm | None]
3987
) -> tuple[Iterator[Triple], int]:
@@ -57,6 +105,89 @@ def search(
57105
matches.append((s, p, o))
58106
return iter(matches), len(matches)
59107

108+
def search_ids(
    self,
    query: tuple[int | None, int | None, int | None],
    limit: int = 0,
    offset: int = 0,
) -> tuple[Iterator[TripleID], int]:
    """Search for triples matching the given ID pattern.

    Use 0 or None for wildcards.

    Args:
        query: (subject_id, predicate_id, object_id) pattern.
        limit: Maximum number of results to yield (0 = no limit).
        offset: Number of matching triples to skip before yielding.

    Returns:
        An iterator over matching ID triples and the total match count.
        The count reflects ALL matches, ignoring limit/offset, mirroring
        rdflib_hdt's cardinality semantics.
    """
    s_id = query[0] or 0
    p_id = query[1] or 0
    o_id = query[2] or 0

    # Convert non-zero IDs to terms for matching
    s_filter = self._id_to_subject.get(s_id) if s_id else None
    p_filter = self._id_to_predicate.get(p_id) if p_id else None
    o_filter = self._id_to_object.get(o_id) if o_id else None

    # Non-zero ID not found in dictionary → no matches
    if (
        (s_id and s_filter is None)
        or (p_id and p_filter is None)
        or (o_id and o_filter is None)
    ):
        return iter([]), 0

    matches: list[TripleID] = []
    for s, p, o in self._triples:
        if s_filter is not None and s != s_filter:
            continue
        if p_filter is not None and p != p_filter:
            continue
        if o_filter is not None and o != o_filter:
            continue
        matches.append(
            (
                self._subject_to_id[s],
                self._predicate_to_id[p],
                self._object_to_id[o],
            )
        )
    total = len(matches)
    # BUG FIX: limit/offset were previously accepted but silently ignored.
    # Apply them to the yielded window while still reporting the full
    # cardinality, matching rdflib_hdt.HDTDocument.search_ids behavior.
    # Defaults (limit=0, offset=0) preserve the old full-result behavior.
    window = matches[offset : offset + limit] if limit else matches[offset:]
    return iter(window), total
151+
152+
def term_to_id(self, term: RDFTerm, kind: int) -> int:
    """Convert an rdflib term to its HDT integer ID.

    Args:
        term: The rdflib term
        kind: 0=subject, 1=predicate, 2=object

    Returns:
        Integer ID, or 0 if not found
    """
    # Select the dictionary for the requested term position; an
    # unrecognized kind yields no dictionary and therefore ID 0.
    dictionaries = {
        0: self._subject_to_id,
        1: self._predicate_to_id,
        2: self._object_to_id,
    }
    mapping = dictionaries.get(kind)
    if mapping is None:
        return 0
    return mapping.get(term, 0)
169+
170+
def id_to_term(self, term_id: int, kind: int) -> RDFTerm:
    """Convert an HDT integer ID to its rdflib term.

    Args:
        term_id: The integer ID
        kind: 0=subject, 1=predicate, 2=object

    Raises:
        ValueError: If kind is not 0, 1, or 2.
        KeyError: If term_id is absent from the selected dictionary.
    """
    # Pick the reverse dictionary for the requested position, then index
    # into it; unknown kinds are rejected before any lookup happens.
    if kind == 0:
        mapping = self._id_to_subject
    elif kind == 1:
        mapping = self._id_to_predicate
    elif kind == 2:
        mapping = self._id_to_object
    else:
        msg = f"Invalid kind: {kind}"
        raise ValueError(msg)
    return mapping[term_id]
185+
186+
@property
def nb_shared(self) -> int:
    """Number of terms that occur as both subject and object."""
    return self._nb_shared
190+
60191
@property
61192
def total_triples(self) -> int:
62193
"""Get total number of triples."""

void_hdt/cli.py

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""Command-line interface for void-hdt."""
22

3+
import resource
34
import sys
45
from pathlib import Path
56

@@ -10,6 +11,14 @@
1011
from void_hdt.void_generator import VOIDGenerator
1112

1213

14+
def _get_rss_gb() -> float:
15+
"""Get peak RSS in GB."""
16+
usage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
17+
if sys.platform == "darwin":
18+
return usage / (1024**3) # macOS: bytes
19+
return usage / (1024**2) # Linux: KB
20+
21+
1322
@click.command()
1423
@click.argument("hdt_file", type=click.Path(exists=True, path_type=Path))
1524
@click.option(
@@ -30,7 +39,20 @@
3039
default=False,
3140
help="Use blank nodes for partition nodes instead of URI references",
3241
)
33-
def main(hdt_file: Path, output: Path, dataset_uri: str, use_blank_nodes: bool) -> None:
42+
@click.option(
43+
"--cache-size",
44+
type=int,
45+
default=2_000_000,
46+
show_default=True,
47+
help="Max entries in the type-lookup cache (trades memory for speed)",
48+
)
49+
def main(
50+
hdt_file: Path,
51+
output: Path,
52+
dataset_uri: str,
53+
use_blank_nodes: bool,
54+
cache_size: int,
55+
) -> None:
3456
"""Generate VOID vocabulary descriptions from HDT files.
3557
3658
Processes an HDT file to extract dataset statistics, class partitions,
@@ -57,8 +79,13 @@ def main(hdt_file: Path, output: Path, dataset_uri: str, use_blank_nodes: bool)
5779
click.echo(f" Distinct objects: {document.nb_objects}")
5880

5981
# Analyze class and property partitions (two passes through data)
82+
click.echo(f"Peak RSS before analysis: {_get_rss_gb():.1f} GB")
6083
click.echo("Analyzing class partitions...")
61-
analyzer.analyze(document)
84+
85+
def _progress(msg: str) -> None:
86+
click.echo(f"{msg} [RSS: {_get_rss_gb():.1f} GB]")
87+
88+
analyzer.analyze(document, cache_size=cache_size, progress_fn=_progress)
6289

6390
class_count = len(analyzer.class_partitions)
6491
click.echo(f" Found {class_count} classes")

0 commit comments

Comments
 (0)