77
88type RDFTerm = URIRef | Literal | BNode
99type Triple = tuple [RDFTerm , RDFTerm , RDFTerm ]
10+ type TripleID = tuple [int , int , int ]
1011type PatternTerm = URIRef | Literal
1112
1213
1314class MockHDTDocument :
1415 """Mock HDTDocument that works with in-memory RDF graphs.
1516
1617 This allows testing the VOID processing logic without actual HDT files.
17- Mimics the interface of rdflib_hdt.HDTDocument.
18+ Mimics the interface of rdflib_hdt.HDTDocument, including ID-based access.
19+
20+ ID assignment follows HDT dictionary structure:
21+ - Shared terms (both subject and object): IDs 1..S in both spaces
22+ - Subject-only terms: IDs S+1.. in subject space
23+ - Object-only terms: IDs S+1.. in object space
24+ - Predicates: separate ID space 1..P
1825 """
1926
2027 def __init__ (self , graph : Graph ) -> None :
@@ -34,6 +41,47 @@ def __init__(self, graph: Graph) -> None:
3441 self ._predicates .add (p )
3542 self ._objects .add (o )
3643
44+ # HDT dictionary structure
45+ # Explicit annotations needed because set operations and sorted()
46+ # widen the element type to include Buffer (URIRef inherits str
47+ # which supports buffer protocol in Python 3.12+).
48+ shared : set [RDFTerm ] = self ._subjects & self ._objects
49+ subject_only : set [RDFTerm ] = self ._subjects - shared
50+ object_only : set [RDFTerm ] = self ._objects - shared
51+ self ._nb_shared = len (shared )
52+
53+ shared_sorted : list [RDFTerm ] = sorted (shared , key = str ) # type: ignore[assignment]
54+ subj_only_sorted : list [RDFTerm ] = sorted (subject_only , key = str ) # type: ignore[assignment]
55+ obj_only_sorted : list [RDFTerm ] = sorted (object_only , key = str ) # type: ignore[assignment]
56+ pred_sorted : list [RDFTerm ] = sorted (self ._predicates , key = str ) # type: ignore[assignment]
57+
58+ # Subject ID space: shared (1..S), then subject-only (S+1..)
59+ self ._subject_to_id : dict [RDFTerm , int ] = {}
60+ self ._id_to_subject : dict [int , RDFTerm ] = {}
61+ for i , t in enumerate (shared_sorted , 1 ):
62+ self ._subject_to_id [t ] = i
63+ self ._id_to_subject [i ] = t
64+ for i , t in enumerate (subj_only_sorted , self ._nb_shared + 1 ):
65+ self ._subject_to_id [t ] = i
66+ self ._id_to_subject [i ] = t
67+
68+ # Object ID space: shared (1..S), then object-only (S+1..)
69+ self ._object_to_id : dict [RDFTerm , int ] = {}
70+ self ._id_to_object : dict [int , RDFTerm ] = {}
71+ for i , t in enumerate (shared_sorted , 1 ):
72+ self ._object_to_id [t ] = i
73+ self ._id_to_object [i ] = t
74+ for i , t in enumerate (obj_only_sorted , self ._nb_shared + 1 ):
75+ self ._object_to_id [t ] = i
76+ self ._id_to_object [i ] = t
77+
78+ # Predicate ID space: 1..P
79+ self ._predicate_to_id : dict [RDFTerm , int ] = {}
80+ self ._id_to_predicate : dict [int , RDFTerm ] = {}
81+ for i , t in enumerate (pred_sorted , 1 ):
82+ self ._predicate_to_id [t ] = i
83+ self ._id_to_predicate [i ] = t
84+
3785 def search (
3886 self , pattern : tuple [PatternTerm | None , PatternTerm | None , PatternTerm | None ]
3987 ) -> tuple [Iterator [Triple ], int ]:
@@ -57,6 +105,89 @@ def search(
57105 matches .append ((s , p , o ))
58106 return iter (matches ), len (matches )
59107
def search_ids(
    self,
    query: tuple[int | None, int | None, int | None],
    limit: int = 0,
    offset: int = 0,
) -> tuple[Iterator[TripleID], int]:
    """Search for triples matching the given ID pattern.

    Each position of ``query`` is an integer ID taken from the matching
    ID space (subject, predicate, object). Use ``0`` or ``None`` as a
    wildcard for a position.

    Args:
        query: ``(subject_id, predicate_id, object_id)`` pattern.
        limit: Maximum number of triples the iterator yields. ``0`` (the
            default) or a negative value means "no limit".
        offset: Number of leading matches to skip. Values below zero are
            treated as ``0``.

    Returns:
        A pair ``(iterator, cardinality)``. The iterator yields the
        matching ``(subject_id, predicate_id, object_id)`` tuples after
        ``offset`` and ``limit`` have been applied. ``cardinality`` is the
        total number of matches for the pattern and is not reduced by
        ``limit`` or ``offset``.
    """
    s_id = query[0] or 0
    p_id = query[1] or 0
    o_id = query[2] or 0

    # Resolve every non-wildcard ID to its term; wildcards stay None.
    s_filter = self._id_to_subject.get(s_id) if s_id else None
    p_filter = self._id_to_predicate.get(p_id) if p_id else None
    o_filter = self._id_to_object.get(o_id) if o_id else None

    # A non-zero ID that is absent from its dictionary cannot match anything.
    if (
        (s_id and s_filter is None)
        or (p_id and p_filter is None)
        or (o_id and o_filter is None)
    ):
        return iter([]), 0

    # Results follow the iteration order of self._triples.
    matches: list[TripleID] = [
        (
            self._subject_to_id[s],
            self._predicate_to_id[p],
            self._object_to_id[o],
        )
        for s, p, o in self._triples
        if (s_filter is None or s == s_filter)
        and (p_filter is None or p == p_filter)
        and (o_filter is None or o == o_filter)
    ]
    cardinality = len(matches)

    # Apply pagination to the iterator only; the count stays the full total
    # so callers relying on the default arguments see no change.
    start = max(offset, 0)
    stop = start + limit if limit > 0 else None
    return iter(matches[start:stop]), cardinality
151+
152+ def term_to_id (self , term : RDFTerm , kind : int ) -> int :
153+ """Convert an rdflib term to its HDT integer ID.
154+
155+ Args:
156+ term: The rdflib term
157+ kind: 0=subject, 1=predicate, 2=object
158+
159+ Returns:
160+ Integer ID, or 0 if not found
161+ """
162+ if kind == 0 :
163+ return self ._subject_to_id .get (term , 0 )
164+ if kind == 1 :
165+ return self ._predicate_to_id .get (term , 0 )
166+ if kind == 2 :
167+ return self ._object_to_id .get (term , 0 )
168+ return 0
169+
170+ def id_to_term (self , term_id : int , kind : int ) -> RDFTerm :
171+ """Convert an HDT integer ID to its rdflib term.
172+
173+ Args:
174+ term_id: The integer ID
175+ kind: 0=subject, 1=predicate, 2=object
176+ """
177+ if kind == 0 :
178+ return self ._id_to_subject [term_id ]
179+ if kind == 1 :
180+ return self ._id_to_predicate [term_id ]
181+ if kind == 2 :
182+ return self ._id_to_object [term_id ]
183+ msg = f"Invalid kind: { kind } "
184+ raise ValueError (msg )
185+
@property
def nb_shared(self) -> int:
    """Return the count of terms used as both a subject and an object."""
    return self._nb_shared
190+
60191 @property
61192 def total_triples (self ) -> int :
62193 """Get total number of triples."""
0 commit comments