29
29
30
30
CONTENT_ID = "content_id"
31
31
32
- CONTENT_COLUMNS = (
33
- "content_id, kind, text_content, attributes_blob, metadata_s, links_blob"
34
- )
32
+ CONTENT_COLUMNS = "content_id, kind, text_content, links_blob, metadata_blob"
35
33
36
34
SELECT_CQL_TEMPLATE = (
37
35
"SELECT {columns} FROM {table_name} {where_clause} {order_clause} {limit_clause};"
@@ -119,18 +117,12 @@ def _deserialize_links(json_blob: Optional[str]) -> Set[Link]:
119
117
120
118
121
119
def _row_to_node (row : Any ) -> Node :
122
- metadata_s = row .metadata_s
123
- if metadata_s is None :
124
- metadata_s = {}
125
- attributes_blob = row .attributes_blob
126
- attributes_dict = (
127
- _deserialize_metadata (attributes_blob ) if attributes_blob is not None else {}
128
- )
120
+ metadata = _deserialize_metadata (row .metadata_blob )
129
121
links = _deserialize_links (row .links_blob )
130
122
return Node (
131
123
id = row .content_id ,
132
124
text = row .text_content ,
133
- metadata = { ** attributes_dict , ** metadata_s } ,
125
+ metadata = metadata ,
134
126
links = links ,
135
127
)
136
128
@@ -198,7 +190,7 @@ def __init__(
198
190
f"""
199
191
INSERT INTO { keyspace } .{ node_table } (
200
192
content_id, kind, text_content, text_embedding, link_to_tags,
201
- link_from_tags, attributes_blob, metadata_s, links_blob
193
+ link_from_tags, links_blob, metadata_blob, metadata_s
202
194
) VALUES (?, '{ Kind .passage } ', ?, ?, ?, ?, ?, ?, ?)
203
195
""" # noqa: S608
204
196
)
@@ -265,9 +257,9 @@ def _apply_schema(self) -> None:
265
257
266
258
link_to_tags SET<TUPLE<TEXT, TEXT>>,
267
259
link_from_tags SET<TUPLE<TEXT, TEXT>>,
268
- attributes_blob TEXT,
269
- metadata_s MAP<TEXT,TEXT>,
270
260
links_blob TEXT,
261
+ metadata_blob TEXT,
262
+ metadata_s MAP<TEXT,TEXT>,
271
263
272
264
PRIMARY KEY (content_id)
273
265
)
@@ -287,36 +279,14 @@ def _apply_schema(self) -> None:
287
279
""" )
288
280
289
281
self ._session .execute (f"""
290
- CREATE CUSTOM INDEX IF NOT EXISTS { self ._node_table } _metadata_index
282
+ CREATE CUSTOM INDEX IF NOT EXISTS { self ._node_table } _metadata_s_index
291
283
ON { self .table_name ()} (ENTRIES(metadata_s))
292
284
USING 'StorageAttachedIndex';
293
285
""" )
294
286
295
287
def _concurrent_queries (self ) -> ConcurrentQueries :
296
288
return ConcurrentQueries (self ._session )
297
289
298
- def _parse_metadata (
299
- self , metadata : Dict [str , Any ], is_query : bool
300
- ) -> Tuple [str , Dict [str , str ]]:
301
- attributes_dict = {
302
- k : self ._coerce_string (v )
303
- for k , v in metadata .items ()
304
- if not _is_metadata_field_indexed (k , self ._metadata_indexing_policy )
305
- }
306
- if is_query and len (attributes_dict ) > 0 :
307
- raise ValueError ("Non-indexed metadata fields cannot be used in queries." )
308
- attributes_blob = _serialize_metadata (attributes_dict )
309
-
310
- metadata_indexed_dict = {
311
- k : v
312
- for k , v in metadata .items ()
313
- if _is_metadata_field_indexed (k , self ._metadata_indexing_policy )
314
- }
315
- metadata_s = {
316
- k : self ._coerce_string (v ) for k , v in metadata_indexed_dict .items ()
317
- }
318
- return (attributes_blob , metadata_s )
319
-
320
290
# TODO: Async (aadd_nodes)
321
291
def add_nodes (
322
292
self ,
@@ -352,10 +322,13 @@ def add_nodes(
352
322
if tag .direction in {"out" , "bidir" }:
353
323
link_to_tags .add ((tag .kind , tag .tag ))
354
324
355
- attributes_blob , metadata_s = self ._parse_metadata (
356
- metadata = metadata , is_query = False
357
- )
325
+ metadata_s = {
326
+ k : self ._coerce_string (v )
327
+ for k , v in metadata .items ()
328
+ if _is_metadata_field_indexed (k , self ._metadata_indexing_policy )
329
+ }
358
330
331
+ metadata_blob = _serialize_metadata (metadata )
359
332
links_blob = _serialize_links (links )
360
333
cq .execute (
361
334
self ._insert_passage ,
@@ -365,9 +338,9 @@ def add_nodes(
365
338
text_embedding ,
366
339
link_to_tags ,
367
340
link_from_tags ,
368
- attributes_blob ,
369
- metadata_s ,
370
341
links_blob ,
342
+ metadata_blob ,
343
+ metadata_s ,
371
344
),
372
345
)
373
346
@@ -413,7 +386,7 @@ def mmr_traversal_search(
413
386
adjacent_k : int = 10 ,
414
387
lambda_mult : float = 0.5 ,
415
388
score_threshold : float = float ("-inf" ),
416
- metadata : Dict [str , Any ] = {},
389
+ metadata_filter : Dict [str , Any ] = {},
417
390
) -> Iterable [Node ]:
418
391
"""Retrieve documents from this graph store using MMR-traversal.
419
392
@@ -439,7 +412,7 @@ def mmr_traversal_search(
439
412
diversity and 1 to minimum diversity. Defaults to 0.5.
440
413
score_threshold: Only documents with a score greater than or equal to
441
414
this threshold will be chosen. Defaults to -infinity.
442
- metadata : Optional metadata to filter the results.
415
+ metadata_filter : Optional metadata to filter the results.
443
416
"""
444
417
query_embedding = self ._embedding .embed_query (query )
445
418
helper = MmrHelper (
@@ -458,7 +431,7 @@ def fetch_initial_candidates() -> None:
458
431
query , params = self ._get_search_cql (
459
432
limit = fetch_k ,
460
433
columns = "content_id, text_embedding, link_to_tags" ,
461
- metadata = metadata ,
434
+ metadata = metadata_filter ,
462
435
embedding = query_embedding ,
463
436
)
464
437
@@ -539,7 +512,7 @@ def traversal_search(
539
512
* ,
540
513
k : int = 4 ,
541
514
depth : int = 1 ,
542
- metadata : Dict [str , Any ] = {},
515
+ metadata_filter : Dict [str , Any ] = {},
543
516
) -> Iterable [Node ]:
544
517
"""Retrieve documents from this knowledge store.
545
518
@@ -552,7 +525,7 @@ def traversal_search(
552
525
k: The number of Documents to return from the initial vector search.
553
526
Defaults to 4.
554
527
depth: The maximum depth of edges to traverse. Defaults to 1.
555
- metadata : Optional metadata to filter the results.
528
+ metadata_filter : Optional metadata to filter the results.
556
529
557
530
Returns:
558
531
Collection of retrieved documents.
@@ -639,8 +612,9 @@ def visit_targets(d: int, targets: Sequence[Any]) -> None:
639
612
640
613
query_embedding = self ._embedding .embed_query (query )
641
614
query , params = self ._get_search_cql (
615
+ columns = "content_id, link_to_tags" ,
642
616
limit = k ,
643
- metadata = metadata ,
617
+ metadata = metadata_filter ,
644
618
embedding = query_embedding ,
645
619
)
646
620
@@ -656,11 +630,11 @@ def similarity_search(
656
630
self ,
657
631
embedding : List [float ],
658
632
k : int = 4 ,
659
- metadata : Dict [str , Any ] = {},
633
+ metadata_filter : Dict [str , Any ] = {},
660
634
) -> Iterable [Node ]:
661
635
"""Retrieve nodes similar to the given embedding, optionally filtered by metadata.""" # noqa: E501
662
636
query , params = self ._get_search_cql (
663
- embedding = embedding , limit = k , metadata = metadata
637
+ embedding = embedding , limit = k , metadata = metadata_filter
664
638
)
665
639
666
640
for row in self ._session .execute (query , params ):
@@ -813,17 +787,20 @@ def _coerce_string(value: Any) -> str:
813
787
def _extract_where_clause_blocks (
814
788
self , metadata : Dict [str , Any ]
815
789
) -> Tuple [str , List [Any ]]:
816
- _ , metadata_s = self ._parse_metadata (metadata = metadata , is_query = True )
817
-
818
- if len (metadata_s ) == 0 :
819
- return "" , []
820
-
821
790
wc_blocks : List [str ] = []
822
791
vals_list : List [Any ] = []
823
792
824
- for k , v in sorted (metadata_s .items ()):
825
- wc_blocks .append (f"metadata_s['{ k } '] = ?" )
826
- vals_list .append (v )
793
+ for key , value in sorted (metadata .items ()):
794
+ if _is_metadata_field_indexed (key , self ._metadata_indexing_policy ):
795
+ wc_blocks .append (f"metadata_s['{ key } '] = ?" )
796
+ vals_list .append (self ._coerce_string (value = value ))
797
+ else :
798
+ raise ValueError (
799
+ "Non-indexed metadata fields cannot be used in queries."
800
+ )
801
+
802
+ if len (wc_blocks ) == 0 :
803
+ return "" , []
827
804
828
805
where_clause = "WHERE " + " AND " .join (wc_blocks )
829
806
return where_clause , vals_list
0 commit comments