@@ -380,7 +380,6 @@ def _prepare_inputs(
380
380
def _handle_duplicates (
381
381
self ,
382
382
nodes : Sequence [BaseNode ],
383
- store_doc_text : bool = True ,
384
383
) -> Sequence [BaseNode ]:
385
384
"""Handle docstore duplicates by checking all hashes."""
386
385
assert self .docstore is not None
@@ -394,14 +393,11 @@ def _handle_duplicates(
394
393
nodes_to_run .append (node )
395
394
current_hashes .append (node .hash )
396
395
397
- self .docstore .add_documents (nodes_to_run , store_text = store_doc_text )
398
-
399
396
return nodes_to_run
400
397
401
398
def _handle_upserts (
402
399
self ,
403
400
nodes : Sequence [BaseNode ],
404
- store_doc_text : bool = True ,
405
401
) -> Sequence [BaseNode ]:
406
402
"""Handle docstore upserts by checking hashes and ids."""
407
403
assert self .docstore is not None
@@ -437,11 +433,7 @@ def _handle_upserts(
437
433
if self .vector_store is not None :
438
434
self .vector_store .delete (ref_doc_id )
439
435
440
- nodes_to_run = list (deduped_nodes_to_run .values ())
441
- self .docstore .set_document_hashes ({n .id_ : n .hash for n in nodes_to_run })
442
- self .docstore .add_documents (nodes_to_run , store_text = store_doc_text )
443
-
444
- return nodes_to_run
436
+ return list (deduped_nodes_to_run .values ())
445
437
446
438
@staticmethod
447
439
def _node_batcher (
@@ -452,6 +444,23 @@ def _node_batcher(
452
444
for i in range (0 , len (nodes ), batch_size ):
453
445
yield nodes [i : i + batch_size ]
454
446
447
+ def _update_docstore (
448
+ self , nodes : Sequence [BaseNode ], store_doc_text : bool = True
449
+ ) -> None :
450
+ """Update the document store with the given nodes."""
451
+ assert self .docstore is not None
452
+
453
+ if self .docstore_strategy in (
454
+ DocstoreStrategy .UPSERTS ,
455
+ DocstoreStrategy .UPSERTS_AND_DELETE ,
456
+ ):
457
+ self .docstore .set_document_hashes ({n .id_ : n .hash for n in nodes })
458
+ self .docstore .add_documents (nodes , store_text = store_doc_text )
459
+ elif self .docstore_strategy == DocstoreStrategy .DUPLICATES_ONLY :
460
+ self .docstore .add_documents (nodes , store_text = store_doc_text )
461
+ else :
462
+ raise ValueError (f"Invalid docstore strategy: { self .docstore_strategy } " )
463
+
455
464
@dispatcher .span
456
465
def run (
457
466
self ,
@@ -493,13 +502,9 @@ def run(
493
502
DocstoreStrategy .UPSERTS ,
494
503
DocstoreStrategy .UPSERTS_AND_DELETE ,
495
504
):
496
- nodes_to_run = self ._handle_upserts (
497
- input_nodes , store_doc_text = store_doc_text
498
- )
505
+ nodes_to_run = self ._handle_upserts (input_nodes )
499
506
elif self .docstore_strategy == DocstoreStrategy .DUPLICATES_ONLY :
500
- nodes_to_run = self ._handle_duplicates (
501
- input_nodes , store_doc_text = store_doc_text
502
- )
507
+ nodes_to_run = self ._handle_duplicates (input_nodes )
503
508
else :
504
509
raise ValueError (f"Invalid docstore strategy: { self .docstore_strategy } " )
505
510
elif self .docstore is not None and self .vector_store is None :
@@ -515,10 +520,7 @@ def run(
515
520
"Switching to duplicates_only strategy."
516
521
)
517
522
self .docstore_strategy = DocstoreStrategy .DUPLICATES_ONLY
518
- nodes_to_run = self ._handle_duplicates (
519
- input_nodes , store_doc_text = store_doc_text
520
- )
521
-
523
+ nodes_to_run = self ._handle_duplicates (input_nodes )
522
524
else :
523
525
nodes_to_run = input_nodes
524
526
@@ -564,6 +566,9 @@ def run(
564
566
if nodes_with_embeddings :
565
567
self .vector_store .add (nodes_with_embeddings )
566
568
569
+ if self .docstore is not None :
570
+ self ._update_docstore (nodes_to_run , store_doc_text = store_doc_text )
571
+
567
572
return nodes
568
573
569
574
# ------ async methods ------
0 commit comments