Skip to content

Commit 1692239

Browse files
Update IngestionPipeline async document store insertion (#19868)
1 parent 24f9420 commit 1692239

File tree

2 files changed

+197
-7
lines changed

2 files changed

+197
-7
lines changed

llama-index-core/llama_index/core/ingestion/pipeline.py

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -572,6 +572,23 @@ def run(
572572
return nodes
573573

574574
# ------ async methods ------
575+
async def _aupdate_docstore(
    self, nodes: Sequence[BaseNode], store_doc_text: bool = True
) -> None:
    """Persist the given nodes to the attached document store (async).

    Args:
        nodes: Nodes to insert into the docstore.
        store_doc_text: Whether to persist the node text alongside metadata.

    Raises:
        ValueError: If ``self.docstore_strategy`` is not a recognized strategy.
    """
    assert self.docstore is not None

    strategy = self.docstore_strategy
    if strategy == DocstoreStrategy.DUPLICATES_ONLY:
        # Duplicates-only: just store the documents, no hash bookkeeping.
        await self.docstore.async_add_documents(nodes, store_text=store_doc_text)
    elif strategy in (
        DocstoreStrategy.UPSERTS,
        DocstoreStrategy.UPSERTS_AND_DELETE,
    ):
        # Upsert strategies record content hashes so later runs can detect
        # changed documents, then store the documents themselves.
        hashes = {node.id_: node.hash for node in nodes}
        await self.docstore.aset_document_hashes(hashes)
        await self.docstore.async_add_documents(nodes, store_text=store_doc_text)
    else:
        raise ValueError(f"Invalid docstore strategy: {self.docstore_strategy}")
591+
575592
async def _ahandle_duplicates(
576593
self,
577594
nodes: Sequence[BaseNode],
@@ -589,8 +606,6 @@ async def _ahandle_duplicates(
589606
nodes_to_run.append(node)
590607
current_hashes.append(node.hash)
591608

592-
await self.docstore.async_add_documents(nodes_to_run, store_text=store_doc_text)
593-
594609
return nodes_to_run
595610

596611
async def _ahandle_upserts(
@@ -632,11 +647,7 @@ async def _ahandle_upserts(
632647
if self.vector_store is not None:
633648
await self.vector_store.adelete(ref_doc_id)
634649

635-
nodes_to_run = list(deduped_nodes_to_run.values())
636-
await self.docstore.async_add_documents(nodes_to_run, store_text=store_doc_text)
637-
await self.docstore.aset_document_hashes({n.id_: n.hash for n in nodes_to_run})
638-
639-
return nodes_to_run
650+
return list(deduped_nodes_to_run.values())
640651

641652
@dispatcher.span
642653
async def arun(
@@ -757,4 +768,7 @@ async def arun(
757768
if nodes_with_embeddings:
758769
await self.vector_store.async_add(nodes_with_embeddings)
759770

771+
if self.docstore is not None:
772+
await self._aupdate_docstore(nodes_to_run, store_doc_text=store_doc_text)
773+
760774
return nodes

llama-index-core/tests/ingestion/test_pipeline.py

Lines changed: 176 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -286,3 +286,179 @@ def __call__(
286286
pipeline.run(documents=[document1])
287287

288288
assert pipeline.docstore.get_node("1", raise_error=False) is None
289+
290+
291+
@pytest.mark.asyncio
async def test_arun_pipeline() -> None:
    """The async pipeline runs readers, documents and transforms end to end."""
    reader_config = ReaderConfig(
        reader=StringIterableReader(),
        reader_kwargs={"texts": ["This is a test."]},
    )
    pipeline = IngestionPipeline(
        readers=[reader_config],
        documents=[Document.example()],
        transformations=[
            SentenceSplitter(),
            KeywordExtractor(llm=MockLLM()),
        ],
    )

    nodes = await pipeline.arun()

    # One node per input document, each enriched with keyword metadata.
    assert len(nodes) == 2
    assert len(nodes[0].metadata) > 0
311+
312+
313+
@pytest.mark.asyncio
async def test_arun_pipeline_with_ref_doc_id() -> None:
    """Nodes produced by the async pipeline keep a reference to their source doc.

    Fix: added the ``-> None`` return annotation for consistency with every
    other test in this module.
    """
    documents = [
        Document(text="one", doc_id="1"),
    ]
    pipeline = IngestionPipeline(
        documents=documents,
        transformations=[
            MarkdownElementNodeParser(),
            SentenceSplitter(),
            MockEmbedding(embed_dim=8),
        ],
    )

    nodes = await pipeline.arun()

    assert len(nodes) == 1
    # The produced node must point back at the ingested document's id.
    assert nodes[0].ref_doc_id == "1"
331+
332+
333+
@pytest.mark.asyncio
async def test_async_pipeline_update_text_content() -> None:
    """Re-ingesting a document with changed text replaces its docstore entry."""
    doc = Document.example()
    doc.id_ = "1"

    pipeline = IngestionPipeline(
        transformations=[
            SentenceSplitter(chunk_size=25, chunk_overlap=0),
        ],
        docstore=SimpleDocumentStore(),
    )

    nodes = await pipeline.arun(documents=[doc])
    assert len(nodes) == 19
    assert pipeline.docstore is not None
    assert len(pipeline.docstore.docs) == 1

    # Same doc id, new content.
    updated = Document(text="test", doc_id="1")

    # Second (sync) run: the pipeline must upsert, not duplicate.
    nodes = pipeline.run(documents=[updated])

    assert len(nodes) == 1
    assert pipeline.docstore is not None
    assert len(pipeline.docstore.docs) == 1
    assert next(iter(pipeline.docstore.docs.values())).text == "test"  # type: ignore
360+
361+
362+
@pytest.mark.asyncio
async def test_async_pipeline_update_metadata() -> None:
    """Test that IngestionPipeline updates document metadata, if it changed."""
    old_metadata = {"filename": "README.md", "category": "codebase"}
    doc = Document.example()
    doc.metadata = old_metadata
    doc.id_ = "1"

    pipeline = IngestionPipeline(
        transformations=[
            SentenceSplitter(chunk_size=25, chunk_overlap=0),
        ],
        docstore=SimpleDocumentStore(),
    )

    nodes = await pipeline.arun(documents=[doc])
    assert len(nodes) >= 1
    assert pipeline.docstore is not None
    assert len(pipeline.docstore.docs) == 1
    assert all(node.metadata == old_metadata for node in nodes)

    # Change only the metadata; the doc id stays the same.
    new_metadata = {"filename": "README.md", "category": "documentation"}
    doc.metadata = new_metadata

    # Second (sync) run: the upsert must refresh the stored metadata.
    nodes_new = pipeline.run(documents=[doc])

    assert len(nodes_new) == len(nodes)
    assert pipeline.docstore is not None
    assert len(pipeline.docstore.docs) == 1
    assert next(iter(pipeline.docstore.docs.values())).metadata == new_metadata  # type: ignore
    assert all(node.metadata == new_metadata for node in nodes_new)
397+
398+
399+
@pytest.mark.asyncio
async def test_async_pipeline_dedup_duplicates_only() -> None:
    """A second async run over identical documents yields no new nodes."""
    documents = [
        Document(text=text, doc_id=doc_id)
        for text, doc_id in (("one", "1"), ("two", "2"), ("three", "3"))
    ]

    pipeline = IngestionPipeline(
        transformations=[
            SentenceSplitter(chunk_size=25, chunk_overlap=0),
        ],
        docstore=SimpleDocumentStore(),
    )

    # First pass ingests everything.
    first = await pipeline.arun(documents=documents)
    assert len(first) == 3

    # Second pass: every input is a known duplicate, so nothing is produced.
    second = await pipeline.arun(documents=documents)
    assert len(second) == 0
419+
420+
421+
@pytest.mark.asyncio
async def test_async_pipeline_parallel() -> None:
    """Async run with ``num_workers > 1`` still ingests every document.

    Fix: this coroutine test was missing the ``@pytest.mark.asyncio``
    decorator that every sibling async test carries — under pytest-asyncio's
    strict mode the test would never be executed (collected as a plain
    function returning an un-awaited coroutine).
    """
    document1 = Document.example()
    document1.id_ = "1"
    document2 = Document(text="One\n\n\nTwo\n\n\nThree.", doc_id="2")

    pipeline = IngestionPipeline(
        transformations=[
            SentenceSplitter(chunk_size=25, chunk_overlap=0),
        ],
        docstore=SimpleDocumentStore(),
    )

    # Cap the worker count so the test is cheap on single-core runners.
    num_workers = min(2, cpu_count())
    nodes = await pipeline.arun(
        documents=[document1, document2], num_workers=num_workers
    )
    assert len(nodes) == 20
    assert pipeline.docstore is not None
    assert len(pipeline.docstore.docs) == 2
440+
441+
442+
@pytest.mark.asyncio
async def test_async_pipeline_with_transform_error() -> None:
    """A failing transform propagates and leaves the docstore untouched."""

    class RaisingTransform(TransformComponent):
        # Always blows up, simulating a broken transformation stage.
        def __call__(
            self, nodes: Sequence[BaseNode], **kwargs: Any
        ) -> Sequence[BaseNode]:
            raise RuntimeError

    document1 = Document.example()
    document1.id_ = "1"

    pipeline = IngestionPipeline(
        transformations=[
            SentenceSplitter(chunk_size=25, chunk_overlap=0),
            RaisingTransform(),
        ],
        docstore=SimpleDocumentStore(),
    )

    with pytest.raises(RuntimeError):
        await pipeline.arun(documents=[document1])

    # The failed run must not have persisted the document.
    assert pipeline.docstore.get_node("1", raise_error=False) is None

0 commit comments

Comments
 (0)