Skip to content

Commit 83d7f9a

Browse files
murathany7 and vgvoleg authored
Add support for batch document insertion to YDB (#7)
* Add batch document insertion functionality
* Update test_vectorestore.py
* Default size changed and using more efficient method
* Update test_vectorestore.py
* Update vectorstores.py

---------

Co-authored-by: Oleg Ovcharuk <[email protected]>
1 parent cad6536 commit 83d7f9a

File tree

3 files changed

+152
-23
lines changed

3 files changed

+152
-23
lines changed

docker/docker-compose.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
version: "3.3"
22
services:
33
ydb:
4-
image: ydbplatform/local-ydb:trunk
4+
image: ydbplatform/local-ydb:24.3.13.12
55
restart: always
66
ports:
77
- "2136:2136"

langchain_ydb/vectorstores.py

Lines changed: 58 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -218,10 +218,11 @@ def _escape_str(self, text: str) -> str:
218218

219219
def _prepare_insert_query(self) -> str:
220220
return f"""
221-
DECLARE $id AS Utf8;
222-
DECLARE $document as Utf8;
223-
DECLARE $embedding as List<Float>;
224-
DECLARE $metadata as Json;
221+
DECLARE $documents AS List<Struct<
222+
id: Utf8,
223+
document: Utf8,
224+
embedding: List<Float>,
225+
metadata: Json>>;
225226
226227
UPSERT INTO `{self.config.table}`
227228
(
@@ -230,13 +231,12 @@ def _prepare_insert_query(self) -> str:
230231
{self.config.column_map["embedding"]},
231232
{self.config.column_map["metadata"]}
232233
)
233-
VALUES
234-
(
235-
$id,
236-
$document,
237-
Untag(Knn::ToBinaryStringFloat($embedding), "FloatVector"),
238-
$metadata
239-
);
234+
SELECT
235+
id,
236+
document,
237+
Untag(Knn::ToBinaryStringFloat(embedding), "FloatVector"),
238+
metadata
239+
FROM AS_TABLE($documents);
240240
"""
241241

242242
def _prepare_search_query(
@@ -285,6 +285,7 @@ def add_texts(
285285
metadatas: Optional[list[dict]] = None,
286286
*,
287287
ids: Optional[list[str]] = None,
288+
batch_size: int = 32,
288289
**kwargs: Any,
289290
) -> list[str]:
290291
"""Run more texts through the embeddings and add to the vectorstore.
@@ -293,6 +294,7 @@ def add_texts(
293294
texts: Iterable of strings to add to the vectorstore.
294295
metadatas: Optional list of metadatas associated with the texts.
295296
ids: Optional list of IDs associated with the texts.
297+
batch_size: Number of texts to process in a single batch. Defaults to 32.
296298
**kwargs: vectorstore specific parameters.
297299
One of the kwargs should be `ids` which is a list of ids
298300
associated with the texts.
@@ -315,20 +317,52 @@ def add_texts(
315317
metadatas = metadatas if metadatas else [{} for _ in range(len(texts_))]
316318

317319
ydb = self._ydb_lib
318-
319-
for id, text, metadata in self.pgbar(
320-
zip(ids, texts, metadatas),
321-
desc="Inserting data...",
322-
total=len(ids),
320+
321+
# Define struct type with proper member fields
322+
document_struct_type = ydb.StructType()
323+
document_struct_type.add_member('id', ydb.PrimitiveType.Utf8)
324+
document_struct_type.add_member('document', ydb.PrimitiveType.Utf8)
325+
document_struct_type.add_member(
326+
'embedding',
327+
ydb.ListType(ydb.PrimitiveType.Float)
328+
)
329+
document_struct_type.add_member('metadata', ydb.PrimitiveType.Json)
330+
331+
# Process in batches
332+
batch_ranges = range(0, len(texts_), batch_size)
333+
for i in self.pgbar(
334+
batch_ranges,
335+
desc="Processing batches...",
336+
total=len(batch_ranges)
323337
):
324-
embedding = self.embedding_function.embed_query(text)
338+
batch_texts = texts_[i:i+batch_size]
339+
batch_ids = ids[i:i+batch_size]
340+
batch_metadatas = metadatas[i:i+batch_size]
341+
342+
# Generate embeddings for the batch
343+
embeddings = self.embedding_function.embed_documents(
344+
batch_texts, # type: ignore
345+
)
346+
347+
# Create a list of document structs
348+
documents = []
349+
for doc_id, doc_text, doc_embedding, doc_metadata in zip(
350+
batch_ids, batch_texts, embeddings, batch_metadatas
351+
):
352+
# Use dictionary format for struct values - YDB will convert them
353+
document = {
354+
'id': doc_id,
355+
'document': doc_text,
356+
'embedding': doc_embedding,
357+
'metadata': json.dumps(doc_metadata)
358+
}
359+
documents.append(document)
360+
361+
# Execute the batch insert
325362
self._execute_query(
326363
self._insert_query,
327364
{
328-
"$id": id,
329-
"$document": text,
330-
"$embedding": (embedding, ydb.ListType(ydb.PrimitiveType.Float)),
331-
"$metadata": (json.dumps(metadata), ydb.PrimitiveType.Json),
365+
"$documents": (documents, ydb.ListType(document_struct_type))
332366
},
333367
)
334368

@@ -343,6 +377,7 @@ def from_texts(
343377
*,
344378
config: Optional[YDBSettings] = None,
345379
ids: Optional[list[str]] = None,
380+
batch_size: int = 32,
346381
**kwargs: Any,
347382
) -> YDB:
348383
"""Return YDB VectorStore initialized from texts and embeddings.
@@ -353,13 +388,14 @@ def from_texts(
353388
metadatas: Optional list of metadatas associated with the texts.
354389
Default is None.
355390
ids: Optional list of IDs associated with the texts.
391+
batch_size: Number of texts to process in a single batch. Defaults to 32.
356392
kwargs: Additional keyword arguments.
357393
358394
Returns:
359395
VectorStore: VectorStore initialized from texts and embeddings.
360396
"""
361397
vs = cls(embedding, config, **kwargs)
362-
vs.add_texts(texts=texts, metadatas=metadatas, ids=ids)
398+
vs.add_texts(texts=texts, metadatas=metadatas, ids=ids, batch_size=batch_size)
363399
return vs
364400

365401
def delete(self, ids: Optional[list[str]] = None, **kwargs: Any) -> Optional[bool]:

tests/test_vectorestore.py

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -328,3 +328,96 @@ def test_search_from_retriever_interface_with_filter() -> None:
328328
assert output == [Document(page_content="bar", metadata={"page": "1"})]
329329

330330
docsearch.drop()
331+
332+
333+
@pytest.mark.parametrize("n", [10, 50, 100])
334+
def test_batch_insertion(n: int) -> None:
335+
"""Test batch insertion with different document counts."""
336+
# Create documents
337+
texts = [f"text_{i}" for i in range(n)]
338+
metadatas = [{"index": str(i)} for i in range(n)]
339+
340+
# Create vectorstore
341+
config = YDBSettings(drop_existing_table=True)
342+
config.table = f"test_ydb_batch_{n}"
343+
docsearch = YDB.from_texts(
344+
texts=texts,
345+
embedding=ConsistentFakeEmbeddings(),
346+
config=config,
347+
metadatas=metadatas,
348+
)
349+
350+
# Verify total count matches expected
351+
all_results = docsearch.similarity_search("text", k=n + 1)
352+
assert len(all_results) == n
353+
354+
# Clean up
355+
docsearch.drop()
356+
357+
@pytest.mark.parametrize("n,batch_size", [(25, None), (50, 10), (100, 50)])
358+
def test_batch_insertion_with_add_texts(n: int, batch_size: int) -> None:
359+
"""Test add_texts with different document counts and batch sizes."""
360+
# Setup
361+
config = YDBSettings(drop_existing_table=True)
362+
config.table = f"test_ydb_add_texts_batch_{n}_{batch_size}"
363+
docsearch = YDB(
364+
embedding=ConsistentFakeEmbeddings(),
365+
config=config,
366+
)
367+
368+
# Create test data
369+
texts = [f"text_{i}" for i in range(n)]
370+
metadatas = [{"index": str(i)} for i in range(n)]
371+
372+
# Mock the embedding and execute functions to verify batch behavior
373+
with pytest.MonkeyPatch.context() as mp:
374+
# Track batches
375+
processed_batches = []
376+
377+
# Mock embedding function to track batch sizes
378+
def mock_embed_documents(texts):
379+
processed_batches.append(len(texts))
380+
# Return fake embeddings of appropriate length
381+
return [[0.1] * 5 for _ in range(len(texts))]
382+
383+
# Mock execute query to avoid actual database operations
384+
def mock_execute_query(query, params=None, ddl=False):
385+
return None
386+
387+
# Apply mocks
388+
mp.setattr(
389+
docsearch.embedding_function, "embed_documents", mock_embed_documents
390+
)
391+
mp.setattr(docsearch, "_execute_query", mock_execute_query)
392+
393+
# Execute add_texts with specified batch size
394+
kwargs = {}
395+
if batch_size is not None:
396+
kwargs["batch_size"] = batch_size
397+
398+
ids = docsearch.add_texts(
399+
texts=texts,
400+
metadatas=metadatas,
401+
**kwargs
402+
)
403+
404+
# Verify results
405+
assert len(ids) == n # Correct number of IDs returned
406+
407+
# Verify correct batch sizes were used
408+
expected_batch_size = batch_size if batch_size is not None else 32
409+
expected_num_batches = (n + expected_batch_size - 1) // expected_batch_size
410+
411+
assert len(processed_batches) == expected_num_batches
412+
413+
# Verify all texts were processed in total
414+
assert sum(processed_batches) == n
415+
416+
# Verify most batches are of the expected size (except possibly the last one)
417+
for i, batch_size in enumerate(processed_batches):
418+
if i < len(processed_batches) - 1:
419+
# All but the last batch should be full
420+
assert batch_size == expected_batch_size
421+
else:
422+
# Last batch can be smaller
423+
assert batch_size <= expected_batch_size

0 commit comments

Comments (0)