Remove the `overwrite` option from vector stores; `load_documents` now always overwrites existing data

Gaudy Blanco · Gaudy Blanco · commit 37a34cf3a119 · 2025-10-11T00:41:58.000-06:00
diff --git a/graphrag/config/models/vector_store_config.py b/graphrag/config/models/vector_store_config.py
@@ -82,11 +82,6 @@ def _validate_url(self) -> None:
         default=vector_store_defaults.database_name,
     )
 
-    overwrite: bool = Field(
-        description="Overwrite the existing data.",
-        default=vector_store_defaults.overwrite,
-    )
-
     embeddings_schema: dict[str, VectorStoreSchemaConfig] = {}
 
     def _validate_embeddings_schema(self) -> None:
diff --git a/graphrag/index/operations/embed_text/embed_text.py b/graphrag/index/operations/embed_text/embed_text.py
@@ -40,17 +40,14 @@ async def embed_text(
         vector_store: BaseVectorStore = _create_vector_store(
             vector_store_config, index_name, embedding_name
         )
-        vector_store_workflow_config = vector_store_config.get(
-            embedding_name, vector_store_config
-        )
+
         return await _text_embed_with_vector_store(
             input=input,
             callbacks=callbacks,
             model=model,
             tokenizer=tokenizer,
             embed_column=embed_column,
             vector_store=vector_store,
-            vector_store_config=vector_store_workflow_config,
             batch_size=batch_size,
             batch_max_tokens=batch_max_tokens,
             num_threads=num_threads,
@@ -95,17 +92,13 @@ async def _text_embed_with_vector_store(
     tokenizer: Tokenizer,
     embed_column: str,
     vector_store: BaseVectorStore,
-    vector_store_config: dict,
     batch_size: int,
     batch_max_tokens: int,
     num_threads: int,
     id_column: str,
     title_column: str | None = None,
 ):
     # Get vector-storage configuration
-
-    overwrite: bool = vector_store_config.get("overwrite", True)
-
     if embed_column not in input.columns:
         msg = f"Column {embed_column} not found in input dataframe with columns {input.columns}"
         raise ValueError(msg)
@@ -168,7 +161,7 @@ async def _text_embed_with_vector_store(
             )
             documents.append(document)
 
-        vector_store.load_documents(documents, overwrite and i == 0)
+        vector_store.load_documents(documents)
         starting_index += len(documents)
         i += 1
 
diff --git a/graphrag/vector_stores/azure_ai_search.py b/graphrag/vector_stores/azure_ai_search.py
@@ -74,57 +74,54 @@ def connect(self, **kwargs: Any) -> Any:
             not_supported_error = "Azure AI Search expects `url`."
             raise ValueError(not_supported_error)
 
-    def load_documents(
-        self, documents: list[VectorStoreDocument], overwrite: bool = True
-    ) -> None:
+    def load_documents(self, documents: list[VectorStoreDocument]) -> None:
         """Load documents into an Azure AI Search index."""
-        if overwrite:
-            if (
-                self.index_name is not None
-                and self.index_name in self.index_client.list_index_names()
-            ):
-                self.index_client.delete_index(self.index_name)
-
-            # Configure vector search profile
-            vector_search = VectorSearch(
-                algorithms=[
-                    HnswAlgorithmConfiguration(
-                        name="HnswAlg",
-                        parameters=HnswParameters(
-                            metric=VectorSearchAlgorithmMetric.COSINE
-                        ),
-                    )
-                ],
-                profiles=[
-                    VectorSearchProfile(
-                        name=self.vector_search_profile_name,
-                        algorithm_configuration_name="HnswAlg",
-                    )
-                ],
-            )
-            # Configure the index
-            index = SearchIndex(
-                name=self.index_name if self.index_name else "",
-                fields=[
-                    SimpleField(
-                        name=self.id_field,
-                        type=SearchFieldDataType.String,
-                        key=True,
-                    ),
-                    SearchField(
-                        name=self.vector_field,
-                        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
-                        searchable=True,
-                        hidden=False,  # DRIFT needs to return the vector for client-side similarity
-                        vector_search_dimensions=self.vector_size,
-                        vector_search_profile_name=self.vector_search_profile_name,
+        if (
+            self.index_name is not None
+            and self.index_name in self.index_client.list_index_names()
+        ):
+            self.index_client.delete_index(self.index_name)
+
+        # Configure vector search profile
+        vector_search = VectorSearch(
+            algorithms=[
+                HnswAlgorithmConfiguration(
+                    name="HnswAlg",
+                    parameters=HnswParameters(
+                        metric=VectorSearchAlgorithmMetric.COSINE
                     ),
-                ],
-                vector_search=vector_search,
-            )
-            self.index_client.create_or_update_index(
-                index,
-            )
+                )
+            ],
+            profiles=[
+                VectorSearchProfile(
+                    name=self.vector_search_profile_name,
+                    algorithm_configuration_name="HnswAlg",
+                )
+            ],
+        )
+        # Configure the index
+        index = SearchIndex(
+            name=self.index_name if self.index_name else "",
+            fields=[
+                SimpleField(
+                    name=self.id_field,
+                    type=SearchFieldDataType.String,
+                    key=True,
+                ),
+                SearchField(
+                    name=self.vector_field,
+                    type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
+                    searchable=True,
+                    hidden=False,  # DRIFT needs to return the vector for client-side similarity
+                    vector_search_dimensions=self.vector_size,
+                    vector_search_profile_name=self.vector_search_profile_name,
+                ),
+            ],
+            vector_search=vector_search,
+        )
+        self.index_client.create_or_update_index(
+            index,
+        )
 
         batch = [
             {
diff --git a/graphrag/vector_stores/base.py b/graphrag/vector_stores/base.py
@@ -58,9 +58,7 @@ def connect(self, **kwargs: Any) -> None:
         """Connect to vector storage."""
 
     @abstractmethod
-    def load_documents(
-        self, documents: list[VectorStoreDocument], overwrite: bool = True
-    ) -> None:
+    def load_documents(self, documents: list[VectorStoreDocument]) -> None:
         """Load documents into the vector-store."""
 
     @abstractmethod
diff --git a/graphrag/vector_stores/cosmosdb.py b/graphrag/vector_stores/cosmosdb.py
@@ -149,14 +149,11 @@ def _container_exists(self) -> bool:
         ]
         return self._container_name in existing_container_names
 
-    def load_documents(
-        self, documents: list[VectorStoreDocument], overwrite: bool = True
-    ) -> None:
+    def load_documents(self, documents: list[VectorStoreDocument]) -> None:
         """Load documents into CosmosDB."""
         # Create a CosmosDB container on overwrite
-        if overwrite:
-            self._delete_container()
-            self._create_container()
+        self._delete_container()
+        self._create_container()
 
         if self._container_client is None:
             msg = "Container client is not initialized."
diff --git a/graphrag/vector_stores/lancedb.py b/graphrag/vector_stores/lancedb.py
@@ -35,9 +35,7 @@ def connect(self, **kwargs: Any) -> Any:
         if self.index_name and self.index_name in self.db_connection.table_names():
             self.document_collection = self.db_connection.open_table(self.index_name)
 
-    def load_documents(
-        self, documents: list[VectorStoreDocument], overwrite: bool = True
-    ) -> None:
+    def load_documents(self, documents: list[VectorStoreDocument]) -> None:
         """Load documents into vector storage."""
         # Step 1: Prepare data columns manually
         ids = []
@@ -71,28 +69,20 @@ def load_documents(
         # NOTE: If modifying the next section of code, ensure that the schema remains the same.
         #       The pyarrow format of the 'vector' field may change if the order of operations is changed
         #       and will break vector search.
-        if overwrite:
-            if data:
-                self.document_collection = self.db_connection.create_table(
-                    self.index_name if self.index_name else "",
-                    data=data,
-                    mode="overwrite",
-                    schema=data.schema,
-                )
-            else:
-                self.document_collection = self.db_connection.create_table(
-                    self.index_name if self.index_name else "", mode="overwrite"
-                )
-            self.document_collection.create_index(
-                vector_column_name=self.vector_field, index_type="IVF_FLAT"
+        if data:
+            self.document_collection = self.db_connection.create_table(
+                self.index_name if self.index_name else "",
+                data=data,
+                mode="overwrite",
+                schema=data.schema,
             )
         else:
-            # add data to existing table
-            self.document_collection = self.db_connection.open_table(
-                self.index_name if self.index_name else ""
+            self.document_collection = self.db_connection.create_table(
+                self.index_name if self.index_name else "", mode="overwrite"
             )
-            if data:
-                self.document_collection.add(data)
+        self.document_collection.create_index(
+            vector_column_name=self.vector_field, index_type="IVF_FLAT"
+        )
 
     def similarity_search_by_vector(
         self, query_embedding: list[float] | np.ndarray, k: int = 10
diff --git a/tests/integration/vector_stores/test_lancedb.py b/tests/integration/vector_stores/test_lancedb.py
@@ -83,7 +83,7 @@ def test_vector_store_operations(self, sample_documents):
             assert isinstance(results[0].score, float)
 
             # Test append mode
-            vector_store.load_documents([sample_documents[2]], overwrite=False)
+            vector_store.load_documents([sample_documents[2]])
             result = vector_store.search_by_id("3")
             assert result.id == "3"
 
@@ -137,7 +137,7 @@ def test_empty_collection(self):
                 id="1",
                 vector=[0.1, 0.2, 0.3, 0.4, 0.5],
             )
-            vector_store.load_documents([doc], overwrite=False)
+            vector_store.load_documents([doc])
 
             result = vector_store.search_by_id("1")
             assert result.id == "1"
@@ -205,7 +205,7 @@ def test_vector_store_customization(self, sample_documents):
             assert isinstance(results[0].score, float)
 
             # Test append mode
-            vector_store.load_documents([sample_documents[2]], overwrite=False)
+            vector_store.load_documents([sample_documents[2]])
             result = vector_store.search_by_id("3")
             assert result.id == "3"
 
diff --git a/tests/unit/config/utils.py b/tests/unit/config/utils.py
@@ -120,7 +120,6 @@ def assert_vector_store_configs(
         assert store_a.api_key == store_e.api_key
         assert store_a.audience == store_e.audience
         assert store_a.container_name == store_e.container_name
-        assert store_a.overwrite == store_e.overwrite
         assert store_a.database_name == store_e.database_name