Skip to content

Commit 37a34cf

Browse files
Gaudy Blanco
authored and committed
overwrite removed from vector store
1 parent 14e3335 commit 37a34cf

File tree

8 files changed

+67
-98
lines changed

8 files changed

+67
-98
lines changed

graphrag/config/models/vector_store_config.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -82,11 +82,6 @@ def _validate_url(self) -> None:
8282
default=vector_store_defaults.database_name,
8383
)
8484

85-
overwrite: bool = Field(
86-
description="Overwrite the existing data.",
87-
default=vector_store_defaults.overwrite,
88-
)
89-
9085
embeddings_schema: dict[str, VectorStoreSchemaConfig] = {}
9186

9287
def _validate_embeddings_schema(self) -> None:

graphrag/index/operations/embed_text/embed_text.py

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -40,17 +40,14 @@ async def embed_text(
4040
vector_store: BaseVectorStore = _create_vector_store(
4141
vector_store_config, index_name, embedding_name
4242
)
43-
vector_store_workflow_config = vector_store_config.get(
44-
embedding_name, vector_store_config
45-
)
43+
4644
return await _text_embed_with_vector_store(
4745
input=input,
4846
callbacks=callbacks,
4947
model=model,
5048
tokenizer=tokenizer,
5149
embed_column=embed_column,
5250
vector_store=vector_store,
53-
vector_store_config=vector_store_workflow_config,
5451
batch_size=batch_size,
5552
batch_max_tokens=batch_max_tokens,
5653
num_threads=num_threads,
@@ -95,17 +92,13 @@ async def _text_embed_with_vector_store(
9592
tokenizer: Tokenizer,
9693
embed_column: str,
9794
vector_store: BaseVectorStore,
98-
vector_store_config: dict,
9995
batch_size: int,
10096
batch_max_tokens: int,
10197
num_threads: int,
10298
id_column: str,
10399
title_column: str | None = None,
104100
):
105101
# Get vector-storage configuration
106-
107-
overwrite: bool = vector_store_config.get("overwrite", True)
108-
109102
if embed_column not in input.columns:
110103
msg = f"Column {embed_column} not found in input dataframe with columns {input.columns}"
111104
raise ValueError(msg)
@@ -168,7 +161,7 @@ async def _text_embed_with_vector_store(
168161
)
169162
documents.append(document)
170163

171-
vector_store.load_documents(documents, overwrite and i == 0)
164+
vector_store.load_documents(documents)
172165
starting_index += len(documents)
173166
i += 1
174167

graphrag/vector_stores/azure_ai_search.py

Lines changed: 46 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -74,57 +74,54 @@ def connect(self, **kwargs: Any) -> Any:
7474
not_supported_error = "Azure AI Search expects `url`."
7575
raise ValueError(not_supported_error)
7676

77-
def load_documents(
78-
self, documents: list[VectorStoreDocument], overwrite: bool = True
79-
) -> None:
77+
def load_documents(self, documents: list[VectorStoreDocument]) -> None:
8078
"""Load documents into an Azure AI Search index."""
81-
if overwrite:
82-
if (
83-
self.index_name is not None
84-
and self.index_name in self.index_client.list_index_names()
85-
):
86-
self.index_client.delete_index(self.index_name)
87-
88-
# Configure vector search profile
89-
vector_search = VectorSearch(
90-
algorithms=[
91-
HnswAlgorithmConfiguration(
92-
name="HnswAlg",
93-
parameters=HnswParameters(
94-
metric=VectorSearchAlgorithmMetric.COSINE
95-
),
96-
)
97-
],
98-
profiles=[
99-
VectorSearchProfile(
100-
name=self.vector_search_profile_name,
101-
algorithm_configuration_name="HnswAlg",
102-
)
103-
],
104-
)
105-
# Configure the index
106-
index = SearchIndex(
107-
name=self.index_name if self.index_name else "",
108-
fields=[
109-
SimpleField(
110-
name=self.id_field,
111-
type=SearchFieldDataType.String,
112-
key=True,
113-
),
114-
SearchField(
115-
name=self.vector_field,
116-
type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
117-
searchable=True,
118-
hidden=False, # DRIFT needs to return the vector for client-side similarity
119-
vector_search_dimensions=self.vector_size,
120-
vector_search_profile_name=self.vector_search_profile_name,
79+
if (
80+
self.index_name is not None
81+
and self.index_name in self.index_client.list_index_names()
82+
):
83+
self.index_client.delete_index(self.index_name)
84+
85+
# Configure vector search profile
86+
vector_search = VectorSearch(
87+
algorithms=[
88+
HnswAlgorithmConfiguration(
89+
name="HnswAlg",
90+
parameters=HnswParameters(
91+
metric=VectorSearchAlgorithmMetric.COSINE
12192
),
122-
],
123-
vector_search=vector_search,
124-
)
125-
self.index_client.create_or_update_index(
126-
index,
127-
)
93+
)
94+
],
95+
profiles=[
96+
VectorSearchProfile(
97+
name=self.vector_search_profile_name,
98+
algorithm_configuration_name="HnswAlg",
99+
)
100+
],
101+
)
102+
# Configure the index
103+
index = SearchIndex(
104+
name=self.index_name if self.index_name else "",
105+
fields=[
106+
SimpleField(
107+
name=self.id_field,
108+
type=SearchFieldDataType.String,
109+
key=True,
110+
),
111+
SearchField(
112+
name=self.vector_field,
113+
type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
114+
searchable=True,
115+
hidden=False, # DRIFT needs to return the vector for client-side similarity
116+
vector_search_dimensions=self.vector_size,
117+
vector_search_profile_name=self.vector_search_profile_name,
118+
),
119+
],
120+
vector_search=vector_search,
121+
)
122+
self.index_client.create_or_update_index(
123+
index,
124+
)
128125

129126
batch = [
130127
{

graphrag/vector_stores/base.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -58,9 +58,7 @@ def connect(self, **kwargs: Any) -> None:
5858
"""Connect to vector storage."""
5959

6060
@abstractmethod
61-
def load_documents(
62-
self, documents: list[VectorStoreDocument], overwrite: bool = True
63-
) -> None:
61+
def load_documents(self, documents: list[VectorStoreDocument]) -> None:
6462
"""Load documents into the vector-store."""
6563

6664
@abstractmethod

graphrag/vector_stores/cosmosdb.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -149,14 +149,11 @@ def _container_exists(self) -> bool:
149149
]
150150
return self._container_name in existing_container_names
151151

152-
def load_documents(
153-
self, documents: list[VectorStoreDocument], overwrite: bool = True
154-
) -> None:
152+
def load_documents(self, documents: list[VectorStoreDocument]) -> None:
155153
"""Load documents into CosmosDB."""
156154
# Create a CosmosDB container on overwrite
157-
if overwrite:
158-
self._delete_container()
159-
self._create_container()
155+
self._delete_container()
156+
self._create_container()
160157

161158
if self._container_client is None:
162159
msg = "Container client is not initialized."

graphrag/vector_stores/lancedb.py

Lines changed: 12 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,7 @@ def connect(self, **kwargs: Any) -> Any:
3535
if self.index_name and self.index_name in self.db_connection.table_names():
3636
self.document_collection = self.db_connection.open_table(self.index_name)
3737

38-
def load_documents(
39-
self, documents: list[VectorStoreDocument], overwrite: bool = True
40-
) -> None:
38+
def load_documents(self, documents: list[VectorStoreDocument]) -> None:
4139
"""Load documents into vector storage."""
4240
# Step 1: Prepare data columns manually
4341
ids = []
@@ -71,28 +69,20 @@ def load_documents(
7169
# NOTE: If modifying the next section of code, ensure that the schema remains the same.
7270
# The pyarrow format of the 'vector' field may change if the order of operations is changed
7371
# and will break vector search.
74-
if overwrite:
75-
if data:
76-
self.document_collection = self.db_connection.create_table(
77-
self.index_name if self.index_name else "",
78-
data=data,
79-
mode="overwrite",
80-
schema=data.schema,
81-
)
82-
else:
83-
self.document_collection = self.db_connection.create_table(
84-
self.index_name if self.index_name else "", mode="overwrite"
85-
)
86-
self.document_collection.create_index(
87-
vector_column_name=self.vector_field, index_type="IVF_FLAT"
72+
if data:
73+
self.document_collection = self.db_connection.create_table(
74+
self.index_name if self.index_name else "",
75+
data=data,
76+
mode="overwrite",
77+
schema=data.schema,
8878
)
8979
else:
90-
# add data to existing table
91-
self.document_collection = self.db_connection.open_table(
92-
self.index_name if self.index_name else ""
80+
self.document_collection = self.db_connection.create_table(
81+
self.index_name if self.index_name else "", mode="overwrite"
9382
)
94-
if data:
95-
self.document_collection.add(data)
83+
self.document_collection.create_index(
84+
vector_column_name=self.vector_field, index_type="IVF_FLAT"
85+
)
9686

9787
def similarity_search_by_vector(
9888
self, query_embedding: list[float] | np.ndarray, k: int = 10

tests/integration/vector_stores/test_lancedb.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ def test_vector_store_operations(self, sample_documents):
8383
assert isinstance(results[0].score, float)
8484

8585
# Test append mode
86-
vector_store.load_documents([sample_documents[2]], overwrite=False)
86+
vector_store.load_documents([sample_documents[2]])
8787
result = vector_store.search_by_id("3")
8888
assert result.id == "3"
8989

@@ -137,7 +137,7 @@ def test_empty_collection(self):
137137
id="1",
138138
vector=[0.1, 0.2, 0.3, 0.4, 0.5],
139139
)
140-
vector_store.load_documents([doc], overwrite=False)
140+
vector_store.load_documents([doc])
141141

142142
result = vector_store.search_by_id("1")
143143
assert result.id == "1"
@@ -205,7 +205,7 @@ def test_vector_store_customization(self, sample_documents):
205205
assert isinstance(results[0].score, float)
206206

207207
# Test append mode
208-
vector_store.load_documents([sample_documents[2]], overwrite=False)
208+
vector_store.load_documents([sample_documents[2]])
209209
result = vector_store.search_by_id("3")
210210
assert result.id == "3"
211211

tests/unit/config/utils.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,6 @@ def assert_vector_store_configs(
120120
assert store_a.api_key == store_e.api_key
121121
assert store_a.audience == store_e.audience
122122
assert store_a.container_name == store_e.container_name
123-
assert store_a.overwrite == store_e.overwrite
124123
assert store_a.database_name == store_e.database_name
125124

126125

0 commit comments

Comments
 (0)