Skip to content

Commit d7773bd

Browse files
gaudyb and Gaudy Blanco authored
Clean vector store (#2077)
* clean vector store code
* fix
* fix launch.json

---------

Co-authored-by: Gaudy Blanco <[email protected]>
1 parent d751682 commit d7773bd

File tree

8 files changed

+11
-84
lines changed

8 files changed

+11
-84
lines changed

graphrag/query/structured_search/local_search/mixed_context.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -84,10 +84,6 @@ def __init__(
8484
self.tokenizer = tokenizer or get_tokenizer()
8585
self.embedding_vectorstore_key = embedding_vectorstore_key
8686

87-
def filter_by_entity_keys(self, entity_keys: list[int] | list[str]):
88-
"""Filter entity text embeddings by entity keys."""
89-
self.entity_text_embeddings.filter_by_id(entity_keys)
90-
9187
def build_context(
9288
self,
9389
query: str,

graphrag/vector_stores/azure_ai_search.py

Lines changed: 2 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -149,24 +149,8 @@ def load_documents(
149149
if len(batch) > 0:
150150
self.db_connection.upload_documents(batch)
151151

152-
def filter_by_id(self, include_ids: list[str] | list[int]) -> Any:
153-
"""Build a query filter to filter documents by a list of ids."""
154-
if include_ids is None or len(include_ids) == 0:
155-
self.query_filter = None
156-
# Returning to keep consistency with other methods, but not needed
157-
return self.query_filter
158-
159-
# More info about odata filtering here: https://learn.microsoft.com/en-us/azure/search/search-query-odata-search-in-function
160-
# search.in is faster that joined and/or conditions
161-
id_filter = ",".join([f"{id!s}" for id in include_ids])
162-
self.query_filter = f"search.in({self.id_field}, '{id_filter}', ',')"
163-
164-
# Returning to keep consistency with other methods, but not needed
165-
# TODO: Refactor on a future PR
166-
return self.query_filter
167-
168152
def similarity_search_by_vector(
169-
self, query_embedding: list[float], k: int = 10, **kwargs: Any
153+
self, query_embedding: list[float], k: int = 10
170154
) -> list[VectorStoreSearchResult]:
171155
"""Perform a vector-based similarity search."""
172156
vectorized_query = VectorizedQuery(
@@ -193,7 +177,7 @@ def similarity_search_by_vector(
193177
]
194178

195179
def similarity_search_by_text(
196-
self, text: str, text_embedder: TextEmbedder, k: int = 10, **kwargs: Any
180+
self, text: str, text_embedder: TextEmbedder, k: int = 10
197181
) -> list[VectorStoreSearchResult]:
198182
"""Perform a text-based similarity search."""
199183
query_embedding = text_embedder(text)

graphrag/vector_stores/base.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -71,20 +71,16 @@ def load_documents(
7171

7272
@abstractmethod
7373
def similarity_search_by_vector(
74-
self, query_embedding: list[float], k: int = 10, **kwargs: Any
74+
self, query_embedding: list[float], k: int = 10
7575
) -> list[VectorStoreSearchResult]:
7676
"""Perform ANN search by vector."""
7777

7878
@abstractmethod
7979
def similarity_search_by_text(
80-
self, text: str, text_embedder: TextEmbedder, k: int = 10, **kwargs: Any
80+
self, text: str, text_embedder: TextEmbedder, k: int = 10
8181
) -> list[VectorStoreSearchResult]:
8282
"""Perform ANN search by text."""
8383

84-
@abstractmethod
85-
def filter_by_id(self, include_ids: list[str] | list[int]) -> Any:
86-
"""Build a query filter to filter documents by id."""
87-
8884
@abstractmethod
8985
def search_by_id(self, id: str) -> VectorStoreDocument:
9086
"""Search for a document by id."""

graphrag/vector_stores/cosmosdb.py

Lines changed: 2 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -179,7 +179,7 @@ def load_documents(
179179
self._container_client.upsert_item(doc_json)
180180

181181
def similarity_search_by_vector(
182-
self, query_embedding: list[float], k: int = 10, **kwargs: Any
182+
self, query_embedding: list[float], k: int = 10
183183
) -> list[VectorStoreSearchResult]:
184184
"""Perform a vector-based similarity search."""
185185
if self._container_client is None:
@@ -241,7 +241,7 @@ def cosine_similarity(a, b):
241241
]
242242

243243
def similarity_search_by_text(
244-
self, text: str, text_embedder: TextEmbedder, k: int = 10, **kwargs: Any
244+
self, text: str, text_embedder: TextEmbedder, k: int = 10
245245
) -> list[VectorStoreSearchResult]:
246246
"""Perform a text-based similarity search."""
247247
query_embedding = text_embedder(text)
@@ -251,20 +251,6 @@ def similarity_search_by_text(
251251
)
252252
return []
253253

254-
def filter_by_id(self, include_ids: list[str] | list[int]) -> Any:
255-
"""Build a query filter to filter documents by a list of ids."""
256-
if include_ids is None or len(include_ids) == 0:
257-
self.query_filter = None
258-
else:
259-
if isinstance(include_ids[0], str):
260-
id_filter = ", ".join([f"'{id}'" for id in include_ids])
261-
else:
262-
id_filter = ", ".join([str(id) for id in include_ids])
263-
self.query_filter = (
264-
f"SELECT * FROM c WHERE c.{self.id_field} IN ({id_filter})" # noqa: S608
265-
)
266-
return self.query_filter
267-
268254
def search_by_id(self, id: str) -> VectorStoreDocument:
269255
"""Search for a document by id."""
270256
if self._container_client is None:

graphrag/vector_stores/lancedb.py

Lines changed: 2 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -100,22 +100,8 @@ def load_documents(
100100
if data:
101101
self.document_collection.add(data)
102102

103-
def filter_by_id(self, include_ids: list[str] | list[int]) -> Any:
104-
"""Build a query filter to filter documents by id."""
105-
if len(include_ids) == 0:
106-
self.query_filter = None
107-
else:
108-
if isinstance(include_ids[0], str):
109-
id_filter = ", ".join([f"'{id}'" for id in include_ids])
110-
self.query_filter = f"{self.id_field} in ({id_filter})"
111-
else:
112-
self.query_filter = (
113-
f"{self.id_field} in ({', '.join([str(id) for id in include_ids])})"
114-
)
115-
return self.query_filter
116-
117103
def similarity_search_by_vector(
118-
self, query_embedding: list[float] | np.ndarray, k: int = 10, **kwargs: Any
104+
self, query_embedding: list[float] | np.ndarray, k: int = 10
119105
) -> list[VectorStoreSearchResult]:
120106
"""Perform a vector-based similarity search."""
121107
if self.query_filter:
@@ -151,7 +137,7 @@ def similarity_search_by_vector(
151137
]
152138

153139
def similarity_search_by_text(
154-
self, text: str, text_embedder: TextEmbedder, k: int = 10, **kwargs: Any
140+
self, text: str, text_embedder: TextEmbedder, k: int = 10
155141
) -> list[VectorStoreSearchResult]:
156142
"""Perform a similarity search using a given input text."""
157143
query_embedding = text_embedder(text)

tests/integration/vector_stores/test_azure_ai_search.py

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -136,9 +136,6 @@ async def test_vector_store_operations(
136136
assert mock_index_client.create_or_update_index.called
137137
assert mock_search_client.upload_documents.called
138138

139-
filter_query = vector_store.filter_by_id(["doc1", "doc2"])
140-
assert filter_query == "search.in(id, 'doc1,doc2', ',')"
141-
142139
vector_results = vector_store.similarity_search_by_vector(
143140
[0.1, 0.2, 0.3, 0.4, 0.5], k=2
144141
)
@@ -215,12 +212,6 @@ async def test_vector_store_customization(
215212
assert mock_index_client.create_or_update_index.called
216213
assert mock_search_client.upload_documents.called
217214

218-
filter_query = vector_store_custom.filter_by_id(["doc1", "doc2"])
219-
assert (
220-
filter_query
221-
== f"search.in({vector_store_custom.id_field}, 'doc1,doc2', ',')"
222-
)
223-
224215
vector_results = vector_store_custom.similarity_search_by_vector(
225216
[0.1, 0.2, 0.3, 0.4, 0.5], k=2
226217
)

tests/integration/vector_stores/test_cosmosdb.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,6 @@ def test_vector_store_operations():
5050
]
5151
vector_store.load_documents(docs)
5252

53-
vector_store.filter_by_id(["doc1"])
54-
5553
doc = vector_store.search_by_id("doc1")
5654
assert doc.id == "doc1"
5755
assert doc.text == "This is document 1"
@@ -140,8 +138,6 @@ def test_vector_store_customization():
140138
]
141139
vector_store.load_documents(docs)
142140

143-
vector_store.filter_by_id(["doc1"])
144-
145141
doc = vector_store.search_by_id("doc1")
146142
assert doc.id == "doc1"
147143
assert doc.text == "This is document 1"

tests/integration/vector_stores/test_lancedb.py

Lines changed: 3 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -91,9 +91,6 @@ def test_vector_store_operations(self, sample_documents):
9191
assert np.allclose(doc.vector, [0.1, 0.2, 0.3, 0.4, 0.5])
9292
assert doc.attributes["title"] == "Doc 1"
9393

94-
filter_query = vector_store.filter_by_id(["1"])
95-
assert filter_query == "id in ('1')"
96-
9794
results = vector_store.similarity_search_by_vector(
9895
[0.1, 0.2, 0.3, 0.4, 0.5], k=2
9996
)
@@ -186,16 +183,14 @@ def test_filter_search(self, sample_documents_categories):
186183
vector_store.load_documents(sample_documents_categories)
187184

188185
# Filter to include only documents about animals
189-
vector_store.filter_by_id(["1", "2"])
190186
results = vector_store.similarity_search_by_vector(
191187
[0.1, 0.2, 0.3, 0.4, 0.5], k=3
192188
)
193189

194-
# Should return at most 2 documents (the filtered ones)
195-
assert len(results) <= 2
190+
# Should return at most 3 documents (the filtered ones)
191+
assert len(results) <= 3
196192
ids = [result.document.id for result in results]
197-
assert "3" not in ids
198-
assert set(ids).issubset({"1", "2"})
193+
assert set(ids).issubset({"1", "2", "3"})
199194
finally:
200195
shutil.rmtree(temp_dir)
201196

@@ -230,9 +225,6 @@ def test_vector_store_customization(self, sample_documents):
230225
assert np.allclose(doc.vector, [0.1, 0.2, 0.3, 0.4, 0.5])
231226
assert doc.attributes["title"] == "Doc 1"
232227

233-
filter_query = vector_store.filter_by_id(["1"])
234-
assert filter_query == f"{vector_store.id_field} in ('1')"
235-
236228
results = vector_store.similarity_search_by_vector(
237229
[0.1, 0.2, 0.3, 0.4, 0.5], k=2
238230
)

0 commit comments

Comments
 (0)