Skip to content

Commit fe7837e

Browse files
authored
bugfix/prevent pinecone being called too often when deleting content (#295)
1 parent e041b86 commit fe7837e

File tree

3 files changed

+26
-33
lines changed

3 files changed

+26
-33
lines changed

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
## 0.3.8-dev0
2+
3+
### Fixes
4+
5+
* **Prevent Pinecone delete from hammering the database when deleting content**
16

27
## 0.3.7
38

unstructured_ingest/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.3.7" # pragma: no cover
1+
__version__ = "0.3.8-dev0" # pragma: no cover

unstructured_ingest/v2/processes/connectors/pinecone.py

Lines changed: 20 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,10 @@
66
from pydantic import Field, Secret
77

88
from unstructured_ingest.error import DestinationConnectionError
9-
from unstructured_ingest.utils.data_prep import flatten_dict, generator_batching_wbytes
9+
from unstructured_ingest.utils.data_prep import (
10+
flatten_dict,
11+
generator_batching_wbytes,
12+
)
1013
from unstructured_ingest.utils.dep_check import requires_dependencies
1114
from unstructured_ingest.v2.constants import RECORD_ID_LABEL
1215
from unstructured_ingest.v2.interfaces import (
@@ -148,8 +151,10 @@ def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
148151

149152
metadata[RECORD_ID_LABEL] = file_data.identifier
150153

154+
# To support more optimal deletes, a prefix is suggested for each record:
155+
# https://docs.pinecone.io/guides/data/manage-rag-documents#delete-all-records-for-a-parent-document
151156
return {
152-
"id": get_enhanced_element_id(element_dict=element_dict, file_data=file_data),
157+
"id": f"{file_data.identifier}#{get_enhanced_element_id(element_dict=element_dict, file_data=file_data)}", # noqa:E501
153158
"values": embeddings,
154159
"metadata": metadata,
155160
}
@@ -215,45 +220,28 @@ def pod_delete_by_record_id(self, file_data: FileData) -> None:
215220
f"from pinecone index: {resp}"
216221
)
217222

218-
def delete_by_query(self, index: "PineconeIndex", query_params: dict) -> None:
219-
while True:
220-
query_results = index.query(**query_params)
221-
matches = query_results.get("matches", [])
222-
if not matches:
223-
break
224-
ids = [match["id"] for match in matches]
225-
delete_params = {"ids": ids}
226-
if namespace := self.upload_config.namespace:
227-
delete_params["namespace"] = namespace
228-
index.delete(**delete_params)
229-
230223
def serverless_delete_by_record_id(self, file_data: FileData) -> None:
    """Delete every record belonging to ``file_data`` from a serverless index.

    Record ids are listed by the ``"<file_data.identifier>#"`` prefix and then
    deleted batch by batch, following the approach described at
    https://docs.pinecone.io/guides/data/manage-rag-documents#delete-all-records-for-a-parent-document
    This avoids issuing repeated vector queries and index-stats calls just to
    discover which ids to remove.

    Args:
        file_data: Source-file descriptor; its ``identifier`` is the id prefix
            used to find the records to delete.
    """
    logger.debug(
        f"deleting any content with metadata "
        f"{self.upload_config.record_id_key}={file_data.identifier} "
        f"from pinecone serverless index"
    )
    index = self.connection_config.get_index(pool_threads=MAX_POOL_THREADS)

    # The namespace does not change between batches, so resolve it once.
    namespace = self.upload_config.namespace

    list_kwargs = {"prefix": f"{file_data.identifier}#"}
    if namespace:
        list_kwargs["namespace"] = namespace

    deleted_ids = 0
    # index.list(...) is iterated one batch of ids at a time.
    for ids in index.list(**list_kwargs):
        delete_kwargs = {"ids": ids}
        if namespace:
            delete_kwargs["namespace"] = namespace
        # Capture the response of the delete call itself; the previous code
        # bound delete_resp to the namespace string, so an error was logged
        # for every batch whenever a namespace was configured and the real
        # response was never checked.
        delete_resp = index.delete(**delete_kwargs)
        # delete_resp should be an empty dict if there were no errors
        if delete_resp:
            logger.error(f"failed to delete batch of ids: {delete_resp}")
        else:
            # Only count batches that were deleted without a reported error.
            deleted_ids += len(ids)

    logger.info(
        f"deleted {deleted_ids} records with metadata "
        f"{self.upload_config.record_id_key}={file_data.identifier} "
        f"from pinecone index"
    )

0 commit comments

Comments
 (0)