Skip to content

Commit 0c2d13f

Browse files
authored
bug: skip validating empty embeddings (#3774)
* skip validating empty embeddings * skip batches without embeddings to update * add unit test with mocked retriever
1 parent e84fae2 commit 0c2d13f

File tree

3 files changed

+23
-4
lines changed

3 files changed

+23
-4
lines changed

haystack/document_stores/pinecone.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -505,6 +505,12 @@ def update_embeddings(
505505
for _ in range(0, document_count, batch_size):
506506
document_batch = list(islice(documents, batch_size))
507507
embeddings = retriever.embed_documents(document_batch)
508+
if embeddings.size == 0:
509+
# Skip batch if there are no embeddings. Otherwise, incorrect embedding shape will be inferred and
510+
# Pinecone API will return a "No vectors provided" Bad Request Error
511+
progress_bar.set_description_str("Documents Processed")
512+
progress_bar.update(batch_size)
513+
continue
508514
self._validate_embeddings_shape(
509515
embeddings=embeddings, num_documents=len(document_batch), embedding_dim=self.embedding_dim
510516
)

test/document_stores/test_pinecone.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import os
44
from inspect import getmembers, isclass, isfunction
5+
from unittest.mock import MagicMock
56

67
import pytest
78

@@ -12,8 +13,7 @@
1213

1314
from .test_base import DocumentStoreBaseTestAbstract
1415
from ..mocks import pinecone as pinecone_mock
15-
from ..conftest import SAMPLES_PATH
16-
16+
from ..nodes.test_retriever import MockBaseRetriever
1717

1818
# Set metadata fields used during testing for PineconeDocumentStore meta_config
1919
META_FIELDS = ["meta_field", "name", "date", "numeric_field", "odd_document"]
@@ -417,3 +417,15 @@ def test_multilayer_dict(self, doc_store_with_docs: PineconeDocumentStore):
417417

418418
assert len(retrieved_docs) == 1
419419
assert retrieved_docs[0].meta == multilayer_meta
420+
421+
@pytest.mark.unit
422+
def test_skip_validating_empty_embeddings(self, ds: PineconeDocumentStore):
423+
document = Document(id="0", content="test")
424+
retriever = MockBaseRetriever(document_store=ds, mock_document=document)
425+
ds.write_documents(documents=[document])
426+
ds._validate_embeddings_shape = MagicMock()
427+
428+
ds.update_embeddings(retriever)
429+
ds._validate_embeddings_shape.assert_called_once()
430+
ds.update_embeddings(retriever, update_existing_embeddings=False)
431+
ds._validate_embeddings_shape.assert_called_once()

test/nodes/test_retriever.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,6 @@
2020
from haystack.pipelines import DocumentSearchPipeline
2121
from haystack.schema import Document
2222
from haystack.document_stores.elasticsearch import ElasticsearchDocumentStore
23-
from haystack.document_stores.faiss import FAISSDocumentStore
24-
from haystack.document_stores import MilvusDocumentStore
2523
from haystack.nodes.retriever.dense import DensePassageRetriever, EmbeddingRetriever, TableTextRetriever
2624
from haystack.nodes.retriever.sparse import BM25Retriever, FilterRetriever, TfidfRetriever
2725
from haystack.nodes.retriever.multimodal import MultiModalRetriever
@@ -160,6 +158,9 @@ def retrieve_batch(
160158
):
161159
return [[self.mock_document] for _ in range(len(queries))]
162160

161+
def embed_documents(self, documents: List[Document]):
162+
return np.full((len(documents), 768), 0.5)
163+
163164

164165
def test_retrieval_empty_query(document_store: BaseDocumentStore):
165166
# test with empty query using the run() method

0 commit comments

Comments
 (0)