Skip to content

Commit f50b496

Browse files
authored
bug: fix embedding_dim mismatch in DocumentStore (#3183)
* match index dim with embed dim (#3090)
* aligned messages across all docstores
* aligned messages across all docstores (#3090)
* aligned messages across all docstores (#3090)
1 parent 768583d commit f50b496

File tree

7 files changed

+58
-17
lines changed

7 files changed

+58
-17
lines changed

haystack/document_stores/elasticsearch.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1479,14 +1479,16 @@ def update_embeddings(
14791479
for result_batch in get_batches_from_generator(result, batch_size):
14801480
document_batch = [self._convert_es_hit_to_document(hit, return_embedding=False) for hit in result_batch]
14811481
embeddings = retriever.embed_documents(document_batch) # type: ignore
1482-
assert len(document_batch) == len(embeddings)
1483-
1482+
if len(document_batch) != len(embeddings):
1483+
raise DocumentStoreError(
1484+
"The number of embeddings does not match the number of documents in the batch "
1485+
f"({len(embeddings)} != {len(document_batch)})"
1486+
)
14841487
if embeddings[0].shape[0] != self.embedding_dim:
14851488
raise RuntimeError(
1486-
f"Embedding dim. of model ({embeddings[0].shape[0]})"
1487-
f" doesn't match embedding dim. in DocumentStore ({self.embedding_dim})."
1488-
"Specify the arg `embedding_dim` when initializing ElasticsearchDocumentStore()"
1489+
f"Embedding dimensions of the model ({embeddings[0].shape[0]}) doesn't match the embedding dimensions of the document store ({self.embedding_dim}). Please reinitiate ElasticsearchDocumentStore() with arg embedding_dim={embeddings[0].shape[0]}."
14891490
)
1491+
14901492
doc_updates = []
14911493
for doc, emb in zip(document_batch, embeddings):
14921494
update = {

haystack/document_stores/faiss.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222

2323
from haystack.schema import Document
2424
from haystack.document_stores.base import get_batches_from_generator
25+
from haystack.errors import DocumentStoreError
2526

2627
if TYPE_CHECKING:
2728
from haystack.nodes.retriever import BaseRetriever
@@ -361,7 +362,15 @@ def update_embeddings(
361362
) as progress_bar:
362363
for document_batch in batched_documents:
363364
embeddings = retriever.embed_documents(document_batch) # type: ignore
364-
assert len(document_batch) == len(embeddings)
365+
if len(document_batch) != len(embeddings):
366+
raise DocumentStoreError(
367+
"The number of embeddings does not match the number of documents in the batch "
368+
f"({len(embeddings)} != {len(document_batch)})"
369+
)
370+
if embeddings[0].shape[0] != self.embedding_dim:
371+
raise RuntimeError(
372+
f"Embedding dimensions of the model ({embeddings[0].shape[0]}) doesn't match the embedding dimensions of the document store ({self.embedding_dim}). Please reinitiate FAISSDocumentStore() with arg embedding_dim={embeddings[0].shape[0]}."
373+
)
365374

366375
embeddings_to_index = np.array(embeddings, dtype="float32")
367376

haystack/document_stores/memory.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -459,16 +459,14 @@ def update_embeddings(
459459
) as progress_bar:
460460
for document_batch in batched_documents:
461461
embeddings = retriever.embed_documents(document_batch) # type: ignore
462-
if not len(document_batch) == len(embeddings):
462+
if len(document_batch) != len(embeddings):
463463
raise DocumentStoreError(
464464
"The number of embeddings does not match the number of documents in the batch "
465465
f"({len(embeddings)} != {len(document_batch)})"
466466
)
467467
if embeddings[0].shape[0] != self.embedding_dim:
468468
raise RuntimeError(
469-
f"Embedding dim. of model ({embeddings[0].shape[0]})"
470-
f" doesn't match embedding dim. in DocumentStore ({self.embedding_dim})."
471-
"Specify the arg `embedding_dim` when initializing InMemoryDocumentStore()"
469+
f"Embedding dimensions of the model ({embeddings[0].shape[0]}) doesn't match the embedding dimensions of the document store ({self.embedding_dim}). Please reinitiate InMemoryDocumentStore() with arg embedding_dim={embeddings[0].shape[0]}."
472470
)
473471

474472
for doc, emb in zip(document_batch, embeddings):

haystack/document_stores/milvus1.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
from haystack.schema import Document
1717
from haystack.document_stores.base import get_batches_from_generator
18+
from haystack.errors import DocumentStoreError
1819

1920
if TYPE_CHECKING:
2021
from haystack.nodes.retriever import BaseRetriever
@@ -334,6 +335,16 @@ def update_embeddings(
334335
self._delete_vector_ids_from_milvus(documents=document_batch, index=index)
335336

336337
embeddings = retriever.embed_documents(document_batch) # type: ignore
338+
if len(document_batch) != len(embeddings):
339+
raise DocumentStoreError(
340+
"The number of embeddings does not match the number of documents in the batch "
341+
f"({len(embeddings)} != {len(document_batch)})"
342+
)
343+
if embeddings[0].shape[0] != self.embedding_dim:
344+
raise RuntimeError(
345+
f"Embedding dimensions of the model ({embeddings[0].shape[0]}) doesn't match the embedding dimensions of the document store ({self.embedding_dim}). Please reinitiate MilvusDocumentStore() with arg embedding_dim={embeddings[0].shape[0]}."
346+
)
347+
337348
if self.similarity == "cosine":
338349
for embedding in embeddings:
339350
self.normalize_embedding(embedding)

haystack/document_stores/milvus2.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from haystack.schema import Document
1919
from haystack.document_stores.sql import SQLDocumentStore
2020
from haystack.document_stores.base import get_batches_from_generator
21+
from haystack.errors import DocumentStoreError
2122

2223
if TYPE_CHECKING:
2324
from haystack.nodes.retriever.base import BaseRetriever
@@ -369,6 +370,16 @@ def update_embeddings(
369370
self._delete_vector_ids_from_milvus(documents=document_batch, index=index)
370371

371372
embeddings = retriever.embed_documents(document_batch) # type: ignore
373+
if len(document_batch) != len(embeddings):
374+
raise DocumentStoreError(
375+
"The number of embeddings does not match the number of documents in the batch "
376+
f"({len(embeddings)} != {len(document_batch)})"
377+
)
378+
if embeddings[0].shape[0] != self.embedding_dim:
379+
raise RuntimeError(
380+
f"Embedding dimensions of the model ({embeddings[0].shape[0]}) doesn't match the embedding dimensions of the document store ({self.embedding_dim}). Please reinitiate MilvusDocumentStore() with arg embedding_dim={embeddings[0].shape[0]}."
381+
)
382+
372383
if self.cosine:
373384
embeddings = [embedding / np.linalg.norm(embedding) for embedding in embeddings]
374385
embeddings_list = [embedding.tolist() for embedding in embeddings]

haystack/document_stores/pinecone.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from haystack.document_stores import BaseDocumentStore
1212

1313
from haystack.document_stores.filter_utils import LogicalFilterClause
14-
from haystack.errors import PineconeDocumentStoreError, DuplicateDocumentError
14+
from haystack.errors import PineconeDocumentStoreError, DuplicateDocumentError, DocumentStoreError
1515

1616
if TYPE_CHECKING:
1717
from haystack.nodes.retriever import BaseRetriever
@@ -490,7 +490,15 @@ def update_embeddings(
490490
for _ in range(0, document_count, batch_size):
491491
document_batch = list(islice(documents, batch_size))
492492
embeddings = retriever.embed_documents(document_batch) # type: ignore
493-
assert len(document_batch) == len(embeddings)
493+
if len(document_batch) != len(embeddings):
494+
raise DocumentStoreError(
495+
"The number of embeddings does not match the number of documents in the batch "
496+
f"({len(embeddings)} != {len(document_batch)})"
497+
)
498+
if embeddings[0].shape[0] != self.embedding_dim:
499+
raise RuntimeError(
500+
f"Embedding dimensions of the model ({embeddings[0].shape[0]}) doesn't match the embedding dimensions of the document store ({self.embedding_dim}). Please reinitiate PineconeDocumentStore() with arg embedding_dim={embeddings[0].shape[0]}."
501+
)
494502

495503
embeddings_to_index = np.array(embeddings, dtype="float32")
496504
if self.similarity == "cosine":

haystack/document_stores/weaviate.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1228,14 +1228,16 @@ def update_embeddings(
12281228
self._convert_weaviate_result_to_document(hit, return_embedding=False) for hit in result_batch
12291229
]
12301230
embeddings = retriever.embed_documents(document_batch) # type: ignore
1231-
assert len(document_batch) == len(embeddings)
1232-
1231+
if len(document_batch) != len(embeddings):
1232+
raise DocumentStoreError(
1233+
"The number of embeddings does not match the number of documents in the batch "
1234+
f"({len(embeddings)} != {len(document_batch)})"
1235+
)
12331236
if embeddings[0].shape[0] != self.embedding_dim:
12341237
raise RuntimeError(
1235-
f"Embedding dim. of model ({embeddings[0].shape[0]})"
1236-
f" doesn't match embedding dim. in DocumentStore ({self.embedding_dim})."
1237-
"Specify the arg `embedding_dim` when initializing WeaviateDocumentStore()"
1238+
f"Embedding dimensions of the model ({embeddings[0].shape[0]}) doesn't match the embedding dimensions of the document store ({self.embedding_dim}). Please reinitiate WeaviateDocumentStore() with arg embedding_dim={embeddings[0].shape[0]}."
12381239
)
1240+
12391241
for doc, emb in zip(document_batch, embeddings):
12401242
# Using update method to only update the embeddings, other properties will be in tact
12411243
if self.similarity == "cosine":

0 commit comments

Comments
 (0)