|
| 1 | +"""Tests for CRUD utilities.""" |
| 2 | + |
| 3 | +from collections.abc import Callable, Generator, Iterable |
| 4 | +from typing import Any |
| 5 | + |
| 6 | +from bson import ObjectId |
| 7 | +from pymongo import ReplaceOne |
| 8 | +from pymongo.synchronous.collection import Collection |
| 9 | + |
| 10 | +from pymongo_vectorsearch_utils.util import oid_to_str, str_to_oid |
| 11 | + |
| 12 | + |
def bulk_embed_and_insert_texts(
    texts: list[str] | Iterable[str],
    metadatas: list[dict] | Generator[dict, Any, Any],
    embedding_func: Callable[[list[str]], list[list[float]]],
    collection: Collection[Any],
    text_key: str,
    embedding_key: str,
    ids: list[str] | None = None,
    **kwargs: Any,
) -> list[str]:
    """Embed a single batch of texts and upsert them into a collection.

    Each text is paired with its metadata dict, its embedding vector and an
    id, and written with one ``ReplaceOne(..., upsert=True)`` operation per
    document in a single ``bulk_write`` call.

    Important notes on ids:
        - If ``_id`` or ``id`` is a key in the metadatas dicts, one must
          pop them and provide them as a separate list via ``ids``.
        - They must be unique.
        - If they are not provided (``None`` or empty), a new
          ``bson.ObjectId`` is generated for every text.
        - Every id is passed through ``str_to_oid`` before being stored
          under ``_id``; returned ids are passed through ``oid_to_str``.

    Args:
        texts: Strings to embed and store. May be any iterable, including
            a one-shot iterator; it is materialized into a list once.
        metadatas: One metadata dict per text. Its keys are merged into the
            stored document after ``_id``, ``text_key`` and
            ``embedding_key``, so a colliding key overrides those fields.
        embedding_func: A function that generates embedding vectors from
            the list of texts.
        collection: The MongoDB collection where documents will be written.
        text_key: The field name where the text will be stored in each
            document.
        embedding_key: The field name where the embedding will be stored in
            each document.
        ids: Optional list of unique ids, one per text. See note on ids.
        **kwargs: Accepted for call-site compatibility; not used here.

    Returns:
        The ids listed in ``BulkWriteResult.upserted_ids``, converted with
        ``oid_to_str``. An empty list is returned when ``texts`` is empty.
        NOTE(review): per the PyMongo docs ``upserted_ids`` only covers
        operations that inserted a new document, so ids whose document
        already existed and was replaced are presumably not returned.
    """
    # Materialize once: ``texts`` may be a one-shot iterator, and it is needed
    # for the emptiness check, the embedding call, id generation and the zip.
    text_list = list(texts)
    if not text_list:
        return []

    # Compute embedding vectors
    embeddings = embedding_func(text_list)
    if not ids:
        ids = [str(ObjectId()) for _ in text_list]

    # strict=False: zip stops at the shortest input, so a short ``ids`` or
    # ``metadatas`` silently limits how many documents are written.
    docs = [
        {
            "_id": str_to_oid(i),
            text_key: t,
            embedding_key: embedding,
            **m,
        }
        for i, t, m, embedding in zip(
            ids, text_list, metadatas, embeddings, strict=False
        )
    ]
    operations = [ReplaceOne({"_id": doc["_id"]}, doc, upsert=True) for doc in docs]

    # Upsert the documents in a single round trip.
    result = collection.bulk_write(operations)
    # ``upserted_ids`` is typed as optional; this narrows it for the return.
    assert result.upserted_ids is not None
    return [oid_to_str(_id) for _id in result.upserted_ids.values()]
0 commit comments