update tests

blink1073 · blink1073 · commit 712f703ee7e4 · 2025-04-18T13:20:51.000-05:00
diff --git a/pymongo_voyageai/__init__.py b/pymongo_voyageai/__init__.py
@@ -1,7 +1,7 @@
 from ._version import __version__
 from .client import PyMongoVoyageAI
 from .document import Document, DocumentType, ImageDocument, StoredDocument, TextDocument
-from .storage import ObjectStorage, S3Storage
+from .storage import MemoryStorage, ObjectStorage, S3Storage
 
 __all__ = [
     "Document",
@@ -12,5 +12,6 @@
     "PyMongoVoyageAI",
     "ObjectStorage",
     "S3Storage",
+    "MemoryStorage",
     "__version__",
 ]
diff --git a/pymongo_voyageai/client.py b/pymongo_voyageai/client.py
@@ -6,6 +6,7 @@
 from typing import Any
 
 from bson import ObjectId
+from langchain_core.runnables.config import run_in_executor
 from langchain_mongodb.index import create_vector_search_index
 from langchain_mongodb.pipelines import vector_search_stage
 from langchain_mongodb.utils import make_serializable
@@ -105,6 +106,17 @@ def image_to_storage(self, document: ImageDocument | Image.Image) -> StoredDocum
             document = ImageDocument(image=document)
         return self._storage.save_image(document)
 
+    async def aimage_to_storage(self, document: ImageDocument | Image.Image) -> StoredDocument:
+        """Convert an image to a stored document.
+
+        Args:
+            document: The input document or image object.
+
+        Returns:
+            The stored document object.
+        """
+        return await run_in_executor(None, self.image_to_storage, document)
+
     def storage_to_image(self, document: StoredDocument | str) -> ImageDocument:
         """Convert a stored document to an image document.
 
@@ -120,6 +132,17 @@ def storage_to_image(self, document: StoredDocument | str) -> ImageDocument:
             )
         return self._storage.load_image(document=document)
 
+    async def astorage_to_image(self, document: StoredDocument | str) -> ImageDocument:
+        """Convert a stored document to an image document.
+
+        Args:
+            document: The input document or object name.
+
+        Returns:
+            The image document object.
+        """
+        return await run_in_executor(None, self.storage_to_image, document)
+
     def url_to_images(
         self,
         url: str,
@@ -145,6 +168,38 @@ def url_to_images(
             url, metadata=metadata, start=start, end=end, image_column=image_column, **kwargs
         )
 
+    async def aurl_to_images(
+        self,
+        url: str,
+        metadata: dict[str, Any] | None = None,
+        start: int = 0,
+        end: int | None = None,
+        image_column: str | None = None,
+        **kwargs: Any,
+    ) -> list[ImageDocument]:
+        """Extract images from a url.
+
+        Args:
+            url: The url to load the images from.
+            metadata: A set of metadata to associate with the images.
+            start: The start frame to use for the images.
+            end: The end frame to use for the images.
+            image_column: The name of the column used to store the image data, for parquet files.
+
+        Returns:
+            A list of image document objects.
+        """
+        return await run_in_executor(
+            None,
+            self.url_to_images,
+            url,
+            metadata=metadata,
+            start=start,
+            end=end,
+            image_column=image_column,
+            **kwargs,
+        )
+
     def add_documents(
         self,
         inputs: Sequence[str | Image.Image | Document | Sequence[str | Image.Image | Document]],
@@ -230,6 +285,30 @@ def add_documents(
             self._coll.bulk_write(operations)
         return output_docs
 
+    async def aadd_documents(
+        self,
+        inputs: Sequence[str | Image.Image | Document | Sequence[str | Image.Image | Document]],
+        ids: list[str] | None = None,
+        batch_size: int = DEFAULT_INSERT_BATCH_SIZE,
+        **kwargs: Any,
+    ) -> list[dict[str, Any]]:
+        """Add multimodal documents to the vectorstore.
+
+        Args:
+            inputs: List of inputs to add to the vectorstore, which are each a list of documents.
+            ids: Optional list of unique ids that will be used as index in VectorStore.
+                See note on ids in add_texts.
+            batch_size: Number of documents to insert at a time.
+                Tuning this may help with performance and sidestep MongoDB limits.
+            kwargs: Additional keyword args for future expansion.
+
+        Returns:
+            A list of documents with their associated input documents.
+        """
+        return await run_in_executor(
+            None, self.add_documents, inputs, ids=ids, batch_size=batch_size, **kwargs
+        )
+
     def delete_by_ids(
         self, ids: list[str | ObjectId], delete_stored_objects: bool = True, **kwargs: Any
     ) -> bool:
@@ -248,6 +327,23 @@ def delete_by_ids(
             {"_id": {"$in": oids}}, delete_stored_objects=delete_stored_objects, **kwargs
         )
 
+    async def adelete_by_ids(
+        self, ids: list[str | ObjectId], delete_stored_objects: bool = True, **kwargs: Any
+    ) -> bool:
+        """Delete documents by ids.
+
+        Args:
+            ids: List of ids to delete.
+            delete_stored_objects: Whether to delete the associated stored objects.
+            **kwargs: Other keyword arguments passed to delete_many().
+
+        Returns:
+            bool: True if deletion is successful, False otherwise.
+        """
+        return await run_in_executor(
+            None, self.delete_by_ids, ids, delete_stored_objects=delete_stored_objects, **kwargs
+        )
+
     def delete_many(
         self, filter: Mapping[str, Any], delete_stored_objects: bool = True, **kwargs: Any
     ) -> bool:
@@ -269,11 +365,32 @@ def delete_many(
                         self._storage.delete_image(inp)
         return self._coll.delete_many(filter=filter, **kwargs).acknowledged
 
+    async def adelete_many(
+        self, filter: Mapping[str, Any], delete_stored_objects: bool = True, **kwargs: Any
+    ) -> bool:
+        """Delete documents using a filter.
+
+        Args:
+            filter: Query filter selecting the documents to delete.
+            delete_stored_objects: Whether to delete the associated stored objects.
+            **kwargs: Other keyword arguments passed to the collection's `delete_many` method.
+
+        Returns:
+            bool: True if deletion is successful, False otherwise.
+        """
+        return await run_in_executor(
+            None, self.delete_many, filter, delete_stored_objects=delete_stored_objects, **kwargs
+        )
+
     def close(self) -> None:
         """Close the client, cleaning up resources."""
         self._coll.database.client.close()
         self._storage.close()
 
+    async def aclose(self) -> None:
+        """Close the client, cleaning up resources."""
+        return await run_in_executor(None, self.close)
+
     def get_by_ids(
         self, ids: Sequence[str | ObjectId], extract_images: bool = True
     ) -> list[dict[str, Any]]:
@@ -294,6 +411,21 @@ def get_by_ids(
             docs.append(doc)
         return docs
 
+    async def aget_by_ids(
+        self, ids: Sequence[str | ObjectId], extract_images: bool = True
+    ) -> list[dict[str, Any]]:
+        """Get a list of documents by id.
+
+        Args:
+            ids: List of ids to search for.
+            extract_images: Whether to extract the stored documents into image documents.
+
+        Returns:
+            A list of matching documents, where the `inputs` is a list of stored documents
+            or image documents.
+        """
+        return await run_in_executor(None, self.get_by_ids, ids, extract_images=extract_images)
+
     def wait_for_indexing(self, timeout: int = TIMEOUT, interval: int = INTERVAL) -> None:
         """Wait for the search index to update to account for newly added embeddings."""
         n_docs = self._coll.count_documents({})
@@ -306,6 +438,12 @@ def wait_for_indexing(self, timeout: int = TIMEOUT, interval: int = INTERVAL) ->
 
         raise TimeoutError(f"Failed to embed, insert, and index texts in {timeout}s.")
 
+    async def await_for_indexing(self, timeout: int = TIMEOUT, interval: int = INTERVAL) -> None:
+        """Wait for the search index to update to account for newly added embeddings."""
+        return await run_in_executor(
+            None, self.wait_for_indexing, timeout=timeout, interval=interval
+        )
+
     def similarity_search(
         self,
         query: str,
@@ -379,6 +517,53 @@ def similarity_search(
             docs.append(res)
         return docs
 
+    async def asimilarity_search(
+        self,
+        query: str,
+        k: int = 4,
+        pre_filter: dict[str, Any] | None = None,
+        post_filter_pipeline: list[dict[str, Any]] | None = None,
+        oversampling_factor: int = 10,
+        include_scores: bool = False,
+        include_embeddings: bool = False,
+        extract_images: bool = False,
+        **kwargs: Any,
+    ) -> list[dict[str, Any]]:  # noqa: E501
+        """Return documents most similar to the given query.
+
+        Args:
+            query: Input text of semantic query.
+            k: The number of documents to return. Defaults to 4.
+            pre_filter: List of MQL match expressions comparing an indexed field.
+            post_filter_pipeline: (Optional) Pipeline of MongoDB aggregation stages
+                to filter/process results after $vectorSearch.
+            oversampling_factor: Multiple of k used when generating number of candidates
+                at each step in the HNSW Vector Search.
+            include_scores: If True, the query score of each result
+                will be included in metadata.
+            include_embeddings: If True, the embedding vector of each result
+                will be included in metadata.
+            extract_images: If True, the stored documents will be converted to image documents.
+            kwargs: Additional arguments are specific to the search_type
+
+        Returns:
+            List of documents most similar to the query and their scores, where the `inputs`
+            is a list of stored documents or image documents.
+        """
+        return await run_in_executor(
+            None,
+            self.similarity_search,
+            query,
+            k=k,
+            pre_filter=pre_filter,
+            post_filter_pipeline=post_filter_pipeline,
+            oversampling_factor=oversampling_factor,
+            include_scores=include_scores,
+            include_embeddings=include_embeddings,
+            extract_images=extract_images,
+            **kwargs,
+        )
+
     def _expand_doc(self, obj: dict[str, Any], extract_images: bool = True) -> dict[str, Any]:
         for idx, inp in enumerate(list(obj["inputs"])):
             if inp["type"] == DocumentType.storage:
diff --git a/pymongo_voyageai/storage.py b/pymongo_voyageai/storage.py
@@ -82,3 +82,31 @@ def delete_image(self, document: StoredDocument) -> None:
 
     def close(self) -> None:
         self.client.close()
+
+
+class MemoryStorage(ObjectStorage):
+    """An in-memory object store."""
+
+    def __init__(self) -> None:
+        self.root_location = "foo"
+        self.storage: dict[str, ImageDocument] = dict()
+
+    def save_image(self, image: ImageDocument) -> StoredDocument:
+        object_name = str(ObjectId())
+        self.storage[object_name] = image
+        return StoredDocument(
+            root_location=self.root_location,
+            name=image.name,
+            object_name=object_name,
+            source_url=image.source_url,
+            page_number=image.page_number,
+        )
+
+    def load_image(self, document: StoredDocument) -> ImageDocument:
+        return self.storage[document.object_name]
+
+    def delete_image(self, document: StoredDocument) -> None:
+        self.storage.pop(document.object_name, None)
+
+    def close(self):
+        pass
diff --git a/pyproject.toml b/pyproject.toml
@@ -51,6 +51,7 @@ dev = [
     "pyarrow>=19.0.1",
     "pre-commit>=4.2.0",
     "autodoc-pydantic>=2.2.0",
+    "pytest-asyncio>=0.26.0",
 ]
 
 
diff --git a/tests/test_client_integration.py b/tests/test_client_integration.py
@@ -3,50 +3,25 @@
 
 import numpy as np
 import pytest
-from bson import ObjectId
 
-from pymongo_voyageai import ImageDocument, PyMongoVoyageAI, StoredDocument
-from pymongo_voyageai.storage import ImageStorage, S3Storage
+from pymongo_voyageai import PyMongoVoyageAI
 
 if "VOYAGEAI_API_KEY" not in os.environ:
     pytest.skip("Requires VoyageAI API Key.", allow_module_level=True)
 
+if "S3_BUCKET_NAME" not in os.environ:
+    pytest.skip("Requires VoyageAI API Key.", allow_module_level=True)
 
 # mypy: disable_error_code="no-untyped-def"
-class MemoryStorage(ImageStorage):
-    def __init__(self) -> None:
-        self.root_location = "foo"
-        self.storage: dict[str, ImageDocument] = dict()
-
-    def save_image(self, image: ImageDocument) -> StoredDocument:
-        object_name = str(ObjectId())
-        self.storage[object_name] = image
-        return StoredDocument(
-            root_location=self.root_location,
-            name=image.name,
-            object_name=object_name,
-            source_url=image.source_url,
-            page_number=image.page_number,
-        )
-
-    def load_image(self, document: StoredDocument) -> ImageDocument:
-        return self.storage[document.object_name]
-
-    def delete_image(self, document: StoredDocument) -> None:
-        del self.storage[document.object_name]
 
 
 @pytest.fixture
 def client() -> Generator[PyMongoVoyageAI, None, None]:
     conn_str = os.environ.get("MONGODB_URI", "mongodb://127.0.0.1:27017?directConnection=true")
-    if "S3_BUCKET" in os.environ:
-        storage_object = S3Storage(os.environ["S3_BUCKET"])
-    else:
-        storage_object = MemoryStorage()  # type:ignore[assignment]
     client = PyMongoVoyageAI(
         voyageai_api_key=os.environ["VOYAGEAI_API_KEY"],
+        s3_bucket_name=os.environ["S3_BUCKET_NAME"],
         mongo_connection_string=conn_str,
-        storage_object=storage_object,
         collection_name="test",
         database_name="pymongo_voyageai_test_db",
     )
@@ -93,8 +68,25 @@ def test_pdf_pages(client: PyMongoVoyageAI):
     images = client.url_to_images(url)
     resp = client.add_documents(images)
     client.wait_for_indexing()
-    data = client.similarity_search(query, extract_images=True)
+    data = client.similarity_search(query, extract_images=False)
     # We expect page 5 to be the best match.
     assert data[0]["inputs"][0].page_number == 5
     assert len(client.get_by_ids([d["_id"] for d in resp])) == len(resp)
     client.delete_by_ids([d["_id"] for d in resp])
+
+
+@pytest.mark.asyncio
+async def test_image_set_async(client: PyMongoVoyageAI):
+    url = "hf://datasets/princeton-nlp/CharXiv/val.parquet"
+    documents = await client.aurl_to_images(url, image_column="image", end=3)
+    resp = await client.aadd_documents(documents)
+    await client.await_for_indexing()
+    query = "3D loss landscapes for different training strategies"
+    data = await client.asimilarity_search(query, extract_images=True)
+    # The best match should be the third input image.
+    assert data[0]["inputs"][0].image.tobytes() == documents[2].image.tobytes()
+    ids = await client.aget_by_ids([d["_id"] for d in resp])
+    assert len(ids) == len(resp)
+    await client.adelete_by_ids([d["_id"] for d in resp])
+    await client.adelete_many({})
+    await client.aclose()
diff --git a/tests/test_client_unit.py b/tests/test_client_unit.py
diff --git a/uv.lock b/uv.lock

Original file line number	Diff line number	Diff line change
`@@ -51,6 +51,7 @@ dev = [`
`51`	`51`	`"pyarrow>=19.0.1",`
`52`	`52`	`"pre-commit>=4.2.0",`
`53`	`53`	`"autodoc-pydantic>=2.2.0",`
	`54`	`+ "pytest-asyncio>=0.26.0",`
`54`	`55`	`]`
`55`	`56`
`56`	`57`