Merge pull request #2 from blink1073/read-from-storage

blink1073 · web-flow · commit 92954cca6083 · 2025-04-30T14:22:26.000-05:00
INTPYTHON-614 Allow data in the object store to be loaded by URL
diff --git a/docs/examples.rst b/docs/examples.rst
@@ -97,6 +97,38 @@ Combining Text and Images
     client.close()
 
 
+Loading Data from S3
+--------------------
+
+If you already have data stored in S3, you can pass an ``s3://`` URL to ``url_to_images`` to load the image(s):
+
+.. code-block:: python
+
+    import os
+    from pymongo_voyageai import PyMongoVoyageAI
+
+    client = PyMongoVoyageAI(
+        voyageai_api_key=os.environ["VOYAGEAI_API_KEY"],
+        s3_bucket_name=os.environ["S3_BUCKET_NAME"],
+        mongo_connection_string=os.environ["MONGODB_URI"],
+        collection_name="test",
+        database_name="test_db",
+    )
+
+    query = "The consequences of a dictator's peace"
+    url = "s3://my-bucket-name/readingcopy.pdf"
+    images = client.url_to_images(url)
+    resp = client.add_documents(images)
+    client.wait_for_indexing()
+    data = client.similarity_search(query, extract_images=True)
+
+    # We expect page 5 to be the best match.
+    assert data[0]["inputs"][0].page_number == 5
+    assert len(client.get_by_ids([d["_id"] for d in resp])) == len(resp)
+    client.delete_by_ids([d["_id"] for d in resp])
+    client.close()
+
+
 Using Async API
 ---------------
 
diff --git a/pymongo_voyageai/client.py b/pymongo_voyageai/client.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import io
 import logging
 from collections.abc import Mapping, Sequence
 from time import monotonic, sleep
@@ -206,7 +207,19 @@ def image_to_storage(self, document: ImageDocument | Image.Image) -> StoredDocum
         """
         if isinstance(document, Image.Image):
             document = ImageDocument(image=document)
-        return self._storage.save_image(document)
+        object_name = f"{ObjectId()}.png"
+        fd = io.BytesIO()
+        document.image.save(fd, "png")
+        fd.seek(0)
+        self._storage.save_data(fd, object_name)
+        return StoredDocument(
+            root_location=self._storage.root_location,
+            object_name=object_name,
+            page_number=document.page_number,
+            source_url=document.source_url,
+            name=document.name,
+            metadata=document.metadata,
+        )
 
     async def aimage_to_storage(self, document: ImageDocument | Image.Image) -> StoredDocument:
         """Convert an image to a stored document.
@@ -232,7 +245,15 @@ def storage_to_image(self, document: StoredDocument | str) -> ImageDocument:
             document = StoredDocument(
                 root_location=self._storage.root_location, object_name=document
             )
-        return self._storage.load_image(document=document)
+        buffer = self._storage.read_data(document.object_name)
+        image = Image.open(buffer)
+        return ImageDocument(
+            image=image,
+            source_url=document.source_url,
+            page_number=document.page_number,
+            metadata=document.metadata,
+            name=document.name,
+        )
 
     async def astorage_to_image(self, document: StoredDocument | str) -> ImageDocument:
         """Convert a stored document to an image document.
@@ -267,7 +288,13 @@ def url_to_images(
             A list of image document objects.
         """
         return url_to_images(
-            url, metadata=metadata, start=start, end=end, image_column=image_column, **kwargs
+            url,
+            storage=self._storage,
+            metadata=metadata,
+            start=start,
+            end=end,
+            image_column=image_column,
+            **kwargs,
         )
 
     async def aurl_to_images(
@@ -464,7 +491,7 @@ def delete_many(
                 self._expand_doc(obj, False)
                 for inp in obj["inputs"]:
                     if isinstance(inp, StoredDocument):
-                        self._storage.delete_image(inp)
+                        self._storage.delete_data(inp.object_name)
         return self._coll.delete_many(filter=filter, **kwargs).acknowledged
 
     async def adelete_many(
diff --git a/pymongo_voyageai/storage.py b/pymongo_voyageai/storage.py
@@ -2,38 +2,43 @@
 
 import boto3  # type:ignore[import-untyped]
 import botocore  # type:ignore[import-untyped]
-from bson import ObjectId
-from PIL import Image
-
-from .document import ImageDocument, StoredDocument
 
 
 class ObjectStorage:
-    """A class used store image documents."""
+    """A class used to store binary data."""
 
     root_location: str
-    """The root location to use in the object store."""
+    """The default root location to use in the object store."""
+
+    url_prefixes: list[str] | None
+    """The url prefixes used by the object store, for reading data from a url."""
 
-    def save_image(self, image: ImageDocument) -> StoredDocument:
-        """Save an image document to the object store."""
+    def save_data(self, data: io.BytesIO, object_name: str) -> None:
+        """Save data to the object store."""
         raise NotImplementedError
 
-    def load_image(self, document: StoredDocument) -> ImageDocument:
-        """Load an image document from the object store."""
+    def read_data(self, object_name: str) -> io.BytesIO:
+        """Read data from the object store."""
         raise NotImplementedError
 
-    def delete_image(self, document: StoredDocument) -> None:
-        """Remove an image document from the object store."""
+    def load_url(self, url: str) -> io.BytesIO:
+        """Load data from a url."""
         raise NotImplementedError
 
-    def close(self) -> None:
-        """Close the object store."""
+    def delete_data(self, object_name: str) -> None:
+        """Delete data from the object store."""
         raise NotImplementedError
 
+    def close(self):
+        """Close the object store."""
+        pass
+
 
 class S3Storage(ObjectStorage):
     """An object store using an S3 bucket."""
 
+    url_prefixes = ["s3://"]
+
     def __init__(
         self,
         bucket_name: str,
@@ -50,35 +55,26 @@ def __init__(
         self.client = client or boto3.client("s3", region_name=region_name)
         self.root_location = bucket_name
 
-    def save_image(self, image: ImageDocument) -> StoredDocument:
-        object_name = str(ObjectId())
-        fd = io.BytesIO()
-        image.image.save(fd, "png")
-        fd.seek(0)
-        self.client.upload_fileobj(fd, self.root_location, object_name)
-        return StoredDocument(
-            root_location=self.root_location,
-            object_name=object_name,
-            page_number=image.page_number,
-            source_url=image.source_url,
-            name=image.name,
-            metadata=image.metadata,
-        )
-
-    def load_image(self, document: StoredDocument) -> ImageDocument:
+    def save_data(self, data: io.BytesIO, object_name: str) -> None:
+        """Save data to the object store."""
+        self.client.upload_fileobj(data, self.root_location, object_name)
+
+    def read_data(self, object_name: str) -> io.BytesIO:
+        """Read data from the object store."""
+        buffer = io.BytesIO()
+        self.client.download_fileobj(self.root_location, object_name, buffer)
+        return buffer
+
+    def load_url(self, url: str) -> io.BytesIO:
+        """Load data from a url."""
+        bucket, _, object_name = url.replace("s3://", "").partition("/")
         buffer = io.BytesIO()
-        self.client.download_fileobj(document.root_location, document.object_name, buffer)
-        image = Image.open(buffer)
-        return ImageDocument(
-            image=image,
-            source_url=document.source_url,
-            page_number=document.page_number,
-            metadata=document.metadata,
-            name=document.name,
-        )
-
-    def delete_image(self, document: StoredDocument) -> None:
-        self.client.delete_object(Bucket=document.root_location, Key=document.object_name)
+        self.client.download_fileobj(bucket, object_name, buffer)
+        return buffer
+
+    def delete_data(self, object_name: str) -> None:
+        """Delete data from the object store."""
+        self.client.delete_object(Bucket=self.root_location, Key=object_name)
 
     def close(self) -> None:
         self.client.close()
@@ -87,26 +83,25 @@ def close(self) -> None:
 class MemoryStorage(ObjectStorage):
     """An in-memory object store"""
 
+    url_prefixes = ["file://"]
+
     def __init__(self) -> None:
         self.root_location = "foo"
-        self.storage: dict[str, ImageDocument] = dict()
+        self.storage: dict[str, io.BytesIO] = dict()
 
-    def save_image(self, image: ImageDocument) -> StoredDocument:
-        object_name = str(ObjectId())
-        self.storage[object_name] = image
-        return StoredDocument(
-            root_location=self.root_location,
-            name=image.name,
-            object_name=object_name,
-            source_url=image.source_url,
-            page_number=image.page_number,
-        )
+    def save_data(self, data: io.BytesIO, object_name: str) -> None:
+        """Save data to the object store."""
+        self.storage[object_name] = data
 
-    def load_image(self, document: StoredDocument) -> ImageDocument:
-        return self.storage[document.object_name]
+    def read_data(self, object_name: str) -> io.BytesIO:
+        """Read data from the object store."""
+        return self.storage[object_name]
 
-    def delete_image(self, document: StoredDocument) -> None:
-        self.storage.pop(document.object_name, None)
+    def load_url(self, url: str) -> io.BytesIO:
+        """Load data from a url."""
+        with open(url.replace("file://", ""), "rb") as fid:
+            return io.BytesIO(fid.read())
 
-    def close(self):
-        pass
+    def delete_data(self, object_name: str) -> None:
+        """Delete data from the object store."""
+        self.storage.pop(object_name, None)
diff --git a/pymongo_voyageai/utils.py b/pymongo_voyageai/utils.py
@@ -5,6 +5,7 @@
 from PIL import Image
 
 from .document import ImageDocument
+from .storage import ObjectStorage, S3Storage
 
 try:
     import fitz  # type:ignore[import-untyped]
@@ -17,13 +18,13 @@
 INTERVAL = 1
 
 
-def pdf_url_to_images(
-    url: str, start: int | None = None, end: int | None = None, zoom: float = 1.0
+def pdf_data_to_images(
+    pdf_stream: io.BytesIO, start: int | None = None, end: int | None = None, zoom: float = 1.0
 ) -> list[Image.Image]:
-    """Extract images from a pdf url.
+    """Extract images from a pdf byte stream.
 
     Args:
-        url: The url to load the images from.
+        pdf_stream: The BytesIO object to load the images from.
         start: The start frame to use for the images.
         end: The end frame to use for the images.
         zoom: The zoom factor to apply to the images.
@@ -33,14 +34,8 @@ def pdf_url_to_images(
     """
     if fitz is None:
         raise ValueError("pymongo-voyageai requires PyMuPDF to read pdf files") from None
-    # Ensure that the URL is valid
-    if not url.startswith("http") and url.endswith(".pdf"):
-        raise ValueError("Invalid URL")
 
     # Open the PDF from the in-memory byte stream
-    with urllib.request.urlopen(url) as response:
-        pdf_data = response.read()
-    pdf_stream = io.BytesIO(pdf_data)
     pdf = fitz.open(stream=pdf_stream, filetype="pdf")
 
     images = []
@@ -57,7 +52,6 @@ def pdf_url_to_images(
         # Convert pixmap to PIL Image
         img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
         images.append(img)
-    print("out of loop")
 
     # Close the document
     pdf.close()
@@ -67,6 +61,7 @@ def pdf_url_to_images(
 
 def url_to_images(
     url: str,
+    storage: ObjectStorage | None = None,
     metadata: dict[str, Any] | None = None,
     start: int = 0,
     end: int | None = None,
@@ -77,6 +72,7 @@ def url_to_images(
 
     Args:
         url: The url to load the images from.
+        storage: The storage object which can be used to load data from custom urls.
         metadata: A set of metadata to associate with the images.
         start: The start frame to use for the images.
         end: The end frame to use for the images.
@@ -90,14 +86,35 @@ def url_to_images(
     basename = url[i:]
     i = basename.rfind(".")
     name = basename[:i]
+
+    source = None
+    # Prefer to use our storage object to read the file data.
+    if storage and storage.url_prefixes:
+        for pattern in storage.url_prefixes:
+            if url.startswith(pattern):
+                source = storage.load_url(url)
+                break
+    # For parquet files that are not loaded by the storage object, let pandas handle the download.
+    if source is None and url.endswith(".parquet"):
+        source = url
+    # For s3 files that are not loaded by the storage object, create a temp S3Storage object.
+    if source is None and url.startswith("s3://"):
+        storage = S3Storage("")
+        source = storage.load_url(url)
+        storage.close()
+    # For all other files, use the native download.
+    if source is None:
+        with urllib.request.urlopen(url) as response:
+            source = io.BytesIO(response.read())
+
     if url.endswith(".parquet"):
         try:
             import pandas as pd
         except ImportError:
             raise ValueError("pymongo-voyageai requires pandas to read parquet files") from None
         if image_column is None:
             raise ValueError("Must supply and image field to read a parquet file")
-        column = pd.read_parquet(url, **kwargs)[image_column][start:end]
+        column = pd.read_parquet(source, **kwargs)[image_column][start:end]
         for idx, item in enumerate(column.tolist()):
             image = Image.open(io.BytesIO(item["bytes"]))
             images.append(
@@ -110,7 +127,7 @@ def url_to_images(
                 )
             )
     elif url.endswith(".pdf"):
-        for idx, img in enumerate(pdf_url_to_images(url, start=start, end=end, **kwargs)):
+        for idx, img in enumerate(pdf_data_to_images(source, start=start, end=end, **kwargs)):
             images.append(
                 ImageDocument(
                     image=img,
@@ -121,9 +138,7 @@ def url_to_images(
                 )
             )
     else:
-        with urllib.request.urlopen(url) as response:
-            image_data = response.read()
-        image = Image.open(io.BytesIO(image_data))
+        image = Image.open(source)
         if "transparency" in image.info and image.mode != "RGBA":
             image = image.convert("RGBA")
         images.append(ImageDocument(image=image, name=name, source_url=url, metadata=metadata))
diff --git a/pyproject.toml b/pyproject.toml
@@ -70,6 +70,7 @@ addopts = ["-ra", "--showlocals", "--strict-markers", "--strict-config"]
 xfail_strict = true
 filterwarnings = [
     "error",
+    "module:datetime.datetime.utcnow:DeprecationWarning", # from boto3
     "module:builtin type Swig:DeprecationWarning", # from pymupdf
     "module:builtin type swig:DeprecationWarning", # from pymupdf
 ]
diff --git a/tests/test_client_integration.py b/tests/test_client_integration.py
diff --git a/tests/test_client_unit.py b/tests/test_client_unit.py

Original file line number	Diff line number	Diff line change
`@@ -70,6 +70,7 @@ addopts = ["-ra", "--showlocals", "--strict-markers", "--strict-config"]`
`70`	`70`	`xfail_strict = true`
`71`	`71`	`filterwarnings = [`
`72`	`72`	`"error",`
	`73`	`+ "module:datetime.datetime.utcnow:DeprecationWarning", # from boto3`
`73`	`74`	`"module:builtin type Swig:DeprecationWarning", # from pymupdf`
`74`	`75`	`"module:builtin type swig:DeprecationWarning", # from pymupdf`
`75`	`76`	`]`