Skip to content

Commit 8ea4447

Browse files
committed
update example
1 parent 3c5a466 commit 8ea4447

File tree

2 files changed

+26
-2
lines changed

2 files changed

+26
-2
lines changed

pymongo_voyageai/utils.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from PIL import Image
66

77
from .document import ImageDocument
8-
from .storage import ObjectStorage
8+
from .storage import ObjectStorage, S3Storage
99

1010
try:
1111
import fitz # type:ignore[import-untyped]
@@ -97,6 +97,11 @@ def url_to_images(
9797
# For parquet files that are not loaded by the storage object, let pandas handle the download.
9898
if source is None and url.endswith(".parquet"):
9999
source = url
100+
# For s3 files that are not loaded by the storage object, create a temp S3Storage object.
101+
if source is None and url.startswith("s3://"):
102+
storage = S3Storage("")
103+
source = storage.read_from_url(url)
104+
storage.close()
100105
# For all other files, use the native download.
101106
if source is None:
102107
with urllib.request.urlopen(url) as response:

tests/test_client_integration.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import pytest
77
from bson import ObjectId
88

9-
from pymongo_voyageai import PyMongoVoyageAI
9+
from pymongo_voyageai import MemoryStorage, PyMongoVoyageAI
1010

1111
if "VOYAGEAI_API_KEY" not in os.environ:
1212
pytest.skip("Requires VoyageAI API Key.", allow_module_level=True)
@@ -95,6 +95,25 @@ def test_pdf_pages_storage(client: PyMongoVoyageAI):
9595
storage.client.delete_object(Bucket=storage.root_location, Key=object_name)
9696

9797

98+
def test_pdf_pages_custom_storage(client: PyMongoVoyageAI):
    """Index a PDF referenced by an ``s3://`` URL while the client uses ``MemoryStorage``.

    The PDF is downloaded over HTTPS and uploaded through the client's original
    storage object, then the client's storage is swapped for a ``MemoryStorage``
    before ``url_to_images`` is called with the ``s3://`` URL. The test then
    adds the resulting images, searches for them, and checks that every added
    document can be fetched by id.

    Cleanup (deleting the indexed documents, restoring the original storage,
    and removing the uploaded object) runs in ``finally`` blocks so a failing
    assertion does not leave state behind for other tests.
    """
    query = "The consequences of a dictator's peace"
    url = "https://www.fdrlibrary.org/documents/356632/390886/readingcopy.pdf"
    # Keep a handle on the original storage: it is used for the upload, for the
    # final object deletion, and to restore the client afterwards.
    # NOTE(review): presumably an S3-backed storage (it exposes ``client`` and
    # ``root_location``) - confirm against the ``client`` fixture.
    storage = client._storage
    object_name = f"{ObjectId()}.pdf"
    with urllib.request.urlopen(url) as response:
        storage.client.upload_fileobj(response, storage.root_location, object_name)
    try:
        url = f"s3://{storage.root_location}/{object_name}"
        # Swap in a storage that did not load the object itself, so the
        # ``s3://`` URL has to be resolved by ``url_to_images``.
        client._storage = MemoryStorage()
        images = client.url_to_images(url)
        resp = client.add_documents(images)
        try:
            client.wait_for_indexing()
            data = client.similarity_search(query, extract_images=True)
            assert len(data[0]["inputs"][0].image.tobytes()) > 0
            assert len(client.get_by_ids([d["_id"] for d in resp])) == len(resp)
        finally:
            # Remove the indexed documents even when an assertion above fails.
            client.delete_by_ids([d["_id"] for d in resp])
    finally:
        # Undo the storage swap so later users of the same client are unaffected,
        # then delete the uploaded object.
        client._storage = storage
        storage.client.delete_object(Bucket=storage.root_location, Key=object_name)
115+
116+
98117
@pytest.mark.asyncio
99118
async def test_image_set_async(client: PyMongoVoyageAI):
100119
url = "hf://datasets/princeton-nlp/CharXiv/val.parquet"

0 commit comments

Comments
 (0)