Skip to content

Commit 8d62b7c

Browse files
committed
Update documentation
1 parent d1e693e commit 8d62b7c

File tree

4 files changed

+263
-26
lines changed

4 files changed

+263
-26
lines changed

docs/api.rst

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
API
2+
===
3+
4+
Classes
5+
-------
6+
7+
.. autoclass:: pymongo_voyageai.PyMongoVoyageAI
8+
:members:
9+
10+
.. automethod:: __init__
11+
12+
.. autoclass:: pymongo_voyageai.ImageDocument
13+
14+
.. autoclass:: pymongo_voyageai.TextDocument
15+
16+
.. autoclass:: pymongo_voyageai.StoredDocument
17+
18+
.. autoclass:: pymongo_voyageai.S3Storage
19+
20+
.. automethod:: __init__
21+
22+
.. autoclass:: pymongo_voyageai.MemoryStorage
23+
:members:
24+
25+
.. autoclass:: pymongo_voyageai.ObjectStorage
26+
:members:
27+
28+
.. autoclass:: pymongo_voyageai.DocumentType
29+
30+
.. autoclass:: pymongo_voyageai.Document

docs/examples.rst

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
Examples
2+
========
3+
4+
Querying Against PDF Pages
5+
--------------------------
6+
7+
.. code-block:: python
8+
9+
import os
10+
from pymongo_voyageai import PyMongoVoyageAI
11+
client = PyMongoVoyageAI(
12+
voyageai_api_key=os.environ["VOYAGEAI_API_KEY"],
13+
s3_bucket_name=os.environ["S3_BUCKET_NAME"],
14+
mongo_connection_string=os.environ["MONGODB_URI"],
15+
collection_name="test",
16+
database_name="test_db",
17+
)
18+
19+
query = "The consequences of a dictator's peace"
20+
url = "https://www.fdrlibrary.org/documents/356632/390886/readingcopy.pdf"
21+
images = client.url_to_images(url)
22+
resp = client.add_documents(images)
23+
client.wait_for_indexing()
24+
data = client.similarity_search(query, extract_images=False)
25+
26+
# We expect page 5 to be the best match.
27+
assert data[0]["inputs"][0].page_number == 5
28+
assert len(client.get_by_ids([d["_id"] for d in resp])) == len(resp)
29+
client.delete_by_ids([d["_id"] for d in resp])
30+
client.close()
31+
32+
33+
Querying Against Parquet Data
34+
-----------------------------
35+
36+
.. code-block:: python
37+
38+
import os
39+
from pymongo_voyageai import PyMongoVoyageAI
40+
client = PyMongoVoyageAI(
41+
voyageai_api_key=os.environ["VOYAGEAI_API_KEY"],
42+
s3_bucket_name=os.environ["S3_BUCKET_NAME"],
43+
mongo_connection_string=os.environ["MONGODB_URI"],
44+
collection_name="test",
45+
database_name="test_db",
46+
)
47+
48+
url = "hf://datasets/princeton-nlp/CharXiv/val.parquet"
49+
documents = client.url_to_images(url, image_column="image", end=3)
50+
resp = client.add_documents(documents)
51+
client.wait_for_indexing()
52+
query = "3D loss landscapes for different training strategies"
53+
data = client.similarity_search(query, extract_images=True)
54+
55+
# The best match should be the third input image.
56+
assert data[0]["inputs"][0].image.tobytes() == documents[2].image.tobytes()
57+
client.delete_by_ids([d["_id"] for d in resp])
58+
client.close()
59+
60+
61+
Combining Text and Images
62+
-------------------------
63+
64+
.. code-block:: python
65+
66+
import os
import numpy as np
67+
from pymongo_voyageai import PyMongoVoyageAI
68+
client = PyMongoVoyageAI(
69+
voyageai_api_key=os.environ["VOYAGEAI_API_KEY"],
70+
s3_bucket_name=os.environ["S3_BUCKET_NAME"],
71+
mongo_connection_string=os.environ["MONGODB_URI"],
72+
collection_name="test",
73+
database_name="test_db",
74+
)
75+
76+
text = "Voyage AI makes best-in-class embedding models and rerankers."
77+
images = client.url_to_images("https://www.voyageai.com/header-bg.png")
78+
image = images[0].image
79+
resp = client.add_documents(
80+
[
81+
[text], # 0. single text
82+
[image], # 1. single image
83+
[text, image], # 2. text + image
84+
[image, text], # 3. image + text
85+
]
86+
)
87+
client.wait_for_indexing()
88+
89+
# The interleaved inputs should have different but similar embeddings.
90+
embeddings = [d["embedding"] for d in resp]
91+
assert embeddings[2] != embeddings[3]
92+
assert np.dot(embeddings[2], embeddings[3]) > 0.95
93+
client.delete_by_ids([d["_id"] for d in resp])
94+
client.close()
95+
96+
97+
Using Async API
98+
---------------
99+
100+
.. code-block:: python
101+
102+
import os
103+
from pymongo_voyageai import PyMongoVoyageAI
104+
client = PyMongoVoyageAI(
105+
voyageai_api_key=os.environ["VOYAGEAI_API_KEY"],
106+
s3_bucket_name=os.environ["S3_BUCKET_NAME"],
107+
mongo_connection_string=os.environ["MONGODB_URI"],
108+
collection_name="test",
109+
database_name="test_db",
110+
)
111+
112+
url = "hf://datasets/princeton-nlp/CharXiv/val.parquet"
113+
documents = await client.aurl_to_images(url, image_column="image", end=3)
114+
resp = await client.aadd_documents(documents)
115+
await client.await_for_indexing()
116+
117+
query = "3D loss landscapes for different training strategies"
118+
data = await client.asimilarity_search(query, extract_images=True)
119+
120+
# The best match should be the third input image.
121+
assert data[0]["inputs"][0].image.tobytes() == documents[2].image.tobytes()
122+
ids = await client.aget_by_ids([d["_id"] for d in resp])
123+
assert len(ids) == len(resp)
124+
125+
await client.adelete_by_ids([d["_id"] for d in resp])
126+
await client.adelete_many({})
127+
await client.aclose()

docs/index.rst

Lines changed: 4 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -46,30 +46,8 @@ Then, perform the multi-modal embedding:
4646
client.close()
4747
4848
49-
API
50-
===
49+
.. toctree::
50+
:maxdepth: 2
5151

52-
Classes
53-
-------
54-
55-
.. autoclass:: pymongo_voyageai.PyMongoVoyageAI
56-
:members:
57-
58-
.. automethod:: __init__
59-
60-
.. autoclass:: pymongo_voyageai.DocumentType
61-
62-
.. autoclass:: pymongo_voyageai.Document
63-
64-
.. autoclass:: pymongo_voyageai.ImageDocument
65-
66-
.. autoclass:: pymongo_voyageai.TextDocument
67-
68-
.. autoclass:: pymongo_voyageai.StoredDocument
69-
70-
.. autoclass:: pymongo_voyageai.ObjectStorage
71-
:members:
72-
73-
.. autoclass:: pymongo_voyageai.S3Storage
74-
75-
.. automethod:: __init__
52+
api
53+
examples

pymongo_voyageai/client.py

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,108 @@
2323

2424

2525
class PyMongoVoyageAI:
26+
"""MongoDB and VoyageAI integration for multimodal embeddings.
27+
28+
PyMongoVoyageAI performs data operations on
29+
text, images, embeddings and arbitrary data.
30+
PyMongoVoyageAI also provides Vector Search
31+
based on similarity of embedding vectors following the
32+
Hierarchical Navigable Small Worlds (HNSW) algorithm.
33+
34+
Setup:
35+
* Set up a MongoDB Atlas cluster. The free tier M0 will allow you to start.
36+
Search Indexes are only available on Atlas, the fully managed cloud service,
37+
not the self-managed MongoDB.
38+
Follow `this guide <https://www.mongodb.com/basics/mongodb-atlas-tutorial>`_.
39+
40+
* Create a Collection and a Vector Search Index. The procedure is described
41+
`here <https://www.mongodb.com/docs/atlas/atlas-vector-search/create-index/#procedure>`_.
42+
You can optionally supply a `dimensions` argument to programmatically create a Vector
43+
Search Index.
44+
45+
* Set up your VoyageAI account on dash.voyageai.com. You can either provide the
46+
`voyageai_api_key` to the constructor or create a VoyageAI `Client` yourself and pass
47+
it as `voyageai_client`.
48+
49+
* Set up an S3 bucket for storage. Either provide the `s3_bucket_name` to use the default
50+
AWS credentials or provide an instantiated S3 client to an `S3Storage` object and provide
51+
that object as `storage_object`. For local testing, you could instead pass a
52+
`MemoryStorage` object.
53+
54+
Instantiate:
55+
.. code-block:: python
56+
57+
import os
58+
from pymongo import MongoClient
59+
from pymongo_voyageai import PyMongoVoyageAI
60+
61+
client = PyMongoVoyageAI.from_connection_string(
62+
connection_string=os.environ["MONGODB_ATLAS_CONNECTION_STRING"],
63+
database_name="db_name",
64+
collection_name="collection_name",
65+
s3_bucket_name=os.environ["S3_BUCKET_NAME"],
66+
voyageai_api_key=os.environ["VOYAGEAI_API_KEY"],
67+
)
68+
69+
Add Documents:
70+
.. code-block:: python
71+
72+
from pymongo_voyageai import TextDocument, ImageDocument
73+
74+
text = TextDocument(text="foo", metadata={"baz": "bar"})
75+
images = client.url_to_images(
76+
"https://www.fdrlibrary.org/documents/356632/390886/readingcopy.pdf"
77+
)
78+
documents = [text, images[0], images[1]]
79+
ids = ["1", "2", "3"]
80+
client.add_documents(documents=documents, ids=ids)
81+
82+
Delete Documents:
83+
.. code-block:: python
84+
85+
client.delete_by_ids(["3"])
86+
87+
Search:
88+
.. code-block:: python
89+
90+
results = client.similarity_search(query="thud", k=1)
91+
for doc in results:
92+
print(f"* {doc['id']} [{doc['inputs']}]")
93+
94+
95+
Search with filter:
96+
.. code-block:: python
97+
98+
results = client.similarity_search(query="thud", k=1, post_filter=[{"bar": "baz"}])
99+
for doc in results:
100+
print(f"* {doc['id']} [{doc['inputs']}]")
101+
102+
Search with score:
103+
.. code-block:: python
104+
105+
results = client.similarity_search(query="qux", k=1, include_scores=True)
106+
107+
for doc in results:
108+
print(f"* [SIM={doc['score']:.3f}] {doc['id']} [{doc['inputs']}]")
109+
110+
Async:
111+
.. code-block:: python
112+
113+
# add documents
114+
# await client.aadd_documents(documents=documents, ids=ids)
115+
116+
# delete documents
117+
# await client.adelete_by_ids(["3"])
118+
119+
# search
120+
# results = await client.asimilarity_search(query="thud", k=1)
121+
122+
# search with score
123+
results = await client.asimilarity_search(query="qux", k=1, include_scores=True)
124+
for doc in results:
125+
print(f"* [SIM={doc['score']:.3f}] {doc['id']} [{doc['inputs']}]")
126+
"""
127+
26128
def __init__(
27129
self,
28130
collection_name: str,

0 commit comments

Comments
 (0)