Update documentation

blink1073 · blink1073 · commit 8d62b7cc797c · 2025-04-21T12:04:54.000-05:00
diff --git a/docs/api.rst b/docs/api.rst
@@ -0,0 +1,30 @@
+API
+===
+
+Classes
+-------
+
+.. autoclass:: pymongo_voyageai.PyMongoVoyageAI
+   :members:
+
+   .. automethod:: __init__
+
+.. autoclass:: pymongo_voyageai.ImageDocument
+
+.. autoclass:: pymongo_voyageai.TextDocument
+
+.. autoclass:: pymongo_voyageai.StoredDocument
+
+.. autoclass:: pymongo_voyageai.S3Storage
+
+    .. automethod:: __init__
+
+.. autoclass:: pymongo_voyageai.MemoryStorage
+   :members:
+
+.. autoclass:: pymongo_voyageai.ObjectStorage
+   :members:
+
+.. autoclass:: pymongo_voyageai.DocumentType
+
+.. autoclass:: pymongo_voyageai.Document
diff --git a/docs/examples.rst b/docs/examples.rst
@@ -0,0 +1,127 @@
+Examples
+========
+
+Querying Against PDF Pages
+--------------------------
+
+.. code-block:: python
+
+    import os
+    from pymongo_voyageai import PyMongoVoyageAI
+    client = PyMongoVoyageAI(
+        voyageai_api_key=os.environ["VOYAGEAI_API_KEY"],
+        s3_bucket_name=os.environ["S3_BUCKET_NAME"],
+        mongo_connection_string=os.environ["MONGODB_URI"],
+        collection_name="test",
+        database_name="test_db",
+    )
+
+    query = "The consequences of a dictator's peace"
+    url = "https://www.fdrlibrary.org/documents/356632/390886/readingcopy.pdf"
+    images = client.url_to_images(url)
+    resp = client.add_documents(images)
+    client.wait_for_indexing()
+    data = client.similarity_search(query, extract_images=False)
+
+    # We expect page 5 to be the best match.
+    assert data[0]["inputs"][0].page_number == 5
+    assert len(client.get_by_ids([d["_id"] for d in resp])) == len(resp)
+    client.delete_by_ids([d["_id"] for d in resp])
+    client.close()
+
+
+Querying Against Parquet Data
+-----------------------------
+
+.. code-block:: python
+
+    import os
+    from pymongo_voyageai import PyMongoVoyageAI
+    client = PyMongoVoyageAI(
+        voyageai_api_key=os.environ["VOYAGEAI_API_KEY"],
+        s3_bucket_name=os.environ["S3_BUCKET_NAME"],
+        mongo_connection_string=os.environ["MONGODB_URI"],
+        collection_name="test",
+        database_name="test_db",
+    )
+
+    url = "hf://datasets/princeton-nlp/CharXiv/val.parquet"
+    documents = client.url_to_images(url, image_column="image", end=3)
+    resp = client.add_documents(documents)
+    client.wait_for_indexing()
+    query = "3D loss landscapes for different training strategies"
+    data = client.similarity_search(query, extract_images=True)
+
+    # The best match should be the third input image.
+    assert data[0]["inputs"][0].image.tobytes() == documents[2].image.tobytes()
+    client.delete_by_ids([d["_id"] for d in resp])
+    client.close()
+
+
+Combining Text and Images
+-------------------------
+
+.. code-block:: python
+
+    import os
+    from pymongo_voyageai import PyMongoVoyageAI
+    client = PyMongoVoyageAI(
+        voyageai_api_key=os.environ["VOYAGEAI_API_KEY"],
+        s3_bucket_name=os.environ["S3_BUCKET_NAME"],
+        mongo_connection_string=os.environ["MONGODB_URI"],
+        collection_name="test",
+        database_name="test_db",
+    )
+
+    text = "Voyage AI makes best-in-class embedding models and rerankers."
+    images = client.url_to_images("https://www.voyageai.com/header-bg.png")
+    image = images[0].image
+    resp = client.add_documents(
+        [
+            [text],  # 0. single text
+            [image],  # 1. single image
+            [text, image],  # 2. text + image
+            [image, text],  # 3. image + text
+        ]
+    )
+    client.wait_for_indexing()
+
+    # The interleaved inputs should have different but similar embeddings.
+    embeddings = [d["embedding"] for d in resp]
+    assert embeddings[2] != embeddings[3]
+    assert np.dot(embeddings[2], embeddings[3]) > 0.95
+    client.delete_by_ids([d["_id"] for d in resp])
+    client.close()
+
+
+Using Async API
+---------------
+
+.. code-block:: python
+
+    import os
+    from pymongo_voyageai import PyMongoVoyageAI
+    client = PyMongoVoyageAI(
+        voyageai_api_key=os.environ["VOYAGEAI_API_KEY"],
+        s3_bucket_name=os.environ["S3_BUCKET_NAME"],
+        mongo_connection_string=os.environ["MONGODB_URI"],
+        collection_name="test",
+        database_name="test_db",
+    )
+
+    url = "hf://datasets/princeton-nlp/CharXiv/val.parquet"
+    documents = await client.aurl_to_images(url, image_column="image", end=3)
+    resp = await client.aadd_documents(documents)
+    await client.await_for_indexing()
+
+    query = "3D loss landscapes for different training strategies"
+    data = await client.asimilarity_search(query, extract_images=True)
+
+    # The best match should be the third input image.
+    assert data[0]["inputs"][0].image.tobytes() == documents[2].image.tobytes()
+    ids = await client.aget_by_ids([d["_id"] for d in resp])
+    assert len(ids) == len(resp)
+
+    await client.adelete_by_ids([d["_id"] for d in resp])
+    await client.adelete_many({})
+    await client.aclose()
diff --git a/docs/index.rst b/docs/index.rst
@@ -46,30 +46,8 @@ Then, perform the multi-modal embedding:
     client.close()
 
 
-API
-===
+.. toctree::
+ :maxdepth: 2
 
-Classes
--------
-
-.. autoclass:: pymongo_voyageai.PyMongoVoyageAI
-   :members:
-
-   .. automethod:: __init__
-
-.. autoclass:: pymongo_voyageai.DocumentType
-
-.. autoclass:: pymongo_voyageai.Document
-
-.. autoclass:: pymongo_voyageai.ImageDocument
-
-.. autoclass:: pymongo_voyageai.TextDocument
-
-.. autoclass:: pymongo_voyageai.StoredDocument
-
-.. autoclass:: pymongo_voyageai.ObjectStorage
-   :members:
-
-.. autoclass:: pymongo_voyageai.S3Storage
-
-    .. automethod:: __init__
+ api
+ examples
diff --git a/pymongo_voyageai/client.py b/pymongo_voyageai/client.py
@@ -23,6 +23,108 @@
 
 
 class PyMongoVoyageAI:
+    """MongoDB and VoyageAI integration for multimodal embeddings.
+
+    PyMongoVoyageAI performs data operations on
+    text, images, embeddings and arbitrary data.
+    It provides Vector Search
+    based on similarity of embedding vectors following the
+    Hierarchical Navigable Small Worlds (HNSW) algorithm.
+
+    Setup:
+        * Set up a MongoDB Atlas cluster. The free tier M0 will allow you to start.
+          Search Indexes are only available on Atlas, the fully managed cloud service,
+          not the self-managed MongoDB.
+          Follow `this guide <https://www.mongodb.com/basics/mongodb-atlas-tutorial>`_.
+
+        * Create a Collection and a Vector Search Index.  The procedure is described
+          `here <https://www.mongodb.com/docs/atlas/atlas-vector-search/create-index/#procedure>`_.
+          You can optionally supply a `dimensions` argument to programmatically create a Vector
+          Search Index.
+
+        * Set up your VoyageAI account on dash.voyageai.com.  You can either provide the
+          `voyageai_api_key` to the constructor or create a VoyageAI `Client` yourself and pass
+          it as `voyageai_client`.
+
+        * Set up an S3 bucket for storage.  Either provide the `s3_bucket_name` to use the default
+          AWS credentials or provide an instantiated S3 client to an `S3Storage` object and provide
+          that object as `storage_object`.  For local testing, you could instead pass a
+          `MemoryStorage` object.
+
+    Instantiate:
+        .. code-block:: python
+
+            import os
+            from pymongo import MongoClient
+            from pymongo_voyageai import PyMongoVoyageAI
+
+            client = PyMongoVoyageAI.from_connection_string(
+                connection_string=os.environ["MONGODB_ATLAS_CONNECTION_STRING"],
+                database_name="db_name",
+                collection_name="collection_name",
+                s3_bucket_name=os.environ["S3_BUCKET_NAME"],
+                voyageai_api_key=os.environ["VOYAGEAI_API_KEY"],
+            )
+
+    Add Documents:
+        .. code-block:: python
+
+            from pymongo_voyageai import TextDocument, ImageDocument
+
+            text = TextDocument(text="foo", metadata={"baz": "bar"})
+            images = client.url_to_images(
+                "https://www.fdrlibrary.org/documents/356632/390886/readingcopy.pdf"
+            )
+            documents = [text, images[0], images[1]]
+            ids = ["1", "2", "3"]
+            client.add_documents(documents=documents, ids=ids)
+
+    Delete Documents:
+        .. code-block:: python
+
+            client.delete_by_ids(["3"])
+
+    Search:
+        .. code-block:: python
+
+            results = client.similarity_search(query="thud", k=1)
+            for doc in results:
+                print(f"* {doc['id']} [{doc['inputs']}]")
+
+
+    Search with filter:
+        .. code-block:: python
+
+            results = client.similarity_search(query="thud", k=1, post_filter=[{"bar": "baz"}])
+            for doc in results:
+                print(f"* {doc['id']} [{doc['inputs']}]")
+
+    Search with score:
+        .. code-block:: python
+
+            results = client.similarity_search(query="qux", k=1, include_scores=True)
+
+            for doc in results:
+                print(f"* [SIM={doc['score']:.3f}] {doc['id']} [{doc['inputs']}]")
+
+    Async:
+        .. code-block:: python
+
+            # add documents
+            # await client.aadd_documents(documents=documents, ids=ids)
+
+            # delete documents
+            # await client.adelete_by_ids(["3"])
+
+            # search
+            # results = await client.asimilarity_search(query="thud", k=1)
+
+            # search with score
+            results = await client.asimilarity_search(query="qux", k=1, include_scores=True)
+            for doc in results:
+                print(f"* [SIM={doc['score']:.3f}] {doc['id']} [{doc['inputs']}]")
+    """
+
     def __init__(
         self,
         collection_name: str,