|
| 1 | +Examples |
| 2 | +======== |
| 3 | + |
| 4 | +Querying Against PDF Pages |
| 5 | +-------------------------- |
| 6 | + |
| 7 | +.. code-block:: python |
| 8 | +
|
| 9 | + import os |
| 10 | + from pymongo_voyageai import PyMongoVoyageAI |
| 11 | + client = PyMongoVoyageAI( |
| 12 | + voyageai_api_key=os.environ["VOYAGEAI_API_KEY"], |
| 13 | + s3_bucket_name=os.environ["S3_BUCKET_NAME"], |
| 14 | + mongo_connection_string=os.environ["MONGODB_URI"], |
| 15 | + collection_name="test", |
| 16 | + database_name="test_db", |
| 17 | + ) |
| 18 | +
|
| 19 | + query = "The consequences of a dictator's peace" |
| 20 | + url = "https://www.fdrlibrary.org/documents/356632/390886/readingcopy.pdf" |
| 21 | + images = client.url_to_images(url) |
| 22 | + resp = client.add_documents(images) |
| 23 | + client.wait_for_indexing() |
| 24 | + data = client.similarity_search(query, extract_images=False) |
| 25 | +
|
| 26 | + # We expect page 5 to be the best match. |
| 27 | + assert data[0]["inputs"][0].page_number == 5 |
| 28 | + assert len(client.get_by_ids([d["_id"] for d in resp])) == len(resp) |
| 29 | + client.delete_by_ids([d["_id"] for d in resp]) |
| 30 | + client.close() |
| 31 | +
|
| 32 | +
|
| 33 | +Querying Against Parquet Data |
| 34 | +----------------------------- |
| 35 | + |
| 36 | +.. code-block:: python |
| 37 | +
|
| 38 | + import os |
| 39 | + from pymongo_voyageai import PyMongoVoyageAI |
| 40 | + client = PyMongoVoyageAI( |
| 41 | + voyageai_api_key=os.environ["VOYAGEAI_API_KEY"], |
| 42 | + s3_bucket_name=os.environ["S3_BUCKET_NAME"], |
| 43 | + mongo_connection_string=os.environ["MONGODB_URI"], |
| 44 | + collection_name="test", |
| 45 | + database_name="test_db", |
| 46 | + ) |
| 47 | +
|
| 48 | + url = "hf://datasets/princeton-nlp/CharXiv/val.parquet" |
| 49 | + documents = client.url_to_images(url, image_column="image", end=3) |
| 50 | + resp = client.add_documents(documents) |
| 51 | + client.wait_for_indexing() |
| 52 | + query = "3D loss landscapes for different training strategies" |
| 53 | + data = client.similarity_search(query, extract_images=True) |
| 54 | +
|
| 55 | + # The best match should be the third input image. |
| 56 | + assert data[0]["inputs"][0].image.tobytes() == documents[2].image.tobytes() |
| 57 | + client.delete_by_ids([d["_id"] for d in resp]) |
| 58 | + client.close() |
| 59 | +
|
| 60 | +
|
| 61 | +Combining Text and Images |
| 62 | +------------------------- |
| 63 | + |
| 64 | +.. code-block:: python |
| 65 | +
|
| 66 | + import os |
| | + import numpy as np |
| 67 | + from pymongo_voyageai import PyMongoVoyageAI |
| 68 | + client = PyMongoVoyageAI( |
| 69 | + voyageai_api_key=os.environ["VOYAGEAI_API_KEY"], |
| 70 | + s3_bucket_name=os.environ["S3_BUCKET_NAME"], |
| 71 | + mongo_connection_string=os.environ["MONGODB_URI"], |
| 72 | + collection_name="test", |
| 73 | + database_name="test_db", |
| 74 | + ) |
| 75 | +
|
| 76 | + text = "Voyage AI makes best-in-class embedding models and rerankers." |
| 77 | + images = client.url_to_images("https://www.voyageai.com/header-bg.png") |
| 78 | + image = images[0].image |
| 79 | + resp = client.add_documents( |
| 80 | + [ |
| 81 | + [text], # 0. single text |
| 82 | + [image], # 1. single image |
| 83 | + [text, image], # 2. text + image |
| 84 | + [image, text], # 3. image + text |
| 85 | + ] |
| 86 | + ) |
| 87 | + client.wait_for_indexing() |
| 88 | +
|
| 89 | + # The interleaved inputs should have different but similar embeddings. |
| 90 | + embeddings = [d["embedding"] for d in resp] |
| 91 | + assert embeddings[2] != embeddings[3] |
| 92 | + assert np.dot(embeddings[2], embeddings[3]) > 0.95 |
| 93 | + client.delete_by_ids([d["_id"] for d in resp]) |
| 94 | + client.close() |
| 95 | +
|
| 96 | +
|
| 97 | +Using Async API |
| 98 | +--------------- |
| 99 | + |
| 100 | +.. code-block:: python |
| 101 | +
|
| 102 | + import os |
| 103 | + from pymongo_voyageai import PyMongoVoyageAI |
| 104 | + client = PyMongoVoyageAI( |
| 105 | + voyageai_api_key=os.environ["VOYAGEAI_API_KEY"], |
| 106 | + s3_bucket_name=os.environ["S3_BUCKET_NAME"], |
| 107 | + mongo_connection_string=os.environ["MONGODB_URI"], |
| 108 | + collection_name="test", |
| 109 | + database_name="test_db", |
| 110 | + ) |
| 111 | +
|
| 112 | + url = "hf://datasets/princeton-nlp/CharXiv/val.parquet" |
| 113 | + documents = await client.aurl_to_images(url, image_column="image", end=3) |
| 114 | + resp = await client.aadd_documents(documents) |
| 115 | + await client.await_for_indexing() |
| 116 | +
|
| 117 | + query = "3D loss landscapes for different training strategies" |
| 118 | + data = await client.asimilarity_search(query, extract_images=True) |
| 119 | +
|
| 120 | + # The best match should be the third input image. |
| 121 | + assert data[0]["inputs"][0].image.tobytes() == documents[2].image.tobytes() |
| 122 | + ids = await client.aget_by_ids([d["_id"] for d in resp]) |
| 123 | + assert len(ids) == len(resp) |
| 124 | +
|
| 125 | + await client.adelete_by_ids([d["_id"] for d in resp]) |
| 126 | + await client.adelete_many({}) |
| 127 | + await client.aclose() |
0 commit comments