|
| 1 | +import logging |
| 2 | +from os import getenv |
| 3 | + |
| 4 | +import typer |
| 5 | +from azure.cosmos import CosmosClient, PartitionKey |
| 6 | +from dotenv import load_dotenv |
| 7 | +from langchain_community.document_loaders import PyMuPDFLoader |
| 8 | +from langchain_community.vectorstores.azure_cosmos_db_no_sql import AzureCosmosDBNoSqlVectorSearch |
| 9 | +from langchain_openai import AzureOpenAIEmbeddings |
| 10 | +from langchain_text_splitters import RecursiveCharacterTextSplitter |
| 11 | + |
# Load variables from a .env file at import time, before any command reads
# its configuration through getenv().
load_dotenv()

# Module-scoped logger shared by every command in this file.
logger = logging.getLogger(__name__)

# Typer application object; the commands below register on it via @app.command().
app = typer.Typer()
| 15 | + |
| 16 | + |
# https://python.langchain.com/docs/integrations/vectorstores/azure_cosmos_db_no_sql/
@app.command()
def insert_data(
    pdf_url: str = "https://arxiv.org/pdf/2303.08774.pdf",
    chunk_size: int = 2000,
    chunk_overlap: int = 0,
    verbose: bool = True,
):
    """Load a PDF, split it into chunks and store the embedded chunks in Azure Cosmos DB.

    The PDF is read with PyMuPDFLoader, split with RecursiveCharacterTextSplitter,
    and handed to AzureCosmosDBNoSqlVectorSearch.from_documents together with an
    AzureOpenAIEmbeddings instance. All connection settings come from environment
    variables.

    Args:
        pdf_url: Location of the PDF, passed to PyMuPDFLoader as ``file_path``.
        chunk_size: ``chunk_size`` given to the text splitter.
        chunk_overlap: ``chunk_overlap`` given to the text splitter.
        verbose: When true, configure root logging at DEBUG level.

    Raises:
        typer.Exit: With exit code 1 when a required Cosmos DB environment
            variable is unset or when the insert fails.
    """
    if verbose:
        logging.basicConfig(level=logging.DEBUG)

    # Fail fast with a clear message instead of an obscure client error raised
    # only after the PDF has already been loaded and split.
    required_env = (
        "AZURE_COSMOS_DB_CONNECTION_STRING",
        "AZURE_COSMOS_DB_DATABASE_NAME",
        "AZURE_COSMOS_DB_CONTAINER_NAME",
    )
    missing_env = [name for name in required_env if not getenv(name)]
    if missing_env:
        logger.error("missing required environment variables: %s", ", ".join(missing_env))
        raise typer.Exit(code=1)

    # Load the PDF
    loader = PyMuPDFLoader(file_path=pdf_url)
    data = loader.load()

    # Split the text into chunks
    docs = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_documents(data)

    try:
        # Insert the data into Azure Cosmos DB
        database_name = getenv("AZURE_COSMOS_DB_DATABASE_NAME")
        AzureCosmosDBNoSqlVectorSearch.from_documents(
            documents=docs,
            embedding=AzureOpenAIEmbeddings(
                api_key=getenv("AZURE_OPENAI_API_KEY"),
                api_version=getenv("AZURE_OPENAI_API_VERSION"),
                azure_endpoint=getenv("AZURE_OPENAI_ENDPOINT"),
                model=getenv("AZURE_OPENAI_EMBEDDING_MODEL"),
            ),
            cosmos_client=CosmosClient.from_connection_string(getenv("AZURE_COSMOS_DB_CONNECTION_STRING")),
            database_name=database_name,
            container_name=getenv("AZURE_COSMOS_DB_CONTAINER_NAME"),
            vector_embedding_policy={
                "vectorEmbeddings": [
                    {
                        "path": "/embedding",
                        "dataType": "float32",
                        "distanceFunction": "cosine",
                        # NOTE(review): presumably this must match the model named by
                        # AZURE_OPENAI_EMBEDDING_MODEL - confirm when changing models.
                        "dimensions": 3072,  # for text-embedding-3-large
                    }
                ]
            },
            indexing_policy={
                "indexingMode": "consistent",
                "includedPaths": [{"path": "/*"}],
                "excludedPaths": [{"path": '/"_etag"/?'}],
                "vectorIndexes": [{"path": "/embedding", "type": "quantizedFlat"}],
            },
            cosmos_container_properties={"partition_key": PartitionKey(path="/id")},
            cosmos_database_properties={"id": database_name},  # need to add this
        )
    except Exception as e:
        # logger.exception keeps the traceback; the non-zero exit status lets
        # shells and CI notice that the insert did not succeed.
        logger.exception("error: %s", e)
        raise typer.Exit(code=1) from e
| 73 | + |
| 74 | + |
@app.command()
def query_data(
    query: str = "What were the compute requirements for training GPT 4",
    verbose: bool = True,
):
    """Run a similarity search against the Azure Cosmos DB vector store and print the hits.

    Builds an AzureCosmosDBNoSqlVectorSearch from environment-variable settings,
    calls ``similarity_search`` with the query, and prints each result on its own
    line as ``Result <n>: <result>`` (numbered from 1).

    Args:
        query: Text passed to ``similarity_search``.
        verbose: When true, configure root logging at DEBUG level.

    Raises:
        typer.Exit: With exit code 1 when a required Cosmos DB environment
            variable is unset or when building the store or searching fails.
    """
    if verbose:
        logging.basicConfig(level=logging.DEBUG)

    # Fail fast with a clear message instead of an obscure client error.
    required_env = (
        "AZURE_COSMOS_DB_CONNECTION_STRING",
        "AZURE_COSMOS_DB_DATABASE_NAME",
        "AZURE_COSMOS_DB_CONTAINER_NAME",
    )
    missing_env = [name for name in required_env if not getenv(name)]
    if missing_env:
        logger.error("missing required environment variables: %s", ", ".join(missing_env))
        raise typer.Exit(code=1)

    database_name = getenv("AZURE_COSMOS_DB_DATABASE_NAME")
    try:
        # The store is built inside the try block so that setup failures are
        # reported through the logger rather than as an unhandled traceback.
        vector_search = AzureCosmosDBNoSqlVectorSearch(
            embedding=AzureOpenAIEmbeddings(
                api_key=getenv("AZURE_OPENAI_API_KEY"),
                api_version=getenv("AZURE_OPENAI_API_VERSION"),
                azure_endpoint=getenv("AZURE_OPENAI_ENDPOINT"),
                model=getenv("AZURE_OPENAI_EMBEDDING_MODEL"),
            ),
            cosmos_client=CosmosClient.from_connection_string(getenv("AZURE_COSMOS_DB_CONNECTION_STRING")),
            database_name=database_name,
            container_name=getenv("AZURE_COSMOS_DB_CONTAINER_NAME"),
            vector_embedding_policy={
                "vectorEmbeddings": [
                    {
                        "path": "/embedding",
                        "dataType": "float32",
                        "distanceFunction": "cosine",
                        # NOTE(review): presumably this must match the model named by
                        # AZURE_OPENAI_EMBEDDING_MODEL - confirm when changing models.
                        "dimensions": 3072,  # for text-embedding-3-large
                    }
                ]
            },
            indexing_policy={
                "indexingMode": "consistent",
                "includedPaths": [{"path": "/*"}],
                "excludedPaths": [{"path": '/"_etag"/?'}],
                "vectorIndexes": [{"path": "/embedding", "type": "quantizedFlat"}],
            },
            cosmos_container_properties={"partition_key": PartitionKey(path="/id")},
            cosmos_database_properties={"id": database_name},
        )

        results = vector_search.similarity_search(query=query)
        for position, result in enumerate(results, start=1):
            print(f"Result {position}: {result}")
    except Exception as e:
        # logger.exception keeps the traceback; the non-zero exit status lets
        # shells and CI notice that the query did not succeed.
        logger.exception("error: %s", e)
        raise typer.Exit(code=1) from e
| 120 | + |
| 121 | + |
if __name__ == "__main__":
    # The .env file is already loaded by the module-level load_dotenv() call,
    # so a second call here would be redundant; dispatch straight to the CLI.
    app()
0 commit comments