From 41840da643aafd568e4e8602b1c9239b1f559ff8 Mon Sep 17 00:00:00 2001 From: ks6088ts Date: Sun, 24 Nov 2024 06:46:57 +0900 Subject: [PATCH 1/3] Update Azure Cosmos DB integration and setup documentation --- apps/3_call_azure_cosmos_db/.env.template | 10 ++ apps/3_call_azure_cosmos_db/README.md | 21 ++++ apps/3_call_azure_cosmos_db/requirements.txt | 0 .../3_call_azure_cosmos_db/vector_database.py | 108 +++++++----------- 4 files changed, 74 insertions(+), 65 deletions(-) create mode 100644 apps/3_call_azure_cosmos_db/.env.template create mode 100644 apps/3_call_azure_cosmos_db/README.md create mode 100644 apps/3_call_azure_cosmos_db/requirements.txt diff --git a/apps/3_call_azure_cosmos_db/.env.template b/apps/3_call_azure_cosmos_db/.env.template new file mode 100644 index 0000000..4a93f16 --- /dev/null +++ b/apps/3_call_azure_cosmos_db/.env.template @@ -0,0 +1,10 @@ +# Azure OpenAI Service +AZURE_OPENAI_ENDPOINT="https://.openai.azure.com/" +AZURE_OPENAI_API_KEY="" +AZURE_OPENAI_API_VERSION="2024-10-21" +AZURE_OPENAI_EMBEDDING_MODEL="text-embedding-3-large" + +# Azure Cosmos DB +AZURE_COSMOS_DB_CONNECTION_STRING="AccountEndpoint=https://.documents.azure.com:443/;AccountKey=;" +AZURE_COSMOS_DB_DATABASE_NAME="workshop" +AZURE_COSMOS_DB_CONTAINER_NAME="chat" diff --git a/apps/3_call_azure_cosmos_db/README.md b/apps/3_call_azure_cosmos_db/README.md new file mode 100644 index 0000000..b178c5b --- /dev/null +++ b/apps/3_call_azure_cosmos_db/README.md @@ -0,0 +1,21 @@ +# Setup + +```shell +# Set up virtual environment +python -m venv .venv +source .venv/bin/activate + +# Install dependencies +pip install typer python-dotenv azure-cosmos langchain-openai langchain-community +# pip install -r requirements.txt + +python vector_database.py --help +``` + +# References + +- [Azure Cosmos DB No SQL](https://python.langchain.com/docs/integrations/vectorstores/azure_cosmos_db_no_sql/) +- [Learn Azure Azure Cosmos DB Vector database](https://learn.microsoft.com/azure/cosmos-db/vector-database) +- [AzureDataRetrievalAugmentedGenerationSamples/Python/CosmosDB-NoSQL_VectorSearch](https://github.com/microsoft/AzureDataRetrievalAugmentedGenerationSamples/tree/main/Python/CosmosDB-NoSQL_VectorSearch) +- [Azure Cosmos DB ベクター検索機能と RAG の実装ガイド](https://note.com/generativeai_new/n/n3fcb2e57d195) +- [Azure CosmosDB for NoSQL でベクトル検索しよう!!](https://zenn.dev/nomhiro/articles/cosmos-nosql-vector-search) diff --git a/apps/3_call_azure_cosmos_db/requirements.txt b/apps/3_call_azure_cosmos_db/requirements.txt new file mode 100644 index 0000000..e69de29 diff --git a/apps/3_call_azure_cosmos_db/vector_database.py b/apps/3_call_azure_cosmos_db/vector_database.py index e8c84b0..c3688f1 100644 --- a/apps/3_call_azure_cosmos_db/vector_database.py +++ b/apps/3_call_azure_cosmos_db/vector_database.py @@ -14,7 +14,45 @@ app = typer.Typer() -# https://python.langchain.com/docs/integrations/vectorstores/azure_cosmos_db_no_sql/ +def get_vector_embedding_policy(): + return { + "vectorEmbeddings": [ + { + "path": "/embedding", + "dataType": "float32", + "distanceFunction": "cosine", + "dimensions": 3072, # for text-embedding-3-large + } + ] + } + + +def get_indexing_policy(): + return { + "indexingMode": "consistent", + "includedPaths": [{"path": "/*"}], + "excludedPaths": [{"path": '/"_etag"/?'}], + "vectorIndexes": [{"path": "/embedding", "type": "quantizedFlat"}], + } + + +def get_azure_cosmos_db_no_sql_vector_search(): + return AzureCosmosDBNoSqlVectorSearch( + embedding=AzureOpenAIEmbeddings( + api_key=getenv("AZURE_OPENAI_API_KEY"), + api_version=getenv("AZURE_OPENAI_API_VERSION"), + azure_endpoint=getenv("AZURE_OPENAI_ENDPOINT"), + model=getenv("AZURE_OPENAI_EMBEDDING_MODEL"), + ), + cosmos_client=CosmosClient.from_connection_string(getenv("AZURE_COSMOS_DB_CONNECTION_STRING")), + database_name=getenv("AZURE_COSMOS_DB_DATABASE_NAME"), + container_name=getenv("AZURE_COSMOS_DB_CONTAINER_NAME"), + vector_embedding_policy=get_vector_embedding_policy(), + indexing_policy=get_indexing_policy(), + cosmos_container_properties={"partition_key": PartitionKey(path="/id")}, + cosmos_database_properties={"id": getenv("AZURE_COSMOS_DB_DATABASE_NAME")}, + ) + @app.command() def insert_data( pdf_url: str = "https://arxiv.org/pdf/2303.08774.pdf", @@ -36,38 +74,8 @@ def insert_data( ).split_documents(data) try: - # Insert the data into Azure Cosmos DB - database_name = getenv("AZURE_COSMOS_DB_DATABASE_NAME") - AzureCosmosDBNoSqlVectorSearch.from_documents( - documents=docs, - embedding=AzureOpenAIEmbeddings( - api_key=getenv("AZURE_OPENAI_API_KEY"), - api_version=getenv("AZURE_OPENAI_API_VERSION"), - azure_endpoint=getenv("AZURE_OPENAI_ENDPOINT"), - model=getenv("AZURE_OPENAI_EMBEDDING_MODEL"), - ), - cosmos_client=CosmosClient.from_connection_string(getenv("AZURE_COSMOS_DB_CONNECTION_STRING")), - database_name=database_name, - container_name=getenv("AZURE_COSMOS_DB_CONTAINER_NAME"), - vector_embedding_policy={ - "vectorEmbeddings": [ - { - "path": "/embedding", - "dataType": "float32", - "distanceFunction": "cosine", - "dimensions": 3072, # for text-embedding-3-large - } - ] - }, - indexing_policy={ - "indexingMode": "consistent", - "includedPaths": [{"path": "/*"}], - "excludedPaths": [{"path": '/"_etag"/?'}], - "vectorIndexes": [{"path": "/embedding", "type": "quantizedFlat"}], - }, - cosmos_container_properties={"partition_key": PartitionKey(path="/id")}, - cosmos_database_properties={"id": database_name}, # need to add this - ) + vector_search = get_azure_cosmos_db_no_sql_vector_search() + vector_search.add_documents(docs) except Exception as e: logger.error(f"error: {e}") @@ -79,40 +87,10 @@ def query_data( ): if verbose: logging.basicConfig(level=logging.DEBUG) - - database_name = getenv("AZURE_COSMOS_DB_DATABASE_NAME") - vector_search = AzureCosmosDBNoSqlVectorSearch( - embedding=AzureOpenAIEmbeddings( - api_key=getenv("AZURE_OPENAI_API_KEY"), - api_version=getenv("AZURE_OPENAI_API_VERSION"), - azure_endpoint=getenv("AZURE_OPENAI_ENDPOINT"), - model=getenv("AZURE_OPENAI_EMBEDDING_MODEL"), - ), - cosmos_client=CosmosClient.from_connection_string(getenv("AZURE_COSMOS_DB_CONNECTION_STRING")), - database_name=database_name, - container_name=getenv("AZURE_COSMOS_DB_CONTAINER_NAME"), - vector_embedding_policy={ - "vectorEmbeddings": [ - { - "path": "/embedding", - "dataType": "float32", - "distanceFunction": "cosine", - "dimensions": 3072, # for text-embedding-3-large - } - ] - }, - indexing_policy={ - "indexingMode": "consistent", - "includedPaths": [{"path": "/*"}], - "excludedPaths": [{"path": '/"_etag"/?'}], - "vectorIndexes": [{"path": "/embedding", "type": "quantizedFlat"}], - }, - cosmos_container_properties={"partition_key": PartitionKey(path="/id")}, - cosmos_database_properties={"id": database_name}, - ) - try: + vector_search = get_azure_cosmos_db_no_sql_vector_search() results = vector_search.similarity_search(query=query) + logger.info(f"got {len(results)} results") for idx, result in enumerate(results): print(f"Result {idx + 1}: {result}") except Exception as e: From 39da483230d95e1ab696c1bb91a2e981b9af2904 Mon Sep 17 00:00:00 2001 From: ks6088ts Date: Sun, 24 Nov 2024 07:17:23 +0900 Subject: [PATCH 2/3] pip freeze > requirements.txt --- requirements.txt | 89 +++++++++++++++++++++++++++++++----------------- 1 file changed, 58 insertions(+), 31 deletions(-) diff --git a/requirements.txt b/requirements.txt index 975d120..0fb11fa 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,33 +1,60 @@ -openai==1.43.1 +aiohappyeyeballs==2.4.3 +aiohttp==3.11.7 +aiosignal==1.3.1 +annotated-types==0.7.0 +anyio==4.6.2.post1 +attrs==24.2.0 +azure-core==1.32.0 +azure-cosmos==4.9.0 +certifi==2024.8.30 +charset-normalizer==3.4.0 +click==8.1.7 +dataclasses-json==0.6.7 +distro==1.9.0 +frozenlist==1.5.0 +h11==0.14.0 +httpcore==1.0.7 +httpx==0.27.2 +httpx-sse==0.4.0 +idna==3.10 +jiter==0.7.1 +jsonpatch==1.33 +jsonpointer==3.0.0 +langchain==0.3.8 +langchain-community==0.3.8 +langchain-core==0.3.21 +langchain-openai==0.2.9 +langchain-text-splitters==0.3.2 +langsmith==0.1.145 +markdown-it-py==3.0.0 +marshmallow==3.23.1 +mdurl==0.1.2 +multidict==6.1.0 +mypy-extensions==1.0.0 +numpy==1.26.4 +openai==1.55.0 +orjson==3.10.12 +packaging==24.2 +propcache==0.2.0 +pydantic==2.10.1 +pydantic-settings==2.6.1 +pydantic_core==2.27.1 +Pygments==2.18.0 python-dotenv==1.0.1 -streamlit==1.38.0 -azure-cosmos==4.7.0 -plotly==5.24.1 -pandas==2.2.2 -langchain==0.2.16 -langchain-openai==0.1.25 -langchain-community==0.2.12 -azure-search-documents==11.5.1 -azure-identity==1.18.0 -azure-ai-documentintelligence==1.0.0b4 -azure-storage-blob==12.23.1 +PyYAML==6.0.2 +regex==2024.11.6 requests==2.32.3 -promptflow==1.15.0 -promptflow-evals==0.3.2 -langgraph==0.2.23 -langchain-chroma==0.1.4 -beautifulsoup4==4.12.3 -langgraph-checkpoint-sqlite==1.0.4 -playwright==1.47.0 -lxml==5.3.0 -nest-asyncio==1.6.0 -typer==0.12.5 - -# To run 99_streamlit_examples/pages/10_Object_Detection.py -# ultralytics==8.2.89 - -# To run 99_streamlit_examples/pages/11_Pose_Estimation.py -# mediapipe==0.10.14 - -# To run 99_streamlit_examples/pages/12_Video_processing.py -# opencv-python-headless==4.10.0.84 +requests-toolbelt==1.0.0 +rich==13.9.4 +shellingham==1.5.4 +six==1.16.0 +sniffio==1.3.1 +SQLAlchemy==2.0.35 +tenacity==9.0.0 +tiktoken==0.8.0 +tqdm==4.67.0 +typer==0.13.1 +typing-inspect==0.9.0 +typing_extensions==4.12.2 +urllib3==2.2.3 +yarl==1.18.0 From 3cff04a0f6d8020df44aa99572339479d9515684 Mon Sep 17 00:00:00 2001 From: ks6088ts Date: Sun, 24 Nov 2024 07:20:10 +0900 Subject: [PATCH 3/3] fix style issues --- apps/3_call_azure_cosmos_db/vector_database.py | 1 + 1 file changed, 1 insertion(+) diff --git a/apps/3_call_azure_cosmos_db/vector_database.py b/apps/3_call_azure_cosmos_db/vector_database.py index c3688f1..abc277d 100644 --- a/apps/3_call_azure_cosmos_db/vector_database.py +++ b/apps/3_call_azure_cosmos_db/vector_database.py @@ -53,6 +53,7 @@ def get_azure_cosmos_db_no_sql_vector_search(): cosmos_database_properties={"id": getenv("AZURE_COSMOS_DB_DATABASE_NAME")}, ) + @app.command() def insert_data( pdf_url: str = "https://arxiv.org/pdf/2303.08774.pdf",