Merge pull request #175 from ks6088ts-labs/feature/issue-174_vectordb

ks6088ts · web-flow · commit 1228a41b5273 · 2024-11-22T16:29:05.000+09:00
call vector database features
diff --git a/.env.template b/.env.template
@@ -1,7 +1,7 @@
 # Azure OpenAI Service
 AZURE_OPENAI_ENDPOINT="https://<YOUR_AOAI_NAME>.openai.azure.com/"
 AZURE_OPENAI_API_KEY="<YOUR_API_KEY>"
-AZURE_OPENAI_API_VERSION="2024-07-01-preview"
+AZURE_OPENAI_API_VERSION="2024-10-21"
 AZURE_OPENAI_GPT_MODEL="gpt-4o"
 AZURE_OPENAI_STT_MODEL="whisper"
 AZURE_OPENAI_TTS_MODEL="tts-hd"
diff --git a/apps/3_call_azure_cosmos_db/vector_database.py b/apps/3_call_azure_cosmos_db/vector_database.py
@@ -0,0 +1,124 @@
+import logging
+from os import getenv
+
+import typer
+from azure.cosmos import CosmosClient, PartitionKey
+from dotenv import load_dotenv
+from langchain_community.document_loaders import PyMuPDFLoader
+from langchain_community.vectorstores.azure_cosmos_db_no_sql import AzureCosmosDBNoSqlVectorSearch
+from langchain_openai import AzureOpenAIEmbeddings
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+
+load_dotenv()
+logger = logging.getLogger(__name__)
+app = typer.Typer()
+
+
+# https://python.langchain.com/docs/integrations/vectorstores/azure_cosmos_db_no_sql/
+@app.command()
+def insert_data(
+    pdf_url: str = "https://arxiv.org/pdf/2303.08774.pdf",
+    chunk_size: int = 2000,
+    chunk_overlap: int = 0,
+    verbose: bool = True,
+):
+    if verbose:
+        logging.basicConfig(level=logging.DEBUG)
+
+    # Load the PDF
+    loader = PyMuPDFLoader(file_path=pdf_url)
+    data = loader.load()
+
+    # Split the text into chunks
+    docs = RecursiveCharacterTextSplitter(
+        chunk_size=chunk_size,
+        chunk_overlap=chunk_overlap,
+    ).split_documents(data)
+
+    try:
+        # Insert the data into Azure Cosmos DB
+        database_name = getenv("AZURE_COSMOS_DB_DATABASE_NAME")
+        AzureCosmosDBNoSqlVectorSearch.from_documents(
+            documents=docs,
+            embedding=AzureOpenAIEmbeddings(
+                api_key=getenv("AZURE_OPENAI_API_KEY"),
+                api_version=getenv("AZURE_OPENAI_API_VERSION"),
+                azure_endpoint=getenv("AZURE_OPENAI_ENDPOINT"),
+                model=getenv("AZURE_OPENAI_EMBEDDING_MODEL"),
+            ),
+            cosmos_client=CosmosClient.from_connection_string(getenv("AZURE_COSMOS_DB_CONNECTION_STRING")),
+            database_name=database_name,
+            container_name=getenv("AZURE_COSMOS_DB_CONTAINER_NAME"),
+            vector_embedding_policy={
+                "vectorEmbeddings": [
+                    {
+                        "path": "/embedding",
+                        "dataType": "float32",
+                        "distanceFunction": "cosine",
+                        "dimensions": 3072,  # for text-embedding-3-large
+                    }
+                ]
+            },
+            indexing_policy={
+                "indexingMode": "consistent",
+                "includedPaths": [{"path": "/*"}],
+                "excludedPaths": [{"path": '/"_etag"/?'}],
+                "vectorIndexes": [{"path": "/embedding", "type": "quantizedFlat"}],
+            },
+            cosmos_container_properties={"partition_key": PartitionKey(path="/id")},
+            cosmos_database_properties={"id": database_name},  # need to add this
+        )
+    except Exception as e:
+        logger.error(f"error: {e}")
+
+
+@app.command()
+def query_data(
+    query: str = "What were the compute requirements for training GPT 4",
+    verbose: bool = True,
+):
+    if verbose:
+        logging.basicConfig(level=logging.DEBUG)
+
+    database_name = getenv("AZURE_COSMOS_DB_DATABASE_NAME")
+    vector_search = AzureCosmosDBNoSqlVectorSearch(
+        embedding=AzureOpenAIEmbeddings(
+            api_key=getenv("AZURE_OPENAI_API_KEY"),
+            api_version=getenv("AZURE_OPENAI_API_VERSION"),
+            azure_endpoint=getenv("AZURE_OPENAI_ENDPOINT"),
+            model=getenv("AZURE_OPENAI_EMBEDDING_MODEL"),
+        ),
+        cosmos_client=CosmosClient.from_connection_string(getenv("AZURE_COSMOS_DB_CONNECTION_STRING")),
+        database_name=database_name,
+        container_name=getenv("AZURE_COSMOS_DB_CONTAINER_NAME"),
+        vector_embedding_policy={
+            "vectorEmbeddings": [
+                {
+                    "path": "/embedding",
+                    "dataType": "float32",
+                    "distanceFunction": "cosine",
+                    "dimensions": 3072,  # for text-embedding-3-large
+                }
+            ]
+        },
+        indexing_policy={
+            "indexingMode": "consistent",
+            "includedPaths": [{"path": "/*"}],
+            "excludedPaths": [{"path": '/"_etag"/?'}],
+            "vectorIndexes": [{"path": "/embedding", "type": "quantizedFlat"}],
+        },
+        cosmos_container_properties={"partition_key": PartitionKey(path="/id")},
+        cosmos_database_properties={"id": database_name},
+    )
+
+    try:
+        results = vector_search.similarity_search(query=query)
+        for idx, result in enumerate(results):
+            print(f"Result {idx + 1}: {result}")
+    except Exception as e:
+        logger.error(f"error: {e}")
+
+
+if __name__ == "__main__":
+    load_dotenv()
+    app()
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -39,6 +39,7 @@ nest-asyncio = "^1.6.0"
 typer = "^0.12.5"
 azure-cognitiveservices-speech = "^1.40.0"
 openai-whisper = "^20240930"
+pymupdf = "^1.24.14"
 
 [tool.poetry.group.dev.dependencies]
 pre-commit = "^4.0.0"