"""Typer CLI for loading a PDF into, and querying, an Azure Cosmos DB NoSQL vector store.

Commands:
    insert-data  -- download a PDF, chunk it, embed it, and store it in Cosmos DB.
    query-data   -- run a similarity search against the stored vectors.

All connection details are read from environment variables (loaded via .env):
AZURE_OPENAI_* for the embedding model, AZURE_COSMOS_DB_* for the database.
"""

import logging
from os import getenv

import typer
from azure.cosmos import CosmosClient, PartitionKey
from dotenv import load_dotenv
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.vectorstores.azure_cosmos_db_no_sql import AzureCosmosDBNoSqlVectorSearch
from langchain_openai import AzureOpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

load_dotenv()
logger = logging.getLogger(__name__)
app = typer.Typer()

# Embedding vector size; 3072 is the output dimension of text-embedding-3-large.
_EMBEDDING_DIMENSIONS = 3072


def _build_embeddings() -> AzureOpenAIEmbeddings:
    """Build the Azure OpenAI embeddings client from environment variables."""
    return AzureOpenAIEmbeddings(
        api_key=getenv("AZURE_OPENAI_API_KEY"),
        api_version=getenv("AZURE_OPENAI_API_VERSION"),
        azure_endpoint=getenv("AZURE_OPENAI_ENDPOINT"),
        model=getenv("AZURE_OPENAI_EMBEDDING_MODEL"),
    )


def _vector_store_kwargs() -> dict:
    """Shared keyword arguments for constructing the Cosmos DB NoSQL vector store.

    Used by both `insert_data` (via `from_documents`) and `query_data` (via the
    constructor) so the embedding client, connection, and policies cannot drift
    apart between the two commands.
    """
    database_name = getenv("AZURE_COSMOS_DB_DATABASE_NAME")
    return {
        "embedding": _build_embeddings(),
        "cosmos_client": CosmosClient.from_connection_string(getenv("AZURE_COSMOS_DB_CONNECTION_STRING")),
        "database_name": database_name,
        "container_name": getenv("AZURE_COSMOS_DB_CONTAINER_NAME"),
        "vector_embedding_policy": {
            "vectorEmbeddings": [
                {
                    "path": "/embedding",
                    "dataType": "float32",
                    "distanceFunction": "cosine",
                    "dimensions": _EMBEDDING_DIMENSIONS,  # for text-embedding-3-large
                }
            ]
        },
        "indexing_policy": {
            "indexingMode": "consistent",
            "includedPaths": [{"path": "/*"}],
            "excludedPaths": [{"path": '/"_etag"/?'}],
            "vectorIndexes": [{"path": "/embedding", "type": "quantizedFlat"}],
        },
        "cosmos_container_properties": {"partition_key": PartitionKey(path="/id")},
        # The database-properties id must be supplied explicitly or container
        # creation fails on first use.
        "cosmos_database_properties": {"id": database_name},
    }


# https://python.langchain.com/docs/integrations/vectorstores/azure_cosmos_db_no_sql/
@app.command()
def insert_data(
    pdf_url: str = "https://arxiv.org/pdf/2303.08774.pdf",
    chunk_size: int = 2000,
    chunk_overlap: int = 0,
    verbose: bool = True,
):
    """Load a PDF from `pdf_url`, split it into chunks, and index it in Cosmos DB."""
    if verbose:
        logging.basicConfig(level=logging.DEBUG)

    # Load the PDF
    loader = PyMuPDFLoader(file_path=pdf_url)
    data = loader.load()

    # Split the text into chunks
    docs = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_documents(data)

    try:
        # Insert the data into Azure Cosmos DB
        AzureCosmosDBNoSqlVectorSearch.from_documents(
            documents=docs,
            **_vector_store_kwargs(),
        )
    except Exception:
        # logger.exception keeps the traceback, which logger.error(f"...") dropped.
        logger.exception("failed to insert documents into Azure Cosmos DB")


@app.command()
def query_data(
    query: str = "What were the compute requirements for training GPT 4",
    verbose: bool = True,
):
    """Run a similarity search for `query` against the Cosmos DB vector store."""
    if verbose:
        logging.basicConfig(level=logging.DEBUG)

    vector_search = AzureCosmosDBNoSqlVectorSearch(**_vector_store_kwargs())

    try:
        results = vector_search.similarity_search(query=query)
        for idx, result in enumerate(results):
            print(f"Result {idx + 1}: {result}")
    except Exception:
        logger.exception("similarity search against Azure Cosmos DB failed")


if __name__ == "__main__":
    # .env is already loaded at import time; no need to call load_dotenv() again.
    app()
+optional = false +python-versions = ">=3.8" +files = [ + {file = "httpx-sse-0.4.0.tar.gz", hash = "sha256:1e81a3a3070ce322add1d3529ed42eb5f70817f45ed6ec915ab753f961139721"}, + {file = "httpx_sse-0.4.0-py3-none-any.whl", hash = "sha256:f329af6eae57eaa2bdfd962b42524764af68075ea87370a2de920af5341e318f"}, +] + [[package]] name = "huggingface-hub" version = "0.25.1" @@ -2560,19 +2571,19 @@ adal = ["adal (>=1.0.2)"] [[package]] name = "langchain" -version = "0.3.2" +version = "0.3.7" description = "Building applications with LLMs through composability" optional = false python-versions = "<4.0,>=3.9" files = [ - {file = "langchain-0.3.2-py3-none-any.whl", hash = "sha256:cf005dcba132e46fb5e8d3dfaf7f8751bffd2d73e738c36be58f41edc7e3a4b8"}, - {file = "langchain-0.3.2.tar.gz", hash = "sha256:dc330e6eb10d81d23ba0305d18358702c73cc59e95c410eca6c6779aab4ddc9b"}, + {file = "langchain-0.3.7-py3-none-any.whl", hash = "sha256:cf4af1d5751dacdc278df3de1ff3cbbd8ca7eb55d39deadccdd7fb3d3ee02ac0"}, + {file = "langchain-0.3.7.tar.gz", hash = "sha256:2e4f83bf794ba38562f7ba0ede8171d7e28a583c0cec6f8595cfe72147d336b2"}, ] [package.dependencies] aiohttp = ">=3.8.3,<4.0.0" async-timeout = {version = ">=4.0.0,<5.0.0", markers = "python_version < \"3.11\""} -langchain-core = ">=0.3.8,<0.4.0" +langchain-core = ">=0.3.15,<0.4.0" langchain-text-splitters = ">=0.3.0,<0.4.0" langsmith = ">=0.1.17,<0.2.0" numpy = [ @@ -2583,7 +2594,7 @@ pydantic = ">=2.7.4,<3.0.0" PyYAML = ">=5.3" requests = ">=2,<3" SQLAlchemy = ">=1.4,<3" -tenacity = ">=8.1.0,<8.4.0 || >8.4.0,<9.0.0" +tenacity = ">=8.1.0,<8.4.0 || >8.4.0,<10" [[package]] name = "langchain-chroma" @@ -2607,20 +2618,21 @@ numpy = [ [[package]] name = "langchain-community" -version = "0.3.1" +version = "0.3.7" description = "Community contributed LangChain integrations." 
optional = false python-versions = "<4.0,>=3.9" files = [ - {file = "langchain_community-0.3.1-py3-none-any.whl", hash = "sha256:627eb26c16417764762ac47dd0d3005109f750f40242a88bb8f2958b798bcf90"}, - {file = "langchain_community-0.3.1.tar.gz", hash = "sha256:c964a70628f266a61647e58f2f0434db633d4287a729f100a81dd8b0654aec93"}, + {file = "langchain_community-0.3.7-py3-none-any.whl", hash = "sha256:048f89d9a54b0720a0f865d5d469494e088cb9970a2397b19446ce0d84867141"}, + {file = "langchain_community-0.3.7.tar.gz", hash = "sha256:5b7a5cea82bedbf3ea276eac56128e00dbaf86561991cfc80fb21175a343c9a3"}, ] [package.dependencies] aiohttp = ">=3.8.3,<4.0.0" dataclasses-json = ">=0.5.7,<0.7" -langchain = ">=0.3.1,<0.4.0" -langchain-core = ">=0.3.6,<0.4.0" +httpx-sse = ">=0.4.0,<0.5.0" +langchain = ">=0.3.7,<0.4.0" +langchain-core = ">=0.3.17,<0.4.0" langsmith = ">=0.1.125,<0.2.0" numpy = [ {version = ">=1,<2", markers = "python_version < \"3.12\""}, @@ -2629,18 +2641,18 @@ numpy = [ pydantic-settings = ">=2.4.0,<3.0.0" PyYAML = ">=5.3" requests = ">=2,<3" -SQLAlchemy = ">=1.4,<3" -tenacity = ">=8.1.0,<8.4.0 || >8.4.0,<9.0.0" +SQLAlchemy = ">=1.4,<2.0.36" +tenacity = ">=8.1.0,<8.4.0 || >8.4.0,<10" [[package]] name = "langchain-core" -version = "0.3.9" +version = "0.3.19" description = "Building applications with LLMs through composability" optional = false python-versions = "<4.0,>=3.9" files = [ - {file = "langchain_core-0.3.9-py3-none-any.whl", hash = "sha256:26efa048666c7de56d0ab311de2c0778b04cbb2ffe95bff76139118f13815d01"}, - {file = "langchain_core-0.3.9.tar.gz", hash = "sha256:7a6ac988d24d0ddce5874b28f538cd95f69f502b7f50581de22aca0dc58199a8"}, + {file = "langchain_core-0.3.19-py3-none-any.whl", hash = "sha256:562b7cc3c15dfaa9270cb1496990c1f3b3e0b660c4d6a3236d7f693346f2a96c"}, + {file = "langchain_core-0.3.19.tar.gz", hash = "sha256:126d9e8cadb2a5b8d1793a228c0783a3b608e36064d5a2ef1a4d38d07a344523"}, ] [package.dependencies] @@ -2652,7 +2664,7 @@ pydantic = [ {version = 
">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""}, ] PyYAML = ">=5.3" -tenacity = ">=8.1.0,<8.4.0 || >8.4.0,<9.0.0" +tenacity = ">=8.1.0,<8.4.0 || >8.4.0,<10.0.0" typing-extensions = ">=4.7" [[package]] @@ -5374,6 +5386,23 @@ pyyaml = "*" [package.extras] extra = ["pygments (>=2.12)"] +[[package]] +name = "pymupdf" +version = "1.24.14" +description = "A high performance Python library for data extraction, analysis, conversion & manipulation of PDF (and other) documents." +optional = false +python-versions = ">=3.9" +files = [ + {file = "PyMuPDF-1.24.14-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:b3ad7a4f4b607ff97f2e1b8111823dd3797dbb381ec851c3ae4695fea6f68478"}, + {file = "PyMuPDF-1.24.14-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:755906af4b4d693552ae5469ba682075853f4dc8a70639affd1bd6c049c5d900"}, + {file = "PyMuPDF-1.24.14-cp39-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:37f24108e2e18150fb8d512dcccdfa1e3d9b9dd203ffaa7ffb959bb20aea40b4"}, + {file = "PyMuPDF-1.24.14-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0de4f5ed903c2be6d0abcccdc796368939b51ce03916eb53292916e3b6ea65d6"}, + {file = "PyMuPDF-1.24.14-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:2d1b5c47df2f8055de5dedfbd3189c742188261a8c257f406378382adac94cff"}, + {file = "PyMuPDF-1.24.14-cp39-abi3-win32.whl", hash = "sha256:60a7ee7db3e0d3a4dcbe6df2781ba4487acb7e515c64ea9c857504f44effcb25"}, + {file = "PyMuPDF-1.24.14-cp39-abi3-win_amd64.whl", hash = "sha256:3d1f1ec2fe0249484afde7a0fc02589f19aaeb47c42939d23ae1d012aa1bc59b"}, + {file = "PyMuPDF-1.24.14.tar.gz", hash = "sha256:0eed9f998525eaf39706dbf2d0cf3162150f0f526e4a36b1748ffa50bde581ae"}, +] + [[package]] name = "pyparsing" version = "3.1.4" @@ -7456,4 +7485,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "11c6ecf61c307492ece6e90c96a215bfd09be2449b00c62944351389b025522d" +content-hash = 
"ff8989bc2e9615eee1f3d9c0ca9fcdecf7685fd019b28d127e8f83582a6a0a47" diff --git a/pyproject.toml b/pyproject.toml index 487256b..94b8265 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,6 +39,7 @@ nest-asyncio = "^1.6.0" typer = "^0.12.5" azure-cognitiveservices-speech = "^1.40.0" openai-whisper = "^20240930" +pymupdf = "^1.24.14" [tool.poetry.group.dev.dependencies] pre-commit = "^4.0.0"