Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .env.template
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Azure OpenAI Service
AZURE_OPENAI_ENDPOINT="https://<YOUR_AOAI_NAME>.openai.azure.com/"
AZURE_OPENAI_API_KEY="<YOUR_API_KEY>"
AZURE_OPENAI_API_VERSION="2024-07-01-preview"
AZURE_OPENAI_API_VERSION="2024-10-21"
AZURE_OPENAI_GPT_MODEL="gpt-4o"
AZURE_OPENAI_STT_MODEL="whisper"
AZURE_OPENAI_TTS_MODEL="tts-hd"
Expand Down
124 changes: 124 additions & 0 deletions apps/3_call_azure_cosmos_db/vector_database.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
import logging
from os import getenv

import typer
from azure.cosmos import CosmosClient, PartitionKey
from dotenv import load_dotenv
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.vectorstores.azure_cosmos_db_no_sql import AzureCosmosDBNoSqlVectorSearch
from langchain_openai import AzureOpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

load_dotenv()
logger = logging.getLogger(__name__)
app = typer.Typer()


# https://python.langchain.com/docs/integrations/vectorstores/azure_cosmos_db_no_sql/
@app.command()
def insert_data(
    pdf_url: str = "https://arxiv.org/pdf/2303.08774.pdf",
    chunk_size: int = 2000,
    chunk_overlap: int = 0,
    verbose: bool = True,
):
    """Load a PDF, split it into chunks, and index the embeddings in Azure Cosmos DB NoSQL.

    Args:
        pdf_url: URL (or local path) of the PDF to ingest.
        chunk_size: Maximum characters per text chunk.
        chunk_overlap: Characters shared between consecutive chunks.
        verbose: When True, enable DEBUG-level logging.

    Raises:
        ValueError: If required Azure Cosmos DB environment variables are unset.
    """
    if verbose:
        logging.basicConfig(level=logging.DEBUG)

    # Load the PDF (PyMuPDFLoader accepts a URL as well as a local file path)
    loader = PyMuPDFLoader(file_path=pdf_url)
    data = loader.load()

    # Split the text into chunks
    docs = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_documents(data)

    # Fail fast with a clear message instead of passing None into the Azure SDK,
    # which would otherwise raise an opaque error deep inside the client.
    connection_string = getenv("AZURE_COSMOS_DB_CONNECTION_STRING")
    database_name = getenv("AZURE_COSMOS_DB_DATABASE_NAME")
    if not connection_string or not database_name:
        raise ValueError(
            "AZURE_COSMOS_DB_CONNECTION_STRING and AZURE_COSMOS_DB_DATABASE_NAME must be set"
        )

    try:
        # Insert the data into Azure Cosmos DB
        AzureCosmosDBNoSqlVectorSearch.from_documents(
            documents=docs,
            embedding=AzureOpenAIEmbeddings(
                api_key=getenv("AZURE_OPENAI_API_KEY"),
                api_version=getenv("AZURE_OPENAI_API_VERSION"),
                azure_endpoint=getenv("AZURE_OPENAI_ENDPOINT"),
                model=getenv("AZURE_OPENAI_EMBEDDING_MODEL"),
            ),
            cosmos_client=CosmosClient.from_connection_string(connection_string),
            database_name=database_name,
            container_name=getenv("AZURE_COSMOS_DB_CONTAINER_NAME"),
            vector_embedding_policy={
                "vectorEmbeddings": [
                    {
                        "path": "/embedding",
                        "dataType": "float32",
                        "distanceFunction": "cosine",
                        "dimensions": 3072,  # for text-embedding-3-large
                    }
                ]
            },
            indexing_policy={
                "indexingMode": "consistent",
                "includedPaths": [{"path": "/*"}],
                "excludedPaths": [{"path": '/"_etag"/?'}],
                "vectorIndexes": [{"path": "/embedding", "type": "quantizedFlat"}],
            },
            cosmos_container_properties={"partition_key": PartitionKey(path="/id")},
            cosmos_database_properties={"id": database_name},  # need to add this
        )
    except Exception:
        # logger.exception records the full traceback; the f-string logger.error it
        # replaces discarded the stack and formatted eagerly.
        logger.exception("failed to insert documents into Azure Cosmos DB")


@app.command()
def query_data(
    query: str = "What were the compute requirements for training GPT 4",
    verbose: bool = True,
):
    """Run a similarity search against the vector index in Azure Cosmos DB NoSQL.

    Args:
        query: Natural-language question to embed and search for.
        verbose: When True, enable DEBUG-level logging.

    Raises:
        ValueError: If required Azure Cosmos DB environment variables are unset.
    """
    if verbose:
        logging.basicConfig(level=logging.DEBUG)

    # Fail fast with a clear message instead of passing None into the Azure SDK.
    connection_string = getenv("AZURE_COSMOS_DB_CONNECTION_STRING")
    database_name = getenv("AZURE_COSMOS_DB_DATABASE_NAME")
    if not connection_string or not database_name:
        raise ValueError(
            "AZURE_COSMOS_DB_CONNECTION_STRING and AZURE_COSMOS_DB_DATABASE_NAME must be set"
        )

    # NOTE(review): the policies below must match the ones used at ingest time
    # (insert_data) or searches will miss the indexed embeddings.
    vector_search = AzureCosmosDBNoSqlVectorSearch(
        embedding=AzureOpenAIEmbeddings(
            api_key=getenv("AZURE_OPENAI_API_KEY"),
            api_version=getenv("AZURE_OPENAI_API_VERSION"),
            azure_endpoint=getenv("AZURE_OPENAI_ENDPOINT"),
            model=getenv("AZURE_OPENAI_EMBEDDING_MODEL"),
        ),
        cosmos_client=CosmosClient.from_connection_string(connection_string),
        database_name=database_name,
        container_name=getenv("AZURE_COSMOS_DB_CONTAINER_NAME"),
        vector_embedding_policy={
            "vectorEmbeddings": [
                {
                    "path": "/embedding",
                    "dataType": "float32",
                    "distanceFunction": "cosine",
                    "dimensions": 3072,  # for text-embedding-3-large
                }
            ]
        },
        indexing_policy={
            "indexingMode": "consistent",
            "includedPaths": [{"path": "/*"}],
            "excludedPaths": [{"path": '/"_etag"/?'}],
            "vectorIndexes": [{"path": "/embedding", "type": "quantizedFlat"}],
        },
        cosmos_container_properties={"partition_key": PartitionKey(path="/id")},
        cosmos_database_properties={"id": database_name},
    )

    try:
        results = vector_search.similarity_search(query=query)
        for idx, result in enumerate(results):
            print(f"Result {idx + 1}: {result}")
    except Exception:
        # logger.exception records the full traceback; the f-string logger.error it
        # replaces discarded the stack and formatted eagerly.
        logger.exception("similarity search against Azure Cosmos DB failed")


if __name__ == "__main__":
    # load_dotenv() already runs at module import time (top of file), so repeating
    # it here was redundant.
    app()
63 changes: 46 additions & 17 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ nest-asyncio = "^1.6.0"
typer = "^0.12.5"
azure-cognitiveservices-speech = "^1.40.0"
openai-whisper = "^20240930"
pymupdf = "^1.24.14"

[tool.poetry.group.dev.dependencies]
pre-commit = "^4.0.0"
Expand Down