Skip to content

Commit 71a293f

Browse files
committed
call vector database features
1 parent 5299a15 commit 71a293f

File tree

4 files changed

+172
-18
lines changed

4 files changed

+172
-18
lines changed

.env.template

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# Azure OpenAI Service
22
AZURE_OPENAI_ENDPOINT="https://<YOUR_AOAI_NAME>.openai.azure.com/"
33
AZURE_OPENAI_API_KEY="<YOUR_API_KEY>"
4-
AZURE_OPENAI_API_VERSION="2024-07-01-preview"
4+
AZURE_OPENAI_API_VERSION="2024-10-21"
55
AZURE_OPENAI_GPT_MODEL="gpt-4o"
66
AZURE_OPENAI_STT_MODEL="whisper"
77
AZURE_OPENAI_TTS_MODEL="tts-hd"
Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
import logging
2+
from os import getenv
3+
4+
import typer
5+
from azure.cosmos import CosmosClient, PartitionKey
6+
from dotenv import load_dotenv
7+
from langchain_community.document_loaders import PyMuPDFLoader
8+
from langchain_community.vectorstores.azure_cosmos_db_no_sql import AzureCosmosDBNoSqlVectorSearch
9+
from langchain_openai import AzureOpenAIEmbeddings
10+
from langchain_text_splitters import RecursiveCharacterTextSplitter
11+
12+
# Load variables from a .env file at import time so the getenv() calls in the
# commands below can read the Azure OpenAI / Cosmos DB settings.
load_dotenv()
# Module-level logger named after this module; the commands configure the root
# logger themselves via logging.basicConfig() when run with verbose enabled.
logger = logging.getLogger(__name__)
# Typer application object; CLI sub-commands are registered with @app.command().
app = typer.Typer()
15+
16+
17+
# https://python.langchain.com/docs/integrations/vectorstores/azure_cosmos_db_no_sql/
18+
def _require_env(name: str) -> str:
    """Return the value of environment variable ``name``.

    Raises:
        RuntimeError: If the variable is unset or empty. Failing here gives a
            message that names the missing setting instead of an obscure
            error from inside the Cosmos DB SDK when it receives ``None``.
    """
    value = getenv(name)
    if not value:
        raise RuntimeError(f"environment variable {name} is not set")
    return value


def _vector_store_kwargs(embedding_dimensions: int) -> dict:
    """Build the keyword arguments shared by ``insert_data`` and ``query_data``.

    Both commands must describe the container identically (same embedding
    path, distance function, dimensions and index type), so the configuration
    lives in one place rather than being copy-pasted into each command.

    Args:
        embedding_dimensions: Length of the vectors produced by the embedding
            model named in ``AZURE_OPENAI_EMBEDDING_MODEL``.

    Returns:
        Keyword arguments accepted by both ``AzureCosmosDBNoSqlVectorSearch``
        and ``AzureCosmosDBNoSqlVectorSearch.from_documents``.

    Raises:
        RuntimeError: If a required Cosmos DB environment variable is missing.
    """
    database_name = _require_env("AZURE_COSMOS_DB_DATABASE_NAME")
    return {
        "embedding": AzureOpenAIEmbeddings(
            api_key=getenv("AZURE_OPENAI_API_KEY"),
            api_version=getenv("AZURE_OPENAI_API_VERSION"),
            azure_endpoint=getenv("AZURE_OPENAI_ENDPOINT"),
            model=getenv("AZURE_OPENAI_EMBEDDING_MODEL"),
        ),
        "cosmos_client": CosmosClient.from_connection_string(
            _require_env("AZURE_COSMOS_DB_CONNECTION_STRING")
        ),
        "database_name": database_name,
        "container_name": _require_env("AZURE_COSMOS_DB_CONTAINER_NAME"),
        "vector_embedding_policy": {
            "vectorEmbeddings": [
                {
                    "path": "/embedding",
                    "dataType": "float32",
                    "distanceFunction": "cosine",
                    # Must match the embedding model's output size
                    # (3072 for text-embedding-3-large).
                    "dimensions": embedding_dimensions,
                }
            ]
        },
        "indexing_policy": {
            "indexingMode": "consistent",
            "includedPaths": [{"path": "/*"}],
            "excludedPaths": [{"path": '/"_etag"/?'}],
            "vectorIndexes": [{"path": "/embedding", "type": "quantizedFlat"}],
        },
        "cosmos_container_properties": {"partition_key": PartitionKey(path="/id")},
        # The vector store expects the database id here in addition to database_name.
        "cosmos_database_properties": {"id": database_name},
    }


@app.command()
def insert_data(
    pdf_url: str = "https://arxiv.org/pdf/2303.08774.pdf",
    chunk_size: int = 2000,
    chunk_overlap: int = 0,
    verbose: bool = True,
    embedding_dimensions: int = 3072,
):
    """Load a PDF, split it into chunks and store their embeddings in Azure Cosmos DB.

    Args:
        pdf_url: Path or URL of the PDF handed to ``PyMuPDFLoader``.
        chunk_size: Maximum chunk size passed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks.
        verbose: When true, configure root logging at DEBUG level.
        embedding_dimensions: Vector length declared in the container's
            vector embedding policy; defaults to 3072 (text-embedding-3-large).

    Failures while building the store or inserting documents are logged with a
    traceback and not re-raised, matching the command's original behavior.
    """
    if verbose:
        logging.basicConfig(level=logging.DEBUG)

    # Load the PDF
    loader = PyMuPDFLoader(file_path=pdf_url)
    data = loader.load()

    # Split the text into chunks
    docs = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_documents(data)

    try:
        # Insert the data into Azure Cosmos DB
        AzureCosmosDBNoSqlVectorSearch.from_documents(
            documents=docs,
            **_vector_store_kwargs(embedding_dimensions),
        )
    except Exception as e:
        # logger.exception keeps the traceback, which logger.error(f"...") discarded.
        logger.exception("error: %s", e)


@app.command()
def query_data(
    query: str = "What were the compute requirements for training GPT 4",
    verbose: bool = True,
    embedding_dimensions: int = 3072,
):
    """Run a similarity search against the Azure Cosmos DB vector store and print the hits.

    Args:
        query: Natural-language query to embed and search for.
        verbose: When true, configure root logging at DEBUG level.
        embedding_dimensions: Vector length declared in the container's
            vector embedding policy; should match the value used at insert time.

    Errors raised while constructing the vector store propagate to the caller
    (as before); errors during the search itself are logged with a traceback.
    """
    if verbose:
        logging.basicConfig(level=logging.DEBUG)

    vector_search = AzureCosmosDBNoSqlVectorSearch(**_vector_store_kwargs(embedding_dimensions))

    try:
        results = vector_search.similarity_search(query=query)
        for idx, result in enumerate(results):
            print(f"Result {idx + 1}: {result}")
    except Exception as e:
        # logger.exception keeps the traceback, which logger.error(f"...") discarded.
        logger.exception("error: %s", e)
120+
121+
122+
if __name__ == "__main__":
    # The .env file is already loaded by the module-level load_dotenv() call,
    # which runs before this guard is reached, so a second call here was
    # redundant. The entry point only needs to dispatch to the Typer app.
    app()

poetry.lock

Lines changed: 46 additions & 17 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ nest-asyncio = "^1.6.0"
3939
typer = "^0.12.5"
4040
azure-cognitiveservices-speech = "^1.40.0"
4141
openai-whisper = "^20240930"
42+
pymupdf = "^1.24.14"
4243

4344
[tool.poetry.group.dev.dependencies]
4445
pre-commit = "^4.0.0"

0 commit comments

Comments
 (0)