Skip to content

Commit 7d98e5e

Browse files
authored
Merge pull request #130 from ks6088ts-labs/feature/issue-126_support-cosmosdb
Support Azure Cosmos DB for NoSQL
2 parents 30b5be6 + 3148515 commit 7d98e5e

File tree

6 files changed

+107
-29
lines changed

6 files changed

+107
-29
lines changed

infra/modules/cosmosDb.bicep

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ var locations = [
7272
}
7373
]
7474

75-
resource account 'Microsoft.DocumentDB/databaseAccounts@2024-02-15-preview' = {
75+
resource account 'Microsoft.DocumentDB/databaseAccounts@2024-05-15' = {
7676
name: accountName
7777
location: location
7878
tags: tags
@@ -82,6 +82,11 @@ resource account 'Microsoft.DocumentDB/databaseAccounts@2024-02-15-preview' = {
8282
locations: locations
8383
databaseAccountOfferType: 'Standard'
8484
enableAutomaticFailover: systemManagedFailover
85+
capabilities: [
86+
{
87+
name: 'EnableNoSQLVectorSearch'
88+
}
89+
]
8590
}
8691
}
8792

poetry.lock

Lines changed: 10 additions & 10 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,8 +67,10 @@ pymupdf = "^1.24.7"
6767
faiss-cpu = "^1.8.0.post1"
6868
langchain = "^0.2.10"
6969
langchain-openai = "^0.1.17"
70+
langchain-community = "^0.2.9"
7071
pandas = "^2.2.2"
7172
azure-search-documents = "^11.5.0"
73+
azure-cosmos = "^4.7.0"
7274

7375
[build-system]
7476
requires = ["poetry-core"]

scripts/README.md

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,11 @@ make install-deps-dev
77
# Help
88
poetry run python scripts/manage_vector_store.py --help
99

10+
VECTOR_STORE_TYPE=faiss
11+
1012
# Create a new vector store locally
11-
poetry run python scripts/manage_vector_store.py create-vector-store
13+
poetry run python scripts/manage_vector_store.py create --vector-store-type $VECTOR_STORE_TYPE
14+
15+
# Search for similar vectors
16+
poetry run python scripts/manage_vector_store.py search --vector-store-type $VECTOR_STORE_TYPE
1217
```

scripts/manage_vector_store.env.sample

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,7 @@ AZURE_OPENAI_MODEL_CHAT = "gpt-4o"
88
# Azure AI Search
99
AZURE_AI_SEARCH_ENDPOINT = "https://<your-aisearch-name>.search.windows.net"
1010
AZURE_AI_SEARCH_API_KEY = "<api-key>"
11+
12+
# Azure Cosmos DB
13+
AZURE_COSMOSDB_ENDPOINT = "https://<your-cosmosdb-name>.documents.azure.com:443/"
14+
AZURE_COSMOSDB_KEY = "<api-key>"

scripts/manage_vector_store.py

Lines changed: 79 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,11 @@
55
from typing import Annotated
66

77
import typer
8+
from azure.cosmos import CosmosClient, PartitionKey
89
from dotenv import load_dotenv
910
from langchain_community.document_loaders.csv_loader import CSVLoader
1011
from langchain_community.vectorstores import FAISS, VectorStore
12+
from langchain_community.vectorstores.azure_cosmos_db_no_sql import AzureCosmosDBNoSqlVectorSearch
1113
from langchain_community.vectorstores.azuresearch import AzureSearch
1214
from langchain_core.documents import Document
1315
from langchain_openai import AzureOpenAIEmbeddings
@@ -16,8 +18,9 @@
1618

1719

1820
class VectorStoreType(str, Enum):
19-
AzureAISearch = "azureaisearch"
2021
Faiss = "faiss"
22+
AzureAISearch = "azureaisearch"
23+
AzureCosmosDbNoSql = "azurecosmosdbnosql"
2124

2225

2326
def get_log_level(debug: bool) -> int:
@@ -41,10 +44,39 @@ def get_embeddings():
4144
)
4245

4346

47+
def get_cosmos_client():
48+
return CosmosClient(
49+
url=getenv("AZURE_COSMOSDB_ENDPOINT"),
50+
credential=getenv("AZURE_COSMOSDB_KEY"),
51+
)
52+
53+
4454
def get_local_vector_store_path(identifier: str):
4555
return f"./artifacts/vectorstore/{identifier}"
4656

4757

58+
def get_vector_embedding_policy():
59+
return {
60+
"vectorEmbeddings": [
61+
{
62+
"path": "/embedding",
63+
"dataType": "float32",
64+
"distanceFunction": "cosine",
65+
"dimensions": 1536,
66+
}
67+
]
68+
}
69+
70+
71+
def get_indexing_policy():
72+
return {
73+
"indexingMode": "consistent",
74+
"includedPaths": [{"path": "/*"}],
75+
"excludedPaths": [{"path": '/"_etag"/?'}],
76+
"vectorIndexes": [{"path": "/embedding", "type": "quantizedFlat"}],
77+
}
78+
79+
4880
def create_azure_search(index_name: str) -> AzureSearch:
4981
return AzureSearch(
5082
azure_search_endpoint=getenv("AZURE_AI_SEARCH_ENDPOINT"),
@@ -59,40 +91,69 @@ def get_vector_store(
5991
vector_store_type: VectorStoreType,
6092
identifier: str,
6193
) -> VectorStore:
62-
if vector_store_type == VectorStoreType.AzureAISearch:
63-
logging.info("Creating Azure AI Search vector store")
64-
return create_azure_search(identifier)
65-
elif vector_store_type == VectorStoreType.Faiss:
94+
if vector_store_type == VectorStoreType.Faiss:
6695
logging.info("Creating Faiss vector store")
6796
return FAISS.load_local(
6897
folder_path=get_local_vector_store_path(identifier),
6998
embeddings=get_embeddings(),
7099
allow_dangerous_deserialization=True,
71100
)
101+
elif vector_store_type == VectorStoreType.AzureAISearch:
102+
logging.info("Creating Azure AI Search vector store")
103+
return create_azure_search(identifier)
104+
elif vector_store_type == VectorStoreType.AzureCosmosDbNoSql:
105+
logging.info("Creating Azure Cosmos DB NoSQL vector store")
106+
cosmos_database_name = "langchain_python_db"
107+
return AzureCosmosDBNoSqlVectorSearch(
108+
cosmos_client=get_cosmos_client(),
109+
embedding=get_embeddings(),
110+
vector_embedding_policy=get_vector_embedding_policy(),
111+
indexing_policy=get_indexing_policy(),
112+
cosmos_container_properties={"partition_key": PartitionKey(path="/id")},
113+
cosmos_database_properties={"id": cosmos_database_name},
114+
database_name=cosmos_database_name,
115+
container_name="langchain_python_container",
116+
)
72117

73118

74-
def _create_vector_store(
119+
def create_vector_store(
75120
vector_store_type: VectorStoreType,
76121
identifier: str,
77122
documents: list[Document],
78123
) -> VectorStore:
79-
if vector_store_type == VectorStoreType.AzureAISearch:
124+
if vector_store_type == VectorStoreType.Faiss:
125+
logging.info("Creating Faiss vector store")
126+
vector_store: FAISS = FAISS.from_documents(
127+
documents=documents,
128+
embedding=get_embeddings(),
129+
)
130+
vector_store.save_local(folder_path=get_local_vector_store_path(identifier))
131+
return
132+
elif vector_store_type == VectorStoreType.AzureAISearch:
80133
logging.info("Creating Azure AI Search vector store")
81134
vector_store = create_azure_search(identifier)
82135
vector_store.add_documents(documents=documents)
83136
return
84-
elif vector_store_type == VectorStoreType.Faiss:
85-
logging.info("Creating Faiss vector store")
86-
vector_store: FAISS = FAISS.from_documents(
137+
elif vector_store_type == VectorStoreType.AzureCosmosDbNoSql:
138+
logging.info("Creating Azure Cosmos DB NoSQL vector store")
139+
cosmos_database_name = "langchain_python_db"
140+
141+
AzureCosmosDBNoSqlVectorSearch.from_documents(
87142
documents=documents,
88143
embedding=get_embeddings(),
144+
cosmos_client=get_cosmos_client(),
145+
database_name=cosmos_database_name,
146+
container_name="langchain_python_container",
147+
vector_embedding_policy=get_vector_embedding_policy(),
148+
indexing_policy=get_indexing_policy(),
149+
cosmos_database_properties={"id": cosmos_database_name},
150+
cosmos_container_properties={"partition_key": PartitionKey(path="/id")},
89151
)
90-
vector_store.save_local(folder_path=get_local_vector_store_path(identifier))
91152
return
92153

93154

94155
@app.command()
95-
def create_vector_store(
156+
def create(
96157
input_csv_file_path: Annotated[str, typer.Option(help="Path to the input CSV file")] = "./data/contoso_rules.csv",
97158
identifier="contoso_rules",
98159
vector_store_type: Annotated[VectorStoreType, typer.Option(case_sensitive=False)] = VectorStoreType.Faiss,
@@ -108,7 +169,7 @@ def create_vector_store(
108169
return
109170

110171
# Create vector store
111-
_create_vector_store(
172+
create_vector_store(
112173
vector_store_type=vector_store_type,
113174
identifier=identifier,
114175
documents=documents,
@@ -120,6 +181,7 @@ def search(
120181
identifier="contoso_rules",
121182
vector_store_type: Annotated[VectorStoreType, typer.Option(case_sensitive=False)] = VectorStoreType.Faiss,
122183
query: Annotated[str, typer.Option(help="Query to search")] = "社内の機密情報は外部に漏らさないでください",
184+
k: Annotated[int, typer.Option(help="Number of documents to retrieve")] = 5,
123185
debug: Annotated[bool, typer.Option(help="Enable debug mode")] = False,
124186
):
125187
setup_logging(debug)
@@ -131,12 +193,12 @@ def search(
131193
)
132194

133195
# Search
134-
result = vector_store.similarity_search_with_relevance_scores(
196+
got_documents = vector_store.similarity_search(
135197
query=query,
136-
k=5,
137-
score_threshold=0.5,
198+
k=k,
138199
)
139-
pprint(result)
200+
for document in got_documents:
201+
pprint(document.page_content)
140202

141203

142204
if __name__ == "__main__":

0 commit comments

Comments
 (0)