Skip to content

Commit d9efaf3

Browse files
committed
add CLI for managing vector store
1 parent 46b86e2 commit d9efaf3

File tree

8 files changed

+622
-103
lines changed

8 files changed

+622
-103
lines changed

data/contoso_rules.csv

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
rule
2+
社内会議は必ず出席してください
3+
遅刻は月に2回まで許されます
4+
社内の機密情報は外部に漏らさないでください
5+
社員同士のトラブルは人事部に相談してください
6+
病気で休む場合は、必ず診断書を提出してください
7+
社内のイベントには積極的に参加してください
8+
社外の人との会食は、事前に上司の承認を得てください
9+
社内のドレスコードはビジネスカジュアルです
10+
社内の清掃は各自で行ってください
11+
全社員は健康診断を年1回受ける必要があります
12+
プロジェクトの締め切りは厳守してください
13+
社内のセキュリティポリシーを遵守してください
14+
社内の設備故障は速やかに報告してください
15+
有給休暇は最大20日です
16+
毎週金曜日はノー残業デーです
17+
社内の図書は貸出期間を守って返却してください
18+
社内のインターネット利用は業務に関連するものに限ります
19+
有給休暇の申請は1ヶ月前までに行ってください
20+
一日8時間勤労してください
21+
社内の飲食スペース以外での飲食は禁止です

infra/main.bicep

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,9 @@ param iotHubName string = '${prefix}iothub'
4444
@description('Specifies the name of the Azure Log Analytics workspace.')
4545
param logAnalyticsWorkspaceName string = '${prefix}law'
4646

47+
@description('Specifies the name of the Azure AI Search resource.')
48+
param aiSearchName string = '${prefix}aisearch'
49+
4750
module openAi './modules/openAi.bicep' = {
4851
name: 'openAi'
4952
params: {
@@ -133,6 +136,15 @@ module logAnalytics './modules/logAnalytics.bicep' = {
133136
}
134137
}
135138

139+
module aiSearch './modules/aiSearch.bicep' = {
140+
name: 'aiSearch'
141+
params: {
142+
name: aiSearchName
143+
location: location
144+
tags: tags
145+
}
146+
}
147+
136148
// Output
137149
output cognitiveServicesName string = cognitiveServices.outputs.name
138150
output cognitiveServicesEndpoint string = cognitiveServices.outputs.endpoint

infra/modules/aiSearch.bicep

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
// Parameters
2+
@description('Specifies the name of the Azure AI Search service.')
3+
param name string
4+
5+
@description('Specifies the location.')
6+
param location string = resourceGroup().location
7+
8+
@description('Specifies the resource tags.')
9+
param tags object = {}
10+
11+
@description('SKU for the cognitive search service')
12+
@allowed([
13+
'basic'
14+
'free'
15+
'standard'
16+
'standard2'
17+
'standard3'
18+
'storage_optimized_l1'
19+
'storage_optimized_l2'
20+
])
21+
param sku string = 'basic'
22+
23+
@description('Authentication options for how the data plane API of a search service authenticates requests.')
24+
param authOptions object = {}
25+
26+
@allowed(['enabled', 'disabled'])
27+
param publicNetworkAccess string = 'enabled'
28+
29+
resource search 'Microsoft.Search/searchServices@2024-06-01-preview' = {
30+
name: name
31+
location: location
32+
tags: tags
33+
identity: {
34+
type: 'SystemAssigned'
35+
}
36+
properties: {
37+
authOptions: authOptions
38+
disableLocalAuth: false
39+
encryptionWithCmk: {
40+
enforcement: 'Unspecified'
41+
}
42+
hostingMode: 'default'
43+
networkRuleSet: {
44+
ipRules: []
45+
}
46+
partitionCount: 1
47+
publicNetworkAccess: publicNetworkAccess
48+
replicaCount: 1
49+
}
50+
sku: {
51+
name: sku
52+
}
53+
}
54+
55+
output id string = search.id
56+
output name string = search.name

poetry.lock

Lines changed: 358 additions & 103 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,15 @@ openai = "^1.30.5"
6161
[tool.poetry.group.azure-functions.dependencies]
6262
azure-functions = "^1.19.0"
6363

64+
65+
[tool.poetry.group.scripts.dependencies]
66+
pymupdf = "^1.24.7"
67+
faiss-cpu = "^1.8.0.post1"
68+
langchain = "^0.2.10"
69+
langchain-openai = "^0.1.17"
70+
pandas = "^2.2.2"
71+
azure-search-documents = "^11.5.0"
72+
6473
[build-system]
6574
requires = ["poetry-core"]
6675
build-backend = "poetry.core.masonry.api"

scripts/README.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
## How to use the scripts
2+
3+
```shell
4+
# Install dependencies
5+
make install-deps-dev
6+
7+
# Help
8+
poetry run python scripts/manage_vector_store.py --help
9+
10+
# Create a new vector store locally
11+
poetry run python scripts/manage_vector_store.py create-vector-store
12+
```
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Azure OpenAI
2+
AZURE_OPENAI_ENDPOINT = "https://<your-aoai-name>.openai.azure.com/"
3+
AZURE_OPENAI_API_KEY = "<api-key>"
4+
AZURE_OPENAI_API_VERSION = "2024-05-01-preview"
5+
AZURE_OPENAI_MODEL_EMBEDDING = "text-embedding-3-large"
6+
AZURE_OPENAI_MODEL_CHAT = "gpt-4o"
7+
8+
# Azure AI Search
9+
AZURE_AI_SEARCH_ENDPOINT = "https://<your-aisearch-name>.search.windows.net"
10+
AZURE_AI_SEARCH_API_KEY = "<api-key>"

scripts/manage_vector_store.py

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
import logging
2+
from enum import Enum
3+
from os import getenv
4+
from pprint import pprint
5+
from typing import Annotated
6+
7+
import typer
8+
from dotenv import load_dotenv
9+
from langchain_community.document_loaders.csv_loader import CSVLoader
10+
from langchain_community.vectorstores import FAISS, VectorStore
11+
from langchain_community.vectorstores.azuresearch import AzureSearch
12+
from langchain_core.documents import Document
13+
from langchain_openai import AzureOpenAIEmbeddings
14+
15+
# Typer application object; the @app.command() functions in this file register their CLI commands on it.
app = typer.Typer()
16+
17+
18+
class VectorStoreType(str, Enum):
    """Vector store backends that the CLI commands can target.

    Members are also ``str`` instances, so each one compares equal to its
    lowercase value.
    """

    # Index served by an Azure AI Search endpoint.
    AzureAISearch = "azureaisearch"
    # FAISS index saved to / loaded from a local folder.
    Faiss = "faiss"
21+
22+
23+
def get_log_level(debug: bool) -> int:
    """Map the ``debug`` flag to a ``logging`` level constant.

    Returns ``logging.DEBUG`` when ``debug`` is truthy, otherwise
    ``logging.INFO``.
    """
    if debug:
        return logging.DEBUG
    return logging.INFO
25+
26+
27+
def setup_logging(debug: bool = False):
    """Configure the root logger for this script.

    The level comes from ``get_log_level(debug)``. ``force=True`` is passed so
    that handlers already attached to the root logger are replaced rather than
    leaving the call without effect.
    """
    log_format = "[%(asctime)s] %(levelname)7s from %(name)s in %(pathname)s:%(lineno)d: %(message)s"
    logging.basicConfig(format=log_format, level=get_log_level(debug), force=True)
33+
34+
35+
def get_embeddings():
    """Build the Azure OpenAI embeddings client from environment variables.

    Reads ``AZURE_OPENAI_API_KEY``, ``AZURE_OPENAI_API_VERSION``,
    ``AZURE_OPENAI_ENDPOINT`` and ``AZURE_OPENAI_MODEL_EMBEDDING``. A variable
    that is not set is forwarded as ``None``.
    """
    api_key = getenv("AZURE_OPENAI_API_KEY")
    api_version = getenv("AZURE_OPENAI_API_VERSION")
    endpoint = getenv("AZURE_OPENAI_ENDPOINT")
    embedding_model = getenv("AZURE_OPENAI_MODEL_EMBEDDING")
    return AzureOpenAIEmbeddings(
        api_key=api_key,
        api_version=api_version,
        azure_endpoint=endpoint,
        model=embedding_model,
    )
42+
43+
44+
def get_local_vector_store_path(identifier: str):
    """Return the relative folder path used for the local vector store named ``identifier``."""
    return "./artifacts/vectorstore/{}".format(identifier)
46+
47+
48+
def create_azure_search(index_name: str) -> AzureSearch:
    """Build an ``AzureSearch`` vector store bound to ``index_name``.

    The endpoint and key are read from ``AZURE_AI_SEARCH_ENDPOINT`` and
    ``AZURE_AI_SEARCH_API_KEY``. Queries are embedded with the ``embed_query``
    method of the client returned by ``get_embeddings()``.
    """
    endpoint = getenv("AZURE_AI_SEARCH_ENDPOINT")
    api_key = getenv("AZURE_AI_SEARCH_API_KEY")
    embed_query = get_embeddings().embed_query
    # Extra options forwarded to the underlying search client.
    client_options = {"retry_total": 4}
    return AzureSearch(
        azure_search_endpoint=endpoint,
        azure_search_key=api_key,
        index_name=index_name,
        embedding_function=embed_query,
        additional_search_client_options=client_options,
    )
56+
57+
58+
def get_vector_store(
    vector_store_type: VectorStoreType,
    identifier: str,
) -> VectorStore:
    """Open an existing vector store for querying.

    Args:
        vector_store_type: Backend to open.
        identifier: Azure AI Search index name, or the name of the local
            FAISS folder (resolved via ``get_local_vector_store_path``).

    Returns:
        The vector store for the requested backend.

    Raises:
        ValueError: If ``vector_store_type`` is not a supported backend.
    """
    if vector_store_type == VectorStoreType.AzureAISearch:
        logging.info("Connecting to Azure AI Search vector store")
        return create_azure_search(identifier)
    if vector_store_type == VectorStoreType.Faiss:
        logging.info("Loading Faiss vector store")
        # NOTE(security): loading a saved FAISS store deserializes pickled data,
        # which is why allow_dangerous_deserialization has to be set. Only load
        # folders written by this script or another trusted source.
        return FAISS.load_local(
            folder_path=get_local_vector_store_path(identifier),
            embeddings=get_embeddings(),
            allow_dangerous_deserialization=True,
        )
    # Fail loudly instead of implicitly returning None to the caller.
    raise ValueError(f"Unsupported vector store type: {vector_store_type!r}")
72+
73+
74+
def _create_vector_store(
    vector_store_type: VectorStoreType,
    identifier: str,
    documents: list[Document],
) -> VectorStore:
    """Create a vector store from ``documents`` and persist it.

    Args:
        vector_store_type: Backend to create.
        identifier: Azure AI Search index name, or the name of the local
            FAISS folder (resolved via ``get_local_vector_store_path``).
        documents: Documents to embed and store.

    Returns:
        The populated vector store, as promised by the return annotation.

    Raises:
        ValueError: If ``vector_store_type`` is not a supported backend.
    """
    if vector_store_type == VectorStoreType.AzureAISearch:
        logging.info("Creating Azure AI Search vector store")
        azure_store = create_azure_search(identifier)
        azure_store.add_documents(documents=documents)
        return azure_store
    if vector_store_type == VectorStoreType.Faiss:
        logging.info("Creating Faiss vector store")
        faiss_store = FAISS.from_documents(
            documents=documents,
            embedding=get_embeddings(),
        )
        # Persist to disk so that a later run can load it again.
        faiss_store.save_local(folder_path=get_local_vector_store_path(identifier))
        return faiss_store
    # Fail loudly instead of silently doing nothing for an unknown backend.
    raise ValueError(f"Unsupported vector store type: {vector_store_type!r}")
92+
93+
94+
# NOTE: Typer shows the function docstring as the command's --help text, so it is kept short.
@app.command()
def create_vector_store(
    input_csv_file_path: Annotated[str, typer.Option(help="Path to the input CSV file")] = "./data/contoso_rules.csv",
    identifier: str = "contoso_rules",
    vector_store_type: Annotated[VectorStoreType, typer.Option(case_sensitive=False)] = VectorStoreType.Faiss,
    debug: Annotated[bool, typer.Option(help="Enable debug mode")] = False,
):
    """Create a vector store from the rows of a CSV file."""
    setup_logging(debug)

    # Load documents from CSV
    try:
        documents = CSVLoader(file_path=input_csv_file_path).load()
    except Exception as e:
        # Broad catch is deliberate at this CLI boundary: report the problem and
        # exit with a non-zero status so shells and CI can detect the failure.
        logging.error(f"Failed to load documents from CSV: {e}")
        raise typer.Exit(code=1) from e

    # Create vector store
    _create_vector_store(
        vector_store_type=vector_store_type,
        identifier=identifier,
        documents=documents,
    )
116+
117+
118+
# NOTE: Typer shows the function docstring as the command's --help text, so it is kept short.
@app.command()
def search(
    identifier: str = "contoso_rules",
    vector_store_type: Annotated[VectorStoreType, typer.Option(case_sensitive=False)] = VectorStoreType.Faiss,
    query: Annotated[str, typer.Option(help="Query to search")] = "社内の機密情報は外部に漏らさないでください",
    debug: Annotated[bool, typer.Option(help="Enable debug mode")] = False,
    k: Annotated[int, typer.Option(help="Maximum number of results to return")] = 5,
    score_threshold: Annotated[float, typer.Option(help="Minimum relevance score of a result")] = 0.5,
):
    """Run a similarity search against an existing vector store and print the results."""
    setup_logging(debug)

    # Open the existing vector store (nothing is created here)
    vector_store = get_vector_store(
        vector_store_type=vector_store_type,
        identifier=identifier,
    )

    # Search; the defaults for k and score_threshold match the previously hard-coded values
    result = vector_store.similarity_search_with_relevance_scores(
        query=query,
        k=k,
        score_threshold=score_threshold,
    )
    pprint(result)
140+
141+
142+
# Script entry point: only runs when the file is executed directly, not when imported.
if __name__ == "__main__":
    # Load settings via python-dotenv first; the helper functions above read
    # their configuration with getenv() when a command runs.
    load_dotenv()
    # Hand control to Typer, which parses the command line and dispatches the command.
    app()

0 commit comments

Comments
 (0)