Skip to content

Commit 2d1c9ca

Browse files
committed
add documents to qdrant
1 parent 3520c3e commit 2d1c9ca

File tree

11 files changed

+247
-0
lines changed

11 files changed

+247
-0
lines changed

.env.template

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,15 @@
11
# Project environment variables
22
PROJECT_NAME=template-langgraph
3+
4+
# Azure OpenAI Service
5+
AZURE_OPENAI_ENDPOINT="https://YOUR_AZURE_OPENAI_ENDPOINT/"
6+
AZURE_OPENAI_API_KEY="YOUR_AZURE_OPENAI_API_KEY"
7+
AZURE_OPENAI_API_VERSION="2024-10-21"
8+
AZURE_OPENAI_MODEL_CHAT="gpt-4o"
9+
AZURE_OPENAI_MODEL_EMBEDDING="text-embedding-3-small"
10+
11+
# CSV Loader Settings
12+
CSV_LOADER_DATA_DIR_PATH="./data"
13+
14+
# Qdrant Settings
15+
QDRANT_URL="http://localhost:6333"

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,3 +162,5 @@ cython_debug/
162162
# Project
163163
*.env
164164
requirements.txt
165+
assets/
166+
data/

docker-compose.yml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
services:
2+
qdrant:
3+
image: qdrant/qdrant:v1.15.1
4+
container_name: qdrant
5+
ports:
6+
- "6333:6333" # Dashboard: http://localhost:6333/dashboard
7+
volumes:
8+
- ./assets/qdrant_data:/qdrant/storage

docs/index.md

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,19 @@
11
# template-langgraph
2+
3+
## Operations
4+
5+
```bash
6+
# Add documents to Qdrant
7+
uv run python -m template_langgraph.tasks.add_documents_to_qdrant
8+
```
9+
10+
## References
11+
12+
### Models
13+
14+
- [AzureOpenAIEmbeddings](https://python.langchain.com/docs/integrations/text_embedding/azureopenai/)
15+
16+
### Tools
17+
18+
- [CSVLoader](https://python.langchain.com/docs/how_to/document_loader_csv/)
19+
- [Qdrant](https://github.com/qdrant/qdrant)

template_langgraph/models/__init__.py

Whitespace-only changes.
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
from functools import lru_cache
2+
3+
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
4+
from pydantic_settings import BaseSettings, SettingsConfigDict
5+
6+
7+
class Settings(BaseSettings):
    """Azure OpenAI connection settings.

    The defaults below are placeholders / sample values. The model config
    points at a ``.env`` file, ignores empty environment values and ignores
    keys that are not declared as fields here.
    """

    model_config = SettingsConfigDict(env_file=".env", env_ignore_empty=True, extra="ignore")

    # Service endpoint and credentials.
    azure_openai_endpoint: str = "https://<YOUR_AOAI_NAME>.openai.azure.com/"
    azure_openai_api_key: str = "<YOUR_API_KEY>"
    azure_openai_api_version: str = "2024-10-21"

    # Names passed as ``azure_deployment`` for the chat and embedding clients.
    azure_openai_model_chat: str = "gpt-4o"
    azure_openai_model_embedding: str = "text-embedding-3-small"
19+
20+
21+
@lru_cache
def get_azure_openai_settings() -> Settings:
    """Return the Azure OpenAI settings.

    The function takes no arguments and is wrapped in ``lru_cache``, so the
    ``Settings`` object is built on the first call and reused afterwards.
    """
    settings = Settings()
    return settings
24+
25+
26+
class AzureOpenAiWrapper:
    """Hold an Azure OpenAI chat model and embedding model built from one ``Settings``."""

    def __init__(self, settings: Settings = None):
        """Create the chat and embedding clients.

        Args:
            settings: Connection settings. When ``None``, the cached result of
                ``get_azure_openai_settings()`` is used instead.
        """
        cfg = get_azure_openai_settings() if settings is None else settings

        # Connection arguments that are identical for both clients.
        connection = {
            "azure_endpoint": cfg.azure_openai_endpoint,
            "api_key": cfg.azure_openai_api_key,
            "api_version": cfg.azure_openai_api_version,
        }

        self.chat_model = AzureChatOpenAI(
            **connection,
            azure_deployment=cfg.azure_openai_model_chat,
            temperature=0.0,
            streaming=True,
        )
        self.embedding_model = AzureOpenAIEmbeddings(
            **connection,
            azure_deployment=cfg.azure_openai_model_embedding,
        )

    def create_embedding(self, text: str):
        """Create an embedding for ``text`` via the embedding model's ``embed_query``."""
        vector = self.embedding_model.embed_query(text)
        return vector

template_langgraph/tasks/__init__.py

Whitespace-only changes.
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
import logging
2+
3+
from qdrant_client.models import PointStruct
4+
5+
from template_langgraph.loggers import get_logger
6+
from template_langgraph.models.azure_openais import AzureOpenAiWrapper
7+
from template_langgraph.tools.csv_loaders import CsvLoaderWrapper
8+
from template_langgraph.tools.qdrants import QdrantClientWrapper
9+
10+
logger = get_logger(__name__)
logger.setLevel(logging.INFO)

# Hardcoded collection name for demonstration purposes.
COLLECTION_NAME = "documents"


def main() -> None:
    """Embed every loaded CSV document and upsert it into the Qdrant collection.

    Steps: load documents through ``CsvLoaderWrapper``, embed each one with
    ``AzureOpenAiWrapper.create_embedding``, create ``COLLECTION_NAME`` through
    ``QdrantClientWrapper`` and upsert one point per document. The point id is
    the document's position in the loaded list.
    """
    # Load documents from CSV files
    documents = CsvLoaderWrapper().load_csv_docs()
    logger.info(f"Loaded {len(documents)} documents from CSV.")

    logger.info(f"Upserting {len(documents)} documents into Qdrant collection: {COLLECTION_NAME}")

    # Build the Azure OpenAI clients once; they do not depend on the document,
    # so constructing the wrapper inside the loop would only repeat the setup.
    azure_openai = AzureOpenAiWrapper()

    points = []
    for i, doc in enumerate(documents):
        logger.debug(f"Processing document {i}: {doc.metadata.get('source', 'unknown')}")
        # Every space character is removed before embedding; the stripped text
        # is also what gets stored in the payload.
        content = doc.page_content.replace(" ", "")
        embedding = azure_openai.create_embedding(content)
        points.append(
            PointStruct(
                id=i,
                vector=embedding,
                payload={
                    "file_name": doc.metadata.get("source", f"doc_{i}"),
                    "content": content,
                },
            )
        )

    qdrant_client = QdrantClientWrapper()
    qdrant_client.create_collection(
        collection_name=COLLECTION_NAME,
        # Infer the dimension from the first embedding when one exists.
        vector_size=len(points[0].vector) if points else 1536,  # default vector size
    )

    logger.info(f"Created Qdrant collection: {COLLECTION_NAME}")
    qdrant_client.upsert_points(
        collection_name=COLLECTION_NAME,
        points=points,
    )
    logger.info(f"Upserted {len(points)} points into Qdrant collection: {COLLECTION_NAME}")


if __name__ == "__main__":
    main()

template_langgraph/tools/__init__.py

Whitespace-only changes.
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
import os
2+
from functools import lru_cache
3+
from glob import glob
4+
5+
from langchain_community.document_loaders.csv_loader import CSVLoader
6+
from langchain_core.documents import Document
7+
from pydantic_settings import BaseSettings, SettingsConfigDict
8+
9+
10+
class Settings(BaseSettings):
    """CSV loader settings.

    The model config points at a ``.env`` file, ignores empty environment
    values and ignores keys that are not declared as fields here.
    """

    model_config = SettingsConfigDict(env_file=".env", env_ignore_empty=True, extra="ignore")

    # Directory that is searched for CSV files.
    csv_loader_data_dir_path: str = "./data"
18+
19+
20+
@lru_cache
def get_csv_loader_settings() -> Settings:
    """Return the CSV loader settings.

    The function takes no arguments and is wrapped in ``lru_cache``, so the
    ``Settings`` object is built on the first call and reused afterwards.
    """
    settings = Settings()
    return settings
24+
25+
26+
class CsvLoaderWrapper:
    """Load the CSV files found under the configured data directory."""

    def __init__(
        self,
        settings: Settings = None,
    ):
        """Store the loader settings.

        Args:
            settings: Loader settings. When ``None``, the cached result of
                ``get_csv_loader_settings()`` is used instead.
        """
        if settings is None:
            settings = get_csv_loader_settings()
        self.settings = settings

    def load_csv_docs(self) -> list[Document]:
        """Load CSV documents from the specified directory.

        The directory ``settings.csv_loader_data_dir_path`` is searched
        recursively for ``*.csv`` files, and each file is read with
        ``CSVLoader``.

        Returns:
            The documents produced by ``CSVLoader.load()`` for every matched
            file, concatenated in sorted path order. Empty when no file
            matches.
        """
        pattern = os.path.join(self.settings.csv_loader_data_dir_path, "**", "*.csv")
        # glob() yields matches in arbitrary order; sort them so the returned
        # document order is reproducible across runs and platforms.
        csv_paths = sorted(glob(pattern, recursive=True))

        docs: list[Document] = []
        for path in csv_paths:
            docs.extend(CSVLoader(file_path=path).load())

        return docs

0 commit comments

Comments
 (0)