Skip to content

Commit ac5fdea

Browse files
committed
fix: Improve error handling and warnings
1 parent 2308831 commit ac5fdea

File tree

2 files changed

+24
-2
lines changed

2 files changed

+24
-2
lines changed

agentic_rag/pdf_processor.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,11 @@
55
from docling.document_converter import DocumentConverter
66
from docling.chunking import HybridChunker
77
from urllib.parse import urlparse
8+
import warnings
9+
import transformers
10+
11+
# Suppress all UserWarnings raised from transformers.generation.utils (this includes the token length warning)
12+
warnings.filterwarnings('ignore', category=UserWarning, module='transformers.generation.utils')
813

914
def is_url(string: str) -> bool:
1015
"""Check if a string is a valid URL"""

agentic_rag/store.py

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,14 +22,31 @@ def __init__(self, persist_directory: str = "chroma_db"):
2222
metadata={"hnsw:space": "cosine"}
2323
)
2424

25+
def _sanitize_metadata(self, metadata: Dict) -> Dict:
26+
"""Sanitize metadata to ensure all values are valid types for ChromaDB"""
27+
sanitized = {}
28+
for key, value in metadata.items():
29+
if isinstance(value, (str, int, float, bool)):
30+
sanitized[key] = value
31+
elif isinstance(value, list):
32+
# Convert list to string representation
33+
sanitized[key] = str(value)
34+
elif value is None:
35+
# Replace None with empty string
36+
sanitized[key] = ""
37+
else:
38+
# Convert any other type to string
39+
sanitized[key] = str(value)
40+
return sanitized
41+
2542
def add_pdf_chunks(self, chunks: List[Dict[str, Any]], document_id: str):
2643
"""Add chunks from a PDF document to the vector store"""
2744
if not chunks:
2845
return
2946

3047
# Prepare data for ChromaDB
3148
texts = [chunk["text"] for chunk in chunks]
32-
metadatas = [chunk["metadata"] for chunk in chunks]
49+
metadatas = [self._sanitize_metadata(chunk["metadata"]) for chunk in chunks]
3350
ids = [f"{document_id}_{i}" for i in range(len(chunks))]
3451

3552
# Add to collection
@@ -46,7 +63,7 @@ def add_general_knowledge(self, chunks: List[Dict[str, Any]], source_id: str):
4663

4764
# Prepare data for ChromaDB
4865
texts = [chunk["text"] for chunk in chunks]
49-
metadatas = [chunk["metadata"] for chunk in chunks]
66+
metadatas = [self._sanitize_metadata(chunk["metadata"]) for chunk in chunks]
5067
ids = [f"{source_id}_{i}" for i in range(len(chunks))]
5168

5269
# Add to collection

0 commit comments

Comments
 (0)