@@ -22,14 +22,31 @@ def __init__(self, persist_directory: str = "chroma_db"):
22
22
metadata = {"hnsw:space" : "cosine" }
23
23
)
24
24
25
def _sanitize_metadata(self, metadata: Dict) -> Dict:
    """Build a copy of ``metadata`` holding only str, int, float or bool values.

    Values that already are one of those types are kept unchanged. ``None``
    is replaced with an empty string, and every other value (lists
    included) is replaced with its ``str()`` representation. Keys are
    copied through untouched and the input dict is not modified.

    Args:
        metadata: Mapping of metadata keys to arbitrary values.

    Returns:
        A new dict with the same keys and the coerced values.
    """
    primitive_types = (str, int, float, bool)

    def _coerce(item):
        # None gets a dedicated placeholder instead of the text "None".
        if item is None:
            return ""
        if isinstance(item, primitive_types):
            return item
        # Lists and any other objects fall back to their string form.
        return str(item)

    return {name: _coerce(item) for name, item in metadata.items()}
41
+
25
42
def add_pdf_chunks (self , chunks : List [Dict [str , Any ]], document_id : str ):
26
43
"""Add chunks from a PDF document to the vector store"""
27
44
if not chunks :
28
45
return
29
46
30
47
# Prepare data for ChromaDB
31
48
texts = [chunk ["text" ] for chunk in chunks ]
32
- metadatas = [chunk ["metadata" ] for chunk in chunks ]
49
+ metadatas = [self . _sanitize_metadata ( chunk ["metadata" ]) for chunk in chunks ]
33
50
ids = [f"{ document_id } _{ i } " for i in range (len (chunks ))]
34
51
35
52
# Add to collection
@@ -46,7 +63,7 @@ def add_general_knowledge(self, chunks: List[Dict[str, Any]], source_id: str):
46
63
47
64
# Prepare data for ChromaDB
48
65
texts = [chunk ["text" ] for chunk in chunks ]
49
- metadatas = [chunk ["metadata" ] for chunk in chunks ]
66
+ metadatas = [self . _sanitize_metadata ( chunk ["metadata" ]) for chunk in chunks ]
50
67
ids = [f"{ source_id } _{ i } " for i in range (len (chunks ))]
51
68
52
69
# Add to collection
0 commit comments