Skip to content

Commit fc52183

Browse files
authored
Lorenze/fix duplicating doc ids for knowledge (#3840)
* fix: update document ID handling in ChromaDB utility functions to use SHA-256 hashing and include index for uniqueness
* test: add tests for hash-based ID generation in ChromaDB utility functions
* drop idx for preventing dups, upsert should handle dups
* fix: update document ID extraction logic in ChromaDB utility functions to check for doc_id at the top level of the document
* fix: enhance document ID generation in ChromaDB utility functions to deduplicate documents and ensure unique hash-based IDs without suffixes
* fix: improve error handling and document ID generation in ChromaDB utility functions to ensure robust processing and uniqueness
1 parent e4cc9a6 commit fc52183

File tree

2 files changed

+112
-21
lines changed

2 files changed

+112
-21
lines changed

lib/crewai/src/crewai/rag/chromadb/utils.py

Lines changed: 34 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -67,31 +67,44 @@ def _prepare_documents_for_chromadb(
6767
ids: list[str] = []
6868
texts: list[str] = []
6969
metadatas: list[Mapping[str, str | int | float | bool]] = []
70+
seen_ids: dict[str, int] = {}
71+
72+
try:
73+
for doc in documents:
74+
if "doc_id" in doc:
75+
doc_id = str(doc["doc_id"])
76+
else:
77+
metadata = doc.get("metadata")
78+
if metadata and isinstance(metadata, dict) and "doc_id" in metadata:
79+
doc_id = str(metadata["doc_id"])
80+
else:
81+
content_for_hash = doc["content"]
82+
if metadata:
83+
metadata_str = json.dumps(metadata, sort_keys=True)
84+
content_for_hash = f"{content_for_hash}|{metadata_str}"
85+
doc_id = hashlib.sha256(content_for_hash.encode()).hexdigest()
7086

71-
for doc in documents:
72-
if "doc_id" in doc:
73-
ids.append(doc["doc_id"])
74-
else:
75-
content_for_hash = doc["content"]
7687
metadata = doc.get("metadata")
7788
if metadata:
78-
metadata_str = json.dumps(metadata, sort_keys=True)
79-
content_for_hash = f"{content_for_hash}|{metadata_str}"
80-
81-
content_hash = hashlib.blake2b(
82-
content_for_hash.encode(), digest_size=32
83-
).hexdigest()
84-
ids.append(content_hash)
85-
86-
texts.append(doc["content"])
87-
metadata = doc.get("metadata")
88-
if metadata:
89-
if isinstance(metadata, list):
90-
metadatas.append(metadata[0] if metadata and metadata[0] else {})
89+
if isinstance(metadata, list):
90+
processed_metadata = metadata[0] if metadata and metadata[0] else {}
91+
else:
92+
processed_metadata = metadata
93+
else:
94+
processed_metadata = {}
95+
96+
if doc_id in seen_ids:
97+
idx = seen_ids[doc_id]
98+
texts[idx] = doc["content"]
99+
metadatas[idx] = processed_metadata
91100
else:
92-
metadatas.append(metadata)
93-
else:
94-
metadatas.append({})
101+
idx = len(ids)
102+
ids.append(doc_id)
103+
texts.append(doc["content"])
104+
metadatas.append(processed_metadata)
105+
seen_ids[doc_id] = idx
106+
except Exception as e:
107+
raise ValueError(f"Error preparing documents for ChromaDB: {e}") from e
95108

96109
return PreparedDocuments(ids, texts, metadatas)
97110

lib/crewai/tests/knowledge/test_knowledge.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -601,3 +601,81 @@ def test_file_path_validation():
601601
match="file_path/file_paths must be a Path, str, or a list of these types",
602602
):
603603
PDFKnowledgeSource()
604+
605+
606+
def test_hash_based_id_generation_without_doc_id(mock_vector_db):
607+
"""Test that documents without doc_id generate hash-based IDs. Duplicates are deduplicated before upsert."""
608+
import hashlib
609+
import json
610+
from crewai.rag.chromadb.utils import _prepare_documents_for_chromadb
611+
from crewai.rag.types import BaseRecord
612+
613+
documents: list[BaseRecord] = [
614+
{"content": "First document content", "metadata": {"source": "test1", "category": "research"}},
615+
{"content": "Second document content", "metadata": {"source": "test2", "category": "research"}},
616+
{"content": "Third document content"}, # No metadata
617+
]
618+
619+
result = _prepare_documents_for_chromadb(documents)
620+
621+
assert len(result.ids) == 3
622+
623+
# Unique documents should get 64-character hex hashes (no suffix)
624+
for doc_id in result.ids:
625+
assert len(doc_id) == 64, f"ID should be 64 characters: {doc_id}"
626+
assert all(c in "0123456789abcdef" for c in doc_id), f"ID should be hex: {doc_id}"
627+
628+
# Different documents should have different hashes
629+
assert result.ids[0] != result.ids[1] != result.ids[2]
630+
631+
# Verify hashes match expected values
632+
expected_hash_1 = hashlib.sha256(
633+
f"First document content|{json.dumps({'category': 'research', 'source': 'test1'}, sort_keys=True)}".encode()
634+
).hexdigest()
635+
assert result.ids[0] == expected_hash_1, "First document hash should match expected"
636+
637+
expected_hash_3 = hashlib.sha256("Third document content".encode()).hexdigest()
638+
assert result.ids[2] == expected_hash_3, "Third document hash should match expected"
639+
640+
# Test that duplicate documents are deduplicated (same ID, only one sent)
641+
duplicate_documents: list[BaseRecord] = [
642+
{"content": "Same content", "metadata": {"source": "test"}},
643+
{"content": "Same content", "metadata": {"source": "test"}},
644+
{"content": "Same content", "metadata": {"source": "test"}},
645+
]
646+
duplicate_result = _prepare_documents_for_chromadb(duplicate_documents)
647+
# Duplicates should be deduplicated - only one ID should remain
648+
assert len(duplicate_result.ids) == 1, "Duplicate documents should be deduplicated"
649+
assert len(duplicate_result.ids[0]) == 64, "Deduplicated ID should be clean hash"
650+
# Verify it's the expected hash
651+
expected_hash = hashlib.sha256(
652+
f"Same content|{json.dumps({'source': 'test'}, sort_keys=True)}".encode()
653+
).hexdigest()
654+
assert duplicate_result.ids[0] == expected_hash, "Deduplicated ID should match expected hash"
655+
656+
657+
def test_hash_based_id_generation_with_doc_id_in_metadata(mock_vector_db):
658+
"""Test that documents with doc_id in metadata use the doc_id directly, not hash-based."""
659+
from crewai.rag.chromadb.utils import _prepare_documents_for_chromadb
660+
from crewai.rag.types import BaseRecord
661+
662+
documents_with_doc_id: list[BaseRecord] = [
663+
{"content": "First document", "metadata": {"doc_id": "custom-id-1", "source": "test1"}},
664+
{"content": "Second document", "metadata": {"doc_id": "custom-id-2"}},
665+
]
666+
667+
documents_without_doc_id: list[BaseRecord] = [
668+
{"content": "First document", "metadata": {"source": "test1"}},
669+
{"content": "Second document"},
670+
]
671+
672+
result_with_doc_id = _prepare_documents_for_chromadb(documents_with_doc_id)
673+
result_without_doc_id = _prepare_documents_for_chromadb(documents_without_doc_id)
674+
675+
assert result_with_doc_id.ids == ["custom-id-1", "custom-id-2"]
676+
677+
assert len(result_without_doc_id.ids) == 2
678+
# Unique documents get 64-character hashes
679+
for doc_id in result_without_doc_id.ids:
680+
assert len(doc_id) == 64, "ID should be 64 characters"
681+
assert all(c in "0123456789abcdef" for c in doc_id), "ID should be hex"

0 commit comments

Comments (0)