Skip to content

Commit fc52183

Browse files
authored
Lorenze/fix duplicating doc ids for knowledge (#3840)
* fix: update document ID handling in ChromaDB utility functions to use SHA-256 hashing and include index for uniqueness
* test: add tests for hash-based ID generation in ChromaDB utility functions
* drop idx for preventing dups, upsert should handle dups
* fix: update document ID extraction logic in ChromaDB utility functions to check for doc_id at the top level of the document
* fix: enhance document ID generation in ChromaDB utility functions to deduplicate documents and ensure unique hash-based IDs without suffixes
* fix: improve error handling and document ID generation in ChromaDB utility functions to ensure robust processing and uniqueness
1 parent e4cc9a6 commit fc52183

File tree

2 files changed

+112
-21
lines changed

2 files changed

+112
-21
lines changed

lib/crewai/src/crewai/rag/chromadb/utils.py

Lines changed: 34 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -67,31 +67,44 @@ def _prepare_documents_for_chromadb(
6767
ids: list[str] = []
6868
texts: list[str] = []
6969
metadatas: list[Mapping[str, str | int | float | bool]] = []
70+
seen_ids: dict[str, int] = {}
71+
72+
try:
73+
for doc in documents:
74+
if "doc_id" in doc:
75+
doc_id = str(doc["doc_id"])
76+
else:
77+
metadata = doc.get("metadata")
78+
if metadata and isinstance(metadata, dict) and "doc_id" in metadata:
79+
doc_id = str(metadata["doc_id"])
80+
else:
81+
content_for_hash = doc["content"]
82+
if metadata:
83+
metadata_str = json.dumps(metadata, sort_keys=True)
84+
content_for_hash = f"{content_for_hash}|{metadata_str}"
85+
doc_id = hashlib.sha256(content_for_hash.encode()).hexdigest()
7086

71-
for doc in documents:
72-
if "doc_id" in doc:
73-
ids.append(doc["doc_id"])
74-
else:
75-
content_for_hash = doc["content"]
7687
metadata = doc.get("metadata")
7788
if metadata:
78-
metadata_str = json.dumps(metadata, sort_keys=True)
79-
content_for_hash = f"{content_for_hash}|{metadata_str}"
80-
81-
content_hash = hashlib.blake2b(
82-
content_for_hash.encode(), digest_size=32
83-
).hexdigest()
84-
ids.append(content_hash)
85-
86-
texts.append(doc["content"])
87-
metadata = doc.get("metadata")
88-
if metadata:
89-
if isinstance(metadata, list):
90-
metadatas.append(metadata[0] if metadata and metadata[0] else {})
89+
if isinstance(metadata, list):
90+
processed_metadata = metadata[0] if metadata and metadata[0] else {}
91+
else:
92+
processed_metadata = metadata
93+
else:
94+
processed_metadata = {}
95+
96+
if doc_id in seen_ids:
97+
idx = seen_ids[doc_id]
98+
texts[idx] = doc["content"]
99+
metadatas[idx] = processed_metadata
91100
else:
92-
metadatas.append(metadata)
93-
else:
94-
metadatas.append({})
101+
idx = len(ids)
102+
ids.append(doc_id)
103+
texts.append(doc["content"])
104+
metadatas.append(processed_metadata)
105+
seen_ids[doc_id] = idx
106+
except Exception as e:
107+
raise ValueError(f"Error preparing documents for ChromaDB: {e}") from e
95108

96109
return PreparedDocuments(ids, texts, metadatas)
97110

lib/crewai/tests/knowledge/test_knowledge.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -601,3 +601,81 @@ def test_file_path_validation():
601601
match="file_path/file_paths must be a Path, str, or a list of these types",
602602
):
603603
PDFKnowledgeSource()
604+
605+
606+
def test_hash_based_id_generation_without_doc_id(mock_vector_db):
607+
"""Test that documents without doc_id generate hash-based IDs. Duplicates are deduplicated before upsert."""
608+
import hashlib
609+
import json
610+
from crewai.rag.chromadb.utils import _prepare_documents_for_chromadb
611+
from crewai.rag.types import BaseRecord
612+
613+
documents: list[BaseRecord] = [
614+
{"content": "First document content", "metadata": {"source": "test1", "category": "research"}},
615+
{"content": "Second document content", "metadata": {"source": "test2", "category": "research"}},
616+
{"content": "Third document content"}, # No metadata
617+
]
618+
619+
result = _prepare_documents_for_chromadb(documents)
620+
621+
assert len(result.ids) == 3
622+
623+
# Unique documents should get 64-character hex hashes (no suffix)
624+
for doc_id in result.ids:
625+
assert len(doc_id) == 64, f"ID should be 64 characters: {doc_id}"
626+
assert all(c in "0123456789abcdef" for c in doc_id), f"ID should be hex: {doc_id}"
627+
628+
# Different documents should have different hashes
629+
assert result.ids[0] != result.ids[1] != result.ids[2]
630+
631+
# Verify hashes match expected values
632+
expected_hash_1 = hashlib.sha256(
633+
f"First document content|{json.dumps({'category': 'research', 'source': 'test1'}, sort_keys=True)}".encode()
634+
).hexdigest()
635+
assert result.ids[0] == expected_hash_1, "First document hash should match expected"
636+
637+
expected_hash_3 = hashlib.sha256("Third document content".encode()).hexdigest()
638+
assert result.ids[2] == expected_hash_3, "Third document hash should match expected"
639+
640+
# Test that duplicate documents are deduplicated (same ID, only one sent)
641+
duplicate_documents: list[BaseRecord] = [
642+
{"content": "Same content", "metadata": {"source": "test"}},
643+
{"content": "Same content", "metadata": {"source": "test"}},
644+
{"content": "Same content", "metadata": {"source": "test"}},
645+
]
646+
duplicate_result = _prepare_documents_for_chromadb(duplicate_documents)
647+
# Duplicates should be deduplicated - only one ID should remain
648+
assert len(duplicate_result.ids) == 1, "Duplicate documents should be deduplicated"
649+
assert len(duplicate_result.ids[0]) == 64, "Deduplicated ID should be clean hash"
650+
# Verify it's the expected hash
651+
expected_hash = hashlib.sha256(
652+
f"Same content|{json.dumps({'source': 'test'}, sort_keys=True)}".encode()
653+
).hexdigest()
654+
assert duplicate_result.ids[0] == expected_hash, "Deduplicated ID should match expected hash"
655+
656+
657+
def test_hash_based_id_generation_with_doc_id_in_metadata(mock_vector_db):
658+
"""Test that documents with doc_id in metadata use the doc_id directly, not hash-based."""
659+
from crewai.rag.chromadb.utils import _prepare_documents_for_chromadb
660+
from crewai.rag.types import BaseRecord
661+
662+
documents_with_doc_id: list[BaseRecord] = [
663+
{"content": "First document", "metadata": {"doc_id": "custom-id-1", "source": "test1"}},
664+
{"content": "Second document", "metadata": {"doc_id": "custom-id-2"}},
665+
]
666+
667+
documents_without_doc_id: list[BaseRecord] = [
668+
{"content": "First document", "metadata": {"source": "test1"}},
669+
{"content": "Second document"},
670+
]
671+
672+
result_with_doc_id = _prepare_documents_for_chromadb(documents_with_doc_id)
673+
result_without_doc_id = _prepare_documents_for_chromadb(documents_without_doc_id)
674+
675+
assert result_with_doc_id.ids == ["custom-id-1", "custom-id-2"]
676+
677+
assert len(result_without_doc_id.ids) == 2
678+
# Unique documents get 64-character hashes
679+
for doc_id in result_without_doc_id.ids:
680+
assert len(doc_id) == 64, "ID should be 64 characters"
681+
assert all(c in "0123456789abcdef" for c in doc_id), "ID should be hex"

0 commit comments

Comments (0)