@@ -601,3 +601,81 @@ def test_file_path_validation():
601601 match = "file_path/file_paths must be a Path, str, or a list of these types" ,
602602 ):
603603 PDFKnowledgeSource ()
604+
605+
def test_hash_based_id_generation_without_doc_id(mock_vector_db):
    """Test that documents without doc_id generate hash-based IDs.

    The expectations pinned here are:

    * each unique document gets a 64-character lowercase hex ID;
    * with metadata, the ID equals the SHA-256 of
      ``"<content>|<json.dumps(metadata, sort_keys=True)>"``;
    * without metadata, the ID equals the SHA-256 of the content alone;
    * identical documents collapse to a single ID before upsert.
    """
    import hashlib
    import json

    from crewai.rag.chromadb.utils import _prepare_documents_for_chromadb
    from crewai.rag.types import BaseRecord

    hex_digits = "0123456789abcdef"

    documents: list[BaseRecord] = [
        {
            "content": "First document content",
            "metadata": {"source": "test1", "category": "research"},
        },
        {
            "content": "Second document content",
            "metadata": {"source": "test2", "category": "research"},
        },
        {"content": "Third document content"},  # No metadata
    ]

    result = _prepare_documents_for_chromadb(documents)

    assert len(result.ids) == 3

    # Unique documents should get 64-character hex hashes (no suffix)
    for doc_id in result.ids:
        assert len(doc_id) == 64, f"ID should be 64 characters: {doc_id}"
        assert all(c in hex_digits for c in doc_id), f"ID should be hex: {doc_id}"

    # Different documents should have different hashes. A chained
    # `a != b != c` only compares neighbours and would miss a == c, so the
    # whole collection is checked for uniqueness instead.
    assert len(set(result.ids)) == len(result.ids), (
        f"IDs should be pairwise distinct: {result.ids}"
    )

    # Verify hashes match expected values. The metadata dict is written with
    # keys already in sorted order to mirror what sort_keys=True produces.
    first_payload = json.dumps(
        {"category": "research", "source": "test1"}, sort_keys=True
    )
    expected_hash_1 = hashlib.sha256(
        f"First document content|{first_payload}".encode()
    ).hexdigest()
    assert result.ids[0] == expected_hash_1, "First document hash should match expected"

    # Without metadata only the raw content is hashed (no "|" separator).
    expected_hash_3 = hashlib.sha256("Third document content".encode()).hexdigest()
    assert result.ids[2] == expected_hash_3, "Third document hash should match expected"

    # Test that duplicate documents are deduplicated (same ID, only one sent)
    duplicate_documents: list[BaseRecord] = [
        {"content": "Same content", "metadata": {"source": "test"}},
        {"content": "Same content", "metadata": {"source": "test"}},
        {"content": "Same content", "metadata": {"source": "test"}},
    ]
    duplicate_result = _prepare_documents_for_chromadb(duplicate_documents)

    # Duplicates should be deduplicated - only one ID should remain
    assert len(duplicate_result.ids) == 1, "Duplicate documents should be deduplicated"
    assert len(duplicate_result.ids[0]) == 64, "Deduplicated ID should be clean hash"

    # Verify it's the expected hash
    duplicate_payload = json.dumps({"source": "test"}, sort_keys=True)
    expected_hash = hashlib.sha256(
        f"Same content|{duplicate_payload}".encode()
    ).hexdigest()
    assert duplicate_result.ids[0] == expected_hash, (
        "Deduplicated ID should match expected hash"
    )
655+
656+
def test_hash_based_id_generation_with_doc_id_in_metadata(mock_vector_db):
    """Check that an explicit ``doc_id`` in metadata is used verbatim as the ID.

    Records that carry ``metadata["doc_id"]`` must come back with exactly
    those IDs, while otherwise comparable records lacking a ``doc_id`` must
    come back with 64-character lowercase hex IDs.
    """
    from crewai.rag.chromadb.utils import _prepare_documents_for_chromadb
    from crewai.rag.types import BaseRecord

    allowed_chars = set("0123456789abcdef")

    explicit_records: list[BaseRecord] = [
        {
            "content": "First document",
            "metadata": {"doc_id": "custom-id-1", "source": "test1"},
        },
        {"content": "Second document", "metadata": {"doc_id": "custom-id-2"}},
    ]
    implicit_records: list[BaseRecord] = [
        {"content": "First document", "metadata": {"source": "test1"}},
        {"content": "Second document"},
    ]

    explicit_ids = _prepare_documents_for_chromadb(explicit_records).ids
    implicit_ids = _prepare_documents_for_chromadb(implicit_records).ids

    # Caller-supplied identifiers pass through untouched, in input order.
    assert explicit_ids == ["custom-id-1", "custom-id-2"]

    assert len(implicit_ids) == 2
    # Records without a doc_id fall back to 64-character hex identifiers.
    for generated in implicit_ids:
        assert len(generated) == 64, "ID should be 64 characters"
        assert set(generated) <= allowed_chars, "ID should be hex"
0 commit comments