1717 ChromaDBCollectionSearchParams ,
1818)
1919from crewai .rag .chromadb .utils import (
20+ _create_batch_slice ,
2021 _extract_search_params ,
2122 _is_async_client ,
2223 _is_sync_client ,
@@ -52,6 +53,7 @@ def __init__(
5253 embedding_function : ChromaEmbeddingFunction ,
5354 default_limit : int = 5 ,
5455 default_score_threshold : float = 0.6 ,
56+ default_batch_size : int = 100 ,
5557 ) -> None :
5658 """Initialize ChromaDBClient with client and embedding function.
5759
@@ -60,11 +62,13 @@ def __init__(
6062 embedding_function: Embedding function for text to vector conversion.
6163 default_limit: Default number of results to return in searches.
6264 default_score_threshold: Default minimum score for search results.
65+ default_batch_size: Default batch size for adding documents.
6366 """
6467 self .client = client
6568 self .embedding_function = embedding_function
6669 self .default_limit = default_limit
6770 self .default_score_threshold = default_score_threshold
71+ self .default_batch_size = default_batch_size
6872
6973 def create_collection (
7074 self , ** kwargs : Unpack [ChromaDBCollectionCreateParams ]
@@ -291,6 +295,7 @@ def add_documents(self, **kwargs: Unpack[BaseCollectionAddParams]) -> None:
291295 - content: The text content (required)
292296 - doc_id: Optional unique identifier (auto-generated if missing)
293297 - metadata: Optional metadata dictionary
298+ batch_size: Optional batch size for processing documents (defaults to self.default_batch_size, which is 100 unless overridden at construction)
294299
295300 Raises:
296301 TypeError: If AsyncClientAPI is used instead of ClientAPI for sync operations.
@@ -305,6 +310,7 @@ def add_documents(self, **kwargs: Unpack[BaseCollectionAddParams]) -> None:
305310
306311 collection_name = kwargs ["collection_name" ]
307312 documents = kwargs ["documents" ]
313+ batch_size = kwargs .get ("batch_size" , self .default_batch_size )
308314
309315 if not documents :
310316 raise ValueError ("Documents list cannot be empty" )
@@ -315,13 +321,17 @@ def add_documents(self, **kwargs: Unpack[BaseCollectionAddParams]) -> None:
315321 )
316322
317323 prepared = _prepare_documents_for_chromadb (documents )
318- # ChromaDB doesn't accept empty metadata dicts, so pass None if all are empty
319- metadatas = prepared .metadatas if any (m for m in prepared .metadatas ) else None
320- collection .upsert (
321- ids = prepared .ids ,
322- documents = prepared .texts ,
323- metadatas = metadatas ,
324- )
324+
325+ for i in range (0 , len (prepared .ids ), batch_size ):
326+ batch_ids , batch_texts , batch_metadatas = _create_batch_slice (
327+ prepared = prepared , start_index = i , batch_size = batch_size
328+ )
329+
330+ collection .upsert (
331+ ids = batch_ids ,
332+ documents = batch_texts ,
333+ metadatas = batch_metadatas ,
334+ )
325335
326336 async def aadd_documents (self , ** kwargs : Unpack [BaseCollectionAddParams ]) -> None :
327337 """Add documents with their embeddings to a collection asynchronously.
@@ -335,6 +345,7 @@ async def aadd_documents(self, **kwargs: Unpack[BaseCollectionAddParams]) -> Non
335345 - content: The text content (required)
336346 - doc_id: Optional unique identifier (auto-generated if missing)
337347 - metadata: Optional metadata dictionary
348+ batch_size: Optional batch size for processing documents (defaults to self.default_batch_size, which is 100 unless overridden at construction)
338349
339350 Raises:
340351 TypeError: If ClientAPI is used instead of AsyncClientAPI for async operations.
@@ -349,6 +360,7 @@ async def aadd_documents(self, **kwargs: Unpack[BaseCollectionAddParams]) -> Non
349360
350361 collection_name = kwargs ["collection_name" ]
351362 documents = kwargs ["documents" ]
363+ batch_size = kwargs .get ("batch_size" , self .default_batch_size )
352364
353365 if not documents :
354366 raise ValueError ("Documents list cannot be empty" )
@@ -358,13 +370,17 @@ async def aadd_documents(self, **kwargs: Unpack[BaseCollectionAddParams]) -> Non
358370 embedding_function = self .embedding_function ,
359371 )
360372 prepared = _prepare_documents_for_chromadb (documents )
361- # ChromaDB doesn't accept empty metadata dicts, so pass None if all are empty
362- metadatas = prepared .metadatas if any (m for m in prepared .metadatas ) else None
363- await collection .upsert (
364- ids = prepared .ids ,
365- documents = prepared .texts ,
366- metadatas = metadatas ,
367- )
373+
374+ for i in range (0 , len (prepared .ids ), batch_size ):
375+ batch_ids , batch_texts , batch_metadatas = _create_batch_slice (
376+ prepared = prepared , start_index = i , batch_size = batch_size
377+ )
378+
379+ await collection .upsert (
380+ ids = batch_ids ,
381+ documents = batch_texts ,
382+ metadatas = batch_metadatas ,
383+ )
368384
369385 def search (
370386 self , ** kwargs : Unpack [ChromaDBCollectionSearchParams ]
0 commit comments