fix: add batch_size support to prevent embedder token limit errors

greysonlalonde · vinibrsl · web-flow · commit 1dbe8aab52b9 · 2025-09-24T00:05:43.000-04:00
- add batch_size field to baseragconfig (default=100)
- update chromadb/qdrant clients and factories to use batch_size
- extract and filter batch_size from embedder config in knowledgestorage
- fix large csv files exceeding embedder token limits (#3574)
- remove unneeded conditional for type

Co-authored-by: Vini Brasil <vini@hey.com>
diff --git a/src/crewai/knowledge/storage/knowledge_storage.py b/src/crewai/knowledge/storage/knowledge_storage.py
@@ -8,7 +8,7 @@
 from crewai.rag.chromadb.types import ChromaEmbeddingFunctionWrapper
 from crewai.rag.config.utils import get_rag_client
 from crewai.rag.core.base_client import BaseClient
-from crewai.rag.embeddings.factory import get_embedding_function
+from crewai.rag.embeddings.factory import EmbedderConfig, get_embedding_function
 from crewai.rag.factory import create_client
 from crewai.rag.types import BaseRecord, SearchResult
 from crewai.utilities.logger import Logger
@@ -27,6 +27,7 @@ def __init__(
     ) -> None:
         self.collection_name = collection_name
         self._client: BaseClient | None = None
+        self._embedder_config = embedder  # Store embedder config
 
         warnings.filterwarnings(
             "ignore",
@@ -35,12 +36,29 @@ def __init__(
         )
 
         if embedder:
-            embedding_function = get_embedding_function(embedder)
-            config = ChromaDBConfig(
-                embedding_function=cast(
-                    ChromaEmbeddingFunctionWrapper, embedding_function
+            # Cast to EmbedderConfig for type checking
+            embedder_typed = cast(EmbedderConfig, embedder)
+            embedding_function = get_embedding_function(embedder_typed)
+            batch_size = None
+            if isinstance(embedder, dict) and "config" in embedder:
+                nested_config = embedder["config"]
+                if isinstance(nested_config, dict):
+                    batch_size = nested_config.get("batch_size")
+
+            # Create config with batch_size if provided
+            if batch_size is not None:
+                config = ChromaDBConfig(
+                    embedding_function=cast(
+                        ChromaEmbeddingFunctionWrapper, embedding_function
+                    ),
+                    batch_size=batch_size,
+                )
+            else:
+                config = ChromaDBConfig(
+                    embedding_function=cast(
+                        ChromaEmbeddingFunctionWrapper, embedding_function
+                    )
                 )
-            )
             self._client = create_client(config)
 
     def _get_client(self) -> BaseClient:
@@ -105,9 +123,23 @@ def save(self, documents: list[str]) -> None:
 
             rag_documents: list[BaseRecord] = [{"content": doc} for doc in documents]
 
-            client.add_documents(
-                collection_name=collection_name, documents=rag_documents
-            )
+            batch_size = None
+            if self._embedder_config and isinstance(self._embedder_config, dict):
+                if "config" in self._embedder_config:
+                    nested_config = self._embedder_config["config"]
+                    if isinstance(nested_config, dict):
+                        batch_size = nested_config.get("batch_size")
+
+            if batch_size is not None:
+                client.add_documents(
+                    collection_name=collection_name,
+                    documents=rag_documents,
+                    batch_size=batch_size,
+                )
+            else:
+                client.add_documents(
+                    collection_name=collection_name, documents=rag_documents
+                )
         except Exception as e:
             if "dimension mismatch" in str(e).lower():
                 Logger(verbose=True).log(
diff --git a/src/crewai/memory/storage/rag_storage.py b/src/crewai/memory/storage/rag_storage.py
@@ -66,11 +66,28 @@ def __init__(
                     f"Error: {e}"
                 ) from e
 
-            config = ChromaDBConfig(
-                embedding_function=cast(
-                    ChromaEmbeddingFunctionWrapper, embedding_function
+            batch_size = None
+            if (
+                isinstance(self.embedder_config, dict)
+                and "config" in self.embedder_config
+            ):
+                nested_config = self.embedder_config["config"]
+                if isinstance(nested_config, dict):
+                    batch_size = nested_config.get("batch_size")
+
+            if batch_size is not None:
+                config = ChromaDBConfig(
+                    embedding_function=cast(
+                        ChromaEmbeddingFunctionWrapper, embedding_function
+                    ),
+                    batch_size=batch_size,
+                )
+            else:
+                config = ChromaDBConfig(
+                    embedding_function=cast(
+                        ChromaEmbeddingFunctionWrapper, embedding_function
+                    )
                 )
-            )
             self._client = create_client(config)
 
     def _get_client(self) -> BaseClient:
@@ -111,7 +128,26 @@ def save(self, value: Any, metadata: dict[str, Any]) -> None:
             if metadata:
                 document["metadata"] = metadata
 
-            client.add_documents(collection_name=collection_name, documents=[document])
+            batch_size = None
+            if (
+                self.embedder_config
+                and isinstance(self.embedder_config, dict)
+                and "config" in self.embedder_config
+            ):
+                nested_config = self.embedder_config["config"]
+                if isinstance(nested_config, dict):
+                    batch_size = nested_config.get("batch_size")
+
+            if batch_size is not None:
+                client.add_documents(
+                    collection_name=collection_name,
+                    documents=[document],
+                    batch_size=batch_size,
+                )
+            else:
+                client.add_documents(
+                    collection_name=collection_name, documents=[document]
+                )
         except Exception as e:
             logging.error(
                 f"Error during {self.type} save: {e!s}\n{traceback.format_exc()}"
diff --git a/src/crewai/rag/chromadb/client.py b/src/crewai/rag/chromadb/client.py
@@ -17,6 +17,7 @@
     ChromaDBCollectionSearchParams,
 )
 from crewai.rag.chromadb.utils import (
+    _create_batch_slice,
     _extract_search_params,
     _is_async_client,
     _is_sync_client,
@@ -52,6 +53,7 @@ def __init__(
         embedding_function: ChromaEmbeddingFunction,
         default_limit: int = 5,
         default_score_threshold: float = 0.6,
+        default_batch_size: int = 100,
     ) -> None:
         """Initialize ChromaDBClient with client and embedding function.
 
@@ -60,11 +62,13 @@ def __init__(
             embedding_function: Embedding function for text to vector conversion.
             default_limit: Default number of results to return in searches.
             default_score_threshold: Default minimum score for search results.
+            default_batch_size: Default batch size for adding documents.
         """
         self.client = client
         self.embedding_function = embedding_function
         self.default_limit = default_limit
         self.default_score_threshold = default_score_threshold
+        self.default_batch_size = default_batch_size
 
     def create_collection(
         self, **kwargs: Unpack[ChromaDBCollectionCreateParams]
@@ -291,6 +295,7 @@ def add_documents(self, **kwargs: Unpack[BaseCollectionAddParams]) -> None:
                 - content: The text content (required)
                 - doc_id: Optional unique identifier (auto-generated if missing)
                 - metadata: Optional metadata dictionary
+            batch_size: Optional batch size for processing documents (default: 100)
 
         Raises:
             TypeError: If AsyncClientAPI is used instead of ClientAPI for sync operations.
@@ -305,6 +310,7 @@ def add_documents(self, **kwargs: Unpack[BaseCollectionAddParams]) -> None:
 
         collection_name = kwargs["collection_name"]
         documents = kwargs["documents"]
+        batch_size = kwargs.get("batch_size", self.default_batch_size)
 
         if not documents:
             raise ValueError("Documents list cannot be empty")
@@ -315,13 +321,17 @@ def add_documents(self, **kwargs: Unpack[BaseCollectionAddParams]) -> None:
         )
 
         prepared = _prepare_documents_for_chromadb(documents)
-        # ChromaDB doesn't accept empty metadata dicts, so pass None if all are empty
-        metadatas = prepared.metadatas if any(m for m in prepared.metadatas) else None
-        collection.upsert(
-            ids=prepared.ids,
-            documents=prepared.texts,
-            metadatas=metadatas,
-        )
+
+        for i in range(0, len(prepared.ids), batch_size):
+            batch_ids, batch_texts, batch_metadatas = _create_batch_slice(
+                prepared=prepared, start_index=i, batch_size=batch_size
+            )
+
+            collection.upsert(
+                ids=batch_ids,
+                documents=batch_texts,
+                metadatas=batch_metadatas,
+            )
 
     async def aadd_documents(self, **kwargs: Unpack[BaseCollectionAddParams]) -> None:
         """Add documents with their embeddings to a collection asynchronously.
@@ -335,6 +345,7 @@ async def aadd_documents(self, **kwargs: Unpack[BaseCollectionAddParams]) -> Non
                 - content: The text content (required)
                 - doc_id: Optional unique identifier (auto-generated if missing)
                 - metadata: Optional metadata dictionary
+            batch_size: Optional batch size for processing documents (default: 100)
 
         Raises:
             TypeError: If ClientAPI is used instead of AsyncClientAPI for async operations.
@@ -349,6 +360,7 @@ async def aadd_documents(self, **kwargs: Unpack[BaseCollectionAddParams]) -> Non
 
         collection_name = kwargs["collection_name"]
         documents = kwargs["documents"]
+        batch_size = kwargs.get("batch_size", self.default_batch_size)
 
         if not documents:
             raise ValueError("Documents list cannot be empty")
@@ -358,13 +370,17 @@ async def aadd_documents(self, **kwargs: Unpack[BaseCollectionAddParams]) -> Non
             embedding_function=self.embedding_function,
         )
         prepared = _prepare_documents_for_chromadb(documents)
-        # ChromaDB doesn't accept empty metadata dicts, so pass None if all are empty
-        metadatas = prepared.metadatas if any(m for m in prepared.metadatas) else None
-        await collection.upsert(
-            ids=prepared.ids,
-            documents=prepared.texts,
-            metadatas=metadatas,
-        )
+
+        for i in range(0, len(prepared.ids), batch_size):
+            batch_ids, batch_texts, batch_metadatas = _create_batch_slice(
+                prepared=prepared, start_index=i, batch_size=batch_size
+            )
+
+            await collection.upsert(
+                ids=batch_ids,
+                documents=batch_texts,
+                metadatas=batch_metadatas,
+            )
 
     def search(
         self, **kwargs: Unpack[ChromaDBCollectionSearchParams]
diff --git a/src/crewai/rag/chromadb/factory.py b/src/crewai/rag/chromadb/factory.py
@@ -41,4 +41,5 @@ def create_client(config: ChromaDBConfig) -> ChromaDBClient:
         embedding_function=config.embedding_function,
         default_limit=config.limit,
         default_score_threshold=config.score_threshold,
+        default_batch_size=config.batch_size,
     )
diff --git a/src/crewai/rag/chromadb/utils.py b/src/crewai/rag/chromadb/utils.py
@@ -1,6 +1,7 @@
 """Utility functions for ChromaDB client implementation."""
 
 import hashlib
+import json
 from collections.abc import Mapping
 from typing import Literal, TypeGuard, cast
 
@@ -72,7 +73,15 @@ def _prepare_documents_for_chromadb(
         if "doc_id" in doc:
             ids.append(doc["doc_id"])
         else:
-            content_hash = hashlib.sha256(doc["content"].encode()).hexdigest()[:16]
+            content_for_hash = doc["content"]
+            metadata = doc.get("metadata")
+            if metadata:
+                metadata_str = json.dumps(metadata, sort_keys=True)
+                content_for_hash = f"{content_for_hash}|{metadata_str}"
+
+            content_hash = hashlib.blake2b(
+                content_for_hash.encode(), digest_size=32
+            ).hexdigest()
             ids.append(content_hash)
 
         texts.append(doc["content"])
@@ -88,6 +97,32 @@ def _prepare_documents_for_chromadb(
     return PreparedDocuments(ids, texts, metadatas)
 
 
+def _create_batch_slice(
+    prepared: PreparedDocuments, start_index: int, batch_size: int
+) -> tuple[list[str], list[str], list[Mapping[str, str | int | float | bool]] | None]:
+    """Create a batch slice from prepared documents.
+
+    Args:
+        prepared: PreparedDocuments containing ids, texts, and metadatas.
+        start_index: Starting index for the batch.
+        batch_size: Size of the batch.
+
+    Returns:
+        Tuple of (batch_ids, batch_texts, batch_metadatas).
+    """
+    batch_end = min(start_index + batch_size, len(prepared.ids))
+    batch_ids = prepared.ids[start_index:batch_end]
+    batch_texts = prepared.texts[start_index:batch_end]
+    batch_metadatas = (
+        prepared.metadatas[start_index:batch_end] if prepared.metadatas else None
+    )
+
+    if batch_metadatas and not any(m for m in batch_metadatas):
+        batch_metadatas = None
+
+    return batch_ids, batch_texts, batch_metadatas
+
+
 def _extract_search_params(
     kwargs: ChromaDBCollectionSearchParams,
 ) -> ExtractedSearchParams:
diff --git a/src/crewai/rag/config/base.py b/src/crewai/rag/config/base.py
@@ -16,3 +16,4 @@ class BaseRagConfig:
     embedding_function: Any | None = field(default=None)
     limit: int = field(default=5)
     score_threshold: float = field(default=0.6)
+    batch_size: int = field(default=100)
diff --git a/src/crewai/rag/core/base_client.py b/src/crewai/rag/core/base_client.py
@@ -29,17 +29,19 @@ class BaseCollectionParams(TypedDict):
     ]
 
 
-class BaseCollectionAddParams(BaseCollectionParams):
+class BaseCollectionAddParams(BaseCollectionParams, total=False):
     """Parameters for adding documents to a collection.
 
     Extends BaseCollectionParams with document-specific fields.
 
     Attributes:
         collection_name: The name of the collection to add documents to.
         documents: List of BaseRecord dictionaries containing document data.
+        batch_size: Optional batch size for processing documents to avoid token limits.
     """
 
-    documents: list[BaseRecord]
+    documents: Required[list[BaseRecord]]
+    batch_size: int
 
 
 class BaseCollectionSearchParams(BaseCollectionParams, total=False):
diff --git a/src/crewai/rag/embeddings/factory.py b/src/crewai/rag/embeddings/factory.py
@@ -244,4 +244,6 @@ def get_embedding_function(
 
     _inject_api_key_from_env(provider, config_dict)
 
+    config_dict.pop("batch_size", None)
+
     return EMBEDDING_PROVIDERS[provider](**config_dict)
diff --git a/src/crewai/rag/qdrant/client.py b/src/crewai/rag/qdrant/client.py
diff --git a/src/crewai/rag/qdrant/factory.py b/src/crewai/rag/qdrant/factory.py
diff --git a/tests/rag/chromadb/test_client.py b/tests/rag/chromadb/test_client.py
diff --git a/tests/rag/chromadb/test_utils.py b/tests/rag/chromadb/test_utils.py

Original file line number	Diff line number	Diff line change
`@@ -41,4 +41,5 @@ def create_client(config: ChromaDBConfig) -> ChromaDBClient:`
`41`	`41`	`embedding_function=config.embedding_function,`
`42`	`42`	`default_limit=config.limit,`
`43`	`43`	`default_score_threshold=config.score_threshold,`
	`44`	`+ default_batch_size=config.batch_size,`
`44`	`45`	`)`