
Commit 1d674d4

feat: Configurable batch size (#1941)
## Summary by CodeRabbit

* **New Features**
  * Added a configurable chunks-per-batch setting to control per-batch processing size via CLI flag, API payload, and configuration; defaults are now driven by config with an automatic fallback.
* **Style / Documentation**
  * Updated contribution/style guidelines (formatting, line length, string-quote rule, pre-commit note).
* **Tests**
  * Updated CLI tests to verify propagation of the new chunks-per-batch parameter.

## DCO Affirmation

I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.
2 parents 2c29868 + b7d5bf5 commit 1d674d4

File tree

7 files changed: +35 −10 lines
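Before the per-file diffs, here is how the new knob is exercised end to end. A minimal sketch from Python — hedged: `cognee.add`/`cognee.cognify` accepting these arguments is inferred from the CLI tests in this commit rather than shown directly in the diff, and the document path is a placeholder:

```python
import asyncio

import cognee


async def main():
    # Ingest a document, then build the knowledge graph while
    # processing 50 chunks per task batch instead of relying on
    # the configured or hard-coded default.
    await cognee.add("docs/large_report.txt")  # placeholder path
    await cognee.cognify(chunks_per_batch=50)


asyncio.run(main())
```

The same override is available as `--chunks-per-batch 50` on the cognify CLI command and as `"chunks_per_batch": 50` in the API payload, as the diffs below show.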

CLAUDE.md

Lines changed: 6 additions & 4 deletions
@@ -427,10 +427,12 @@ git checkout -b feature/your-feature-name
 
 ## Code Style
 
-- Ruff for linting and formatting (configured in `pyproject.toml`)
-- Line length: 100 characters
-- Pre-commit hooks run ruff automatically
-- Type hints encouraged (mypy checks enabled)
+- **Formatter**: Ruff (configured in `pyproject.toml`)
+- **Line length**: 100 characters
+- **String quotes**: Use double quotes `"` not single quotes `'` (enforced by ruff-format)
+- **Pre-commit hooks**: Run ruff linting and formatting automatically
+- **Type hints**: Encouraged (mypy checks enabled)
+- **Important**: Always run `pre-commit run --all-files` before committing to catch formatting issues
 
 ## Testing Strategy
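The new string-quote bullet matches ruff-format's default `quote-style = "double"`; in practice it rewrites single-quoted literals, for example:

```python
package = 'cognee'  # before: rewritten by ruff-format
package = "cognee"  # after: double quotes, per the updated guideline
```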

cognee/api/v1/cognify/cognify.py

Lines changed: 11 additions & 6 deletions
@@ -252,7 +252,7 @@ async def get_default_tasks(  # TODO: Find out a better way to do this (Boris's
     chunk_size: int = None,
     config: Config = None,
     custom_prompt: Optional[str] = None,
-    chunks_per_batch: int = 100,
+    chunks_per_batch: int = None,
     **kwargs,
 ) -> list[Task]:
     if config is None:
@@ -272,12 +272,14 @@ async def get_default_tasks(  # TODO: Find out a better way to do this (Boris's
         "ontology_config": {"ontology_resolver": get_default_ontology_resolver()}
     }
 
-    if chunks_per_batch is None:
-        chunks_per_batch = 100
-
     cognify_config = get_cognify_config()
     embed_triplets = cognify_config.triplet_embedding
 
+    if chunks_per_batch is None:
+        chunks_per_batch = (
+            cognify_config.chunks_per_batch if cognify_config.chunks_per_batch is not None else 100
+        )
+
     default_tasks = [
         Task(classify_documents),
         Task(
@@ -308,7 +310,7 @@ async def get_default_tasks(  # TODO: Find out a better way to do this (Boris's
 
 
 async def get_temporal_tasks(
-    user: User = None, chunker=TextChunker, chunk_size: int = None, chunks_per_batch: int = 10
+    user: User = None, chunker=TextChunker, chunk_size: int = None, chunks_per_batch: int = None
 ) -> list[Task]:
     """
     Builds and returns a list of temporal processing tasks to be executed in sequence.
@@ -330,7 +332,10 @@ async def get_temporal_tasks(
         list[Task]: A list of Task objects representing the temporal processing pipeline.
     """
     if chunks_per_batch is None:
-        chunks_per_batch = 10
+        from cognee.modules.cognify.config import get_cognify_config
+
+        configured = get_cognify_config().chunks_per_batch
+        chunks_per_batch = configured if configured is not None else 10
 
     temporal_tasks = [
         Task(classify_documents),
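Both builders now resolve the batch size with the same precedence: an explicit `chunks_per_batch` argument wins, then `CognifyConfig.chunks_per_batch`, then a hard-coded fallback (100 for the default pipeline, 10 for the temporal one). A standalone sketch of that chain — the helper name is hypothetical, not part of the commit:

```python
from typing import Optional


def resolve_chunks_per_batch(
    explicit: Optional[int], configured: Optional[int], fallback: int
) -> int:
    """Mirrors the precedence added in get_default_tasks / get_temporal_tasks."""
    if explicit is not None:
        return explicit  # caller-supplied argument wins
    if configured is not None:
        return configured  # CognifyConfig.chunks_per_batch (e.g. loaded from .env)
    return fallback  # 100 for default tasks, 10 for temporal tasks


assert resolve_chunks_per_batch(None, None, 100) == 100  # no overrides
assert resolve_chunks_per_batch(None, 20, 100) == 20  # config beats fallback
assert resolve_chunks_per_batch(50, 20, 100) == 50  # explicit beats config
```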

cognee/api/v1/cognify/routers/get_cognify_router.py

Lines changed: 6 additions & 0 deletions
@@ -46,6 +46,11 @@ class CognifyPayloadDTO(InDTO):
         examples=[[]],
         description="Reference to one or more previously uploaded ontologies",
     )
+    chunks_per_batch: Optional[int] = Field(
+        default=None,
+        description="Number of chunks to process per task batch in Cognify (overrides default).",
+        examples=[10, 20, 50, 100],
+    )
 
 
 def get_cognify_router() -> APIRouter:
@@ -146,6 +151,7 @@ async def cognify(payload: CognifyPayloadDTO, user: User = Depends(get_authentic
             config=config_to_use,
             run_in_background=payload.run_in_background,
             custom_prompt=payload.custom_prompt,
+            chunks_per_batch=payload.chunks_per_batch,
         )
 
         # If any cognify run errored return JSONResponse with proper error status code
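With the DTO field in place, API clients can send the override in the cognify payload. A hedged example using `requests` — the base URL, route path, payload `datasets` field, and bearer-token auth are illustrative assumptions, not shown in this diff:

```python
import requests

response = requests.post(
    "http://localhost:8000/api/v1/cognify",  # assumed mount point for the router
    json={
        "datasets": ["my_dataset"],  # assumed pre-existing payload field
        "chunks_per_batch": 50,  # the new optional field added here
    },
    headers={"Authorization": "Bearer <token>"},  # assumed auth scheme
)
print(response.status_code, response.json())
```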

cognee/cli/commands/cognify_command.py

Lines changed: 6 additions & 0 deletions
@@ -62,6 +62,11 @@ def configure_parser(self, parser: argparse.ArgumentParser) -> None:
         parser.add_argument(
             "--verbose", "-v", action="store_true", help="Show detailed progress information"
         )
+        parser.add_argument(
+            "--chunks-per-batch",
+            type=int,
+            help="Number of chunks to process per task batch (try 50 for large single documents).",
+        )
 
     def execute(self, args: argparse.Namespace) -> None:
         try:
@@ -111,6 +116,7 @@ async def run_cognify():
                 chunk_size=args.chunk_size,
                 ontology_file_path=args.ontology_file,
                 run_in_background=args.background,
+                chunks_per_batch=getattr(args, "chunks_per_batch", None),
             )
             return result
         except Exception as e:
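One note on `getattr(args, "chunks_per_batch", None)`: argparse maps `--chunks-per-batch` to the attribute `chunks_per_batch` and already defaults it to `None` when the flag is omitted, so the `getattr` fallback is defensive rather than strictly necessary. A quick self-contained check:

```python
import argparse

parser = argparse.ArgumentParser(prog="cognee cognify")
parser.add_argument("--chunks-per-batch", type=int)

# Flag provided: parsed as an int under the dest "chunks_per_batch".
args = parser.parse_args(["--chunks-per-batch", "50"])
print(args.chunks_per_batch)  # 50

# Flag omitted: the attribute still exists and defaults to None,
# which is exactly what the command forwards to cognify.
args = parser.parse_args([])
print(args.chunks_per_batch)  # None
```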

cognee/modules/cognify/config.py

Lines changed: 2 additions & 0 deletions
@@ -9,13 +9,15 @@ class CognifyConfig(BaseSettings):
     classification_model: object = DefaultContentPrediction
     summarization_model: object = SummarizedContent
     triplet_embedding: bool = False
+    chunks_per_batch: Optional[int] = None
     model_config = SettingsConfigDict(env_file=".env", extra="allow")
 
     def to_dict(self) -> dict:
         return {
             "classification_model": self.classification_model,
             "summarization_model": self.summarization_model,
             "triplet_embedding": self.triplet_embedding,
+            "chunks_per_batch": self.chunks_per_batch,
         }
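Because `CognifyConfig` is a pydantic `BaseSettings` with `env_file=".env"`, the new field should also be settable from the environment. A sketch assuming pydantic-settings' default case-insensitive, unprefixed field-to-variable mapping (the diff does not show an `env_prefix` either way):

```python
import os

from cognee.modules.cognify.config import CognifyConfig

# CHUNKS_PER_BATCH=25 in the process environment (or in .env)
# should populate the new optional field.
os.environ["CHUNKS_PER_BATCH"] = "25"

config = CognifyConfig()
print(config.chunks_per_batch)  # 25
print(config.to_dict()["chunks_per_batch"])  # 25
```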

cognee/tests/cli_tests/cli_unit_tests/test_cli_commands.py

Lines changed: 1 addition & 0 deletions
@@ -238,6 +238,7 @@ def test_execute_basic_cognify(self, mock_asyncio_run):
                 ontology_file_path=None,
                 chunker=TextChunker,
                 run_in_background=False,
+                chunks_per_batch=None,
             )
 
     @patch("cognee.cli.commands.cognify_command.asyncio.run")

cognee/tests/cli_tests/cli_unit_tests/test_cli_edge_cases.py

Lines changed: 3 additions & 0 deletions
@@ -262,6 +262,7 @@ def test_cognify_invalid_chunk_size(self, mock_asyncio_run):
                 ontology_file_path=None,
                 chunker=TextChunker,
                 run_in_background=False,
+                chunks_per_batch=None,
             )
 
     @patch("cognee.cli.commands.cognify_command.asyncio.run", side_effect=_mock_run)
@@ -295,6 +296,7 @@ def test_cognify_nonexistent_ontology_file(self, mock_asyncio_run):
                 ontology_file_path="/nonexistent/path/ontology.owl",
                 chunker=TextChunker,
                 run_in_background=False,
+                chunks_per_batch=None,
             )
 
     @patch("cognee.cli.commands.cognify_command.asyncio.run")
@@ -373,6 +375,7 @@ def test_cognify_empty_datasets_list(self, mock_asyncio_run):
                 ontology_file_path=None,
                 chunker=TextChunker,
                 run_in_background=False,
+                chunks_per_batch=None,
             )
