feat: Configurable batch size #1941
Changes from all commits: d1b13b1, f2166be, e38c33c, 6ec44f6, 7d3450c, 98394fc, b7d5bf5
Changes to the first file (the default and temporal task builders):

```diff
@@ -252,7 +252,7 @@ async def get_default_tasks(  # TODO: Find out a better way to do this (Boris's
     chunk_size: int = None,
     config: Config = None,
     custom_prompt: Optional[str] = None,
-    chunks_per_batch: int = 100,
+    chunks_per_batch: int = None,
     **kwargs,
 ) -> list[Task]:
     if config is None:
```
```diff
@@ -272,12 +272,14 @@ async def get_default_tasks(  # TODO: Find out a better way to do this (Boris's
         "ontology_config": {"ontology_resolver": get_default_ontology_resolver()}
     }
 
-    if chunks_per_batch is None:
-        chunks_per_batch = 100
-
     cognify_config = get_cognify_config()
     embed_triplets = cognify_config.triplet_embedding
 
+    if chunks_per_batch is None:
+        chunks_per_batch = (
+            cognify_config.chunks_per_batch if cognify_config.chunks_per_batch is not None else 100
+        )
+
     default_tasks = [
         Task(classify_documents),
         Task(
```
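For readers skimming the diff: the new logic gives an explicitly passed `chunks_per_batch` argument priority, then falls back to the configured `CognifyConfig.chunks_per_batch`, then to the hard-coded default of 100. A minimal standalone sketch of that precedence (the `resolve_chunks_per_batch` helper is illustrative, not part of this PR):

```python
from typing import Optional


def resolve_chunks_per_batch(
    explicit: Optional[int],
    configured: Optional[int],
    default: int = 100,
) -> int:
    """Resolve batch size: explicit argument > configured value > default."""
    if explicit is not None:
        return explicit
    if configured is not None:
        return configured
    return default


# No explicit argument, CHUNKS_PER_BATCH=50 configured: the config wins
assert resolve_chunks_per_batch(None, 50) == 50
# An explicit argument overrides the configuration
assert resolve_chunks_per_batch(25, 50) == 25
# Nothing set anywhere falls back to the default
assert resolve_chunks_per_batch(None, None) == 100
```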
```diff
@@ -308,7 +310,7 @@ async def get_default_tasks(  # TODO: Find out a better way to do this (Boris's
 
 
 async def get_temporal_tasks(
-    user: User = None, chunker=TextChunker, chunk_size: int = None, chunks_per_batch: int = 10
+    user: User = None, chunker=TextChunker, chunk_size: int = None, chunks_per_batch: int = None
 ) -> list[Task]:
     """
     Builds and returns a list of temporal processing tasks to be executed in sequence.
```

Contributor (review comment): Fix type annotation for optional parameter. The parameter `chunks_per_batch: int = None` should use `Optional[int]` for proper type checking, consistent with the `Optional` import at line 3 and the coding guidelines requiring type annotations for public APIs.

Proposed fix:

```diff
-    user: User = None, chunker=TextChunker, chunk_size: int = None, chunks_per_batch: int = None
+    user: User = None, chunker=TextChunker, chunk_size: int = None, chunks_per_batch: Optional[int] = None
```
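To illustrate why the reviewer's annotation fix matters: mypy rejects implicit `Optional` by default (the `no_implicit_optional` flag has defaulted to on since mypy 0.980), so `int = None` fails type checking while `Optional[int] = None` passes. A small sketch with illustrative function names:

```python
from typing import Optional


def bad(chunks_per_batch: int = None) -> int:
    # mypy error: default "None" is incompatible with declared type "int"
    return chunks_per_batch if chunks_per_batch is not None else 10


def good(chunks_per_batch: Optional[int] = None) -> int:
    # Type-checks cleanly: the annotation admits None explicitly
    return chunks_per_batch if chunks_per_batch is not None else 10
```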
```diff
@@ -330,7 +332,10 @@ async def get_temporal_tasks(
         list[Task]: A list of Task objects representing the temporal processing pipeline.
     """
     if chunks_per_batch is None:
-        chunks_per_batch = 10
+        from cognee.modules.cognify.config import get_cognify_config
+
+        configured = get_cognify_config().chunks_per_batch
+        chunks_per_batch = configured if configured is not None else 10
 
     temporal_tasks = [
         Task(classify_documents),
```

Contributor (review comment on lines 334–338) — Refactor suggestion, Major: Remove redundant import. The lazy import of `get_cognify_config` is redundant here: `get_default_tasks` earlier in the same module already calls it without a local import, so it is available at module level.

Proposed fix:

```diff
     if chunks_per_batch is None:
-        from cognee.modules.cognify.config import get_cognify_config
-
         configured = get_cognify_config().chunks_per_batch
         chunks_per_batch = configured if configured is not None else 10
```
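As a side note on the pattern the reviewer flags: function-level imports are generally reserved for breaking circular imports or deferring expensive modules; when neither applies, a module-level import is the idiomatic form. A hedged sketch of the cleaned-up shape (the `configured_batch_size` helper is hypothetical):

```python
# Module-level import, assuming no circular-import issue forces a lazy import.
from typing import Optional

from cognee.modules.cognify.config import get_cognify_config


def configured_batch_size(default: int = 10) -> int:
    # Read the configured value once; fall back to the caller's default.
    configured: Optional[int] = get_cognify_config().chunks_per_batch
    return configured if configured is not None else default
```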
Changes to the second file (the `CognifyConfig` settings class — `cognee/modules/cognify/config.py`, judging by the import path above):

```diff
@@ -9,13 +9,15 @@ class CognifyConfig(BaseSettings):
     classification_model: object = DefaultContentPrediction
     summarization_model: object = SummarizedContent
     triplet_embedding: bool = False
+    chunks_per_batch: Optional[int] = None
 
     model_config = SettingsConfigDict(env_file=".env", extra="allow")
 
     def to_dict(self) -> dict:
         return {
             "classification_model": self.classification_model,
             "summarization_model": self.summarization_model,
             "triplet_embedding": self.triplet_embedding,
+            "chunks_per_batch": self.chunks_per_batch,
         }
```

Contributor (review comment) — Refactor suggestion, Major: Add field documentation. The new `chunks_per_batch` field should be documented so its purpose, configuration, and defaults are clear to users.

Proposed documentation addition:

```diff
+    """
+    chunks_per_batch: Number of chunks to process per task batch in Cognify.
+    Can be configured via CHUNKS_PER_BATCH environment variable.
+    Higher values (e.g., 50) can improve processing speed for large documents,
+    but may cause max_token errors if set too high. Defaults to 100 for default tasks
+    and 10 for temporal tasks when not specified.
+    """
     chunks_per_batch: Optional[int] = None
```

Alternatively, use pydantic's `Field` with a description:

```diff
-    chunks_per_batch: Optional[int] = None
+    chunks_per_batch: Optional[int] = Field(
+        default=None,
+        description="Number of chunks to process per task batch (configurable via CHUNKS_PER_BATCH env var)",
+    )
```
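Because `CognifyConfig` extends pydantic's `BaseSettings` with `env_file=".env"`, the new field can be set via a `CHUNKS_PER_BATCH` environment variable (pydantic matches field names case-insensitively) without any code change. A self-contained sketch of the same pattern, using a stand-in `BatchConfig` class rather than the real `CognifyConfig`:

```python
import os
from typing import Optional

from pydantic_settings import BaseSettings, SettingsConfigDict


class BatchConfig(BaseSettings):
    # Stand-in for CognifyConfig.chunks_per_batch; None means "use the caller's default"
    chunks_per_batch: Optional[int] = None

    model_config = SettingsConfigDict(env_file=".env", extra="allow")


os.environ["CHUNKS_PER_BATCH"] = "50"
print(BatchConfig().chunks_per_batch)  # 50 — parsed from the environment

del os.environ["CHUNKS_PER_BATCH"]
print(BatchConfig().chunks_per_batch)  # None — the task builders fall back to 100 or 10
```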