
Commit 7996528

Merge branch 'cocoindex-io:main' into main

2 parents: 541e693 + d18dd6e

File tree: 14 files changed, +409 −51 lines

docs/docs/custom_ops/custom_functions.mdx

Lines changed: 22 additions & 0 deletions
@@ -145,6 +145,8 @@ Custom functions take the following additional parameters:
 * `batching: bool`: Whether the executor will consume requests in batch.
   See the [Batching](#batching) section below for details.

+* `max_batch_size: int | None`: The maximum batch size for the executor.
+
 * `behavior_version: int`: The version of the behavior of the function.
   When the version is changed, the function will be re-executed even if cache is enabled.
   It's required to be set if `cache` is `True`.
@@ -221,5 +223,25 @@ class ComputeSomethingExecutor:
     ...
 ```

+### Controlling Batch Size
+
+You can control the maximum batch size using the `max_batch_size` parameter. This is useful for:
+* Limiting memory usage when processing large batches
+* Reducing latency by flushing batches before they grow too large
+* Working with APIs that have request size limits
+
+```python
+@cocoindex.op.function(batching=True, max_batch_size=32)
+def compute_something(args: list[str]) -> list[str]:
+    ...
+```
+
+With `max_batch_size` set, a batch will be flushed when either:
+
+1. No ongoing batches are running
+2. The pending batch size reaches `max_batch_size`
+
+This ensures that requests don't wait indefinitely for a batch to fill up, while still allowing efficient batch processing.
+
 </TabItem>
 </Tabs>
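The flush policy added to the docs above is compact enough to restate as a tiny standalone sketch. This is not CocoIndex's internal batcher; the class and method names below are hypothetical, used only to illustrate the two flush conditions.

```python
# Hypothetical sketch of the flush policy described in the docs above --
# not CocoIndex internals. Requests accumulate in `pending`; the batch is
# flushed when no batch is in flight, or when `pending` reaches the cap.
class SketchBatcher:
    def __init__(self, max_batch_size: int | None = None):
        self.max_batch_size = max_batch_size
        self.pending: list[str] = []
        self.in_flight = False  # True while a previous batch is still executing

    def submit(self, request: str) -> bool:
        """Queue a request; return True if the pending batch should flush now."""
        self.pending.append(request)
        if not self.in_flight:
            return True  # condition 1: no ongoing batches are running
        return (
            self.max_batch_size is not None
            and len(self.pending) >= self.max_batch_size
        )  # condition 2: pending batch size reached max_batch_size
```

Condition 1 keeps latency low while the executor is idle; condition 2 bounds memory use and request size once work is already queued.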

docs/docs/sources/amazons3.md

Lines changed: 3 additions & 0 deletions
@@ -131,6 +131,9 @@ The spec takes the following fields:

 :::

+* `max_file_size` (`int`, optional): if provided, files exceeding this size in bytes will be treated as non-existent and skipped during processing.
+  This is useful to avoid processing large files that are not relevant to your use case, such as videos or backups.
+  If not specified, no size limit is applied.
 * `sqs_queue_url` (`str`, optional): if provided, the source will receive change event notifications from Amazon S3 via this SQS queue.

 :::info
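For illustration, a hedged usage snippet of the new field: only `max_file_size` and the sibling spec fields shown in the `_engine_builtin_specs.py` diff further below are confirmed by this commit; `bucket_name` and the surrounding flow setup are assumptions.

```python
import cocoindex

# Skip anything over 10 MiB; larger files are treated as non-existent.
# `bucket_name` is an assumed field name -- the diff only shows the fields below it.
source = cocoindex.sources.AmazonS3(
    bucket_name="my-bucket",
    included_patterns=["*.md", "*.pdf"],
    max_file_size=10 * 1024 * 1024,  # bytes
)
```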

examples/code_embedding/main.py

Lines changed: 6 additions & 6 deletions
@@ -16,12 +16,12 @@ def code_to_embedding(
     Embed the text using a SentenceTransformer model.
     """
     # You can also switch to Voyage embedding model:
-    # return text.transform(
-    #     cocoindex.functions.EmbedText(
-    #         api_type=cocoindex.LlmApiType.VOYAGE,
-    #         model="voyage-code-3",
-    #     )
-    # )
+    # return text.transform(
+    #     cocoindex.functions.EmbedText(
+    #         api_type=cocoindex.LlmApiType.GEMINI,
+    #         model="text-embedding-004",
+    #     )
+    # )
     return text.transform(
         cocoindex.functions.SentenceTransformerEmbed(
             model="sentence-transformers/all-MiniLM-L6-v2"

python/cocoindex/functions/colpali.py

Lines changed: 2 additions & 0 deletions
@@ -125,6 +125,7 @@ class ColPaliEmbedImage(op.FunctionSpec):
     gpu=True,
     cache=True,
     batching=True,
+    max_batch_size=32,
     behavior_version=1,
 )
 class ColPaliEmbedImageExecutor:
@@ -204,6 +205,7 @@ class ColPaliEmbedQuery(op.FunctionSpec):
     cache=True,
     behavior_version=1,
     batching=True,
+    max_batch_size=32,
 )
 class ColPaliEmbedQueryExecutor:
     """Executor for ColVision query embedding (ColPali, ColQwen2, ColSmol, etc.)."""

python/cocoindex/functions/sbert.py

Lines changed: 1 addition & 0 deletions
@@ -31,6 +31,7 @@ class SentenceTransformerEmbed(op.FunctionSpec):
     gpu=True,
     cache=True,
     batching=True,
+    max_batch_size=512,
     behavior_version=1,
     arg_relationship=(op.ArgRelationship.EMBEDDING_ORIGIN_TEXT, "text"),
 )

python/cocoindex/op.py

Lines changed: 11 additions & 3 deletions
@@ -151,6 +151,7 @@ class OpArgs:
     - gpu: Whether the executor will be executed on GPU.
     - cache: Whether the executor will be cached.
     - batching: Whether the executor will be batched.
+    - max_batch_size: The maximum batch size for the executor. Only valid if `batching` is True.
     - behavior_version: The behavior version of the executor. Cache will be invalidated if it
       changes. Must be provided if `cache` is True.
     - arg_relationship: It specifies the relationship between an input argument and the output,
@@ -161,6 +162,7 @@
     gpu: bool = False
     cache: bool = False
     batching: bool = False
+    max_batch_size: int | None = None
     behavior_version: int | None = None
     arg_relationship: tuple[ArgRelationship, str] | None = None

@@ -389,11 +391,17 @@ def enable_cache(self) -> bool:
         def behavior_version(self) -> int | None:
             return op_args.behavior_version

+        def batching_options(self) -> dict[str, Any] | None:
+            if op_args.batching:
+                return {
+                    "max_batch_size": op_args.max_batch_size,
+                }
+            else:
+                return None
+
     if category == OpCategory.FUNCTION:
         _engine.register_function_factory(
-            op_kind,
-            _EngineFunctionExecutorFactory(spec_loader, _WrappedExecutor),
-            op_args.batching,
+            op_kind, _EngineFunctionExecutorFactory(spec_loader, _WrappedExecutor)
         )
     else:
         raise ValueError(f"Unsupported executor type {category}")
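The new `batching_options()` accessor is small enough to restate standalone. The sketch below mirrors the diff's logic with a trimmed `OpArgs` and a hypothetical free function; it is not the actual module.

```python
from dataclasses import dataclass
from typing import Any


@dataclass
class OpArgs:  # trimmed to the two fields relevant here
    batching: bool = False
    max_batch_size: int | None = None


def batching_options(op_args: OpArgs) -> dict[str, Any] | None:
    """Return the batching knobs handed to the engine, or None when batching is off."""
    if op_args.batching:
        return {"max_batch_size": op_args.max_batch_size}
    return None


assert batching_options(OpArgs(batching=True, max_batch_size=32)) == {"max_batch_size": 32}
assert batching_options(OpArgs()) is None  # non-batched executors get no options
```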

python/cocoindex/sources/_engine_builtin_specs.py

Lines changed: 1 addition & 0 deletions
@@ -56,6 +56,7 @@ class AmazonS3(op.SourceSpec):
     binary: bool = False
     included_patterns: list[str] | None = None
     excluded_patterns: list[str] | None = None
+    max_file_size: int | None = None
     sqs_queue_url: str | None = None
     redis: RedisNotification | None = None

src/execution/source_indexer.rs

Lines changed: 4 additions & 1 deletion
@@ -304,7 +304,10 @@ impl SourceIndexingContext {
                 rows_to_retry,
             }),
             setup_execution_ctx,
-            update_once_batcher: batching::Batcher::new(UpdateOnceRunner),
+            update_once_batcher: batching::Batcher::new(
+                UpdateOnceRunner,
+                batching::BatchingOptions::default(),
+            ),
         }))
     }

src/ops/factory_bases.rs

Lines changed: 4 additions & 1 deletion
@@ -381,6 +381,8 @@ pub trait BatchedFunctionExecutor: Send + Sync + Sized + 'static {
     fn into_fn_executor(self) -> impl SimpleFunctionExecutor {
         BatchedFunctionExecutorWrapper::new(self)
     }
+
+    fn batching_options(&self) -> batching::BatchingOptions;
 }

 #[async_trait]
@@ -404,10 +406,11 @@ struct BatchedFunctionExecutorWrapper<E: BatchedFunctionExecutor> {

 impl<E: BatchedFunctionExecutor> BatchedFunctionExecutorWrapper<E> {
     fn new(executor: E) -> Self {
+        let batching_options = executor.batching_options();
         Self {
             enable_cache: executor.enable_cache(),
             behavior_version: executor.behavior_version(),
-            batcher: batching::Batcher::new(executor),
+            batcher: batching::Batcher::new(executor, batching_options),
         }
     }
 }

src/ops/functions/embed_text.rs

Lines changed: 8 additions & 0 deletions
@@ -36,6 +36,14 @@ impl BatchedFunctionExecutor for Executor {
         true
     }

+    fn batching_options(&self) -> batching::BatchingOptions {
+        // A safe default for most embedding providers.
+        // May be tuned for specific providers later.
+        batching::BatchingOptions {
+            max_batch_size: Some(64),
+        }
+    }
+
     async fn evaluate_batch(&self, args: Vec<Vec<Value>>) -> Result<Vec<Value>> {
         let texts = args
             .iter()
