Skip to content

Commit ba913b8

Browse files
authored
Add support for max batch size to AI embedding models. (#9040)
Adds annotation `ext::ai::embedding_model_max_batch_size` which limits the number of inputs that can be part of a single batched embedding request.
1 parent ec67872 commit ba913b8

File tree

5 files changed

+111
-21
lines changed

5 files changed

+111
-21
lines changed

docs/reference/ai/extai.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -409,6 +409,7 @@ instantiated as a :eql:type:`ext::ai::TextGenerationModel`
409409

410410
* ``embedding_model_max_input_tokens`` - Maximum tokens per input
411411
* ``embedding_model_max_batch_tokens`` - Maximum tokens per batch. Default: ``'8191'``.
412+
* ``embedding_model_max_batch_size`` - Maximum inputs per batch. Optional.
412413
* ``embedding_model_max_output_dimensions`` - Maximum embedding dimensions
413414
* ``embedding_model_supports_shortening`` - Input shortening support flag
414415

edb/buildmeta.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@
5757
# The merge conflict there is a nice reminder that you probably need
5858
# to write a patch in edb/pgsql/patches.py, and then you should preserve
5959
# the old value.
60-
EDGEDB_CATALOG_VERSION = 2025_09_09_00_00
60+
EDGEDB_CATALOG_VERSION = 2025_09_23_00_00
6161
EDGEDB_MAJOR_VERSION = 8
6262

6363

edb/lib/ext/ai.edgeql

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,9 @@ CREATE EXTENSION PACKAGE ai VERSION '1.0' {
198198
create abstract inheritable annotation
199199
ext::ai::embedding_model_max_batch_tokens;
200200

201+
create abstract inheritable annotation
202+
ext::ai::embedding_model_max_batch_size;
203+
201204
create abstract inheritable annotation
202205
ext::ai::embedding_model_max_output_dimensions;
203206

@@ -212,6 +215,8 @@ CREATE EXTENSION PACKAGE ai VERSION '1.0' {
212215
# for now, use the openai batch limit as the default.
213216
create annotation
214217
ext::ai::embedding_model_max_batch_tokens := "8191";
218+
create annotation
219+
ext::ai::embedding_model_max_batch_size := "<optional>";
215220
create annotation
216221
ext::ai::embedding_model_max_output_dimensions := "<must override>";
217222
create annotation

edb/server/protocol/ai_ext.py

Lines changed: 67 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -328,6 +328,7 @@ class BaseModel:
328328
class EmbeddingModel (BaseModel):
329329
max_input_tokens: int
330330
max_batch_tokens: int
331+
max_batch_size: int | None
331332
max_output_dimensions: int
332333
supports_shortening: bool
333334

@@ -339,6 +340,9 @@ class EmbeddingModel (BaseModel):
339340
max_batch_tokens_annotation: ClassVar[str] = (
340341
"ext::ai::embedding_model_max_batch_tokens"
341342
)
343+
max_batch_size_annotation: ClassVar[str] = (
344+
"ext::ai::embedding_model_max_batch_size"
345+
)
342346
max_output_dimensions_annotation: ClassVar[str] = (
343347
"ext::ai::embedding_model_max_output_dimensions"
344348
)
@@ -846,6 +850,8 @@ async def _generate_embeddings_params(
846850
embeddings_params: list[EmbeddingsParams] = []
847851

848852
for model_name, pending_entries in model_pending_entries.items():
853+
embedding_model = embedding_models[model_name]
854+
849855
groups = itertools.groupby(
850856
pending_entries, key=lambda e: e.target_dims_shortening
851857
)
@@ -856,8 +862,9 @@ async def _generate_embeddings_params(
856862
batches, excluded_indexes = batch_texts(
857863
part_texts,
858864
get_model_tokenizer(provider_name, model_name),
859-
embedding_models[model_name].max_input_tokens,
860-
embedding_models[model_name].max_batch_tokens,
865+
max_input_tokens=embedding_model.max_input_tokens,
866+
max_batch_tokens=embedding_model.max_batch_tokens,
867+
max_batch_size=embedding_model.max_batch_size,
861868
)
862869

863870
if excluded_indexes:
@@ -908,8 +915,10 @@ class TextBatch:
908915
def batch_texts(
909916
texts: list[tuple[str, bool]],
910917
tokenizer: Optional[Tokenizer],
918+
*,
911919
max_input_tokens: int,
912920
max_batch_tokens: int,
921+
max_batch_size: int | None,
913922
) -> tuple[list[TextBatch], list[int]]:
914923
"""Given a list of texts and whether each can be truncated, produce a list
915924
of valid texts to batch.
@@ -942,7 +951,7 @@ def batch_texts(
942951

943952
# Group the valid texts into batches based on token count
944953
batched_inputs = _batch_embeddings_inputs(
945-
tokenizer, input_texts, max_batch_tokens
954+
tokenizer, input_texts, max_batch_tokens, max_batch_size
946955
)
947956

948957
# Gather results
@@ -960,6 +969,25 @@ def batch_texts(
960969
for batch_input_indexes, token_count in batched_inputs
961970
]
962971

972+
elif max_batch_size:
973+
batch_count = (len(texts) - 1) // max_batch_size + 1
974+
batches = [
975+
TextBatch(
976+
entries=[
977+
TextBatchEntry(
978+
input_index=index,
979+
input_text=texts[index][0],
980+
)
981+
for index in range(
982+
batch_index * max_batch_size,
983+
min((batch_index + 1) * max_batch_size, len(texts))
984+
)
985+
],
986+
token_count=0,
987+
)
988+
for batch_index in range(batch_count)
989+
]
990+
963991
else:
964992
batches = [
965993
TextBatch(
@@ -1099,6 +1127,7 @@ def _batch_embeddings_inputs(
10991127
tokenizer: Tokenizer,
11001128
inputs: list[str],
11011129
max_batch_tokens: int,
1130+
max_batch_size: int | None,
11021131
) -> list[tuple[list[int], int]]:
11031132
"""Create batches of embeddings inputs.
11041133
@@ -1140,9 +1169,15 @@ def unbatched_token_count(unbatched_index: int) -> int:
11401169

11411170
if batch_token_count < max_batch_tokens:
11421171
# Then add the smallest available input as long as long as the
1143-
# max batch token count isn't exceeded
1172+
# max batch token and input counts aren't exceeded
11441173
unbatched_index = 0
1145-
while unbatched_index < len(unbatched_input_indexes):
1174+
while (
1175+
unbatched_index < len(unbatched_input_indexes)
1176+
and (
1177+
max_batch_size is None
1178+
or len(batch_input_indexes) < max_batch_size
1179+
)
1180+
):
11461181
if (
11471182
batch_token_count + unbatched_token_count(unbatched_index)
11481183
<= max_batch_tokens
@@ -3124,6 +3159,7 @@ async def _get_embedding_models(
31243159
EmbeddingModel.provider_annotation,
31253160
EmbeddingModel.max_model_input_tokens_annotation,
31263161
EmbeddingModel.max_batch_tokens_annotation,
3162+
EmbeddingModel.max_batch_size_annotation,
31273163
EmbeddingModel.max_output_dimensions_annotation,
31283164
EmbeddingModel.supports_shortening_annotation,
31293165
],
@@ -3143,6 +3179,20 @@ def _get_ann(
31433179
)
31443180
return val
31453181

3182+
def _get_bool_ann(
3183+
model: str,
3184+
anns: dict[str, str | None],
3185+
name: str,
3186+
) -> bool:
3187+
val = _get_ann(model, anns, name)
3188+
try:
3189+
return bool(val)
3190+
except ValueError:
3191+
raise InternalError(
3192+
f"Model '{model}' annotation '{name}' "
3193+
f"has non boolean value {val}"
3194+
)
3195+
31463196
def _get_int_ann(
31473197
model: str,
31483198
anns: dict[str, str | None],
@@ -3157,18 +3207,20 @@ def _get_int_ann(
31573207
f"has non integer value {val}"
31583208
)
31593209

3160-
def _get_bool_ann(
3210+
def _get_int_or_none_ann(
31613211
model: str,
31623212
anns: dict[str, str | None],
31633213
name: str,
3164-
) -> bool:
3214+
) -> int | None:
31653215
val = _get_ann(model, anns, name)
3216+
if val == "<optional>":
3217+
return None
31663218
try:
3167-
return bool(val)
3219+
return int(val)
31683220
except ValueError:
31693221
raise InternalError(
31703222
f"Model '{model}' annotation '{name}' "
3171-
f"has non boolean value {val}"
3223+
f"has non integer value {val}"
31723224
)
31733225

31743226
result: dict[str, EmbeddingModel] = {}
@@ -3182,6 +3234,9 @@ def _get_bool_ann(
31823234
max_batch_tokens=_get_int_ann(
31833235
model, anns, EmbeddingModel.max_batch_tokens_annotation
31843236
),
3237+
max_batch_size=_get_int_or_none_ann(
3238+
model, anns, EmbeddingModel.max_batch_size_annotation
3239+
),
31853240
max_output_dimensions=_get_int_ann(
31863241
model, anns, EmbeddingModel.max_output_dimensions_annotation
31873242
),
@@ -3427,8 +3482,6 @@ async def generate_embeddings_for_texts(
34273482
embedding_model = embedding_models[model_name]
34283483

34293484
tokenizer = get_model_tokenizer(provider, model_name)
3430-
max_input_tokens = embedding_model.max_input_tokens
3431-
max_batch_tokens = embedding_model.max_batch_tokens
34323485

34333486
texts = [
34343487
(
@@ -3441,8 +3494,9 @@ async def generate_embeddings_for_texts(
34413494
text_batches, excluded_indexes = batch_texts(
34423495
texts,
34433496
tokenizer,
3444-
max_input_tokens,
3445-
max_batch_tokens,
3497+
max_input_tokens=embedding_model.max_input_tokens,
3498+
max_batch_tokens=embedding_model.max_batch_tokens,
3499+
max_batch_size=embedding_model.max_batch_size,
34463500
)
34473501

34483502
if excluded_indexes or too_long:

tests/test_ext_ai.py

Lines changed: 37 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1355,23 +1355,26 @@ def test_batch_embeddings_inputs_01(self):
13551355
ai_ext._batch_embeddings_inputs(
13561356
CharacterTokenizer(),
13571357
[],
1358-
10
1358+
10,
1359+
None,
13591360
),
13601361
[],
13611362
)
13621363
self.assertEqual(
13631364
ai_ext._batch_embeddings_inputs(
13641365
CharacterTokenizer(),
13651366
['1', '22', '333', '4444'],
1366-
10
1367+
10,
1368+
None,
13671369
),
13681370
[([3, 0, 1, 2], 10)],
13691371
)
13701372
self.assertEqual(
13711373
ai_ext._batch_embeddings_inputs(
13721374
CharacterTokenizer(),
13731375
['1', '22', '333', '4444', '55555'],
1374-
10
1376+
10,
1377+
None,
13751378
),
13761379
[
13771380
([4, 0, 1], 8),
@@ -1382,7 +1385,8 @@ def test_batch_embeddings_inputs_01(self):
13821385
ai_ext._batch_embeddings_inputs(
13831386
CharacterTokenizer(),
13841387
['1', '22', '333', '4444', '55555', '666666'],
1385-
10
1388+
10,
1389+
None,
13861390
),
13871391
[
13881392
([5, 0, 1], 9),
@@ -1394,7 +1398,8 @@ def test_batch_embeddings_inputs_01(self):
13941398
ai_ext._batch_embeddings_inputs(
13951399
CharacterTokenizer(),
13961400
['1', '22', '333', '4444', '55555', '666666'],
1397-
10
1401+
10,
1402+
None,
13981403
),
13991404
[
14001405
([5, 0, 1], 9),
@@ -1406,19 +1411,44 @@ def test_batch_embeddings_inputs_01(self):
14061411
ai_ext._batch_embeddings_inputs(
14071412
CharacterTokenizer(),
14081413
['1', '22', '333', '4444', '55555', '121212121212'],
1409-
10
1414+
10,
1415+
None,
14101416
),
14111417
[
14121418
([4, 0, 1], 8),
14131419
([3, 2], 7),
14141420
],
14151421
)
1422+
self.assertEqual(
1423+
ai_ext._batch_embeddings_inputs(
1424+
CharacterTokenizer(),
1425+
[
1426+
'1',
1427+
'22',
1428+
'333',
1429+
'4444',
1430+
'55555',
1431+
'666666',
1432+
'7777777',
1433+
'88888888',
1434+
],
1435+
12,
1436+
3,
1437+
),
1438+
[
1439+
([7, 0, 1], 11),
1440+
([6, 2], 10),
1441+
([5, 3], 10),
1442+
([4], 5),
1443+
],
1444+
)
14161445
# Text is alphabetically ordered to ensure consistent batching
14171446
self.assertEqual(
14181447
ai_ext._batch_embeddings_inputs(
14191448
CharacterTokenizer(),
14201449
['AAA', 'CCC', 'EEE', 'BBB', 'DDD'],
1421-
10
1450+
10,
1451+
None,
14221452
),
14231453
[
14241454
([2, 0, 3], 9),

0 commit comments

Comments
 (0)