Skip to content

Commit 598f63a

Browse files
authored
Fix issue #939 - tokens batch_type may exceed max_batch_size (#1948)
* 1. Fix the batching logic to include padding tokens in the batch-size increment in the BatchReader.get_next method. rebatch_input will always pass batch_increment_is_fixed=true; since rebatch_input sorts the input by length in descending order, the first example in every batch will be the longest, so the batch increment is fixed at the length of the longest example in the batch. This solves issue #939. Since batch_increment_is_fixed is false by default, the change does not affect the prefetching logic, as mentioned in the revert of the previous PR addressing this issue: #1314. 2. Fix the same issue in the _batch_iterator method of the CTranslate2/python/ctranslate2/extensions.py module. 3. Add tests for both changes * Add a comment * 1. Improve the get_next method implementation by allowing it to work with unsorted example input. 2. Fix memory over-allocation when batch_type=tokens: if we reserve max_batch_size entries for the batch vector, we may over-allocate memory, so shrink_to_fit is needed before returning the batch. * 1. Rename the batch_size_increment_is_fixed variable to consider_padding 2. Update documentation
1 parent 71cdf3a commit 598f63a

File tree

7 files changed

+218
-37
lines changed

7 files changed

+218
-37
lines changed

include/ctranslate2/batch_reader.h

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,8 @@ namespace ctranslate2 {
5656

5757
std::vector<Example>
5858
get_next(const size_t max_batch_size,
59-
const BatchType batch_type = BatchType::Examples);
59+
const BatchType batch_type = BatchType::Examples,
60+
const bool consider_padding = false);
6061

6162
// Consumes and returns the next example.
6263
virtual Example get_next_example() = 0;
@@ -67,6 +68,12 @@ namespace ctranslate2 {
6768
}
6869

6970
private:
71+
std::vector<Example> fill_batch_with_fixed_increment(const size_t max_batch_size,
72+
const BatchType batch_type);
73+
74+
std::vector<Example> fill_batch_with_variable_increment(const size_t max_batch_size,
75+
const BatchType batch_type);
76+
7077
bool _initialized = false;
7178
Example _next;
7279
};

python/cpp/generator.cc

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -234,10 +234,10 @@ namespace ctranslate2 {
234234
Arguments:
235235
start_tokens: Batch of start tokens. If the decoder starts from a special
236236
start token like ``<s>``, this token should be added to this input.
237-
max_batch_size: The maximum batch size. If the number of inputs is greater than
238-
:obj:`max_batch_size`, the inputs are sorted by length and split by chunks of
239-
:obj:`max_batch_size` examples so that the number of padding positions is
240-
minimized.
237+
max_batch_size: The maximum batch size. If the number of inputs is greater than :obj:`max_batch_size`,
238+
the inputs are sorted by length and split by chunks of :obj:`max_batch_size` examples
239+
(or tokens when :obj:`batch_type`="tokens") so that the number of padding positions
240+
is minimized.
241241
batch_type: Whether :obj:`max_batch_size` is the number of "examples" or "tokens".
242242
asynchronous: Run the generation asynchronously.
243243
beam_size: Beam size (1 for greedy search).

python/cpp/translator.cc

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -372,10 +372,10 @@ namespace ctranslate2 {
372372
Arguments:
373373
source: Batch of source tokens.
374374
target_prefix: Optional batch of target prefix tokens.
375-
max_batch_size: The maximum batch size. If the number of inputs is greater than
376-
:obj:`max_batch_size`, the inputs are sorted by length and split by chunks of
377-
:obj:`max_batch_size` examples so that the number of padding positions is
378-
minimized.
375+
max_batch_size: The maximum batch size. If the number of inputs is greater than :obj:`max_batch_size`,
376+
the inputs are sorted by length and split by chunks of :obj:`max_batch_size` examples
377+
(or tokens when :obj:`batch_type`="tokens") so that the number of padding positions
378+
is minimized.
379379
batch_type: Whether :obj:`max_batch_size` is the number of "examples" or "tokens".
380380
asynchronous: Run the translation asynchronously.
381381
beam_size: Beam size (1 for greedy search).

python/ctranslate2/extensions.py

Lines changed: 17 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -556,30 +556,34 @@ def _process_iterable(process_func, iterables, max_batch_size, batch_type, **kwa
556556

557557
def _batch_iterator(iterable, batch_size, batch_type):
558558
streams = None
559-
cur_batch_size = 0
559+
max_length = 0
560560

561561
for example in iterable:
562562
if not isinstance(example, tuple):
563563
example = (example,)
564564

565+
if batch_type == "examples":
566+
if streams and len(streams[0]) == batch_size:
567+
yield streams
568+
streams = None
569+
570+
elif batch_type == "tokens":
571+
max_length = max(max_length, len(example[0]))
572+
573+
if streams and (len(streams[0]) + 1) * max_length > batch_size:
574+
yield streams
575+
streams = None
576+
max_length = len(example[0])
577+
578+
else:
579+
raise ValueError("Invalid batch type %s" % batch_type)
580+
565581
if streams is None:
566582
streams = tuple([] for _ in example)
567583
for batch, element in zip(streams, example):
568584
if element is None and len(streams) > 1:
569585
raise ValueError("Input iterables do not have the same length")
570586
batch.append(element)
571587

572-
if batch_type == "examples":
573-
cur_batch_size += 1
574-
elif batch_type == "tokens":
575-
cur_batch_size += len(example[0])
576-
else:
577-
raise ValueError("Invalid batch type %s" % batch_type)
578-
579-
if cur_batch_size >= batch_size:
580-
yield streams
581-
streams = None
582-
cur_batch_size = 0
583-
584588
if streams is not None:
585589
yield streams

python/tests/test_misc.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
import pytest
2+
3+
from ctranslate2.extensions import _batch_iterator as batch_iterator
4+
5+
6+
@pytest.mark.parametrize(
7+
"batch_size,batch_type,lengths,expected_batch_sizes",
8+
[
9+
(2, "examples", [2, 3, 4, 1, 1], [2, 2, 1]),
10+
(6, "tokens", [2, 3, 1, 4, 1, 2], [2, 1, 1, 2]),
11+
],
12+
)
13+
def test_batch_iterator(batch_size, batch_type, lengths, expected_batch_sizes):
14+
iterable = (["a"] * length for length in lengths)
15+
16+
batches = batch_iterator(iterable, batch_size, batch_type)
17+
batch_sizes = [len(batch[0]) for batch in batches]
18+
19+
assert batch_sizes == expected_batch_sizes

src/batch_reader.cc

Lines changed: 52 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -36,33 +36,69 @@ namespace ctranslate2 {
3636
}
3737

3838
std::vector<Example>
39-
BatchReader::get_next(const size_t max_batch_size,
40-
const BatchType batch_type) {
41-
if (max_batch_size == 0)
42-
throw std::invalid_argument("BatchReader: max_batch_size must be > 0");
39+
BatchReader::fill_batch_with_fixed_increment(const size_t max_batch_size,
40+
const BatchType batch_type) {
41+
std::vector<Example> batch;
42+
batch.reserve(max_batch_size);
4343

44-
if (!_initialized) {
44+
size_t max_increment = 0;
45+
46+
while (!_next.empty()) {
47+
const size_t cur_increment = get_batch_size_increment(_next, batch_type);
48+
max_increment = std::max(max_increment, cur_increment);
49+
const size_t new_batch_size = (batch.size() + 1) * max_increment;
50+
51+
if (!batch.empty() && new_batch_size > max_batch_size)
52+
break;
53+
54+
batch.emplace_back(std::move(_next));
4555
_next = get_next_example();
46-
_initialized = true;
4756
}
57+
return batch;
58+
}
4859

60+
std::vector<Example>
61+
BatchReader::fill_batch_with_variable_increment(const size_t max_batch_size,
62+
const BatchType batch_type) {
4963
std::vector<Example> batch;
50-
if (_next.empty())
51-
return batch;
52-
5364
batch.reserve(max_batch_size);
5465

55-
size_t batch_size = 0;
66+
size_t total_increment = 0;
5667

5768
while (!_next.empty()) {
58-
const size_t batch_size_increment = get_batch_size_increment(_next, batch_type);
59-
if (batch_size > 0 && batch_size + batch_size_increment > max_batch_size)
69+
const size_t cur_increment = get_batch_size_increment(_next, batch_type);
70+
const size_t new_batch_size = total_increment + cur_increment;
71+
72+
if (!batch.empty() && new_batch_size > max_batch_size)
6073
break;
74+
6175
batch.emplace_back(std::move(_next));
62-
batch_size += batch_size_increment;
76+
total_increment += cur_increment;
77+
_next = get_next_example();
78+
}
79+
return batch;
80+
}
81+
82+
std::vector<Example>
83+
BatchReader::get_next(const size_t max_batch_size,
84+
const BatchType batch_type,
85+
const bool consider_padding) {
86+
if (max_batch_size == 0)
87+
throw std::invalid_argument("BatchReader: max_batch_size must be > 0");
88+
89+
if (!_initialized) {
6390
_next = get_next_example();
91+
_initialized = true;
6492
}
6593

94+
if (_next.empty())
95+
return {};
96+
97+
auto batch = consider_padding
98+
? fill_batch_with_fixed_increment(max_batch_size, batch_type)
99+
: fill_batch_with_variable_increment(max_batch_size, batch_type);
100+
101+
batch.shrink_to_fit();
66102
return batch;
67103
}
68104

@@ -170,7 +206,8 @@ namespace ctranslate2 {
170206
VectorReader batch_reader(index_vector(examples, example_index));
171207

172208
for (size_t offset = 0;;) {
173-
auto examples_part = batch_reader.get_next(max_batch_size, batch_type);
209+
// the batch size increment per example is always fixed because padding is required
210+
auto examples_part = batch_reader.get_next(max_batch_size, batch_type, true);
174211
if (examples_part.empty())
175212
break;
176213

@@ -189,4 +226,4 @@ namespace ctranslate2 {
189226
return batches;
190227
}
191228

192-
}
229+
}

tests/batching_test.cc

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,3 +36,117 @@ TEST(BatchingTest, RebatchInput) {
3636
EXPECT_EQ(batch.example_index, expected_batches[i]);
3737
}
3838
}
39+
40+
TEST(BatchingTest, BatchReaderGetNext_Examples) {
41+
const std::vector<std::vector<std::string>> examples = {
42+
{"a", "b"},
43+
{"a", "b", "c"},
44+
{"a"},
45+
{"a", "b", "c", "d"}
46+
};
47+
const std::vector<std::vector<size_t>> expected_batches = {{0, 1}, {2, 3}};
48+
49+
VectorReader reader(examples);
50+
51+
for (const auto& expected_batch : expected_batches) {
52+
auto batch = reader.get_next(2, BatchType::Examples, true);
53+
ASSERT_EQ(batch.size(), expected_batch.size());
54+
for (size_t i = 0; i < batch.size(); ++i) {
55+
EXPECT_EQ(batch[i].streams[0], examples[expected_batch[i]]);
56+
}
57+
}
58+
}
59+
60+
TEST(BatchingTest, BatchReaderGetNext_TokensFixed) {
61+
const std::vector<std::vector<std::string>> source = {
62+
{"a", "b", "c", "d"},
63+
{"a", "b", "c", "d", "e"},
64+
{"a"},
65+
{"a", "b", "c"},
66+
{"a", "b"}
67+
};
68+
const std::vector<std::vector<std::string>> target = {
69+
{"1"},
70+
{"2"},
71+
{"3"},
72+
{"4"},
73+
{"5"}
74+
};
75+
76+
const std::vector<std::vector<size_t>> expected_batches = {{1}, {0}, {3, 4}, {2}};
77+
78+
const auto batches = rebatch_input(load_examples({source, target}), 6, BatchType::Tokens);
79+
ASSERT_EQ(batches.size(), expected_batches.size());
80+
81+
for (size_t i = 0; i < batches.size(); ++i) {
82+
const auto& batch = batches[i];
83+
EXPECT_EQ(batch.get_stream(0), index_vector(source, expected_batches[i]));
84+
EXPECT_EQ(batch.get_stream(1), index_vector(target, expected_batches[i]));
85+
EXPECT_EQ(batch.example_index, expected_batches[i]);
86+
}
87+
}
88+
89+
TEST(BatchingTest, BatchReaderGetNext_TokensDynamic) {
90+
const std::vector<std::vector<std::string>> examples = {
91+
{"a", "b"},
92+
{"a", "b", "c"},
93+
{"a"},
94+
{"a", "b", "c", "d"},
95+
{"a", "b", "c", "d", "e"}
96+
};
97+
98+
const std::vector<std::vector<size_t>> expected_batches = {{0, 1, 2}, {3}, {4}};
99+
100+
VectorReader reader(examples);
101+
102+
for (const auto& expected_batch : expected_batches) {
103+
auto batch = reader.get_next(6, BatchType::Tokens, false);
104+
ASSERT_EQ(batch.size(), expected_batch.size());
105+
for (size_t i = 0; i < batch.size(); ++i) {
106+
EXPECT_EQ(batch[i].streams[0], examples[expected_batch[i]]);
107+
}
108+
}
109+
}
110+
111+
TEST(BatchingTest, BatchReaderGetNext_TokensFixed2) {
112+
const std::vector<std::vector<std::string>> source = {
113+
{"a", "b", "c", "d", "e"},
114+
{"a", "b"},
115+
{"a"}
116+
};
117+
const std::vector<std::vector<std::string>> target = {
118+
{"1"},
119+
{"2"},
120+
{"3"}
121+
};
122+
123+
const std::vector<std::vector<size_t>> expected_batches = {{0}, {1, 2}};
124+
const auto batches = rebatch_input(load_examples({source, target}), 8, BatchType::Tokens);
125+
ASSERT_EQ(batches.size(), expected_batches.size());
126+
127+
for (size_t i = 0; i < batches.size(); ++i) {
128+
const auto& batch = batches[i];
129+
EXPECT_EQ(batch.get_stream(0), index_vector(source, expected_batches[i]));
130+
EXPECT_EQ(batch.get_stream(1), index_vector(target, expected_batches[i]));
131+
EXPECT_EQ(batch.example_index, expected_batches[i]);
132+
}
133+
}
134+
135+
TEST(BatchingTest, BatchReaderGetNext_TokensDynamic2) {
136+
const std::vector<std::vector<std::string>> source = {
137+
{"a", "b", "c", "d", "e"},
138+
{"a", "b"},
139+
{"a"}
140+
};
141+
142+
const std::vector<std::vector<size_t>> expected_batches = {{0, 1, 2}};
143+
VectorReader reader(source);
144+
145+
for (const auto& expected_batch : expected_batches) {
146+
auto batch = reader.get_next(8, BatchType::Tokens, false);
147+
ASSERT_EQ(batch.size(), expected_batch.size());
148+
for (size_t i = 0; i < batch.size(); ++i) {
149+
EXPECT_EQ(batch[i].streams[0], source[expected_batch[i]]);
150+
}
151+
}
152+
}

0 commit comments

Comments
 (0)