Rename the `bfd-split` packing strategy to `bfd_split` (- -> _)

qgallouedec · qgallouedec · commit 42d4be7e8eec · 2026-03-13T22:01:01.000Z
diff --git a/docs/source/paper_index.md b/docs/source/paper_index.md
@@ -1144,14 +1144,14 @@ SFTConfig(
 
 **📜 Paper**: https://huggingface.co/papers/2404.10830
 
-The paper shows that the standard concat-and-chunk preprocessing (`packing_strategy="wrapped"`) used for LLM training causes many documents to be arbitrarily truncated, which harms learning. It proposes packing document chunks into context windows using a Best-Fit Decreasing bin-packing algorithm, greatly reducing truncation while keeping high token utilization and improving model performance. TRL implements this as the `"bfd-split"` packing strategy in [`SFTConfig`]. For more details on packing, see the [SFT documentation](sft_trainer#packing).
+The paper shows that the standard concat-and-chunk preprocessing (`packing_strategy="wrapped"`) used for LLM training causes many documents to be arbitrarily truncated, which harms learning. It proposes packing document chunks into context windows using a Best-Fit Decreasing bin-packing algorithm, greatly reducing truncation while keeping high token utilization and improving model performance. TRL implements this as the `"bfd_split"` packing strategy in [`SFTConfig`]. For more details on packing, see the [SFT documentation](sft_trainer#packing).
 
 ```python
 from trl import SFTConfig
 
 training_args = SFTConfig(
     packing=True,
-    packing_strategy="bfd-split",
+    packing_strategy="bfd_split",
     max_length=4096,
 )
 ```
diff --git a/docs/source/reducing_memory_usage.md b/docs/source/reducing_memory_usage.md
@@ -81,14 +81,14 @@ TRL supports three strategies:
 * **`bfd` (default)**
   Uses **Best-Fit Decreasing packing**. If a sequence exceeds `max_length`, the overflow tokens are discarded.
 
-* **`bfd-split`**
+* **`bfd_split`**
   Uses **Best-Fit Decreasing packing**, but long sequences are split into chunks ≤ `max_length` before packing. This preserves all tokens and follows the approach proposed in [Fewer Truncations Improve Language Modeling](https://huggingface.co/papers/2404.10830).
 
 * **`wrapped`**
   All tokens are concatenated into a stream and split into fixed-length blocks. This minimizes padding but may mix unrelated examples. This strategy corresponds to the *concatenate-then-split* preprocessing described in the literature (e.g., [Fewer Truncations Improve Language Modeling](https://huggingface.co/papers/2404.10830)). It has the downside of breaking sequence continuity for a large fraction of the dataset, which hurts performance, as discussed in the [Qwen3-Coder-Next Technical Report](https://huggingface.co/papers/2603.00729).
 
 > [!NOTE]
-> If all sequences are shorter than `max_length`, **`bfd` and `bfd-split` behave identically**, since no truncation or splitting is required.
+> If no sequence exceeds `max_length`, **`bfd` and `bfd_split` behave identically**, since no truncation or splitting is required.
 
 ```python
 from trl import SFTConfig
diff --git a/tests/test_data_utils.py b/tests/test_data_utils.py
@@ -1133,7 +1133,7 @@ def test_with_overlong_0(self):
             "input_ids": [[1, 2, 3, 4], [8, 9, 10, 11], [6, 7, 5, 12]],
             "seq_lengths": [[4], [4], [2, 1, 1]],
         }
-        dataset = pack_dataset(dataset, seq_length, strategy="bfd-split")
+        dataset = pack_dataset(dataset, seq_length, strategy="bfd_split")
         assert dataset.to_dict() == expected_output
 
     def test_with_overlong_two_coluns(self):
@@ -1148,7 +1148,7 @@ def test_with_overlong_two_coluns(self):
             "col2": [[-1, 2, -3, 4], [-13, 14, -15, 16], [-7, 8, -9], [10, -11, 12], [-5, 6]],
             "seq_lengths": [[4], [4], [3], [3], [2]],
         }
-        dataset = pack_dataset(dataset, seq_length, strategy="bfd-split")
+        dataset = pack_dataset(dataset, seq_length, strategy="bfd_split")
         assert dataset.to_dict() == expected_output
 
     def test_with_non_power_of_2(self):
@@ -1161,7 +1161,7 @@ def test_with_non_power_of_2(self):
             "input_ids": [[1, 2, 3, 4, 5], [7, 8, 9, 10, 6], [11, 12, 13]],
             "seq_lengths": [[5], [4, 1], [3]],
         }
-        dataset = pack_dataset(dataset, seq_length, strategy="bfd-split")
+        dataset = pack_dataset(dataset, seq_length, strategy="bfd_split")
         assert dataset.to_dict() == expected_output
 
     def test_default_no_split(self):
@@ -1189,7 +1189,7 @@ def test_with_empty_sequences(self):
             "input_ids": [[3, 4, 5, 6], [1, 2]],
             "seq_lengths": [[3, 1], [2]],
         }
-        dataset = pack_dataset(dataset, seq_length, strategy="bfd-split")
+        dataset = pack_dataset(dataset, seq_length, strategy="bfd_split")
         assert dataset.to_dict() == expected_output
 
 
diff --git a/trl/data_utils.py b/trl/data_utils.py
@@ -807,7 +807,7 @@ def pack_dataset(
             - `"bfd"` (Best Fit Decreasing): Preserves sequence boundaries and truncates sequences that exceed
                 `seq_length`, discarding overflow tokens. Ideal for SFT and conversational datasets where maintaining
                 conversation structure is important.
-            - `"bfd-split"`: Similar to `"bfd"` but splits overflow sequences for packing into other examples. Prevents
+            - `"bfd_split"`: Similar to `"bfd"` but splits overflow sequences for packing into other examples. Prevents
                 token loss for pre-training or long documents, but may break conversation structure in SFT datasets.
             - `"wrapped"`: Faster but more aggressive. Ignores sequence boundaries and will cut sequences in the middle
                 to completely fill each packed sequence with data.
@@ -835,8 +835,8 @@ def pack_dataset(
      'attention_mask': [[1, 1, 1, 0], [1, 1, 0, 1], [1, 0]],
      'seq_lengths': [[4], [3, 1], [2]]}
 
-    >>> # "bfd-split" strategy: preserves all tokens
-    >>> packed_dataset = pack_dataset(dataset, seq_length=4, strategy="bfd-split")
+    >>> # "bfd_split" strategy: preserves all tokens
+    >>> packed_dataset = pack_dataset(dataset, seq_length=4, strategy="bfd_split")
     >>> packed_dataset[:]
     {'input_ids': [[1, 2, 3, 4], [8, 9, 10, 5], [6, 7, 11]],
      'attention_mask': [[1, 1, 1, 0], [1, 1, 0, 0], [1, 0, 1]],
@@ -846,7 +846,7 @@ def pack_dataset(
     if map_kwargs is None:
         map_kwargs = {}
 
-    valid_strategies = ("bfd", "bfd-split", "wrapped")
+    valid_strategies = ("bfd", "bfd_split", "wrapped")
     if strategy not in valid_strategies:
         raise ValueError(f"Invalid packing strategy '{strategy}', must be one of {valid_strategies}.")
     format = _get_dataset_format(dataset)
@@ -858,7 +858,7 @@ def pack_dataset(
             fn_kwargs={"seq_length": seq_length, "on_seq_length_overflow": "truncate"},
             **map_kwargs,
         )
-    elif strategy == "bfd-split":
+    elif strategy == "bfd_split":
         dataset = dataset.map(
             _pack_bfd,
             batched=True,
@@ -870,7 +870,7 @@ def pack_dataset(
     else:
         raise ValueError(f"Invalid packing strategy: '{strategy}', must be one of {valid_strategies}.")
 
-    if strategy in {"bfd", "bfd-split"} and "columns" in format:
+    if strategy in {"bfd", "bfd_split"} and "columns" in format:
         format["columns"] = format["columns"] + ["seq_lengths"]
 
     dataset = dataset.with_format(**format)
diff --git a/trl/trainer/sft_config.py b/trl/trainer/sft_config.py
@@ -72,7 +72,7 @@ class SFTConfig(_BaseConfig):
             Whether to group multiple sequences into fixed-length blocks to improve computational efficiency and reduce
             padding. Uses `max_length` to define sequence length.
         packing_strategy (`str`, *optional*, defaults to `"bfd"`):
-            Strategy for packing sequences. Can be `"bfd"` (best-fit decreasing, truncates overflow), `"bfd-split"`
+            Strategy for packing sequences. Can be `"bfd"` (best-fit decreasing, truncates overflow), `"bfd_split"`
             (best-fit decreasing, splits overflow sequences), or `"wrapped"` (aggressive, cuts mid-sequence).
         padding_free (`bool`, *optional*, defaults to `False`):
             Whether to perform forward passes without padding by flattening all sequences in the batch into a single
@@ -193,9 +193,9 @@ class SFTConfig(_BaseConfig):
         default="bfd",
         metadata={
             "help": "Strategy for packing sequences. Can be `'bfd'` (best-fit decreasing, truncates overflow), "
-            "`'bfd-split'` (best-fit decreasing, splits overflow sequences), or `'wrapped'` (aggressive, cuts "
+            "`'bfd_split'` (best-fit decreasing, splits overflow sequences), or `'wrapped'` (aggressive, cuts "
             "mid-sequence).",
-            "choices": ["bfd", "bfd-split", "wrapped"],
+            "choices": ["bfd", "bfd_split", "wrapped"],
         },
     )
     padding_free: bool = field(
@@ -259,7 +259,7 @@ def __post_init__(self):
 
         if self.packing_strategy == "bfd-requeue":
             warnings.warn(
-                "The `bfd-requeue` packing strategy has been renamed to `bfd-split`. Please update your configuration accordingly. "
+                "The `bfd-requeue` packing strategy has been renamed to `bfd_split`. Please update your configuration accordingly. "
                 "The `bfd-requeue` strategy is deprecated and will be removed in a future version.",
                 FutureWarning,
             )
diff --git a/trl/trainer/sft_trainer.py b/trl/trainer/sft_trainer.py
@@ -795,7 +795,7 @@ def __init__(
         # Data collator
         # BFD packing requires padding-free mode; otherwise, the collator outputs padded attention masks, causing
         # FlashAttention to ignore position_ids and recompute them incorrectly from the padded attention mask.
-        self.padding_free = args.padding_free or (args.packing and args.packing_strategy in {"bfd", "bfd-split"})
+        self.padding_free = args.padding_free or (args.packing and args.packing_strategy in {"bfd", "bfd_split"})
         use_flash_attention = model.config._attn_implementation in FLASH_ATTENTION_VARIANTS
         if self.padding_free:
             if data_collator is not None:
@@ -864,7 +864,7 @@ def __init__(
                 dataset_text_field=args.dataset_text_field,
             )
 
-        if args.packing and args.packing_strategy in {"bfd", "bfd-split"} and not use_flash_attention:
+        if args.packing and args.packing_strategy in {"bfd", "bfd_split"} and not use_flash_attention:
             logger.warning(
                 "You are using packing, but the attention implementation is not set to a supported flash attention "
                 "variant. Packing gathers multiple samples into a single sequence, and only the following "