Skip to content

Commit 42d4be7

Browse files
committed
- -> _
1 parent 4737aee commit 42d4be7

File tree

6 files changed

+20
-20
lines changed

6 files changed

+20
-20
lines changed

docs/source/paper_index.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1144,14 +1144,14 @@ SFTConfig(
11441144

11451145
**📜 Paper**: https://huggingface.co/papers/2404.10830
11461146

1147-
The paper shows that the standard concat-and-chunk preprocessing (`packing_strategy="wrapped"`) used for LLM training causes many documents to be arbitrarily truncated, which harms learning. It proposes packing document chunks into context windows using a Best-Fit Decreasing bin-packing algorithm, greatly reducing truncation while keeping high token utilization and improving model performance. TRL implements this as the `"bfd-split"` packing strategy in [`SFTConfig`]. For more details on packing, see the [SFT documentation](sft_trainer#packing).
1147+
The paper shows that the standard concat-and-chunk preprocessing (`packing_strategy="wrapped"`) used for LLM training causes many documents to be arbitrarily truncated, which harms learning. It proposes packing document chunks into context windows using a Best-Fit Decreasing bin-packing algorithm, greatly reducing truncation while keeping high token utilization and improving model performance. TRL implements this as the `"bfd_split"` packing strategy in [`SFTConfig`]. For more details on packing, see the [SFT documentation](sft_trainer#packing).
11481148

11491149
```python
11501150
from trl import SFTConfig
11511151

11521152
training_args = SFTConfig(
11531153
packing=True,
1154-
packing_strategy="bfd-split",
1154+
packing_strategy="bfd_split",
11551155
max_length=4096,
11561156
)
11571157
```

docs/source/reducing_memory_usage.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,14 +81,14 @@ TRL supports three strategies:
8181
* **`bfd` (default)**
8282
Uses **Best-Fit Decreasing packing**. If a sequence exceeds `max_length`, the overflow tokens are discarded.
8383

84-
* **`bfd-split`**
84+
* **`bfd_split`**
8585
Uses **Best-Fit Decreasing packing**, but long sequences are split into chunks ≤ `max_length` before packing. This preserves all tokens and follows the approach proposed in [Fewer Truncations Improve Language Modeling](https://huggingface.co/papers/2404.10830).
8686

8787
* **`wrapped`**
8888
All tokens are concatenated into a stream and split into fixed-length blocks. This minimizes padding but may mix unrelated examples. This strategy corresponds to the *concatenate-then-split* preprocessing described in the literature (e.g., [Fewer Truncations Improve Language Modeling](https://huggingface.co/papers/2404.10830)). It has the downside of breaking sequence continuity for a large fraction of the dataset, which hurts performance, as discussed in the [Qwen3-Coder-Next Technical Report](https://huggingface.co/papers/2603.00729).
8989

9090
> [!NOTE]
91-
> If all sequences are shorter than `max_length`, **`bfd` and `bfd-split` behave identically**, since no truncation or splitting is required.
91+
> If no sequence exceeds `max_length`, **`bfd` and `bfd_split` behave identically**, since no truncation or splitting is required.
9292
9393
```python
9494
from trl import SFTConfig

tests/test_data_utils.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1133,7 +1133,7 @@ def test_with_overlong_0(self):
11331133
"input_ids": [[1, 2, 3, 4], [8, 9, 10, 11], [6, 7, 5, 12]],
11341134
"seq_lengths": [[4], [4], [2, 1, 1]],
11351135
}
1136-
dataset = pack_dataset(dataset, seq_length, strategy="bfd-split")
1136+
dataset = pack_dataset(dataset, seq_length, strategy="bfd_split")
11371137
assert dataset.to_dict() == expected_output
11381138

11391139
def test_with_overlong_two_coluns(self):
@@ -1148,7 +1148,7 @@ def test_with_overlong_two_coluns(self):
11481148
"col2": [[-1, 2, -3, 4], [-13, 14, -15, 16], [-7, 8, -9], [10, -11, 12], [-5, 6]],
11491149
"seq_lengths": [[4], [4], [3], [3], [2]],
11501150
}
1151-
dataset = pack_dataset(dataset, seq_length, strategy="bfd-split")
1151+
dataset = pack_dataset(dataset, seq_length, strategy="bfd_split")
11521152
assert dataset.to_dict() == expected_output
11531153

11541154
def test_with_non_power_of_2(self):
@@ -1161,7 +1161,7 @@ def test_with_non_power_of_2(self):
11611161
"input_ids": [[1, 2, 3, 4, 5], [7, 8, 9, 10, 6], [11, 12, 13]],
11621162
"seq_lengths": [[5], [4, 1], [3]],
11631163
}
1164-
dataset = pack_dataset(dataset, seq_length, strategy="bfd-split")
1164+
dataset = pack_dataset(dataset, seq_length, strategy="bfd_split")
11651165
assert dataset.to_dict() == expected_output
11661166

11671167
def test_default_no_split(self):
@@ -1189,7 +1189,7 @@ def test_with_empty_sequences(self):
11891189
"input_ids": [[3, 4, 5, 6], [1, 2]],
11901190
"seq_lengths": [[3, 1], [2]],
11911191
}
1192-
dataset = pack_dataset(dataset, seq_length, strategy="bfd-split")
1192+
dataset = pack_dataset(dataset, seq_length, strategy="bfd_split")
11931193
assert dataset.to_dict() == expected_output
11941194

11951195

trl/data_utils.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -807,7 +807,7 @@ def pack_dataset(
807807
- `"bfd"` (Best Fit Decreasing): Preserves sequence boundaries and truncates sequences that exceed
808808
`seq_length`, discarding overflow tokens. Ideal for SFT and conversational datasets where maintaining
809809
conversation structure is important.
810-
- `"bfd-split"`: Similar to `"bfd"` but splits overflow sequences for packing into other examples. Prevents
810+
- `"bfd_split"`: Similar to `"bfd"` but splits overflow sequences for packing into other examples. Prevents
811811
token loss for pre-training or long documents, but may break conversation structure in SFT datasets.
812812
- `"wrapped"`: Faster but more aggressive. Ignores sequence boundaries and will cut sequences in the middle
813813
to completely fill each packed sequence with data.
@@ -835,8 +835,8 @@ def pack_dataset(
835835
'attention_mask': [[1, 1, 1, 0], [1, 1, 0, 1], [1, 0]],
836836
'seq_lengths': [[4], [3, 1], [2]]}
837837
838-
>>> # "bfd-split" strategy: preserves all tokens
839-
>>> packed_dataset = pack_dataset(dataset, seq_length=4, strategy="bfd-split")
838+
>>> # "bfd_split" strategy: preserves all tokens
839+
>>> packed_dataset = pack_dataset(dataset, seq_length=4, strategy="bfd_split")
840840
>>> packed_dataset[:]
841841
{'input_ids': [[1, 2, 3, 4], [8, 9, 10, 5], [6, 7, 11]],
842842
'attention_mask': [[1, 1, 1, 0], [1, 1, 0, 0], [1, 0, 1]],
@@ -846,7 +846,7 @@ def pack_dataset(
846846
if map_kwargs is None:
847847
map_kwargs = {}
848848

849-
valid_strategies = ("bfd", "bfd-split", "wrapped")
849+
valid_strategies = ("bfd", "bfd_split", "wrapped")
850850
if strategy not in valid_strategies:
851851
raise ValueError(f"Invalid packing strategy '{strategy}', must be one of {valid_strategies}.")
852852
format = _get_dataset_format(dataset)
@@ -858,7 +858,7 @@ def pack_dataset(
858858
fn_kwargs={"seq_length": seq_length, "on_seq_length_overflow": "truncate"},
859859
**map_kwargs,
860860
)
861-
elif strategy == "bfd-split":
861+
elif strategy == "bfd_split":
862862
dataset = dataset.map(
863863
_pack_bfd,
864864
batched=True,
@@ -870,7 +870,7 @@ def pack_dataset(
870870
else:
871871
raise ValueError(f"Invalid packing strategy: '{strategy}', must be one of {valid_strategies}.")
872872

873-
if strategy in {"bfd", "bfd-split"} and "columns" in format:
873+
if strategy in {"bfd", "bfd_split"} and "columns" in format:
874874
format["columns"] = format["columns"] + ["seq_lengths"]
875875

876876
dataset = dataset.with_format(**format)

trl/trainer/sft_config.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ class SFTConfig(_BaseConfig):
7272
Whether to group multiple sequences into fixed-length blocks to improve computational efficiency and reduce
7373
padding. Uses `max_length` to define sequence length.
7474
packing_strategy (`str`, *optional*, defaults to `"bfd"`):
75-
Strategy for packing sequences. Can be `"bfd"` (best-fit decreasing, truncates overflow), `"bfd-split"`
75+
Strategy for packing sequences. Can be `"bfd"` (best-fit decreasing, truncates overflow), `"bfd_split"`
7676
(best-fit decreasing, splits overflow sequences), or `"wrapped"` (aggressive, cuts mid-sequence).
7777
padding_free (`bool`, *optional*, defaults to `False`):
7878
Whether to perform forward passes without padding by flattening all sequences in the batch into a single
@@ -193,9 +193,9 @@ class SFTConfig(_BaseConfig):
193193
default="bfd",
194194
metadata={
195195
"help": "Strategy for packing sequences. Can be `'bfd'` (best-fit decreasing, truncates overflow), "
196-
"`'bfd-split'` (best-fit decreasing, splits overflow sequences), or `'wrapped'` (aggressive, cuts "
196+
"`'bfd_split'` (best-fit decreasing, splits overflow sequences), or `'wrapped'` (aggressive, cuts "
197197
"mid-sequence).",
198-
"choices": ["bfd", "bfd-split", "wrapped"],
198+
"choices": ["bfd", "bfd_split", "wrapped"],
199199
},
200200
)
201201
padding_free: bool = field(
@@ -259,7 +259,7 @@ def __post_init__(self):
259259

260260
if self.packing_strategy == "bfd-requeue":
261261
warnings.warn(
262-
"The `bfd-requeue` packing strategy has been renamed to `bfd-split`. Please update your configuration accordingly. "
262+
"The `bfd-requeue` packing strategy has been renamed to `bfd_split`. Please update your configuration accordingly. "
263263
"The `bfd-requeue` strategy is deprecated and will be removed in a future version.",
264264
FutureWarning,
265265
)

trl/trainer/sft_trainer.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -795,7 +795,7 @@ def __init__(
795795
# Data collator
796796
# BFD packing requires padding-free mode; otherwise, the collator outputs padded attention masks, causing
797797
# FlashAttention to ignore position_ids and recompute them incorrectly from the padded attention mask.
798-
self.padding_free = args.padding_free or (args.packing and args.packing_strategy in {"bfd", "bfd-split"})
798+
self.padding_free = args.padding_free or (args.packing and args.packing_strategy in {"bfd", "bfd_split"})
799799
use_flash_attention = model.config._attn_implementation in FLASH_ATTENTION_VARIANTS
800800
if self.padding_free:
801801
if data_collator is not None:
@@ -864,7 +864,7 @@ def __init__(
864864
dataset_text_field=args.dataset_text_field,
865865
)
866866

867-
if args.packing and args.packing_strategy in {"bfd", "bfd-split"} and not use_flash_attention:
867+
if args.packing and args.packing_strategy in {"bfd", "bfd_split"} and not use_flash_attention:
868868
logger.warning(
869869
"You are using packing, but the attention implementation is not set to a supported flash attention "
870870
"variant. Packing gathers multiple samples into a single sequence, and only the following "

0 commit comments

Comments
 (0)