Draft
Changes from all commits
Commits
180 commits
31c40e2
cleanup configs
mayank31398 Sep 28, 2025
6191d01
cleanup configs
mayank31398 Sep 28, 2025
1b153ea
cleanup TP test
mayank31398 Sep 28, 2025
94f4020
add SWA
mayank31398 Sep 28, 2025
bd46de7
add SWA
mayank31398 Sep 28, 2025
615e105
dim in FA
mayank31398 Sep 28, 2025
8d9f7d4
dim in FA
mayank31398 Sep 28, 2025
6b4d1dc
drop SBA
mayank31398 Sep 28, 2025
3e1aef2
drop SBA
mayank31398 Sep 28, 2025
bd123a8
drop SBA
mayank31398 Sep 28, 2025
aec861b
drop SBA
mayank31398 Sep 28, 2025
c45e89f
drop SBA
mayank31398 Sep 28, 2025
97284d2
drop SBA
mayank31398 Sep 28, 2025
3a94928
drop SBA
mayank31398 Sep 28, 2025
0b0af13
drop SBA
mayank31398 Sep 28, 2025
fb01683
drop SBA
mayank31398 Sep 28, 2025
a061be4
add packed tensor
mayank31398 Sep 28, 2025
a1c8d55
add packed tensor
mayank31398 Sep 28, 2025
137618c
add packed tensor
mayank31398 Sep 28, 2025
18c9a93
add packed tensor
mayank31398 Sep 28, 2025
e77686e
add packed tensor
mayank31398 Sep 28, 2025
5232d26
add packed tensor
mayank31398 Sep 28, 2025
0ba23bf
add packed tensor
mayank31398 Sep 28, 2025
21f7521
add packed tensor
mayank31398 Sep 29, 2025
0405210
add packed tensor
mayank31398 Sep 29, 2025
156db5d
add packed tensor
mayank31398 Sep 29, 2025
bb6e1b9
add packed tensor
mayank31398 Sep 29, 2025
6e688e5
add packed tensor
mayank31398 Sep 29, 2025
86e8bed
Merge branch 'main' into sl
mayank31398 Sep 29, 2025
84cc4e1
drop SBA temporarily
mayank31398 Sep 29, 2025
796b1da
drop SBA temporarily
mayank31398 Sep 29, 2025
766bcde
cleanup model_wrappers
mayank31398 Sep 29, 2025
3dd0bd4
cleanup model_wrappers
mayank31398 Sep 29, 2025
1e19a5c
cleanup model_wrappers
mayank31398 Sep 29, 2025
fa63452
cleanup model_wrappers
mayank31398 Sep 29, 2025
c80e873
cleanup model_wrappers
mayank31398 Sep 29, 2025
3a40c9b
cleanup model_wrappers
mayank31398 Sep 29, 2025
82a9c88
cleanup model_wrappers
mayank31398 Sep 29, 2025
3df92f5
cleanup model_wrappers
mayank31398 Sep 29, 2025
8085a17
cleanup model_wrappers
mayank31398 Sep 29, 2025
06b35fe
cleanup model_wrappers
mayank31398 Sep 29, 2025
b04c4ed
cleanup model_wrappers
mayank31398 Sep 29, 2025
2ba48cd
cleanup model_wrappers
mayank31398 Sep 29, 2025
f6beead
cleanup model_wrappers
mayank31398 Sep 29, 2025
695c85a
cleanup model_wrappers
mayank31398 Sep 29, 2025
5f3d790
cleanup model_wrappers
mayank31398 Sep 29, 2025
f526bb1
cleanup model_wrappers
mayank31398 Sep 29, 2025
07aa99e
cleanup model_wrappers
mayank31398 Sep 29, 2025
bfc8eb1
cleanup model_wrappers
mayank31398 Sep 29, 2025
f98a168
cleanup model_wrappers
mayank31398 Sep 29, 2025
14e54b6
cleanup model_wrappers
mayank31398 Sep 29, 2025
88b0595
cleanup model_wrappers
mayank31398 Sep 30, 2025
b4a2b5e
cleanup model_wrappers
mayank31398 Sep 30, 2025
0f8b0aa
cleanup model_wrappers
mayank31398 Sep 30, 2025
7239f5b
cleanup model_wrappers
mayank31398 Sep 30, 2025
bf191e2
cleanup model_wrappers
mayank31398 Sep 30, 2025
42c3efb
cleanup model_wrappers
mayank31398 Sep 30, 2025
f04ae6b
cleanup model_wrappers
mayank31398 Sep 30, 2025
7f7f0a2
cleanup model_wrappers
mayank31398 Sep 30, 2025
4214755
cleanup model_wrappers
mayank31398 Sep 30, 2025
3e97087
cleanup model_wrappers
mayank31398 Sep 30, 2025
83e4063
cleanup model_wrappers
mayank31398 Sep 30, 2025
b1dff71
cleanup model_wrappers
mayank31398 Sep 30, 2025
d6dffd3
cleanup model_wrappers
mayank31398 Sep 30, 2025
f838cea
cleanup model_wrappers
mayank31398 Sep 30, 2025
15c0824
cleanup model_wrappers
mayank31398 Sep 30, 2025
3aed859
cleanup model_wrappers
mayank31398 Sep 30, 2025
012e42e
cleanup model_wrappers
mayank31398 Sep 30, 2025
14b9ff7
cleanup model_wrappers
mayank31398 Sep 30, 2025
8183244
cleanup model_wrappers
mayank31398 Sep 30, 2025
4dfb0c7
cleanup model_wrappers
mayank31398 Sep 30, 2025
8438c1b
cleanup model_wrappers
mayank31398 Sep 30, 2025
60ac022
cleanup model_wrappers
mayank31398 Sep 30, 2025
47ac638
cleanup model_wrappers
mayank31398 Sep 30, 2025
71c8585
cleanup model_wrappers
mayank31398 Sep 30, 2025
37a8142
cleanup model_wrappers
mayank31398 Sep 30, 2025
8041f6c
cleanup model_wrappers
mayank31398 Sep 30, 2025
636987e
cleanup model_wrappers
mayank31398 Sep 30, 2025
424dab1
cleanup model_wrappers
mayank31398 Sep 30, 2025
ab95037
cleanup model_wrappers
mayank31398 Sep 30, 2025
21e6416
cleanup model_wrappers
mayank31398 Sep 30, 2025
b0edda7
cleanup model_wrappers
mayank31398 Sep 30, 2025
6bedf74
cleanup model_wrappers
mayank31398 Sep 30, 2025
2b87962
cleanup model_wrappers
mayank31398 Sep 30, 2025
e75d48f
cleanup model_wrappers
mayank31398 Sep 30, 2025
9b349c6
cleanup model_wrappers
mayank31398 Sep 30, 2025
df27f32
cleanup model_wrappers
mayank31398 Sep 30, 2025
21e0ba9
cleanup model_wrappers
mayank31398 Sep 30, 2025
976a9ba
cleanup model_wrappers
mayank31398 Sep 30, 2025
1d949d5
cleanup model_wrappers
mayank31398 Sep 30, 2025
24ee79a
cleanup model_wrappers
mayank31398 Sep 30, 2025
be7104e
cleanup model_wrappers
mayank31398 Sep 30, 2025
09ea1ef
cleanup model_wrappers
mayank31398 Sep 30, 2025
836900f
cleanup model_wrappers
mayank31398 Sep 30, 2025
6ca483d
cleanup model_wrappers
mayank31398 Sep 30, 2025
4ebee0d
cleanup model_wrappers
mayank31398 Sep 30, 2025
753ade1
cleanup model_wrappers
mayank31398 Sep 30, 2025
e682a14
cleanup model_wrappers
mayank31398 Sep 30, 2025
c5825ba
cleanup model_wrappers
mayank31398 Sep 30, 2025
9af4136
cleanup model_wrappers
mayank31398 Sep 30, 2025
08d3bfe
cleanup model_wrappers
mayank31398 Sep 30, 2025
ee3862d
cleanup model_wrappers
mayank31398 Oct 1, 2025
21a3958
cleanup model_wrappers
mayank31398 Oct 1, 2025
55bb8f3
cleanup model_wrappers
mayank31398 Oct 1, 2025
0980568
cleanup model_wrappers
mayank31398 Oct 1, 2025
a90184f
cleanup model_wrappers
mayank31398 Oct 1, 2025
06fb51a
rsa torch
mayank31398 Oct 1, 2025
91a670d
merge
mayank31398 Oct 8, 2025
ee5bb33
merge
mayank31398 Oct 8, 2025
eb339e6
merge
mayank31398 Oct 8, 2025
1297653
merge
mayank31398 Oct 8, 2025
b18fd58
merge
mayank31398 Oct 8, 2025
6240658
merge
mayank31398 Oct 8, 2025
8c82591
merge
mayank31398 Oct 8, 2025
47aa2c4
merge
mayank31398 Oct 8, 2025
efc4862
merge
mayank31398 Oct 8, 2025
0bd6eb3
merge
mayank31398 Oct 8, 2025
02eb59e
merge
mayank31398 Oct 8, 2025
07fb5fd
merge
mayank31398 Oct 9, 2025
958f081
merge
mayank31398 Oct 9, 2025
38383ac
merge
mayank31398 Oct 9, 2025
c5feb37
merge
mayank31398 Oct 9, 2025
c11141b
merge
mayank31398 Oct 9, 2025
fcf6761
merge
mayank31398 Oct 9, 2025
d4ac5cb
merge
mayank31398 Oct 9, 2025
4a8fcff
merge
mayank31398 Oct 9, 2025
cb63b79
merge
mayank31398 Oct 9, 2025
224ab27
merge
mayank31398 Oct 9, 2025
ff5dc55
merge
mayank31398 Oct 9, 2025
f2c860d
merge
mayank31398 Oct 9, 2025
764c233
merge
mayank31398 Oct 9, 2025
d173775
merge
mayank31398 Oct 9, 2025
369a3b3
better
mayank31398 Oct 9, 2025
935af46
better
mayank31398 Oct 9, 2025
30f890b
better
mayank31398 Oct 9, 2025
9d6b095
better
mayank31398 Oct 9, 2025
f5d7175
better
mayank31398 Oct 9, 2025
2407caf
better
mayank31398 Oct 9, 2025
205d6ab
better
mayank31398 Oct 9, 2025
656767c
better
mayank31398 Oct 9, 2025
6a36238
better
mayank31398 Oct 9, 2025
4630666
better
mayank31398 Oct 9, 2025
96a9f86
better
mayank31398 Oct 9, 2025
5e60c0d
better
mayank31398 Oct 9, 2025
c93232a
better
mayank31398 Oct 9, 2025
12d5c83
better
mayank31398 Oct 9, 2025
ed73a44
cleanup
mayank31398 Oct 13, 2025
44c3a9a
cleanup
mayank31398 Oct 19, 2025
8e0ea46
cleanup
mayank31398 Oct 19, 2025
3855ee7
cleanup
mayank31398 Oct 19, 2025
bfa093c
cleanup
mayank31398 Oct 20, 2025
5b55c84
cleanup
mayank31398 Oct 20, 2025
52efb76
cleanup
mayank31398 Oct 20, 2025
f3df060
cleanup
mayank31398 Oct 20, 2025
5c1e169
cleanup
mayank31398 Oct 20, 2025
4622db9
cleanup
mayank31398 Oct 20, 2025
c99211b
cleanup
mayank31398 Oct 20, 2025
45ed3c5
cleanup
mayank31398 Oct 20, 2025
a65f9de
cleanup
mayank31398 Oct 20, 2025
37de101
cleanup
mayank31398 Oct 20, 2025
d762281
cleanup
mayank31398 Oct 20, 2025
dcc2245
cleanup
mayank31398 Oct 20, 2025
c93267a
cleanup
mayank31398 Oct 20, 2025
a40b459
cleanup
mayank31398 Oct 20, 2025
6a322f8
cleanup
mayank31398 Oct 20, 2025
edebff4
cleanup
mayank31398 Oct 20, 2025
aa4bcdd
cleanup
mayank31398 Oct 20, 2025
d40273a
cleanup
mayank31398 Oct 20, 2025
8cb4529
cleanup
mayank31398 Oct 20, 2025
aced695
cleanup
mayank31398 Oct 20, 2025
6f439e3
cleanup
mayank31398 Oct 20, 2025
0a3db83
cleanup
mayank31398 Oct 20, 2025
8e31633
cleanup
mayank31398 Oct 20, 2025
d9750fb
cleanup
mayank31398 Oct 20, 2025
2a2ee67
cleanup
mayank31398 Oct 20, 2025
6fcd525
cleanup
mayank31398 Oct 20, 2025
a8b9072
cleanup
mayank31398 Oct 20, 2025
5c84652
cleanup
mayank31398 Oct 20, 2025
1e3b055
cleanup
mayank31398 Oct 20, 2025
66f2883
merge
mayank31398 Nov 11, 2025
1 change: 1 addition & 0 deletions .gitignore
@@ -6,3 +6,4 @@ __pycache__
/appwrapper.yaml
*.egg-info/
build/
*.log
2 changes: 1 addition & 1 deletion README.md
@@ -86,7 +86,7 @@ labels = [[-100, -100, -100, 4, 5, 0], [-100, -100, 8, 0]]

# this will throw a warning saying that the model is of gpt_bigcode class
# ignore the warning
model = GPTBaseForCausalLM.from_pretrained(<model_path>, use_padding_free_transformer=True).cuda()
model = GPTBaseForCausalLM.from_pretrained(<model_path>).cuda()

with enable_kernels([Kernel.flash_attention_2]):
loss = model(input_ids=input_ids, labels=labels).loss
1 change: 0 additions & 1 deletion configs/distillation-example.yml
@@ -23,7 +23,6 @@ model_args:
model_class: AutoModelForCausalLM
model_name: ibm/PowerLM-3b
efficient_initialization: false
use_padding_free_transformer: false

teacher_args:
model_class: AutoModelForCausalLM
1 change: 0 additions & 1 deletion configs/finetuning-example.yml
@@ -25,7 +25,6 @@ model_args:
# padding free transformer needs a gpt_base model.
# To convert granite models to this class and convert back after training,
# take a look at the readme of this repo
use_padding_free_transformer: false

random_args:
# for replication of experiment (however, flash attention is non-deterministic so replication generally won't work)
1 change: 0 additions & 1 deletion configs/pretraining-examples/dense/pretrain-1.yml
@@ -120,7 +120,6 @@ model_args:
intermediate_size: 3072
add_bias: true
position_embedding_type: learned_absolute
use_padding_free_transformer: true

tuning_args:
tuning_method: pretraining
1 change: 0 additions & 1 deletion configs/pretraining-examples/dense/pretrain-2.yml
@@ -125,7 +125,6 @@ model_args:
intermediate_size: 3072
add_bias: true
position_embedding_type: learned_absolute
use_padding_free_transformer: true

tuning_args:
tuning_method: pretraining
1 change: 0 additions & 1 deletion configs/pretraining-examples/dense/pretrain-3.yml
@@ -138,7 +138,6 @@ model_args:
intermediate_size: 3072
add_bias: true
position_embedding_type: learned_absolute
use_padding_free_transformer: true

tuning_args:
tuning_method: pretraining
1 change: 0 additions & 1 deletion configs/pretraining-examples/dense/pretrain-tpu.yml
@@ -139,7 +139,6 @@ model_args:
intermediate_size: 3072
add_bias: true
position_embedding_type: learned_absolute
# use_padding_free_transformer: true

tuning_args:
tuning_method: pretraining
1 change: 0 additions & 1 deletion configs/research/cross-layer-attention/base.yml
@@ -249,7 +249,6 @@ model_args:
activation_function: swiglu
intermediate_size: 8192
efficient_initialization: false
use_padding_free_transformer: true

tuning_args:
tuning_method: pretraining
1 change: 0 additions & 1 deletion configs/research/cross-layer-attention/cla.yml
@@ -282,7 +282,6 @@ model_args:
activation_function: swiglu
intermediate_size: 8192
efficient_initialization: false
use_padding_free_transformer: true

tuning_args:
tuning_method: pretraining
1 change: 0 additions & 1 deletion configs/research/ladder-residual/1b-base.yml
@@ -278,7 +278,6 @@ model_args:
activation_function: swiglu
intermediate_size: 4096
efficient_initialization: false
use_padding_free_transformer: false

tuning_args:
tuning_method: pretraining
1 change: 0 additions & 1 deletion configs/research/ladder-residual/1b-ladder.yml
@@ -278,7 +278,6 @@ model_args:
activation_function: swiglu
intermediate_size: 4096
efficient_initialization: false
use_padding_free_transformer: false

tuning_args:
tuning_method: pretraining
1 change: 0 additions & 1 deletion configs/research/ladder-residual/1b-parallel.yml
@@ -278,7 +278,6 @@ model_args:
activation_function: swiglu
intermediate_size: 4096
efficient_initialization: false
use_padding_free_transformer: false

tuning_args:
tuning_method: pretraining
1 change: 0 additions & 1 deletion configs/research/ladder-residual/3b-base.yml
@@ -238,7 +238,6 @@ model_args:
- mlp_type: MLP
activation_function: swiglu
efficient_initialization: false
use_padding_free_transformer: false

tuning_args:
tuning_method: pretraining
1 change: 0 additions & 1 deletion configs/research/ladder-residual/3b-ladder.yml
@@ -238,7 +238,6 @@ model_args:
- mlp_type: MLP
activation_function: swiglu
efficient_initialization: false
use_padding_free_transformer: false

tuning_args:
tuning_method: pretraining
1 change: 0 additions & 1 deletion configs/research/ladder-residual/3b-parallel.yml
@@ -238,7 +238,6 @@ model_args:
- mlp_type: MLP
activation_function: swiglu
efficient_initialization: false
use_padding_free_transformer: false

tuning_args:
tuning_method: pretraining
2 changes: 0 additions & 2 deletions lm_engine/arguments.py
@@ -48,8 +48,6 @@ class ModelArgs(BaseArgs):
model_class: str = None
# trust remote code for models that are not directly supported by HuggingFace yet
trust_remote_code: bool = False
# whether to use padding free transformer: https://huggingface.co/blog/mayank-mishra/padding-free-transformer
use_padding_free_transformer: bool = False
# use lower memory to initialize model
efficient_initialization: bool = False
# whether to reset attention masks for pretraining
1 change: 0 additions & 1 deletion lm_engine/data/__init__.py
@@ -134,7 +134,6 @@ def get_finetuning_dataloader(
use_output=use_output,
loss_mask=args.training_parameters.loss_mask,
eos_token_id=tokenizer.eos_token_id,
use_padding_free_transformer=args.model_args.use_padding_free_transformer,
pad_to_multiple_of=ProcessGroupManager.get_tensor_parallel_world_size(),
),
)
138 changes: 79 additions & 59 deletions lm_engine/data/utils.py
@@ -8,15 +8,59 @@
import torch

from ..enums import LossMask
from ..hf_models import convert_padding_free_lists_to_tensors


def _check_list_type(list_of_list: list[list[int | float]] | None, error_message: str) -> None:
if list_of_list is None:
return

assert isinstance(list_of_list, list), error_message
assert isinstance(list_of_list[0], list), error_message


def _flatten_and_convert_to_tensors(x: list[list[int]], device: torch.device) -> torch.Tensor:
y = []
for sequence in x:
y.extend(sequence)

return torch.tensor(y, device=device)


def _convert_padding_free_lists_to_tensors(
input_ids: list[list[int]] | None = None,
position_ids: list[list[int]] | None = None,
labels: list[list[int]] | None = None,
device: torch.device = None,
) -> tuple[torch.Tensor | int]:

# check input types are correct
error_message = "{variable} should be of type List[List[{dtype}]]"
_check_list_type(input_ids, error_message.format(variable="input_ids", dtype="int"))
_check_list_type(position_ids, error_message.format(variable="position_ids", dtype="int"))
_check_list_type(labels, error_message.format(variable="labels", dtype="int"))

# prepare inputs for the model
seqlens = torch.tensor([0] + [len(x) for x in input_ids], device=device)
cu_seqlens = seqlens.cumsum(dim=-1).to(torch.int32)
max_seqlen = seqlens.max().item()

if position_ids is None:
position_ids = [list(range(len(x))) for x in input_ids]
position_ids = _flatten_and_convert_to_tensors(position_ids, device)

input_ids = _flatten_and_convert_to_tensors(input_ids, device)

if labels is not None:
labels = _flatten_and_convert_to_tensors(labels, device)

return input_ids, position_ids, labels, cu_seqlens, max_seqlen
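# Worked example (illustrative only, not part of this diff): packing two
# sequences with the helper above.
#   ids, pos, lbl, cu, max_len = _convert_padding_free_lists_to_tensors(
#       input_ids=[[1, 2, 3], [4, 5]], device=torch.device("cpu")
#   )
#   ids     -> tensor([1, 2, 3, 4, 5])          # flattened, no padding
#   pos     -> tensor([0, 1, 2, 0, 1])          # restarts at each sequence boundary
#   lbl     -> None                             # no labels were passed
#   cu      -> tensor([0, 3, 5], dtype=int32)   # cumulative sequence lengths
#   max_len -> 3                                # longest sequence in the pack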


def collate_fn(
batch: list[dict],
use_output: bool,
loss_mask: LossMask,
eos_token_id: int,
use_padding_free_transformer: bool,
labels_mask_value: int = -100,
pad_to_multiple_of: int = 1,
device: torch.device = None,
@@ -38,64 +82,40 @@ def collate_fn(

device = torch.cuda.current_device() if device is None else device

if use_padding_free_transformer:
input_ids = inputs
attention_mask = None

if loss_mask == LossMask.output_only:
labels = [
[labels_mask_value] * (len(array_in) - len(array_out)) + array_out
for array_in, array_out in zip(inputs, outputs)
]
elif loss_mask == LossMask.no_mask:
labels = inputs
else:
raise ValueError(f"unexpected loss_mask ({loss_mask})")

tokens_to_add = 0
if pad_to_multiple_of > 1:
total_tokens = sum([len(array) for array in input_ids])
tokens_to_add = (math.ceil(total_tokens / pad_to_multiple_of) * pad_to_multiple_of) - total_tokens

# we pad the last example in the batch on the right
# NOTE this can be done since the attention is causal
input_ids[-1].extend([eos_token_id] * tokens_to_add)
labels[-1].extend([labels_mask_value] * tokens_to_add)

input_ids, position_ids, _, labels, cu_seqlens, max_seqlen = convert_padding_free_lists_to_tensors(
input_ids=input_ids, labels=labels, device=device
)

result = {
"input_ids": input_ids,
"position_ids": position_ids,
"cu_seqlens": cu_seqlens,
"max_seqlen": max_seqlen,
}
if labels is not None:
result["labels"] = labels
input_ids = inputs

if loss_mask == LossMask.output_only:
labels = [
[labels_mask_value] * (len(array_in) - len(array_out)) + array_out
for array_in, array_out in zip(inputs, outputs)
]
elif loss_mask == LossMask.no_mask:
labels = inputs
else:
max_length = max(list(map(len, inputs)))
if pad_to_multiple_of > 1:
max_length = math.ceil(max_length / pad_to_multiple_of) * pad_to_multiple_of

input_ids = [[eos_token_id] * (max_length - len(array)) + array for array in inputs]
attention_mask = [[0] * (max_length - len(array)) + [1] * len(array) for array in inputs]

if outputs is not None:
if loss_mask == LossMask.output_only:
labels = [[labels_mask_value] * (max_length - len(array)) + array for array in outputs]
elif loss_mask == LossMask.no_mask:
labels = inputs
else:
raise ValueError(f"unexpected loss_mask ({loss_mask})")

result = {
"input_ids": torch.tensor(input_ids, device=device),
"attention_mask": torch.tensor(attention_mask, device=device),
}
if labels is not None:
result["labels"] = torch.tensor(labels, device=device)
raise ValueError(f"unexpected loss_mask ({loss_mask})")

tokens_to_add = 0
if pad_to_multiple_of > 1:
total_tokens = sum([len(array) for array in input_ids])
tokens_to_add = (math.ceil(total_tokens / pad_to_multiple_of) * pad_to_multiple_of) - total_tokens

# we pad the last example in the batch on the right
# NOTE this can be done since the attention is causal
input_ids[-1].extend([eos_token_id] * tokens_to_add)
labels[-1].extend([labels_mask_value] * tokens_to_add)
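    # Illustrative: with 5 packed tokens and pad_to_multiple_of=4,
    # tokens_to_add = ceil(5 / 4) * 4 - 5 = 3, so 3 extra eos tokens are appended
    # to the last example with -100 labels so they do not contribute to the loss.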

input_ids, position_ids, labels, cu_seqlens, max_seqlen = _convert_padding_free_lists_to_tensors(
input_ids=input_ids, labels=labels, device=device
)

result = {
"input_ids": input_ids,
"position_ids": position_ids,
"cu_seqlens": cu_seqlens,
"max_seqlen": max_seqlen,
}
if labels is not None:
result["labels"] = labels

return result
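# Resulting batch (illustrative): a single packed dict along the lines of
#   {"input_ids": tensor([...]), "position_ids": tensor([...]),
#    "cu_seqlens": tensor([0, 3, 5], dtype=int32), "max_seqlen": 3,
#    "labels": tensor([...])}
# with no attention_mask; sequence boundaries travel in cu_seqlens instead.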

3 changes: 2 additions & 1 deletion lm_engine/hf_models/__init__.py
@@ -2,8 +2,10 @@
# Copyright (c) 2025, Mayank Mishra
# **************************************************

from .cache import disable_generation_cache
from .config import CommonConfig
from .loss import get_autoregressive_language_modeling_loss, is_aux_loss_zero
from .mask import AttentionMaskInfo
from .mixins import CausalLMOutputWithPast, PipelineParallelInput, PipelineParallelOutput
from .model_conversion import export_to_huggingface, import_from_huggingface
from .models import (
@@ -30,7 +32,6 @@
)
from .register_hf import get_model_parallel_class, is_custom_model, register_model_classes
from .unshard import fix_unsharded_state_dict, unshard_tensor_parallel_state_dicts
from .utils import convert_padding_free_lists_to_tensors, disable_generation_cache


register_model_classes()
21 changes: 20 additions & 1 deletion lm_engine/hf_models/cache/__init__.py
@@ -4,7 +4,7 @@

from __future__ import annotations

from typing import Iterable
from typing import Any, Iterable

import torch

@@ -53,3 +53,22 @@ def get_seq_length(self, layer_idx: int = 0) -> int:
def reorder_cache(self, beam_idx: torch.Tensor) -> None:
for cache in self.cache:
cache.reorder_cache(beam_idx)


_IS_GENERATION_CACHE_ENABLED: bool = True


class disable_generation_cache:
def __enter__(self) -> Any:
global _IS_GENERATION_CACHE_ENABLED
self.original = _IS_GENERATION_CACHE_ENABLED

_IS_GENERATION_CACHE_ENABLED = False

def __exit__(self, exception_type, exception_value, exception_traceback) -> Any:
global _IS_GENERATION_CACHE_ENABLED
_IS_GENERATION_CACHE_ENABLED = self.original


def is_generation_cache_enabled() -> bool:
return _IS_GENERATION_CACHE_ENABLED
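A minimal usage sketch of the new context manager (illustrative, not part of this diff); the import paths assume the package layout shown above:

    from lm_engine.hf_models import disable_generation_cache
    from lm_engine.hf_models.cache import is_generation_cache_enabled

    assert is_generation_cache_enabled()
    with disable_generation_cache():
        # the module-level flag is flipped off inside the block
        assert not is_generation_cache_enabled()
    assert is_generation_cache_enabled()  # restored on exit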
12 changes: 5 additions & 7 deletions lm_engine/hf_models/loss.py
@@ -14,6 +14,7 @@
from ..enums import Kernel
from ..kernels import is_kernel_allowed
from ..utils import ProcessGroupManager, is_xma_available
from .mask import AttentionMaskInfo


if is_xma_available():
@@ -23,10 +24,9 @@
def get_autoregressive_language_modeling_loss(
lm_logits: torch.Tensor,
labels: torch.Tensor,
attention_mask_info: AttentionMaskInfo,
hidden_states: torch.Tensor | None = None,
vocab_weight: torch.Tensor | None = None,
cu_seqlens: torch.Tensor | None = None,
use_padding_free_transformer: bool = False,
reduction: str = "mean",
shift_logits_and_labels: bool = True,
tensor_parallel_enabled: bool = False,
@@ -40,15 +40,13 @@ def get_autoregressive_language_modeling_loss(

labels = labels[..., 1:]

if use_padding_free_transformer:
if shift_logits_and_labels:
assert cu_seqlens is not None
if shift_logits_and_labels:
cu_seqlens = attention_mask_info.get_cu_seqlens()

if cu_seqlens is not None:
# this is needed so that the last token of current example doesn't predict first token of next example
drop_loss_positions = cu_seqlens[1:-1] - 1
labels[drop_loss_positions] = -100
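    # Illustrative: for packed cu_seqlens = [0, 3, 5] (two sequences),
    # drop_loss_positions = [2], masking the position where the last token of
    # the first sequence would otherwise be trained to predict the first token
    # of the second sequence.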
else:
assert cu_seqlens is None

if is_kernel_allowed(Kernel.fused_linear_cross_entropy):
assert lm_logits is None