Commit 67cc30b

compile attention mask in first step
Signed-off-by: h-guo18 <[email protected]>
Parent: 254e546

File tree

4 files changed: +10 additions, -129 deletions

examples/speculative_decoding/main.py
modelopt/torch/speculative/eagle/eagle_model.py
modelopt/torch/speculative/plugins/transformers.py
tests/unit/torch/speculative/plugins/test_hf_speculative.py

examples/speculative_decoding/main.py

Lines changed: 0 additions & 2 deletions
@@ -203,8 +203,6 @@ def train():
             "draft_vocab_size": custom_config["draft_vocab_size"]
             if eagle_args.eagle_config and "draft_vocab_size" in custom_config
             else model.config.vocab_size,
-            # pass in the seq length for flex attention mask compilation
-            "training_seq_len": training_args.training_seq_len,
         }
     )
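For context, a hedged sketch of the call-site effect. The update pattern and field names below follow the removed unit test further down; the exact fields in main.py may differ, so treat this as illustrative only. The EAGLE architecture config no longer has to carry the training sequence length, because the flex attention masks are now compiled from the runtime sequence length on the first forward call.

config["eagle_architecture_config"].update(
    {
        # model-derived fields only; shown with the simple fallback branch
        "draft_vocab_size": model.config.vocab_size,
        "hidden_size": model.config.hidden_size,
        # "training_seq_len" is intentionally no longer passed here
    }
)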

modelopt/torch/speculative/eagle/eagle_model.py

Lines changed: 0 additions & 1 deletion
@@ -45,7 +45,6 @@ def modify(
         self.eagle_report_acc = eagle_report_acc
         self.eagle_reuse_base_decoder = eagle_reuse_base_decoder
         self.eagle_loss_decay_factor = eagle_loss_decay_factor
-
         if eagle_architecture_config.get("parallel_draft_step", 1) > 1:
             for i in range(eagle_architecture_config.get("parallel_draft_step") - 1):
                 self.register_buffer(f"mask_token_{i}", torch.tensor(-1))

modelopt/torch/speculative/plugins/transformers.py

Lines changed: 10 additions & 6 deletions
@@ -452,11 +452,15 @@ def modify(
             layer.register_forward_hook(self._collect_aux_hidden_states_forward_hook)

         self.num_ttt_steps = 3  # NOTE: (hg) hardcoded for now. Might add to config later.
-        # compile and cach flex attention masks
-        self.cached_attn_blk_masks = [
-            self._compile_ttt_block_mask(self.eagle_config.training_seq_len, i)
-            for i in range(self.num_ttt_steps)
-        ]
+        self._cached_attn_blk_masks = []
+
+    def _get_ttt_attention_mask(self, seq_length, ttt_step):
+        # compile and cache the flex attention mask on the first call
+        if ttt_step >= len(self._cached_attn_blk_masks):
+            self._cached_attn_blk_masks.append(self._compile_ttt_block_mask(seq_length, ttt_step))
+
+        # return the cached flex attention mask
+        return self._cached_attn_blk_masks[ttt_step]

     def _prepare_decoder_attention_mask(
         self, attention_mask, input_shape, inputs_embeds, past_key_values_length
@@ -773,7 +777,7 @@ def forward(
                 ),
                 dim=1,
             )
-            attention_mask = self.cached_attn_blk_masks[ttt_step]
+            attention_mask = self._get_ttt_attention_mask(seq_length, ttt_step)
             _, eagle_prenorm_h, eagle_logits, eagle_cache = self._eagle_forward(
                 eagle_input_hidden_states,
                 inputs_embeds,
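Below is a minimal, self-contained sketch of the lazy compile-and-cache pattern this diff introduces, written against PyTorch's torch.nn.attention.flex_attention.create_block_mask. The class name TTTMaskCache, the plain causal mask_mod, and the (ttt_step + 2) * seq_length total length are illustrative assumptions, not the plugin's actual _compile_ttt_block_mask logic; only the cache-on-first-use behavior mirrors the change above.

from torch.nn.attention.flex_attention import create_block_mask


class TTTMaskCache:
    """Compile one flex attention block mask per TTT step, on first use."""

    def __init__(self):
        self._cached_attn_blk_masks = []

    def _compile_ttt_block_mask(self, seq_length, ttt_step):
        # Hypothetical stand-in: a plain causal mask over a concatenated
        # (ttt_step + 2) * seq_length token sequence. The plugin's real
        # _compile_ttt_block_mask builds the TTT-specific mask instead.
        total_len = (ttt_step + 2) * seq_length

        def causal(b, h, q_idx, kv_idx):
            # keep key positions at or before the query position
            return q_idx >= kv_idx

        # create_block_mask precomputes the sparse block layout for this shape;
        # its device argument defaults to "cuda", pass device=... for other placements
        return create_block_mask(causal, B=None, H=None, Q_LEN=total_len, KV_LEN=total_len)

    def get(self, seq_length, ttt_step):
        # Lazy compile-and-cache: step i is compiled the first time it is
        # requested, so the sequence length need not be known at init time.
        # (Assumes ttt_step is visited in increasing order, as in the TTT loop.)
        if ttt_step >= len(self._cached_attn_blk_masks):
            self._cached_attn_blk_masks.append(self._compile_ttt_block_mask(seq_length, ttt_step))
        return self._cached_attn_blk_masks[ttt_step]

Because the mask is requested with the runtime seq_length inside forward(), the training_seq_len plumbing removed from examples/speculative_decoding/main.py is no longer needed, and no mask is compiled until the first training step.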

tests/unit/torch/speculative/plugins/test_hf_speculative.py

Lines changed: 0 additions & 120 deletions
@@ -17,7 +17,6 @@
 from copy import deepcopy

 import pytest
-import torch
 from _test_utils.torch_model.transformers_models import (
     create_tiny_llama_dir,
     get_tiny_llama,
@@ -69,122 +68,3 @@ def test_eagle_model_convert_save_and_restore(tmp_path, eagle_config):
     model_test = AutoModelForCausalLM.from_pretrained(tmp_path / "modelopt_model")
     assert isinstance(model_test, mtsp.plugins.HFEagleModel)
     tf_modelopt_state_and_output_tester(model_ref, model_test)
-
-
-# fmt: off
-@pytest.mark.parametrize("dtype", [torch.bfloat16])
-def test_eagle_model_prepare_eagle_inputs(dtype):
-    dummy_model = get_tiny_llama(num_hidden_layers=4)
-
-    config = EAGLE3_DEFAULT_CFG["config"]
-    config["eagle_architecture_config"].update({
-        "draft_vocab_size": dummy_model.config.vocab_size,
-        "hidden_size": dummy_model.config.hidden_size,
-    })
-    mtsp.convert(dummy_model, mode=[("eagle", config)])
-
-    eagle_input_ids_0 = torch.tensor([[10, 20, 30, 40]], dtype=torch.long)
-    position_ids_0 = torch.tensor([[0, 1, 2, 3]], dtype=torch.long)
-
-    #This is concatenated from 3 intermediate base model layers
-    cat_aux_hidden_states = torch.randn(1, 4, 32, dtype=dtype)
-
-    #This is eagle output from previous eagle forward pass
-    dummy_eagle_output_hidden_states = torch.randn(1, 4, 32, dtype=dtype)
-
-    #This is the causal mask for the 0th eagle step
-    m = torch.finfo(dtype).min
-    attention_mask_0 = torch.tensor([[0, m, m, m],  # input tok 10-> predicting token 20
-                                     [0, 0, m, m],  # 20 -> 30
-                                     [0, 0, 0, m],  # 30 -> 40
-                                     [0, 0, 0, 0]]  # 40 -> tok after 40
-
-                                    , dtype=dtype).view(1, 1, 4, 4)
-
-    # 2nd eagle step
-    eagle_input_h_1, eagle_input_ids_1, attention_mask_1, position_ids_1 = dummy_model._concat_eagle_inputs(
-        eagle_input_ids_0,
-        cat_aux_hidden_states,
-        attention_mask_0,
-        position_ids_0,
-        dummy_eagle_output_hidden_states,
-    )
-
-    assert eagle_input_ids_1.equal(torch.tensor([[10, 20, 30, 40, 10, 20, 30, 40]], dtype=torch.long))
-    assert position_ids_1.equal(torch.tensor([[0, 1, 2, 3, 0, 1, 2, 3]], dtype=torch.long))
-
-    assert attention_mask_1.equal(torch.tensor([[0, m, m, m, m, m, m, m],  # (x) output discarded
-                                                [0, 0, m, m, m, m, m, m],  # (x)
-                                                [0, 0, 0, m, m, m, m, m],  # (x)
-                                                [0, 0, 0, 0, m, m, m, m],  # (x)
-
-                                                [m, m, m, m, m, m, m, m],  # (x) input tok 10-> predicting token 20
-                                                [0, m, m, m, m, 0, m, m],  # 20 -> 30
-                                                [0, 0, m, m, m, m, 0, m],  # 30 -> 40
-                                                [0, 0, 0, 0, m, m, m, m],  # (x) 40 -> tok after 40
-                                                ], dtype=dtype).view(1, 1, 8, 8))
-
-    # 3rd eagle step
-    eagle_input_hidden_states_2, eagle_input_ids_2, attention_mask_2, position_ids_2 = dummy_model._concat_eagle_inputs(
-        eagle_input_ids_0,
-        cat_aux_hidden_states,
-        attention_mask_0,
-        position_ids_0,
-        torch.cat([dummy_eagle_output_hidden_states, dummy_eagle_output_hidden_states], dim=1),
-    )
-    assert eagle_input_ids_2.equal(torch.tensor([[10, 20, 30, 40, 10, 20, 30, 40, 10, 20, 30, 40]], dtype=torch.long))
-    assert position_ids_2.equal(torch.tensor([[0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]], dtype=torch.long))
-
-    assert attention_mask_2.equal(torch.tensor([[0, m, m, m, m, m, m, m, m, m, m, m],  # (x)
-                                                [0, 0, m, m, m, m, m, m, m, m, m, m],  # (x)
-                                                [0, 0, 0, m, m, m, m, m, m, m, m, m],  # (x)
-                                                [0, 0, 0, 0, m, m, m, m, m, m, m, m],  # (x)
-
-                                                [m, m, m, m, m, m, m, m, m, m, m, m],  # (x)
-                                                [0, m, m, m, m, 0, m, m, m, m, m, m],  # (x)
-                                                [0, 0, m, m, m, m, 0, m, m, m, m, m],  # (x)
-                                                [0, 0, 0, 0, m, m, m, m, m, m, m, m],  # (x)
-
-                                                [m, m, m, m, m, m, m, m, m, m, m, m],  # (x)10 -> 20
-                                                [m, m, m, m, m, m, m, m, m, m, m, m],  # (x)20 -> 30
-                                                [0, m, m, m, m, 0, m, m, m, m, 0, m],  # 30 -> 40
-                                                [0, 0, 0, 0, m, m, m, m, m, m, m, m],  # (x) 40 -> tok after 40
-
-                                                ], dtype=dtype).view(1, 1, 12, 12))
-
-    # 4th eagle step
-    eagle_input_hidden_states_3, eagle_input_ids_3, attention_mask_3, position_ids_3 = dummy_model._concat_eagle_inputs(
-        eagle_input_ids_0,
-        cat_aux_hidden_states,
-        attention_mask_0,
-        position_ids_0,
-        torch.cat([dummy_eagle_output_hidden_states, dummy_eagle_output_hidden_states,
-                   dummy_eagle_output_hidden_states],dim=1),
-    )
-
-    assert eagle_input_ids_3.equal(torch.tensor([[10, 20, 30, 40, 10, 20, 30, 40,
-                                                  10, 20, 30, 40, 10, 20, 30, 40]], dtype=torch.long))
-    assert position_ids_3.equal(torch.tensor([[0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]], dtype=torch.long))
-
-    assert attention_mask_3.equal(torch.tensor([[0, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m],  # (x)
-                                                [0, 0, m, m, m, m, m, m, m, m, m, m, m, m, m, m],  # (x)
-                                                [0, 0, 0, m, m, m, m, m, m, m, m, m, m, m, m, m],  # (x)
-                                                [0, 0, 0, 0, m, m, m, m, m, m, m, m, m, m, m, m],  # (x)
-
-                                                [m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m],  # (x)
-                                                [0, m, m, m, m, 0, m, m, m, m, m, m, m, m, m, m],  # (x)
-                                                [0, 0, m, m, m, m, 0, m, m, m, m, m, m, m, m, m],  # (x)
-                                                [0, 0, 0, 0, m, m, m, m, m, m, m, m, m, m, m, m],  # (x)
-
-                                                [m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m],  # (x)
-                                                [m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m],  # (x)
-                                                [0, m, m, m, m, 0, m, m, m, m, 0, m, m, m, m, m],  # (x)
-                                                [0, 0, 0, 0, m, m, m, m, m, m, m, m, m, m, m, m],  # (x)
-
-                                                [m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m],  # (x)10 -> 20
-                                                [m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m],  # (x)20 -> 30
-                                                [m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m],  # (x)
-                                                [0, 0, 0, 0, m, m, m, m, m, m, m, m, m, m, m, m],  # (x)
-
-                                                ], dtype=dtype).view(1, 1, 16, 16))
