[None][fix] Apply spec_decoding_position_offsets in Python RoPE path for EAGLE3 dynamic tree

sunnyqgg · sunnyqgg · commit 26088df5e49b · 2026-03-31T21:38:50.000-07:00
Signed-off-by: qgai <qgai@nvidia.com>
diff --git a/tensorrt_llm/_torch/modules/attention.py b/tensorrt_llm/_torch/modules/attention.py
@@ -872,6 +872,19 @@ def forward(
         else:
             q, k, v = qkv, None, None
 
+        # For dynamic tree spec decoding with Python RoPE, adjust position_ids
+        # to use tree offsets (same as C++ kernel: past_seq_len + offset).
+        if (not self.rope_fusion
+                and getattr(attn_metadata, 'is_spec_dec_dynamic_tree', False)
+                and getattr(attn_metadata, 'use_spec_decoding', False)
+                and getattr(attn_metadata, 'spec_decoding_position_offsets',
+                            None) is not None
+                and attn_metadata.spec_decoding_position_offsets.dim() ==
+                1  # 1D layout ⇒ dynamic tree
+                and position_ids is not None):
+            position_ids = self._adjust_position_ids_for_spec_dec(
+                position_ids, attn_metadata)
+
         q, k, v = self.apply_rope(q, k, v, position_ids)
         q, k, v = self.convert_qkv(q, k, v)
 
@@ -921,6 +934,25 @@ def apply_rope(self, q: torch.Tensor, k: Optional[torch.Tensor],
             q, k = self.rotary_emb(position_ids, [q, k])
         return q, k, v
 
+    def _adjust_position_ids_for_spec_dec(self, position_ids, attn_metadata):
+        """Replicate C++ kernel's rotary_pos = past_seq_len + offset."""
+        num_contexts = attn_metadata.num_contexts
+        num_gens = attn_metadata.num_seqs - num_contexts
+        if num_gens <= 0:
+            return position_ids
+        gen_len = int(attn_metadata.seq_lens[num_contexts])
+        base_pos = attn_metadata.kv_lens_cuda[num_contexts:num_contexts +
+                                              num_gens] - gen_len
+        offsets = attn_metadata.spec_decoding_position_offsets[:num_gens *
+                                                               gen_len].view(
+                                                                   num_gens,
+                                                                   gen_len)
+        adjusted = (base_pos.unsqueeze(1) + offsets).reshape(-1)
+        start = attn_metadata.num_ctx_tokens
+        end = start + num_gens * gen_len
+        position_ids[0, start:end] = adjusted
+        return position_ids
+
     def apply_qk_norm(self, q, k):
         raise NotImplementedError(
             f"QK norm is not implemented for {self.__class__.__name__}. "
diff --git a/tensorrt_llm/_torch/speculative/eagle3_dynamic_tree.py b/tensorrt_llm/_torch/speculative/eagle3_dynamic_tree.py
@@ -182,7 +182,9 @@ def __init__(
             (max_batch_size, K + K * K * (max_draft_len - 1)), dtype=torch.float32, device="cuda"
         )
         self.history_draft_tokens_parent_buffer = torch.zeros(
-            (max_batch_size, K * (max_draft_len - 1) + 1), dtype=torch.int64, device="cuda"
+            (max_batch_size, max(K * (max_draft_len - 1) + 1, K + 1)),
+            dtype=torch.int64,
+            device="cuda",
         )
         self.tree_mask_buffer = torch.zeros(
             (max_batch_size * loop_max_tokens * loop_max_tokens),

Original file line number	Diff line number	Diff line change
`@@ -182,7 +182,9 @@ def __init__(`
`182`	`182`	`(max_batch_size, K + K * K * (max_draft_len - 1)), dtype=torch.float32, device="cuda"`
`183`	`183`	`)`
`184`	`184`	`self.history_draft_tokens_parent_buffer = torch.zeros(`
`185`		`- (max_batch_size, K * (max_draft_len - 1) + 1), dtype=torch.int64, device="cuda"`
	`185`	`+ (max_batch_size, max(K * (max_draft_len - 1) + 1, K + 1)),`
	`186`	`+ dtype=torch.int64,`
	`187`	`+ device="cuda",`
`186`	`188`	`)`
`187`	`189`	`self.tree_mask_buffer = torch.zeros(`
`188`	`190`	`(max_batch_size * loop_max_tokens * loop_max_tokens),`