
Commit bba2f9d

clean up
1 parent 76d3cf3 commit bba2f9d

4 files changed: +86, -32 lines


scripts/train_eagle3.py

Lines changed: 12 additions & 10 deletions
@@ -35,8 +35,9 @@
     destroy_distributed,
     get_dp_group,
     get_draft_dp_group,
+    get_draft_sp_group,
     get_tp_group,
-    init_distributed, get_draft_sp_group,
+    init_distributed,
 )
 from specforge.modeling.target import (
     Eagle3TargetModel,
@@ -631,11 +632,6 @@ def record_metrcs(
         tracker.log(logdict, step=global_step)


-import torch
-import torch.distributed as dist
-import torch.nn.functional as F
-
-
 def get_dp_data_shard_from_tp(tensor: torch.Tensor, sp_dim: int = 1) -> torch.Tensor:
     """
     Process: TP split -> Pad to Max Len -> SP gather.
@@ -657,7 +653,9 @@ def get_dp_data_shard_from_tp(tensor: torch.Tensor, sp_dim: int = 1) -> torch.Te
     local_seq_len = local_tp_shard.size(sp_dim)

     # Find global max sequence length in SP group
-    len_tensor = torch.tensor([local_seq_len], device=local_tp_shard.device, dtype=torch.long)
+    len_tensor = torch.tensor(
+        [local_seq_len], device=local_tp_shard.device, dtype=torch.long
+    )
     dist.all_reduce(len_tensor, op=dist.ReduceOp.MAX, group=sp_group)
     max_seq_len = len_tensor.item()

@@ -674,12 +672,16 @@
         pad_config[pad_idx] = pad_size

         # Pad value: 0 is standard, ensure it matches your pad_token_id logic if needed
-        local_tp_shard_padded = F.pad(local_tp_shard, pad_config, value=0)
+        local_tp_shard_padded = nn.F.pad(local_tp_shard, pad_config, value=0)
     else:
         local_tp_shard_padded = local_tp_shard

-    gathered_shards = [torch.empty_like(local_tp_shard_padded) for _ in range(sp_world_size)]
-    dist.all_gather(gathered_shards, local_tp_shard_padded.contiguous(), group=sp_group)
+    gathered_shards = [
+        torch.empty_like(local_tp_shard_padded) for _ in range(sp_world_size)
+    ]
+    dist.all_gather(
+        gathered_shards, local_tp_shard_padded.contiguous(), group=sp_group
+    )

     return torch.cat(gathered_shards, dim=sp_dim)
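The helper in this diff, `get_dp_data_shard_from_tp`, follows a pad-then-gather pattern: every rank reports its local sequence length, an all-reduce with MAX establishes the group-wide maximum, each shard is right-padded to that length, and an all-gather concatenates the shards along the sequence dimension. A minimal standalone sketch of the same pattern (hypothetical name `gather_padded_along_seq`, not the script's API; assumes torch.distributed is already initialized):

```python
import torch
import torch.distributed as dist
import torch.nn.functional as F


def gather_padded_along_seq(x: torch.Tensor, group=None, seq_dim: int = 1) -> torch.Tensor:
    """Pad a local shard to the group-wide max length, then all_gather along seq_dim."""
    world_size = dist.get_world_size(group)

    # Agree on the maximum sequence length across the group.
    max_len = torch.tensor([x.size(seq_dim)], device=x.device, dtype=torch.long)
    dist.all_reduce(max_len, op=dist.ReduceOp.MAX, group=group)
    pad = int(max_len.item()) - x.size(seq_dim)

    if pad > 0:
        # F.pad's config runs from the last dim backwards: (0, 0) pairs for the
        # trailing dims, then (0, pad) for seq_dim (right padding only).
        pad_config = [0, 0] * (x.dim() - 1 - seq_dim) + [0, pad]
        x = F.pad(x, pad_config, value=0)

    shards = [torch.empty_like(x) for _ in range(world_size)]
    dist.all_gather(shards, x.contiguous(), group=group)
    return torch.cat(shards, dim=seq_dim)
```

As the diff's own comment notes, padding with 0 is only safe if the consumer masks the padded positions or treats 0 as the pad id; otherwise the pad value should follow the tokenizer's pad_token_id.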

specforge/layers/ring/ring_flash_attn.py

Lines changed: 22 additions & 3 deletions
@@ -1,6 +1,8 @@
 import torch
+from yunchang.kernels import AttnType, select_flash_attn_impl
+
 from .utils import RingComm, update_out_and_lse
-from yunchang.kernels import select_flash_attn_impl, AttnType
+

 def ring_flash_attn_forward(
     process_group,
@@ -31,7 +33,9 @@ def ring_flash_attn_forward(
         comm.commit()

         if not causal or step <= comm.rank:
-            fn = select_flash_attn_impl(attn_type, stage="fwd-only", attn_processor=attn_processor)
+            fn = select_flash_attn_impl(
+                attn_type, stage="fwd-only", attn_processor=attn_processor
+            )
             block_out, block_lse = fn(
                 q,
                 k,
@@ -219,7 +223,22 @@ def backward(ctx, dout, *args):
             deterministic=ctx.deterministic,
             attn_type=ctx.attn_type,
         )
-        return dq, dk, dv, None, None, None, None, None, None, None, None, None, None, None
+        return (
+            dq,
+            dk,
+            dv,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+        )


 def ring_flash_attn_qkvpacked_func(
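The ring forward above folds one KV block into the running result per ring step; this only works because a partial attention output can be merged exactly with another as long as each carries its log-sum-exp (LSE). A single-process sketch of that blockwise accumulation in plain PyTorch (hypothetical names, no flash-attn kernels, non-causal only), checked against full attention:

```python
import torch


def merge_blocks(out, lse, block_out, block_lse):
    # Numerically stable merge of two partial attention results via their LSEs.
    new_lse = torch.logaddexp(lse, block_lse)
    new_out = out * torch.exp(lse - new_lse).unsqueeze(-1) + block_out * torch.exp(
        block_lse - new_lse
    ).unsqueeze(-1)
    return new_out, new_lse


def chunked_attention(q, k, v, num_chunks=4):
    # Attention computed one KV chunk at a time, mimicking what each ring step adds.
    scale = q.shape[-1] ** -0.5
    acc_out, acc_lse = None, None
    for kc, vc in zip(k.chunk(num_chunks, dim=1), v.chunk(num_chunks, dim=1)):
        scores = torch.einsum("bqhd,bkhd->bhqk", q, kc) * scale
        block_lse = torch.logsumexp(scores, dim=-1).transpose(1, 2)  # [B, Q, H]
        block_out = torch.einsum("bhqk,bkhd->bqhd", scores.softmax(-1), vc)
        if acc_out is None:
            acc_out, acc_lse = block_out, block_lse
        else:
            acc_out, acc_lse = merge_blocks(acc_out, acc_lse, block_out, block_lse)
    return acc_out


# Sanity check against attention over the full KV at once.
B, S, H, D = 2, 32, 4, 16
q, k, v = (torch.randn(B, S, H, D) for _ in range(3))
full_scores = torch.einsum("bqhd,bkhd->bhqk", q, k) * D**-0.5
full = torch.einsum("bhqk,bkhd->bqhd", full_scores.softmax(-1), v)
assert torch.allclose(chunked_attention(q, k, v), full, atol=1e-4)
```

The `if not causal or step <= comm.rank` guard in the forward is the causal version of the same bookkeeping: a rank only folds in KV blocks its queries are allowed to attend to.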

specforge/layers/ring/utils.py

Lines changed: 3 additions & 2 deletions
@@ -6,14 +6,15 @@

 __all__ = ["update_out_and_lse", "RingComm"]

+
 @torch.jit.script
 def _update_out_and_lse(
     out: torch.Tensor,
     lse: torch.Tensor,
     block_out: torch.Tensor,
     block_lse: torch.Tensor,
 ) -> Tuple[torch.Tensor, torch.Tensor]:
-
+
     block_out = block_out.to(torch.float32)
     block_lse = block_lse.transpose(-2, -1).unsqueeze(dim=-1)

@@ -115,4 +116,4 @@ def wait(self):
         for req in self._reqs:
             req.wait()
         self._reqs = None
-        self._ops = []
+        self._ops = []
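`RingComm`, whose `wait()` is touched above, wraps torch.distributed's batched point-to-point API: queued send/recv ops are launched together and `wait()` blocks on the returned requests before clearing `self._ops`. A hedged sketch of one such neighbour exchange as a standalone function (not the class in this file; assumes the default process group is initialized and every rank calls it):

```python
import torch
import torch.distributed as dist


def ring_exchange(block: torch.Tensor) -> torch.Tensor:
    # One ring step: send our block to rank + 1, receive rank - 1's block.
    rank, world_size = dist.get_rank(), dist.get_world_size()
    recv_buf = torch.empty_like(block)
    ops = [
        dist.P2POp(dist.isend, block.contiguous(), (rank + 1) % world_size),
        dist.P2POp(dist.irecv, recv_buf, (rank - 1) % world_size),
    ]
    reqs = dist.batch_isend_irecv(ops)  # roughly what a commit() step does
    for req in reqs:                    # roughly what wait() does
        req.wait()
    return recv_buf
```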

specforge/modeling/draft/llama3_eagle.py

Lines changed: 49 additions & 17 deletions
@@ -8,11 +8,9 @@
 import torch.nn.functional as F
 from flash_attn import flash_attn_func
 from torch.nn.attention.flex_attention import create_block_mask, flex_attention
-from transformers import LlamaConfig
 from transformers.activations import ACT2FN
 from transformers.cache_utils import Cache
 from transformers.models.llama.configuration_llama import LlamaConfig
-from yunchang import EXTRACT_FUNC_DICT
 from yunchang.comm import SeqAllToAll4D

 from specforge.modeling.draft.flex_attention import (
@@ -21,10 +19,10 @@
     generate_eagle3_mask,
 )
 from specforge.utils import print_with_rank
-from .base import Eagle3DraftModel
+
 from ...distributed import get_sp_ring_group, get_sp_ulysses_group
 from ...layers.ring import ring_flash_attn_func
-
+from .base import Eagle3DraftModel

 try:
     from flash_attn import flash_attn_func
@@ -973,6 +971,7 @@ class LlamaUSPFlashAttention(LlamaAttention):
     """
     LlamaUSPFlashAttention with Trainable Ring Attention & Correct Eagle3 Branch Merging.
     """
+
     def __init__(self, config):
         super().__init__(config)
         assert (
@@ -1008,19 +1007,35 @@ def forward(
         query_states = self.q_proj(hidden_states)
         query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim)
         query_states = SeqAllToAll4D.apply(
-            self.ulysses_pg, query_states, self.scatter_idx, self.gather_idx, self.use_sync
+            self.ulysses_pg,
+            query_states,
+            self.scatter_idx,
+            self.gather_idx,
+            self.use_sync,
         )

         key_states = self.k_proj(hidden_states)
-        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim)
+        key_states = key_states.view(
+            bsz, q_len, self.num_key_value_heads, self.head_dim
+        )
         key_states = SeqAllToAll4D.apply(
-            self.ulysses_pg, key_states, self.scatter_idx, self.gather_idx, self.use_sync
+            self.ulysses_pg,
+            key_states,
+            self.scatter_idx,
+            self.gather_idx,
+            self.use_sync,
         )

         value_states = self.v_proj(hidden_states)
-        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim)
+        value_states = value_states.view(
+            bsz, q_len, self.num_key_value_heads, self.head_dim
+        )
         value_states = SeqAllToAll4D.apply(
-            self.ulysses_pg, value_states, self.scatter_idx, self.gather_idx, self.use_sync
+            self.ulysses_pg,
+            value_states,
+            self.scatter_idx,
+            self.gather_idx,
+            self.use_sync,
         )

         current_q_len = query_states.shape[1]
@@ -1034,17 +1049,26 @@ def forward(
        # =============================================================
        if self.sp_ring_degree > 1:
            if isinstance(self.rotary_emb, LlamaMutiRotaryEmbedding):
-                position_ids = position_ids.chunk(self.sp_ring_degree, dim=2)[self.ring_rank].clone()
+                position_ids = position_ids.chunk(self.sp_ring_degree, dim=2)[
+                    self.ring_rank
+                ].clone()
            else:
-                position_ids = position_ids.chunk(self.sp_ring_degree, dim=1)[self.ring_rank].clone()
+                position_ids = position_ids.chunk(self.sp_ring_degree, dim=1)[
+                    self.ring_rank
+                ].clone()

        lck = 0 if cache_hidden is None else len(cache_hidden[0])

        if isinstance(self.rotary_emb, LlamaMutiRotaryEmbedding):
            cos, sin = self.rotary_emb(query_states, position_ids + lck)
            cos, sin = cos.to(query_states.device), sin.to(query_states.device)
            query_states, key_states = apply_multimodal_rotary_pos_emb(
-                query_states, key_states, cos, sin, self.config.rope_scaling["mrope_section"], unsqueeze_dim=2
+                query_states,
+                key_states,
+                cos,
+                sin,
+                self.config.rope_scaling["mrope_section"],
+                unsqueeze_dim=2,
            )
        else:
            cos, sin = self.rotary_emb(query_states, seq_len=global_q_len + lck)
@@ -1087,8 +1111,9 @@ def forward(
        else:
            acc_lse = lse_ring

-        assert acc_lse.shape[1] == current_q_len, \
-            f"LSE seq_len {acc_lse.shape[1]} mismatch with Query seq_len {current_q_len}"
+        assert (
+            acc_lse.shape[1] == current_q_len
+        ), f"LSE seq_len {acc_lse.shape[1]} mismatch with Query seq_len {current_q_len}"

        acc_out = out_ring

@@ -1097,7 +1122,13 @@ def forward(
        num_kv_heads_local = cache_k[0].shape[2]
        local_groups = local_num_heads // num_kv_heads_local

-        q_shape_expanded = (bsz, current_q_len, num_kv_heads_local, local_groups, self.head_dim)
+        q_shape_expanded = (
+            bsz,
+            current_q_len,
+            num_kv_heads_local,
+            local_groups,
+            self.head_dim,
+        )
        qi_reshaped = query_states.view(q_shape_expanded)  # [B, S, KV, G, D]

        for i in range(1, len(cache_k)):
@@ -1118,8 +1149,9 @@ def forward(
            # Online Softmax Update
            new_lse = torch.logaddexp(acc_lse, step_lse)

-            acc_out = acc_out * torch.exp(acc_lse - new_lse).unsqueeze(-1) + \
-                step_out * torch.exp(step_lse - new_lse).unsqueeze(-1)
+            acc_out = acc_out * torch.exp(acc_lse - new_lse).unsqueeze(
+                -1
+            ) + step_out * torch.exp(step_lse - new_lse).unsqueeze(-1)

            acc_lse = new_lse
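In the Ulysses branch above, `SeqAllToAll4D` converts q/k/v from a sequence-sharded layout [batch, seq/P, heads, head_dim] to a head-sharded layout [batch, seq, heads/P, head_dim], so each rank attends over the full sequence for its slice of heads. A minimal sketch of that redistribution with `dist.all_to_all` (hypothetical helper name, equal shard sizes assumed; not yunchang's implementation):

```python
import torch
import torch.distributed as dist


def scatter_heads_gather_seq(x: torch.Tensor, group=None) -> torch.Tensor:
    # x: [B, S/P, H, D] -- local sequence shard, all heads.
    # returns: [B, S, H/P, D] -- full sequence, local head shard.
    world_size = dist.get_world_size(group)

    # Chunk the head dimension; chunk i is sent to rank i.
    send = [c.contiguous() for c in x.chunk(world_size, dim=2)]
    recv = [torch.empty_like(send[0]) for _ in range(world_size)]
    dist.all_to_all(recv, send, group=group)

    # The chunk received from rank j is rank j's sequence shard of our heads,
    # so concatenating along the sequence dim restores the full sequence.
    return torch.cat(recv, dim=1)
```

The inverse direction (gather heads, re-scatter the sequence) is the same call with the chunk and concat dimensions swapped, which is roughly what the attention output needs before the output projection.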
