 from transformers.activations import ACT2FN
 from transformers.cache_utils import Cache
 from transformers.models.llama.configuration_llama import LlamaConfig
-from yunchang import EXTRACT_FUNC_DICT
 from yunchang.comm import SeqAllToAll4D

 from specforge.modeling.draft.flex_attention import (
     generate_eagle3_mask,
 )
 from specforge.utils import print_with_rank
-from .base import Eagle3DraftModel
+
 from ...distributed import get_sp_ring_group, get_sp_ulysses_group
 from ...layers.ring import ring_flash_attn_func
-
+from .base import Eagle3DraftModel


 # Copied from transformers.models.bart.modeling_bart._make_causal_mask
@@ -961,6 +960,7 @@ class LlamaUSPFlashAttention(LlamaAttention):
     """
     LlamaUSPFlashAttention with Trainable Ring Attention & Correct Eagle3 Branch Merging.
     """
+
     def __init__(self, config):
         super().__init__(config)
         assert (
@@ -996,19 +996,35 @@ def forward(
         query_states = self.q_proj(hidden_states)
         query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim)
         query_states = SeqAllToAll4D.apply(
-            self.ulysses_pg, query_states, self.scatter_idx, self.gather_idx, self.use_sync
+            self.ulysses_pg,
+            query_states,
+            self.scatter_idx,
+            self.gather_idx,
+            self.use_sync,
         )

         key_states = self.k_proj(hidden_states)
-        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim)
+        key_states = key_states.view(
+            bsz, q_len, self.num_key_value_heads, self.head_dim
+        )
         key_states = SeqAllToAll4D.apply(
-            self.ulysses_pg, key_states, self.scatter_idx, self.gather_idx, self.use_sync
+            self.ulysses_pg,
+            key_states,
+            self.scatter_idx,
+            self.gather_idx,
+            self.use_sync,
         )

         value_states = self.v_proj(hidden_states)
-        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim)
+        value_states = value_states.view(
+            bsz, q_len, self.num_key_value_heads, self.head_dim
+        )
         value_states = SeqAllToAll4D.apply(
-            self.ulysses_pg, value_states, self.scatter_idx, self.gather_idx, self.use_sync
+            self.ulysses_pg,
+            value_states,
+            self.scatter_idx,
+            self.gather_idx,
+            self.use_sync,
         )

         current_q_len = query_states.shape[1]
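Note on the SeqAllToAll4D calls above: in USP attention the Ulysses stage trades a sequence shard for a head shard before flash attention runs. The single-process sketch below only illustrates that shape change, not the yunchang kernel itself; the sizes (B, S, H, D, P) and the assumption that scatter/gather indices map to the head and sequence dims are made up for illustration.

# Single-process sketch of the layout change SeqAllToAll4D performs (shapes
# only; the real exchange is a distributed all-to-all over the Ulysses group).
# All sizes here are invented for illustration.
import torch

B, S, H, D = 2, 16, 8, 4   # batch, Ulysses-group sequence, heads, head_dim
P = 4                      # assumed sp_ulysses_degree

full = torch.randn(B, S, H, D)

# Before the all-to-all: each rank holds a sequence shard with every head.
seq_shard = full.chunk(P, dim=1)[0]
assert seq_shard.shape == (B, S // P, H, D)

# After the all-to-all (scatter heads, gather sequence): each rank holds the
# whole Ulysses-group sequence but only its slice of the heads.
head_shard = full.chunk(P, dim=2)[0]
assert head_shard.shape == (B, S, H // P, D)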
@@ -1022,17 +1038,26 @@ def forward(
         # =============================================================
         if self.sp_ring_degree > 1:
             if isinstance(self.rotary_emb, LlamaMutiRotaryEmbedding):
-                position_ids = position_ids.chunk(self.sp_ring_degree, dim=2)[self.ring_rank].clone()
+                position_ids = position_ids.chunk(self.sp_ring_degree, dim=2)[
+                    self.ring_rank
+                ].clone()
             else:
-                position_ids = position_ids.chunk(self.sp_ring_degree, dim=1)[self.ring_rank].clone()
+                position_ids = position_ids.chunk(self.sp_ring_degree, dim=1)[
+                    self.ring_rank
+                ].clone()

         lck = 0 if cache_hidden is None else len(cache_hidden[0])

         if isinstance(self.rotary_emb, LlamaMutiRotaryEmbedding):
             cos, sin = self.rotary_emb(query_states, position_ids + lck)
             cos, sin = cos.to(query_states.device), sin.to(query_states.device)
             query_states, key_states = apply_multimodal_rotary_pos_emb(
-                query_states, key_states, cos, sin, self.config.rope_scaling["mrope_section"], unsqueeze_dim=2
+                query_states,
+                key_states,
+                cos,
+                sin,
+                self.config.rope_scaling["mrope_section"],
+                unsqueeze_dim=2,
             )
         else:
             cos, sin = self.rotary_emb(query_states, seq_len=global_q_len + lck)
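Note on the position_ids slicing above: with ring attention each rank applies rotary embeddings only to the sequence slice it owns, so the global position ids are chunked along the sequence dimension and indexed by ring_rank (dim=2 in the mrope branch, presumably because those ids carry an extra leading section dimension). A toy sketch with made-up sizes for the non-mrope branch:

# Each ring rank rotates only the positions it owns; sizes are invented.
import torch

sp_ring_degree, ring_rank = 4, 2          # assumed ring size and this rank's index
B, S = 1, 16

position_ids = torch.arange(S).unsqueeze(0).expand(B, S)              # [B, S]
local_ids = position_ids.chunk(sp_ring_degree, dim=1)[ring_rank].clone()
print(local_ids)   # tensor([[ 8,  9, 10, 11]])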
@@ -1075,8 +1100,9 @@ def forward(
             else:
                 acc_lse = lse_ring

-            assert acc_lse.shape[1] == current_q_len, \
-                f"LSE seq_len {acc_lse.shape[1]} mismatch with Query seq_len {current_q_len}"
+            assert (
+                acc_lse.shape[1] == current_q_len
+            ), f"LSE seq_len {acc_lse.shape[1]} mismatch with Query seq_len {current_q_len}"

             acc_out = out_ring

@@ -1085,7 +1111,13 @@ def forward(
             num_kv_heads_local = cache_k[0].shape[2]
             local_groups = local_num_heads // num_kv_heads_local

-            q_shape_expanded = (bsz, current_q_len, num_kv_heads_local, local_groups, self.head_dim)
+            q_shape_expanded = (
+                bsz,
+                current_q_len,
+                num_kv_heads_local,
+                local_groups,
+                self.head_dim,
+            )
             qi_reshaped = query_states.view(q_shape_expanded)  # [B, S, KV, G, D]

             for i in range(1, len(cache_k)):
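Note on q_shape_expanded above: viewing the query as [B, S, KV, G, D] lines up each block of G consecutive query heads with the KV head they share, so the cached K/V can be broadcast against the group dimension instead of being repeated per query head. A toy sketch with invented sizes (not the module's actual score computation):

# Group query heads by their shared KV head and broadcast K over the group dim.
import torch

B, S, H, KV, D = 1, 8, 8, 2, 16   # 8 query heads sharing 2 KV heads (assumed)
G = H // KV                        # group size = 4

q = torch.randn(B, S, H, D)
k = torch.randn(B, S, KV, D)

q_grouped = q.view(B, S, KV, G, D)                          # [B, S, KV, G, D]
# scores[b, i, kv, g, j]: query head kv*G + g at position i vs. key position j.
scores = torch.einsum("bqkgd,bskd->bqkgs", q_grouped, k)
assert scores.shape == (B, S, KV, G, S)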
@@ -1106,8 +1138,9 @@ def forward(
                 # Online Softmax Update
                 new_lse = torch.logaddexp(acc_lse, step_lse)

-                acc_out = acc_out * torch.exp(acc_lse - new_lse).unsqueeze(-1) + \
-                    step_out * torch.exp(step_lse - new_lse).unsqueeze(-1)
+                acc_out = acc_out * torch.exp(acc_lse - new_lse).unsqueeze(
+                    -1
+                ) + step_out * torch.exp(step_lse - new_lse).unsqueeze(-1)

                 acc_lse = new_lse

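Note on the online-softmax update above: combining the accumulated output with a new per-block output, weighted by exp of each log-sum-exp minus their logaddexp, is exactly softmax attention over the union of the key/value blocks. A small self-contained check with made-up tensors:

# Verify the log-sum-exp merge against attention over the concatenated blocks.
import torch

torch.manual_seed(0)
q = torch.randn(1, 4)                            # one query, head_dim 4 (assumed)
k1, v1 = torch.randn(3, 4), torch.randn(3, 4)    # first KV block
k2, v2 = torch.randn(5, 4), torch.randn(5, 4)    # second KV block

def partial(q, k, v):
    s = q @ k.T                                  # raw scores [1, n]
    return torch.softmax(s, dim=-1) @ v, torch.logsumexp(s, dim=-1)

out1, lse1 = partial(q, k1, v1)
out2, lse2 = partial(q, k2, v2)

new_lse = torch.logaddexp(lse1, lse2)
merged = out1 * torch.exp(lse1 - new_lse).unsqueeze(-1) \
       + out2 * torch.exp(lse2 - new_lse).unsqueeze(-1)

ref, _ = partial(q, torch.cat([k1, k2]), torch.cat([v1, v2]))
assert torch.allclose(merged, ref, atol=1e-5)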