     get_dp_group,
     get_draft_dp_group,
     get_tp_group,
-    init_distributed,
+    init_distributed, get_draft_sp_group,
 )
 from specforge.modeling.target import (
     Eagle3TargetModel,
@@ -335,10 +335,6 @@ def sanity_check(args: Namespace) -> None:
         args.draft_accumulation_steps = (
             args.draft_accumulation_steps * args.sp_ulysses_size * args.sp_ring_size
         )
-    if args.attention_backend in ("usp", "usp_fa"):
-        assert (
-            args.train_hidden_states_path is not None
-        ), "train_hidden_states_path should not be None for usp"


 def build_draft_model(args: Namespace) -> Tuple[AutoDraftModelConfig, nn.Module]:
@@ -410,6 +406,9 @@ def build_dataloaders(
     )
     cache_key = hashlib.md5(cache_params_string.encode()).hexdigest()
     train_dataset = load_dataset("json", data_files=args.train_data_path)["train"]
+    is_online = (
+        args.train_data_path is not None and args.train_hidden_states_path is None
+    )
     with rank_0_priority():
         train_eagle3_dataset = build_eagle3_dataset(
             dataset=train_dataset,
@@ -431,7 +430,7 @@ def build_dataloaders(
             cache_key=cache_key,
         )

-    if args.train_hidden_states_path is not None:
+    if not is_online:
         train_eagle3_dataset = build_offline_eagle3_dataset(
             args.train_hidden_states_path,
             args.max_length,
@@ -444,7 +443,7 @@ def build_dataloaders(
         shuffle=True,
         process_group=(
             get_draft_dp_group()
-            if args.attention_backend == "usp"
+            if args.attention_backend == "usp" and not is_online
             else get_dp_group()
         ),
         is_vlm=args.is_vlm,
@@ -475,7 +474,7 @@ def build_dataloaders(
         shuffle=False,
         process_group=(
             get_draft_dp_group()
-            if args.attention_backend == "usp"
+            if args.attention_backend == "usp" and not is_online
             else get_dp_group()
         ),
         is_vlm=args.is_vlm,
@@ -632,13 +631,59 @@ def record_metrcs(
     tracker.log(logdict, step=global_step)


-def get_dp_data_shard_from_tp(tensor: torch.Tensor) -> torch.Tensor:
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+
+
+def get_dp_data_shard_from_tp(tensor: torch.Tensor, sp_dim: int = 1) -> torch.Tensor:
     """
-    Get the data shard from the tensor.
+    Process: TP split -> pad to max length -> SP gather.
     """
-    tp_size = dist.get_world_size(get_tp_group())
-    tp_rank = dist.get_rank(get_tp_group())
-    return tensor.chunk(tp_size, dim=0)[tp_rank]
+    # 1. TP: slice the tensor along the batch dimension
+    tp_group = get_tp_group()
+    tp_size = dist.get_world_size(tp_group)
+    tp_rank = dist.get_rank(tp_group)
+
+    local_tp_shard = tensor.chunk(tp_size, dim=0)[tp_rank]
+
+    # 2. SP: handle dynamic sequence lengths and gather
+    sp_group = get_draft_sp_group()
+
+    if sp_group is not None and dist.get_world_size(sp_group) > 1:
+        sp_world_size = dist.get_world_size(sp_group)
+
+        # --- Fix for variable sequence lengths ---
+        local_seq_len = local_tp_shard.size(sp_dim)
+
+        # Find the global max sequence length in the SP group
+        len_tensor = torch.tensor([local_seq_len], device=local_tp_shard.device, dtype=torch.long)
+        dist.all_reduce(len_tensor, op=dist.ReduceOp.MAX, group=sp_group)
+        max_seq_len = len_tensor.item()
+
+        # Pad the local tensor if necessary.
+        # Assumes shape is [Batch, Seq, Hidden] or [Batch, Seq], with sp_dim=1.
+        if local_seq_len < max_seq_len:
+            pad_size = max_seq_len - local_seq_len
+
+            # Construct the pad list for F.pad (pairs apply from the last dim backwards);
+            # initialize with all zeros so no other dims are padded.
+            pad_config = [0] * (local_tp_shard.ndim * 2)
+
+            pad_idx = (local_tp_shard.ndim - 1 - sp_dim) * 2 + 1
+            pad_config[pad_idx] = pad_size
+
+            # Pad value: 0 is standard; make sure it matches your pad_token_id logic if needed.
+            local_tp_shard_padded = F.pad(local_tp_shard, pad_config, value=0)
+        else:
+            local_tp_shard_padded = local_tp_shard
+
+        gathered_shards = [torch.empty_like(local_tp_shard_padded) for _ in range(sp_world_size)]
+        dist.all_gather(gathered_shards, local_tp_shard_padded.contiguous(), group=sp_group)
+
+        return torch.cat(gathered_shards, dim=sp_dim)
+
+    return local_tp_shard


 def main():
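For reference, a minimal single-process sketch of the padding step introduced in get_dp_data_shard_from_tp above: it shows how the pad_config index maps sp_dim to the right-side pad slot of F.pad, which consumes (left, right) pairs starting from the last dimension. The toy shape and sp_dim=1 are illustrative assumptions; the TP slicing and SP all-gather are omitted here since they require an initialized process group.

import torch
import torch.nn.functional as F

# Toy hidden-states shard: [batch=2, seq=5, hidden=4]; sp_dim=1 is the sequence dim.
local_tp_shard = torch.randn(2, 5, 4)
sp_dim = 1
max_seq_len = 8  # pretend another SP rank holds a longer sequence

pad_size = max_seq_len - local_tp_shard.size(sp_dim)

# F.pad reads (left, right) pairs starting from the LAST dimension,
# so the right-pad slot for sp_dim sits at (ndim - 1 - sp_dim) * 2 + 1.
pad_config = [0] * (local_tp_shard.ndim * 2)
pad_config[(local_tp_shard.ndim - 1 - sp_dim) * 2 + 1] = pad_size

padded = F.pad(local_tp_shard, pad_config, value=0)
assert padded.shape == (2, 8, 4)  # only the sequence dim grew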