
Commit 873234c: "Code clean."
1 parent: df3a651

2 files changed: +73, -19 lines

megatron/core/transformer/moe/shared_experts.py (65 additions, 10 deletions)
@@ -2,11 +2,23 @@
 
 import warnings
 from copy import deepcopy
+from enum import Enum
+from functools import wraps
 from typing import Optional
 
 import torch
 import torch.nn.functional as F
 
+
+class SharedExpertState(Enum):
+    """State machine states for SharedExpertMLP overlapped forward pass."""
+
+    IDLE = 0
+    PRE_FORWARD_COMM_DONE = 1
+    FC1_FORWARD_DONE = 2
+    FC2_FORWARD_DONE = 3
+    POST_FORWARD_COMM_DONE = 4
+
 from megatron.core.dist_checkpointing.mapping import ShardedStateDict
 from megatron.core.fusions.fused_bias_geglu import bias_geglu_impl
 from megatron.core.fusions.fused_bias_gelu import bias_gelu_impl
@@ -27,6 +39,41 @@
 )
 
 
+def overlap_state_check(
+    required_state: "SharedExpertState",
+    next_state: "SharedExpertState",
+):
+    """
+    Decorator to validate overlap state and cached variables before method execution,
+    and update state after method execution.
+
+    Args:
+        required_state: The expected SharedExpertState before this method runs.
+        next_state: The SharedExpertState to transition to after method execution.
+    """
+
+    def decorator(method):
+        @wraps(method)
+        def wrapper(self, *args, **kwargs):
+            # Check overlap is enabled
+            assert self.config.moe_shared_expert_overlap, (
+                f"{method.__name__} requires --moe-shared-expert-overlap to be set"
+            )
+            # Check state machine
+            assert self._overlap_state == required_state, (
+                f"{method.__name__} must be called from {required_state.name} state, "
+                f"but current state is {self._overlap_state.name}"
+            )
+            # Execute method
+            result = method(self, *args, **kwargs)
+            # Update state after method execution
+            self._overlap_state = next_state
+            return result
+
+        return wrapper
+
+    return decorator
+
+
 class _BackwardStreamWait(torch.autograd.Function):
     @staticmethod
     def forward(ctx, input, stream):
@@ -131,6 +178,9 @@ def __init__(
         self.cached_output = None
         self.gate_score = None
 
+        # State machine to ensure correct calling order of overlapped forward methods
+        self._overlap_state = SharedExpertState.IDLE
+
         if SharedExpertMLP.stream is None:
             SharedExpertMLP.stream = torch.cuda.Stream()
 
@@ -163,14 +213,15 @@ def wait_current_stream(self):
        """Wait for the current stream to complete."""
        self.stream.wait_stream(torch.cuda.current_stream())
 
+    @overlap_state_check(
+        SharedExpertState.IDLE, SharedExpertState.PRE_FORWARD_COMM_DONE,
+    )
     def pre_forward_comm(self, input, wait_current_stream=True):
         """
         All Gather for SP before forward.
         This function is used to overlap shared experts with the dispatcher.
         It is only useful when --moe-shared-expert-overlap is set and may be changed.
         """
-        assert self.config.moe_shared_expert_overlap
-        assert self.cached_output is None
         if wait_current_stream:
             self.wait_current_stream()
         with torch.cuda.stream(self.stream):
@@ -185,14 +236,15 @@ def pre_forward_comm(self, input, wait_current_stream=True):
                 self.cached_fc1_input = copy_to_tensor_model_parallel_region(input)
             set_tensor_grad_fn_sequence_sr(self.cached_fc1_input, torch.iinfo(torch.int).max)
 
+    @overlap_state_check(
+        SharedExpertState.PRE_FORWARD_COMM_DONE, SharedExpertState.FC1_FORWARD_DONE,
+    )
     def linear_fc1_forward_and_act(self, overlapped_comm_output=None):
         """
         Do Linear FC1 and activation function forward.
         This function is used to overlap shared experts with the dispatcher.
         It is only useful when --moe-shared-expert-overlap is set and may be changed.
         """
-        assert self.config.moe_shared_expert_overlap
-        assert self.cached_fc1_input is not None
         with torch.cuda.stream(self.stream):
             # [s, b, 4 * h/p]
             intermediate_parallel, bias_parallel = self.linear_fc1(self.cached_fc1_input)
@@ -242,29 +294,31 @@ def glu(x):
             # Make sure the shared expert fc1 backward is launched after the routed fc1 backward
             self.cached_fc2_input = _BackwardStreamWait.apply(intermediate_parallel, self.stream)
 
+    @overlap_state_check(
+        SharedExpertState.FC1_FORWARD_DONE, SharedExpertState.FC2_FORWARD_DONE,
+    )
     def linear_fc2_forward(self, overlapped_comm_output=None):
         """
         Do Linear FC2 forward.
         This function is used to overlap shared experts with the dispatcher.
         It is only useful when --moe-shared-expert-overlap is set and may be changed.
         """
-        assert self.config.moe_shared_expert_overlap
-        assert self.cached_fc2_input is not None
         if overlapped_comm_output is not None:
             set_tensor_grad_fn_sequence_sr(overlapped_comm_output, torch.iinfo(torch.int).max)
         with torch.cuda.stream(self.stream):
             # [s, b, h]
             self.cached_fc2_output, _ = self.linear_fc2(self.cached_fc2_input)
             self.cached_fc2_input = None
 
+    @overlap_state_check(
+        SharedExpertState.FC2_FORWARD_DONE, SharedExpertState.POST_FORWARD_COMM_DONE,
+    )
     def post_forward_comm(self):
         """
         Reduce scatter for SP after forward.
         This function is used to overlap shared experts with the dispatcher.
         It is only useful when --moe-shared-expert-overlap is set and may be changed.
         """
-        assert self.config.moe_shared_expert_overlap
-        assert self.cached_fc2_output is not None
        with torch.cuda.stream(self.stream):
             if self.config.sequence_parallel:
                 self.cached_output = reduce_scatter_to_sequence_parallel_region(
@@ -277,14 +331,15 @@ def post_forward_comm(self):
             self.cached_fc2_output = None
             set_tensor_grad_fn_sequence_sr(self.cached_output, torch.iinfo(torch.int).max)
 
+    @overlap_state_check(
+        SharedExpertState.POST_FORWARD_COMM_DONE, SharedExpertState.IDLE,
+    )
     def get_output(self):
         """
         Gets the module forward output.
         This function is used to overlap shared experts with the dispatcher.
         It is only useful when --moe-shared-expert-overlap is set and may be changed.
         """
-        assert self.config.moe_shared_expert_overlap
-        assert self.cached_output is not None
         with torch.cuda.stream(self.stream):
             if self.use_shared_expert_gate:
                 assert self.gate_score is not None
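
To see the decorator pattern above in isolation, here is a minimal standalone sketch of the same idea. The names State, Pipeline, step_a, and step_b are toy stand-ins, not Megatron code; the point is only that calling a method out of the expected order trips an assertion instead of proceeding silently.

from enum import Enum
from functools import wraps


class State(Enum):
    IDLE = 0
    A_DONE = 1
    B_DONE = 2


def state_check(required_state, next_state):
    """Assert the object is in required_state, run the method, then move it to next_state."""

    def decorator(method):
        @wraps(method)
        def wrapper(self, *args, **kwargs):
            assert self.enabled, f"{method.__name__} requires overlap to be enabled"
            assert self._state == required_state, (
                f"{method.__name__} must be called from {required_state.name} state, "
                f"but current state is {self._state.name}"
            )
            result = method(self, *args, **kwargs)
            self._state = next_state
            return result

        return wrapper

    return decorator


class Pipeline:
    def __init__(self):
        self.enabled = True
        self._state = State.IDLE

    @state_check(State.IDLE, State.A_DONE)
    def step_a(self):
        return "a"

    @state_check(State.A_DONE, State.B_DONE)
    def step_b(self):
        return "b"


p = Pipeline()
p.step_a()
p.step_b()    # valid transition: A_DONE -> B_DONE
# p.step_a()  # would raise AssertionError: step_a must be called from IDLE state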

megatron/core/transformer/moe/token_dispatcher.py (8 additions, 9 deletions)
@@ -624,7 +624,7 @@ def token_dispatch(self, permutated_local_input_tokens, permuted_probs):
         Returns:
             A tuple of tokens and probabilities after All-to-All.
         """
-        # Make sure the shared experts fc1 is not launched before dispatch.
+        # Make sure the shared experts fc1 is not launched too early when CUDA_DEVICE_MAX_CONNECTIONS>1.
         if self.shared_experts is not None:
             self.shared_experts.wait_current_stream()
         # Perform expert parallel AlltoAll communication
@@ -784,7 +784,8 @@ def token_combine(
         Returns:
             Tokens after the All-to-All communication for combining.
         """
-        # Make sure the shared experts fc2 is not launched before combine.
+        # Make sure the shared experts fc2 is not overlapped with routed experts fc1
+        # when CUDA_DEVICE_MAX_CONNECTIONS>1.
         if self.shared_experts is not None:
             self.shared_experts.wait_current_stream()
         # Perform expert parallel AlltoAll communication
@@ -796,6 +797,9 @@ def token_combine(
             self.output_splits,
             use_nccl_stream=True,
         )
+        if self.shared_experts is not None:
+            self.shared_experts.linear_fc2_forward(permutated_local_input_tokens)
+            self.shared_experts.post_forward_comm()
         return permutated_local_input_tokens
 
     def combine_postprocess(self, permutated_local_input_tokens):
@@ -811,9 +815,6 @@ def combine_postprocess(self, permutated_local_input_tokens):
         Returns:
             The final MoE layer output reshaped to its original dimensions.
         """
-        if self.shared_experts is not None:
-            self.shared_experts.linear_fc2_forward(permutated_local_input_tokens)
-            self.shared_experts.post_forward_comm()
 
         # Unpermutation 1: AlltoAll output to output
         output = unpermute(
@@ -1418,8 +1419,6 @@ def dispatch_preprocess(
         # Initialize metadata
         routing_map, probs = self._initialize_metadata(routing_map, probs)
 
-        if self.shared_experts is not None:
-            self.shared_experts.wait_current_stream()
         self._comm_manager.setup_metadata(routing_map, probs)
         return hidden_states, self._comm_manager.token_probs
 
@@ -1447,7 +1446,6 @@ def token_dispatch(
         Returns:
             A tuple of dispatched tokens and probabilities.
         """
-        # Make sure the shared experts fc1 is not launched before dispatch.
         if self.shared_experts is not None:
             self.shared_experts.wait_current_stream()
         dispatched_hidden_states = self._comm_manager.dispatch(
@@ -1505,7 +1503,8 @@ def token_combine(
         Returns:
             Combined tokens after fused un-permutation and communication.
         """
-        # Make sure the shared experts fc2 is not launched before combine.
+        # Make sure the shared experts fc2 is not overlapped with routed experts GEMM
+        # when CUDA_DEVICE_MAX_CONNECTIONS>1.
         if self.shared_experts is not None:
             self.shared_experts.wait_current_stream()
         return self._comm_manager.combine(hidden_states, async_finish, allocate_on_comm_stream)
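
The net effect of the token_dispatcher.py changes is that the shared-expert fc2 and post-forward communication are now issued inside token_combine, right after the combine All-to-All, instead of in combine_postprocess. A rough standalone sketch of that ordering follows; all_to_all_combine and SharedExpertsStub are illustrative stand-ins, not the real dispatcher.

def all_to_all_combine(tokens):
    # Stand-in for the expert-parallel All-to-All combine communication.
    return tokens


class SharedExpertsStub:
    # Records the order in which the shared-expert steps are issued.
    def __init__(self):
        self.calls = []

    def wait_current_stream(self):
        self.calls.append("wait_current_stream")

    def linear_fc2_forward(self, tokens):
        self.calls.append("linear_fc2_forward")

    def post_forward_comm(self):
        self.calls.append("post_forward_comm")


def token_combine(tokens, shared_experts=None):
    if shared_experts is not None:
        # Keep shared-expert fc2 from overlapping the routed experts' GEMMs
        # when CUDA_DEVICE_MAX_CONNECTIONS > 1.
        shared_experts.wait_current_stream()
    tokens = all_to_all_combine(tokens)
    if shared_experts is not None:
        # With this commit, fc2 and the post-forward communication run here,
        # immediately after the combine All-to-All, not in combine_postprocess.
        shared_experts.linear_fc2_forward(tokens)
        shared_experts.post_forward_comm()
    return tokens


experts = SharedExpertsStub()
token_combine(["token"], experts)
print(experts.calls)  # ['wait_current_stream', 'linear_fc2_forward', 'post_forward_comm']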
