
Commit a674762

Varun's fixes/cleanups
Signed-off-by: Bill Nell <[email protected]>
1 parent 9f8e241 commit a674762

11 files changed: +55 −205 lines

tests/kernels/moe/test_batched_moe.py

Lines changed: 1 addition & 71 deletions
@@ -7,7 +7,7 @@
 import triton.language as tl
 
 from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
-    invoke_batched_silu_and_mul, invoke_moe_batched_triton_kernel)
+    invoke_moe_batched_triton_kernel)
 
 
 @dataclass
@@ -103,75 +103,5 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
 
     ref_output = ref_impl(tensors.A, tensors.B, ref_output,
                           tensors.num_expert_tokens)
-    #torch.cuda.synchronize()
-    #print (f"ref output {ref_output}")
-    #print (f"test output {test_output}")
 
     torch.testing.assert_close(test_output, ref_output, atol=1e-3, rtol=1e-3)
-
-
-@dataclass
-class BatchedSiluMulConfig:
-    dtype: torch.dtype
-    num_experts: int
-    max_tokens_per_expert: int
-    D: int
-
-
-@dataclass
-class BatchedSiluMulTensors:
-    input: torch.Tensor
-    output: torch.Tensor
-    expert_num_tokens: torch.Tensor
-
-    @staticmethod
-    def make_tensors(config: BatchedSiluMulConfig):
-        input = torch.randn(
-            (config.num_experts, config.max_tokens_per_expert, config.D * 2),
-            device="cuda",
-            dtype=config.dtype) / 50.0
-        output = torch.zeros(
-            (config.num_experts, config.max_tokens_per_expert, config.D),
-            device="cuda",
-            dtype=config.dtype)
-        num_expert_tokens = torch.randint(low=0,
-                                          high=config.max_tokens_per_expert,
-                                          size=(config.num_experts, ),
-                                          device="cuda",
-                                          dtype=torch.int32)
-        return BatchedSiluMulTensors(input, output, num_expert_tokens)
-
-
-def ref_batched_silu_mul(output: torch.Tensor, input: torch.Tensor,
-                         num_expert_tokens: torch.Tensor) -> torch.Tensor:
-
-    num_expert_tokens_cpu = num_expert_tokens.clone()
-    num_expert_tokens_cpu = num_expert_tokens_cpu.to(device="cpu")
-    num_experts = num_expert_tokens.size(0)
-
-    for e in range(num_experts):
-        num_tokens = num_expert_tokens_cpu[e].item()
-        out_part = output[e, :num_tokens, :]
-        in_part = input[e, :num_tokens, :]
-        torch.ops._C.silu_and_mul(out_part, in_part)
-
-
-@pytest.mark.parametrize("num_experts", [16, 32])
-@pytest.mark.parametrize("max_tokens_per_expert", [128])
-@pytest.mark.parametrize("D", [128, 256])
-@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
-def test_batched_silu_mul(num_experts: int, max_tokens_per_expert: int, D: int,
-                          dtype: torch.dtype):
-
-    config = BatchedSiluMulConfig(dtype, num_experts, max_tokens_per_expert, D)
-    tensors = BatchedSiluMulTensors.make_tensors(config)
-
-    test_out = tensors.output
-    ref_out = torch.zeros_like(test_out)
-
-    ref_batched_silu_mul(ref_out, tensors.input, tensors.expert_num_tokens)
-
-    invoke_batched_silu_and_mul(test_out, tensors.input,
-                                tensors.expert_num_tokens)
-
-    torch.testing.assert_close(test_out, ref_out)
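
For reference, the SiLU-and-mul semantics that the removed test exercised via torch.ops._C.silu_and_mul can be written in plain PyTorch. The sketch below is illustrative only and not part of the commit; the helper name is made up.

import torch
import torch.nn.functional as F

def silu_and_mul_ref(x: torch.Tensor) -> torch.Tensor:
    # x has shape [..., 2 * D]: the first half is gated by SiLU and
    # multiplied elementwise by the second half.
    d = x.shape[-1] // 2
    return F.silu(x[..., :d]) * x[..., d:]

x = torch.randn(4, 16)      # e.g. 4 tokens, D = 8
out = silu_and_mul_ref(x)   # shape [4, 8]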

vllm/distributed/parallel_state.py

Lines changed: 3 additions & 0 deletions
@@ -968,6 +968,9 @@ def pplx_finalize():
     if PPLX_DID_INIT:
         from pplx_kernels.nvshmem import nvshmem_finalize
         logger.info("PPLX finalize")
+        from vllm.model_executor.layers.fused_moe.layer import (
+            _all_to_all_cache)
+        _all_to_all_cache.destroy()
         nvshmem_finalize()
 
 
vllm/distributed/utils.py

Lines changed: 5 additions & 5 deletions
@@ -23,7 +23,7 @@
 
 import vllm.envs as envs
 from vllm.logger import init_logger
-from vllm.utils import get_tcp_uri
+from vllm.utils import get_tcp_uri, is_torch_equal_or_newer
 
 logger = init_logger(__name__)
 
@@ -362,11 +362,11 @@ def stateless_destroy_torch_distributed_process_group(
     Destroy ProcessGroup returned by
     stateless_init_torch_distributed_process_group().
     """
-    # TODO: pytorch < 2.7?
-    if False:
+    if is_torch_equal_or_newer("2.7"):
+        pg.shutdown()
+    else:
         # Lazy import for non-CUDA backends.
         from torch.distributed.distributed_c10d import _shutdown_backend
         _shutdown_backend(pg)
-    else:
-        pg.shutdown()
+
     _unregister_process_group(pg.group_name)
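
The gate above uses pg.shutdown() on PyTorch 2.7 or newer and falls back to the private _shutdown_backend helper otherwise. As a rough standalone illustration of that kind of version check (this is not the vllm.utils implementation of is_torch_equal_or_newer, just a sketch with the same intent):

import torch
from packaging import version

def torch_at_least(min_version: str) -> bool:
    # Compare release tuples so builds like "2.7.0.dev20250101+cu128"
    # still count as >= "2.7".
    return version.parse(torch.__version__).release >= version.parse(min_version).release

if torch_at_least("2.7"):
    print("use pg.shutdown()")
else:
    print("fall back to _shutdown_backend(pg)")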

vllm/forward_context.py

Lines changed: 4 additions & 12 deletions
@@ -27,10 +27,8 @@
 
 @dataclass
 class DPMetadata:
-    max_tokens_across_dp: torch.Tensor
-    num_tokens_across_dp: torch.Tensor
+    max_tokens_across_dp_cpu: torch.Tensor
     cu_tokens_across_dp_cpu: torch.Tensor
-    dp_rank_num_tokens: torch.Tensor
 
 
 @dataclass
@@ -93,16 +91,10 @@ def set_forward_context(attn_metadata: Any,
                                          dtype=torch.int32)
         from vllm.distributed.parallel_state import get_dp_group
         dist.all_reduce(num_tokens_tensor, group=get_dp_group().cpu_group)
-        #TODO device? (tms)
-        max_tokens_across_dp = torch.max(
-            num_tokens_tensor)  #.to(device="cuda")
+        max_tokens_across_dp_cpu = torch.max(num_tokens_tensor)
         cu_tokens_across_dp_cpu = torch.cumsum(num_tokens_tensor, dim=0)
-        dp_rank_num_tokens = torch.tensor(
-            [num_tokens],
-            dtype=torch.uint32,
-            device=vllm_config.device_config.device)
-        dp_metadata = DPMetadata(max_tokens_across_dp, num_tokens_tensor,
-                                 cu_tokens_across_dp_cpu, dp_rank_num_tokens)
+        dp_metadata = DPMetadata(max_tokens_across_dp_cpu,
+                                 cu_tokens_across_dp_cpu)
 
     global _forward_context
     prev_context = _forward_context
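
After this change DPMetadata carries only two CPU tensors. A simplified illustration of how they are derived from the per-DP-rank token counts (no process group needed here; in the real code the CPU all_reduce above fills num_tokens_tensor):

import torch

# Pretend the all_reduce already produced one token count per DP rank.
num_tokens_tensor = torch.tensor([5, 3, 8, 2], dtype=torch.int32)

max_tokens_across_dp_cpu = torch.max(num_tokens_tensor)           # tensor(8)
cu_tokens_across_dp_cpu = torch.cumsum(num_tokens_tensor, dim=0)  # values [5, 8, 16, 18]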

vllm/model_executor/layers/fused_moe/fused_batched_moe.py

Lines changed: 4 additions & 95 deletions
@@ -12,65 +12,6 @@
 from vllm.model_executor.layers.fused_moe.utils import _resize_cache
 
 
-@triton.jit
-def batched_silu_and_mul_kernel(
-        output,  # [E, MAX_NUM_TOKENS, D]
-        input,  # [E, MAX_NUM_TOKENS, D * 2]
-        expert_num_tokens,  # [E]
-        stride_oe,
-        stride_om,
-        stride_ie,
-        stride_im,
-        compute_type: tl.constexpr,
-        D,
-        BLOCK_M: tl.constexpr,
-        BLOCK_D: tl.constexpr):
-
-    expert_id = tl.program_id(axis=0)
-    e_num_tokens = tl.load(expert_num_tokens + expert_id)
-    if e_num_tokens == 0:
-        # early exit
-        return
-
-    pid_m = tl.program_id(axis=1)
-    cta_m_start = pid_m * BLOCK_M
-    if cta_m_start >= e_num_tokens:
-        # early exit
-        return
-
-    cta_input_ptr = input + expert_id * stride_ie + cta_m_start * stride_im
-    cta_output_ptr = output + expert_id * stride_oe + cta_m_start * stride_om
-
-    cta_m_size = min(BLOCK_M, e_num_tokens - cta_m_start)
-    offs_m = tl.arange(0, BLOCK_M)[:, None]
-    mask_m = offs_m < cta_m_size
-
-    cta_input_ptrs = cta_input_ptr + offs_m * stride_im
-    cta_output_ptrs = cta_output_ptr + offs_m * stride_om
-
-    # offset by D
-    offs_D = tl.arange(0, BLOCK_D)
-    cta_input_ptrs = cta_input_ptrs + offs_D
-    cta_output_ptrs = cta_output_ptrs + offs_D
-
-    for d in range(0, tl.cdiv(D, BLOCK_D)):
-        mask_D = offs_D < (D - (d * BLOCK_D))
-        mask_tile = mask_m & mask_D
-
-        x_tile = tl.load(cta_input_ptrs, mask=mask_tile,
-                         other=0.0).to(dtype=tl.float32)
-        y_tile = tl.load(cta_input_ptrs + D, mask=mask_tile, other=0.0)
-
-        # silu and mul
-        out_tile = (x_tile * (1.0 /
-                              (1.0 + tl.exp(-x_tile)))).to(dtype=compute_type)
-        out_tile = out_tile * y_tile
-        tl.store(cta_output_ptrs, out_tile, mask=mask_tile)
-
-        cta_input_ptrs = cta_input_ptrs + BLOCK_D
-        cta_output_ptrs = cta_output_ptrs + BLOCK_D
-
-
 @triton.jit
 def moe_mmk(
     a_ptrs,
@@ -438,33 +379,6 @@ def invoke_moe_batched_triton_kernel(
                                     BLOCK_K=BLOCK_K)
 
 
-def invoke_batched_silu_and_mul(
-        output: torch.Tensor,  #[E, MAX_TOKENS, D]
-        input: torch.Tensor,  #[E, MAX_TOKENS, D * 2]
-        expert_num_tokens: torch.Tensor):
-
-    num_experts = output.size(0)
-    max_num_tokens = output.size(1)
-    D = output.size(2)
-
-    BLOCK_D = 1024
-    BLOCK_M = 1
-
-    compute_tl_dtype = {
-        torch.float16: tl.float16,
-        torch.float32: tl.float32,
-        torch.bfloat16: tl.bfloat16
-    }[output.dtype]
-
-    #print(f"compute type {compute_tl_dtype}")
-
-    grid = (num_experts, triton.cdiv(max_num_tokens, BLOCK_M))
-    batched_silu_and_mul_kernel[grid](output, input, expert_num_tokens,
-                                      output.stride(0), output.stride(1),
-                                      input.stride(0), input.stride(1),
-                                      compute_tl_dtype, D, BLOCK_M, BLOCK_D)
-
-
 def rank_chunk(num, r, w):
     rem = num % w
     return (num // w) + (1 if r < rem else 0)
@@ -797,15 +711,10 @@ def apply(
            config=config,
            block_shape=self.block_shape)
 
-        if activation == "silu":
-            invoke_batched_silu_and_mul(output=intermediate_cache2,
-                                        input=intermediate_cache1,
-                                        expert_num_tokens=expert_num_tokens)
-        else:
-            # TODO: would be nice to use expert_num_tokens here to reduce
-            # garbage compute
-            self.activation(activation, intermediate_cache2.view(-1, N // 2),
-                            intermediate_cache1.view(-1, N))
+        # TODO: would be nice to use expert_num_tokens here to reduce
+        # garbage compute
+        self.activation(activation, intermediate_cache2.view(-1, N // 2),
+                        intermediate_cache1.view(-1, N))
 
         #qintermediate_cache2 = intermediate_cache2
        a2q_scale = a2_scale
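
The TODO kept in the last hunk notes that self.activation now runs over all max-token rows of every expert, padding included. A rough sketch of what using expert_num_tokens to skip that padding could look like (illustrative only, mirroring the reference helper removed from the test file, not an implementation in this commit):

import torch
import torch.nn.functional as F

def batched_silu_and_mul_ref(output: torch.Tensor,            # [E, MAX_TOKENS, D]
                             input: torch.Tensor,              # [E, MAX_TOKENS, 2 * D]
                             expert_num_tokens: torch.Tensor) -> None:
    num_tokens_cpu = expert_num_tokens.to("cpu")
    d = output.size(-1)
    for e in range(output.size(0)):
        # Only the first expert_num_tokens[e] rows of each expert carry real tokens.
        n = int(num_tokens_cpu[e].item())
        x = input[e, :n, :]
        output[e, :n, :] = F.silu(x[..., :d]) * x[..., d:]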

vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 29 additions & 11 deletions
@@ -70,7 +70,7 @@ class FusedMoEParallelConfig:
 
     @property
     def use_pplx_kernels(self):
-        return self.use_ep and has_pplx
+        return self.dp_size > 1 and self.use_ep and has_pplx
 
     @staticmethod
     def make(tp_size_: int, dp_size_: int,
@@ -277,6 +277,12 @@ def __init__(self):
         self._cache: WeakValueDictionary = WeakValueDictionary()
         self._lock = threading.RLock()  # Reentrant lock for thread safety
 
+    def destroy(self):
+        with self._lock:
+            # TODO: can we do del self._cache?
+            for _, a2a in self._cache.items():
+                a2a.destroy()
+
     def get_or_create(self, **kwargs):
         assert has_pplx
         import pplx_kernels as pplx
@@ -287,7 +293,9 @@ def get_or_create(self, **kwargs):
         with self._lock:
             instance = self._cache.get(key)
             if instance is None:
-                # TODO: should be intranode
+                # TODO (varun): Add support to switch to intranode
+                # when all communications are within the same
+                # node.
                 instance = pplx.AllToAll.internode(**kwargs)
                 self._cache[key] = instance
             return instance
@@ -676,7 +684,7 @@ def _construct_dispatch_combine(
         dp_size = moe.ep_size // moe.dp_size  # dp_size actually means TP.
         rank = moe.ep_rank
 
-        if moe.use_ep and has_pplx:
+        if moe.use_pplx_kernels:
             logger.debug("using pplx dispatch")
 
             all_to_all = get_all_to_all(
@@ -1236,17 +1244,27 @@ def naive_multicast(self, x: torch.Tensor,
 
         return buffer
 
-    def must_reduce_shared_outputs(self) -> bool:
-        return self.dp_size > 1 and self.use_ep and has_pplx
+    def must_reduce_shared_expert_outputs(self) -> bool:
+        """
+        The shared_experts are typically computed using the RowParallelLinear
+        layer. The result of this function is typically used as
+        the reduce_results argument to that module.
+        When just tensor-parallel is used, it is not required to reduce
+        the shared_experts results immediately. Instead we reduce
+        once at the end of the MoE op. (Refer to the DeepSeekV2MoE module.)
+        With EP and the pplx kernels, this is no longer viable, as all
+        GPU ranks in the DP group produce the complete set of hidden_states.
+        Therefore it is required that we reduce the shared_experts output
+        early.
+        """
+        return self.use_pplx_kernels
 
     def maybe_all_reduce_tensor_model_parallel(
             self, final_hidden_states: torch.Tensor):
         """
-        The pplx combine kernel reduce across GPU ranks by default. The pplx
-        kernels are used when EP is enabled. In that case, this function is a
-        no-op.
+        The pplx combine kernel reduces across GPU ranks by default.
         """
-        if self.dp_size > 1 and self.use_ep and has_pplx:
+        if self.use_pplx_kernels:
             return final_hidden_states
         else:
             return tensor_model_parallel_all_reduce(final_hidden_states)
@@ -1291,7 +1309,7 @@ def process_chunk(chunk_start, chunk_end, skip_result_store=False):
                     final_hidden_states)
 
         ctx = get_forward_context()
-        max_tokens_across_dp = ctx.dp_metadata.max_tokens_across_dp
+        max_tokens_across_dp = ctx.dp_metadata.max_tokens_across_dp_cpu
         moe_dp_chunk_size_per_rank = MOE_DP_CHUNK_SIZE
 
         num_tokens = full_hidden_states.size(0)
@@ -1313,7 +1331,7 @@ def process_chunk(chunk_start, chunk_end, skip_result_store=False):
     def forward_impl(self, hidden_states: torch.Tensor,
                      router_logits: torch.Tensor):
         assert self.quant_method is not None
-        if self.dp_size > 1 and self.use_ep and has_pplx:
+        if self.moe_parallel_config.use_pplx_kernels:
             return self.forward_impl_chunked(hidden_states, router_logits)
 
         if self.dp_size > 1:
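
The AllToAllCache changes above add a destroy() hook next to get_or_create(). A standalone sketch of the same pattern, with a dummy handle standing in for pplx.AllToAll (the names here are illustrative, not vLLM or pplx APIs):

import threading
from weakref import WeakValueDictionary

class DummyHandle:
    def destroy(self) -> None:
        print("handle destroyed")

class HandleCache:
    def __init__(self):
        # Entries disappear automatically once no caller holds the handle.
        self._cache: WeakValueDictionary = WeakValueDictionary()
        self._lock = threading.RLock()

    def get_or_create(self, **kwargs):
        key = tuple(sorted(kwargs.items()))
        with self._lock:
            instance = self._cache.get(key)
            if instance is None:
                instance = DummyHandle()
                self._cache[key] = instance
            return instance

    def destroy(self):
        with self._lock:
            for _, handle in self._cache.items():
                handle.destroy()

cache = HandleCache()
handle = cache.get_or_create(world_size=2, rank=0)
cache.destroy()  # called at teardown, e.g. from pplx_finalize() above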

vllm/model_executor/models/deepseek_v2.py

Lines changed: 2 additions & 8 deletions
@@ -141,14 +141,8 @@ def __init__(
             intermediate_size=intermediate_size,
             hidden_act=config.hidden_act,
             quant_config=quant_config,
-            # When just tensor-parallel is used, it isn't required
-            # to reduce the shared_output result. Instead we reduce
-            # at the end of the forward pass.
-            # With EP and the pplx kernels - this is no longer viable
-            # as all GPU ranks in DP, produce the complete set of
-            # hidden_states.
-            # Therefore reduce the shared experts early.
-            reduce_results=self.experts.must_reduce_shared_outputs(),
+            reduce_results=self.experts.must_reduce_shared_expert_outputs(
+            ),
             prefix=f"{prefix}.shared_experts",
         )
 
vllm/model_executor/models/llama4.py

Lines changed: 1 addition & 1 deletion
@@ -88,7 +88,7 @@ def __init__(self,
             quant_config=quant_config,
             bias=False,
             prefix=f"{prefix}.shared_expert",
-            reduce_results=False,  # We need to do scatter before reduce
+            reduce_results=self.experts.must_reduce_shared_expert_outputs(),
         )
 
     def forward(self, hidden_states):
