@@ -587,6 +587,8 @@ class BatchedExperts(mk.FusedMoEPermuteExpertsUnpermute):
 
     def __init__(
         self,
+        world_size: int,
+        dp_size: int,
         max_num_tokens: Optional[int] = None,
         use_fp8_w8a8: bool = False,
         use_int8_w8a8: bool = False,
@@ -603,6 +605,8 @@ def __init__(
         assert not use_int8_w8a16, "NYI"
         assert not use_int4_w4a16, "NYI"
         self.max_num_tokens = max_num_tokens
+        self.world_size = world_size
+        self.dp_size = dp_size
 
     def workspace_shapes(
         self,
@@ -614,10 +618,12 @@ def workspace_shapes(
         num_experts: int,
     ) -> Tuple[int, int, torch.dtype]:
         assert a.dim() == 2
+        num_dp = self.world_size // self.dp_size
         max_num_tokens = a.shape[
             0] if self.max_num_tokens is None else self.max_num_tokens
-        workspace13 = num_experts * max_num_tokens * K
-        workspace2 = max_num_tokens * N
+        # print(f"WORKSPACE {max_num_tokens} {num_dp}")
+        workspace13 = num_experts * max_num_tokens * num_dp * K
+        workspace2 = max_num_tokens * num_dp * N
         return (workspace13, workspace2, a.dtype)
 
     def apply(
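
Note on the `workspace_shapes` change above: with batched all-to-all dispatch, a rank's workspace must hold up to `max_num_tokens` activations from each of the `world_size // dp_size` ranks that can send it tokens, so both scratch buffers grow by that factor. A minimal sketch of the sizing arithmetic, mirroring the hunk (the helper name and example numbers are illustrative, not from the diff):

```python
# Sketch: flat element counts for the two BatchedExperts scratch buffers.
# K is the hidden size and N the w1 output size, matching workspace_shapes().
def batched_workspace_elems(num_experts: int, max_num_tokens: int,
                            world_size: int, dp_size: int,
                            K: int, N: int) -> tuple[int, int]:
    num_dp = world_size // dp_size  # ranks whose tokens may land on this rank
    workspace13 = num_experts * max_num_tokens * num_dp * K
    workspace2 = max_num_tokens * num_dp * N
    return workspace13, workspace2

# e.g. 8 ranks, dp_size=1, 64 experts, 256 tokens per rank, K=4096, N=14336:
print(batched_workspace_elems(64, 256, 8, 1, 4096, 14336))
```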
@@ -648,23 +654,24 @@ def apply(
         else:
             max_num_tokens = self.max_num_tokens
 
+        num_dp = self.world_size // self.dp_size
         num_experts = global_num_experts
         out = _resize_cache(workspace13,
-                            (num_experts, max_num_tokens, hidden_dim))
+                            (num_experts, max_num_tokens * num_dp, hidden_dim))
         num_local_experts = w1.shape[0]  # expert_num_tokens.numel()
         assert num_local_experts == w1.shape[0], f"{num_local_experts} == {w1.shape[0]}"
 
         N = w1.shape[1] // 2
 
         # Not cudagraph friendly
-        assert (torch.cuda.is_current_stream_capturing() or
-                torch.all(expert_num_tokens <= max_num_tokens)), (
-                    f"{expert_num_tokens} <= {max_num_tokens}")
+        # assert (torch.cuda.is_current_stream_capturing() or
+        #         torch.all(expert_num_tokens <= max_num_tokens)), (
+        #             f"{expert_num_tokens} <= {max_num_tokens}")
 
         for expert in range(num_local_experts):
             # Indexing expert_num_tokens doesn't work w/cudagraphs
-            if torch.cuda.is_current_stream_capturing():
-                num = max_num_tokens
+            if True or torch.cuda.is_current_stream_capturing():
+                num = max_num_tokens * num_dp
             else:
                 num = int(expert_num_tokens[expert].item())
             tmp = _resize_cache(workspace2, (num, N))
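
The commented-out assert and the forced `if True` above work around the same constraint: reading `expert_num_tokens` on the host (evaluating `torch.all(...)` in an assert, or calling `.item()`) triggers a device-to-host sync, which is illegal while a CUDA graph is being captured. The loop therefore always processes the padded `max_num_tokens * num_dp` rows per expert so shapes stay static. A sketch of the constraint, with `rows_for_expert` as a hypothetical helper:

```python
import torch

def rows_for_expert(expert_num_tokens: torch.Tensor, expert: int,
                    padded_rows: int) -> int:
    """Hypothetical helper: choose a row count that is safe under capture."""
    if torch.cuda.is_current_stream_capturing():
        # .item() would force a device->host sync, which is not allowed
        # during CUDA graph capture; use the static upper bound instead so
        # kernel launch shapes are identical across graph replays.
        return padded_rows
    return int(expert_num_tokens[expert].item())
```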
@@ -675,166 +682,6 @@ def apply(
         return out
 
 
-def _apply(
-    hidden_states: torch.Tensor,
-    w1: torch.Tensor,
-    w2: torch.Tensor,
-    topk_ids: torch.Tensor,
-    activation: str,
-    global_num_experts: int,
-    expert_map: Optional[torch.Tensor],
-    w1_scale: Optional[torch.Tensor],
-    w2_scale: Optional[torch.Tensor],
-    w1_zp: Optional[torch.Tensor],
-    w2_zp: Optional[torch.Tensor],
-    a1q_scale: Optional[torch.Tensor],
-    a2_scale: Optional[torch.Tensor],
-    workspace13: torch.Tensor,
-    workspace2: torch.Tensor,
-    expert_num_tokens: Optional[torch.Tensor],
-    use_fp8_w8a8: bool,
-    use_int8_w8a16: bool,
-    use_int4_w4a16: bool,
-    block_shape: Optional[List[int]],
-) -> torch.Tensor:
-    # Check constraints.
-    if use_int4_w4a16:
-        assert hidden_states.shape[-1] // 2 == w1.shape[
-            2], "Hidden size mismatch"
-    else:
-        assert hidden_states.shape[-1] == w1.shape[2], \
-            (f"Hidden size mismatch {hidden_states.shape[-1]} "
-             f"!= {w1.shape[2]}")
-
-    assert hidden_states.is_contiguous(
-    ), "Hidden_states must be contiguous"
-    assert w1.stride(-1) == 1, "Stride of last dimension must be 1"
-    assert w2.stride(-1) == 1, "Stride of last dimension must be 1"
-    assert hidden_states.dtype in [
-        torch.float32, torch.float16, torch.bfloat16, torch.float8_e4m3fn
-    ]
-
-    # TODO: num_tokens -> max_num_tokens?
-    E, num_tokens, N, K, top_k_num = mk._moe_problem_size(
-        hidden_states, w1, w2, topk_ids)
-
-    assert w1.shape[0] == E
-    assert w2.shape[0] == E
-
-    config_dtype = get_config_dtype_str(use_fp8_w8a8=use_fp8_w8a8,
-                                        use_int8_w8a16=use_int8_w8a16,
-                                        use_int4_w4a16=use_int4_w4a16,
-                                        dtype=hidden_states.dtype)
-
-    config = try_get_optimal_moe_config(
-        w1.shape,
-        w2.shape,
-        top_k_num,
-        config_dtype,
-        num_tokens,
-        block_shape=block_shape,
-    )
-
-    if hidden_states.dtype == torch.bfloat16:
-        compute_type = tl.bfloat16
-    elif hidden_states.dtype == torch.float16:
-        compute_type = tl.float16
-    elif hidden_states.dtype == torch.float32:
-        compute_type = tl.float32
-    elif hidden_states.dtype == torch.float8_e4m3fn:
-        compute_type = tl.bfloat16
-    else:
-        raise ValueError(
-            f"Unsupported compute_type: {hidden_states.dtype}")
-
-    # print(f"shape: E={E}, M={num_tokens}, N={N}, K={K}, top_k={top_k_num}")
-    # We can reuse the memory between these because by the time we need
-    # cache3, we're done with cache1
-    intermediate_cache1 = _resize_cache(workspace13, (E, num_tokens, N))
-    intermediate_cache2 = _resize_cache(workspace2,
-                                        (E, num_tokens, N // 2))
-    intermediate_cache3 = _resize_cache(workspace13, (E, num_tokens, K))
-
-    # MM1
-    invoke_moe_batched_triton_kernel(A=hidden_states,
-                                     B=w1,
-                                     C=intermediate_cache1,
-                                     expert_num_tokens=expert_num_tokens,
-                                     compute_type=compute_type,
-                                     A_scale=a1q_scale,
-                                     B_scale=w1_scale,
-                                     B_zp=w1_zp,
-                                     use_fp8_w8a8=use_fp8_w8a8,
-                                     use_int8_w8a16=use_int8_w8a16,
-                                     use_int4_w4a16=use_int4_w4a16,
-                                     config=config,
-                                     block_shape=block_shape)
-
-    # Fix activations
-    assert activation == "silu"
-    invoke_batched_silu_and_mul(output=intermediate_cache2,
-                                input=intermediate_cache1,
-                                expert_num_tokens=expert_num_tokens)
-
-    # qintermediate_cache2 = intermediate_cache2
-    a2q_scale = a2_scale
-    # TODO (varun): support w8a8
-    assert not use_fp8_w8a8
-    # if self.use_fp8_w8a8:
-    #     qintermediate_cache2, a2q_scale = _fp8_quantize(
-    #         intermediate_cache2, a2_scale, self.block_shape)
-
-    invoke_moe_batched_triton_kernel(A=intermediate_cache2,
-                                     B=w2,
-                                     C=intermediate_cache3,
-                                     expert_num_tokens=expert_num_tokens,
-                                     compute_type=compute_type,
-                                     A_scale=a2q_scale,
-                                     B_scale=w2_scale,
-                                     B_zp=w2_zp,
-                                     use_fp8_w8a8=use_fp8_w8a8,
-                                     use_int8_w8a16=use_int8_w8a16,
-                                     use_int4_w4a16=use_int4_w4a16,
-                                     config=config,
-                                     block_shape=block_shape)
-
-    return intermediate_cache3
-
-
-def _apply_fake(
-    hidden_states: torch.Tensor,
-    w1: torch.Tensor,
-    w2: torch.Tensor,
-    topk_ids: torch.Tensor,
-    activation: str,
-    global_num_experts: int,
-    expert_map: Optional[torch.Tensor],
-    w1_scale: Optional[torch.Tensor],
-    w2_scale: Optional[torch.Tensor],
-    w1_zp: Optional[torch.Tensor],
-    w2_zp: Optional[torch.Tensor],
-    a1q_scale: Optional[torch.Tensor],
-    a2_scale: Optional[torch.Tensor],
-    workspace13: torch.Tensor,
-    workspace2: torch.Tensor,
-    expert_num_tokens: Optional[torch.Tensor],
-    use_fp8_w8a8: bool,
-    use_int8_w8a16: bool,
-    use_int4_w4a16: bool,
-    block_shape: Optional[List[int]],
-) -> torch.Tensor:
-    return torch.empty_like(hidden_states)
-
-
-direct_register_custom_op(
-    op_name="_apply",
-    op_func=_apply,
-    mutates_args=[],
-    fake_impl=_apply_fake,
-    tags=(torch.Tag.needs_fixed_stride_order, ),
-)
-
-
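
The block removed above was a temporary custom-op indirection: `_apply` held the real batched implementation, `_apply_fake` gave the compiler a shape-only fake implementation, and `direct_register_custom_op` exposed the pair as `torch.ops.vllm._apply`; the matching call site is deleted from `BatchedTritonExperts.apply` in a later hunk, so the body now runs inline. For reference, a toy sketch of the registration pattern being dropped (import path assumed, op deliberately trivial):

```python
import torch
from vllm.utils import direct_register_custom_op  # assumed location

def _double(x: torch.Tensor) -> torch.Tensor:
    return x * 2.0

def _double_fake(x: torch.Tensor) -> torch.Tensor:
    # Fake impl: only shape/dtype propagation matters to torch.compile.
    return torch.empty_like(x)

direct_register_custom_op(
    op_name="_double_example",
    op_func=_double,
    mutates_args=[],
    fake_impl=_double_fake,
)
# Afterwards callable as torch.ops.vllm._double_example(t).
```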
 class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
 
     def __init__(
@@ -845,6 +692,8 @@ def __init__(
         use_int8_w8a16: bool = False,
         use_int4_w4a16: bool = False,
         block_shape: Optional[List[int]] = None,
+        world_size: int = 1,
+        dp_size: int = 1,
     ):
         super().__init__()
         self.use_fp8_w8a8 = use_fp8_w8a8
@@ -855,6 +704,8 @@ def __init__(
         self.max_num_tokens = max_num_tokens
         assert not use_int8_w8a8, "NYI"
         assert not use_int4_w4a16, "NYI"
+        self.world_size = world_size
+        self.dp_size = dp_size
 
     def workspace_shapes(
         self,
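
Both classes now carry `world_size` and `dp_size` so `workspace_shapes` can scale its buffers. A hypothetical construction showing the new keywords (other arguments left at their defaults; see the hunks above for the full signature):

```python
# world_size=8 with dp_size=2 gives num_dp = 8 // 2 = 4, so each expert's
# workspace slot is sized for four ranks' worth of tokens.
experts = BatchedTritonExperts(
    max_num_tokens=256,
    world_size=8,
    dp_size=2,
)
```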
@@ -866,10 +717,11 @@ def workspace_shapes(
         num_experts: int,
     ) -> Tuple[int, int, torch.dtype]:
         assert a.dim() == 2
+        num_dp = self.world_size // self.dp_size
         max_num_tokens = a.shape[
             0] if self.max_num_tokens is None else self.max_num_tokens
-        workspace13 = num_experts * max_num_tokens * max(K, N)
-        workspace2 = num_experts * max_num_tokens * (N // 2)
+        workspace13 = num_experts * max_num_tokens * num_dp * max(K, N)
+        workspace2 = num_experts * max_num_tokens * num_dp * (N // 2)
         return (workspace13, workspace2, a.dtype)
 
     def apply(
@@ -891,29 +743,6 @@ def apply(
         workspace2: torch.Tensor,
         expert_num_tokens: Optional[torch.Tensor],
     ) -> torch.Tensor:
-        return torch.ops.vllm._apply(
-            hidden_states,
-            w1,
-            w2,
-            topk_ids,
-            activation,
-            global_num_experts,
-            expert_map,
-            w1_scale,
-            w2_scale,
-            w1_zp,
-            w2_zp,
-            a1q_scale,
-            a2_scale,
-            workspace13,
-            workspace2,
-            expert_num_tokens,
-            self.use_fp8_w8a8,
-            self.use_int8_w8a16,
-            self.use_int4_w4a16,
-            self.block_shape,
-        )
-
         # Check constraints.
         if self.use_int4_w4a16:
             assert hidden_states.shape[-1] // 2 == w1.shape[
@@ -988,10 +817,13 @@ def apply(
                                          block_shape=self.block_shape)
 
         # Fix activations
-        assert activation == "silu"
-        invoke_batched_silu_and_mul(output=intermediate_cache2,
-                                    input=intermediate_cache1,
-                                    expert_num_tokens=expert_num_tokens)
+        # assert activation == "silu"
+        # invoke_batched_silu_and_mul(output=intermediate_cache2,
+        #                             input=intermediate_cache1,
+        #                             expert_num_tokens=expert_num_tokens)
+        self.activation(activation,
+                        intermediate_cache2.view(-1, N // 2),
+                        intermediate_cache1.view(-1, N))
 
         # qintermediate_cache2 = intermediate_cache2
         a2q_scale = a2_scale
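
The final hunk swaps the hard-coded `invoke_batched_silu_and_mul` kernel for the class's generic `self.activation(...)` hook, applied to 2-D views that flatten the batched `(E, max_num_tokens, N)` buffer into `(E * max_num_tokens, N)`. Note the hook takes no `expert_num_tokens`, so padded rows are now activated too. For `activation == "silu"`, the semantics being computed are gated SiLU; a plain-PyTorch reference sketch (not the fused kernel vLLM dispatches to):

```python
import torch
import torch.nn.functional as F

def silu_and_mul(x: torch.Tensor) -> torch.Tensor:
    # First half of the last dim gates the second half.
    d = x.shape[-1] // 2
    return F.silu(x[..., :d]) * x[..., d:]

x = torch.randn(16, 512)   # stands in for intermediate_cache1.view(-1, N)
out = silu_and_mul(x)      # matches intermediate_cache2.view(-1, N // 2)
assert out.shape == (16, 256)
```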