
Commit 2bafbe0

wip

Signed-off-by: Bill Nell <[email protected]>

1 parent a003bd8, commit 2bafbe0

5 files changed: +51 / -49 lines changed


pyproject.toml

Lines changed: 2 additions & 2 deletions

@@ -15,8 +15,8 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "vllm"
 authors = [{name = "vLLM Team"}]
-#license = "Apache-2.0"
-#license-files = ["LICENSE"]
+license = "Apache-2.0"
+license-files = ["LICENSE"]
 readme = "README.md"
 description = "A high-throughput and memory-efficient inference and serving engine for LLMs"
 classifiers = [

tests/kernels/moe/test_cutlass_moe.py

Lines changed: 12 additions & 12 deletions

@@ -241,10 +241,10 @@ def test_cutlass_moe_8_bit_no_graph(
                              per_out_ch)

     score = torch.randn((m, e), device="cuda", dtype=torch.half)
-    topk_weights, topk_ids = fused_topk(mt.a,
-                                        score,
-                                        topk,
-                                        renormalize=False)
+    topk_weights, topk_ids, _ = fused_topk(mt.a,
+                                           score,
+                                           topk,
+                                           renormalize=False)

     # Note that we are using the dequantized versions of the tensors.
     # Using a, w1 and w2 directly results in minor output differences.

@@ -285,10 +285,10 @@ def test_cutlass_moe_8_bit_cuda_graph(
                              per_out_ch)

     score = torch.randn((m, e), device="cuda", dtype=dtype)
-    topk_weights, topk_ids = fused_topk(mt.a,
-                                        score,
-                                        topk,
-                                        renormalize=False)
+    topk_weights, topk_ids, _ = fused_topk(mt.a,
+                                           score,
+                                           topk,
+                                           renormalize=False)

     # Note that we are using the dequantized versions of the tensors.
     # Using a, w1 and w2 directly results in minor output differences.

@@ -338,10 +338,10 @@ def test_cutlass_moe_8_bit_EP(
                              per_out_channel)

     score = torch.randn((m, e), device="cuda", dtype=torch.half)
-    topk_weights, topk_ids = fused_topk(mt.a,
-                                        score,
-                                        topk,
-                                        renormalize=False)
+    topk_weights, topk_ids, _ = fused_topk(mt.a,
+                                           score,
+                                           topk,
+                                           renormalize=False)

     # Note that we are using the dequantized versions of the tensors.
     # Using a, w1 and w2 directly results in minor output differences.
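
The updated tests unpack a third value from fused_topk, which now also returns per-token expert indices. A minimal sketch of the new call pattern, assuming fused_topk is imported from vllm.model_executor.layers.fused_moe.fused_moe (the module modified later in this commit) and using illustrative tensor sizes not taken from the tests:

    import torch
    from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk

    m, e, topk = 4, 8, 2  # illustrative sizes
    a = torch.randn((m, 16), device="cuda", dtype=torch.half)     # activations
    score = torch.randn((m, e), device="cuda", dtype=torch.half)  # router logits

    # fused_topk now yields three tensors; callers that only need the routing
    # weights and ids discard the third with `_`, as in the tests above.
    topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=False)
    assert topk_weights.shape == (m, topk) and topk_ids.shape == (m, topk)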

tests/kernels/moe/test_pplx_moe.py

Lines changed: 3 additions & 3 deletions

@@ -296,7 +296,7 @@ def test_fused_moe_batched_experts(
     score = torch.randn((m, e), device="cuda", dtype=dtype)

     with set_current_vllm_config(vllm_config):
-        topk_weight, topk_ids = fused_topk(a, score, topk, False)
+        topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
         baseline_output = torch_moe2(a, w1, w2, topk_weight, topk_ids)
         torch_output = torch_batched_moe(a, w1, w2, topk_weight, topk_ids)
         batched_output = batched_moe(a, w1, w2, topk_weight, topk_ids)

@@ -404,7 +404,7 @@ def _pplx_dispatch_combine(
     nvshmem_init(uid, pgi.rank, pgi.world_size)
     device = pgi.device

-    topk_weight, topk_ids = fused_topk(a, score, topk, False)
+    topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
     k = a.shape[1]

     a_rep = torch.repeat_interleave(a, topk, dim=0).to(device)

@@ -577,7 +577,7 @@ def _pplx_moe(
     e, _, n = w2.shape

     with set_current_vllm_config(vllm_config):
-        topk_weight, topk_ids = fused_topk(a, score, topk, False)
+        topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
         torch_output = torch_moe2(a, w1, w2, topk_weight, topk_ids)
         pplx_output = pplx_moe(pgi, dp_size, a, w1, w2, topk_weight, topk_ids)
         batched_output = _batched_moe(pgi, dp_size, a, w1, w2, topk_weight, topk_ids)

vllm/model_executor/layers/fused_moe/fused_moe.py

Lines changed: 27 additions & 26 deletions

@@ -887,10 +887,10 @@ def fused_topk(
         dtype=torch.int32 if indices_type is None else indices_type,
         device=hidden_states.device
     )
-    token_expert_indicies = torch.empty(M,
-                                        topk,
-                                        dtype=torch.int32,
-                                        device=hidden_states.device)
+    token_expert_indices = torch.empty(M,
+                                       topk,
+                                       dtype=torch.int32,
+                                       device=hidden_states.device)

     gating_output_float = gating_output.float()  # TODO(woosuk): Optimize this.

@@ -1211,28 +1211,29 @@ def fused_experts(hidden_states: torch.Tensor,


 def fused_experts_impl(
-        hidden_states: torch.Tensor,
-        w1: torch.Tensor,
-        w2: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        inplace: bool = False,
-        activation: str = "silu",
-        apply_router_weight_on_input: bool = False,
-        use_fp8_w8a8: bool = False,
-        use_int8_w8a8: bool = False,
-        use_int8_w8a16: bool = False,
-        use_int4_w4a16: bool = False,
-        per_channel_quant: bool = False,
-        global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        w1_scale: Optional[torch.Tensor] = None,
-        w2_scale: Optional[torch.Tensor] = None,
-        w1_zp: Optional[torch.Tensor] = None,
-        w2_zp: Optional[torch.Tensor] = None,
-        a1_scale: Optional[torch.Tensor] = None,
-        a2_scale: Optional[torch.Tensor] = None,
-        block_shape: Optional[List[int]] = None) -> torch.Tensor:
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    inplace: bool = False,
+    activation: str = "silu",
+    apply_router_weight_on_input: bool = False,
+    use_fp8_w8a8: bool = False,
+    use_int8_w8a8: bool = False,
+    use_int8_w8a16: bool = False,
+    use_int4_w4a16: bool = False,
+    per_channel_quant: bool = False,
+    global_num_experts: int = -1,
+    expert_map: Optional[torch.Tensor] = None,
+    w1_scale: Optional[torch.Tensor] = None,
+    w2_scale: Optional[torch.Tensor] = None,
+    w1_zp: Optional[torch.Tensor] = None,
+    w2_zp: Optional[torch.Tensor] = None,
+    a1_scale: Optional[torch.Tensor] = None,
+    a2_scale: Optional[torch.Tensor] = None,
+    block_shape: Optional[List[int]] = None,
+) -> torch.Tensor:
     # Check constraints.
     if use_int4_w4a16:
         assert hidden_states.shape[1] // 2 == w1.shape[
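
The first hunk renames the misspelled token_expert_indicies buffer to token_expert_indices; together with the three-value unpacking in the tests and in select_experts, fused_topk presumably now returns this buffer as well. A sketch of the assumed tail of the function follows; the ops.topk_softmax call and the return statement sit outside the hunk shown above, so their exact form is an assumption based on the surrounding module:

    # Assumed continuation of fused_topk after the allocation in the hunk above.
    ops.topk_softmax(topk_weights, topk_ids, token_expert_indices,
                     gating_output_float)
    if renormalize:
        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
    # Contract implied by the callers changed in this commit: three tensors,
    # each of shape (M, topk).
    return topk_weights, topk_ids, token_expert_indices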

vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 7 additions & 6 deletions

@@ -1002,12 +1002,13 @@ def select_experts(hidden_states: torch.Tensor,
                 scoring_func=scoring_func,
                 e_score_correction_bias=e_score_correction_bias)
         elif custom_routing_function is None:
-            topk_weights, topk_ids, token_expert_indices = fused_topk(hidden_states=hidden_states,
-                                                                      gating_output=router_logits,
-                                                                      topk=top_k,
-                                                                      renormalize=renormalize,
-                                                                      indices_type=indices_type,
-                                                                      )
+            topk_weights, topk_ids, token_expert_indices = fused_topk(
+                hidden_states=hidden_states,
+                gating_output=router_logits,
+                topk=top_k,
+                renormalize=renormalize,
+                indices_type=indices_type,
+            )
         else:
             assert indices_type is None or indices_type == torch.int32
             topk_weights, topk_ids = custom_routing_function(
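
Note the asymmetry this leaves in select_experts: the fused_topk branch now produces three tensors, while the custom_routing_function branch still unpacks two. A minimal sketch of a custom router satisfying that two-value contract; the function name, parameter names, and dtype choices are hypothetical, not part of this commit:

    import torch

    def naive_topk_router(hidden_states: torch.Tensor,
                          gating_output: torch.Tensor,
                          topk: int,
                          renormalize: bool):
        # Hypothetical stand-in for a custom_routing_function: plain softmax + topk.
        probs = torch.softmax(gating_output.float(), dim=-1)
        topk_weights, topk_ids = torch.topk(probs, topk, dim=-1)
        if renormalize:
            topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
        # Two-value contract: no token_expert_indices, unlike fused_topk above.
        return topk_weights, topk_ids.to(torch.int32)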
