@@ -1,15 +1,13 @@
 # SPDX-License-Identifier: Apache-2.0
 
-import torch
-import triton
-import triton.language as tl
+from dataclasses import dataclass
 
 import pytest
-from dataclasses import dataclass
+import torch
+import triton.language as tl
 
 from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
-    invoke_moe_batched_triton_kernel,
-    invoke_batched_silu_and_mul)
+    invoke_batched_silu_and_mul, invoke_moe_batched_triton_kernel)
 
 
 @dataclass
@@ -20,25 +18,36 @@ class BatchedMMConfig:
     K: int
     N: int
 
+
 @dataclass
 class BatchedMMTensors:
     A: torch.Tensor  # [E, max_tokens, K]
     B: torch.Tensor  # [E, K, N] - column major
     C: torch.Tensor  # [E, max_tokens, N]
-    num_expert_tokens: torch.Tensor  # [E]
+    num_expert_tokens: torch.Tensor  # [E]
 
     @staticmethod
     def make_tensors(config: BatchedMMConfig):
-        A = torch.randn((config.num_experts, config.max_tokens_per_expert, config.K), device="cuda", dtype=config.dtype) / 50.0
-        B = torch.randn((config.num_experts, config.N, config.K), device="cuda", dtype=config.dtype) / 50.0
-        C = torch.zeros((config.num_experts, config.max_tokens_per_expert, config.N), device="cuda", dtype=config.dtype)
-        num_expert_tokens = torch.randint(low=0, high=config.max_tokens_per_expert, size=(config.num_experts,), device="cuda", dtype=torch.int32)
-        return BatchedMMTensors(A,B,C, num_expert_tokens)
-
-
-def ref_impl(A: torch.Tensor,
-             B: torch.Tensor,
-             C: torch.Tensor,
+        A = torch.randn(
+            (config.num_experts, config.max_tokens_per_expert, config.K),
+            device="cuda",
+            dtype=config.dtype) / 50.0
+        B = torch.randn((config.num_experts, config.N, config.K),
+                        device="cuda",
+                        dtype=config.dtype) / 50.0
+        C = torch.zeros(
+            (config.num_experts, config.max_tokens_per_expert, config.N),
+            device="cuda",
+            dtype=config.dtype)
+        num_expert_tokens = torch.randint(low=0,
+                                          high=config.max_tokens_per_expert,
+                                          size=(config.num_experts, ),
+                                          device="cuda",
+                                          dtype=torch.int32)
+        return BatchedMMTensors(A, B, C, num_expert_tokens)
+
+
+def ref_impl(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor,
             num_expert_tokens: torch.Tensor) -> torch.Tensor:
 
     num_expert_tokens_cpu = num_expert_tokens.clone()
@@ -49,49 +58,50 @@ def ref_impl(A: torch.Tensor,
         num_tokens = num_expert_tokens_cpu[e]
         C[e, :num_tokens, :] = A[e, :num_tokens, :] @ B[e].transpose(0, 1)
 
-
     return C
 
+
 @pytest.mark.parametrize("num_experts", [16, 32])
 @pytest.mark.parametrize("max_tokens_per_expert", [512])
 @pytest.mark.parametrize("K", [256])
 @pytest.mark.parametrize("N", [512])
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
-def test_batched_mm(num_experts: int,
-                    max_tokens_per_expert: int,
-                    K: int,
-                    N: int,
-                    dtype: torch.dtype):
+def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
+                    N: int, dtype: torch.dtype):
 
     config = BatchedMMConfig(dtype, num_experts, max_tokens_per_expert, K, N)
     tensors = BatchedMMTensors.make_tensors(config)
 
     test_output = tensors.C
     ref_output = test_output.clone()
 
-
-    compute_tl_dtype = {torch.float16: tl.float16,
-                        torch.bfloat16: tl.bfloat16,
-                        torch.float32: tl.float32}[test_output.dtype]
-    invoke_moe_batched_triton_kernel(tensors.A,
-                                     tensors.B,
-                                     test_output,
-                                     tensors.num_expert_tokens,
-                                     compute_tl_dtype,
-                                     # Quantization data
-                                     None,
-                                     None,
-                                     None,
-                                     # Quantization schemes
-                                     False,
-                                     False,
-                                     False,
-                                     config={"BLOCK_SIZE_M": 16,
-                                             "BLOCK_SIZE_N": 16,
-                                             "BLOCK_SIZE_K": 16})
-
-
-    ref_output = ref_impl(tensors.A, tensors.B, ref_output, tensors.num_expert_tokens)
+    compute_tl_dtype = {
+        torch.float16: tl.float16,
+        torch.bfloat16: tl.bfloat16,
+        torch.float32: tl.float32
+    }[test_output.dtype]
+    invoke_moe_batched_triton_kernel(
+        tensors.A,
+        tensors.B,
+        test_output,
+        tensors.num_expert_tokens,
+        compute_tl_dtype,
+        # Quantization data
+        None,
+        None,
+        None,
+        # Quantization schemes
+        False,
+        False,
+        False,
+        config={
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 16,
+            "BLOCK_SIZE_K": 16
+        })
+
+    ref_output = ref_impl(tensors.A, tensors.B, ref_output,
+                          tensors.num_expert_tokens)
     #torch.cuda.synchronize()
     #print(f"ref output {ref_output}")
     #print(f"test output {test_output}")
@@ -106,6 +116,7 @@ class BatchedSiluMulConfig:
     max_tokens_per_expert: int
     D: int
 
+
 @dataclass
 class BatchedSiluMulTensors:
     input: torch.Tensor
@@ -114,16 +125,24 @@ class BatchedSiluMulTensors:
 
     @staticmethod
     def make_tensors(config: BatchedSiluMulConfig):
-        input = torch.randn((config.num_experts, config.max_tokens_per_expert, config.D * 2), device="cuda", dtype=config.dtype) / 50.0
-        output = torch.zeros((config.num_experts, config.max_tokens_per_expert, config.D), device="cuda", dtype=config.dtype)
-        num_expert_tokens = torch.randint(low=0, high=config.max_tokens_per_expert, size=(config.num_experts,), device="cuda", dtype=torch.int32)
+        input = torch.randn(
+            (config.num_experts, config.max_tokens_per_expert, config.D * 2),
+            device="cuda",
+            dtype=config.dtype) / 50.0
+        output = torch.zeros(
+            (config.num_experts, config.max_tokens_per_expert, config.D),
+            device="cuda",
+            dtype=config.dtype)
+        num_expert_tokens = torch.randint(low=0,
+                                          high=config.max_tokens_per_expert,
+                                          size=(config.num_experts, ),
+                                          device="cuda",
+                                          dtype=torch.int32)
         return BatchedSiluMulTensors(input, output, num_expert_tokens)
 
 
-def ref_batched_silu_mul(
-        output: torch.Tensor,
-        input: torch.Tensor,
-        num_expert_tokens: torch.Tensor) -> torch.Tensor:
+def ref_batched_silu_mul(output: torch.Tensor, input: torch.Tensor,
+                         num_expert_tokens: torch.Tensor) -> torch.Tensor:
 
     num_expert_tokens_cpu = num_expert_tokens.clone()
     num_expert_tokens_cpu = num_expert_tokens_cpu.to(device="cpu")
@@ -140,10 +159,8 @@ def ref_batched_silu_mul(
 @pytest.mark.parametrize("max_tokens_per_expert", [128])
 @pytest.mark.parametrize("D", [128, 256])
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
-def test_batched_silu_mul(num_experts: int,
-                          max_tokens_per_expert: int,
-                          D: int,
-                          dtype: torch.dtype):
+def test_batched_silu_mul(num_experts: int, max_tokens_per_expert: int, D: int,
+                          dtype: torch.dtype):
 
     config = BatchedSiluMulConfig(dtype, num_experts, max_tokens_per_expert, D)
     tensors = BatchedSiluMulTensors.make_tensors(config)
@@ -153,6 +170,7 @@ def test_batched_silu_mul(num_experts: int,
 
     ref_batched_silu_mul(ref_out, tensors.input, tensors.expert_num_tokens)
 
-    invoke_batched_silu_and_mul(test_out, tensors.input, tensors.expert_num_tokens)
+    invoke_batched_silu_and_mul(test_out, tensors.input,
+                                tensors.expert_num_tokens)
 
     torch.testing.assert_close(test_out, ref_out)