Commit d2862c0

lint

Signed-off-by: Bill Nell <[email protected]>

1 parent 3e2cf4b commit d2862c0

17 files changed: +123 -162 lines changed


csrc/activation_kernels.cu

Lines changed: 3 additions & 1 deletion
@@ -70,7 +70,9 @@ __device__ __forceinline__ T gelu_tanh_kernel(const T& x) {
   int64_t num_tokens = input.numel() / input.size(-1);              \
   dim3 grid(num_tokens);                                            \
   dim3 block(std::min(d, 1024));                                    \
-  if (num_tokens == 0) { return; }                                  \
+  if (num_tokens == 0) {                                            \
+    return;                                                         \
+  }                                                                 \
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();     \
   VLLM_DISPATCH_FLOATING_TYPES(                                     \

csrc/dispatch_utils.h

Lines changed: 9 additions & 8 deletions
@@ -66,17 +66,18 @@
   AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__)

 #define VLLM_DISPATCH_CASE_INTEGRAL_AND_UNSIGNED_TYPES(...) \
-  AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__)       \
-  AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__)       \
-  AT_DISPATCH_CASE(at::ScalarType::Short, __VA_ARGS__)      \
-  AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__)        \
-  AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__)       \
-  AT_DISPATCH_CASE(at::ScalarType::UInt16, __VA_ARGS__)     \
-  AT_DISPATCH_CASE(at::ScalarType::UInt32, __VA_ARGS__)     \
+  AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__)   \
+  AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__)   \
+  AT_DISPATCH_CASE(at::ScalarType::Short, __VA_ARGS__)  \
+  AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__)    \
+  AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__)   \
+  AT_DISPATCH_CASE(at::ScalarType::UInt16, __VA_ARGS__) \
+  AT_DISPATCH_CASE(at::ScalarType::UInt32, __VA_ARGS__) \
   AT_DISPATCH_CASE(at::ScalarType::UInt64, __VA_ARGS__)

 #define VLLM_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) \
   AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_INTEGRAL_TYPES(__VA_ARGS__))

 #define VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(TYPE, NAME, ...) \
-  AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_INTEGRAL_AND_UNSIGNED_TYPES(__VA_ARGS__))
+  AT_DISPATCH_SWITCH( \
+      TYPE, NAME, VLLM_DISPATCH_CASE_INTEGRAL_AND_UNSIGNED_TYPES(__VA_ARGS__))

examples/offline_inference/data_parallel.py

Lines changed: 8 additions & 15 deletions
@@ -31,7 +31,6 @@
 from time import sleep

 from vllm import LLM, SamplingParams
-from vllm.config import CompilationConfig
 from vllm.utils import get_open_port


@@ -116,20 +115,13 @@ def main(model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip,
                                      max_tokens=[16, 20][global_dp_rank % 2])

     # Create an LLM.
-    cconfig = CompilationConfig(
-        level=3,
-        #cudagraph_capture_sizes=[512,504,496,488,480,472,464,456,448,440,432,424,416,408,400,392,384,376,368,360,352,344,336,328,320,312,304,296,288,280,272,264,256,248,240,232,224,216,208],
-        #cudagraph_capture_sizes=[512,256,1],
-        #cudagraph_capture_sizes=[192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1]
-        #cudagraph_capture_sizes=[128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1]
+    llm = LLM(
+        model=model,
+        tensor_parallel_size=GPUs_per_dp_rank,
+        enforce_eager=enforce_eager,
+        enable_expert_parallel=True,
+        trust_remote_code=trust_remote_code,
     )
-    llm = LLM(model=model,
-              tensor_parallel_size=GPUs_per_dp_rank,
-              enforce_eager=enforce_eager,
-              enable_expert_parallel=True,
-              compilation_config=cconfig,
-              trust_remote_code=trust_remote_code,
-              )
     outputs = llm.generate(prompts, sampling_params)
     # Print the outputs.
     for i, output in enumerate(outputs):
@@ -172,7 +164,8 @@ def main(model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip,
         proc = Process(target=main,
                        args=(args.model, dp_size, local_dp_rank,
                              global_dp_rank, dp_master_ip, dp_master_port,
-                             tp_size, args.enforce_eager, args.trust_remote_code))
+                             tp_size, args.enforce_eager,
+                             args.trust_remote_code))
         proc.start()
         procs.append(proc)
     exit_code = 0
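Note: with the CompilationConfig plumbing removed, this example now runs with vLLM's default compilation settings. If a specific compilation level is still wanted for local experiments, the dropped pattern can be reinstated roughly as sketched below (illustrative only; `model`, `GPUs_per_dp_rank`, `enforce_eager` and `trust_remote_code` are the variables already in scope inside this example's main(), and the exact CompilationConfig fields accepted may vary by vLLM version):

    from vllm import LLM
    from vllm.config import CompilationConfig

    # Hypothetical re-addition of the compilation config this commit removes.
    cconfig = CompilationConfig(level=3)
    llm = LLM(model=model,
              tensor_parallel_size=GPUs_per_dp_rank,
              enforce_eager=enforce_eager,
              enable_expert_parallel=True,
              compilation_config=cconfig,
              trust_remote_code=trust_remote_code)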

tests/kernels/moe/test_batched_moe.py

Lines changed: 2 additions & 1 deletion
@@ -62,7 +62,8 @@ def ref_impl(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor,


 @pytest.mark.parametrize("num_experts", [16, 32])
-@pytest.mark.parametrize("max_tokens_per_expert", [32, 64, 128, 192, 224, 256, 512])
+@pytest.mark.parametrize("max_tokens_per_expert",
+                         [32, 64, 128, 192, 224, 256, 512])
 @pytest.mark.parametrize("K", [128, 256, 1024])
 @pytest.mark.parametrize("N", [128, 256, 512, 1024])
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])

tests/kernels/moe/test_moe.py

Lines changed: 1 addition & 3 deletions
@@ -11,8 +11,7 @@
 from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock

 import vllm.model_executor.layers.fused_moe  # noqa
-from tests.kernels.utils import (opcheck, stack_and_dev, torch_moe,
-                                 torch_moe_single)
+from tests.kernels.utils import opcheck, stack_and_dev, torch_moe
 from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.fused_moe import fused_moe
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
@@ -27,7 +26,6 @@
 from vllm.model_executor.models.mixtral import MixtralMoE
 from vllm.platforms import current_platform
 from vllm.scalar_type import ScalarType, scalar_types
-from vllm.model_executor.layers.activation import SiluAndMul

 NUM_EXPERTS = [8, 64]
 EP_SIZE = [1, 4]

tests/kernels/moe/test_pplx_moe.py

Lines changed: 13 additions & 13 deletions
@@ -28,8 +28,7 @@
 from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
-    BatchedDispatchCombine,
-    BatchedExperts)
+    BatchedDispatchCombine, BatchedExperts)
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
 from vllm.model_executor.layers.fused_moe.modular_kernel import (
     FusedMoEModularKernel)
@@ -246,15 +245,9 @@ def batched_moe(a, w1, w2, topk_weight, topk_ids):

     fused_experts = FusedMoEModularKernel(
         BatchedDispatchCombine(a.shape[0], world_size=1, dp_size=1, rank=0),
-        BatchedExperts(a.shape[0])
-    )
+        BatchedExperts(a.shape[0]))

-    return fused_experts(a,
-                         w1,
-                         w2,
-                         topk_weight,
-                         topk_ids,
-                         num_experts)
+    return fused_experts(a, w1, w2, topk_weight, topk_ids, num_experts)


 # TODO: same as torch_moe but with fused_topk factored out.
@@ -301,9 +294,15 @@ def test_fused_moe_batched_experts(
     torch_output = torch_batched_moe(a, w1, w2, topk_weight, topk_ids)
     batched_output = batched_moe(a, w1, w2, topk_weight, topk_ids)

-    torch.testing.assert_close(baseline_output, torch_output, atol=2e-2, rtol=0)
+    torch.testing.assert_close(baseline_output,
+                               torch_output,
+                               atol=2e-2,
+                               rtol=0)
     torch.set_printoptions(profile="full")
-    torch.testing.assert_close(baseline_output, batched_output, atol=2e-2, rtol=0)
+    torch.testing.assert_close(baseline_output,
+                               batched_output,
+                               atol=2e-2,
+                               rtol=0)


 def rank_chunk(num, r, w):
@@ -585,7 +584,8 @@ def _pplx_moe(
     topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
     torch_output = torch_moe2(a, w1, w2, topk_weight, topk_ids)
     pplx_output = pplx_moe(pgi, dp_size, a, w1, w2, topk_weight, topk_ids)
-    batched_output = _batched_moe(pgi, dp_size, a, w1, w2, topk_weight, topk_ids)
+    batched_output = _batched_moe(pgi, dp_size, a, w1, w2, topk_weight,
+                                  topk_ids)

     torch_output = chunk_by_rank(torch_output, pgi.rank,
                                  pgi.world_size).to(pplx_output.device)
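For context, the simplified batched_moe helper above composes a FusedMoEModularKernel from BatchedDispatchCombine and BatchedExperts. The sketch below shows that composition standalone; the shapes (w1: [E, 2N, K], w2: [E, K, N]) and the call signature mirror what is visible in this diff, but treat it as an illustration under those assumptions (it also assumes a CUDA device and skips the vLLM config context the real tests set up):

    import torch

    from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
        BatchedDispatchCombine, BatchedExperts)
    from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
    from vllm.model_executor.layers.fused_moe.modular_kernel import (
        FusedMoEModularKernel)

    M, K, N, E, topk = 32, 128, 256, 8, 2
    a = torch.randn(M, K, dtype=torch.bfloat16, device="cuda") / 10
    w1 = torch.randn(E, 2 * N, K, dtype=torch.bfloat16, device="cuda") / 10
    w2 = torch.randn(E, K, N, dtype=torch.bfloat16, device="cuda") / 10
    score = torch.randn(M, E, dtype=torch.bfloat16, device="cuda")

    topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)

    # Single-rank composition, mirroring batched_moe() in this test file.
    fused_experts = FusedMoEModularKernel(
        BatchedDispatchCombine(a.shape[0], world_size=1, dp_size=1, rank=0),
        BatchedExperts(a.shape[0]))

    out = fused_experts(a, w1, w2, topk_weight, topk_ids, E)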

vllm/model_executor/layers/fused_moe/fused_batched_moe.py

Lines changed: 15 additions & 12 deletions
@@ -10,7 +10,6 @@
 from vllm.model_executor.layers.fused_moe.fused_moe import (
     get_config_dtype_str, try_get_optimal_moe_config)
 from vllm.model_executor.layers.fused_moe.utils import _resize_cache
-from vllm.utils import direct_register_custom_op


 @triton.jit
@@ -473,7 +472,8 @@ def rank_chunk(num, r, w):

 class BatchedDispatchCombine(mk.FusedMoEQuantizeDispatchCombine):

-    def __init__(self, max_num_tokens: Optional[int], world_size: int, dp_size: int, rank: int):
+    def __init__(self, max_num_tokens: Optional[int], world_size: int,
+                 dp_size: int, rank: int):
         super().__init__()
         self.world_size = world_size
         self.dp_size = dp_size
@@ -510,16 +510,18 @@ def dispatch(
                                                minlength=num_experts)
             self.max_num_tokens = int(tokens_per_expert.max().item())
         else:
-            tokens_per_expert = torch.zeros(num_experts, dtype=torch.int,
+            tokens_per_expert = torch.zeros(num_experts,
+                                            dtype=torch.int,
                                             device=a1.device)

         rem_experts = num_experts % self.world_size
         num_local_experts = ((num_experts // self.world_size) +
                              (1 if self.rank < rem_experts else 0))

-        b_a1 = torch.zeros((num_local_experts, self.max_num_tokens, hidden_dim),
-                           dtype=a1.dtype,
-                           device=a1.device)
+        b_a1 = torch.zeros(
+            (num_local_experts, self.max_num_tokens, hidden_dim),
+            dtype=a1.dtype,
+            device=a1.device)

         first_expert = (((num_experts // self.world_size) * self.rank) +
                         rem_experts - self.rank)
@@ -540,7 +542,8 @@ def dispatch(
         for expert_id in range(first_expert, last_expert):
             topks = torch.any(topk_ids == expert_id, dim=1).flatten()
             rows = torch.count_nonzero(topks.flatten())
-            b_a1[expert_id - first_expert, :rows, :] = a1[:topks.numel()][topks]
+            b_a1[expert_id -
+                 first_expert, :rows, :] = a1[:topks.numel()][topks]
             tokens_per_expert[expert_id - first_expert] = rows

         return b_a1, a1_scale, tokens_per_expert
@@ -561,7 +564,7 @@ def combine(

         output.fill_(0)

-        first_expert = num_local_experts * self.rank # NOT QUITE RIGHT
+        first_expert = num_local_experts * self.rank  # NOT QUITE RIGHT
         last_expert = first_expert + num_local_experts

         # for expert_id in range(first_expert, last_expert):
@@ -658,8 +661,9 @@ def apply(
             num_experts = global_num_experts
         out = _resize_cache(workspace13,
                             (num_experts, max_num_tokens * num_dp, hidden_dim))
-        num_local_experts = w1.shape[0] #expert_num_tokens.numel()
-        assert num_local_experts == w1.shape[0], f"{num_local_experts} == {w1.shape[0]}"
+        num_local_experts = w1.shape[0]  #expert_num_tokens.numel()
+        assert num_local_experts == w1.shape[
+            0], f"{num_local_experts} == {w1.shape[0]}"

         N = w1.shape[1] // 2

@@ -821,8 +825,7 @@ def apply(
         # invoke_batched_silu_and_mul(output=intermediate_cache2,
         #                             input=intermediate_cache1,
         #                             expert_num_tokens=expert_num_tokens)
-        self.activation(activation,
-                        intermediate_cache2.view(-1, N//2),
+        self.activation(activation, intermediate_cache2.view(-1, N // 2),
                         intermediate_cache1.view(-1, N))

         #qintermediate_cache2 = intermediate_cache2
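The dispatch loop reflowed above gathers, for each local expert, the rows of a1 whose topk_ids include that expert into a fixed-size per-expert buffer. A self-contained sketch of that gather pattern follows; the names mirror the diff, but the snippet is illustrative only, not the vLLM implementation:

    import torch

    num_tokens, hidden_dim, num_experts, topk = 8, 16, 4, 2
    max_num_tokens = num_tokens  # worst case: every token routes to one expert

    a1 = torch.randn(num_tokens, hidden_dim)
    topk_ids = torch.randint(0, num_experts, (num_tokens, topk))

    b_a1 = torch.zeros((num_experts, max_num_tokens, hidden_dim), dtype=a1.dtype)
    tokens_per_expert = torch.zeros(num_experts, dtype=torch.int)

    for expert_id in range(num_experts):
        # Mask of tokens routed to this expert by any of their top-k choices.
        topks = torch.any(topk_ids == expert_id, dim=1)
        rows = torch.count_nonzero(topks)
        # Pack the selected rows into this expert's slot in the batched buffer.
        b_a1[expert_id, :rows, :] = a1[topks]
        tokens_per_expert[expert_id] = rows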

vllm/model_executor/layers/fused_moe/fused_moe.py

Lines changed: 5 additions & 11 deletions
@@ -21,7 +21,7 @@
     _resize_cache, moe_kernel_quantize_input)
 from vllm.platforms import current_platform
 from vllm.triton_utils import tl, triton
-from vllm.utils import direct_register_custom_op, round_up
+from vllm.utils import direct_register_custom_op

 from .rocm_aiter_fused_moe import is_rocm_aiter_moe_enabled

@@ -885,8 +885,7 @@ def fused_topk(
         M,
         topk,
         dtype=torch.int32 if indices_type is None else indices_type,
-        device=hidden_states.device
-    )
+        device=hidden_states.device)
     token_expert_indices = torch.empty(M,
                                        topk,
                                        dtype=torch.int32,
@@ -980,7 +979,7 @@ def get_config_dtype_str(
     return None


-# TODO: use scalar_type?
+# TODO: use scalar_type instead of bools?
 def get_config_qtype(
     use_fp8_w8a8: bool,
     use_int8_w8a8: bool,
@@ -1239,8 +1238,8 @@ def fused_experts_impl(
         assert hidden_states.shape[1] // 2 == w1.shape[
             2], "Hidden size mismatch"
     else:
-        assert hidden_states.shape[1] == w1.shape[2], \
-            f"Hidden size mismatch {hidden_states.shape[1]} != {w1.shape[2]}"
+        assert hidden_states.shape[1] == w1.shape[2], (
+            f"Hidden size mismatch {hidden_states.shape[1]} != {w1.shape[2]}")

     assert topk_weights.shape == topk_ids.shape, "topk shape mismatch"
     assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
@@ -1655,16 +1654,11 @@ def apply(
             expert_ids = torch.repeat_interleave(expert_ids,
                                                  max_num_tokens,
                                                  dim=0)
-            print(f"EXPERT_IDS {expert_ids}")
-            #num_tokens_post_padded = torch.tensor([num_tokens],
-            #                                      device=hidden_states.device,
-            #                                      dtype=torch.int32)
             num_tokens_post_padded = torch.zeros(1,
                                                  device=hidden_states.device,
                                                  dtype=torch.int32)
             num_tokens_post_padded.fill_(max_num_tokens)
             hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
-            #print(f"P = {sorted_token_ids}, {hidden_states.shape}")

             invoke_fused_moe_kernel(hidden_states,
                                     w1,
