
Commit 4c40380

semi-working cudagraphs
Signed-off-by: Bill Nell <[email protected]>
1 parent 48ba146 commit 4c40380

File tree

14 files changed: +135 -69 lines changed


csrc/dispatch_utils.h

Lines changed: 13 additions & 0 deletions
@@ -65,5 +65,18 @@
   AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__)        \
   AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__)

+#define VLLM_DISPATCH_CASE_INTEGRAL_AND_UNSIGNED_TYPES(...) \
+  AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__)       \
+  AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__)       \
+  AT_DISPATCH_CASE(at::ScalarType::Short, __VA_ARGS__)      \
+  AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__)        \
+  AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__)       \
+  AT_DISPATCH_CASE(at::ScalarType::UInt16, __VA_ARGS__)     \
+  AT_DISPATCH_CASE(at::ScalarType::UInt32, __VA_ARGS__)     \
+  AT_DISPATCH_CASE(at::ScalarType::UInt64, __VA_ARGS__)
+
 #define VLLM_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) \
   AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_INTEGRAL_TYPES(__VA_ARGS__))
+
+#define VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_INTEGRAL_AND_UNSIGNED_TYPES(__VA_ARGS__))
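For reference, the new dispatch cases correspond to the following torch dtypes on the Python side. The sketch below is illustrative and not part of the commit; it assumes a PyTorch build (2.3 or newer) that exposes the limited-support torch.uint16/uint32/uint64 dtypes.

import torch

# Byte, Char, Short, Int, Long -> already covered by VLLM_DISPATCH_CASE_INTEGRAL_TYPES.
integral = [torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64]
# UInt16, UInt32, UInt64 -> the cases the new macro adds.
unsigned = [torch.uint16, torch.uint32, torch.uint64]

for dtype in integral + unsigned:
    # e.g. top-k expert ids for 4 tokens with k=2
    topk_ids = torch.zeros((4, 2), dtype=dtype)
    print(topk_ids.dtype)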

csrc/moe/moe_align_sum_kernels.cu

Lines changed: 4 additions & 4 deletions
@@ -326,7 +326,7 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
   }

   if (use_global_memory) {
-    VLLM_DISPATCH_INTEGRAL_TYPES(
+    VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
         topk_ids.scalar_type(), "moe_align_block_size_global_mem_kernel", [&] {
           // calc needed amount of shared mem for `tokens_cnts` and `cumsum`
           // tensors
@@ -351,7 +351,7 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
           cumsum_buffer.data_ptr<int32_t>());
         });
   } else if (use_i16) {
-    VLLM_DISPATCH_INTEGRAL_TYPES(
+    VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
         topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
           // set dynamic shared mem
           auto kernel =
@@ -366,7 +366,7 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
               topk_ids.numel());
         });
   } else {
-    VLLM_DISPATCH_INTEGRAL_TYPES(
+    VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
         topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
           auto kernel =
               vllm::moe::moe_align_block_size_kernel<scalar_t, int32_t>;
@@ -391,7 +391,7 @@ void sgl_moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
   TORCH_CHECK(num_experts == 256,
               "sgl_moe_align_block_size kernel only supports deepseek v3.");

-  VLLM_DISPATCH_INTEGRAL_TYPES(
+  VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
       topk_ids.scalar_type(), "sgl_moe_align_block_size_kernel", [&] {
         // calc needed amount of shared mem for `cumsum` tensors
         auto options_int =
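With the dispatch widened, a uint32 topk_ids tensor can be fed to moe_align_block_size while the output buffers stay int32, as the hunks above show. Below is a hedged host-side sketch; the vllm._custom_ops.moe_align_block_size wrapper and its argument order are assumptions based on surrounding vLLM code, not something this diff shows.

import torch
from vllm import _custom_ops as ops  # assumed binding; not part of this diff

num_tokens, topk, num_experts, block_size = 16, 2, 8, 16
topk_ids = torch.randint(0, num_experts, (num_tokens, topk),
                         dtype=torch.int32, device="cuda").to(torch.uint32)

max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
sorted_ids = torch.empty(max_num_tokens_padded, dtype=torch.int32, device="cuda")
expert_ids = torch.empty((max_num_tokens_padded + block_size - 1) // block_size,
                         dtype=torch.int32, device="cuda")
num_tokens_post_pad = torch.empty(1, dtype=torch.int32, device="cuda")

# Assumed argument order: (topk_ids, num_experts, block_size, sorted_ids,
# expert_ids, num_tokens_post_pad); check vllm/_custom_ops.py for your version.
ops.moe_align_block_size(topk_ids, num_experts, block_size, sorted_ids,
                         expert_ids, num_tokens_post_pad)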

csrc/moe/topk_softmax_kernels.cu

Lines changed: 45 additions & 18 deletions
@@ -108,9 +108,17 @@ __launch_bounds__(TPB) __global__
     }
 }

-template <int TPB>
-__launch_bounds__(TPB) __global__ void moeTopK(const float* inputs_after_softmax, const bool* finished, float* output,
-    int* indices, int* source_rows, const int num_experts, const int k, const int start_expert, const int end_expert)
+template <int TPB, typename IndType>
+__launch_bounds__(TPB) __global__ void moeTopK(
+    const float* inputs_after_softmax,
+    const bool* finished,
+    float* output,
+    IndType* indices,
+    int* source_rows,
+    const int num_experts,
+    const int k,
+    const int start_expert,
+    const int end_expert)
 {

     using cub_kvp = cub::KeyValuePair<int, float>;
@@ -182,9 +190,9 @@ __launch_bounds__(TPB) __global__ void moeTopK(const float* inputs_after_softmax
   2) This implementation assumes k is small, but will work for any k.
 */

-template <int VPT, int NUM_EXPERTS, int WARPS_PER_CTA, int BYTES_PER_LDG>
+template <int VPT, int NUM_EXPERTS, int WARPS_PER_CTA, int BYTES_PER_LDG, typename IndType>
 __launch_bounds__(WARPS_PER_CTA* WARP_SIZE) __global__
-    void topkGatingSoftmax(const float* input, const bool* finished, float* output, const int num_rows, int* indices,
+    void topkGatingSoftmax(const float* input, const bool* finished, float* output, const int num_rows, IndType* indices,
         int* source_rows, const int k, const int start_expert, const int end_expert)
 {
     // We begin by enforcing compile time assertions and setting up compile time constants.
@@ -397,8 +405,8 @@ struct TopkConstants
 };
 } // namespace detail

-template <int EXPERTS, int WARPS_PER_TB>
-void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, float* output, int* indices,
+template <int EXPERTS, int WARPS_PER_TB, typename IndType>
+void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, float* output, IndType* indices,
     int* source_row, const int num_rows, const int k, const int start_expert, const int end_expert, cudaStream_t stream)
 {
     static constexpr std::size_t MAX_BYTES_PER_LDG = 16;
@@ -421,10 +429,11 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f
         token_expert_indices, num_tokens, topk, 0, num_experts, \
         stream);

+template <typename IndType>
 void topkGatingSoftmaxKernelLauncher(
     const float* gating_output,
     float* topk_weights,
-    int* topk_indicies,
+    IndType* topk_indicies,
     int* token_expert_indices,
     float* softmax_workspace,
     const int num_tokens,
@@ -493,14 +502,32 @@ void topk_softmax(
     const at::cuda::OptionalCUDAGuard device_guard(device_of(gating_output));
     const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
     torch::Tensor softmax_workspace = torch::empty({workspace_size}, gating_output.options());
-    vllm::moe::topkGatingSoftmaxKernelLauncher(
-        gating_output.data_ptr<float>(),
-        topk_weights.data_ptr<float>(),
-        topk_indices.data_ptr<int>(),
-        token_expert_indices.data_ptr<int>(),
-        softmax_workspace.data_ptr<float>(),
-        num_tokens,
-        num_experts,
-        topk,
-        stream);
+
+    if (topk_indices.scalar_type() == at::ScalarType::Int)
+    {
+        vllm::moe::topkGatingSoftmaxKernelLauncher(
+            gating_output.data_ptr<float>(),
+            topk_weights.data_ptr<float>(),
+            topk_indices.data_ptr<int>(),
+            token_expert_indices.data_ptr<int>(),
+            softmax_workspace.data_ptr<float>(),
+            num_tokens,
+            num_experts,
+            topk,
+            stream);
+    }
+    else
+    {
+        assert(topk_indices.scalar_type() == at::ScalarType::UInt32);
+        vllm::moe::topkGatingSoftmaxKernelLauncher(
+            gating_output.data_ptr<float>(),
+            topk_weights.data_ptr<float>(),
+            topk_indices.data_ptr<uint32_t>(),
+            token_expert_indices.data_ptr<int>(),
+            softmax_workspace.data_ptr<float>(),
+            num_tokens,
+            num_experts,
+            topk,
+            stream);
+    }
 }
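The host wrapper now branches on topk_indices.scalar_type(), taking the original path for Int and the new one for UInt32. A minimal usage sketch follows; it assumes a CUDA device and that the vllm._custom_ops.topk_softmax wrapper keeps its (weights, ids, token_expert_indices, gating_output) argument order.

import torch
from vllm import _custom_ops as ops  # assumed wrapper around torch.ops._moe_C.topk_softmax

num_tokens, num_experts, topk = 16, 8, 2
gating_output = torch.randn(num_tokens, num_experts, device="cuda", dtype=torch.float32)

topk_weights = torch.empty(num_tokens, topk, device="cuda", dtype=torch.float32)
# torch.int32 takes the original branch; torch.uint32 exercises the new one.
topk_ids = torch.empty(num_tokens, topk, device="cuda", dtype=torch.uint32)
token_expert_indices = torch.empty(num_tokens, topk, device="cuda", dtype=torch.int32)

ops.topk_softmax(topk_weights, topk_ids, token_expert_indices, gating_output)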

examples/offline_inference/data_parallel.py

Lines changed: 12 additions & 4 deletions
@@ -31,6 +31,7 @@
 from time import sleep

 from vllm import LLM, SamplingParams
+from vllm.config import CompilationConfig
 from vllm.utils import get_open_port


@@ -65,11 +66,14 @@ def parse_args():
                         type=int,
                         default=0,
                         help="Master node port")
+    parser.add_argument("--enforce-eager",
+                        action='store_true',
+                        help="Enforce eager mode execution.")
     return parser.parse_args()


 def main(model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip,
-         dp_master_port, GPUs_per_dp_rank):
+         dp_master_port, GPUs_per_dp_rank, enforce_eager):
     os.environ["VLLM_DP_RANK"] = str(global_dp_rank)
     os.environ["VLLM_DP_RANK_LOCAL"] = str(local_dp_rank)
     os.environ["VLLM_DP_SIZE"] = str(dp_size)
@@ -109,10 +113,14 @@ def main(model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip,
                                      max_tokens=[16, 20][global_dp_rank % 2])

     # Create an LLM.
+    cconfig = CompilationConfig(
+        level=0,
+    )
     llm = LLM(model=model,
               tensor_parallel_size=GPUs_per_dp_rank,
-              enforce_eager=True,
-              enable_expert_parallel=True)
+              enforce_eager=enforce_eager,
+              enable_expert_parallel=True,
+              compilation_config=cconfig)
     outputs = llm.generate(prompts, sampling_params)
     # Print the outputs.
     for i, output in enumerate(outputs):
@@ -155,7 +163,7 @@ def main(model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip,
         proc = Process(target=main,
                        args=(args.model, dp_size, local_dp_rank,
                              global_dp_rank, dp_master_ip, dp_master_port,
-                             tp_size))
+                             tp_size, args.enforce_eager))
         proc.start()
         procs.append(proc)
     exit_code = 0
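Net effect on the example: passing --enforce-eager reproduces the old behaviour, while omitting it runs the engine with CUDA graphs but compilation level 0 (no torch.compile). A standalone sketch of the same configuration; the model name is a placeholder and not taken from this diff.

from vllm import LLM, SamplingParams
from vllm.config import CompilationConfig

llm = LLM(
    model="some-org/some-moe-model",                # placeholder MoE checkpoint
    enforce_eager=False,                            # allow CUDA graph capture
    enable_expert_parallel=True,
    compilation_config=CompilationConfig(level=0),  # skip torch.compile, as in the diff
)
outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(temperature=0.8, max_tokens=16))
print(outputs[0].outputs[0].text)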

pyproject.toml

Lines changed: 2 additions & 2 deletions
@@ -15,8 +15,8 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "vllm"
 authors = [{name = "vLLM Team"}]
-license = "Apache-2.0"
-license-files = ["LICENSE"]
+#license = "Apache-2.0"
+#license-files = ["LICENSE"]
 readme = "README.md"
 description = "A high-throughput and memory-efficient inference and serving engine for LLMs"
 classifiers = [

vllm/compilation/compiler_interface.py

Lines changed: 2 additions & 2 deletions
@@ -326,9 +326,9 @@ def _get_shape_env() -> AlwaysHitShapeEnv:
         # compilation cache.
         if not envs.VLLM_DISABLE_COMPILE_CACHE:
             assert hash_str is not None, (
-                "failed to get the hash of the compiled graph")
+                f"failed to get the hash of the compiled graph: {file_path}")
             assert file_path is not None, (
-                "failed to get the file path of the compiled graph")
+                f"failed to get the file path of the compiled graph: {file_path}")
         return compiled_graph, (hash_str, file_path)

     def load(self,

vllm/distributed/utils.py

Lines changed: 7 additions & 3 deletions
@@ -360,7 +360,11 @@ def stateless_destroy_torch_distributed_process_group(
     Destroy ProcessGroup returned by
     stateless_init_torch_distributed_process_group().
     """
-    # Lazy import for non-CUDA backends.
-    from torch.distributed.distributed_c10d import _shutdown_backend
-    _shutdown_backend(pg)
+    # TODO: pytorch < 2.7?
+    if False:
+        # Lazy import for non-CUDA backends.
+        from torch.distributed.distributed_c10d import _shutdown_backend
+        _shutdown_backend(pg)
+    else:
+        pg.shutdown()
     _unregister_process_group(pg.group_name)
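The dead if False: branch and the TODO suggest the intended final shape is a version gate rather than an unconditional pg.shutdown(). A hedged sketch of that gate; the 2.7 cutoff comes from the TODO and is an assumption, as is the helper name destroy_pg.

import torch
from packaging import version
from torch.distributed.distributed_c10d import _unregister_process_group


def destroy_pg(pg) -> None:  # hypothetical helper, not part of the diff
    if version.parse(torch.__version__).release < (2, 7):  # assumed cutover point
        # Older torch: use the private backend-shutdown helper.
        from torch.distributed.distributed_c10d import _shutdown_backend
        _shutdown_backend(pg)
    else:
        # Newer torch exposes ProcessGroup.shutdown() directly.
        pg.shutdown()
    _unregister_process_group(pg.group_name)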

vllm/model_executor/layers/fused_moe/fused_batched_moe.py

Lines changed: 11 additions & 11 deletions
@@ -577,11 +577,11 @@ def workspace_shapes(
         topk: int,
         num_experts: int,
     ) -> Tuple[int, int, torch.dtype]:
+        assert a.dim() == 2
         max_num_tokens = a.shape[
-            1] if self.max_num_tokens is None else self.max_num_tokens
-        # TODO: *2 is a hack
-        workspace13 = num_experts * max_num_tokens * K * topk * 2
-        workspace2 = max_num_tokens * N
+            0] if self.max_num_tokens is None else self.max_num_tokens
+        workspace13 = num_experts * max_num_tokens * max(K, N)
+        workspace2 = max_num_tokens * (N // 2)
         return (workspace13, workspace2, a.dtype)

     def apply(
@@ -605,6 +605,7 @@ def apply(
     ) -> torch.Tensor:
         assert hidden_states.dim() == 3
         assert expert_num_tokens is not None
+        hidden_dim = hidden_states.shape[-1]

         if self.max_num_tokens is None:
             max_num_tokens = hidden_states.shape[1]
@@ -613,13 +614,13 @@ def apply(

         num_experts = global_num_experts
         out = _resize_cache(workspace13,
-                            (num_experts, max_num_tokens, w2.shape[1]))
+                            (num_experts, max_num_tokens, hidden_dim))
         num_local_experts = expert_num_tokens.numel()

         for expert in range(num_local_experts):
             num = expert_num_tokens[expert]
-            assert num <= max_num_tokens, f"{num}, {max_num_tokens}"
-            if num > 0:
+            #assert num <= max_num_tokens, f"{num}, {max_num_tokens}"
+            if True or num > 0:  # CUDAGRAPH unfriendly?
                 tmp = _resize_cache(workspace2, (num, w1.shape[1] // 2))
                 self.activation(
                     activation, tmp,
@@ -660,8 +661,9 @@ def workspace_shapes(
         topk: int,
         num_experts: int,
     ) -> Tuple[int, int, torch.dtype]:
+        assert a.dim() == 2
         max_num_tokens = a.shape[
-            1] if self.max_num_tokens is None else self.max_num_tokens
+            0] if self.max_num_tokens is None else self.max_num_tokens
         workspace13 = num_experts * max_num_tokens * max(K, N)
         workspace2 = num_experts * max_num_tokens * (N // 2)
         return (workspace13, workspace2, a.dtype)
@@ -685,9 +687,6 @@ def apply(
         workspace2: torch.Tensor,
         expert_num_tokens: Optional[torch.Tensor],
     ) -> torch.Tensor:
-
-        num_tokens = topk_ids.size(0)
-
         # Check constraints.
         if self.use_int4_w4a16:
             assert hidden_states.shape[-1] // 2 == w1.shape[
@@ -705,6 +704,7 @@ def apply(
             torch.float32, torch.float16, torch.bfloat16, torch.float8_e4m3fn
         ]

+        # TODO: num_tokens -> max_num_tokens?
        E, num_tokens, N, K, top_k_num = mk._moe_problem_size(
             hidden_states, w1, w2, topk_ids)

vllm/model_executor/layers/fused_moe/fused_moe.py

Lines changed: 2 additions & 1 deletion
@@ -870,6 +870,7 @@ def fused_topk(
     gating_output: torch.Tensor,
     topk: int,
     renormalize: bool,
+    indices_type: torch.dtype = torch.int32,
 ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     assert hidden_states.shape[0] == gating_output.shape[0], (
         "Number of tokens mismatch")
@@ -882,7 +883,7 @@ def fused_topk(
                                device=hidden_states.device)
     topk_ids = torch.empty(M,
                            topk,
-                           dtype=torch.int32,
+                           dtype=indices_type,
                            device=hidden_states.device)
     token_expert_indicies = torch.empty(M,
                                         topk,
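The new keyword flows directly into the topk_ids allocation, so callers can now request unsigned 32-bit ids (matching the uint32 branch added to topk_softmax_kernels.cu above). A minimal sketch, assuming a CUDA device and that fused_topk keeps the signature shown in this hunk:

import torch
from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk

num_tokens, hidden_size, num_experts, topk = 16, 128, 8, 2
hidden_states = torch.randn(num_tokens, hidden_size, device="cuda", dtype=torch.float16)
gating_output = torch.randn(num_tokens, num_experts, device="cuda", dtype=torch.float32)

# Default indices_type=torch.int32 keeps the old behaviour; uint32 takes the new path.
topk_weights, topk_ids, token_expert_indices = fused_topk(
    hidden_states, gating_output, topk, renormalize=True,
    indices_type=torch.uint32)
assert topk_ids.dtype == torch.uint32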
