
Commit c6edf1a (parent: cc46992)

WIP

3 files changed (+453, -5 lines)


csrc/trtllm_fused_moe_kernel_launcher.cu

Lines changed: 70 additions & 4 deletions
@@ -1053,7 +1053,7 @@ std::vector<at::Tensor> trtllm_fp4_block_scale_moe(
     std::optional<int64_t> n_group, std::optional<int64_t> topk_group, int64_t intermediate_size,
     int64_t local_expert_offset, int64_t local_num_experts,
     std::optional<double> routed_scaling_factor, int64_t tile_tokens_dim,
-    int64_t routing_method_type, bool do_finalize, at::Tensor& output) {
+    int64_t routing_method_type, bool do_finalize, at::Tensor& output, int64_t config_index) {
   using RunnerType = tensorrt_llm::kernels::trtllmgen_moe::MoE::Runner;
 
   int const num_tokens = hidden_states.sizes()[0];
@@ -1106,8 +1106,10 @@ std::vector<at::Tensor> trtllm_fp4_block_scale_moe(
       mDtypeAct, mDtypeWeights, mUseDeepSeekFp8, (int32_t)tile_tokens_dim,
       tensorrt_llm::kernels::ActType::SwiGlu, /*useShuffledMatrixA*/ true);
 
-  auto const moeConfigIndex = mRunner->getDefaultValidConfigIndex(
-      top_k, hidden_size, intermediate_size, local_num_experts, num_tokens);
+  if (config_index == -1) {
+    config_index = mRunner->getDefaultValidConfigIndex(top_k, hidden_size, intermediate_size,
+                                                       local_num_experts, num_tokens);
+  }
 
   return trtllm_fp4_block_scale_moe_launcher(
       routing_logits, topk_ids, expert_weights, routing_bias, hidden_states, hidden_states_scale,
@@ -1116,7 +1118,70 @@ std::vector<at::Tensor> trtllm_fp4_block_scale_moe(
       output1_scales_gate_scalar, output2_scales_scalar, num_experts, top_k, n_group, topk_group,
       intermediate_size, local_expert_offset, local_num_experts, routed_scaling_factor,
       tile_tokens_dim, routing_method_type, do_finalize, *mRunner, mDtypeAct, mDtypeWeights,
-      moeConfigIndex, output);
+      config_index, output);
+}
+
+inline btg::Dtype get_dtype(int64_t const dtype) {
+  switch (dtype) {
+    case 0:
+      return btg::Dtype::Bfloat16;
+    case 1:
+      return btg::Dtype::Bool;
+    case 2:
+      return btg::Dtype::E2m1;
+    case 3:
+      return btg::Dtype::E2m3;
+    case 4:
+      return btg::Dtype::E3m2;
+    case 5:
+      return btg::Dtype::E4m3;
+    case 6:
+      return btg::Dtype::E5m2;
+    case 7:
+      return btg::Dtype::Fp16;
+    case 8:
+      return btg::Dtype::Fp32;
+    case 9:
+      return btg::Dtype::Int8;
+    case 10:
+      return btg::Dtype::Int32;
+    case 11:
+      return btg::Dtype::Int64;
+    case 12:
+      return btg::Dtype::MxE2m1;
+    case 13:
+      return btg::Dtype::MxE4m3;
+    case 14:
+      return btg::Dtype::UE8m0;
+    case 15:
+      return btg::Dtype::UInt8;
+    case 16:
+      return btg::Dtype::UInt16;
+    case 17:
+      return btg::Dtype::UInt32;
+    case 18:
+      return btg::Dtype::UInt64;
+    case 19:
+      return btg::Dtype::UInt128;
+    case 20:
+      return btg::Dtype::Void;
+    default:
+      TORCH_CHECK(false, "Invalid trtllm-gen dtype");
+  }
+  return btg::Dtype::E2m1;
+}
+
+std::vector<int64_t> trtllm_get_valid_moe_configs(
+    int64_t const tile_tokens_dim, int64_t const dtype_act_, int64_t const dtype_weights_,
+    bool const useDeepSeekFp8, int64_t const top_k, int64_t const hidden_size,
+    int64_t const intermediate_size, int64_t const num_local_experts, int64_t const num_tokens) {
+  auto dtype_act = get_dtype(dtype_act_);
+  auto dtype_weights = get_dtype(dtype_weights_);
+  tensorrt_llm::kernels::trtllmgen_moe::MoE::Runner moe_runner(
+      dtype_act, dtype_weights, useDeepSeekFp8, (int32_t)tile_tokens_dim,
+      tensorrt_llm::kernels::ActType::SwiGlu, /*useShuffledMatrixA*/ true);
+  return moe_runner.getValidConfigIndices(top_k, hidden_size, intermediate_size, num_local_experts,
+                                          num_tokens);
 }
 
 namespace trtllm_cubin_loader {
@@ -1127,6 +1192,7 @@ TORCH_LIBRARY_FRAGMENT(TORCH_EXTENSION_NAME, m) {
   m.def("trtllm_fp8_per_tensor_scale_moe", trtllm_fp8_per_tensor_scale_moe);
   m.def("trtllm_fp8_block_scale_moe", trtllm_fp8_block_scale_moe);
   m.def("trtllm_fp4_block_scale_moe", trtllm_fp4_block_scale_moe);
+  m.def("trtllm_get_valid_moe_configs", trtllm_get_valid_moe_configs);
 }
 
 } // namespace flashinfer
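
Taken together, the new config_index parameter and the trtllm_get_valid_moe_configs binding let a caller enumerate valid kernel configurations and pick one explicitly, instead of always relying on getDefaultValidConfigIndex. Below is a minimal Python sketch of such a sweep; the helper name, the two injected callables, and the timing loop are illustrative assumptions, not part of this commit. The dtype codes accepted by trtllm_get_valid_moe_configs follow the get_dtype mapping above (for example, 2 for E2m1, 5 for E4m3).

import torch

def pick_best_moe_config(get_valid_configs, run_moe, *shape_args):
    """Time every valid kernel config and return the fastest index.

    `get_valid_configs(*shape_args)` stands in for the newly exposed
    trtllm_get_valid_moe_configs op, and `run_moe(config_index)` for a call
    to trtllm_fp4_block_scale_moe with that config_index (-1 selects the
    default heuristic, matching the C++ fallback above). Both callables
    are assumptions for illustration.
    """
    best_idx, best_ms = -1, float("inf")
    for idx in get_valid_configs(*shape_args):
        run_moe(idx)  # warm-up run; also triggers cubin loading
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()
        run_moe(idx)
        end.record()
        torch.cuda.synchronize()
        ms = start.elapsed_time(end)
        if ms < best_ms:
            best_idx, best_ms = idx, ms
    return best_idx  # the caller passes this back as config_index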

flashinfer/autotuner.py

Lines changed: 9 additions & 1 deletion
@@ -707,7 +707,15 @@ def _create_tensor_like(
         # TODO: FIXME, sometimes the content of the tensor can affect the performance, like MOE
         # One solution is to manituplate the tensor content to make it more like the real data
         # during the tuning process. This can by controlled in the preparation phase by the runner.
-        return torch.zeros(shapes, dtype=dtype, device=device)
+        # return torch.zeros(shapes, dtype=dtype, device=device)
+        if dtype == torch.int8:
+            return torch.randint(0, 127, shapes, dtype=dtype, device=device)
+        elif dtype == torch.uint8:
+            return torch.randint(0, 255, shapes, dtype=dtype, device=device)
+        elif dtype == torch.int32:
+            return torch.randint(0, 1000000, shapes, dtype=dtype, device=device)
+        else:
+            return torch.randn(shapes, dtype=dtype, device=device)
 
     def _prepare_input_tensors(
         self, profile: OptimizationProfile, inputs: List[torch.Tensor]
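
The switch from torch.zeros to dtype-dependent random fill follows the TODO above: all-zero inputs can make tuning measurements unrepresentative of real data, and torch.randn is only implemented for floating-point dtypes, so integer tensors have to be drawn with torch.randint instead. A small standalone demonstration of that constraint (illustrative only, not part of the diff):

import torch

# torch.randn rejects integer dtypes, so the autotuner falls back to
# torch.randint for them; the ranges mirror the branch added above.
try:
    torch.randn((4,), dtype=torch.int8)
except RuntimeError as err:
    print("randn rejects int8:", err)

ints = torch.randint(0, 127, (4,), dtype=torch.int8)    # integer path
floats = torch.randn((4,), dtype=torch.bfloat16)        # floating-point path
print(ints, floats)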
