Commits (106 total; the diff below shows changes from 24 commits)
498027c  more (fzyzcjy, Sep 10, 2025)
8b4c44b  more (fzyzcjy, Sep 10, 2025)
db646ac  more (fzyzcjy, Sep 10, 2025)
9c5e3ca  temp 4gpu (fzyzcjy, Sep 10, 2025)
fd64748  more (fzyzcjy, Sep 10, 2025)
145efc4  more (fzyzcjy, Sep 10, 2025)
ed240ab  more (fzyzcjy, Sep 10, 2025)
9178d57  temp (fzyzcjy, Sep 10, 2025)
27d9053  Revert "temp" (fzyzcjy, Sep 10, 2025)
d39f3ec  tune_max_num_tokens 16k -> 32k (fzyzcjy, Sep 10, 2025)
2890f7e  more (fzyzcjy, Sep 10, 2025)
dde43f6  more (fzyzcjy, Sep 10, 2025)
671d1cd  more (fzyzcjy, Sep 10, 2025)
0158022  fix instasll err (fzyzcjy, Sep 16, 2025)
cc512e5  Merge branch 'main-upstream' into feat/bench_cutlass_moe (fzyzcjy, Sep 17, 2025)
0bd7a7f  Merge branch 'feat/hack_license' into feat/bench_cutlass_moe (fzyzcjy, Sep 17, 2025)
20da055  more (fzyzcjy, Sep 17, 2025)
99f5ff2  more (fzyzcjy, Sep 17, 2025)
622f8ae  more (fzyzcjy, Sep 17, 2025)
eb55ef1  hack: mask some selected experts (fzyzcjy, Sep 18, 2025)
723fea7  fix tune_max_num_tokens (fzyzcjy, Sep 18, 2025)
ec9cab1  hack findTotalEltsLessThanTarget (fzyzcjy, Sep 18, 2025)
29d4170  more (fzyzcjy, Sep 18, 2025)
20a361b  more (fzyzcjy, Sep 18, 2025)
d66ef25  more (fzyzcjy, Sep 18, 2025)
f31b592  writeSF (fzyzcjy, Sep 19, 2025)
480057d  pragma unroll (fzyzcjy, Sep 19, 2025)
742521e  Revert "writeSF" (fzyzcjy, Sep 19, 2025)
5cb9936  Revert "pragma unroll" (fzyzcjy, Sep 19, 2025)
5d7e4d3  redo pragma unroll (fzyzcjy, Sep 19, 2025)
d29f4f9  make hidden size const for everywhere (fzyzcjy, Sep 19, 2025)
8543c83  inter_size constexpr for activation (fzyzcjy, Sep 19, 2025)
efb5214  unroll permute copy (fzyzcjy, Sep 19, 2025)
920c1ac  change unroll (fzyzcjy, Sep 19, 2025)
b807967  fix wrong return (fzyzcjy, Sep 19, 2025)
06d5fc9  simp input_sf (fzyzcjy, Sep 19, 2025)
93f8690  try change order (fzyzcjy, Sep 19, 2025)
18676b4  hack (should revert): temp rm padding (fzyzcjy, Sep 19, 2025)
977b8ae  Revert "hack (should revert): temp rm padding" (fzyzcjy, Sep 19, 2025)
ac807d7  prefetch unpermuted_row (fzyzcjy, Sep 19, 2025)
ad285aa  temp hack: EXPAND_THREADS_PER_BLOCK 256->128 (fzyzcjy, Sep 19, 2025)
865e758  temp hack: EXPAND_THREADS_PER_BLOCK 256->32 (fzyzcjy, Sep 19, 2025)
ae5376f  Revert "temp hack: EXPAND_THREADS_PER_BLOCK 256->32" (fzyzcjy, Sep 19, 2025)
c3d3e9f  temp hack: EXPAND_THREADS_PER_BLOCK=128 + blocks=x2 (fzyzcjy, Sep 19, 2025)
0910742  bench kineto (fzyzcjy, Sep 19, 2025)
b15fa24  chore: rm log (fzyzcjy, Sep 19, 2025)
516bf70  chore: log (fzyzcjy, Sep 19, 2025)
630b482  chore: more tests (fzyzcjy, Sep 19, 2025)
54757d7  revert kernel to 09:10 (fzyzcjy, Sep 19, 2025)
ab88987  chore bench (fzyzcjy, Sep 19, 2025)
704c1c5  hack: only enable "thread/=2, block*=2" (fzyzcjy, Sep 19, 2025)
ed6178c  hack: 64thread (fzyzcjy, Sep 19, 2025)
bc48740  Revert "hack: 64thread" (fzyzcjy, Sep 19, 2025)
e5b01db  unroll topk in unpermute kernel (fzyzcjy, Sep 19, 2025)
4106c12  unpermute use AlignedArray (fzyzcjy, Sep 19, 2025)
8c57702  Revert "unpermute use AlignedArray" (fzyzcjy, Sep 19, 2025)
a6c4d38  hack: manual vectorize (fzyzcjy, Sep 19, 2025)
1fddc58  more manual vectorize (fzyzcjy, Sep 19, 2025)
09b5d01  Revert "more manual vectorize" (fzyzcjy, Sep 19, 2025)
5f76d06  Revert "hack: manual vectorize" (fzyzcjy, Sep 19, 2025)
d7631b1  hack: unpermute, maxnreg=32 (fzyzcjy, Sep 19, 2025)
1c6342c  Revert "hack: unpermute, maxnreg=32" (fzyzcjy, Sep 19, 2025)
6ba3ab4  mv load ordering (fzyzcjy, Sep 19, 2025)
6bcaa69  Revert "mv load ordering" (fzyzcjy, Sep 19, 2025)
b1bb205  make orig_cols constexpr (fzyzcjy, Sep 19, 2025)
6ca1b1d  hack rm trap (fzyzcjy, Sep 19, 2025)
fec2f44  Revert "hack rm trap" (fzyzcjy, Sep 19, 2025)
44d390e  Revert "make orig_cols constexpr" (fzyzcjy, Sep 19, 2025)
14d26e8  cp: vectorize (fzyzcjy, Sep 19, 2025)
4480029  cp: mv load ordering (fzyzcjy, Sep 19, 2025)
80b704e  hack: rm bias handling (incorrect? why is it used?) (fzyzcjy, Sep 19, 2025)
f954a1f  hack: read 8 - compute 8, instead of 8x(read 1 compute 1) (fzyzcjy, Sep 19, 2025)
9603cf7  naive handle enable_input_buf (fzyzcjy, Sep 19, 2025)
2e52a49  enable_input_buf use bitwise op (fzyzcjy, Sep 19, 2025)
1c75b9b  hack: unpermute, maxnreg=64 (fzyzcjy, Sep 19, 2025)
4e40624  doActivationKernel reg=32 (fzyzcjy, Sep 19, 2025)
ea5a594  Revert "doActivationKernel reg=32" (fzyzcjy, Sep 19, 2025)
a5d4189  hack: acti blocks 8->6 (fzyzcjy, Sep 19, 2025)
b52391a  Revert "hack: acti blocks 8->6" (fzyzcjy, Sep 19, 2025)
7853d15  hack: acti - infinite num blocks (fzyzcjy, Sep 19, 2025)
f670fa4  hack: acti - mid num blocks (fzyzcjy, Sep 19, 2025)
50378ba  Revert "hack: acti - mid num blocks" (fzyzcjy, Sep 19, 2025)
9d1456a  Revert "hack: acti - infinite num blocks" (fzyzcjy, Sep 19, 2025)
ec89f0c  temp rm all (fzyzcjy, Sep 19, 2025)
783120b  change test (fzyzcjy, Sep 19, 2025)
7b0f471  Revert "temp rm all" (fzyzcjy, Sep 19, 2025)
5873bb3  ARR_LENGTH_CONST (fzyzcjy, Sep 19, 2025)
bb7a97a  Revert "ARR_LENGTH_CONST" (fzyzcjy, Sep 19, 2025)
8c719e6  hack: findTotalEltsLessThanTarget_v2 support arbitrary arr len (fzyzcjy, Sep 19, 2025)
fece864  hack: unroll(4) (fzyzcjy, Sep 19, 2025)
b9bb8c7  Revert "hack: unroll(4)" (fzyzcjy, Sep 19, 2025)
7e6c876  Revert "hack: findTotalEltsLessThanTarget_v2 support arbitrary arr len" (fzyzcjy, Sep 19, 2025)
06003aa  hack NUM_EXPERTS_PER_NODE_CONST (fzyzcjy, Sep 19, 2025)
afa5a61  temp: 4gpu bench (fzyzcjy, Sep 19, 2025)
1583eb0  fix (fzyzcjy, Sep 19, 2025)
3704820  temp rm all (fzyzcjy, Sep 19, 2025)
3a74536  partial cp (fzyzcjy, Sep 19, 2025)
348a536  cp change-block-thread, pragma-unroll, mv-if-check (fzyzcjy, Sep 19, 2025)
0e62fa8  Revert "cp change-block-thread, pragma-unroll, mv-if-check" (fzyzcjy, Sep 19, 2025)
941b68c  enable all except for 21:06 (fzyzcjy, Sep 19, 2025)
47f3d50  Merge remote-tracking branch 'upstream/main' into feat/speedup_moe (fzyzcjy, Sep 19, 2025)
d504c61  Revert "feat: Benchmark mm_fp4 mxfp4 support and gemm autotune suppor… (fzyzcjy, Sep 20, 2025)
822ae9b  Revert "Enabled alpha with the mx_fp4 format (#1688)" (fzyzcjy, Sep 20, 2025)
299caf5  enable change-thread-block (fzyzcjy, Sep 20, 2025)
d83a3cb  enable mv-unpermuted_row_to_permuted_row (fzyzcjy, Sep 20, 2025)
68b8e6d  Revert "enable mv-unpermuted_row_to_permuted_row" (fzyzcjy, Sep 20, 2025)
57 changes: 42 additions & 15 deletions benchmarks/bench_cutlass_fused_moe.py
@@ -29,20 +29,24 @@
FLOAT4_E2M1_MAX = 6.0
FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max

num_ranks = 2

test_configs = [
# {
# "hidden_size": 7168,
# "num_experts": 256,
# "top_k": 8,
# "intermediate_size": 256,
# },
{
"hidden_size": 7168,
"num_experts": 256,
"top_k": 8,
"intermediate_size": 256,
},
{
"hidden_size": 7168,
"num_experts": 32,
"num_experts": num_experts,
"top_k": 8,
"intermediate_size": 2048,
},
}
for num_experts in [
256 // num_ranks,
]
]


@@ -131,6 +135,13 @@ def bench_cutlass_fused_moe(
router_logits = torch.randn(m, e, dtype=otype).cuda()
routing_weights, selected_experts = compute_routing(router_logits, top_k)

if 1:
print("HACK: mask some selected_experts")
selected_experts[torch.randn(selected_experts.shape) > 1 / num_ranks] = 9999999

tune_max_num_tokens = batch_size
print(f"HACK: {tune_max_num_tokens=}")

Comment on lines +138 to +144 (Contributor, severity: high):
This `if 1:` block appears to be temporary code for debugging and testing, as indicated by the "HACK" print statements. This block, including the hardcoded logic for masking experts and setting tune_max_num_tokens, should be removed before merging.

flash_output = torch.zeros_like(x)

quant_scales = [
@@ -143,6 +154,7 @@
]
hidden_states = x
hidden_states, input_sf = fp4_quantize(x, a1_gs)
print(f"{hidden_states.shape=}")
Contributor comment (severity: medium):
This print statement appears to be for debugging purposes and should be removed before merging.


# Warmup
for _ in range(3):
@@ -156,7 +168,7 @@
quant_scales=quant_scales,
input_sf=input_sf,
output=flash_output,
tune_max_num_tokens=16384,
tune_max_num_tokens=tune_max_num_tokens,
)

if not skip_autotune:
@@ -171,10 +183,20 @@ def bench_cutlass_fused_moe(
quant_scales=quant_scales,
input_sf=input_sf,
output=flash_output,
tune_max_num_tokens=16384,
tune_max_num_tokens=tune_max_num_tokens,
)
ms_list = bench_gpu_time(
lambda: fused_moe.cutlass_fused_moe(

counter = 0

def f():
nonlocal counter
counter += 1

if counter == 10:
print("hi call cudaProfilerStart")
torch.cuda.cudart().cudaProfilerStart()

fused_moe.cutlass_fused_moe(
hidden_states,
selected_experts.to(torch.int),
routing_weights,
Expand All @@ -184,8 +206,13 @@ def bench_cutlass_fused_moe(
quant_scales=quant_scales,
input_sf=input_sf,
output=flash_output,
),
)
)

if counter == 10:
print("hi call cudaProfilerStop")
torch.cuda.cudart().cudaProfilerStop()

ms_list = bench_gpu_time(f)
median_ms = np.median(ms_list)
print(f"{'input':<15} {'weight1':<20} {'weight2':<20} {'time(ms)'}")
print(
@@ -201,7 +228,7 @@
help="Update the config file with the new profiling results",
)
parser.add_argument(
"--num-tokens", type=int, default=32, help="Number of tokens to profile"
"--num-tokens", type=int, default=32768 * num_ranks, help="Number of tokens to profile"
)
parser.add_argument("--skip-autotune", action="store_true", help="Skip autotuning")
args = parser.parse_args()
45 changes: 44 additions & 1 deletion csrc/fused_moe/cutlass_backend/cutlass_fused_moe_kernels.cuh
@@ -865,7 +865,7 @@ void threeStepBuildExpertMapsSortFirstToken(
// ============================== Infer GEMM sizes =================================
// TODO Could linear search be better for small # experts
template <class T>
__device__ inline int64_t findTotalEltsLessThanTarget(T const* sorted_indices,
__device__ inline int64_t findTotalEltsLessThanTarget_v1(T const* sorted_indices,
int64_t const arr_length, T const target) {
int64_t low = 0, high = arr_length - 1, target_location = -1;
while (low <= high) {
@@ -881,6 +881,49 @@ __device__ inline int64_t findTotalEltsLessThanTarget(T const* sorted_indices,
return target_location + 1;
}

template <class T>
__device__ inline int64_t findTotalEltsLessThanTarget_v2(T const* sorted_indices, int64_t const arr_length, T const target) {
constexpr int ARR_LENGTH_CONST = 128;
if (arr_length != ARR_LENGTH_CONST) {
asm("trap;");
}
Comment on lines 886 to 889 (Contributor, severity: critical):
The function findTotalEltsLessThanTarget_v2 has a hardcoded ARR_LENGTH_CONST and uses asm("trap;") if the input array length does not match. This makes the function non-generic and unsafe for general use, as it will cause a crash for any other input size. This experimental implementation should be made more robust or removed if it's not ready for production.


constexpr unsigned full_mask = 0xffffffffu;
constexpr int WARP_SZ = 32;
const int lane_id = threadIdx.x & (WARP_SZ - 1);

int local_count = 0;
#pragma unroll
for (int k = 0; k < ARR_LENGTH_CONST / WARP_SZ; ++k) {
const int idx = lane_id + k * WARP_SZ;
T v = sorted_indices[idx];
local_count += (v < target) ? 1 : 0;
}

#pragma unroll
for (int offset = 16; offset > 0; offset >>= 1) {
local_count += __shfl_down_sync(full_mask, local_count, offset);
}
int total = __shfl_sync(full_mask, local_count, 0);

return (int64_t)total;
}

template <class T>
__device__ inline int64_t findTotalEltsLessThanTarget(T const* sorted_indices, int64_t const arr_length, T const target) {
return findTotalEltsLessThanTarget_v1(sorted_indices, arr_length, target);

// return findTotalEltsLessThanTarget_v2(sorted_indices, arr_length, target);

// int64_t out_v1 = findTotalEltsLessThanTarget_v1(sorted_indices, arr_length, target);
// int64_t out_v2 = findTotalEltsLessThanTarget_v2(sorted_indices, arr_length, target);
// if (out_v1 != out_v2) {
// printf("different output! v1=%lld v2=%lld\n", out_v1, out_v2);
// asm("trap;");
// }
// return out_v1;
}
Comment on lines 913 to 925 (Contributor, severity: high):
This function contains a significant amount of commented-out debug code, including printf statements and an asm("trap;"). This should be cleaned up and removed before merging.
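A possible follow-up to the comment on lines 886 to 889: the warp-parallel counting idea in findTotalEltsLessThanTarget_v2 does not inherently need the hardcoded ARR_LENGTH_CONST or the asm("trap;") guard. Below is a minimal sketch of a length-agnostic variant, offered as an illustration only: it is not code from this PR, the _warp name and the XOR-shuffle reduction are my own choices, and it keeps _v2's assumption that the call is made by a full, converged warp.

// Sketch only (assumption, not part of this PR): the same warp-level counting
// as findTotalEltsLessThanTarget_v2, but striding over an arbitrary arr_length
// so no hardcoded length or trap is required.
template <class T>
__device__ inline int64_t findTotalEltsLessThanTarget_warp(T const* sorted_indices,
                                                           int64_t const arr_length,
                                                           T const target) {
  constexpr unsigned full_mask = 0xffffffffu;
  constexpr int WARP_SZ = 32;
  int const lane_id = threadIdx.x & (WARP_SZ - 1);

  // Each lane counts every 32nd element, so any arr_length is handled,
  // including lengths that are not a multiple of the warp size.
  int local_count = 0;
  for (int64_t idx = lane_id; idx < arr_length; idx += WARP_SZ) {
    local_count += (sorted_indices[idx] < target) ? 1 : 0;
  }

  // Butterfly (XOR) reduction: every lane ends up holding the full sum,
  // so no separate broadcast shuffle is needed.
#pragma unroll
  for (int offset = WARP_SZ / 2; offset > 0; offset >>= 1) {
    local_count += __shfl_xor_sync(full_mask, local_count, offset);
  }
  return static_cast<int64_t>(local_count);
}

Whether this actually beats the _v1 binary search presumably depends on the number of experts and the calling context, which is likely why both variants sit behind the findTotalEltsLessThanTarget dispatcher in this PR.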


template <class T>
using sizeof_bits = cutlass::sizeof_bits<
typename cutlass_kernels::TllmToCutlassTypeAdapter<std::remove_cv_t<T>>::type>;
5 changes: 5 additions & 0 deletions flashinfer/autotuner.py
@@ -447,6 +447,11 @@ def choose_one(
logger.debug(
f"[AutoTunner]: Generated key{AutoTuner._get_cache_key(custom_op, runners[0], input_shapes, tuning_config)}"
)
else:
# NOTE ADD
logger.debug(
f"[AutoTunner]: HACK ADD cache hit {custom_op=} {input_shapes=}"
)
return runner, tactic
Comment on lines +450 to 455 (Contributor, severity: medium):
This else block with the "HACK ADD" debug log statement appears to be temporary and should be removed.


assert len(runners) > 0, "At least one runner is required"
6 changes: 6 additions & 0 deletions flashinfer/fused_moe/core.py
@@ -870,6 +870,12 @@ def cutlass_fused_moe(
if enable_pdl is None:
enable_pdl = device_support_pdl(input.device)

print(
"hi flashinfer cutlass_fused_moe "
f"{input.shape=} {input.dtype=} "
f"{token_selected_experts.shape=}"
)
Contributor comment (severity: medium):
This print statement appears to be for debugging and should be removed before merging.


num_rows = input.shape[0]
if min_latency_mode:
num_rows *= fc2_expert_weights.shape[0]
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -17,11 +17,11 @@ name = "flashinfer-python"
description = "FlashInfer: Kernel Library for LLM Serving"
requires-python = ">=3.9,<4.0"
authors = [{ name = "FlashInfer team" }]
license = "Apache-2.0"
#license = "Apache-2.0"
Contributor comment (severity: critical):
Commenting out the license field is a critical issue. A project's license must be clearly declared in its metadata. Please restore this line.

Suggested change:
-#license = "Apache-2.0"
+license = "Apache-2.0"

readme = "README.md"
urls = { Homepage = "https://github.com/flashinfer-ai/flashinfer" }
dynamic = ["dependencies", "version"]
license-files = ["LICENSE", "licenses/*"]
#license-files = ["LICENSE", "licenses/*"]
Contributor comment (severity: critical):
Commenting out license-files is a critical issue. Please restore this line to ensure license files are included in the package.

Suggested change:
-#license-files = ["LICENSE", "licenses/*"]
+license-files = ["LICENSE", "licenses/*"]


[build-system]
requires = ["setuptools>=77", "packaging>=24"]