Skip to content

Commit 4b6b292

Browse files
committed
MaxActiveBlocks
Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
1 parent 1c92af1 commit 4b6b292

File tree

4 files changed

+49
-24
lines changed

4 files changed

+49
-24
lines changed

cpp/include/tensorrt_llm/common/cudaUtils.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
#include <cstdint>
#include <map>
#include <mutex>
#include <optional>
#include <sstream>
#include <string>
#include <tuple>
#include <unordered_map>
#ifndef _WIN32 // Linux
#include <sys/sysinfo.h>
#endif // not WIN32
@@ -432,6 +433,24 @@ inline int getMaxSharedMemoryPerBlockOptin()
432433
return nByteMaxSharedMemoryPerBlockOptin;
433434
}
434435

436+
template <typename T>
437+
inline int getMaxActiveBlocksPerSM(T kernel, int blockSize, size_t dynamicSMemSize)
438+
{
439+
static std::unordered_map<T, int> cache;
440+
auto it = cache.find(kernel);
441+
if (it == cache.end())
442+
{
443+
int numBlocks;
444+
check_cuda_error(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, kernel, blockSize, dynamicSMemSize));
445+
cache[kernel] = numBlocks;
446+
return numBlocks;
447+
}
448+
else
449+
{
450+
return it->second;
451+
}
452+
}
453+
435454
template <typename T1, typename T2>
436455
inline size_t divUp(T1 const& a, T2 const& b)
437456
{

cpp/tensorrt_llm/kernels/cuteDslKernels/moeUtils.cu

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -141,12 +141,12 @@ void moePermute(InputType const* input, InputType* permuted_output, SFType const
141141
}
142142
#endif
143143

144+
auto kernel = &moePermuteKernel<InputType, SFType, kSFVecSize, kThreadsPerBlock>;
144145
static int32_t const smCount = tensorrt_llm::common::getMultiProcessorCount();
145-
int32_t const blocks = std::min(smCount * 8, max_num_permuted_tokens);
146+
int32_t const maxBlocksPerSM = tensorrt_llm::common::getMaxActiveBlocksPerSM(kernel, kThreadsPerBlock, 0);
147+
int32_t const blocks = std::min(smCount * maxBlocksPerSM, max_num_permuted_tokens);
146148
int32_t const threads = kThreadsPerBlock;
147149

148-
auto kernel = &moePermuteKernel<InputType, SFType, kSFVecSize, kThreadsPerBlock>;
149-
150150
cudaLaunchConfig_t config;
151151
config.gridDim = blocks;
152152
config.blockDim = threads;
@@ -382,10 +382,6 @@ void moeActivation(InputType const* input, OutputType* output, float const* glob
382382
}
383383
#endif
384384

385-
static int32_t const smCount = tensorrt_llm::common::getMultiProcessorCount();
386-
int32_t const blocks = std::min(smCount * 8, max_num_permuted_tokens);
387-
int32_t const threads = kThreadsPerBlock;
388-
389385
auto get_act_kernel = [](ActivationType activation_type) -> void (*)(InputType const* input, OutputType* output,
390386
float const* global_sf, SFType* output_sf,
391387
int32_t const* tile_idx_to_mn_limit,
@@ -424,6 +420,11 @@ void moeActivation(InputType const* input, OutputType* output, float const* glob
424420
};
425421
auto kernel = get_act_kernel(activation_params.activation_type);
426422

423+
static int32_t const smCount = tensorrt_llm::common::getMultiProcessorCount();
424+
int32_t const maxBlocksPerSM = tensorrt_llm::common::getMaxActiveBlocksPerSM(kernel, kThreadsPerBlock, 0);
425+
int32_t const blocks = std::min(smCount * maxBlocksPerSM, max_num_permuted_tokens);
426+
int32_t const threads = kThreadsPerBlock;
427+
427428
cudaLaunchConfig_t config;
428429
config.gridDim = blocks;
429430
config.blockDim = threads;

cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cu

Lines changed: 21 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1587,11 +1587,6 @@ void expandInputRowsKernelLauncher(InputActivationsType const* unpermuted_input,
15871587
int64_t num_padding_tokens = 0;
15881588
#endif
15891589

1590-
static int64_t const smCount = tensorrt_llm::common::getMultiProcessorCount();
1591-
// Note: Launching 8 blocks per SM can fully leverage the memory bandwidth (tested on B200).
1592-
int64_t const blocks = std::min(smCount * 8, std::max(num_rows * k, num_padding_tokens));
1593-
int64_t const threads = EXPAND_THREADS_PER_BLOCK;
1594-
15951590
auto func = [&]()
15961591
{
15971592
#ifdef ENABLE_FP8
@@ -1637,6 +1632,12 @@ void expandInputRowsKernelLauncher(InputActivationsType const* unpermuted_input,
16371632
}
16381633
}();
16391634

1635+
static int32_t const smCount = tensorrt_llm::common::getMultiProcessorCount();
1636+
int32_t const maxBlocksPerSM = tensorrt_llm::common::getMaxActiveBlocksPerSM(func, EXPAND_THREADS_PER_BLOCK, 0);
1637+
int32_t const blocks
1638+
= std::min(smCount * maxBlocksPerSM, static_cast<int32_t>(std::max(num_rows * k, num_padding_tokens)));
1639+
int32_t const threads = EXPAND_THREADS_PER_BLOCK;
1640+
16401641
cudaLaunchConfig_t config;
16411642
config.gridDim = blocks;
16421643
config.blockDim = threads;
@@ -1891,15 +1892,18 @@ void finalizeMoeRoutingKernelLauncher(GemmOutputType const* expanded_permuted_ro
18911892
if (parallelism_config.ep_size > 1 && enable_alltoall)
18921893
{
18931894
// If all-to-all comm is enabled, finalizeMoeRouting doesn't need to fill the invalid output tokens with zeros.
1894-
static int const smCount = tensorrt_llm::common::getMultiProcessorCount();
1895-
// Note: Launching 8 blocks per SM can fully leverage the memory bandwidth (tested on B200).
1896-
int64_t const blocks = smCount * 8;
1897-
int64_t const threads = FINALIZE_THREADS_PER_BLOCK;
1898-
config.gridDim = blocks;
1899-
config.blockDim = threads;
19001895
auto func = final_scales
19011896
? &finalizeMoeRoutingNoFillingKernel<OutputType, GemmOutputType, ScaleBiasType, ScaleMode::DEFAULT>
19021897
: &finalizeMoeRoutingNoFillingKernel<OutputType, GemmOutputType, ScaleBiasType, ScaleMode::NO_SCALE>;
1898+
1899+
static int32_t const smCount = tensorrt_llm::common::getMultiProcessorCount();
1900+
int32_t const maxBlocksPerSM
1901+
= tensorrt_llm::common::getMaxActiveBlocksPerSM(func, FINALIZE_THREADS_PER_BLOCK, 0);
1902+
int32_t const blocks = std::min(smCount * maxBlocksPerSM, static_cast<int32_t>(num_rows * experts_per_token));
1903+
int32_t const threads = FINALIZE_THREADS_PER_BLOCK;
1904+
1905+
config.gridDim = blocks;
1906+
config.blockDim = threads;
19031907
cudaLaunchKernelEx(&config, func, expanded_permuted_rows, reduced_unpermuted_output, bias_ptr, final_scales,
19041908
unpermuted_row_to_permuted_row, permuted_row_to_unpermuted_row, token_selected_experts,
19051909
expert_first_token_offset, num_rows, padded_cols, unpadded_cols, experts_per_token, num_experts_per_node,
@@ -2235,11 +2239,6 @@ void doActivation(T* output, GemmOutputType const* gemm_result, float const* fp8
22352239
int64_t num_padding_tokens = 0;
22362240
#endif
22372241

2238-
static int64_t const smCount = tensorrt_llm::common::getMultiProcessorCount();
2239-
// Note: Launching 8 blocks per SM can fully leverage the memory bandwidth (tested on B200).
2240-
int64_t const blocks = std::min(smCount * 8, std::max(expanded_num_tokens, num_padding_tokens));
2241-
int64_t const threads = ACTIVATION_THREADS_PER_BLOCK;
2242-
22432242
auto fn = [&]()
22442243
{
22452244
// IMPORTANT: Keep the order of the activation functions in the same order as the ActivationType enum in
@@ -2302,6 +2301,12 @@ void doActivation(T* output, GemmOutputType const* gemm_result, float const* fp8
23022301
}
23032302
}();
23042303

2304+
static int32_t const smCount = tensorrt_llm::common::getMultiProcessorCount();
2305+
int32_t const maxBlocksPerSM = tensorrt_llm::common::getMaxActiveBlocksPerSM(fn, ACTIVATION_THREADS_PER_BLOCK, 0);
2306+
int32_t const blocks
2307+
= std::min(smCount * maxBlocksPerSM, static_cast<int32_t>(std::max(expanded_num_tokens, num_padding_tokens)));
2308+
int32_t const threads = ACTIVATION_THREADS_PER_BLOCK;
2309+
23052310
cudaLaunchConfig_t config;
23062311
config.gridDim = blocks;
23072312
config.blockDim = threads;

tensorrt_llm/_torch/autotuner.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -727,7 +727,7 @@ def choose_one(
727727
# Log the cache miss. Expect no cache miss in inference.
728728
if not is_cache_hit:
729729
logger.warning_once(
730-
f"[AutoTunner] Using the fallback tactic, due to cache miss on input shapes={input_shapes}",
730+
f"[AutoTunner] {custom_op} using the fallback tactic, due to cache miss on input shapes={input_shapes}",
731731
key=(custom_op, "warning_autotuning_cache_miss_fallback"))
732732

733733
return (best_runner, best_tactic)

0 commit comments

Comments
 (0)