Skip to content

Commit 3628a54

Browse files
yongww and wcyx-6 authored
Remove getEnvEnablePDL in favor of enable_pdl parameter (#1446)
<!-- .github/pull_request_template.md --> ## 📌 Description <!-- What does this PR do? Briefly describe the changes and why they’re needed. --> ## 🔍 Related Issues <!-- Link any related issues here --> ## 🚀 Pull Request Checklist Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete. ### ✅ Pre-commit Checks - [ ] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method). - [ ] I have installed the hooks with `pre-commit install`. - [ ] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues. > If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/). ## 🧪 Tests - [ ] Tests have been added or updated as needed. - [ ] All tests are passing (`unittest`, etc.). ## Reviewer Notes <!-- Optional: anything you'd like reviewers to focus on, concerns, etc. --> --------- Co-authored-by: Yaxing Cai <[email protected]>
1 parent 0305341 commit 3628a54

25 files changed

+356
-323
lines changed

csrc/fused_moe/cutlass_backend/cutlass_fused_moe_kernels.cuh

Lines changed: 127 additions & 160 deletions
Large diffs are not rendered by default.

csrc/fused_moe/cutlass_backend/flashinfer_cutlass_fused_moe_sm100_ops.cu

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -204,7 +204,7 @@ class FusedMoeRunner : public torch::CustomClassHolder {
204204
torch::optional<at::Tensor> const& input_sf, int64_t const tp_size, int64_t const tp_rank,
205205
int64_t const ep_size, int64_t const ep_rank, int64_t const cluster_size,
206206
int64_t const cluster_rank, bool const enable_alltoall, bool min_latency_mode,
207-
torch::optional<c10::ArrayRef<int64_t>> const& profile_ids) {
207+
torch::optional<c10::ArrayRef<int64_t>> const& profile_ids, bool enable_pdl) {
208208
std::lock_guard<std::mutex> lock(mMutex);
209209
// Free the profile workspace to save memory
210210
freeProfileWorkspace();
@@ -315,7 +315,7 @@ class FusedMoeRunner : public torch::CustomClassHolder {
315315
static_cast<char*>(workspace_info.workspace.data_ptr()), output.data_ptr(),
316316
static_cast<int*>(workspace_info.src_to_dest_map), parallelism_config, enable_alltoall,
317317
false, lora_params, mUseDeepSeekFP8BlockScaling, min_latency_mode, min_latency_params,
318-
stream);
318+
enable_pdl, stream);
319319
#else
320320
mKernelRunner->runMoe(
321321
input.const_data_ptr(), input_sf.has_value() ? input_sf.value().const_data_ptr() : nullptr,
@@ -331,7 +331,7 @@ class FusedMoeRunner : public torch::CustomClassHolder {
331331
static_cast<int>(experts_per_token), static_cast<char*>(workspace_info.workspace),
332332
output.data_ptr(), static_cast<int*>(workspace_info.src_to_dest_map), parallelism_config,
333333
false, lora_params, mUseDeepSeekFP8BlockScaling, min_latency_mode, min_latency_params,
334-
stream);
334+
enable_pdl, stream);
335335
#endif
336336

337337
return output;
@@ -346,7 +346,7 @@ class FusedMoeRunner : public torch::CustomClassHolder {
346346
torch::optional<at::Tensor> const& input_sf, int64_t const tp_size, int64_t const tp_rank,
347347
int64_t const ep_size, int64_t const ep_rank, int64_t const cluster_size,
348348
int64_t const cluster_rank, bool const enable_alltoall, bool min_latency_mode,
349-
torch::optional<c10::ArrayRef<int64_t>> const& profile_ids) {
349+
torch::optional<c10::ArrayRef<int64_t>> const& profile_ids, bool enable_pdl) {
350350
std::lock_guard<std::mutex> lock(mMutex);
351351

352352
// Free the profile workspace to save memory
@@ -458,7 +458,7 @@ class FusedMoeRunner : public torch::CustomClassHolder {
458458
static_cast<char*>(workspace_info.workspace.data_ptr()), output.data_ptr(),
459459
static_cast<int*>(workspace_info.src_to_dest_map), parallelism_config, enable_alltoall,
460460
false, lora_params, mUseDeepSeekFP8BlockScaling, min_latency_mode, min_latency_params,
461-
stream);
461+
enable_pdl, stream);
462462
#else
463463
mKernelRunner->runMoe(
464464
input.const_data_ptr(), input_sf.has_value() ? input_sf.value().const_data_ptr() : nullptr,
@@ -474,7 +474,7 @@ class FusedMoeRunner : public torch::CustomClassHolder {
474474
static_cast<int>(experts_per_token), static_cast<char*>(workspace_info.workspace),
475475
output.data_ptr(), static_cast<int*>(workspace_info.src_to_dest_map), parallelism_config,
476476
false, lora_params, mUseDeepSeekFP8BlockScaling, min_latency_mode, min_latency_params,
477-
stream);
477+
enable_pdl, stream);
478478
#endif
479479

480480
return std::make_tuple(output, num_active_experts_per_node, experts_to_token_score,
@@ -493,7 +493,8 @@ class FusedMoeRunner : public torch::CustomClassHolder {
493493
int64_t const tp_size, int64_t const tp_rank, int64_t const ep_size,
494494
int64_t const ep_rank, int64_t const cluster_size, int64_t const cluster_rank,
495495
bool const enable_alltoall, bool const min_latency_mode,
496-
int64_t const gemm_idx, int64_t const profile_id, bool const do_preparation) {
496+
int64_t const gemm_idx, int64_t const profile_id, bool const do_preparation,
497+
bool enable_pdl) {
497498
std::lock_guard<std::mutex> lock(mMutex);
498499

499500
// TODO: support profiling under fp8 block scaling in the future
@@ -558,11 +559,12 @@ class FusedMoeRunner : public torch::CustomClassHolder {
558559
TORCH_CHECK(cu_malloc_status == cudaSuccess,
559560
"Can't allocate profile workspace for MoE GEMM profile.");
560561

561-
mProfiler->prepare(num_rows, mProfileWorkspace, expert_weights_ptr, stream);
562+
mProfiler->prepare(num_rows, mProfileWorkspace, expert_weights_ptr, enable_pdl, stream);
562563
}
563564

564565
// Profile specific tactic. Assuming at least one preparation phase has been executed already.
565-
mProfiler->runProfiler(num_rows, profile, mProfileWorkspace, expert_weights_ptr, stream);
566+
mProfiler->runProfiler(num_rows, profile, mProfileWorkspace, expert_weights_ptr, enable_pdl,
567+
stream);
566568
}
567569

568570
private:

csrc/nv_internal/cpp/common/envUtils.cpp

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -189,19 +189,6 @@ bool getEnvUseTileSizeKv64ForTrtllmGen() {
189189
return useTileSizeKv64;
190190
}
191191

192-
bool getEnvEnablePDL() {
193-
static std::once_flag flag;
194-
static bool enablePDL = false;
195-
196-
std::call_once(flag, [&]() {
197-
if (getSMVersion() >= 90) {
198-
// PDL will be enabled by setting the env variables `TRTLLM_ENABLE_PDL` to `1`
199-
enablePDL = getBoolEnv("TRTLLM_ENABLE_PDL");
200-
}
201-
});
202-
return enablePDL;
203-
}
204-
205192
bool getEnvUseUCXKvCache() {
206193
static bool const useUCXKVCache = getBoolEnv("TRTLLM_USE_UCX_KVCACHE");
207194
return useUCXKVCache;

csrc/nv_internal/cpp/kernels/quantization.cu

Lines changed: 33 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ template void invokeQuantization<__nv_bfloat16>(int8_t* dst, __nv_bfloat16 const
7676
template <typename T>
7777
void invokeMxFP8Quantization(int b, int m, int n, int padded_n, T const* input, int64_t* output,
7878
int32_t* SFOuput, FP4QuantizationSFLayout layout,
79-
int multiProcessorCount, cudaStream_t stream) {
79+
int multiProcessorCount, bool enable_pdl, cudaStream_t stream) {
8080
// Fixed SF_VEC_SIZE as 32
8181
static constexpr int SF_VEC_SIZE = 32;
8282

@@ -95,7 +95,7 @@ void invokeMxFP8Quantization(int b, int m, int n, int padded_n, T const* input,
9595
config.stream = stream;
9696
cudaLaunchAttribute attrs[1];
9797
attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
98-
attrs[0].val.programmaticStreamSerializationAllowed = tensorrt_llm::common::getEnvEnablePDL();
98+
attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl;
9999
config.numAttrs = 1;
100100
config.attrs = attrs;
101101
cudaLaunchKernelEx(
@@ -168,7 +168,7 @@ INSTANTIATE_INVOKE_PER_TOKEN_QUANTIZATION(__nv_bfloat16, __nv_fp8_e4m3);
168168
template <typename T, int SF_VEC_SIZE>
169169
void invokeFP4Quantization(int m, int n, T const* input, float const* SFScale, int64_t* output,
170170
int32_t* SFOuput, bool useUE8M0, FP4QuantizationSFLayout layout,
171-
int multiProcessorCount, cudaStream_t stream) {
171+
int multiProcessorCount, bool enable_pdl, cudaStream_t stream) {
172172
#ifdef ENABLE_FP8
173173
if constexpr (std::is_same_v<T, __nv_fp8_e4m3>) {
174174
// Grid, Block size.
@@ -204,7 +204,7 @@ void invokeFP4Quantization(int m, int n, T const* input, float const* SFScale, i
204204
config.stream = stream;
205205
cudaLaunchAttribute attrs[1];
206206
attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
207-
attrs[0].val.programmaticStreamSerializationAllowed = tensorrt_llm::common::getEnvEnablePDL();
207+
attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl;
208208
config.numAttrs = 1;
209209
config.attrs = attrs;
210210
cudaLaunchKernelEx(&config, kernel_instance, m, n, input, SFScale,
@@ -217,7 +217,7 @@ template <typename T, int SF_VEC_SIZE>
217217
void invokeBatchedFP4Quantization(int b, int m, int n, T const* input, float const* SFScale,
218218
int64_t* output, int32_t* SFOuput, bool useUE8M0,
219219
int multiProcessorCount, FP4QuantizationSFLayout layout,
220-
cudaStream_t stream) {
220+
bool enable_pdl, cudaStream_t stream) {
221221
#ifdef ENABLE_FP8
222222
if constexpr (std::is_same_v<T, __nv_fp8_e4m3>) {
223223
// Grid, Block size.
@@ -253,7 +253,7 @@ void invokeBatchedFP4Quantization(int b, int m, int n, T const* input, float con
253253
config.stream = stream;
254254
cudaLaunchAttribute attrs[1];
255255
attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
256-
attrs[0].val.programmaticStreamSerializationAllowed = tensorrt_llm::common::getEnvEnablePDL();
256+
attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl;
257257
config.numAttrs = 1;
258258
config.attrs = attrs;
259259
cudaLaunchKernelEx(&config, kernel_instance, b, m, n, input, SFScale,
@@ -344,47 +344,56 @@ void invokeNVFP4BlockScaleInterleaveReverse(int b, int m, int n, uint8_t const*
344344
template void invokeFP4Quantization<half, 16>(int m, int n, half const* input, float const* SFScale,
345345
int64_t* output, int32_t* SFOuput, bool useUE8M0,
346346
FP4QuantizationSFLayout layout,
347-
int multiProcessorCount, cudaStream_t stream);
347+
int multiProcessorCount, bool enable_pdl,
348+
cudaStream_t stream);
348349
template void invokeFP4Quantization<half, 32>(int m, int n, half const* input, float const* SFScale,
349350
int64_t* output, int32_t* SFOuput, bool useUE8M0,
350351
FP4QuantizationSFLayout layout,
351-
int multiProcessorCount, cudaStream_t stream);
352-
template void invokeBatchedFP4Quantization<half, 16>(
353-
int b, int m, int n, half const* input, float const* SFScale, int64_t* output, int32_t* SFOuput,
354-
bool useUE8M0, int multiProcessorCount, FP4QuantizationSFLayout layout, cudaStream_t stream);
355-
template void invokeBatchedFP4Quantization<half, 32>(
356-
int b, int m, int n, half const* input, float const* SFScale, int64_t* output, int32_t* SFOuput,
357-
bool useUE8M0, int multiProcessorCount, FP4QuantizationSFLayout layout, cudaStream_t stream);
352+
int multiProcessorCount, bool enable_pdl,
353+
cudaStream_t stream);
354+
template void invokeBatchedFP4Quantization<half, 16>(int b, int m, int n, half const* input,
355+
float const* SFScale, int64_t* output,
356+
int32_t* SFOuput, bool useUE8M0,
357+
int multiProcessorCount,
358+
FP4QuantizationSFLayout layout,
359+
bool enable_pdl, cudaStream_t stream);
360+
template void invokeBatchedFP4Quantization<half, 32>(int b, int m, int n, half const* input,
361+
float const* SFScale, int64_t* output,
362+
int32_t* SFOuput, bool useUE8M0,
363+
int multiProcessorCount,
364+
FP4QuantizationSFLayout layout,
365+
bool enable_pdl, cudaStream_t stream);
358366
template void invokeMxFP8Quantization<half>(int b, int m, int n, int padded_n, half const* input,
359367
int64_t* output, int32_t* SFOuput,
360368
FP4QuantizationSFLayout layout, int multiProcessorCount,
361-
cudaStream_t stream);
369+
bool enable_pdl, cudaStream_t stream);
362370
#ifdef ENABLE_BF16
363371
template void invokeFP4Quantization<__nv_bfloat16, 16>(int m, int n, __nv_bfloat16 const* input,
364372
float const* SFScale, int64_t* output,
365373
int32_t* SFOuput, bool useUE8M0,
366374
FP4QuantizationSFLayout layout,
367-
int multiProcessorCount,
375+
int multiProcessorCount, bool enable_pdl,
368376
cudaStream_t stream);
369377
template void invokeFP4Quantization<__nv_bfloat16, 32>(int m, int n, __nv_bfloat16 const* input,
370378
float const* SFScale, int64_t* output,
371379
int32_t* SFOuput, bool useUE8M0,
372380
FP4QuantizationSFLayout layout,
373-
int multiProcessorCount,
381+
int multiProcessorCount, bool enable_pdl,
374382
cudaStream_t stream);
375383
template void invokeBatchedFP4Quantization<__nv_bfloat16, 16>(
376384
int b, int m, int n, __nv_bfloat16 const* input, float const* SFScale, int64_t* output,
377385
int32_t* SFOuput, bool useUE8M0, int multiProcessorCount, FP4QuantizationSFLayout layout,
378-
cudaStream_t stream);
386+
bool enable_pdl, cudaStream_t stream);
379387
template void invokeBatchedFP4Quantization<__nv_bfloat16, 32>(
380388
int b, int m, int n, __nv_bfloat16 const* input, float const* SFScale, int64_t* output,
381389
int32_t* SFOuput, bool useUE8M0, int multiProcessorCount, FP4QuantizationSFLayout layout,
382-
cudaStream_t stream);
390+
bool enable_pdl, cudaStream_t stream);
383391
template void invokeMxFP8Quantization<__nv_bfloat16>(int b, int m, int n, int padded_n,
384392
__nv_bfloat16 const* input, int64_t* output,
385393
int32_t* SFOuput,
386394
FP4QuantizationSFLayout layout,
387-
int multiProcessorCount, cudaStream_t stream);
395+
int multiProcessorCount, bool enable_pdl,
396+
cudaStream_t stream);
388397

389398
#endif
390399

@@ -393,22 +402,22 @@ template void invokeFP4Quantization<__nv_fp8_e4m3, 16>(int m, int n, __nv_fp8_e4
393402
float const* SFScale, int64_t* output,
394403
int32_t* SFOuput, bool useUE8M0,
395404
FP4QuantizationSFLayout layout,
396-
int multiProcessorCount,
405+
int multiProcessorCount, bool enable_pdl,
397406
cudaStream_t stream);
398407
template void invokeFP4Quantization<__nv_fp8_e4m3, 32>(int m, int n, __nv_fp8_e4m3 const* input,
399408
float const* SFScale, int64_t* output,
400409
int32_t* SFOuput, bool useUE8M0,
401410
FP4QuantizationSFLayout layout,
402-
int multiProcessorCount,
411+
int multiProcessorCount, bool enable_pdl,
403412
cudaStream_t stream);
404413
template void invokeBatchedFP4Quantization<__nv_fp8_e4m3, 16>(
405414
int b, int m, int n, __nv_fp8_e4m3 const* input, float const* SFScale, int64_t* output,
406415
int32_t* SFOuput, bool useUE8M0, int multiProcessorCount, FP4QuantizationSFLayout layout,
407-
cudaStream_t stream);
416+
bool enable_pdl, cudaStream_t stream);
408417
template void invokeBatchedFP4Quantization<__nv_fp8_e4m3, 32>(
409418
int b, int m, int n, __nv_fp8_e4m3 const* input, float const* SFScale, int64_t* output,
410419
int32_t* SFOuput, bool useUE8M0, int multiProcessorCount, FP4QuantizationSFLayout layout,
411-
cudaStream_t stream);
420+
bool enable_pdl, cudaStream_t stream);
412421
#endif
413422

414423
////////////////////////////////////////////////////////////////////////////////////////////////////

csrc/nv_internal/tensorrt_llm/common/envUtils.h

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,9 +48,6 @@ int getEnvMmhaBlocksPerSequence();
4848

4949
int getEnvMmhaKernelBlockSize();
5050

51-
// Whether PDL is enabled.
52-
bool getEnvEnablePDL();
53-
5451
bool getEnvUseUCXKvCache();
5552

5653
bool getEnvUseMPIKvCache();

csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/include/moe_gemm_kernels.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,9 @@ struct TmaWarpSpecializedGroupedGemmInput {
226226
uint8_t* gemm_workspace = nullptr;
227227
size_t gemm_workspace_size = 0;
228228

229+
// Whether to enable PDL (Programmatic Dependent Launch).
230+
bool enable_pdl;
231+
229232
static std::array<size_t, 17> workspaceBuffers(int num_experts, FpXBlockScalingType scaling_type);
230233

231234
static size_t workspaceSize(int num_experts, FpXBlockScalingType scaling_type);

0 commit comments

Comments (0)