Remove getEnvEnablePDL in favor of enable_pdl parameter #1446

Merged · 7 commits · Aug 13, 2025
287 changes: 127 additions & 160 deletions csrc/fused_moe/cutlass_backend/cutlass_fused_moe_kernels.cuh

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -204,7 +204,7 @@ class FusedMoeRunner : public torch::CustomClassHolder {
torch::optional<at::Tensor> const& input_sf, int64_t const tp_size, int64_t const tp_rank,
int64_t const ep_size, int64_t const ep_rank, int64_t const cluster_size,
int64_t const cluster_rank, bool const enable_alltoall, bool min_latency_mode,
torch::optional<c10::ArrayRef<int64_t>> const& profile_ids) {
torch::optional<c10::ArrayRef<int64_t>> const& profile_ids, bool enable_pdl) {
std::lock_guard<std::mutex> lock(mMutex);
// Free the profile workspace to save memory
freeProfileWorkspace();
@@ -315,7 +315,7 @@ class FusedMoeRunner : public torch::CustomClassHolder {
static_cast<char*>(workspace_info.workspace.data_ptr()), output.data_ptr(),
static_cast<int*>(workspace_info.src_to_dest_map), parallelism_config, enable_alltoall,
false, lora_params, mUseDeepSeekFP8BlockScaling, min_latency_mode, min_latency_params,
stream);
enable_pdl, stream);
#else
mKernelRunner->runMoe(
input.const_data_ptr(), input_sf.has_value() ? input_sf.value().const_data_ptr() : nullptr,
@@ -331,7 +331,7 @@
static_cast<int>(experts_per_token), static_cast<char*>(workspace_info.workspace),
output.data_ptr(), static_cast<int*>(workspace_info.src_to_dest_map), parallelism_config,
false, lora_params, mUseDeepSeekFP8BlockScaling, min_latency_mode, min_latency_params,
stream);
enable_pdl, stream);
#endif

return output;
@@ -346,7 +346,7 @@ class FusedMoeRunner : public torch::CustomClassHolder {
torch::optional<at::Tensor> const& input_sf, int64_t const tp_size, int64_t const tp_rank,
int64_t const ep_size, int64_t const ep_rank, int64_t const cluster_size,
int64_t const cluster_rank, bool const enable_alltoall, bool min_latency_mode,
torch::optional<c10::ArrayRef<int64_t>> const& profile_ids) {
torch::optional<c10::ArrayRef<int64_t>> const& profile_ids, bool enable_pdl) {
std::lock_guard<std::mutex> lock(mMutex);

// Free the profile workspace to save memory
@@ -458,7 +458,7 @@ class FusedMoeRunner : public torch::CustomClassHolder {
static_cast<char*>(workspace_info.workspace.data_ptr()), output.data_ptr(),
static_cast<int*>(workspace_info.src_to_dest_map), parallelism_config, enable_alltoall,
false, lora_params, mUseDeepSeekFP8BlockScaling, min_latency_mode, min_latency_params,
stream);
enable_pdl, stream);
#else
mKernelRunner->runMoe(
input.const_data_ptr(), input_sf.has_value() ? input_sf.value().const_data_ptr() : nullptr,
@@ -474,7 +474,7 @@
static_cast<int>(experts_per_token), static_cast<char*>(workspace_info.workspace),
output.data_ptr(), static_cast<int*>(workspace_info.src_to_dest_map), parallelism_config,
false, lora_params, mUseDeepSeekFP8BlockScaling, min_latency_mode, min_latency_params,
stream);
enable_pdl, stream);
#endif

return std::make_tuple(output, num_active_experts_per_node, experts_to_token_score,
@@ -493,7 +493,8 @@ class FusedMoeRunner : public torch::CustomClassHolder {
int64_t const tp_size, int64_t const tp_rank, int64_t const ep_size,
int64_t const ep_rank, int64_t const cluster_size, int64_t const cluster_rank,
bool const enable_alltoall, bool const min_latency_mode,
int64_t const gemm_idx, int64_t const profile_id, bool const do_preparation) {
int64_t const gemm_idx, int64_t const profile_id, bool const do_preparation,
bool enable_pdl) {
std::lock_guard<std::mutex> lock(mMutex);

// TODO: support profiling under fp8 block scaling in the future
@@ -558,11 +559,12 @@
TORCH_CHECK(cu_malloc_status == cudaSuccess,
"Can't allocate profile workspace for MoE GEMM profile.");

mProfiler->prepare(num_rows, mProfileWorkspace, expert_weights_ptr, stream);
mProfiler->prepare(num_rows, mProfileWorkspace, expert_weights_ptr, enable_pdl, stream);
}

// Profile specific tactic. Assuming at least one preparation phase has been executed already.
mProfiler->runProfiler(num_rows, profile, mProfileWorkspace, expert_weights_ptr, stream);
mProfiler->runProfiler(num_rows, profile, mProfileWorkspace, expert_weights_ptr, enable_pdl,
stream);
}

private:
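For context on what the new `enable_pdl` argument controls: Programmatic Dependent Launch (PDL) lets a dependent grid start before the preceding grid has fully finished, with explicit synchronization inside the kernels. A minimal sketch of the device-side half is below; it targets SM 90+ and the kernel names are illustrative, not part of this PR.

```cuda
#include <cuda_runtime.h>

// Producer grid: signal that the next programmatically launched grid may begin.
__global__ void producer_kernel(float* out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = static_cast<float>(i);
  // Tell the runtime this grid has reached the point where dependents may launch.
  cudaTriggerProgrammaticLaunchCompletion();
}

// Consumer grid: may be scheduled early, but waits for the producer's writes.
__global__ void consumer_kernel(float const* in, float* out, int n) {
  cudaGridDependencySynchronize();  // block until the preceding grid's results are visible
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = in[i] + 1.0f;
}
```

The `enable_pdl` flag threaded through `runMoe`, `prepare`, and `runProfiler` above ends up setting the host-side launch attribute that opts kernels into this behavior.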
13 changes: 0 additions & 13 deletions csrc/nv_internal/cpp/common/envUtils.cpp
@@ -189,19 +189,6 @@ bool getEnvUseTileSizeKv64ForTrtllmGen() {
return useTileSizeKv64;
}

bool getEnvEnablePDL() {
static std::once_flag flag;
static bool enablePDL = false;

std::call_once(flag, [&]() {
if (getSMVersion() >= 90) {
// PDL will be enabled by setting the env variables `TRTLLM_ENABLE_PDL` to `1`
enablePDL = getBoolEnv("TRTLLM_ENABLE_PDL");
}
});
return enablePDL;
}

bool getEnvUseUCXKvCache() {
static bool const useUCXKVCache = getBoolEnv("TRTLLM_USE_UCX_KVCACHE");
return useUCXKVCache;
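The removed helper read `TRTLLM_ENABLE_PDL` once per process and only honored it on SM 90 and newer. Callers that still want env-driven behavior can reproduce that decision at their own level and forward it through the new parameter. A minimal sketch under that assumption (the helper name is hypothetical, not part of this PR):

```cpp
#include <cstdlib>
#include <cstring>

// Hypothetical caller-side replacement for the removed getEnvEnablePDL():
// read TRTLLM_ENABLE_PDL once and gate it on the device's compute capability.
static bool resolve_enable_pdl(int sm_version) {
  if (sm_version < 90) return false;  // PDL requires Hopper (SM 90) or newer
  char const* v = std::getenv("TRTLLM_ENABLE_PDL");
  return v != nullptr && std::strcmp(v, "1") == 0;
}
```

The resolved value is then passed down as the new `enable_pdl` argument instead of being looked up inside the kernels themselves.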
57 changes: 33 additions & 24 deletions csrc/nv_internal/cpp/kernels/quantization.cu
@@ -76,7 +76,7 @@ template void invokeQuantization<__nv_bfloat16>(int8_t* dst, __nv_bfloat16 const
template <typename T>
void invokeMxFP8Quantization(int b, int m, int n, int padded_n, T const* input, int64_t* output,
int32_t* SFOuput, FP4QuantizationSFLayout layout,
int multiProcessorCount, cudaStream_t stream) {
int multiProcessorCount, bool enable_pdl, cudaStream_t stream) {
// Fixed SF_VEC_SIZE as 32
static constexpr int SF_VEC_SIZE = 32;

@@ -95,7 +95,7 @@ void invokeMxFP8Quantization(int b, int m, int n, int padded_n, T const* input,
config.stream = stream;
cudaLaunchAttribute attrs[1];
attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
attrs[0].val.programmaticStreamSerializationAllowed = tensorrt_llm::common::getEnvEnablePDL();
attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl;
config.numAttrs = 1;
config.attrs = attrs;
cudaLaunchKernelEx(
@@ -168,7 +168,7 @@ INSTANTIATE_INVOKE_PER_TOKEN_QUANTIZATION(__nv_bfloat16, __nv_fp8_e4m3);
template <typename T, int SF_VEC_SIZE>
void invokeFP4Quantization(int m, int n, T const* input, float const* SFScale, int64_t* output,
int32_t* SFOuput, bool useUE8M0, FP4QuantizationSFLayout layout,
int multiProcessorCount, cudaStream_t stream) {
int multiProcessorCount, bool enable_pdl, cudaStream_t stream) {
#ifdef ENABLE_FP8
if constexpr (std::is_same_v<T, __nv_fp8_e4m3>) {
// Grid, Block size.
@@ -204,7 +204,7 @@ void invokeFP4Quantization(int m, int n, T const* input, float const* SFScale, i
config.stream = stream;
cudaLaunchAttribute attrs[1];
attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
attrs[0].val.programmaticStreamSerializationAllowed = tensorrt_llm::common::getEnvEnablePDL();
attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl;
config.numAttrs = 1;
config.attrs = attrs;
cudaLaunchKernelEx(&config, kernel_instance, m, n, input, SFScale,
@@ -217,7 +217,7 @@ template <typename T, int SF_VEC_SIZE>
void invokeBatchedFP4Quantization(int b, int m, int n, T const* input, float const* SFScale,
int64_t* output, int32_t* SFOuput, bool useUE8M0,
int multiProcessorCount, FP4QuantizationSFLayout layout,
cudaStream_t stream) {
bool enable_pdl, cudaStream_t stream) {
#ifdef ENABLE_FP8
if constexpr (std::is_same_v<T, __nv_fp8_e4m3>) {
// Grid, Block size.
@@ -253,7 +253,7 @@ void invokeBatchedFP4Quantization(int b, int m, int n, T const* input, float con
config.stream = stream;
cudaLaunchAttribute attrs[1];
attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
attrs[0].val.programmaticStreamSerializationAllowed = tensorrt_llm::common::getEnvEnablePDL();
attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl;
config.numAttrs = 1;
config.attrs = attrs;
cudaLaunchKernelEx(&config, kernel_instance, b, m, n, input, SFScale,
@@ -344,47 +344,56 @@ void invokeNVFP4BlockScaleInterleaveReverse(int b, int m, int n, uint8_t const*
template void invokeFP4Quantization<half, 16>(int m, int n, half const* input, float const* SFScale,
int64_t* output, int32_t* SFOuput, bool useUE8M0,
FP4QuantizationSFLayout layout,
int multiProcessorCount, cudaStream_t stream);
int multiProcessorCount, bool enable_pdl,
cudaStream_t stream);
template void invokeFP4Quantization<half, 32>(int m, int n, half const* input, float const* SFScale,
int64_t* output, int32_t* SFOuput, bool useUE8M0,
FP4QuantizationSFLayout layout,
int multiProcessorCount, cudaStream_t stream);
template void invokeBatchedFP4Quantization<half, 16>(
int b, int m, int n, half const* input, float const* SFScale, int64_t* output, int32_t* SFOuput,
bool useUE8M0, int multiProcessorCount, FP4QuantizationSFLayout layout, cudaStream_t stream);
template void invokeBatchedFP4Quantization<half, 32>(
int b, int m, int n, half const* input, float const* SFScale, int64_t* output, int32_t* SFOuput,
bool useUE8M0, int multiProcessorCount, FP4QuantizationSFLayout layout, cudaStream_t stream);
int multiProcessorCount, bool enable_pdl,
cudaStream_t stream);
template void invokeBatchedFP4Quantization<half, 16>(int b, int m, int n, half const* input,
float const* SFScale, int64_t* output,
int32_t* SFOuput, bool useUE8M0,
int multiProcessorCount,
FP4QuantizationSFLayout layout,
bool enable_pdl, cudaStream_t stream);
template void invokeBatchedFP4Quantization<half, 32>(int b, int m, int n, half const* input,
float const* SFScale, int64_t* output,
int32_t* SFOuput, bool useUE8M0,
int multiProcessorCount,
FP4QuantizationSFLayout layout,
bool enable_pdl, cudaStream_t stream);
template void invokeMxFP8Quantization<half>(int b, int m, int n, int padded_n, half const* input,
int64_t* output, int32_t* SFOuput,
FP4QuantizationSFLayout layout, int multiProcessorCount,
cudaStream_t stream);
bool enable_pdl, cudaStream_t stream);
#ifdef ENABLE_BF16
template void invokeFP4Quantization<__nv_bfloat16, 16>(int m, int n, __nv_bfloat16 const* input,
float const* SFScale, int64_t* output,
int32_t* SFOuput, bool useUE8M0,
FP4QuantizationSFLayout layout,
int multiProcessorCount,
int multiProcessorCount, bool enable_pdl,
cudaStream_t stream);
template void invokeFP4Quantization<__nv_bfloat16, 32>(int m, int n, __nv_bfloat16 const* input,
float const* SFScale, int64_t* output,
int32_t* SFOuput, bool useUE8M0,
FP4QuantizationSFLayout layout,
int multiProcessorCount,
int multiProcessorCount, bool enable_pdl,
cudaStream_t stream);
template void invokeBatchedFP4Quantization<__nv_bfloat16, 16>(
int b, int m, int n, __nv_bfloat16 const* input, float const* SFScale, int64_t* output,
int32_t* SFOuput, bool useUE8M0, int multiProcessorCount, FP4QuantizationSFLayout layout,
cudaStream_t stream);
bool enable_pdl, cudaStream_t stream);
template void invokeBatchedFP4Quantization<__nv_bfloat16, 32>(
int b, int m, int n, __nv_bfloat16 const* input, float const* SFScale, int64_t* output,
int32_t* SFOuput, bool useUE8M0, int multiProcessorCount, FP4QuantizationSFLayout layout,
cudaStream_t stream);
bool enable_pdl, cudaStream_t stream);
template void invokeMxFP8Quantization<__nv_bfloat16>(int b, int m, int n, int padded_n,
__nv_bfloat16 const* input, int64_t* output,
int32_t* SFOuput,
FP4QuantizationSFLayout layout,
int multiProcessorCount, cudaStream_t stream);
int multiProcessorCount, bool enable_pdl,
cudaStream_t stream);

#endif

@@ -393,22 +402,22 @@ template void invokeFP4Quantization<__nv_fp8_e4m3, 16>(int m, int n, __nv_fp8_e4
float const* SFScale, int64_t* output,
int32_t* SFOuput, bool useUE8M0,
FP4QuantizationSFLayout layout,
int multiProcessorCount,
int multiProcessorCount, bool enable_pdl,
cudaStream_t stream);
template void invokeFP4Quantization<__nv_fp8_e4m3, 32>(int m, int n, __nv_fp8_e4m3 const* input,
float const* SFScale, int64_t* output,
int32_t* SFOuput, bool useUE8M0,
FP4QuantizationSFLayout layout,
int multiProcessorCount,
int multiProcessorCount, bool enable_pdl,
cudaStream_t stream);
template void invokeBatchedFP4Quantization<__nv_fp8_e4m3, 16>(
int b, int m, int n, __nv_fp8_e4m3 const* input, float const* SFScale, int64_t* output,
int32_t* SFOuput, bool useUE8M0, int multiProcessorCount, FP4QuantizationSFLayout layout,
cudaStream_t stream);
bool enable_pdl, cudaStream_t stream);
template void invokeBatchedFP4Quantization<__nv_fp8_e4m3, 32>(
int b, int m, int n, __nv_fp8_e4m3 const* input, float const* SFScale, int64_t* output,
int32_t* SFOuput, bool useUE8M0, int multiProcessorCount, FP4QuantizationSFLayout layout,
cudaStream_t stream);
bool enable_pdl, cudaStream_t stream);
#endif

////////////////////////////////////////////////////////////////////////////////////////////////////
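All three launch sites in `quantization.cu` now take `enable_pdl` and feed it into the same `cudaLaunchKernelEx` attribute mechanism shown in the hunks above. A self-contained sketch of that host-side pattern (the kernel and launch dimensions are placeholders, not the PR's kernels):

```cuda
#include <cuda_runtime.h>

__global__ void scale_kernel(float* data, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= 2.0f;
}

void launch_with_pdl(float* data, int n, bool enable_pdl, cudaStream_t stream) {
  cudaLaunchConfig_t config = {};
  config.gridDim = dim3((n + 255) / 256);
  config.blockDim = dim3(256);
  config.dynamicSmemBytes = 0;
  config.stream = stream;

  // Opt this launch into Programmatic Dependent Launch when requested.
  cudaLaunchAttribute attrs[1];
  attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
  attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl;
  config.attrs = attrs;
  config.numAttrs = 1;

  cudaLaunchKernelEx(&config, scale_kernel, data, n);
}
```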
3 changes: 0 additions & 3 deletions csrc/nv_internal/tensorrt_llm/common/envUtils.h
@@ -48,9 +48,6 @@ int getEnvMmhaBlocksPerSequence();

int getEnvMmhaKernelBlockSize();

// Whether PDL is enabled.
bool getEnvEnablePDL();

bool getEnvUseUCXKvCache();

bool getEnvUseMPIKvCache();
Original file line number Diff line number Diff line change
@@ -226,6 +226,9 @@ struct TmaWarpSpecializedGroupedGemmInput {
uint8_t* gemm_workspace = nullptr;
size_t gemm_workspace_size = 0;

// Whether to enable PDL (Programmatic Dependent Launch).
bool enable_pdl;

static std::array<size_t, 17> workspaceBuffers(int num_experts, FpXBlockScalingType scaling_type);

static size_t workspaceSize(int num_experts, FpXBlockScalingType scaling_type);
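The new `enable_pdl` member on `TmaWarpSpecializedGroupedGemmInput` carries the flag into the grouped GEMM launch path. A hedged sketch of how it would be populated (the surrounding setup is illustrative and not taken from this PR):

```cpp
// Illustrative wiring only: forward the runner-level flag into the grouped GEMM
// input so the launch path can set the PDL launch attribute.
TmaWarpSpecializedGroupedGemmInput gemm_input{};
// ... fill pointers, strides, and workspace fields as usual ...
gemm_input.enable_pdl = enable_pdl;  // new field introduced by this PR
```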