[https://nvbugs/5690172][fix] Fix Qwen3-235B ATP accuracy issue with PDL (#9530)

syuoni · web-flow · commit 34e2fa5c964d · 2025-12-01T09:10:21.000+08:00
Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
diff --git a/cpp/tensorrt_llm/kernels/customMoeRoutingKernels.cu b/cpp/tensorrt_llm/kernels/customMoeRoutingKernels.cu
@@ -120,6 +120,11 @@ __global__ void customMoeRoutingKernel(InputT* routerLogits, OutputT* topkValues
     auto warp = cg::tiled_partition<WARP_SIZE>(block);
 
     BaseType minScore = BaseType{-INFINITY};
+
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    cudaGridDependencySynchronize();
+#endif
+
     for (uint32_t tokenId = warpIdx; tokenId < numTokens; tokenId += warpNum)
     {
         auto scoreOffset = tokenId * numExperts;
@@ -168,6 +173,10 @@ __global__ void customMoeRoutingKernel(InputT* routerLogits, OutputT* topkValues
             }
         }
     } // end for tokenId
+
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    cudaTriggerProgrammaticLaunchCompletion();
+#endif
 }
 
 int nextPowerOfTwo(int num)
diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/CMakeLists.txt b/cpp/tensorrt_llm/kernels/cutlass_kernels/CMakeLists.txt
@@ -66,6 +66,8 @@ function(process_target target_name enable_hopper enable_blackwell)
   if(${enable_hopper} AND "90" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG)
     # No kernels should be parsed, unless hopper is specified. This is a build
     # time improvement
+    target_compile_options(${target_name}
+                           PRIVATE "-DCUTLASS_ENABLE_GDC_FOR_SM90=1")
     target_compile_definitions(${target_name} PUBLIC COMPILE_HOPPER_TMA_GEMMS)
     target_compile_definitions(${target_name}
                                PUBLIC COMPILE_HOPPER_TMA_GROUPED_GEMMS)
@@ -78,6 +80,8 @@ function(process_target target_name enable_hopper enable_blackwell)
           OR "121" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG
          ))
 
+    target_compile_options(${target_name}
+                           PRIVATE "-DCUTLASS_ENABLE_GDC_FOR_SM100=1")
     # Both 100 and 103 support these kernels
     if("100" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG
        OR "103" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG)
diff --git a/examples/disaggregated/slurm/benchmark/accuracy_eval.sh b/examples/disaggregated/slurm/benchmark/accuracy_eval.sh
@@ -7,6 +7,7 @@ accuracy_model=${2}
 accuracy_tasks=${3}
 model_path=${4}
 model_args_extra=${5}
+output_dir=${6}
 
 echo "Starting accuracy evaluation..."
 echo "Log directory: ${full_logdir}"
@@ -31,9 +32,12 @@ echo "Installing lm_eval[api] and running evaluation..."
 pip install lm_eval[api]==0.4.8
 
 echo "Running lm_eval with tasks: ${accuracy_tasks}..."
+
+mkdir -p ${output_dir}
 lm_eval --model ${accuracy_model} \
     --tasks ${accuracy_tasks} \
     --model_args model=${model_path},base_url=${base_url},${model_args_extra} \
+    --output_path ${output_dir} --log_samples \
     --trust_remote_code
 
 echo "Accuracy evaluation completed successfully"
diff --git a/examples/disaggregated/slurm/benchmark/disaggr_torch.slurm b/examples/disaggregated/slurm/benchmark/disaggr_torch.slurm
@@ -276,7 +276,7 @@ if [ "${enable_accuracy_test}" = "true" ]; then
         --mpi=pmix --overlap -N 1 -n 1 \
         bash ${work_dir}/accuracy_eval.sh \
         "${full_logdir}" "${accuracy_model}" "${accuracy_tasks}" "${model_path}" \
-        "${model_args_extra}" \
+        "${model_args_extra}" "${full_logdir}/accuracy_eval" \
         &> ${full_logdir}/accuracy_eval.log; then
         cleanup_on_failure "Accuracy evaluation failed. Check ${full_logdir}/accuracy_eval.log for details"
     fi
diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy
@@ -83,7 +83,7 @@ BUILD_CORES_REQUEST = "8"
 BUILD_CORES_LIMIT = "8"
 BUILD_MEMORY_REQUEST = "48Gi"
 BUILD_MEMORY_LIMIT = "96Gi"
-BUILD_JOBS = "8"
+BUILD_JOBS = "4"
 
 SLURM_CORES_REQUEST = "1"
 SLURM_CORES_LIMIT = "1"
diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py
@@ -1719,7 +1719,8 @@ def previous_seq_slots_device():
         num_draft_tokens = len(draft_tokens)
         total_num_tokens = len(position_ids)
         assert total_num_tokens <= self.max_num_tokens, (
-            "total_num_tokens should be less than or equal to max_num_tokens")
+            f"total_num_tokens ({total_num_tokens}) should be less than or equal to max_num_tokens ({self.max_num_tokens})"
+        )
         # if exist requests that do not have previous batch, copy input_ids and draft_tokens
         if num_tokens > 0:
             input_ids = torch.tensor(input_ids,

Original file line number	Diff line number	Diff line change
`@@ -120,6 +120,11 @@ __global__ void customMoeRoutingKernel(InputT* routerLogits, OutputT* topkValues`
`120`	`120`	`auto warp = cg::tiled_partition<WARP_SIZE>(block);`
`121`	`121`
`122`	`122`	`BaseType minScore = BaseType{-INFINITY};`
	`123`	`+`
	`124`	`+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)`
	`125`	`+ cudaGridDependencySynchronize();`
	`126`	`+#endif`
	`127`	`+`
`123`	`128`	`for (uint32_t tokenId = warpIdx; tokenId < numTokens; tokenId += warpNum)`
`124`	`129`	`{`
`125`	`130`	`auto scoreOffset = tokenId * numExperts;`
`@@ -168,6 +173,10 @@ __global__ void customMoeRoutingKernel(InputT* routerLogits, OutputT* topkValues`
`168`	`173`	`}`
`169`	`174`	`}`
`170`	`175`	`} // end for tokenId`
	`176`	`+`
	`177`	`+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)`
	`178`	`+ cudaTriggerProgrammaticLaunchCompletion();`
	`179`	`+#endif`
`171`	`180`	`}`
`172`	`181`
`173`	`182`	`int nextPowerOfTwo(int num)`