Skip to content

Commit 34e2fa5

Browse files
authored
[https://nvbugs/5690172][fix] Fix Qwen3-235B ATP accuracy issue with PDL (#9530)
Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
1 parent 6e470aa commit 34e2fa5

File tree

6 files changed

+21
-3
lines changed

6 files changed

+21
-3
lines changed

cpp/tensorrt_llm/kernels/customMoeRoutingKernels.cu

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -120,6 +120,11 @@ __global__ void customMoeRoutingKernel(InputT* routerLogits, OutputT* topkValues
120120
auto warp = cg::tiled_partition<WARP_SIZE>(block);
121121

122122
BaseType minScore = BaseType{-INFINITY};
123+
124+
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
125+
cudaGridDependencySynchronize();
126+
#endif
127+
123128
for (uint32_t tokenId = warpIdx; tokenId < numTokens; tokenId += warpNum)
124129
{
125130
auto scoreOffset = tokenId * numExperts;
@@ -168,6 +173,10 @@ __global__ void customMoeRoutingKernel(InputT* routerLogits, OutputT* topkValues
168173
}
169174
}
170175
} // end for tokenId
176+
177+
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
178+
cudaTriggerProgrammaticLaunchCompletion();
179+
#endif
171180
}
172181

173182
int nextPowerOfTwo(int num)

cpp/tensorrt_llm/kernels/cutlass_kernels/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,8 @@ function(process_target target_name enable_hopper enable_blackwell)
6666
if(${enable_hopper} AND "90" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG)
6767
# No kernels should be parsed, unless hopper is specified. This is a build
6868
# time improvement
69+
target_compile_options(${target_name}
70+
PRIVATE "-DCUTLASS_ENABLE_GDC_FOR_SM90=1")
6971
target_compile_definitions(${target_name} PUBLIC COMPILE_HOPPER_TMA_GEMMS)
7072
target_compile_definitions(${target_name}
7173
PUBLIC COMPILE_HOPPER_TMA_GROUPED_GEMMS)
@@ -78,6 +80,8 @@ function(process_target target_name enable_hopper enable_blackwell)
7880
OR "121" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG
7981
))
8082

83+
target_compile_options(${target_name}
84+
PRIVATE "-DCUTLASS_ENABLE_GDC_FOR_SM100=1")
8185
# Both 100 and 103 support these kernels
8286
if("100" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG
8387
OR "103" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG)

examples/disaggregated/slurm/benchmark/accuracy_eval.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ accuracy_model=${2}
77
accuracy_tasks=${3}
88
model_path=${4}
99
model_args_extra=${5}
10+
output_dir=${6}
1011

1112
echo "Starting accuracy evaluation..."
1213
echo "Log directory: ${full_logdir}"
@@ -31,9 +32,12 @@ echo "Installing lm_eval[api] and running evaluation..."
3132
pip install lm_eval[api]==0.4.8
3233

3334
echo "Running lm_eval with tasks: ${accuracy_tasks}..."
35+
36+
mkdir -p ${output_dir}
3437
lm_eval --model ${accuracy_model} \
3538
--tasks ${accuracy_tasks} \
3639
--model_args model=${model_path},base_url=${base_url},${model_args_extra} \
40+
--output_path ${output_dir} --log_samples \
3741
--trust_remote_code
3842

3943
echo "Accuracy evaluation completed successfully"

examples/disaggregated/slurm/benchmark/disaggr_torch.slurm

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -276,7 +276,7 @@ if [ "${enable_accuracy_test}" = "true" ]; then
276276
--mpi=pmix --overlap -N 1 -n 1 \
277277
bash ${work_dir}/accuracy_eval.sh \
278278
"${full_logdir}" "${accuracy_model}" "${accuracy_tasks}" "${model_path}" \
279-
"${model_args_extra}" \
279+
"${model_args_extra}" "${full_logdir}/accuracy_eval" \
280280
&> ${full_logdir}/accuracy_eval.log; then
281281
cleanup_on_failure "Accuracy evaluation failed. Check ${full_logdir}/accuracy_eval.log for details"
282282
fi

jenkins/L0_Test.groovy

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ BUILD_CORES_REQUEST = "8"
8383
BUILD_CORES_LIMIT = "8"
8484
BUILD_MEMORY_REQUEST = "48Gi"
8585
BUILD_MEMORY_LIMIT = "96Gi"
86-
BUILD_JOBS = "8"
86+
BUILD_JOBS = "4"
8787

8888
SLURM_CORES_REQUEST = "1"
8989
SLURM_CORES_LIMIT = "1"

tensorrt_llm/_torch/pyexecutor/model_engine.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1719,7 +1719,8 @@ def previous_seq_slots_device():
17191719
num_draft_tokens = len(draft_tokens)
17201720
total_num_tokens = len(position_ids)
17211721
assert total_num_tokens <= self.max_num_tokens, (
1722-
"total_num_tokens should be less than or equal to max_num_tokens")
1722+
f"total_num_tokens ({total_num_tokens}) should be less than or equal to max_num_tokens ({self.max_num_tokens})"
1723+
)
17231724
# if exist requests that do not have previous batch, copy input_ids and draft_tokens
17241725
if num_tokens > 0:
17251726
input_ids = torch.tensor(input_ids,

0 commit comments

Comments
 (0)