Commit 62dc3b7

dc3671 and fredricz-20070104 authored and committed
[https://nvbugs/5545522][fix] move PREEXIT in UB kernels to fix accuracy issue (NVIDIA#8318)
Signed-off-by: Zhenhuan Chen <[email protected]>
Signed-off-by: Mike Iovine <[email protected]>
Signed-off-by: FredricZ-2007 <[email protected]>
1 parent 413d9b8 commit 62dc3b7
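
For context (not part of the commit): these kernels use CUDA Programmatic Dependent Launch (PDL). cudaTriggerProgrammaticLaunchCompletion() (PTX griddepcontrol.launch_dependents) lets the next grid in the stream start launching once every block of the current grid has issued it, while cudaGridDependencySynchronize() (PTX griddepcontrol.wait) makes the current grid wait on its upstream dependency before reading data the prior kernel produced. The diff below repositions both primitives; in the userbuffers kernels the trigger now fires only after the reduction results are written, rather than at kernel entry. A minimal sketch of that ordering follows; the kernel and buffer names are hypothetical and not taken from the repository.

// pdl_ordering_sketch.cu -- illustrative only; names are hypothetical, not from this repo.
// A kernel that both consumes the previous grid's output and produces data for the
// next grid, using the placement this fix converges on: wait before the first read
// of upstream data, trigger only after this kernel's own output is written.
#include <cuda_runtime.h>

__global__ void consume_then_produce(float const* upstream_out, float* my_out, int n)
{
#if __CUDA_ARCH__ >= 900
    // Block until the prior grid in the stream has finished and its writes
    // (upstream_out) are visible to this grid.
    cudaGridDependencySynchronize();
#endif
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        my_out[i] = 2.f * upstream_out[i]; // safe: dependency satisfied above

#if __CUDA_ARCH__ >= 900
    // Signal that dependent grids may begin launching; every block must issue
    // this (or exit) before the dependents are released.
    cudaTriggerProgrammaticLaunchCompletion();
#endif
}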

2 files changed: +32 -31 lines changed

cpp/tensorrt_llm/kernels/dsv3MinLatencyKernels/dsv3FusedAGemm.cu

Lines changed: 2 additions & 1 deletion
@@ -601,6 +601,8 @@ __global__ __launch_bounds__(256, 1) void fused_a_gemm_kernel(
         }
     }
     __syncthreads();
+    asm volatile("griddepcontrol.wait;");
+    asm volatile("griddepcontrol.launch_dependents;");
 
     if (warp_idx < 2)
     {
@@ -622,7 +624,6 @@ __global__ __launch_bounds__(256, 1) void fused_a_gemm_kernel(
         mma_computer.issue_mainloop();
         mma_computer.epi();
     }
-    asm volatile("griddepcontrol.launch_dependents;");
 #endif
 }
 
cpp/tensorrt_llm/kernels/userbuffers/userbuffers.cu

Lines changed: 30 additions & 30 deletions
@@ -56,12 +56,12 @@ __global__ void __launch_bounds__(MAX_THREADS)
     userbuffers_fp16_sum_inplace_gpu_rw(int const op, int const flagoffset, int const firstrank, int const myrank,
         int const gpustep, size_t const lineoffset, int const numlines, void** commbuff, int const handleridx)
 {
-#if __CUDA_ARCH__ >= 900
-    cudaTriggerProgrammaticLaunchCompletion();
-#endif
     __shared__ int4* userptr[RANKS];
     int *flagptr, physgpu, targetgpu, *myptr;
     int *reduceidptr, reduce_id;
+#if __CUDA_ARCH__ >= 900
+    cudaGridDependencySynchronize();
+#endif
     if (threadIdx.x < RANKS)
     {
         physgpu = myrank * gpustep + firstrank;
@@ -72,9 +72,6 @@ __global__ void __launch_bounds__(MAX_THREADS)
         reduce_id = next_flag(*reduceidptr);
         flagptr = (reinterpret_cast<int*>(commbuff[targetgpu])) + flagoffset + blockflagoffset;
         myptr += blockflagoffset;
-#if __CUDA_ARCH__ >= 900
-        cudaGridDependencySynchronize();
-#endif
         flagptr[physgpu] = reduce_id;
         userptr[threadIdx.x] = reinterpret_cast<int4*>(commbuff[targetgpu + handleridx]);
         multi_gpu_block_barrier(reduce_id, (int volatile*) &myptr[targetgpu]);
@@ -130,19 +127,22 @@ __global__ void __launch_bounds__(MAX_THREADS)
     }
     if (threadIdx.x == 0 && blockIdx.x == 0)
         *reduceidptr = reduce_id;
+#if __CUDA_ARCH__ >= 900
+    cudaTriggerProgrammaticLaunchCompletion();
+#endif
 } // fp16 inplace reduce kernel (Hopper)
 
 template <typename DType, int RANKS>
 __global__ void __launch_bounds__(MAX_THREADS)
     userbuffers_fp16_sum_inplace_gpu_rr(int const op, int const flagoffset, int const firstrank, int const myrank,
         int const gpustep, size_t const lineoffset, int const numlines, void** commbuff, int const handleridx)
 {
-#if __CUDA_ARCH__ >= 900
-    cudaTriggerProgrammaticLaunchCompletion();
-#endif
     __shared__ int4* userptr[RANKS];
     int *flagptr, physgpu, targetgpu, *myptr;
     int *reduceidptr, reduce_id;
+#if __CUDA_ARCH__ >= 900
+    cudaGridDependencySynchronize();
+#endif
     if (threadIdx.x < RANKS)
     {
         physgpu = myrank * gpustep + firstrank;
@@ -153,9 +153,6 @@ __global__ void __launch_bounds__(MAX_THREADS)
         reduce_id = next_flag(*reduceidptr);
         flagptr = (reinterpret_cast<int*>(commbuff[targetgpu])) + flagoffset + blockflagoffset;
         myptr += blockflagoffset;
-#if __CUDA_ARCH__ >= 900
-        cudaGridDependencySynchronize();
-#endif
         flagptr[physgpu] = reduce_id;
         userptr[threadIdx.x] = reinterpret_cast<int4*>(commbuff[targetgpu + handleridx]);
         multi_gpu_block_barrier(reduce_id, (int volatile*) &myptr[targetgpu]);
@@ -239,6 +236,9 @@ __global__ void __launch_bounds__(MAX_THREADS)
     }
     if (threadIdx.x == 0 && blockIdx.x == 0)
         *reduceidptr = reduce_id;
+#if __CUDA_ARCH__ >= 900
+    cudaTriggerProgrammaticLaunchCompletion();
+#endif
 } // fp16 inplace reduce kernel (Ampere)
 
 #if __CUDA_ARCH__ >= 900
@@ -365,7 +365,7 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
         *reduceidptr = reduce_id;
 } // fp16 inplace reduce kernel (Hopper) MC
 
-#else
+#else // __CUDA_ARCH__ >= 900
 template <typename DType, int RANKS, bool DISABLE_FP32_ACC>
 __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_mc(int const op, int const flagoffset,
     int const firstrank, int const myrank, int const gpustep, size_t const lineoffset, int const numlines,
@@ -375,7 +375,7 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
     asm volatile("brkpt;\n");
 }
 
-#endif
+#endif // __CUDA_ARCH__ >= 900
 
 #define callranks(x) \
     if (ar_nvsize == x) \
@@ -568,13 +568,13 @@ __global__ void __launch_bounds__(MAX_THREADS)
 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
     constexpr int SF_VEC_SIZE = 16;
     using PackedVec = PackedVec<DType>;
-    cudaTriggerProgrammaticLaunchCompletion();
     float sf = *scale;
     __shared__ float s_variance;
     int hidden_dim = blockDim.x * UNROLL_NLINES * sizeof(int4) / sizeof(DType);
 
     int *flagptr, physgpu, targetgpu, *myptr;
     int *reduceidptr, reduce_id;
+    cudaGridDependencySynchronize();
     if (threadIdx.x < RANKS)
     {
         physgpu = myrank * gpustep + firstrank;
@@ -585,7 +585,6 @@ __global__ void __launch_bounds__(MAX_THREADS)
         reduce_id = next_flag(*reduceidptr);
         flagptr = (reinterpret_cast<int*>(commbuff[targetgpu])) + flagoffset + blockflagoffset;
         myptr += blockflagoffset;
-        cudaGridDependencySynchronize();
         flagptr[physgpu] = reduce_id;
         multi_gpu_block_barrier(reduce_id, (int volatile*) &myptr[targetgpu]);
         reduce_id = next_flag(reduce_id);
@@ -670,6 +669,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
     }
     if (threadIdx.x == 0 && blockIdx.x == 0)
         *reduceidptr = reduce_id;
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }
 
@@ -684,13 +684,13 @@ __global__ void __launch_bounds__(MAX_THREADS)
 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
     constexpr int SF_VEC_SIZE = 16;
     using PackedVec = PackedVec<DType>;
-    cudaTriggerProgrammaticLaunchCompletion();
    float sf = *scale;
     __shared__ float s_variance;
     int hidden_dim = blockDim.x * UNROLL_NLINES * sizeof(int4) / sizeof(DType);
 
     int *flagptr, physgpu, targetgpu, *myptr;
     int *reduceidptr, reduce_id;
+    cudaGridDependencySynchronize();
     if (threadIdx.x < RANKS)
     {
         physgpu = myrank * gpustep + firstrank;
@@ -701,7 +701,6 @@ __global__ void __launch_bounds__(MAX_THREADS)
         reduce_id = next_flag(*reduceidptr);
         flagptr = (reinterpret_cast<int*>(commbuff[targetgpu])) + flagoffset + blockflagoffset;
         myptr += blockflagoffset;
-        cudaGridDependencySynchronize();
         flagptr[physgpu] = reduce_id;
         multi_gpu_block_barrier(reduce_id, (int volatile*) &myptr[targetgpu]);
     }
@@ -772,6 +771,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
     }
     if (threadIdx.x == 0 && blockIdx.x == 0)
         *reduceidptr = reduce_id;
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }
 
@@ -784,11 +784,11 @@ __global__ void __launch_bounds__(MAX_THREADS)
     float4* mc_ptr, DType const* beta, DType const* gamma, float const eps, int const RANKS, float4* mc_ptr_out,
     size_t const out_lineoffset, uint4* residual_in, uint4* residual_out, int res_offset)
 {
-    cudaTriggerProgrammaticLaunchCompletion();
     __shared__ float s_variance;
     int hidden_dim = blockDim.x * UNROLL_NLINES * sizeof(int4) / sizeof(DType);
     int *flagptr, physgpu, targetgpu, *myptr;
     int *reduceidptr, reduce_id;
+    cudaGridDependencySynchronize();
     if (threadIdx.x < RANKS)
     {
         physgpu = myrank * gpustep + firstrank;
@@ -799,7 +799,6 @@ __global__ void __launch_bounds__(MAX_THREADS)
         reduce_id = next_flag(*reduceidptr);
         flagptr = (reinterpret_cast<int*>(commbuff[targetgpu])) + flagoffset + blockflagoffset;
         myptr += blockflagoffset;
-        cudaGridDependencySynchronize();
         flagptr[physgpu] = reduce_id;
         multi_gpu_block_barrier(reduce_id, (int volatile*) &myptr[targetgpu]);
         reduce_id = next_flag(reduce_id);
@@ -874,6 +873,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
     }
     if (threadIdx.x == 0 && blockIdx.x == 0)
         *reduceidptr = reduce_id;
+    cudaTriggerProgrammaticLaunchCompletion();
 } // fp16 inplace reduce kernel (Hopper) MC with rmsNorm fused
 
 template <typename DType, int UNROLL_NLINES, bool DISABLE_FP32_ACC>
@@ -883,11 +883,11 @@ __global__ void __launch_bounds__(MAX_THREADS)
     int const handleridx, float4* mc_ptr, DType const* beta, DType const* gamma, float const eps, int const RANKS,
     uint4* uc_ptr_out, size_t const out_lineoffset, uint4* residual_in, uint4* residual_out, int res_offset)
 {
-    cudaTriggerProgrammaticLaunchCompletion();
     __shared__ float s_variance;
     int hidden_dim = blockDim.x * UNROLL_NLINES * sizeof(int4) / sizeof(DType);
     int *flagptr, physgpu, targetgpu, *myptr;
     int *reduceidptr, reduce_id;
+    cudaGridDependencySynchronize();
     if (threadIdx.x < RANKS)
     {
         physgpu = myrank * gpustep + firstrank;
@@ -898,7 +898,6 @@ __global__ void __launch_bounds__(MAX_THREADS)
         reduce_id = next_flag(*reduceidptr);
         flagptr = (reinterpret_cast<int*>(commbuff[targetgpu])) + flagoffset + blockflagoffset;
         myptr += blockflagoffset;
-        cudaGridDependencySynchronize();
         flagptr[physgpu] = reduce_id;
         multi_gpu_block_barrier(reduce_id, (int volatile*) &myptr[targetgpu]);
     }
@@ -962,6 +961,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
     }
     if (threadIdx.x == 0 && blockIdx.x == 0)
         *reduceidptr = reduce_id;
+    cudaTriggerProgrammaticLaunchCompletion();
 } // fp16 inplace reduce kernel (Hopper) MC with rmsNorm fused oneshot
 
 template <typename DType, int UNROLL_NLINES, bool DISABLE_FP32_ACC>
@@ -971,13 +971,13 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
     float const eps, int const RANKS, float2* mc_ptr_out, size_t const out_lineoffset, float const* scale,
     uint4* residual_in, uint4* residual_out, int res_offset)
 {
-    cudaTriggerProgrammaticLaunchCompletion();
     float const sf = 1.f / (*scale);
     __shared__ float s_variance;
     int hidden_dim = blockDim.x * UNROLL_NLINES * sizeof(int4) / sizeof(DType);
 
     int *flagptr, physgpu, targetgpu, *myptr;
     int *reduceidptr, reduce_id;
+    cudaGridDependencySynchronize();
     if (threadIdx.x < RANKS)
     {
         physgpu = myrank * gpustep + firstrank;
@@ -988,7 +988,6 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
         reduce_id = next_flag(*reduceidptr);
         flagptr = (reinterpret_cast<int*>(commbuff[targetgpu])) + flagoffset + blockflagoffset;
         myptr += blockflagoffset;
-        cudaGridDependencySynchronize();
         flagptr[physgpu] = reduce_id;
         multi_gpu_block_barrier(reduce_id, (int volatile*) &myptr[targetgpu]);
         reduce_id = next_flag(reduce_id);
@@ -1066,6 +1065,7 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
     }
     if (threadIdx.x == 0 && blockIdx.x == 0)
         *reduceidptr = reduce_id;
+    cudaTriggerProgrammaticLaunchCompletion();
 } // quant kernel fp16->fp8 twoshot
 
 template <typename DType, int UNROLL_NLINES, bool DISABLE_FP32_ACC>
@@ -1075,13 +1075,13 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
     float const eps, int const RANKS, uint2* mc_ptr_out, size_t const out_lineoffset, float const* scale,
     uint4* residual_in, uint4* residual_out, int res_offset)
 {
-    cudaTriggerProgrammaticLaunchCompletion();
     float const sf = 1.f / (*scale);
     __shared__ float s_variance;
     int hidden_dim = blockDim.x * UNROLL_NLINES * sizeof(int4) / sizeof(DType);
 
     int *flagptr, physgpu, targetgpu, *myptr;
     int *reduceidptr, reduce_id;
+    cudaGridDependencySynchronize();
     if (threadIdx.x < RANKS)
     {
         physgpu = myrank * gpustep + firstrank;
@@ -1092,7 +1092,6 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
         reduce_id = next_flag(*reduceidptr);
         flagptr = (reinterpret_cast<int*>(commbuff[targetgpu])) + flagoffset + blockflagoffset;
         myptr += blockflagoffset;
-        cudaGridDependencySynchronize();
         flagptr[physgpu] = reduce_id;
         multi_gpu_block_barrier(reduce_id, (int volatile*) &myptr[targetgpu]);
     }
@@ -1160,6 +1159,7 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
     }
     if (threadIdx.x == 0 && blockIdx.x == 0)
         *reduceidptr = reduce_id;
+    cudaTriggerProgrammaticLaunchCompletion();
 } // quant kernel fp16->fp8 oneshot
 
 template <typename DType, int UNROLL_NLINES>
@@ -1168,9 +1168,9 @@ __global__ void __launch_bounds__(MAX_THREADS)
     int const myrank, int const gpustep, size_t const lineoffset, int const numlines, void** commbuff,
     int const handleridx, float4* mc_ptr, int const RANKS, uint4* residual_in, int res_offset)
 {
-    cudaTriggerProgrammaticLaunchCompletion();
     int *flagptr, physgpu, targetgpu, *myptr;
     int *reduceidptr, reduce_id;
+    cudaGridDependencySynchronize();
     if (threadIdx.x < RANKS)
     {
         physgpu = myrank * gpustep + firstrank;
@@ -1181,7 +1181,6 @@ __global__ void __launch_bounds__(MAX_THREADS)
         reduce_id = next_flag(*reduceidptr);
         flagptr = (reinterpret_cast<int*>(commbuff[targetgpu])) + flagoffset + blockflagoffset;
         myptr += blockflagoffset;
-        cudaGridDependencySynchronize();
         flagptr[physgpu] = reduce_id;
         multi_gpu_block_barrier(reduce_id, (int volatile*) &myptr[targetgpu]);
         reduce_id = next_flag(reduce_id);
@@ -1217,9 +1216,10 @@ __global__ void __launch_bounds__(MAX_THREADS)
     }
     if (threadIdx.x == 0 && blockIdx.x == 0)
         *reduceidptr = reduce_id;
+    cudaTriggerProgrammaticLaunchCompletion();
 } // residual allgather kernel
 
-#else
+#else // __CUDA_ARCH__ >= 900
 template <typename DType, int UNROLL_NLINES, bool DISABLE_FP32_ACC>
 __global__ void __launch_bounds__(MAX_THREADS)
     userbuffers_fp16_sum_gpu_mc_rmsnorm(int const op, int const flagoffset, int const firstrank, int const myrank,
@@ -1274,7 +1274,7 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
     asm volatile("brkpt;\n");
 }
 
-#endif
+#endif // __CUDA_ARCH__ >= 900
 
 #define callranksMC_RMSNORM_QUANT(x) \
     if (nlines == x) \
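
One host-side note, outside the scope of this diff: the device-side wait and trigger only take effect when the dependent kernel is launched with the programmatic stream serialization attribute; otherwise the launch falls back to ordinary stream ordering. A minimal sketch of such a launch through the CUDA runtime, with placeholder kernel and buffer names (not from this repository):

// pdl_launch_sketch.cu -- illustrative only; kernel/argument names are placeholders.
// Shows how a dependent kernel is opted into Programmatic Dependent Launch with the
// CUDA runtime (CUDA 11.8+, compute capability 9.0+).
#include <cuda_runtime.h>

__global__ void dependent_kernel(float const* upstream_out, float* out, int n)
{
#if __CUDA_ARCH__ >= 900
    cudaGridDependencySynchronize(); // block until the upstream grid's writes are visible
#endif
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        out[i] = upstream_out[i] + 1.f;
}

void launch_with_pdl(float const* upstream_out, float* out, int n, cudaStream_t stream)
{
    cudaLaunchAttribute attr{};
    attr.id = cudaLaunchAttributeProgrammaticStreamSerialization;
    attr.val.programmaticStreamSerializationAllowed = 1; // enable PDL for this launch

    cudaLaunchConfig_t cfg{};
    cfg.gridDim = dim3((n + 255) / 256);
    cfg.blockDim = dim3(256);
    cfg.dynamicSmemBytes = 0;
    cfg.stream = stream;
    cfg.attrs = &attr;
    cfg.numAttrs = 1;

    // The dependent kernel may begin launching as soon as the preceding kernel in
    // `stream` triggers programmatic launch completion (or exits).
    cudaLaunchKernelEx(&cfg, dependent_kernel, upstream_out, out, n);
}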
