Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
255 changes: 255 additions & 0 deletions cpp/tensorrt_llm/common/customAllReduceUtils.h

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -134,8 +134,12 @@ public:
// corresponding CTA has not been launched.
for (int flag_idx = blockIdx.x; flag_idx < kBarrierFlagCount; flag_idx += gridDim.x)
{
st_flag(m_target_flag + flag_idx * NRanks, m_flag_value);
asm volatile(
"st.global.relaxed.sys.b32 [%1], %0;" ::"r"(m_flag_value), "l"(m_target_flag + flag_idx * NRanks));
}
// Single release fence
asm volatile("fence.release.sys;");

while (ld_flag(m_current_flag) == prev_flag(m_flag_value))
{
}
Expand Down
24 changes: 24 additions & 0 deletions cpp/tensorrt_llm/kernels/customAllReduceKernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,30 @@ inline std::string toString(AllReduceFusionOp op)
return oss.str();
}

// Stream-insertion overload mapping each AllReduceStrategyType enumerator to
// its symbolic name. Unrecognized values write nothing (empty string).
inline std::ostream& operator<<(std::ostream& os, AllReduceStrategyType op)
{
    char const* name = "";
    switch (op)
    {
    case AllReduceStrategyType::NCCL: name = "NCCL"; break;
    case AllReduceStrategyType::MIN_LATENCY: name = "MIN_LATENCY"; break;
    case AllReduceStrategyType::UB: name = "UB"; break;
    case AllReduceStrategyType::AUTO: name = "AUTO"; break;
    case AllReduceStrategyType::ONESHOT: name = "ONESHOT"; break;
    case AllReduceStrategyType::TWOSHOT: name = "TWOSHOT"; break;
    case AllReduceStrategyType::LOWPRECISION: name = "LOWPRECISION"; break;
    case AllReduceStrategyType::MNNVL: name = "MNNVL"; break;
    case AllReduceStrategyType::NCCL_SYMMETRIC: name = "NCCL_SYMMETRIC"; break;
    }
    os << name;
    return os;
}

// Convert an AllReduceStrategyType to its human-readable name by funneling it
// through the stream-insertion overload above.
inline std::string toString(AllReduceStrategyType op)
{
    std::ostringstream stream;
    stream << op;
    return stream.str();
}

struct AllReduceFusionParams
{
AllReduceFusionParams()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -601,6 +601,8 @@ __global__ __launch_bounds__(256, 1) void fused_a_gemm_kernel(
}
}
__syncthreads();
asm volatile("griddepcontrol.wait;");
asm volatile("griddepcontrol.launch_dependents;");

if (warp_idx < 2)
{
Expand All @@ -622,7 +624,6 @@ __global__ __launch_bounds__(256, 1) void fused_a_gemm_kernel(
mma_computer.issue_mainloop();
mma_computer.epi();
}
asm volatile("griddepcontrol.launch_dependents;");
#endif
}

Expand Down
60 changes: 30 additions & 30 deletions cpp/tensorrt_llm/kernels/userbuffers/userbuffers.cu
Original file line number Diff line number Diff line change
Expand Up @@ -56,12 +56,12 @@ __global__ void __launch_bounds__(MAX_THREADS)
userbuffers_fp16_sum_inplace_gpu_rw(int const op, int const flagoffset, int const firstrank, int const myrank,
int const gpustep, size_t const lineoffset, int const numlines, void** commbuff, int const handleridx)
{
#if __CUDA_ARCH__ >= 900
cudaTriggerProgrammaticLaunchCompletion();
#endif
__shared__ int4* userptr[RANKS];
int *flagptr, physgpu, targetgpu, *myptr;
int *reduceidptr, reduce_id;
#if __CUDA_ARCH__ >= 900
cudaGridDependencySynchronize();
#endif
if (threadIdx.x < RANKS)
{
physgpu = myrank * gpustep + firstrank;
Expand All @@ -72,9 +72,6 @@ __global__ void __launch_bounds__(MAX_THREADS)
reduce_id = next_flag(*reduceidptr);
flagptr = (reinterpret_cast<int*>(commbuff[targetgpu])) + flagoffset + blockflagoffset;
myptr += blockflagoffset;
#if __CUDA_ARCH__ >= 900
cudaGridDependencySynchronize();
#endif
flagptr[physgpu] = reduce_id;
userptr[threadIdx.x] = reinterpret_cast<int4*>(commbuff[targetgpu + handleridx]);
multi_gpu_block_barrier(reduce_id, (int volatile*) &myptr[targetgpu]);
Expand Down Expand Up @@ -130,19 +127,22 @@ __global__ void __launch_bounds__(MAX_THREADS)
}
if (threadIdx.x == 0 && blockIdx.x == 0)
*reduceidptr = reduce_id;
#if __CUDA_ARCH__ >= 900
cudaTriggerProgrammaticLaunchCompletion();
#endif
} // fp16 inplace reduce kernel (Hopper)

template <typename DType, int RANKS>
__global__ void __launch_bounds__(MAX_THREADS)
userbuffers_fp16_sum_inplace_gpu_rr(int const op, int const flagoffset, int const firstrank, int const myrank,
int const gpustep, size_t const lineoffset, int const numlines, void** commbuff, int const handleridx)
{
#if __CUDA_ARCH__ >= 900
cudaTriggerProgrammaticLaunchCompletion();
#endif
__shared__ int4* userptr[RANKS];
int *flagptr, physgpu, targetgpu, *myptr;
int *reduceidptr, reduce_id;
#if __CUDA_ARCH__ >= 900
cudaGridDependencySynchronize();
#endif
if (threadIdx.x < RANKS)
{
physgpu = myrank * gpustep + firstrank;
Expand All @@ -153,9 +153,6 @@ __global__ void __launch_bounds__(MAX_THREADS)
reduce_id = next_flag(*reduceidptr);
flagptr = (reinterpret_cast<int*>(commbuff[targetgpu])) + flagoffset + blockflagoffset;
myptr += blockflagoffset;
#if __CUDA_ARCH__ >= 900
cudaGridDependencySynchronize();
#endif
flagptr[physgpu] = reduce_id;
userptr[threadIdx.x] = reinterpret_cast<int4*>(commbuff[targetgpu + handleridx]);
multi_gpu_block_barrier(reduce_id, (int volatile*) &myptr[targetgpu]);
Expand Down Expand Up @@ -239,6 +236,9 @@ __global__ void __launch_bounds__(MAX_THREADS)
}
if (threadIdx.x == 0 && blockIdx.x == 0)
*reduceidptr = reduce_id;
#if __CUDA_ARCH__ >= 900
cudaTriggerProgrammaticLaunchCompletion();
#endif
} // fp16 inplace reduce kernel (Ampere)

#if __CUDA_ARCH__ >= 900
Expand Down Expand Up @@ -365,7 +365,7 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
*reduceidptr = reduce_id;
} // fp16 inplace reduce kernel (Hopper) MC

#else
#else // __CUDA_ARCH__ >= 900
template <typename DType, int RANKS, bool DISABLE_FP32_ACC>
__global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_mc(int const op, int const flagoffset,
int const firstrank, int const myrank, int const gpustep, size_t const lineoffset, int const numlines,
Expand All @@ -375,7 +375,7 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
asm volatile("brkpt;\n");
}

#endif
#endif // __CUDA_ARCH__ >= 900

#define callranks(x) \
if (ar_nvsize == x) \
Expand Down Expand Up @@ -568,13 +568,13 @@ __global__ void __launch_bounds__(MAX_THREADS)
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
constexpr int SF_VEC_SIZE = 16;
using PackedVec = PackedVec<DType>;
cudaTriggerProgrammaticLaunchCompletion();
float sf = *scale;
__shared__ float s_variance;
int hidden_dim = blockDim.x * UNROLL_NLINES * sizeof(int4) / sizeof(DType);

int *flagptr, physgpu, targetgpu, *myptr;
int *reduceidptr, reduce_id;
cudaGridDependencySynchronize();
if (threadIdx.x < RANKS)
{
physgpu = myrank * gpustep + firstrank;
Expand All @@ -585,7 +585,6 @@ __global__ void __launch_bounds__(MAX_THREADS)
reduce_id = next_flag(*reduceidptr);
flagptr = (reinterpret_cast<int*>(commbuff[targetgpu])) + flagoffset + blockflagoffset;
myptr += blockflagoffset;
cudaGridDependencySynchronize();
flagptr[physgpu] = reduce_id;
multi_gpu_block_barrier(reduce_id, (int volatile*) &myptr[targetgpu]);
reduce_id = next_flag(reduce_id);
Expand Down Expand Up @@ -670,6 +669,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
}
if (threadIdx.x == 0 && blockIdx.x == 0)
*reduceidptr = reduce_id;
cudaTriggerProgrammaticLaunchCompletion();
#endif
}

Expand All @@ -684,13 +684,13 @@ __global__ void __launch_bounds__(MAX_THREADS)
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
constexpr int SF_VEC_SIZE = 16;
using PackedVec = PackedVec<DType>;
cudaTriggerProgrammaticLaunchCompletion();
float sf = *scale;
__shared__ float s_variance;
int hidden_dim = blockDim.x * UNROLL_NLINES * sizeof(int4) / sizeof(DType);

int *flagptr, physgpu, targetgpu, *myptr;
int *reduceidptr, reduce_id;
cudaGridDependencySynchronize();
if (threadIdx.x < RANKS)
{
physgpu = myrank * gpustep + firstrank;
Expand All @@ -701,7 +701,6 @@ __global__ void __launch_bounds__(MAX_THREADS)
reduce_id = next_flag(*reduceidptr);
flagptr = (reinterpret_cast<int*>(commbuff[targetgpu])) + flagoffset + blockflagoffset;
myptr += blockflagoffset;
cudaGridDependencySynchronize();
flagptr[physgpu] = reduce_id;
multi_gpu_block_barrier(reduce_id, (int volatile*) &myptr[targetgpu]);
}
Expand Down Expand Up @@ -772,6 +771,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
}
if (threadIdx.x == 0 && blockIdx.x == 0)
*reduceidptr = reduce_id;
cudaTriggerProgrammaticLaunchCompletion();
#endif
}

Expand All @@ -784,11 +784,11 @@ __global__ void __launch_bounds__(MAX_THREADS)
float4* mc_ptr, DType const* beta, DType const* gamma, float const eps, int const RANKS, float4* mc_ptr_out,
size_t const out_lineoffset, uint4* residual_in, uint4* residual_out, int res_offset)
{
cudaTriggerProgrammaticLaunchCompletion();
__shared__ float s_variance;
int hidden_dim = blockDim.x * UNROLL_NLINES * sizeof(int4) / sizeof(DType);
int *flagptr, physgpu, targetgpu, *myptr;
int *reduceidptr, reduce_id;
cudaGridDependencySynchronize();
if (threadIdx.x < RANKS)
{
physgpu = myrank * gpustep + firstrank;
Expand All @@ -799,7 +799,6 @@ __global__ void __launch_bounds__(MAX_THREADS)
reduce_id = next_flag(*reduceidptr);
flagptr = (reinterpret_cast<int*>(commbuff[targetgpu])) + flagoffset + blockflagoffset;
myptr += blockflagoffset;
cudaGridDependencySynchronize();
flagptr[physgpu] = reduce_id;
multi_gpu_block_barrier(reduce_id, (int volatile*) &myptr[targetgpu]);
reduce_id = next_flag(reduce_id);
Expand Down Expand Up @@ -874,6 +873,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
}
if (threadIdx.x == 0 && blockIdx.x == 0)
*reduceidptr = reduce_id;
cudaTriggerProgrammaticLaunchCompletion();
} // fp16 inplace reduce kernel (Hopper) MC with rmsNorm fused

template <typename DType, int UNROLL_NLINES, bool DISABLE_FP32_ACC>
Expand All @@ -883,11 +883,11 @@ __global__ void __launch_bounds__(MAX_THREADS)
int const handleridx, float4* mc_ptr, DType const* beta, DType const* gamma, float const eps, int const RANKS,
uint4* uc_ptr_out, size_t const out_lineoffset, uint4* residual_in, uint4* residual_out, int res_offset)
{
cudaTriggerProgrammaticLaunchCompletion();
__shared__ float s_variance;
int hidden_dim = blockDim.x * UNROLL_NLINES * sizeof(int4) / sizeof(DType);
int *flagptr, physgpu, targetgpu, *myptr;
int *reduceidptr, reduce_id;
cudaGridDependencySynchronize();
if (threadIdx.x < RANKS)
{
physgpu = myrank * gpustep + firstrank;
Expand All @@ -898,7 +898,6 @@ __global__ void __launch_bounds__(MAX_THREADS)
reduce_id = next_flag(*reduceidptr);
flagptr = (reinterpret_cast<int*>(commbuff[targetgpu])) + flagoffset + blockflagoffset;
myptr += blockflagoffset;
cudaGridDependencySynchronize();
flagptr[physgpu] = reduce_id;
multi_gpu_block_barrier(reduce_id, (int volatile*) &myptr[targetgpu]);
}
Expand Down Expand Up @@ -962,6 +961,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
}
if (threadIdx.x == 0 && blockIdx.x == 0)
*reduceidptr = reduce_id;
cudaTriggerProgrammaticLaunchCompletion();
} // fp16 inplace reduce kernel (Hopper) MC with rmsNorm fused oneshot

template <typename DType, int UNROLL_NLINES, bool DISABLE_FP32_ACC>
Expand All @@ -971,13 +971,13 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
float const eps, int const RANKS, float2* mc_ptr_out, size_t const out_lineoffset, float const* scale,
uint4* residual_in, uint4* residual_out, int res_offset)
{
cudaTriggerProgrammaticLaunchCompletion();
float const sf = 1.f / (*scale);
__shared__ float s_variance;
int hidden_dim = blockDim.x * UNROLL_NLINES * sizeof(int4) / sizeof(DType);

int *flagptr, physgpu, targetgpu, *myptr;
int *reduceidptr, reduce_id;
cudaGridDependencySynchronize();
if (threadIdx.x < RANKS)
{
physgpu = myrank * gpustep + firstrank;
Expand All @@ -988,7 +988,6 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
reduce_id = next_flag(*reduceidptr);
flagptr = (reinterpret_cast<int*>(commbuff[targetgpu])) + flagoffset + blockflagoffset;
myptr += blockflagoffset;
cudaGridDependencySynchronize();
flagptr[physgpu] = reduce_id;
multi_gpu_block_barrier(reduce_id, (int volatile*) &myptr[targetgpu]);
reduce_id = next_flag(reduce_id);
Expand Down Expand Up @@ -1066,6 +1065,7 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
}
if (threadIdx.x == 0 && blockIdx.x == 0)
*reduceidptr = reduce_id;
cudaTriggerProgrammaticLaunchCompletion();
} // quant kernel fp16->fp8 twoshot

template <typename DType, int UNROLL_NLINES, bool DISABLE_FP32_ACC>
Expand All @@ -1075,13 +1075,13 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
float const eps, int const RANKS, uint2* mc_ptr_out, size_t const out_lineoffset, float const* scale,
uint4* residual_in, uint4* residual_out, int res_offset)
{
cudaTriggerProgrammaticLaunchCompletion();
float const sf = 1.f / (*scale);
__shared__ float s_variance;
int hidden_dim = blockDim.x * UNROLL_NLINES * sizeof(int4) / sizeof(DType);

int *flagptr, physgpu, targetgpu, *myptr;
int *reduceidptr, reduce_id;
cudaGridDependencySynchronize();
if (threadIdx.x < RANKS)
{
physgpu = myrank * gpustep + firstrank;
Expand All @@ -1092,7 +1092,6 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
reduce_id = next_flag(*reduceidptr);
flagptr = (reinterpret_cast<int*>(commbuff[targetgpu])) + flagoffset + blockflagoffset;
myptr += blockflagoffset;
cudaGridDependencySynchronize();
flagptr[physgpu] = reduce_id;
multi_gpu_block_barrier(reduce_id, (int volatile*) &myptr[targetgpu]);
}
Expand Down Expand Up @@ -1160,6 +1159,7 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
}
if (threadIdx.x == 0 && blockIdx.x == 0)
*reduceidptr = reduce_id;
cudaTriggerProgrammaticLaunchCompletion();
} // quant kernel fp16->fp8 oneshot

template <typename DType, int UNROLL_NLINES>
Expand All @@ -1168,9 +1168,9 @@ __global__ void __launch_bounds__(MAX_THREADS)
int const myrank, int const gpustep, size_t const lineoffset, int const numlines, void** commbuff,
int const handleridx, float4* mc_ptr, int const RANKS, uint4* residual_in, int res_offset)
{
cudaTriggerProgrammaticLaunchCompletion();
int *flagptr, physgpu, targetgpu, *myptr;
int *reduceidptr, reduce_id;
cudaGridDependencySynchronize();
if (threadIdx.x < RANKS)
{
physgpu = myrank * gpustep + firstrank;
Expand All @@ -1181,7 +1181,6 @@ __global__ void __launch_bounds__(MAX_THREADS)
reduce_id = next_flag(*reduceidptr);
flagptr = (reinterpret_cast<int*>(commbuff[targetgpu])) + flagoffset + blockflagoffset;
myptr += blockflagoffset;
cudaGridDependencySynchronize();
flagptr[physgpu] = reduce_id;
multi_gpu_block_barrier(reduce_id, (int volatile*) &myptr[targetgpu]);
reduce_id = next_flag(reduce_id);
Expand Down Expand Up @@ -1217,9 +1216,10 @@ __global__ void __launch_bounds__(MAX_THREADS)
}
if (threadIdx.x == 0 && blockIdx.x == 0)
*reduceidptr = reduce_id;
cudaTriggerProgrammaticLaunchCompletion();
} // residual allgather kernel

#else
#else // __CUDA_ARCH__ >= 900
template <typename DType, int UNROLL_NLINES, bool DISABLE_FP32_ACC>
__global__ void __launch_bounds__(MAX_THREADS)
userbuffers_fp16_sum_gpu_mc_rmsnorm(int const op, int const flagoffset, int const firstrank, int const myrank,
Expand Down Expand Up @@ -1274,7 +1274,7 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
asm volatile("brkpt;\n");
}

#endif
#endif // __CUDA_ARCH__ >= 900

#define callranksMC_RMSNORM_QUANT(x) \
if (nlines == x) \
Expand Down
Loading