
Commit a81a658

Try to fix the accuracy issue with warp reduce.
Signed-off-by: Shiyu Li <[email protected]>
1 parent 095b7a3 commit a81a658

File tree

2 files changed (+18 −10 lines)


cpp/tensorrt_llm/kernels/communicationKernels/mnnvlTwoShotAllreduceKernels.cu

Lines changed: 17 additions & 9 deletions
@@ -381,17 +381,16 @@ inline __device__ T add(T a, T b)
 template <typename T>
 __inline__ __device__ T warpReduceSum(T val)
 {
-    // Get the actual number of active threads in this warp
-    int active_warp_size = min(WARP_SIZE, blockDim.x - (threadIdx.x & ~(WARP_SIZE - 1)));
-    unsigned int mask = (1U << active_warp_size) - 1;
+    int lane_id = threadIdx.x & 0x1f;
+    int warp_size = blockDim.x - (threadIdx.x & ~(WARP_SIZE - 1));
+    unsigned int active_mask = (1U << warp_size) - 1;
 
 #pragma unroll
     for (int offset = 16; offset > 0; offset >>= 1)
     {
-        if (offset < active_warp_size)
-        {
-            val = add<T>(val, __shfl_xor_sync(mask, val, offset, WARP_SIZE));
-        }
+        int target_lane = lane_id ^ offset;
+        auto tmp = __shfl_xor_sync(active_mask, val, offset, WARP_SIZE);
+        val = add<T>(val, target_lane < warp_size ? tmp : 0);
     }
     return val;
 }
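
The rewritten loop above shuffles on every step and simply treats a partner lane that falls past the end of the warp as contributing zero, which behaves like padding a partial warp with zeros. The sketch below is a minimal, self-contained illustration of that masked partial-warp reduction; the names (partialWarpReduceSum, reduceKernel, the tiny host harness) and the extra min()/full-warp-mask guards are assumptions added here for a standalone example, not code taken from the commit.

// Minimal sketch of a masked partial-warp shuffle reduction.
// Names and the defensive guards are illustrative assumptions, not commit code.
#include <cstdio>
#include <cuda_runtime.h>

constexpr int WARP_SIZE = 32;

__device__ float partialWarpReduceSum(float val)
{
    int lane_id = threadIdx.x & (WARP_SIZE - 1);
    // Number of threads that actually exist in this (possibly partial) warp.
    int warp_base = static_cast<int>(threadIdx.x) & ~(WARP_SIZE - 1);
    int warp_size = min(WARP_SIZE, static_cast<int>(blockDim.x) - warp_base);
    // Full-warp mask is special-cased so the shift stays well defined.
    unsigned int active_mask = (warp_size == WARP_SIZE) ? 0xffffffffu : ((1u << warp_size) - 1u);
#pragma unroll
    for (int offset = 16; offset > 0; offset >>= 1)
    {
        int target_lane = lane_id ^ offset;
        float tmp = __shfl_xor_sync(active_mask, val, offset, WARP_SIZE);
        // A partner lane past the end of the warp contributes zero, which is
        // equivalent to padding the partial warp with zeros.
        val += (target_lane < warp_size) ? tmp : 0.f;
    }
    return val;
}

__global__ void reduceKernel(float const* in, float* out)
{
    float v = partialWarpReduceSum(in[threadIdx.x]);
    if ((threadIdx.x & (WARP_SIZE - 1)) == 0)
    {
        out[threadIdx.x / WARP_SIZE] = v;
    }
}

int main()
{
    int const n = 48; // one full warp plus a partial warp of 16 threads
    float h_in[48], h_out[2];
    for (int i = 0; i < n; ++i) h_in[i] = 1.f;

    float *d_in, *d_out;
    cudaMalloc(&d_in, n * sizeof(float));
    cudaMalloc(&d_out, 2 * sizeof(float));
    cudaMemcpy(d_in, h_in, n * sizeof(float), cudaMemcpyHostToDevice);

    reduceKernel<<<1, n>>>(d_in, d_out);
    cudaMemcpy(h_out, d_out, 2 * sizeof(float), cudaMemcpyDeviceToHost);

    // Expected: 32 for the full warp and 16 for the partial warp.
    printf("warp 0 sum = %.0f, warp 1 sum = %.0f\n", h_out[0], h_out[1]);

    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}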
@@ -409,8 +408,17 @@ inline __device__ float block_reduce_sum(float val)
         smem[warp_id] = val;
     }
     __syncthreads();
-    val = lane_id < warp_num ? smem[lane_id] : 0.f;
-    val = warpReduceSum(val);
+    if (warp_id == 0)
+    {
+        val = lane_id < warp_num ? smem[lane_id] : 0.f;
+        val = warpReduceSum(val);
+        if (lane_id == 0)
+        {
+            val = smem[0];
+        }
+    }
+    __syncthreads();
+    val = smem[0];
 
     return val;
 }
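
The hunk above restricts the second reduction stage of block_reduce_sum to warp 0 and then has every thread read the result from shared memory after a second __syncthreads(). The sketch below shows the general two-stage block-reduction pattern this corresponds to; the helper name, the 32-slot shared array, and the explicit write-back of the warp-0 result into smem[0] are assumptions for illustration (building on the partialWarpReduceSum sketch above), not lines quoted from the diff.

// Compact sketch of a two-stage block reduction: each warp reduces its own
// values, warp 0 combines the per-warp partials, and the block-wide sum is
// broadcast to all threads through shared memory.
__device__ float blockReduceSumSketch(float val)
{
    __shared__ float smem[32];                     // one slot per warp (up to 1024 threads)
    int lane_id = threadIdx.x & (WARP_SIZE - 1);
    int warp_id = threadIdx.x / WARP_SIZE;
    int warp_num = (blockDim.x + WARP_SIZE - 1) / WARP_SIZE;

    val = partialWarpReduceSum(val);               // stage 1: reduce inside each warp
    if (lane_id == 0)
    {
        smem[warp_id] = val;                       // publish this warp's partial sum
    }
    __syncthreads();

    if (warp_id == 0)
    {
        // Stage 2: only warp 0 combines the per-warp partials.
        val = lane_id < warp_num ? smem[lane_id] : 0.f;
        val = partialWarpReduceSum(val);
        if (lane_id == 0)
        {
            smem[0] = val;                         // store the block-wide sum
        }
    }
    __syncthreads();
    return smem[0];                                // every thread reads the broadcast result
}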

tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py

Lines changed: 1 addition & 1 deletion
@@ -161,7 +161,7 @@ def func(input, residual, norm_weight, eps, enable_fusion):
     )
 
 
-@pytest.mark.skip(reason="https://nvbugs/5597647")
+#@pytest.mark.skip(reason="https://nvbugs/5597647")
 @pytest.mark.skipif(torch.cuda.device_count() < 2,
                     reason="needs 2 GPUs to run this test")
 @pytest.mark.parametrize(
