From b551565b48807547fde7cc0829ef55c977ee0854 Mon Sep 17 00:00:00 2001
From: wangdongxing <wangdongxing@fangqing-system.com>
Date: Mon, 28 Jul 2025 17:45:32 +0800
Subject: [PATCH] Fix potential bug in intranode dispatch receive

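Each block is responsible for its own contiguous range of groups, [firstGroup, lastGroup), so the receive phase should walk that range with a block-stride loop: each thread starts at firstGroup + threadIdx.x and advances by blockDim.x.
The previous increment, gridDim.x * expertsPerBlock, is at least expertsPerBlock, so every thread ran at most one iteration and any group at an offset of blockDim.x or more from firstGroup was never processed (this only shows up when expertsPerBlock exceeds blockDim.x). Therefore, the increment of the for loop should be blockDim.x, rather than gridDim.x * expertsPerBlock.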
---
 csrc/all_to_all/intranode_dispatch.cu | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/csrc/all_to_all/intranode_dispatch.cu b/csrc/all_to_all/intranode_dispatch.cu
index 402773b..ac72f8f 100644
--- a/csrc/all_to_all/intranode_dispatch.cu
+++ b/csrc/all_to_all/intranode_dispatch.cu
@@ -203,8 +203,7 @@ __global__ __launch_bounds__(NUM_WARPS * 32, 1) void dispatchKernel(
     unsigned firstGroup = blockIdx.x * expertsPerBlock;
     unsigned lastGroup = std::min(firstGroup + expertsPerBlock, numExpertsAndRanks);
 
-    for (unsigned group = firstGroup + threadIdx.x; group < lastGroup;
-         group += gridDim.x * expertsPerBlock) {
+    for (unsigned group = firstGroup + threadIdx.x; group < lastGroup; group += blockDim.x) {
       const uint32_t srcRank = group / numLocalExperts;
       const uint32_t srcLocalExpert = group % numLocalExperts;