From b551565b48807547fde7cc0829ef55c977ee0854 Mon Sep 17 00:00:00 2001 From: wangdongxing Date: Mon, 28 Jul 2025 17:45:32 +0800 Subject: [PATCH] Fix potential bug in intranode dispatch receive Each block is responsible for its own contiguous range of groups, [firstGroup, lastGroup), so the receive phase should walk that range with a block-stride loop. The old increment, gridDim.x * expertsPerBlock, is never smaller than the range itself, so each thread handled at most one group and any group beyond the first blockDim.x of the range was skipped. The loop increment should therefore be blockDim.x rather than gridDim.x * expertsPerBlock. --- csrc/all_to_all/intranode_dispatch.cu | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/csrc/all_to_all/intranode_dispatch.cu b/csrc/all_to_all/intranode_dispatch.cu index 402773b..ac72f8f 100644 --- a/csrc/all_to_all/intranode_dispatch.cu +++ b/csrc/all_to_all/intranode_dispatch.cu @@ -203,8 +203,7 @@ __global__ __launch_bounds__(NUM_WARPS * 32, 1) void dispatchKernel( unsigned firstGroup = blockIdx.x * expertsPerBlock; unsigned lastGroup = std::min(firstGroup + expertsPerBlock, numExpertsAndRanks); - for (unsigned group = firstGroup + threadIdx.x; group < lastGroup; - group += gridDim.x * expertsPerBlock) { + for (unsigned group = firstGroup + threadIdx.x; group < lastGroup; group += blockDim.x) { const uint32_t srcRank = group / numLocalExperts; const uint32_t srcLocalExpert = group % numLocalExperts;