Skip to content

Commit fb0abad

Browse files
committed
fix HT combine after removing busy-wait
1 parent 38efa01 commit fb0abad

5 files changed

Lines changed: 22 additions & 1 deletion

File tree

3rdparty/deep_ep/deep_ep.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1428,6 +1428,7 @@ Buffer::internode_combine(const Tensor& x,
14281428
const Tensor& gbl_channel_prefix_matrix,
14291429
Tensor& combined_rdma_head,
14301430
Tensor& combined_nvl_head,
1431+
const int* num_recv_tokens_ptr,
14311432
const Config& config)
14321433
{
14331434
const int num_channels = config.num_sms / 2;
@@ -1545,6 +1546,7 @@ Buffer::internode_combine(const Tensor& x,
15451546
rdma_channel_prefix_matrix.data<int>(),
15461547
rdma_rank_prefix_sum.data<int>(),
15471548
gbl_channel_prefix_matrix.data<int>(),
1549+
num_recv_tokens_ptr,
15481550
num_tokens,
15491551
num_combined_tokens,
15501552
hidden,

3rdparty/deep_ep/deep_ep.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -263,6 +263,7 @@ class Buffer {
263263
const Tensor& gbl_channel_prefix_matrix,
264264
Tensor& combined_rdma_head,
265265
Tensor& combined_nvl_head,
266+
const int* num_recv_tokens_ptr,
266267
const Config& config);
267268

268269
Config get_dispatch_config();

3rdparty/deep_ep/kernels/api.cuh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,7 @@ void combine(cudaDataType_t type,
278278
const int* rdma_channel_prefix_matrix,
279279
const int* rdma_rank_prefix_sum,
280280
const int* gbl_channel_prefix_matrix,
281+
const int* num_recv_tokens_ptr,
281282
int num_tokens,
282283
int num_combined_tokens,
283284
int hidden,

3rdparty/deep_ep/kernels/internode.cu

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1901,6 +1901,7 @@ __global__ void __launch_bounds__((kNumForwarders + 1) * 32, 1) combine(int4* co
19011901
const int* rdma_channel_prefix_matrix,
19021902
const int* rdma_rank_prefix_sum,
19031903
const int* gbl_channel_prefix_matrix,
1904+
const int* num_recv_tokens_ptr,
19041905
int num_tokens,
19051906
int num_combined_tokens,
19061907
int hidden,
@@ -2005,7 +2006,12 @@ __global__ void __launch_bounds__((kNumForwarders + 1) * 32, 1) combine(int4* co
20052006
if (lane_id < kNumRDMARanks) {
20062007
int prefix_idx = (lane_id * NUM_MAX_NVL_PEERS + dst_nvl_rank) * num_channels + channel_id;
20072008
token_start_idx = gbl_channel_prefix_matrix[prefix_idx];
2008-
token_end_idx = (prefix_idx == num_channels * num_ranks - 1) ? num_tokens : gbl_channel_prefix_matrix[prefix_idx + 1];
2009+
// The last `(rdma, nvl, channel)` slot has no `+1` neighbor, so its upper bound has to come
2010+
// from the real recv-token total. When `num_recv_tokens_ptr` is supplied (HT dispatch with
2011+
// `num_worst_tokens > 0` pads `x` to the worst-case size), read the device-side total to
2012+
// avoid sending into the padding region. Otherwise fall back to the input shape.
2013+
const int real_num_tokens = num_recv_tokens_ptr != nullptr ? __ldg(num_recv_tokens_ptr) : num_tokens;
2014+
token_end_idx = (prefix_idx == num_channels * num_ranks - 1) ? real_num_tokens : gbl_channel_prefix_matrix[prefix_idx + 1];
20092015
}
20102016
__syncwarp();
20112017

@@ -2513,6 +2519,7 @@ void combine(cudaDataType_t type,
25132519
const int* rdma_channel_prefix_matrix,
25142520
const int* rdma_rank_prefix_sum,
25152521
const int* gbl_channel_prefix_matrix,
2522+
const int* num_recv_tokens_ptr,
25162523
int num_tokens,
25172524
int num_combined_tokens,
25182525
int hidden,
@@ -2568,6 +2575,7 @@ void combine(cudaDataType_t type,
25682575
rdma_channel_prefix_matrix, \
25692576
rdma_rank_prefix_sum, \
25702577
gbl_channel_prefix_matrix, \
2578+
num_recv_tokens_ptr, \
25712579
num_tokens, \
25722580
num_combined_tokens, \
25732581
hidden, \

src/turbomind/comm/nccl/nccl_ep.cu

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -345,6 +345,14 @@ void NcclCommImpl::Combine(const EpCombineInput& input, EpCombineOutput& output,
345345
auto combined_rdma_head = input.handle[8];
346346
auto combined_nvl_head = input.handle[9];
347347

348+
// Real recv-token total lives at the last slot of `recv_gbl_rank_prefix_sum`. The
349+
// internode combine kernel needs it to bound the very last (rdma, nvl, channel)
350+
// task range when HT dispatch was called with `num_worst_tokens > 0` (which pads
351+
// `input.x` past the real total).
352+
auto recv_gbl_rank_prefix_sum = input.handle[6];
353+
const int* num_recv_tokens_ptr =
354+
recv_gbl_rank_prefix_sum.data<int>() + recv_gbl_rank_prefix_sum.shape(0) - 1;
355+
348356
auto [combined_x, combined_topk_weights] = buffer_->internode_combine(input.x,
349357
std::nullopt,
350358
std::nullopt,
@@ -356,6 +364,7 @@ void NcclCommImpl::Combine(const EpCombineInput& input, EpCombineOutput& output,
356364
gbl_channel_prefix_matrix,
357365
combined_rdma_head,
358366
combined_nvl_head,
367+
num_recv_tokens_ptr,
359368
config);
360369
sync_check_cuda_error();
361370
output.out_x = combined_x;

0 commit comments

Comments (0)