First draft manuscript

zzx-study · zzx-study · commit a53f55d6b268 · 2026-02-06T14:41:16.000+08:00
Rebuild tokenIdxPerExpert: add it as a NotifyDispatchA2 output and pass it to DispatchNormalA2 as an input
diff --git a/csrc/deepep/deep_ep.cpp b/csrc/deepep/deep_ep.cpp
@@ -143,7 +143,7 @@ Buffer::get_dispatch_layout(const torch::Tensor &topk_idx, int num_experts, std:
     5. The order in which each token of this NPU is sent to various servers.
        size:[MAX_BS, serverNum]
     6. The order in which each token is sent to the expert.
-       size:[MAX_BS, numTopk]
+       size:[MAX_BS, numExpert]
     7. The server offset of tokens received by each expert from this NPU.
        size:[numExpert, MAX_BS]
     */
@@ -157,6 +157,7 @@ Buffer::get_dispatch_layout(const torch::Tensor &topk_idx, int num_experts, std:
     this->notify_send_data = notify_send_data;
     this->send_token_idx_small = send_token_idx_small;
     this->notify_send_data_size = notify_send_data_size;
+    this->tokens_per_rank = num_tokens_per_rank;
 
     std::optional<torch::Tensor> num_tokens_per_rdma_rank = std::nullopt;
     std::optional<EventHandle> output_event = std::nullopt;
@@ -770,6 +771,8 @@ Buffer::internode_dispatch(
         at::empty({num_experts, num_ranks, MAX_BATCH_SIZE}, at::dtype(at::kInt).device(x.device()));
     at::Tensor dst_offset_rank_token_idx =
         at::empty({num_experts, num_ranks, MAX_BATCH_SIZE}, at::dtype(at::kInt).device(x.device()));
+    at::Tensor token_idx_per_expert =
+        at::empty({num_ranks, num_experts}, at::dtype(at::kInt).device(x.device()));
     // The offsetInner for the current rank and the peer rank
     at::Tensor offset_inner = at::empty({2, MAX_BATCH_SIZE, num_experts}, at::dtype(at::kInt).device(x.device()));
     at::Tensor count_outer = at::empty({MAX_BATCH_SIZE}, at::dtype(at::kInt).device(x.device()));
@@ -792,7 +795,7 @@ Buffer::internode_dispatch(
                  local_rank_size, local_rank_id,
                  send_data_offset,  // A2 not use
                  recv_data, token_server_idx, token_unique_per_server, ep_rank_token_cnt, recv_tokens_per_expert,
-                 src_offset_rank_token_idx, dst_offset_rank_token_idx, offset_inner, count_outer, expand_idx,
+                 src_offset_rank_token_idx, dst_offset_rank_token_idx, token_idx_per_expert, offset_inner, count_outer, expand_idx,
                  total_recv_token);
 
     int total_count = total_recv_token.item<int>();
@@ -808,7 +811,7 @@ Buffer::internode_dispatch(
     }
 
     EXEC_NPU_CMD(aclnnDispatchNormalA2, new_x, expert_ids, x_scales, xActiveMask, new_topk_weights, token_server_idx,
-                 token_unique_per_server, ep_rank_token_cnt, src_offset_rank_token_idx, dst_offset_rank_token_idx,
+                 token_unique_per_server, ep_rank_token_cnt, src_offset_rank_token_idx, dst_offset_rank_token_idx, token_idx_per_expert,
                  hcom_ep_name, num_ranks, rank, num_experts, hcom_ep_name, tp_size, tp_rank, expertShardType,
                  sharedExpertNum, sharedExpertRankNum, quant_mode, global_bs, expertTokenNumsType, expandx_out,
                  dynamic_scales_out, expand_idx, expertTokenNums, epRecvCount, expand_scales,
diff --git a/csrc/deepep/deep_ep.hpp b/csrc/deepep/deep_ep.hpp
@@ -34,6 +34,7 @@ struct Buffer {
     at::Tensor new_scales;
     at::Tensor notify_send_data;  // only for internode notify
     at::Tensor send_token_idx_small;
+    at::Tensor tokens_per_rank;
     int notify_send_data_size;  // only for internode notify
 
     int64_t shared_expert_rank_num;
diff --git a/csrc/deepep/ops2/op_host/dispatch_normal_a2.cpp b/csrc/deepep/ops2/op_host/dispatch_normal_a2.cpp
@@ -66,6 +66,12 @@ class DispatchNormalA2 : public OpDef
             .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND})
             .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND})
             .AutoContiguous();
+        this->Input("tokenIdxPerExpert")
+            .ParamType(OPTIONAL)
+            .DataType({ge::DT_INT32, ge::DT_INT32, ge::DT_INT32})
+            .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND})
+            .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND})
+            .AutoContiguous();
 
         this->Output("recv_x")
             .ParamType(REQUIRED)
diff --git a/csrc/deepep/ops2/op_host/dispatch_normal_a2_tiling.cpp b/csrc/deepep/ops2/op_host/dispatch_normal_a2_tiling.cpp
@@ -63,6 +63,7 @@ constexpr uint32_t TOKEN_SERVER_CNT_INDEX = 6;
 constexpr uint32_t EP_RANK_TOKEN_CNT_INDEX = 7;
 constexpr uint32_t SRC_OFFSET_RANK_TOKEN_IDX_INDEX = 8;
 constexpr uint32_t DST_OFFSET_RANK_TOKEN_IDX_INDEX = 9;
+constexpr uint32_t TOKEN_IDX_PER_EXPERT_INDEX = 10;
 constexpr uint32_t OUTPUT_EXPAND_X_INDEX = 0;
 constexpr uint32_t OUTPUT_DYNAMIC_SCALES_INDEX = 1;
 constexpr uint32_t OUTPUT_EXPAND_IDX_INDEX = 2;
@@ -175,6 +176,7 @@ static bool CheckTensorDim(const gert::TilingContext &context, const char *nodeN
                     return false);
     OP_LOGD(nodeName, "expertId dim0 = %ld", expertIdStorageShape->GetStorageShape().GetDim(0));
     OP_LOGD(nodeName, "expertId dim1 = %ld", expertIdStorageShape->GetStorageShape().GetDim(1));
+
     // 如果scales不为空进行shape维度检查
     if (isScales) {
         const gert::StorageShape *scalesStorageShape = context.GetOptionalInputShape(SCALES_INDEX);
@@ -601,6 +603,7 @@ static ge::graphStatus CheckTensorShape(const gert::TilingContext &context, cons
                 expertIdsDim1),
         return ge::GRAPH_FAILED);
     tilingData.moeDistributeDispatchInfo.k = static_cast<uint32_t>(expertIdsDim1);
+
     // 校验scales的维度
     if (isScales) {
         const gert::StorageShape *scalesStorageShape = context.GetOptionalInputShape(SCALES_INDEX);
@@ -932,6 +935,10 @@ static ge::graphStatus MoeDistributeDispatchA2CheckShapeAndSetTiling(const gert:
         context.GetInputShape(DST_OFFSET_RANK_TOKEN_IDX_INDEX);
     OP_TILING_CHECK(dstOffsetRankTokenIdxStorageShape == nullptr,
                     OP_LOGE(K_INNER_DEBUG, "dstOffsetRankTokenIdxStorageShape is null."), return GRAPH_FAILED);
+    const gert::StorageShape *tokenIdxPerExpertStorageShape =
+        context.GetInputShape(TOKEN_IDX_PER_EXPERT_INDEX);
+    OP_TILING_CHECK(tokenIdxPerExpertStorageShape == nullptr,
+                    OP_LOGE(K_INNER_DEBUG, "tokenIdxPerExpertStorageShape is null."), return GRAPH_FAILED);
 
     info.isQuant = isScales;
     info.bs = bs;
diff --git a/csrc/deepep/ops2/op_host/notify_dispatch_a2.cpp b/csrc/deepep/ops2/op_host/notify_dispatch_a2.cpp
@@ -61,6 +61,10 @@ class NotifyDispatchA2 : public OpDef
             .DataType({ge::DT_FLOAT16, ge::DT_FLOAT, ge::DT_INT32})
             .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND})
             .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND});
+        this->Output("tokenIdxPerExpert")
+            .ParamType(REQUIRED)
+            .DataType({ge::DT_FLOAT16, ge::DT_FLOAT, ge::DT_INT32})
+            .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND});
         this->Output("offsetInner")
             .ParamType(REQUIRED)
             .DataType({ge::DT_FLOAT16, ge::DT_FLOAT, ge::DT_INT32})
diff --git a/csrc/deepep/ops2/op_host/notify_dispatch_tiling_a2.cc b/csrc/deepep/ops2/op_host/notify_dispatch_tiling_a2.cc
@@ -65,10 +65,11 @@ constexpr uint32_t OUTPUT_EP_RANK_TOKEN_CNT_INDEX = 4;
 constexpr uint32_t OUTPUT_LOCAL_EP_TOKEN_CNT_INDEX = 5;
 constexpr uint32_t OUTPUT_SRC_OFFSET_RANK_TOKEN_INDEX = 6;
 constexpr uint32_t OUTPUT_DST_OFFSET_RANK_TOKEN_INDEX = 7;
-constexpr uint32_t OUTPUT_OFFSET_INNER_INDEX = 8;
-constexpr uint32_t OUTPUT_COUNT_OUTER_INDEX = 9;
-constexpr uint32_t OUTPUT_EXPAND_IDX_INDEX = 10;
-constexpr uint32_t OUTPUT_TOTAL_RECV_TOKENS_INDEX = 11;
+constexpr uint32_t TOKEN_IDX_PER_EXPERT_INDEX = 8;
+constexpr uint32_t OUTPUT_OFFSET_INNER_INDEX = 9;
+constexpr uint32_t OUTPUT_COUNT_OUTER_INDEX = 10;
+constexpr uint32_t OUTPUT_EXPAND_IDX_INDEX = 11;
+constexpr uint32_t OUTPUT_TOTAL_RECV_TOKENS_INDEX = 12;
 
 constexpr uint32_t ATTR_SEND_COUNT_INDEX = 0;
 constexpr uint32_t ATTR_NUM_TOKENS_INDEX = 1;
@@ -327,6 +328,20 @@ static bool CheckTensorDataType(gert::TilingContext *context, const char *nodeNa
             static_cast<ge::DataType>(dstOffsetRankTokenIdx->GetDataType())),
         return false);
 
+    auto tokenIdxPerExpert = context->GetOutputDesc(TOKEN_IDX_PER_EXPERT_INDEX);
+    OP_TILING_CHECK(tokenIdxPerExpert == nullptr, OP_LOGE(nodeName, "tokenIdxPerExpert is null."),
+                    return false);
+    OP_TILING_CHECK(
+        (tokenIdxPerExpert->GetDataType() != ge::DT_BF16) &&
+            (tokenIdxPerExpert->GetDataType() != ge::DT_FLOAT16) &&
+            (tokenIdxPerExpert->GetDataType() != ge::DT_FLOAT) &&
+            (tokenIdxPerExpert->GetDataType() != ge::DT_INT32),
+        OP_LOGE(
+            nodeName,
+            "tokenIdxPerExpert datatype is invalid, datatype should be bf16 or float16 or float or int, but is %d.",
+            static_cast<ge::DataType>(tokenIdxPerExpert->GetDataType())),
+        return false);
+
     auto offsetInner = context->GetOutputDesc(OUTPUT_OFFSET_INNER_INDEX);
     OP_TILING_CHECK(offsetInner == nullptr, OP_LOGE(nodeName, "offsetInner is null."), return false);
     OP_TILING_CHECK(
diff --git a/csrc/deepep/ops2/op_host/op_api/aclnn_dispatch_normal_a2.cpp b/csrc/deepep/ops2/op_host/op_api/aclnn_dispatch_normal_a2.cpp
@@ -19,7 +19,7 @@ extern "C" {
 aclnnStatus aclnnDispatchNormalA2GetWorkspaceSize(
     const aclTensor *x, const aclTensor *expertIds, const aclTensor *scales, const aclTensor *xActiveMask,
     const aclTensor *expertScales, const aclTensor *tokenServerIdx, const aclTensor *tokenServerCnt,
-    const aclTensor *epRankTokenCnt, const aclTensor *srcOffsetRankTokenIdx, const aclTensor *dstOffsetRankTokenIdx,
+    const aclTensor *epRankTokenCnt, const aclTensor *srcOffsetRankTokenIdx, const aclTensor *dstOffsetRankTokenIdx, const aclTensor *tokenIdxPerExpert,
     char *groupEp, int64_t epWorldSize, int64_t epRankId, int64_t moeExpertNum, char *groupTp, int64_t tpWorldSize,
     int64_t tpRankId, int64_t expertShardType, int64_t sharedExpertNum, int64_t sharedExpertRankNum, int64_t quantMode,
     int64_t globalBs, int64_t expertTokenNumsType, const aclTensor *recvX, const aclTensor *dynamicScales,
@@ -29,7 +29,7 @@ aclnnStatus aclnnDispatchNormalA2GetWorkspaceSize(
 {
     return aclnnInnerDispatchNormalA2GetWorkspaceSize(
         x, expertIds, scales, xActiveMask, expertScales, tokenServerIdx, tokenServerCnt, epRankTokenCnt,
-        srcOffsetRankTokenIdx, dstOffsetRankTokenIdx, groupEp, epWorldSize, epRankId, moeExpertNum, groupTp,
+        srcOffsetRankTokenIdx, dstOffsetRankTokenIdx,tokenIdxPerExpert, groupEp, epWorldSize, epRankId, moeExpertNum, groupTp,
         tpWorldSize, tpRankId, expertShardType, sharedExpertNum, sharedExpertRankNum, quantMode, globalBs,
         expertTokenNumsType, recvX, dynamicScales, expandIdx, expertTokenNums, epRecvCount, expandScales,
         waitRecvCostStats, workspaceSize, executor);
diff --git a/csrc/deepep/ops2/op_host/op_api/aclnn_dispatch_normal_a2.h b/csrc/deepep/ops2/op_host/op_api/aclnn_dispatch_normal_a2.h
@@ -10,7 +10,7 @@ extern "C" {
 __attribute__((visibility("default"))) aclnnStatus aclnnDispatchNormalA2GetWorkspaceSize(
     const aclTensor *x, const aclTensor *expertIds, const aclTensor *scales, const aclTensor *xActiveMask,
     const aclTensor *expertScales, const aclTensor *tokenServerIdx, const aclTensor *tokenServerCnt,
-    const aclTensor *epRankTokenCnt, const aclTensor *srcOffsetRankTokenIdx, const aclTensor *dstOffsetRankTokenIdx,
+    const aclTensor *epRankTokenCnt, const aclTensor *srcOffsetRankTokenIdx, const aclTensor *dstOffsetRankTokenIdx,const aclTensor *tokenIdxPerExpert,
     char *groupEp, int64_t epWorldSize, int64_t epRankId, int64_t moeExpertNum, char *groupTp, int64_t tpWorldSize,
     int64_t tpRankId, int64_t expertShardType, int64_t sharedExpertNum, int64_t sharedExpertRankNum, int64_t quantMode,
     int64_t globalBs, int64_t expertTokenNumsType, const aclTensor *recvX, const aclTensor *dynamicScales,
diff --git a/csrc/deepep/ops2/op_host/op_api/aclnn_notify_dispatch_a2.cpp b/csrc/deepep/ops2/op_host/op_api/aclnn_notify_dispatch_a2.cpp
@@ -20,14 +20,14 @@ aclnnStatus aclnnNotifyDispatchA2GetWorkspaceSize(
     int64_t numTokens, int64_t topkNum, int64_t numExperts, char *commGroup, int64_t rankSize, int64_t rankId,
     int64_t localRankSize, int64_t localRankId, const aclTensor *sendDataOffset, const aclTensor *recvData,
     const aclTensor *tokenServerIdx, const aclTensor *tokenUniquePerServer, const aclTensor *epRankTokenCnt,
-    const aclTensor *localEpTokenCnt, const aclTensor *srcOffsetRankTokenIdx, const aclTensor *dstOffsetRankTokenIdx,
+    const aclTensor *localEpTokenCnt, const aclTensor *srcOffsetRankTokenIdx, const aclTensor *dstOffsetRankTokenIdx,const aclTensor *tokenIdxPerExpert,
     const aclTensor *offsetInner, const aclTensor *countOuter, const aclTensor *expandIdx,
     const aclTensor *totalRecvTokens, uint64_t *workspaceSize, aclOpExecutor **executor)
 {
     return aclnnInnerNotifyDispatchA2GetWorkspaceSize(
         sendData, tokenPerExpertData, tmpData, sendCount, numTokens, topkNum, numExperts, commGroup, rankSize, rankId,
         localRankSize, localRankId, sendDataOffset, recvData, tokenServerIdx, tokenUniquePerServer, epRankTokenCnt,
-        localEpTokenCnt, srcOffsetRankTokenIdx, dstOffsetRankTokenIdx, offsetInner, countOuter, expandIdx,
+        localEpTokenCnt, srcOffsetRankTokenIdx, dstOffsetRankTokenIdx, tokenIdxPerExpert, offsetInner, countOuter, expandIdx,
         totalRecvTokens, workspaceSize, executor);
 }
 
diff --git a/csrc/deepep/ops2/op_host/op_api/aclnn_notify_dispatch_a2.h b/csrc/deepep/ops2/op_host/op_api/aclnn_notify_dispatch_a2.h
@@ -29,6 +29,7 @@ extern "C" {
  * localEpTokenCnt : required
  * srcOffsetRankTokenIdx : required
  * dstOffsetRankTokenIdx : required
+ * tokenIdxPerExpert : required
  * offsetInner : required
  * countOuter : required
  * expandIdx : required
@@ -40,7 +41,7 @@ __attribute__((visibility("default"))) aclnnStatus aclnnNotifyDispatchA2GetWorks
     int64_t numTokens, int64_t topkNum, int64_t numExperts, char *commGroup, int64_t rankSize, int64_t rankId,
     int64_t localRankSize, int64_t localRankId, const aclTensor *sendDataOffset, const aclTensor *recvData,
     const aclTensor *tokenServerIdx, const aclTensor *tokenUniquePerServer, const aclTensor *epRankTokenCnt,
-    const aclTensor *localEpTokenCnt, const aclTensor *srcOffsetRankTokenIdx, const aclTensor *dstOffsetRankTokenIdx,
+    const aclTensor *localEpTokenCnt, const aclTensor *srcOffsetRankTokenIdx, const aclTensor *dstOffsetRankTokenIdx,const aclTensor *tokenIdxPerExpert,
     const aclTensor *offsetInner, const aclTensor *countOuter, const aclTensor *expandIdx,
     const aclTensor *totalRecvTokens, uint64_t *workspaceSize, aclOpExecutor **executor);
 
diff --git a/csrc/deepep/ops2/op_kernel/a2/moe_distribute_dispatch_a2_pipeline.h b/csrc/deepep/ops2/op_kernel/a2/moe_distribute_dispatch_a2_pipeline.h
diff --git a/csrc/deepep/ops2/op_kernel/dispatch_normal_a2.cpp b/csrc/deepep/ops2/op_kernel/dispatch_normal_a2.cpp
diff --git a/csrc/deepep/ops2/op_kernel/notify_dispatch_a2.cpp b/csrc/deepep/ops2/op_kernel/notify_dispatch_a2.cpp
diff --git a/csrc/deepep/ops2/op_kernel/notify_dispatch_a2.h b/csrc/deepep/ops2/op_kernel/notify_dispatch_a2.h