compile

zzx-study · zzx-study · commit 4059c696a58e · 2026-02-09T11:14:16.000+08:00
diff --git a/csrc/deepep/ops2/op_kernel/a2/moe_distribute_dispatch_a2_pipeline.h b/csrc/deepep/ops2/op_kernel/a2/moe_distribute_dispatch_a2_pipeline.h
@@ -11,7 +11,7 @@ namespace MoeDistributeDispatchA2Impl {
 constexpr uint32_t STATE_OFFSET = 512;                 // 状态空间偏移地址
 constexpr uint32_t STATUS_SIZE_LAYERED = 1024 * 1024;  // 1M
 constexpr uint32_t HCCS_RING_BUFFER_HEAD_TAIL = 8 * 2 * 32;
-constexpr uint32_t EACH_HCCS_RING_BUFFER_HEAD_TAIL = 8 * 2 * 32;
+constexpr uint32_t EACH_HCCS_RING_BUFFER_HEAD_TAIL = 2 * 32;
 constexpr uint32_t RING_BUFFER_HEAD_TAIL = 8 * 32;
 constexpr uint32_t RDMA_BUFFER_ALIGN = 4 * 1024;
 constexpr uint32_t SELF_STATE_OFFSET = 512 * 1024;  // 本卡状态空间偏移地址
@@ -129,9 +129,9 @@ class MoeDistributeDispatchA2Pipeline
 
     LocalTensor<int32_t> tokenServerIdxTensor_;
     LocalTensor<int32_t> serverCountTensor_;
-    LocalTensor<uint32_t> tokenStructInRdmaTensor_;
-    LocalTensor<uint32_t> tokenStructInHccsTensor_;
-    LocalTensor<uint32_t> rdmaUseTokenStructInHccsTensor_;
+    LocalTensor<uint8_t> tokenStructInRdmaTensor_;
+    LocalTensor<uint8_t> tokenStructInHccsTensor_;
+    LocalTensor<uint8_t> rdmaUseTokenStructInHccsTensor_;
 
     TBuf<> tokenServerIdxBuf_;
     TBuf<> serverCountBuf_;
@@ -159,6 +159,7 @@ class MoeDistributeDispatchA2Pipeline
     GM_ADDR expertToServerCntGM_;
     GM_ADDR shareAddrs[8];
     GM_ADDR shareAddrWins[8];
+    GM_ADDR hccsHeadTailGM[8];
 
     // tiling侧已确保数据上限，相乘不会越界，因此统一采用uint32_t进行处理
     uint32_t axisBS_{0};
@@ -316,12 +317,12 @@ __aicore__ inline void MoeDistributeDispatchA2Pipeline<TemplateMC2TypeA2Pipeline
     sendStatusTensor_.SetGlobalBuffer((__gm__ int32_t *)(windowOutGM_ + WIN_SIZE));
     readStatusTensor_.SetGlobalBuffer((__gm__ int32_t *)(windowInGM_ + WIN_SIZE));
     for (int i = 0; i < SERVER_RANK_SIZE; i++) {
-        hccsHeadTailTensor_[i].SetGlobalBuffer((__gm__ int32_t *)(hccl_.GetWindowsInAddr(rankId_ / SERVER_RANK_SIZE * SERVER_RANK_SIZE + i) + halfWinSize_ - 
-                                                        EACH_HCCS_RING_BUFFER_HEAD_TAIL * i));
+        hccsHeadTailGM[i] = (__gm__ uint8_t *)(reinterpret_cast<uint64_t>(hccl_.GetWindowsInAddr(rankId_ / SERVER_RANK_SIZE * SERVER_RANK_SIZE + i) + halfWinSize_ - 
+                                                        EACH_HCCS_RING_BUFFER_HEAD_TAIL));
     }
-    hccsHeadTailTensor_.SetGlobalBuffer((__gm__ int32_t *)(windowInGM_ + halfWinSize_ - 
-                                                        HCCS_RING_BUFFER_HEAD_TAIL));
-    rdmaHeadTailTensor_.SetGlobalBuffer((__gm__ int32_t *)(windowInGM_ + halfWinSize_ - HCCS_RING_BUFFER_HEAD_TAIL - 
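+    // NOTE(review): binding disabled, yet hccsHeadTailTensor_ is still accessed via GetValue/SetValue further down — confirm it is bound elsewhere.
+    // hccsHeadTailTensor_.SetGlobalBuffer((__gm__ int32_t *)(windowInGM_ + halfWinSize_ - HCCS_RING_BUFFER_HEAD_TAIL));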
+    rdmaHeadTailTensor_.SetGlobalBuffer((__gm__ uint32_t *)(windowInGM_ + halfWinSize_ - HCCS_RING_BUFFER_HEAD_TAIL - 
                                                         RING_BUFFER_HEAD_TAIL * serverNum));
 
     expertTokenNumsOutGM_ = expertTokenNumsOut;  // 无GlobalTensor
@@ -357,13 +358,13 @@ __aicore__ inline void MoeDistributeDispatchA2Pipeline<TemplateMC2TypeA2Pipeline
     expertToServerIdxTensor_ = expertToServerIdxBuf_.Get<uint32_t>();
 
     tpipe_->InitBuffer(tokenStructInRdmaBuf_, tokenLenInStruct_);
-    tokenStructInRdmaTensor_ = tokenStructInRdmaBuf_.Get<uint32_t>();
+    tokenStructInRdmaTensor_ = tokenStructInRdmaBuf_.Get<uint8_t>();
 
     tpipe_->InitBuffer(tokenStructInHccsBuf_, tokenLenInStruct_);
-    tokenStructInHccsTensor_ = tokenStructInHccsBuf_.Get<uint32_t>();
+    tokenStructInHccsTensor_ = tokenStructInHccsBuf_.Get<uint8_t>();
 
     tpipe_->InitBuffer(rdmaUseTokenStructInHccsBuf_, tokenLenInStruct_);
-    rdmaUseTokenStructInHccsTensor_ = rdmaUseTokenStructInHccsBuf_.Get<uint32_t>();
+    rdmaUseTokenStructInHccsTensor_ = rdmaUseTokenStructInHccsBuf_.Get<uint8_t>();
 
     tpipe_->InitBuffer(expertCountBuf_, moeExpertNum_ * sizeof(int32_t));  // moeNum * 4
     expertCountTensor_ = expertCountBuf_.Get<int32_t>();
@@ -466,7 +467,8 @@ __aicore__ inline void MoeDistributeDispatchA2Pipeline<TemplateMC2TypeA2Pipeline
     }
     taskEnd = taskStart + taskNumPerCore;
     DataCopyExtParams tokenStructParams{1, static_cast<uint32_t>(tokenStructLen_), 0, 0, 0};
-    DataCopyPadExtParams<uint32_t> tokenStructPadParams{false, 0U, 0U, 0U};
+    DataCopyPadExtParams<uint8_t> tokenStructPadParams{false, 0U, 0U, 0U};
+    DataCopyParams hccsHesdTailParams{2, sizeof(uint32_t), 0, 0};
     uint32_t processedTokenNum = 0;
     uint32_t tokenGlobalCnt = 0;
     for (int i = taskStart; i < taskEnd; i++) {
@@ -507,16 +509,20 @@ __aicore__ inline void MoeDistributeDispatchA2Pipeline<TemplateMC2TypeA2Pipeline
                     continue;
                 }
                 uint32_t localDstRank = (dstExpert - expertIdxStart) / localMoeExpertNum_; 
-                GlobalTensor<uint8_t> localDstRankRecvRingU8Tensor;
-                localDstRankRecvRingU8Tensor.SetGlobalBuffer((__gm__ uint8_t *) (hccl_.GetWindowsInAddr(rankId_)) + halfWinSize_ / 2);
-                uint32_t hcclTail = hccsHeadTailTensor_.GetValue(localDstRank * 2 + 1); //localDstRank * 2为第localDstRank个rank的hccl头尾，0为hccl头，1为hccl尾
-                uint32_t hcclHead = hccsHeadTailTensor_.GetValue(localDstRank * 2);
+                GlobalTensor<uint8_t> dstRankRecvRingU8Tensor;
+                dstRankRecvRingU8Tensor.SetGlobalBuffer((__gm__ uint8_t *) (hccl_.GetWindowsInAddr(rankId_)) + halfWinSize_ / 2);
+                LocalTensor<uint32_t> localHccsHeadTailTensor;
+                GlobalTensor<uint32_t> globalHccsHeadTailTensor;
+                globalHccsHeadTailTensor.SetGlobalBuffer((__gm__ uint32_t *)hccsHeadTailGM[rankId_ / SERVER_RANK_SIZE * SERVER_RANK_SIZE + i]);
+                DataCopy(localHccsHeadTailTensor, globalHccsHeadTailTensor[localDstRank], hccsHesdTailParams);
+                uint32_t hcclTail = localHccsHeadTailTensor.GetValue(1); 
+                uint32_t hcclHead = localHccsHeadTailTensor.GetValue(0);
                 uint32_t index = 0;
-                while (hcclHead == (hcclTail + 1) % hccsItemNum && !Ascend::AtomicCas(address + localDstRank, 0, 1)) {//谁抢到锁谁出循环
-                    hcclHead = hccsHeadTailTensor_.GetValue(localDstRank * 2); //优化点，当前处理完一整个token后再进行下一个token的处理，此处可以有优化空间，尝试跳过无空闲的hccs环形缓冲区
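+                while (hcclHead == (hcclTail + 1) % hccsItemNum) {// whoever acquires the lock leaves the loop (lock check currently commented out): && !Ascend::AtomicCas(address + localDstRank, 0, 1)
+                    hcclHead = localHccsHeadTailTensor.GetValue(0); // Optimization point: a whole token is currently processed before moving on to the next one; there is room to optimize here, e.g. by skipping HCCS ring buffers that have no free slot. NOTE(review): this re-reads only the local copy; nothing in the loop refreshes it from globalHccsHeadTailTensor — confirm.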
                 }
                 for (int k = 0; k < hccsItemNum; k++) {
-                    DataCopyPad(rdmaUseTokenStructInHccsTensor_, localDstRankRecvRingU8Tensor[k * tokenStructLen_], 
+                    DataCopyPad(rdmaUseTokenStructInHccsTensor_, dstRankRecvRingU8Tensor[k * tokenStructLen_], 
                     tokenStructParams, tokenStructPadParams);
                     LocalTensor<int> tokenIdTensor = rdmaUseTokenStructInHccsTensor_[cntOffsetInStruct_].ReinterpretCast<int>();
                     int tokenId = tokenIdTensor.GetValue(0);
@@ -526,15 +532,16 @@ __aicore__ inline void MoeDistributeDispatchA2Pipeline<TemplateMC2TypeA2Pipeline
                     }
                 }
                 SyncFunc<AscendC::HardEvent::S_MTE3>();
-                DataCopyPad(localDstRankRecvRingU8Tensor[tokenStructLen_ * index], tokenStructInRdmaTensor_, 
-                tokenStructParams, tokenStructPadParams);
+                DataCopyPad(dstRankRecvRingU8Tensor[tokenStructLen_ * index], tokenStructInRdmaTensor_, 
+                tokenStructParams);
                 tokenIdxInStructTensor.SetValue(0, -1);
                 DataCopyPad(rdmaRecvRingU8Tensor_[(i * rdmaItemNum + rdmaHead) * tokenStructLen_], tokenStructInRdmaTensor_, 
-                tokenStructParams, tokenStructPadParams);
+                tokenStructParams);
                 rdmaHead = (rdmaHead + 1) % rdmaItemNum;
                 hcclTail = (hcclTail + 1) % hccsItemNum;
                 rdmaHeadTailTensor_.SetValue(i * RING_BUFFER_HEAD_TAIL + 2, rdmaHead);
-                hccsHeadTailTensor_[localDstRank * 2].SetValue(1, hcclTail);
+                localHccsHeadTailTensor.SetValue(1, hcclTail);
+                DataCopy(globalHccsHeadTailTensor[localDstRank], localHccsHeadTailTensor, hccsHesdTailParams);
             }
             processedTokenNum++;
         }
@@ -584,7 +591,7 @@ __aicore__ inline void MoeDistributeDispatchA2Pipeline<TemplateMC2TypeA2Pipeline
     uint32_t processedTokens = 0;
     DataCopyExtParams tokenStructParams{1, static_cast<uint32_t>(tokenStructLen_), 0, 0, 0};
     DataCopyExtParams tokenParams{1, static_cast<uint32_t>(tokenLenInStruct_), 0, 0, 0};
-    DataCopyPadExtParams<uint32_t> tokenStructPadParams{false, 0U, 0U, 0U};
+    DataCopyPadExtParams<uint8_t> tokenStructPadParams{false, 0U, 0U, 0U};
     DataCopyExtParams weightParams{1, static_cast<uint32_t>(sizeof(float)), 0, 0, 0};
     DataCopyPadExtParams<uint32_t> weightExtParams{false, 0U, 0U, 0U};
     DataCopyExtParams scalesParams{1, static_cast<uint32_t>(sizeof(float)), 0, 0, 0};
@@ -601,14 +608,15 @@ __aicore__ inline void MoeDistributeDispatchA2Pipeline<TemplateMC2TypeA2Pipeline
             tokenStructParams, tokenStructPadParams);
             uint32_t expertIdxStart = localMoeExpertNum_ * rankId_;
             uint32_t expertIdxEnd = expertIdxStart + localMoeExpertNum_;
-            LocalTensor<int> tokenIdxInStructTensor = tokenStructInHccsTensor_[cntOffsetInStruct_].Reinterpret<int>();
+            LocalTensor<int> tokenIdxInStructTensor = tokenStructInHccsTensor_[cntOffsetInStruct_].ReinterpretCast<int>();
+            LocalTensor<uint8_t> tokenIdxInStructToGmTensor = tokenStructInHccsTensor_[cntOffsetInStruct_];
             uint32_t tokenIdx = tokenIdxInStructTensor.GetValue(0);
             if (tokenIdx < 0) {
                 continue;
             }
-            LocalTensor<float> weightTensor = tokenStructInHccsTensor_[weightOffsetInStruct_].Reinterpret<float>();
-            LocalTensor<ExpandXOutType> tokenOutTensor = tokenStructInHccsTensor_.Reinterpret<ExpandXOutType>();
-            LocalTensor<int> topkIdxTensor = tokenStructInHccsTensor_[expOffsetInStruct_].Reinterpret<int>();
+            LocalTensor<float> weightTensor = tokenStructInHccsTensor_[weightOffsetInStruct_].ReinterpretCast<float>();
+            LocalTensor<ExpandXOutType> tokenOutTensor = tokenStructInHccsTensor_.ReinterpretCast<ExpandXOutType>();
+            LocalTensor<int> topkIdxTensor = tokenStructInHccsTensor_[expOffsetInStruct_].ReinterpretCast<int>();
             uint32_t dstOffset = 0;
             for (int j = 0; j < axisK_; j++) {
                 SyncFunc<AscendC::HardEvent::MTE3_S>();
@@ -622,15 +630,14 @@ __aicore__ inline void MoeDistributeDispatchA2Pipeline<TemplateMC2TypeA2Pipeline
                 DataCopyPad(expandXOutGMTensor_[dstOffset], tokenOutTensor, tokenParams);
                 // dynamic scales to output
                 if constexpr (DynamicQuant) {
-                    LocalTensor<float> quantTempUB = localUB[scaleOffsetInStruct_].ReinterpretCast<float>();
+                    LocalTensor<float> quantTempUB = tokenStructInHccsTensor_[scaleOffsetInStruct_].ReinterpretCast<float>();
                     DataCopyPad(dynamicScalesOutGMTensor_[dstOffset], quantTempUB, scalesParams);
                 }
             }
             tokenIdxInStructTensor.SetValue(0, -1);
-            DataCopyPad(hccsRecvRingU8Tensor_[tokenStructLen_ * i], tokenIdxInStructTensor, tokenStructParams, 
-            tokenStructPadParams);
+            DataCopyPad(hccsRecvRingU8Tensor_[tokenStructLen_ * i], tokenIdxInStructToGmTensor, tokenStructParams);
             uint32_t hcclHead = hccsHeadTailTensor_.GetValue(localRankId * 2); //需要一个锁，避免多个core同时更新本rank的head
-            hccsTailTensor_.SeteValue(localRankId, hcclHead + 1);
+            hccsHeadTailTensor_.SetValue(localRankId, hcclHead + 1);
             ++processedTokens;
         }
     }