Skip to content

Commit c84376f

Browse files
committed
Optimize the performance of the Combine Ant Moving function.
1 parent ea4949d commit c84376f

File tree

1 file changed

+1
-6
lines changed

1 file changed

+1
-6
lines changed

csrc/deepep/ops/op_kernel/cam_moe_combine_normal_multi_round.h

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -293,7 +293,6 @@ __aicore__ inline void CamMoeCombineNormalMultiRound<TemplateMC2TypeFunc>::InitR
293293

294294
// 创建localCopyQueue_, 用于存放从GM拷贝到UB的token
295295
tpipe_->InitBuffer(localCopyQueue_, DOUBLE_BUFFER, h32AlignRecvXLen_); // 28KB
296-
PipeBarrier<PIPE_ALL>();
297296
}
298297

299298
template <TemplateMC2TypeClass>
@@ -339,7 +338,6 @@ __aicore__ inline void CamMoeCombineNormalMultiRound<TemplateMC2TypeFunc>::Init(
339338
localRankGM_ = GetBufferAddrByRankId(epRankId_);
340339
DataCacheCleanAndInvalid<SrcInfoType, CacheLine::SINGLE_CACHE_LINE, DcciDst::CACHELINE_OUT>(
341340
epRecvCountGM_[moeExpertNum_ - 1]);
342-
PipeBarrier<PIPE_ALL>();
343341

344342
InitRoundSendData();
345343
InitRoundRecvData();
@@ -453,7 +451,6 @@ __aicore__ inline void CamMoeCombineNormalMultiRound<TemplateMC2TypeFunc>::SetSt
453451
GlobalTensor<uint32_t> stateGMTensor;
454452
stateGMTensor.SetGlobalBuffer((__gm__ uint32_t *)stateGM);
455453
DataCopy<uint32_t>(stateGMTensor, setStateLT_, FLOAT_NUM_PER_ALIGN);
456-
PipeBarrier<PIPE_ALL>();
457454
}
458455

459456
template <TemplateMC2TypeClass>
@@ -516,7 +513,6 @@ __aicore__ inline void CamMoeCombineNormalMultiRound<TemplateMC2TypeFunc>::ReadB
516513
Cast(xOutLocal, sumFloatBufLocal, AscendC::RoundMode::CAST_RINT, axisH_);
517514
SyncFunc<AscendC::HardEvent::V_MTE3>();
518515
DataCopyPad(xOutGlobal_[xOutTokenIdx * axisH_], xOutLocal, xOutCopyParams);
519-
PipeBarrier<PIPE_ALL>();
520516
}
521517

522518
template <TemplateMC2TypeClass>
@@ -535,7 +531,7 @@ __aicore__ inline void CamMoeCombineNormalMultiRound<TemplateMC2TypeFunc>::ReadB
535531
const DataCopyPadExtParams<float> copyPadFloatParams{false, 0U, 0U, 0U};
536532
DataCopyPad(topkWeightsLT_, topkWeightsGM_[(xOutTokenOffset_ + roundRecvStartTokenIdx_) * axisK_], bskParams,
537533
copyPadFloatParams);
538-
PipeBarrier<PIPE_ALL>();
534+
SyncFunc<AscendC::HardEvent::MTE2_S>();
539535

540536
for (uint32_t roundTokenIdx = roundRecvStartTokenIdx_; roundTokenIdx < roundRecvEndTokenIdx_;
541537
roundTokenIdx++) { // 每轮都从hccl buffer起始位置读put来的数据
@@ -597,7 +593,6 @@ __aicore__ inline void CamMoeCombineNormalMultiRound<TemplateMC2TypeFunc>::WaitR
597593
Duplicate<float>(tempRoundStateTensorLocal, (float)0.0, count);
598594
SyncFunc<AscendC::HardEvent::V_MTE3>();
599595
DataCopy<float>(roundStatusGMTensor, tempRoundStateTensorLocal, count);
600-
PipeBarrier<PIPE_ALL>();
601596
}
602597

603598
template <TemplateMC2TypeClass>

0 commit comments

Comments
 (0)