@@ -293,7 +293,6 @@ __aicore__ inline void CamMoeCombineNormalMultiRound<TemplateMC2TypeFunc>::InitR
293293
294294 // 创建localCopyQueue_, 用于存放从GM拷贝到UB的token
295295 tpipe_->InitBuffer (localCopyQueue_, DOUBLE_BUFFER, h32AlignRecvXLen_); // 28KB
296- PipeBarrier<PIPE_ALL>();
297296}
298297
299298template <TemplateMC2TypeClass>
@@ -339,7 +338,6 @@ __aicore__ inline void CamMoeCombineNormalMultiRound<TemplateMC2TypeFunc>::Init(
339338 localRankGM_ = GetBufferAddrByRankId (epRankId_);
340339 DataCacheCleanAndInvalid<SrcInfoType, CacheLine::SINGLE_CACHE_LINE, DcciDst::CACHELINE_OUT>(
341340 epRecvCountGM_[moeExpertNum_ - 1 ]);
342- PipeBarrier<PIPE_ALL>();
343341
344342 InitRoundSendData ();
345343 InitRoundRecvData ();
@@ -453,7 +451,6 @@ __aicore__ inline void CamMoeCombineNormalMultiRound<TemplateMC2TypeFunc>::SetSt
453451 GlobalTensor<uint32_t > stateGMTensor;
454452 stateGMTensor.SetGlobalBuffer ((__gm__ uint32_t *)stateGM);
455453 DataCopy<uint32_t >(stateGMTensor, setStateLT_, FLOAT_NUM_PER_ALIGN);
456- PipeBarrier<PIPE_ALL>();
457454}
458455
459456template <TemplateMC2TypeClass>
@@ -516,7 +513,6 @@ __aicore__ inline void CamMoeCombineNormalMultiRound<TemplateMC2TypeFunc>::ReadB
516513 Cast (xOutLocal, sumFloatBufLocal, AscendC::RoundMode::CAST_RINT, axisH_);
517514 SyncFunc<AscendC::HardEvent::V_MTE3>();
518515 DataCopyPad (xOutGlobal_[xOutTokenIdx * axisH_], xOutLocal, xOutCopyParams);
519- PipeBarrier<PIPE_ALL>();
520516}
521517
522518template <TemplateMC2TypeClass>
@@ -535,7 +531,7 @@ __aicore__ inline void CamMoeCombineNormalMultiRound<TemplateMC2TypeFunc>::ReadB
535531 const DataCopyPadExtParams<float > copyPadFloatParams{false , 0U , 0U , 0U };
536532 DataCopyPad (topkWeightsLT_, topkWeightsGM_[(xOutTokenOffset_ + roundRecvStartTokenIdx_) * axisK_], bskParams,
537533 copyPadFloatParams);
538- PipeBarrier<PIPE_ALL >();
534+ SyncFunc<AscendC::HardEvent::MTE2_S >();
539535
540536 for (uint32_t roundTokenIdx = roundRecvStartTokenIdx_; roundTokenIdx < roundRecvEndTokenIdx_;
541537 roundTokenIdx++) { // 每轮都从从hccl buffer起始位置读put来的数据
@@ -597,7 +593,6 @@ __aicore__ inline void CamMoeCombineNormalMultiRound<TemplateMC2TypeFunc>::WaitR
597593 Duplicate<float >(tempRoundStateTensorLocal, (float )0.0 , count);
598594 SyncFunc<AscendC::HardEvent::V_MTE3>();
599595 DataCopy<float >(roundStatusGMTensor, tempRoundStateTensorLocal, count);
600- PipeBarrier<PIPE_ALL>();
601596}
602597
603598template <TemplateMC2TypeClass>
0 commit comments