diff --git a/llvm/lib/Target/AIE/AIEBasePipelinerLoopInfo.cpp b/llvm/lib/Target/AIE/AIEBasePipelinerLoopInfo.cpp
index a6906d963467..35c47b975854 100644
--- a/llvm/lib/Target/AIE/AIEBasePipelinerLoopInfo.cpp
+++ b/llvm/lib/Target/AIE/AIEBasePipelinerLoopInfo.cpp
@@ -14,6 +14,8 @@
 
 #include "AIEBasePipelinerLoopInfo.h"
 #include "AIEBaseInstrInfo.h"
+#include "AIELoopClass.h"
+#include "AIESlotStatistics.h"
 #include "Utils/AIELoopUtils.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
@@ -647,7 +649,9 @@ class ZeroOverheadLoop : public AIEBasePipelinerLoopInfo {
   MachineInstr *DefTripCount;
   MachineBasicBlock *LoopStartBlock;
 
-  // Decide whether the postpipeliner may do a better job
+  // Decide whether the postpipeliner may do a better job,
+  // A priori, or after scheduling
+  bool preferPostPipeliner();
   bool preferPostPipeliner(SMSchedule &SMS);
 
 public:
@@ -665,6 +669,28 @@ class ZeroOverheadLoop : public AIEBasePipelinerLoopInfo {
   bool shouldUseSchedule(SwingSchedulerDAG &SSD, SMSchedule &SMS) override;
 };
 
+// Loop classes, as returned by classifyLoop(), for which software pipelining
+// is declined here so that the loop is left to the postpipeliner.
+static const std::set<int> RejectedLoopClasses{1000};
+
+// A priori check, made before any schedule exists: classify the loop from the
+// slot statistics of LoopBlock and report whether its class is one of the
+// RejectedLoopClasses. Returns true when the loop should be left alone.
+bool ZeroOverheadLoop::preferPostPipeliner() {
+  AIE::SlotStatistics Stats = AIE::computeSlotStatistics(*LoopBlock, &TII);
+  int LoopClass = classifyLoop(Stats);
+  LLVM_DEBUG(dbgs() << "Stats="; Stats.dumpShort();
+             dbgs() << format("\nLoopClass=%d\n", LoopClass));
+
+  if (RejectedLoopClasses.count(LoopClass)) {
+    LLVM_DEBUG(dbgs() << format("PLI: Leaving loopclass %d for PostPipeliner\n",
+                                LoopClass));
+    return true;
+  }
+
+  return false;
+}
+
 ZeroOverheadLoop::Assessment ZeroOverheadLoop::accept(MachineInstr *EndLoop) {
   if (!MinTripCount) {
     LLVM_DEBUG(dbgs() << "Unbounded loop detected!\n");
@@ -718,6 +744,10 @@ ZeroOverheadLoop::Assessment ZeroOverheadLoop::accept(MachineInstr *EndLoop) {
     setMinTripCount(InitVal);
   }
 
+  if (preferPostPipeliner()) {
+    return Assessment::PostPipelinerCandidate;
+  }
+
   LLVM_DEBUG(dbgs() << "Loop accepted\n");
   return Assessment::Accept;
 }
@@ -853,7 +883,6
@@ createAIEBasePipelinerLoopInfo(MachineInstr *EndLoop, const AIEBaseInstrInfo &TII) { LLVM_DEBUG(dbgs() << "PLI: ----START LOOP----\n"); LLVM_DEBUG(dbgs() << " Trying DownCountLoop\n"); - DownCountLoop DCL(EndLoop, TII); auto Outcome = DCL.accept(EndLoop); if (Outcome == AIEBasePipelinerLoopInfo::Assessment::Accept) { diff --git a/llvm/lib/Target/AIE/AIEBasePipelinerLoopInfo.h b/llvm/lib/Target/AIE/AIEBasePipelinerLoopInfo.h index d7ad42746300..c542fffb54e8 100644 --- a/llvm/lib/Target/AIE/AIEBasePipelinerLoopInfo.h +++ b/llvm/lib/Target/AIE/AIEBasePipelinerLoopInfo.h @@ -4,7 +4,7 @@ // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -// (c) Copyright 2023-2024 Advanced Micro Devices, Inc. or its affiliates +// (c) Copyright 2023-2025 Advanced Micro Devices, Inc. or its affiliates // //===----------------------------------------------------------------------===// // @@ -74,7 +74,8 @@ class AIEBasePipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo { UnboundedLoop, UnsuitableInitVal, InitStepMismatch, - TooLowMinTripCount + TooLowMinTripCount, + PostPipelinerCandidate }; AIEBasePipelinerLoopInfo(MachineInstr *EndLoop, const AIEBaseInstrInfo &TII); diff --git a/llvm/lib/Target/AIE/AIELoopClass.cpp b/llvm/lib/Target/AIE/AIELoopClass.cpp index 2b30e5b3a028..5625c3b055d0 100644 --- a/llvm/lib/Target/AIE/AIELoopClass.cpp +++ b/llvm/lib/Target/AIE/AIELoopClass.cpp @@ -78,6 +78,10 @@ static const KernelFeatures Kernels[] = { {45, {{900, 300, 0, 2520, 0, 0, 3360}, {45, 45, 0, 45, 45}}}, {46, {{0, 0, 0, 0, 2160, 0, 120, 1080}, {0, 420, 420}}}, {47, {{0, 0, 0, 0, 360, 0, 240, 360}, {0, 60, 60}}}, + // These are pre-regalloc + {1000, {{0, 0, 0, 0, 480, 0, 480, 480}, {0, 120, 120}}}, + {1001, {{0, 0, 0, 0, 360, 0, 480, 720}, {0, 60, 60}}}, + }; std::vector getLoopClassScores(const SlotStatistics &Stats) { diff --git a/llvm/test/CodeGen/AIE/aie2/end-to-end/TanhTemplated-swp.ll 
b/llvm/test/CodeGen/AIE/aie2/end-to-end/TanhTemplated-swp.ll index 756027fa70eb..401aa4affd85 100644 --- a/llvm/test/CodeGen/AIE/aie2/end-to-end/TanhTemplated-swp.ll +++ b/llvm/test/CodeGen/AIE/aie2/end-to-end/TanhTemplated-swp.ll @@ -111,22 +111,22 @@ define dso_local void @TanhTemplated(ptr noalias %ifm, ptr noalias %ofm, ptr non ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vband x9, x10, x3; nopv ; CHECK-NEXT: vldb wl7, [p0], #32; vmov wh3, wl2 -; CHECK-NEXT: nopx ; vmov wh9, wl2; vmul.f bmh4, x7, x0, r1 -; CHECK-NEXT: vconv.bf16.fp32 wl7, bml4; vldb wl7, [p0], #32; vmax_lt.bf16 x5, r16, x11, x8; vmac.f bmh3, bmh0, x3, x4, r1 +; CHECK-NEXT: nopx ; vmov wh9, wl2; vmul.f bmh5, x7, x0, r1 +; CHECK-NEXT: vconv.bf16.fp32 wl7, bml4; vldb wl7, [p0], #32; vmax_lt.bf16 x5, r16, x11, x8; vmac.f bmh4, bmh0, x3, x4, r1 ; CHECK-NEXT: vband x9, x10, x5; vmul.f bmh2, x6, x9, r1 -; CHECK-NEXT: vmov wh9, wl2; vmul.f bmh5, x7, x0, r1 -; CHECK-NEXT: vsub.f bml1, bmh4, bmh1, r0 -; CHECK-NEXT: vmul.f bmh7, x6, x9, r1 -; CHECK-NEXT: vmul.f bmh6, x0, x7, r1 -; CHECK-NEXT: vmov wh5, wl2; vsub.f bml2, bmh5, bmh1, r0 +; CHECK-NEXT: vmov wh9, wl2; vmul.f bmh6, x7, x0, r1 +; CHECK-NEXT: vsub.f bml0, bmh5, bmh1, r0 +; CHECK-NEXT: vmul.f bmh3, x6, x9, r1 +; CHECK-NEXT: vmul.f bmh7, x0, x7, r1 +; CHECK-NEXT: vmov wh5, wl2; vsub.f bml1, bmh6, bmh1, r0 ; CHECK-NEXT: vconv.bf16.fp32 wl7, bmh2; vmul.f bmh8, x0, x7, r1 -; CHECK-NEXT: vmac.f bml0, bmh0, x5, x4, r1 -; CHECK-NEXT: vmsc.f bml3, bmh3, x7, x3, r1 +; CHECK-NEXT: vmac.f bml2, bmh0, x5, x4, r1 +; CHECK-NEXT: vmsc.f bml3, bmh4, x7, x3, r1 +; CHECK-NEXT: vconv.bf16.fp32 wl3, bmh3 ; CHECK-NEXT: vconv.bf16.fp32 wl3, bmh7 -; CHECK-NEXT: vconv.bf16.fp32 wl3, bmh6 -; CHECK-NEXT: vst.conv.bf16.fp32 bml1, [p1], #32; vmsc.f bml4, bml0, x3, x5, r1 +; CHECK-NEXT: vst.conv.bf16.fp32 bml0, [p1], #32; vmsc.f bml4, bml2, x3, x5, r1 ; CHECK-NEXT: vconv.bf16.fp32 wl5, bmh8; vmin_ge.bf16 x9, r16, x3, x1 -; 
CHECK-NEXT: vst.conv.bf16.fp32 bml2, [p1], #32; vmax_lt.bf16 x3, r16, x9, x8 +; CHECK-NEXT: vst.conv.bf16.fp32 bml1, [p1], #32; vmax_lt.bf16 x3, r16, x9, x8 ; CHECK-NEXT: .L_LEnd0: ; CHECK-NEXT: nopb ; nopa ; vconv.bf16.fp32 wl7, bml3; nopx ; vmin_ge.bf16 x11, r16, x5, x1; nopv ; CHECK-NEXT: // %bb.2: