diff --git a/llvm/include/llvm/CodeGen/MachineScheduler.h b/llvm/include/llvm/CodeGen/MachineScheduler.h
index 660670ccdcd75..f44afd48f7f29 100644
--- a/llvm/include/llvm/CodeGen/MachineScheduler.h
+++ b/llvm/include/llvm/CodeGen/MachineScheduler.h
@@ -1068,6 +1068,15 @@ class SchedBoundary {
   /// Dump the state of the information that tracks resource usage.
   void dumpReservedCycles() const;
   void dumpScheduledState() const;
+
+  /// Bump the cycle until the given SU is released from the pending queue.
+  void bumpCycleUntilReleaseSUFromPending(SUnit *SU, unsigned ReadyListLimit) {
+    while (Available.size() < ReadyListLimit &&
+           llvm::find(Pending, SU) != Pending.end()) {
+      bumpCycle(CurrCycle + 1);
+      releasePending();
+    }
+  }
 };
 
 /// Base class for GenericScheduler. This class maintains information about
@@ -1262,6 +1271,8 @@ class GenericScheduler : public GenericSchedulerBase {
     BotCand.SU = nullptr;
   }
 
+  void bumpCycleUntilReleaseSUFromPending(bool IsTop);
+
   void registerRoots() override;
 
 protected:
diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
index 114149ff53d85..62e622800d70f 100644
--- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
@@ -1190,6 +1190,22 @@ class TargetRegisterInfo : public MCRegisterInfo {
     return false;
  }
 
+  /// Based on the target and the current register pressure information from
+  /// the scheduler, determine whether to release nodes in the pending queue.
+  virtual bool
+  shouldReleasePendingQueue(MachineFunction &MF,
+                            ArrayRef<unsigned> MaxSetPressure) const {
+    return false;
+  }
+
+  /// For each SUnit, determine whether to release it from the pending queue
+  /// based on the register pressure changes associated with that SUnit.
+  virtual bool shouldReleaseSUFromPendingQueue(MachineFunction &MF,
+                                               ArrayRef<unsigned> PSetID,
+                                               ArrayRef<int> UnitInc) const {
+    return false;
+  }
+
   //===--------------------------------------------------------------------===//
   /// Debug information queries.
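Reviewer note (not part of the patch): both hooks above default to false, so targets that do not override them keep today's scheduling behaviour. As a rough sketch of how a target is expected to wire them up, the fragment below mirrors the structure of the RISC-V override added later in this patch; MyTargetRegisterInfo, the VecRegs pressure set, and the slack/threshold constants are hypothetical, and the ArrayRef element types (unsigned pressure-set IDs and values, int unit increments) are the ones assumed throughout this review rewrite.

// Illustrative sketch only; the target name, pressure set, and constants are
// hypothetical. It follows the contract of the two hooks declared above.
bool MyTargetRegisterInfo::shouldReleasePendingQueue(
    MachineFunction &MF, ArrayRef<unsigned> MaxSetPressure) const {
  // Ask for an early release once the vector pressure set gets within
  // `Slack` registers of its limit.
  const unsigned Slack = 4;
  for (unsigned Idx = 0; Idx < MaxSetPressure.size(); ++Idx)
    if (Idx == MyTarget::RegisterPressureSets::VecRegs &&
        MaxSetPressure[Idx] + Slack > getRegPressureSetLimit(MF, Idx))
      return true;
  return false;
}

bool MyTargetRegisterInfo::shouldReleaseSUFromPendingQueue(
    MachineFunction &MF, ArrayRef<unsigned> PSetID,
    ArrayRef<int> UnitInc) const {
  // Release an SU whose PressureDiff shows a clear decrease of the vector
  // pressure set; other SUs stay in the pending queue.
  for (unsigned Idx = 0; Idx < PSetID.size(); ++Idx)
    if (PSetID[Idx] == MyTarget::RegisterPressureSets::VecRegs &&
        UnitInc[Idx] < -2)
      return true;
  return false;
}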
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index 393530f56cc27..1aba63df632a4 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -161,6 +161,10 @@ static cl::opt<bool> EnableCyclicPath("misched-cyclicpath", cl::Hidden,
 static cl::opt<bool> EnableMemOpCluster("misched-cluster", cl::Hidden,
                                         cl::desc("Enable memop clustering."),
                                         cl::init(true));
+static cl::opt<bool>
+    EnableReleasePendingQ("misched-release-pending-queue", cl::Hidden,
+                          cl::desc("Release the pending queue"),
+                          cl::init(true));
 static cl::opt<bool> ForceFastCluster(
     "force-fast-cluster", cl::Hidden,
     cl::desc("Switch to fast cluster algorithm with the lost "
@@ -3656,6 +3660,37 @@ void GenericScheduler::pickNodeFromQueue(SchedBoundary &Zone,
   }
 }
 
+void GenericScheduler::bumpCycleUntilReleaseSUFromPending(bool IsTop) {
+  if (!DAG->isTrackingPressure())
+    return;
+  auto ReleasePending = [&](ReadyQueue &Q, const RegPressureTracker &RegP,
+                            ArrayRef<unsigned> MaxSetP, SchedBoundary &SchedB) {
+    for (SUnit *SU : Q) {
+      RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RegP);
+      CandPolicy TempPolicy;
+      SchedCandidate TryCand(TempPolicy);
+      initCandidate(TryCand, SU, IsTop, RegP, TempTracker);
+      PressureDiff PDiff = DAG->getPressureDiff(SU);
+      SmallVector<unsigned> PSetIDs;
+      SmallVector<int> UnitIncs;
+      for (const auto &PChange : PDiff) {
+        if (!PChange.isValid())
+          continue;
+        PSetIDs.push_back(PChange.getPSet());
+        UnitIncs.push_back(PChange.getUnitInc());
+      }
+      if (TRI->shouldReleaseSUFromPendingQueue(DAG->MF, PSetIDs, UnitIncs))
+        SchedB.bumpCycleUntilReleaseSUFromPending(SU, ReadyListLimit);
+    }
+  };
+  if (IsTop)
+    ReleasePending(Top.Pending, DAG->getTopRPTracker(),
+                   DAG->getTopRPTracker().getPressure().MaxSetPressure, Top);
+  else
+    ReleasePending(Bot.Pending, DAG->getBotRPTracker(),
+                   DAG->getBotRPTracker().getPressure().MaxSetPressure, Bot);
+}
+
 /// Pick the best candidate node from either the top or bottom queue.
 SUnit *GenericScheduler::pickNodeBidirectional(bool &IsTopNode) {
   // Schedule as far as possible in the direction of no choice. This is most
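Reviewer note (not part of the patch): the GenericScheduler hook above only gathers each pending SU's PressureDiff entries and defers the policy to the target via shouldReleaseSUFromPendingQueue; the actual cycle advancing happens in SchedBoundary::bumpCycleUntilReleaseSUFromPending, which keeps bumping CurrCycle and re-running releasePending() until the SU reaches the Available queue or ReadyListLimit is hit. The standalone snippet below only spells out the threshold arithmetic with the RISC-V defaults introduced later in this patch (pressure slack 7, unit-increment threshold -3); the pressure-set limit of 24 is an assumed example value, not something read from the target description.

// Toy illustration of the two release predicates; compiles with any C++
// compiler. Real limits come from TargetRegisterInfo::getRegPressureSetLimit().
#include <cstdio>

static bool zoneIsHighPressure(unsigned MaxSetPressure, unsigned Limit,
                               unsigned Slack = 7) {
  // Mirrors: MaxSetPressure[Idx] + RVVReleasePendingPressureThreshold > limit.
  return MaxSetPressure + Slack > Limit;
}

static bool suReducesPressure(int UnitInc, int Threshold = -3) {
  // Mirrors: UnitInc[Idx] < UnitIncRVVRegPressureThreshold.
  return UnitInc < Threshold;
}

int main() {
  // Zone pressure 18 against an assumed limit of 24: 18 + 7 > 24, so the
  // pending queue is scanned for releasable SUs.
  std::printf("scan pending queue: %d\n", zoneIsHighPressure(18, 24));
  // An SU that drops the vector set by 4 units (-4 < -3) is released early;
  // one that only drops it by 2 units is not.
  std::printf("release SU(-4): %d  release SU(-2): %d\n",
              suReducesPressure(-4), suReducesPressure(-2));
  return 0;
}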
  // efficient, but also provides the best heuristics for CriticalPSets.
@@ -3741,6 +3776,14 @@ SUnit *GenericScheduler::pickNode(bool &IsTopNode) {
            Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
     return nullptr;
   }
+
+  if (EnableReleasePendingQ) {
+    if (!RegionPolicy.OnlyBottomUp && TRI->shouldReleasePendingQueue(DAG->MF, DAG->getTopRPTracker().getPressure().MaxSetPressure))
+      bumpCycleUntilReleaseSUFromPending(/*IsTop=*/true);
+    if (!RegionPolicy.OnlyTopDown && TRI->shouldReleasePendingQueue(DAG->MF, DAG->getBotRPTracker().getPressure().MaxSetPressure))
+      bumpCycleUntilReleaseSUFromPending(/*IsTop=*/false);
+  }
+
   SUnit *SU;
   do {
     if (RegionPolicy.OnlyTopDown) {
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
index b0a52698c1e9f..5caf8158be008 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -37,6 +37,18 @@ static cl::opt<bool>
                          cl::desc("Disable two address hints for register "
                                   "allocation"));
 
+static cl::opt<unsigned> RVVReleasePendingPressureThreshold(
+    "riscv-release-pending-queue-rvv-pressure-threshold", cl::Hidden,
+    cl::desc("Release pending queue when vector register pressure exceeds "
+             "threshold"),
+    cl::init(7));
+
+static cl::opt<int> UnitIncRVVRegPressureThreshold(
+    "riscv-release-pending-queue-unit-inc-rvv-pressure-threshold", cl::Hidden,
+    cl::desc("Release pending queue when vector register unit pressure "
+             "increase exceeds threshold"),
+    cl::init(-3));
+
 static_assert(RISCV::X1 == RISCV::X0 + 1, "Register list not consecutive");
 static_assert(RISCV::X31 == RISCV::X0 + 31, "Register list not consecutive");
 static_assert(RISCV::F1_H == RISCV::F0_H + 1, "Register list not consecutive");
@@ -954,3 +966,47 @@ bool RISCVRegisterInfo::getRegAllocationHints(
 
   return BaseImplRetVal;
 }
+
+static bool isRVVPressureSetIdx(unsigned Idx) {
+  const static std::vector<unsigned> RVVPressureSetIndices = {
+      RISCV::RegisterPressureSets::VRM8NoV0, RISCV::RegisterPressureSets::VM};
+
+  return llvm::find(RVVPressureSetIndices, Idx) != RVVPressureSetIndices.end();
+}
+
+bool RISCVRegisterInfo::shouldReleasePendingQueue(
+    MachineFunction &MF, ArrayRef<unsigned> MaxSetPressure) const {
+
+  if (!MF.getSubtarget<RISCVSubtarget>().hasVInstructions())
+    return false;
+
+  for (unsigned Idx = 0; Idx < MaxSetPressure.size(); Idx++) {
+    // Consider only the RVV registers, as RVV spilling/reloading has higher
+    // potential costs than hazards.
+    if (!isRVVPressureSetIdx(Idx))
+      continue;
+    if (MaxSetPressure[Idx] + RVVReleasePendingPressureThreshold >
+        getRegPressureSetLimit(MF, Idx)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+bool RISCVRegisterInfo::shouldReleaseSUFromPendingQueue(
+    MachineFunction &MF, ArrayRef<unsigned> PSetID,
+    ArrayRef<int> UnitInc) const {
+
+  if (!MF.getSubtarget<RISCVSubtarget>().hasVInstructions())
+    return false;
+
+  for (unsigned Idx = 0; Idx < PSetID.size(); Idx++) {
+    // Consider only the RVV registers, as RVV spilling/reloading has higher
+    // potential costs than hazards.
+ if (!isRVVPressureSetIdx(PSetID[Idx])) + continue; + if (UnitInc[Idx] < UnitIncRVVRegPressureThreshold) + return true; + } + return false; +} diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h index 3ab79694e175c..026d00f253e91 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h @@ -144,6 +144,13 @@ struct RISCVRegisterInfo : public RISCVGenRegisterInfo { static bool isRVVRegClass(const TargetRegisterClass *RC) { return RISCVRI::isVRegClass(RC->TSFlags); } + bool + shouldReleasePendingQueue(MachineFunction &MF, + ArrayRef MaxSetPressure) const override; + + bool shouldReleaseSUFromPendingQueue(MachineFunction &MF, + ArrayRef PSetID, + ArrayRef UnitInc) const override; }; } // namespace llvm diff --git a/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll index 1ed84316d4484..cd795f722676b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll @@ -1137,38 +1137,38 @@ define @bitreverse_nxv8i64( %va) { ; RV32-NEXT: li a1, 56 ; RV32-NEXT: li a2, 40 ; RV32-NEXT: lui a3, 16 -; RV32-NEXT: lui a4, 4080 -; RV32-NEXT: addi a5, sp, 8 -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV32-NEXT: vsetvli a4, zero, e64, m8, ta, ma ; RV32-NEXT: vsrl.vx v16, v8, a1 ; RV32-NEXT: vsrl.vx v24, v8, a2 -; RV32-NEXT: addi a0, a3, -256 +; RV32-NEXT: addi a3, a3, -256 ; RV32-NEXT: vsll.vx v0, v8, a1 -; RV32-NEXT: vand.vx v24, v24, a0 +; RV32-NEXT: vand.vx v24, v24, a3 ; RV32-NEXT: vor.vv v16, v24, v16 ; RV32-NEXT: addi a1, sp, 16 ; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vand.vx v16, v8, a0 +; RV32-NEXT: vand.vx v16, v8, a3 ; RV32-NEXT: vsll.vx v16, v16, a2 ; RV32-NEXT: vor.vv v16, v0, v16 -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; RV32-NEXT: vlse64.v v0, (a5), zero -; RV32-NEXT: vsrl.vi v16, v8, 24 -; RV32-NEXT: vand.vx v16, v16, a4 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v0, v8, 24 +; RV32-NEXT: lui a1, 4080 +; RV32-NEXT: addi a2, sp, 8 +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: sw zero, 12(sp) +; RV32-NEXT: vand.vx v0, v0, a1 ; RV32-NEXT: vsrl.vi v24, v8, 8 -; RV32-NEXT: vand.vv v24, v24, v0 -; RV32-NEXT: vor.vv v16, v24, v16 +; RV32-NEXT: vlse64.v v16, (a2), zero +; RV32-NEXT: vand.vv v24, v24, v16 +; RV32-NEXT: vor.vv v24, v24, v0 ; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v24, v16, v24 -; RV32-NEXT: vand.vv v16, v8, v0 -; RV32-NEXT: vand.vx v8, v8, a4 +; RV32-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v24, v24, v0 +; RV32-NEXT: vand.vv v16, v8, v16 +; RV32-NEXT: vand.vx v8, v8, a1 ; RV32-NEXT: vsll.vi v8, v8, 24 ; RV32-NEXT: vsll.vi v16, v16, 8 ; RV32-NEXT: vor.vv v8, v8, v16 diff --git a/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll b/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll index 4d34621cd5f24..8ae560c07e210 100644 --- a/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll @@ -2288,66 +2288,68 @@ define @vp_bitreverse_nxv7i64( %va, @vp_bitreverse_nxv7i64_unmasked( %va ; RV32-NEXT: lui 
a3, 16 ; RV32-NEXT: li a4, 40 ; RV32-NEXT: lui a5, 4080 -; RV32-NEXT: addi a6, sp, 8 -; RV32-NEXT: sw a1, 8(sp) -; RV32-NEXT: sw zero, 12(sp) ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsll.vx v16, v8, a2 -; RV32-NEXT: addi a1, a3, -256 ; RV32-NEXT: vsrl.vx v24, v8, a2 +; RV32-NEXT: addi a2, sp, 8 +; RV32-NEXT: addi a3, a3, -256 ; RV32-NEXT: vsrl.vx v0, v8, a4 -; RV32-NEXT: vand.vx v0, v0, a1 +; RV32-NEXT: vand.vx v0, v0, a3 ; RV32-NEXT: vor.vv v24, v0, v24 -; RV32-NEXT: addi a2, sp, 16 -; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill -; RV32-NEXT: vand.vx v24, v8, a1 +; RV32-NEXT: addi a6, sp, 16 +; RV32-NEXT: vs8r.v v24, (a6) # Unknown-size Folded Spill +; RV32-NEXT: vand.vx v24, v8, a3 ; RV32-NEXT: vsll.vx v24, v24, a4 ; RV32-NEXT: vor.vv v16, v16, v24 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vlse64.v v24, (a6), zero -; RV32-NEXT: vsrl.vi v16, v8, 24 -; RV32-NEXT: vand.vx v16, v16, a5 -; RV32-NEXT: vsrl.vi v0, v8, 8 -; RV32-NEXT: vand.vv v0, v0, v24 -; RV32-NEXT: vor.vv v16, v0, v16 -; RV32-NEXT: vand.vv v24, v8, v24 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 3 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v24, v8, 24 +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: sw zero, 12(sp) +; RV32-NEXT: vand.vx v0, v24, a5 +; RV32-NEXT: vsrl.vi v24, v8, 8 +; RV32-NEXT: vlse64.v v16, (a2), zero +; RV32-NEXT: vand.vv v24, v24, v16 +; RV32-NEXT: vor.vv v0, v24, v0 +; RV32-NEXT: vand.vv v16, v8, v16 ; RV32-NEXT: vand.vx v8, v8, a5 ; RV32-NEXT: vsll.vi v8, v8, 24 -; RV32-NEXT: vsll.vi v24, v24, 8 -; RV32-NEXT: vor.vv v24, v8, v24 +; RV32-NEXT: vsll.vi v16, v16, 8 +; RV32-NEXT: vor.vv v24, v8, v16 ; RV32-NEXT: addi a1, sp, 16 ; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v8, v16, v8 +; RV32-NEXT: vor.vv v8, v0, v8 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: lui a2, 209715 ; RV32-NEXT: lui a3, 349525 @@ -2673,66 +2675,68 @@ define @vp_bitreverse_nxv8i64( %va, @vp_bitreverse_nxv8i64_unmasked( %va ; RV32-NEXT: lui a3, 16 ; RV32-NEXT: li a4, 40 ; RV32-NEXT: lui a5, 4080 -; RV32-NEXT: addi a6, sp, 8 -; RV32-NEXT: sw a1, 8(sp) -; RV32-NEXT: sw zero, 12(sp) ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsll.vx v16, v8, a2 -; RV32-NEXT: addi a1, a3, -256 ; RV32-NEXT: vsrl.vx v24, v8, a2 +; RV32-NEXT: addi a2, sp, 8 +; RV32-NEXT: addi a3, a3, -256 ; RV32-NEXT: vsrl.vx v0, v8, a4 -; RV32-NEXT: vand.vx v0, v0, a1 +; RV32-NEXT: vand.vx v0, v0, a3 ; RV32-NEXT: vor.vv v24, v0, v24 -; RV32-NEXT: addi a2, sp, 16 -; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill -; RV32-NEXT: vand.vx v24, v8, a1 +; RV32-NEXT: addi a6, sp, 16 +; RV32-NEXT: vs8r.v v24, (a6) # Unknown-size Folded Spill +; RV32-NEXT: vand.vx v24, v8, a3 ; RV32-NEXT: vsll.vx v24, v24, a4 ; RV32-NEXT: vor.vv v16, v16, v24 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vlse64.v v24, (a6), zero -; RV32-NEXT: vsrl.vi v16, v8, 24 -; RV32-NEXT: vand.vx v16, v16, a5 -; RV32-NEXT: vsrl.vi v0, v8, 8 -; RV32-NEXT: vand.vv v0, v0, v24 -; RV32-NEXT: vor.vv v16, v0, v16 -; RV32-NEXT: vand.vv v24, v8, v24 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 3 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: 
addi a3, a3, 16 +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v24, v8, 24 +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: sw zero, 12(sp) +; RV32-NEXT: vand.vx v0, v24, a5 +; RV32-NEXT: vsrl.vi v24, v8, 8 +; RV32-NEXT: vlse64.v v16, (a2), zero +; RV32-NEXT: vand.vv v24, v24, v16 +; RV32-NEXT: vor.vv v0, v24, v0 +; RV32-NEXT: vand.vv v16, v8, v16 ; RV32-NEXT: vand.vx v8, v8, a5 ; RV32-NEXT: vsll.vi v8, v8, 24 -; RV32-NEXT: vsll.vi v24, v24, 8 -; RV32-NEXT: vor.vv v24, v8, v24 +; RV32-NEXT: vsll.vi v16, v16, 8 +; RV32-NEXT: vor.vv v24, v8, v16 ; RV32-NEXT: addi a1, sp, 16 ; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v8, v16, v8 +; RV32-NEXT: vor.vv v8, v0, v8 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: lui a2, 209715 ; RV32-NEXT: lui a3, 349525 diff --git a/llvm/test/CodeGen/RISCV/rvv/bswap-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/bswap-sdnode.ll index 2cd763afa36b7..2de70a8cf1ce0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/bswap-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/bswap-sdnode.ll @@ -518,38 +518,38 @@ define @bswap_nxv8i64( %va) { ; RV32-NEXT: li a1, 56 ; RV32-NEXT: li a2, 40 ; RV32-NEXT: lui a3, 16 -; RV32-NEXT: lui a4, 4080 -; RV32-NEXT: addi a5, sp, 8 -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV32-NEXT: vsetvli a4, zero, e64, m8, ta, ma ; RV32-NEXT: vsrl.vx v16, v8, a1 ; RV32-NEXT: vsrl.vx v24, v8, a2 -; RV32-NEXT: addi a0, a3, -256 +; RV32-NEXT: addi a3, a3, -256 ; RV32-NEXT: vsll.vx v0, v8, a1 -; RV32-NEXT: vand.vx v24, v24, a0 +; RV32-NEXT: vand.vx v24, v24, a3 ; RV32-NEXT: vor.vv v16, v24, v16 ; RV32-NEXT: addi a1, sp, 16 ; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vand.vx v16, v8, a0 +; RV32-NEXT: vand.vx v16, v8, a3 ; RV32-NEXT: vsll.vx v16, v16, a2 ; RV32-NEXT: vor.vv v16, v0, v16 -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; RV32-NEXT: vlse64.v v0, (a5), zero -; RV32-NEXT: vsrl.vi v16, v8, 24 -; RV32-NEXT: vand.vx v16, v16, a4 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v0, v8, 24 +; RV32-NEXT: lui a1, 4080 +; RV32-NEXT: addi a2, sp, 8 +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: sw zero, 12(sp) +; RV32-NEXT: vand.vx v0, v0, a1 ; RV32-NEXT: vsrl.vi v24, v8, 8 -; RV32-NEXT: vand.vv v24, v24, v0 -; RV32-NEXT: vor.vv v16, v24, v16 +; RV32-NEXT: vlse64.v v16, (a2), zero +; RV32-NEXT: vand.vv v24, v24, v16 +; RV32-NEXT: vor.vv v24, v24, v0 ; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v24, v16, v24 -; RV32-NEXT: vand.vv v16, v8, v0 -; RV32-NEXT: vand.vx v8, v8, a4 +; RV32-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v24, v24, v0 +; RV32-NEXT: vand.vv v16, v8, v16 +; RV32-NEXT: vand.vx v8, v8, a1 ; RV32-NEXT: vsll.vi v8, v8, 24 ; RV32-NEXT: vsll.vi v16, v16, 8 ; RV32-NEXT: vor.vv v8, v8, v16 diff --git a/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll b/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll index 0c58cca0f9472..09df35600bce8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll @@ -1022,59 +1022,61 @@ define @vp_bswap_nxv7i64( %va, @vp_bswap_nxv7i64_unmasked( %va, i32 ; RV32-NEXT: lui a3, 16 ; RV32-NEXT: li a4, 40 ; RV32-NEXT: 
lui a5, 4080 -; RV32-NEXT: addi a6, sp, 8 -; RV32-NEXT: sw a1, 8(sp) -; RV32-NEXT: sw zero, 12(sp) ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsll.vx v24, v8, a2 -; RV32-NEXT: addi a0, a3, -256 ; RV32-NEXT: vsrl.vx v16, v8, a2 +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: addi a2, a3, -256 ; RV32-NEXT: vsrl.vx v0, v8, a4 -; RV32-NEXT: vand.vx v0, v0, a0 +; RV32-NEXT: vand.vx v0, v0, a2 ; RV32-NEXT: vor.vv v16, v0, v16 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vand.vx v0, v8, a0 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 3 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vand.vx v0, v8, a2 ; RV32-NEXT: vsll.vx v0, v0, a4 ; RV32-NEXT: vor.vv v16, v24, v0 -; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; RV32-NEXT: vlse64.v v0, (a6), zero -; RV32-NEXT: vsrl.vi v16, v8, 24 -; RV32-NEXT: vand.vx v16, v16, a5 +; RV32-NEXT: addi a2, sp, 16 +; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v0, v8, 24 +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: sw zero, 12(sp) +; RV32-NEXT: vand.vx v0, v0, a5 ; RV32-NEXT: vsrl.vi v24, v8, 8 -; RV32-NEXT: vand.vv v24, v24, v0 -; RV32-NEXT: vor.vv v16, v24, v16 -; RV32-NEXT: vand.vv v24, v8, v0 +; RV32-NEXT: vlse64.v v16, (a0), zero +; RV32-NEXT: vand.vv v24, v24, v16 +; RV32-NEXT: vor.vv v24, v24, v0 +; RV32-NEXT: vand.vv v16, v8, v16 ; RV32-NEXT: vand.vx v8, v8, a5 ; RV32-NEXT: vsll.vi v8, v8, 24 -; RV32-NEXT: vsll.vi v24, v24, 8 -; RV32-NEXT: vor.vv v8, v8, v24 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v8, v24, v8 +; RV32-NEXT: vsll.vi v16, v16, 8 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v8, v16, v8 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v16, v16, v24 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v16, v24, v16 ; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 @@ -1292,59 +1295,61 @@ define @vp_bswap_nxv8i64( %va, @vp_bswap_nxv8i64_unmasked( %va, i32 ; RV32-NEXT: lui a3, 16 ; RV32-NEXT: li a4, 40 ; RV32-NEXT: lui a5, 4080 -; RV32-NEXT: addi a6, sp, 8 -; RV32-NEXT: sw a1, 8(sp) -; RV32-NEXT: sw zero, 12(sp) ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsll.vx v24, v8, a2 -; RV32-NEXT: addi a0, a3, -256 ; RV32-NEXT: vsrl.vx v16, v8, a2 +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: addi a2, a3, -256 ; RV32-NEXT: vsrl.vx v0, v8, a4 -; RV32-NEXT: vand.vx v0, v0, a0 +; RV32-NEXT: vand.vx v0, v0, a2 ; RV32-NEXT: vor.vv v16, v0, v16 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vand.vx v0, v8, a0 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 3 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vand.vx v0, v8, a2 ; RV32-NEXT: vsll.vx v0, v0, a4 ; RV32-NEXT: vor.vv v16, v24, v0 -; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vs8r.v v16, (a0) # 
Unknown-size Folded Spill -; RV32-NEXT: vlse64.v v0, (a6), zero -; RV32-NEXT: vsrl.vi v16, v8, 24 -; RV32-NEXT: vand.vx v16, v16, a5 +; RV32-NEXT: addi a2, sp, 16 +; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v0, v8, 24 +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: sw zero, 12(sp) +; RV32-NEXT: vand.vx v0, v0, a5 ; RV32-NEXT: vsrl.vi v24, v8, 8 -; RV32-NEXT: vand.vv v24, v24, v0 -; RV32-NEXT: vor.vv v16, v24, v16 -; RV32-NEXT: vand.vv v24, v8, v0 +; RV32-NEXT: vlse64.v v16, (a0), zero +; RV32-NEXT: vand.vv v24, v24, v16 +; RV32-NEXT: vor.vv v24, v24, v0 +; RV32-NEXT: vand.vv v16, v8, v16 ; RV32-NEXT: vand.vx v8, v8, a5 ; RV32-NEXT: vsll.vi v8, v8, 24 -; RV32-NEXT: vsll.vi v24, v24, 8 -; RV32-NEXT: vor.vv v8, v8, v24 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v8, v24, v8 +; RV32-NEXT: vsll.vi v16, v16, 8 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v8, v16, v8 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v16, v16, v24 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v16, v24, v16 ; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 diff --git a/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll b/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll index 15f6ca600cb37..799d12247e905 100644 --- a/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll +++ b/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll @@ -75,17 +75,17 @@ define fastcc @ret_split_nxv64i32(ptr %x) { ; CHECK-NEXT: slli a4, a2, 5 ; CHECK-NEXT: slli a2, a2, 4 ; CHECK-NEXT: sub a4, a4, a3 -; CHECK-NEXT: add a5, a1, a2 -; CHECK-NEXT: vl8re32.v v16, (a5) ; CHECK-NEXT: add a5, a1, a3 +; CHECK-NEXT: vl8re32.v v16, (a5) +; CHECK-NEXT: add a5, a1, a2 ; CHECK-NEXT: add a2, a0, a2 ; CHECK-NEXT: add a3, a0, a3 ; CHECK-NEXT: add a1, a1, a4 ; CHECK-NEXT: vl8re32.v v24, (a5) ; CHECK-NEXT: vl8re32.v v0, (a1) ; CHECK-NEXT: vs8r.v v8, (a0) -; CHECK-NEXT: vs8r.v v16, (a2) -; CHECK-NEXT: vs8r.v v24, (a3) +; CHECK-NEXT: vs8r.v v24, (a2) +; CHECK-NEXT: vs8r.v v16, (a3) ; CHECK-NEXT: add a0, a0, a4 ; CHECK-NEXT: vs8r.v v0, (a0) ; CHECK-NEXT: ret @@ -245,59 +245,21 @@ define fastcc @ret_nxv32i1_param_nxv32i1_nxv32i1( @ret_nxv32i32_param_nxv32i32_nxv32i32_nxv32i32_i32( %x, %y, %z, i32 %w) { ; CHECK-LABEL: ret_nxv32i32_param_nxv32i32_nxv32i32_nxv32i32_i32: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: li a3, 24 -; CHECK-NEXT: mul a1, a1, a3 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: vl8re32.v v8, (a2) -; CHECK-NEXT: addi a3, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; CHECK-NEXT: vl8re32.v v24, (a2) ; CHECK-NEXT: vl8re32.v v0, (a0) ; CHECK-NEXT: 
slli a1, a1, 3 ; CHECK-NEXT: add a2, a2, a1 ; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: vl8re32.v v8, (a0) -; CHECK-NEXT: vl8re32.v v16, (a2) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma -; CHECK-NEXT: vadd.vv v0, v24, v0 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vadd.vv v24, v0, v24 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vadd.vv v8, v0, v8 -; CHECK-NEXT: vadd.vv v8, v8, v16 -; CHECK-NEXT: vadd.vx v16, v8, a4 -; CHECK-NEXT: vadd.vx v8, v24, a4 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 24 -; CHECK-NEXT: mul a0, a0, a1 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: vsetvli a1, zero, e32, m8, ta, ma +; CHECK-NEXT: vadd.vv v8, v8, v0 +; CHECK-NEXT: vl8re32.v v0, (a2) +; CHECK-NEXT: vadd.vv v8, v8, v24 +; CHECK-NEXT: vl8re32.v v24, (a0) +; CHECK-NEXT: vadd.vv v16, v16, v24 +; CHECK-NEXT: vadd.vv v16, v16, v0 +; CHECK-NEXT: vadd.vx v16, v16, a4 +; CHECK-NEXT: vadd.vx v8, v8, a4 ; CHECK-NEXT: ret %r = add %x, %y %s = add %r, %z @@ -410,29 +372,31 @@ define fastcc @ret_nxv32i32_call_nxv32i32_nxv32i32_nxv32i32_ ; RV32-NEXT: andi sp, sp, -128 ; RV32-NEXT: addi a1, sp, 128 ; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV32-NEXT: vmv8r.v v0, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: vl8re32.v v16, (a2) +; RV32-NEXT: vl8re32.v v8, (a2) ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 3 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 128 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a2, a2, a1 ; RV32-NEXT: add a3, a0, a1 -; RV32-NEXT: vl8re32.v v0, (a2) -; RV32-NEXT: vl8re32.v v24, (a3) -; RV32-NEXT: vl8re32.v v16, (a0) +; RV32-NEXT: vl8re32.v v24, (a2) +; RV32-NEXT: vl8re32.v v16, (a3) +; RV32-NEXT: vl8re32.v v8, (a0) ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 128 -; RV32-NEXT: vs8r.v v8, (a0) +; RV32-NEXT: vs8r.v v0, (a0) ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 5 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 128 -; RV32-NEXT: vs8r.v v16, (a3) +; RV32-NEXT: vs8r.v v8, (a3) ; RV32-NEXT: add a0, a0, a1 ; RV32-NEXT: addi a2, sp, 128 ; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload @@ -447,14 +411,13 @@ define fastcc @ret_nxv32i32_call_nxv32i32_nxv32i32_nxv32i32_ ; RV32-NEXT: addi a2, a2, 128 ; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: li a5, 42 -; RV32-NEXT: vs8r.v v24, (a1) +; RV32-NEXT: vs8r.v v16, (a1) ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 128 ; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV32-NEXT: vmv8r.v v16, v0 +; RV32-NEXT: vmv8r.v v16, v24 ; RV32-NEXT: call ext3 ; RV32-NEXT: addi sp, s0, -144 ; RV32-NEXT: .cfi_def_cfa sp, 144 @@ -483,29 +446,31 @@ define fastcc @ret_nxv32i32_call_nxv32i32_nxv32i32_nxv32i32_ ; RV64-NEXT: andi sp, sp, -128 ; RV64-NEXT: addi a1, sp, 128 ; 
RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64-NEXT: vmv8r.v v0, v8 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: vl8re32.v v16, (a2) +; RV64-NEXT: vl8re32.v v8, (a2) ; RV64-NEXT: csrr a3, vlenb ; RV64-NEXT: slli a3, a3, 3 ; RV64-NEXT: add a3, sp, a3 ; RV64-NEXT: addi a3, a3, 128 -; RV64-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV64-NEXT: slli a1, a1, 3 ; RV64-NEXT: add a2, a2, a1 ; RV64-NEXT: add a3, a0, a1 -; RV64-NEXT: vl8re32.v v0, (a2) -; RV64-NEXT: vl8re32.v v24, (a3) -; RV64-NEXT: vl8re32.v v16, (a0) +; RV64-NEXT: vl8re32.v v24, (a2) +; RV64-NEXT: vl8re32.v v16, (a3) +; RV64-NEXT: vl8re32.v v8, (a0) ; RV64-NEXT: csrr a0, vlenb ; RV64-NEXT: slli a0, a0, 4 ; RV64-NEXT: add a0, sp, a0 ; RV64-NEXT: addi a0, a0, 128 -; RV64-NEXT: vs8r.v v8, (a0) +; RV64-NEXT: vs8r.v v0, (a0) ; RV64-NEXT: csrr a3, vlenb ; RV64-NEXT: slli a3, a3, 5 ; RV64-NEXT: add a3, sp, a3 ; RV64-NEXT: addi a3, a3, 128 -; RV64-NEXT: vs8r.v v16, (a3) +; RV64-NEXT: vs8r.v v8, (a3) ; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: addi a2, sp, 128 ; RV64-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload @@ -520,14 +485,13 @@ define fastcc @ret_nxv32i32_call_nxv32i32_nxv32i32_nxv32i32_ ; RV64-NEXT: addi a2, a2, 128 ; RV64-NEXT: add a1, a3, a1 ; RV64-NEXT: li a5, 42 -; RV64-NEXT: vs8r.v v24, (a1) +; RV64-NEXT: vs8r.v v16, (a1) ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: slli a1, a1, 3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 128 ; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64-NEXT: vmv8r.v v16, v0 +; RV64-NEXT: vmv8r.v v16, v24 ; RV64-NEXT: call ext3 ; RV64-NEXT: addi sp, s0, -144 ; RV64-NEXT: .cfi_def_cfa sp, 144 @@ -549,9 +513,9 @@ define fastcc @vector_arg_indirect_stack(i32 %0, i32 %1, i32 ; CHECK-LABEL: vector_arg_indirect_stack: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: vl8re32.v v24, (t5) ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, t5, a0 -; CHECK-NEXT: vl8re32.v v24, (t5) ; CHECK-NEXT: vl8re32.v v0, (a0) ; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma ; CHECK-NEXT: vadd.vv v8, v8, v24 diff --git a/llvm/test/CodeGen/RISCV/rvv/calling-conv.ll b/llvm/test/CodeGen/RISCV/rvv/calling-conv.ll index 2e181e0914c88..5ed7308cd0b94 100644 --- a/llvm/test/CodeGen/RISCV/rvv/calling-conv.ll +++ b/llvm/test/CodeGen/RISCV/rvv/calling-conv.ll @@ -7,10 +7,10 @@ define @callee_scalable_vector_split_indirect( @test_expandload_v256i8(ptr %base, <256 x i1> %mask, <256 x i8 ; CHECK-RV32-NEXT: addi sp, sp, -16 ; CHECK-RV32-NEXT: .cfi_def_cfa_offset 16 ; CHECK-RV32-NEXT: csrr a2, vlenb -; CHECK-RV32-NEXT: slli a2, a2, 5 +; CHECK-RV32-NEXT: slli a2, a2, 4 ; CHECK-RV32-NEXT: sub sp, sp, a2 -; CHECK-RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; CHECK-RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; CHECK-RV32-NEXT: csrr a2, vlenb -; CHECK-RV32-NEXT: li a3, 24 -; CHECK-RV32-NEXT: mul a2, a2, a3 +; CHECK-RV32-NEXT: slli a2, a2, 3 ; CHECK-RV32-NEXT: add a2, sp, a2 ; CHECK-RV32-NEXT: addi a2, a2, 16 ; CHECK-RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-RV32-NEXT: vmv1r.v v7, v8 ; CHECK-RV32-NEXT: li a2, 128 -; CHECK-RV32-NEXT: vslidedown.vi v9, v0, 1 +; CHECK-RV32-NEXT: vslidedown.vi 
v6, v0, 1 ; CHECK-RV32-NEXT: li a3, 32 ; CHECK-RV32-NEXT: vmv.x.s a4, v0 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; CHECK-RV32-NEXT: viota.m v16, v0 +; CHECK-RV32-NEXT: addi a5, sp, 16 +; CHECK-RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill +; CHECK-RV32-NEXT: vcpop.m a5, v0 +; CHECK-RV32-NEXT: vsetvli zero, a5, e8, m8, ta, ma +; CHECK-RV32-NEXT: vle8.v v24, (a0) +; CHECK-RV32-NEXT: csrr a5, vlenb +; CHECK-RV32-NEXT: slli a5, a5, 3 +; CHECK-RV32-NEXT: add a5, sp, a5 +; CHECK-RV32-NEXT: addi a5, a5, 16 +; CHECK-RV32-NEXT: vl8r.v v16, (a5) # Unknown-size Folded Reload +; CHECK-RV32-NEXT: addi a5, sp, 16 +; CHECK-RV32-NEXT: vl8r.v v8, (a5) # Unknown-size Folded Reload +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, ta, mu +; CHECK-RV32-NEXT: vrgather.vv v16, v24, v8, v0.t +; CHECK-RV32-NEXT: csrr a5, vlenb +; CHECK-RV32-NEXT: slli a5, a5, 3 +; CHECK-RV32-NEXT: add a5, sp, a5 +; CHECK-RV32-NEXT: addi a5, a5, 16 +; CHECK-RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill ; CHECK-RV32-NEXT: vle8.v v16, (a1) -; CHECK-RV32-NEXT: csrr a1, vlenb -; CHECK-RV32-NEXT: slli a1, a1, 3 -; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: addi a1, a1, 16 -; CHECK-RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-RV32-NEXT: vsrl.vx v10, v9, a3 +; CHECK-RV32-NEXT: vsrl.vx v10, v6, a3 ; CHECK-RV32-NEXT: vsrl.vx v11, v0, a3 -; CHECK-RV32-NEXT: vmv.x.s a1, v9 +; CHECK-RV32-NEXT: vmv.x.s a1, v6 +; CHECK-RV32-NEXT: cpop a3, a4 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; CHECK-RV32-NEXT: vcpop.m a3, v0 -; CHECK-RV32-NEXT: cpop a4, a4 +; CHECK-RV32-NEXT: vcpop.m a4, v7 ; CHECK-RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma ; CHECK-RV32-NEXT: vmv.x.s a5, v10 ; CHECK-RV32-NEXT: vmv.x.s a6, v11 -; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, ta, ma -; CHECK-RV32-NEXT: vle8.v v8, (a0) -; CHECK-RV32-NEXT: csrr a3, vlenb -; CHECK-RV32-NEXT: slli a3, a3, 4 -; CHECK-RV32-NEXT: add a3, sp, a3 -; CHECK-RV32-NEXT: addi a3, a3, 16 -; CHECK-RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; CHECK-RV32-NEXT: cpop a1, a1 -; CHECK-RV32-NEXT: cpop a3, a6 +; CHECK-RV32-NEXT: cpop a6, a6 ; CHECK-RV32-NEXT: cpop a5, a5 -; CHECK-RV32-NEXT: add a3, a4, a3 +; CHECK-RV32-NEXT: add a3, a3, a6 ; CHECK-RV32-NEXT: add a1, a1, a5 ; CHECK-RV32-NEXT: add a1, a3, a1 ; CHECK-RV32-NEXT: add a0, a0, a1 -; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; CHECK-RV32-NEXT: vcpop.m a1, v7 -; CHECK-RV32-NEXT: vsetvli zero, a1, e8, m8, ta, ma -; CHECK-RV32-NEXT: vle8.v v8, (a0) -; CHECK-RV32-NEXT: addi a0, sp, 16 -; CHECK-RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m8, ta, ma +; CHECK-RV32-NEXT: vle8.v v24, (a0) ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, ta, mu -; CHECK-RV32-NEXT: viota.m v24, v0 -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: li a1, 24 -; CHECK-RV32-NEXT: mul a0, a0, a1 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 4 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-RV32-NEXT: vrgather.vv v8, v16, v24, v0.t -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: li a1, 24 -; CHECK-RV32-NEXT: mul a0, a0, a1 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vs8r.v v8, (a0) # 
Unknown-size Folded Spill -; CHECK-RV32-NEXT: viota.m v16, v7 -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 4 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-RV32-NEXT: viota.m v8, v7 ; CHECK-RV32-NEXT: vmv1r.v v0, v7 -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 3 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-RV32-NEXT: addi a0, sp, 16 -; CHECK-RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 4 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-RV32-NEXT: vrgather.vv v16, v24, v8, v0.t ; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: li a1, 24 -; CHECK-RV32-NEXT: mul a0, a0, a1 +; CHECK-RV32-NEXT: slli a0, a0, 3 ; CHECK-RV32-NEXT: add a0, sp, a0 ; CHECK-RV32-NEXT: addi a0, a0, 16 ; CHECK-RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 5 +; CHECK-RV32-NEXT: slli a0, a0, 4 ; CHECK-RV32-NEXT: add sp, sp, a0 ; CHECK-RV32-NEXT: .cfi_def_cfa sp, 16 ; CHECK-RV32-NEXT: addi sp, sp, 16 @@ -329,96 +295,56 @@ define <256 x i8> @test_expandload_v256i8(ptr %base, <256 x i1> %mask, <256 x i8 ; CHECK-RV64-NEXT: addi sp, sp, -16 ; CHECK-RV64-NEXT: .cfi_def_cfa_offset 16 ; CHECK-RV64-NEXT: csrr a2, vlenb -; CHECK-RV64-NEXT: slli a2, a2, 5 +; CHECK-RV64-NEXT: slli a2, a2, 4 ; CHECK-RV64-NEXT: sub sp, sp, a2 -; CHECK-RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; CHECK-RV64-NEXT: csrr a2, vlenb -; CHECK-RV64-NEXT: li a3, 24 -; CHECK-RV64-NEXT: mul a2, a2, a3 -; CHECK-RV64-NEXT: add a2, sp, a2 -; CHECK-RV64-NEXT: addi a2, a2, 16 -; CHECK-RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; CHECK-RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-RV64-NEXT: vmv1r.v v7, v8 ; CHECK-RV64-NEXT: li a2, 128 -; CHECK-RV64-NEXT: vslidedown.vi v9, v0, 1 +; CHECK-RV64-NEXT: vslidedown.vi v6, v0, 1 ; CHECK-RV64-NEXT: vmv.x.s a3, v0 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; CHECK-RV64-NEXT: vle8.v v16, (a1) -; CHECK-RV64-NEXT: csrr a1, vlenb -; CHECK-RV64-NEXT: slli a1, a1, 3 -; CHECK-RV64-NEXT: add a1, sp, a1 -; CHECK-RV64-NEXT: addi a1, a1, 16 -; CHECK-RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; CHECK-RV64-NEXT: vsetvli zero, a2, e64, m1, ta, ma -; CHECK-RV64-NEXT: vmv.x.s a1, v9 -; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; CHECK-RV64-NEXT: viota.m v8, v0 +; CHECK-RV64-NEXT: addi a4, sp, 16 +; CHECK-RV64-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill ; CHECK-RV64-NEXT: vcpop.m a4, v0 ; CHECK-RV64-NEXT: vsetvli zero, a4, e8, m8, ta, ma -; CHECK-RV64-NEXT: vle8.v v8, (a0) +; CHECK-RV64-NEXT: vle8.v v24, (a0) +; CHECK-RV64-NEXT: addi a4, sp, 16 +; CHECK-RV64-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, ta, mu +; CHECK-RV64-NEXT: vrgather.vv v16, v24, v8, v0.t ; CHECK-RV64-NEXT: csrr a4, vlenb -; CHECK-RV64-NEXT: slli a4, a4, 4 +; CHECK-RV64-NEXT: slli a4, a4, 3 ; CHECK-RV64-NEXT: add a4, sp, a4 ; 
CHECK-RV64-NEXT: addi a4, a4, 16 -; CHECK-RV64-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: vle8.v v16, (a1) +; CHECK-RV64-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; CHECK-RV64-NEXT: vmv.x.s a1, v6 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, ta, ma ; CHECK-RV64-NEXT: vcpop.m a4, v7 ; CHECK-RV64-NEXT: cpop a3, a3 +; CHECK-RV64-NEXT: viota.m v24, v7 +; CHECK-RV64-NEXT: addi a5, sp, 16 +; CHECK-RV64-NEXT: vs8r.v v24, (a5) # Unknown-size Folded Spill ; CHECK-RV64-NEXT: cpop a1, a1 ; CHECK-RV64-NEXT: add a0, a0, a3 ; CHECK-RV64-NEXT: add a0, a0, a1 ; CHECK-RV64-NEXT: vsetvli zero, a4, e8, m8, ta, ma -; CHECK-RV64-NEXT: vle8.v v8, (a0) -; CHECK-RV64-NEXT: addi a0, sp, 16 -; CHECK-RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, ta, mu -; CHECK-RV64-NEXT: viota.m v24, v0 -; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: li a1, 24 -; CHECK-RV64-NEXT: mul a0, a0, a1 -; CHECK-RV64-NEXT: add a0, sp, a0 -; CHECK-RV64-NEXT: addi a0, a0, 16 -; CHECK-RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: slli a0, a0, 4 -; CHECK-RV64-NEXT: add a0, sp, a0 -; CHECK-RV64-NEXT: addi a0, a0, 16 -; CHECK-RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-RV64-NEXT: vrgather.vv v8, v16, v24, v0.t -; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: li a1, 24 -; CHECK-RV64-NEXT: mul a0, a0, a1 -; CHECK-RV64-NEXT: add a0, sp, a0 -; CHECK-RV64-NEXT: addi a0, a0, 16 -; CHECK-RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-RV64-NEXT: viota.m v16, v7 -; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: slli a0, a0, 4 -; CHECK-RV64-NEXT: add a0, sp, a0 -; CHECK-RV64-NEXT: addi a0, a0, 16 -; CHECK-RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: vle8.v v24, (a0) ; CHECK-RV64-NEXT: vmv1r.v v0, v7 -; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: slli a0, a0, 3 -; CHECK-RV64-NEXT: add a0, sp, a0 -; CHECK-RV64-NEXT: addi a0, a0, 16 -; CHECK-RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-RV64-NEXT: addi a0, sp, 16 -; CHECK-RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: slli a0, a0, 4 -; CHECK-RV64-NEXT: add a0, sp, a0 -; CHECK-RV64-NEXT: addi a0, a0, 16 ; CHECK-RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, ta, mu ; CHECK-RV64-NEXT: vrgather.vv v16, v24, v8, v0.t ; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: li a1, 24 -; CHECK-RV64-NEXT: mul a0, a0, a1 +; CHECK-RV64-NEXT: slli a0, a0, 3 ; CHECK-RV64-NEXT: add a0, sp, a0 ; CHECK-RV64-NEXT: addi a0, a0, 16 ; CHECK-RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: slli a0, a0, 5 +; CHECK-RV64-NEXT: slli a0, a0, 4 ; CHECK-RV64-NEXT: add sp, sp, a0 ; CHECK-RV64-NEXT: .cfi_def_cfa sp, 16 ; CHECK-RV64-NEXT: addi sp, sp, 16 @@ -664,76 +590,66 @@ define <128 x i16> @test_expandload_v128i16(ptr %base, <128 x i1> %mask, <128 x ; CHECK-RV32-NEXT: addi sp, sp, -16 ; CHECK-RV32-NEXT: .cfi_def_cfa_offset 16 ; CHECK-RV32-NEXT: csrr a1, vlenb -; CHECK-RV32-NEXT: slli a1, a1, 5 -; CHECK-RV32-NEXT: sub sp, sp, a1 -; CHECK-RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; CHECK-RV32-NEXT: csrr a1, vlenb ; CHECK-RV32-NEXT: li a2, 24 ; 
CHECK-RV32-NEXT: mul a1, a1, a2 +; CHECK-RV32-NEXT: sub sp, sp, a1 +; CHECK-RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb +; CHECK-RV32-NEXT: csrr a1, vlenb +; CHECK-RV32-NEXT: slli a1, a1, 4 ; CHECK-RV32-NEXT: add a1, sp, a1 ; CHECK-RV32-NEXT: addi a1, a1, 16 ; CHECK-RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-RV32-NEXT: li a1, 64 +; CHECK-RV32-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-RV32-NEXT: viota.m v16, v0 +; CHECK-RV32-NEXT: vcpop.m a2, v0 +; CHECK-RV32-NEXT: vsetvli zero, a2, e16, m8, ta, ma +; CHECK-RV32-NEXT: vle16.v v24, (a0) +; CHECK-RV32-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-RV32-NEXT: vrgather.vv v8, v24, v16, v0.t +; CHECK-RV32-NEXT: csrr a2, vlenb +; CHECK-RV32-NEXT: slli a2, a2, 3 +; CHECK-RV32-NEXT: add a2, sp, a2 +; CHECK-RV32-NEXT: addi a2, a2, 16 +; CHECK-RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; CHECK-RV32-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; CHECK-RV32-NEXT: vslidedown.vi v7, v0, 8 +; CHECK-RV32-NEXT: vslidedown.vi v24, v0, 8 ; CHECK-RV32-NEXT: li a2, 32 ; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-RV32-NEXT: vmv.x.s a3, v0 ; CHECK-RV32-NEXT: vsetvli zero, a1, e8, m4, ta, ma -; CHECK-RV32-NEXT: vcpop.m a4, v0 +; CHECK-RV32-NEXT: vcpop.m a4, v24 ; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-RV32-NEXT: vsrl.vx v25, v0, a2 -; CHECK-RV32-NEXT: vsetvli zero, a1, e8, m4, ta, ma -; CHECK-RV32-NEXT: vcpop.m a2, v7 -; CHECK-RV32-NEXT: vsetvli zero, a4, e16, m8, ta, ma -; CHECK-RV32-NEXT: vle16.v v16, (a0) -; CHECK-RV32-NEXT: csrr a5, vlenb -; CHECK-RV32-NEXT: slli a5, a5, 4 -; CHECK-RV32-NEXT: add a5, sp, a5 -; CHECK-RV32-NEXT: addi a5, a5, 16 -; CHECK-RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill -; CHECK-RV32-NEXT: vsetvli zero, a4, e64, m1, ta, ma -; CHECK-RV32-NEXT: vmv.x.s a4, v25 -; CHECK-RV32-NEXT: cpop a4, a4 +; CHECK-RV32-NEXT: vsrl.vx v8, v0, a2 +; CHECK-RV32-NEXT: cpop a2, a3 +; CHECK-RV32-NEXT: vmv.x.s a3, v8 ; CHECK-RV32-NEXT: cpop a3, a3 -; CHECK-RV32-NEXT: add a3, a3, a4 -; CHECK-RV32-NEXT: slli a3, a3, 1 -; CHECK-RV32-NEXT: add a0, a0, a3 -; CHECK-RV32-NEXT: vsetvli zero, a2, e16, m8, ta, ma -; CHECK-RV32-NEXT: vle16.v v16, (a0) -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 3 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; CHECK-RV32-NEXT: vsetvli zero, a1, e16, m8, ta, mu -; CHECK-RV32-NEXT: viota.m v16, v0 -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 4 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-RV32-NEXT: vrgather.vv v8, v24, v16, v0.t +; CHECK-RV32-NEXT: add a2, a2, a3 +; CHECK-RV32-NEXT: slli a2, a2, 1 +; CHECK-RV32-NEXT: add a0, a0, a2 +; CHECK-RV32-NEXT: vsetvli zero, a4, e16, m8, ta, ma +; CHECK-RV32-NEXT: vle16.v v8, (a0) ; CHECK-RV32-NEXT: addi a0, sp, 16 ; CHECK-RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-RV32-NEXT: viota.m v8, v7 -; CHECK-RV32-NEXT: vmv1r.v v0, v7 +; CHECK-RV32-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-RV32-NEXT: viota.m v8, v24 +; CHECK-RV32-NEXT: vmv1r.v v0, v24 ; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: li a1, 24 -; CHECK-RV32-NEXT: mul a0, a0, a1 +; CHECK-RV32-NEXT: slli a0, a0, 4 ; CHECK-RV32-NEXT: add a0, sp, a0 ; CHECK-RV32-NEXT: addi a0, a0, 16 ; CHECK-RV32-NEXT: 
vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-RV32-NEXT: addi a0, sp, 16 +; CHECK-RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-RV32-NEXT: vrgather.vv v16, v24, v8, v0.t ; CHECK-RV32-NEXT: csrr a0, vlenb ; CHECK-RV32-NEXT: slli a0, a0, 3 ; CHECK-RV32-NEXT: add a0, sp, a0 ; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-RV32-NEXT: vrgather.vv v16, v24, v8, v0.t -; CHECK-RV32-NEXT: addi a0, sp, 16 ; CHECK-RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 5 +; CHECK-RV32-NEXT: li a1, 24 +; CHECK-RV32-NEXT: mul a0, a0, a1 ; CHECK-RV32-NEXT: add sp, sp, a0 ; CHECK-RV32-NEXT: .cfi_def_cfa sp, 16 ; CHECK-RV32-NEXT: addi sp, sp, 16 @@ -749,67 +665,70 @@ define <128 x i16> @test_expandload_v128i16(ptr %base, <128 x i1> %mask, <128 x ; CHECK-RV64-NEXT: sub sp, sp, a1 ; CHECK-RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb ; CHECK-RV64-NEXT: csrr a1, vlenb -; CHECK-RV64-NEXT: slli a1, a1, 3 +; CHECK-RV64-NEXT: slli a1, a1, 4 ; CHECK-RV64-NEXT: add a1, sp, a1 ; CHECK-RV64-NEXT: addi a1, a1, 16 ; CHECK-RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-RV64-NEXT: li a1, 64 -; CHECK-RV64-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; CHECK-RV64-NEXT: vslidedown.vi v7, v0, 8 -; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-RV64-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-RV64-NEXT: viota.m v16, v0 +; CHECK-RV64-NEXT: csrr a2, vlenb +; CHECK-RV64-NEXT: li a3, 24 +; CHECK-RV64-NEXT: mul a2, a2, a3 +; CHECK-RV64-NEXT: add a2, sp, a2 +; CHECK-RV64-NEXT: addi a2, a2, 16 +; CHECK-RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; CHECK-RV64-NEXT: vcpop.m a2, v0 -; CHECK-RV64-NEXT: vcpop.m a3, v7 ; CHECK-RV64-NEXT: vsetvli zero, a2, e16, m8, ta, ma ; CHECK-RV64-NEXT: vle16.v v24, (a0) +; CHECK-RV64-NEXT: csrr a3, vlenb +; CHECK-RV64-NEXT: li a4, 24 +; CHECK-RV64-NEXT: mul a3, a3, a4 +; CHECK-RV64-NEXT: add a3, sp, a3 +; CHECK-RV64-NEXT: addi a3, a3, 16 +; CHECK-RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; CHECK-RV64-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-RV64-NEXT: vrgather.vv v8, v24, v16, v0.t +; CHECK-RV64-NEXT: addi a3, sp, 16 +; CHECK-RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: vsetivli zero, 8, e8, m1, ta, ma +; CHECK-RV64-NEXT: vslidedown.vi v0, v0, 8 +; CHECK-RV64-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-RV64-NEXT: vcpop.m a3, v0 +; CHECK-RV64-NEXT: viota.m v24, v0 ; CHECK-RV64-NEXT: csrr a4, vlenb -; CHECK-RV64-NEXT: slli a4, a4, 4 +; CHECK-RV64-NEXT: li a5, 24 +; CHECK-RV64-NEXT: mul a4, a4, a5 ; CHECK-RV64-NEXT: add a4, sp, a4 ; CHECK-RV64-NEXT: addi a4, a4, 16 ; CHECK-RV64-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill ; CHECK-RV64-NEXT: slli a2, a2, 1 ; CHECK-RV64-NEXT: add a0, a0, a2 ; CHECK-RV64-NEXT: vsetvli zero, a3, e16, m8, ta, ma -; CHECK-RV64-NEXT: vle16.v v24, (a0) -; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: li a2, 24 -; CHECK-RV64-NEXT: mul a0, a0, a2 -; CHECK-RV64-NEXT: add a0, sp, a0 -; CHECK-RV64-NEXT: addi a0, a0, 16 -; CHECK-RV64-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; CHECK-RV64-NEXT: vsetvli zero, a1, e16, m8, ta, mu -; CHECK-RV64-NEXT: viota.m v24, v0 -; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: slli a0, a0, 4 -; CHECK-RV64-NEXT: add a0, sp, a0 -; CHECK-RV64-NEXT: addi a0, a0, 16 -; CHECK-RV64-NEXT: 
vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-RV64-NEXT: vrgather.vv v8, v16, v24, v0.t -; CHECK-RV64-NEXT: addi a0, sp, 16 -; CHECK-RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-RV64-NEXT: viota.m v16, v7 +; CHECK-RV64-NEXT: vle16.v v16, (a0) ; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: slli a0, a0, 4 +; CHECK-RV64-NEXT: slli a0, a0, 3 ; CHECK-RV64-NEXT: add a0, sp, a0 ; CHECK-RV64-NEXT: addi a0, a0, 16 ; CHECK-RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; CHECK-RV64-NEXT: vmv1r.v v0, v7 ; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: li a1, 24 -; CHECK-RV64-NEXT: mul a0, a0, a1 +; CHECK-RV64-NEXT: li a2, 24 +; CHECK-RV64-NEXT: mul a0, a0, a2 ; CHECK-RV64-NEXT: add a0, sp, a0 ; CHECK-RV64-NEXT: addi a0, a0, 16 ; CHECK-RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: slli a0, a0, 3 +; CHECK-RV64-NEXT: slli a0, a0, 4 ; CHECK-RV64-NEXT: add a0, sp, a0 ; CHECK-RV64-NEXT: addi a0, a0, 16 ; CHECK-RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: slli a0, a0, 4 +; CHECK-RV64-NEXT: slli a0, a0, 3 ; CHECK-RV64-NEXT: add a0, sp, a0 ; CHECK-RV64-NEXT: addi a0, a0, 16 ; CHECK-RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-RV64-NEXT: vrgather.vv v16, v24, v8, v0.t +; CHECK-RV64-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-RV64-NEXT: vrgather.vv v16, v8, v24, v0.t ; CHECK-RV64-NEXT: addi a0, sp, 16 ; CHECK-RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-RV64-NEXT: csrr a0, vlenb @@ -1023,67 +942,70 @@ define <64 x i32> @test_expandload_v64i32(ptr %base, <64 x i1> %mask, <64 x i32> ; CHECK-RV32-NEXT: sub sp, sp, a1 ; CHECK-RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb ; CHECK-RV32-NEXT: csrr a1, vlenb -; CHECK-RV32-NEXT: slli a1, a1, 3 +; CHECK-RV32-NEXT: slli a1, a1, 4 ; CHECK-RV32-NEXT: add a1, sp, a1 ; CHECK-RV32-NEXT: addi a1, a1, 16 ; CHECK-RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-RV32-NEXT: li a1, 32 -; CHECK-RV32-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; CHECK-RV32-NEXT: vslidedown.vi v7, v0, 4 -; CHECK-RV32-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-RV32-NEXT: viota.m v16, v0 +; CHECK-RV32-NEXT: csrr a2, vlenb +; CHECK-RV32-NEXT: li a3, 24 +; CHECK-RV32-NEXT: mul a2, a2, a3 +; CHECK-RV32-NEXT: add a2, sp, a2 +; CHECK-RV32-NEXT: addi a2, a2, 16 +; CHECK-RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; CHECK-RV32-NEXT: vcpop.m a2, v0 -; CHECK-RV32-NEXT: vcpop.m a3, v7 ; CHECK-RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; CHECK-RV32-NEXT: vle32.v v24, (a0) +; CHECK-RV32-NEXT: csrr a3, vlenb +; CHECK-RV32-NEXT: li a4, 24 +; CHECK-RV32-NEXT: mul a3, a3, a4 +; CHECK-RV32-NEXT: add a3, sp, a3 +; CHECK-RV32-NEXT: addi a3, a3, 16 +; CHECK-RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; CHECK-RV32-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-RV32-NEXT: vrgather.vv v8, v24, v16, v0.t +; CHECK-RV32-NEXT: addi a3, sp, 16 +; CHECK-RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; CHECK-RV32-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; CHECK-RV32-NEXT: vslidedown.vi v0, v0, 4 +; CHECK-RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-RV32-NEXT: vcpop.m a3, v0 +; CHECK-RV32-NEXT: viota.m v24, v0 ; CHECK-RV32-NEXT: csrr a4, vlenb -; CHECK-RV32-NEXT: slli a4, a4, 4 +; CHECK-RV32-NEXT: li a5, 24 +; 
CHECK-RV32-NEXT: mul a4, a4, a5 ; CHECK-RV32-NEXT: add a4, sp, a4 ; CHECK-RV32-NEXT: addi a4, a4, 16 ; CHECK-RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill ; CHECK-RV32-NEXT: slli a2, a2, 2 ; CHECK-RV32-NEXT: add a0, a0, a2 ; CHECK-RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; CHECK-RV32-NEXT: vle32.v v24, (a0) -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: li a2, 24 -; CHECK-RV32-NEXT: mul a0, a0, a2 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; CHECK-RV32-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; CHECK-RV32-NEXT: viota.m v24, v0 -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 4 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-RV32-NEXT: vrgather.vv v8, v16, v24, v0.t -; CHECK-RV32-NEXT: addi a0, sp, 16 -; CHECK-RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-RV32-NEXT: viota.m v16, v7 +; CHECK-RV32-NEXT: vle32.v v16, (a0) ; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 4 +; CHECK-RV32-NEXT: slli a0, a0, 3 ; CHECK-RV32-NEXT: add a0, sp, a0 ; CHECK-RV32-NEXT: addi a0, a0, 16 ; CHECK-RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; CHECK-RV32-NEXT: vmv1r.v v0, v7 ; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: li a1, 24 -; CHECK-RV32-NEXT: mul a0, a0, a1 +; CHECK-RV32-NEXT: li a2, 24 +; CHECK-RV32-NEXT: mul a0, a0, a2 ; CHECK-RV32-NEXT: add a0, sp, a0 ; CHECK-RV32-NEXT: addi a0, a0, 16 ; CHECK-RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 3 +; CHECK-RV32-NEXT: slli a0, a0, 4 ; CHECK-RV32-NEXT: add a0, sp, a0 ; CHECK-RV32-NEXT: addi a0, a0, 16 ; CHECK-RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 4 +; CHECK-RV32-NEXT: slli a0, a0, 3 ; CHECK-RV32-NEXT: add a0, sp, a0 ; CHECK-RV32-NEXT: addi a0, a0, 16 ; CHECK-RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-RV32-NEXT: vrgather.vv v16, v24, v8, v0.t +; CHECK-RV32-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-RV32-NEXT: vrgather.vv v16, v8, v24, v0.t ; CHECK-RV32-NEXT: addi a0, sp, 16 ; CHECK-RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-RV32-NEXT: csrr a0, vlenb @@ -1108,23 +1030,40 @@ define <64 x i32> @test_expandload_v64i32(ptr %base, <64 x i1> %mask, <64 x i32> ; CHECK-RV64-NEXT: addi a1, a1, 16 ; CHECK-RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-RV64-NEXT: li a1, 32 +; CHECK-RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-RV64-NEXT: viota.m v16, v0 +; CHECK-RV64-NEXT: csrr a2, vlenb +; CHECK-RV64-NEXT: li a3, 24 +; CHECK-RV64-NEXT: mul a2, a2, a3 +; CHECK-RV64-NEXT: add a2, sp, a2 +; CHECK-RV64-NEXT: addi a2, a2, 16 +; CHECK-RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: vcpop.m a2, v0 +; CHECK-RV64-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; CHECK-RV64-NEXT: vle32.v v24, (a0) +; CHECK-RV64-NEXT: csrr a2, vlenb +; CHECK-RV64-NEXT: li a3, 24 +; CHECK-RV64-NEXT: mul a2, a2, a3 +; CHECK-RV64-NEXT: add a2, sp, a2 +; CHECK-RV64-NEXT: addi a2, a2, 16 +; CHECK-RV64-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; CHECK-RV64-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-RV64-NEXT: vrgather.vv v8, v24, v16, v0.t +; CHECK-RV64-NEXT: addi a2, sp, 16 +; CHECK-RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; 
CHECK-RV64-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; CHECK-RV64-NEXT: vslidedown.vi v7, v0, 4 -; CHECK-RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-RV64-NEXT: vslidedown.vi v24, v0, 4 +; CHECK-RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; CHECK-RV64-NEXT: vmv.x.s a2, v0 -; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-RV64-NEXT: vcpop.m a3, v0 -; CHECK-RV64-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; CHECK-RV64-NEXT: vle32.v v24, (a0) -; CHECK-RV64-NEXT: csrr a3, vlenb -; CHECK-RV64-NEXT: li a4, 24 -; CHECK-RV64-NEXT: mul a3, a3, a4 -; CHECK-RV64-NEXT: add a3, sp, a3 -; CHECK-RV64-NEXT: addi a3, a3, 16 -; CHECK-RV64-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill -; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-RV64-NEXT: vcpop.m a3, v7 +; CHECK-RV64-NEXT: vcpop.m a3, v24 ; CHECK-RV64-NEXT: cpopw a2, a2 +; CHECK-RV64-NEXT: viota.m v0, v24 +; CHECK-RV64-NEXT: csrr a4, vlenb +; CHECK-RV64-NEXT: li a5, 24 +; CHECK-RV64-NEXT: mul a4, a4, a5 +; CHECK-RV64-NEXT: add a4, sp, a4 +; CHECK-RV64-NEXT: addi a4, a4, 16 +; CHECK-RV64-NEXT: vs8r.v v0, (a4) # Unknown-size Folded Spill ; CHECK-RV64-NEXT: slli a2, a2, 2 ; CHECK-RV64-NEXT: add a0, a0, a2 ; CHECK-RV64-NEXT: vsetvli zero, a3, e32, m8, ta, ma @@ -1134,30 +1073,25 @@ define <64 x i32> @test_expandload_v64i32(ptr %base, <64 x i1> %mask, <64 x i32> ; CHECK-RV64-NEXT: add a0, sp, a0 ; CHECK-RV64-NEXT: addi a0, a0, 16 ; CHECK-RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; CHECK-RV64-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; CHECK-RV64-NEXT: viota.m v24, v0 +; CHECK-RV64-NEXT: vmv1r.v v0, v24 ; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: li a1, 24 -; CHECK-RV64-NEXT: mul a0, a0, a1 +; CHECK-RV64-NEXT: slli a0, a0, 4 ; CHECK-RV64-NEXT: add a0, sp, a0 ; CHECK-RV64-NEXT: addi a0, a0, 16 ; CHECK-RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-RV64-NEXT: vrgather.vv v8, v16, v24, v0.t -; CHECK-RV64-NEXT: addi a0, sp, 16 -; CHECK-RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-RV64-NEXT: viota.m v8, v7 -; CHECK-RV64-NEXT: vmv1r.v v0, v7 ; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: slli a0, a0, 4 +; CHECK-RV64-NEXT: li a2, 24 +; CHECK-RV64-NEXT: mul a0, a0, a2 ; CHECK-RV64-NEXT: add a0, sp, a0 ; CHECK-RV64-NEXT: addi a0, a0, 16 -; CHECK-RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; CHECK-RV64-NEXT: csrr a0, vlenb ; CHECK-RV64-NEXT: slli a0, a0, 3 ; CHECK-RV64-NEXT: add a0, sp, a0 ; CHECK-RV64-NEXT: addi a0, a0, 16 -; CHECK-RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-RV64-NEXT: vrgather.vv v16, v24, v8, v0.t +; CHECK-RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-RV64-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-RV64-NEXT: vrgather.vv v16, v8, v24, v0.t ; CHECK-RV64-NEXT: addi a0, sp, 16 ; CHECK-RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-RV64-NEXT: csrr a0, vlenb @@ -1329,33 +1263,34 @@ define <32 x i64> @test_expandload_v32i64(ptr %base, <32 x i1> %mask, <32 x i64> ; CHECK-RV32-NEXT: addi sp, sp, -16 ; CHECK-RV32-NEXT: .cfi_def_cfa_offset 16 ; CHECK-RV32-NEXT: csrr a1, vlenb -; CHECK-RV32-NEXT: slli a1, a1, 5 +; CHECK-RV32-NEXT: li a2, 24 +; CHECK-RV32-NEXT: mul a1, a1, a2 ; CHECK-RV32-NEXT: sub sp, sp, a1 -; CHECK-RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; CHECK-RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 
0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb ; CHECK-RV32-NEXT: csrr a1, vlenb ; CHECK-RV32-NEXT: slli a1, a1, 4 ; CHECK-RV32-NEXT: add a1, sp, a1 ; CHECK-RV32-NEXT: addi a1, a1, 16 ; CHECK-RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; CHECK-RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-RV32-NEXT: vcpop.m a1, v0 -; CHECK-RV32-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-RV32-NEXT: viota.m v16, v0 +; CHECK-RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-RV32-NEXT: vle64.v v24, (a0) -; CHECK-RV32-NEXT: csrr a1, vlenb -; CHECK-RV32-NEXT: li a2, 24 -; CHECK-RV32-NEXT: mul a1, a1, a2 -; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: addi a1, a1, 16 -; CHECK-RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; CHECK-RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; CHECK-RV32-NEXT: vrgather.vv v8, v24, v16, v0.t +; CHECK-RV32-NEXT: addi a1, sp, 16 +; CHECK-RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-RV32-NEXT: vmv.x.s a1, v0 ; CHECK-RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-RV32-NEXT: vslidedown.vi v7, v0, 2 +; CHECK-RV32-NEXT: vslidedown.vi v0, v0, 2 ; CHECK-RV32-NEXT: zext.h a1, a1 ; CHECK-RV32-NEXT: cpop a1, a1 ; CHECK-RV32-NEXT: slli a1, a1, 3 ; CHECK-RV32-NEXT: add a0, a0, a1 ; CHECK-RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-RV32-NEXT: vcpop.m a1, v7 +; CHECK-RV32-NEXT: vcpop.m a1, v0 ; CHECK-RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-RV32-NEXT: vle64.v v16, (a0) ; CHECK-RV32-NEXT: csrr a0, vlenb @@ -1364,18 +1299,7 @@ define <32 x i64> @test_expandload_v32i64(ptr %base, <32 x i1> %mask, <32 x i64> ; CHECK-RV32-NEXT: addi a0, a0, 16 ; CHECK-RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; CHECK-RV32-NEXT: viota.m v24, v0 -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: li a1, 24 -; CHECK-RV32-NEXT: mul a0, a0, a1 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-RV32-NEXT: vrgather.vv v8, v16, v24, v0.t -; CHECK-RV32-NEXT: addi a0, sp, 16 -; CHECK-RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-RV32-NEXT: viota.m v8, v7 -; CHECK-RV32-NEXT: vmv1r.v v0, v7 +; CHECK-RV32-NEXT: viota.m v8, v0 ; CHECK-RV32-NEXT: csrr a0, vlenb ; CHECK-RV32-NEXT: slli a0, a0, 4 ; CHECK-RV32-NEXT: add a0, sp, a0 @@ -1390,7 +1314,8 @@ define <32 x i64> @test_expandload_v32i64(ptr %base, <32 x i1> %mask, <32 x i64> ; CHECK-RV32-NEXT: addi a0, sp, 16 ; CHECK-RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 5 +; CHECK-RV32-NEXT: li a1, 24 +; CHECK-RV32-NEXT: mul a0, a0, a1 ; CHECK-RV32-NEXT: add sp, sp, a0 ; CHECK-RV32-NEXT: .cfi_def_cfa sp, 16 ; CHECK-RV32-NEXT: addi sp, sp, 16 @@ -1402,33 +1327,34 @@ define <32 x i64> @test_expandload_v32i64(ptr %base, <32 x i1> %mask, <32 x i64> ; CHECK-RV64-NEXT: addi sp, sp, -16 ; CHECK-RV64-NEXT: .cfi_def_cfa_offset 16 ; CHECK-RV64-NEXT: csrr a1, vlenb -; CHECK-RV64-NEXT: slli a1, a1, 5 +; CHECK-RV64-NEXT: li a2, 24 +; CHECK-RV64-NEXT: mul a1, a1, a2 ; CHECK-RV64-NEXT: sub sp, sp, a1 -; CHECK-RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; CHECK-RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 
0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb ; CHECK-RV64-NEXT: csrr a1, vlenb ; CHECK-RV64-NEXT: slli a1, a1, 4 ; CHECK-RV64-NEXT: add a1, sp, a1 ; CHECK-RV64-NEXT: addi a1, a1, 16 ; CHECK-RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; CHECK-RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-RV64-NEXT: vcpop.m a1, v0 -; CHECK-RV64-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-RV64-NEXT: viota.m v16, v0 +; CHECK-RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-RV64-NEXT: vle64.v v24, (a0) -; CHECK-RV64-NEXT: csrr a1, vlenb -; CHECK-RV64-NEXT: li a2, 24 -; CHECK-RV64-NEXT: mul a1, a1, a2 -; CHECK-RV64-NEXT: add a1, sp, a1 -; CHECK-RV64-NEXT: addi a1, a1, 16 -; CHECK-RV64-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; CHECK-RV64-NEXT: vrgather.vv v8, v24, v16, v0.t +; CHECK-RV64-NEXT: addi a1, sp, 16 +; CHECK-RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-RV64-NEXT: vmv.x.s a1, v0 ; CHECK-RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-RV64-NEXT: vslidedown.vi v7, v0, 2 +; CHECK-RV64-NEXT: vslidedown.vi v0, v0, 2 ; CHECK-RV64-NEXT: zext.h a1, a1 ; CHECK-RV64-NEXT: cpopw a1, a1 ; CHECK-RV64-NEXT: slli a1, a1, 3 ; CHECK-RV64-NEXT: add a0, a0, a1 ; CHECK-RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-RV64-NEXT: vcpop.m a1, v7 +; CHECK-RV64-NEXT: vcpop.m a1, v0 ; CHECK-RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-RV64-NEXT: vle64.v v16, (a0) ; CHECK-RV64-NEXT: csrr a0, vlenb @@ -1437,18 +1363,7 @@ define <32 x i64> @test_expandload_v32i64(ptr %base, <32 x i1> %mask, <32 x i64> ; CHECK-RV64-NEXT: addi a0, a0, 16 ; CHECK-RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; CHECK-RV64-NEXT: viota.m v24, v0 -; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: li a1, 24 -; CHECK-RV64-NEXT: mul a0, a0, a1 -; CHECK-RV64-NEXT: add a0, sp, a0 -; CHECK-RV64-NEXT: addi a0, a0, 16 -; CHECK-RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-RV64-NEXT: vrgather.vv v8, v16, v24, v0.t -; CHECK-RV64-NEXT: addi a0, sp, 16 -; CHECK-RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-RV64-NEXT: viota.m v8, v7 -; CHECK-RV64-NEXT: vmv1r.v v0, v7 +; CHECK-RV64-NEXT: viota.m v8, v0 ; CHECK-RV64-NEXT: csrr a0, vlenb ; CHECK-RV64-NEXT: slli a0, a0, 4 ; CHECK-RV64-NEXT: add a0, sp, a0 @@ -1463,7 +1378,8 @@ define <32 x i64> @test_expandload_v32i64(ptr %base, <32 x i1> %mask, <32 x i64> ; CHECK-RV64-NEXT: addi a0, sp, 16 ; CHECK-RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: slli a0, a0, 5 +; CHECK-RV64-NEXT: li a1, 24 +; CHECK-RV64-NEXT: mul a0, a0, a1 ; CHECK-RV64-NEXT: add sp, sp, a0 ; CHECK-RV64-NEXT: .cfi_def_cfa sp, 16 ; CHECK-RV64-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast-large-vector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast-large-vector.ll index 425422417ec78..753a90c22a366 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast-large-vector.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast-large-vector.ll @@ -9,10 +9,10 @@ define <512 x i8> @bitcast_1024B(<256 x i16> %a, <512 x i8> %b) { ; VLEN256-NEXT: addi a1, a0, 256 ; VLEN256-NEXT: li a2, 256 ; VLEN256-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; VLEN256-NEXT: vle8.v v24, (a0) -; 
VLEN256-NEXT: vle8.v v0, (a1) -; VLEN256-NEXT: vadd.vv v8, v24, v8 -; VLEN256-NEXT: vadd.vv v16, v0, v16 +; VLEN256-NEXT: vle8.v v24, (a1) +; VLEN256-NEXT: vle8.v v0, (a0) +; VLEN256-NEXT: vadd.vv v8, v0, v8 +; VLEN256-NEXT: vadd.vv v16, v24, v16 ; VLEN256-NEXT: ret ; ; VLEN512-LABEL: bitcast_1024B: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll index 5ea4924468595..d86fcaf1df5f6 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll @@ -1676,35 +1676,36 @@ define <15 x i64> @vp_bitreverse_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroex ; RV32-NEXT: addi a3, a4, 819 ; RV32-NEXT: sw a3, 32(sp) ; RV32-NEXT: sw a3, 36(sp) -; RV32-NEXT: addi a3, sp, 16 +; RV32-NEXT: lui a3, 4080 ; RV32-NEXT: addi a4, a5, 1365 -; RV32-NEXT: vsll.vx v16, v8, a1, v0.t -; RV32-NEXT: addi a5, a6, -256 ; RV32-NEXT: sw a4, 24(sp) ; RV32-NEXT: sw a4, 28(sp) +; RV32-NEXT: addi a4, sp, 16 +; RV32-NEXT: vsll.vx v16, v8, a1, v0.t +; RV32-NEXT: addi a5, a6, -256 ; RV32-NEXT: vand.vx v8, v8, a5, v0.t ; RV32-NEXT: vsll.vx v8, v8, a2, v0.t ; RV32-NEXT: vor.vv v8, v16, v8, v0.t +; RV32-NEXT: csrr a6, vlenb +; RV32-NEXT: slli a6, a6, 4 +; RV32-NEXT: add a6, sp, a6 +; RV32-NEXT: addi a6, a6, 48 +; RV32-NEXT: vs8r.v v8, (a6) # Unknown-size Folded Spill +; RV32-NEXT: vand.vx v16, v24, a3, v0.t +; RV32-NEXT: vsll.vi v8, v16, 24, v0.t +; RV32-NEXT: addi a6, sp, 48 +; RV32-NEXT: vs8r.v v8, (a6) # Unknown-size Folded Spill +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v8, (a4), zero ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 4 +; RV32-NEXT: slli a4, a4, 3 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 48 ; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a3), zero -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: lui a3, 4080 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vx v16, v24, a3, v0.t -; RV32-NEXT: vsll.vi v16, v16, 24, v0.t -; RV32-NEXT: addi a4, sp, 48 -; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vand.vv v16, v24, v8, v0.t ; RV32-NEXT: vsll.vi v16, v16, 8, v0.t +; RV32-NEXT: addi a4, sp, 48 ; RV32-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload ; RV32-NEXT: vor.vv v16, v8, v16, v0.t ; RV32-NEXT: csrr a4, vlenb @@ -1739,14 +1740,14 @@ define <15 x i64> @vp_bitreverse_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroex ; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: addi a1, sp, 40 ; RV32-NEXT: addi a2, sp, 32 +; RV32-NEXT: addi a3, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: addi a1, sp, 24 -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 48 +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vor.vv v8, v16, v8, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t @@ -1761,7 +1762,7 @@ define <15 x i64> @vp_bitreverse_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroex ; RV32-NEXT: vand.vv v16, v16, 
v8, v0.t ; RV32-NEXT: vand.vv v24, v24, v8, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a1), zero +; RV32-NEXT: vlse64.v v8, (a3), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsll.vi v24, v24, 2, v0.t ; RV32-NEXT: vor.vv v16, v16, v24, v0.t @@ -1869,59 +1870,60 @@ define <15 x i64> @vp_bitreverse_v15i64_unmasked(<15 x i64> %va, i32 zeroext %ev ; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: sub sp, sp, a1 ; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vmv8r.v v16, v8 ; RV32-NEXT: lui a1, 1044480 ; RV32-NEXT: lui a2, 61681 ; RV32-NEXT: lui a3, 209715 ; RV32-NEXT: lui a4, 349525 ; RV32-NEXT: li a5, 56 ; RV32-NEXT: lui a6, 16 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsll.vx v16, v8, a5 -; RV32-NEXT: vsrl.vx v24, v8, a5 +; RV32-NEXT: vsll.vx v8, v8, a5 +; RV32-NEXT: vsrl.vx v24, v16, a5 ; RV32-NEXT: li a5, 40 +; RV32-NEXT: addi a6, a6, -256 +; RV32-NEXT: vsrl.vx v0, v16, a5 +; RV32-NEXT: vand.vx v0, v0, a6 +; RV32-NEXT: vor.vv v24, v0, v24 +; RV32-NEXT: addi a7, sp, 48 +; RV32-NEXT: vs8r.v v24, (a7) # Unknown-size Folded Spill +; RV32-NEXT: vand.vx v0, v16, a6 +; RV32-NEXT: lui a6, 4080 +; RV32-NEXT: vsll.vx v0, v0, a5 +; RV32-NEXT: addi a5, sp, 16 +; RV32-NEXT: vor.vv v8, v8, v0 +; RV32-NEXT: csrr a7, vlenb +; RV32-NEXT: slli a7, a7, 3 +; RV32-NEXT: add a7, sp, a7 +; RV32-NEXT: addi a7, a7, 48 +; RV32-NEXT: vs8r.v v8, (a7) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v0, v16, 24 ; RV32-NEXT: sw a1, 16(sp) ; RV32-NEXT: sw zero, 20(sp) ; RV32-NEXT: addi a1, a2, -241 +; RV32-NEXT: addi a2, a3, 819 +; RV32-NEXT: addi a3, a4, 1365 +; RV32-NEXT: vand.vx v0, v0, a6 ; RV32-NEXT: sw a1, 40(sp) ; RV32-NEXT: sw a1, 44(sp) -; RV32-NEXT: lui a1, 4080 -; RV32-NEXT: addi a2, a3, 819 ; RV32-NEXT: sw a2, 32(sp) ; RV32-NEXT: sw a2, 36(sp) -; RV32-NEXT: addi a2, sp, 16 -; RV32-NEXT: addi a3, a4, 1365 -; RV32-NEXT: addi a4, a6, -256 -; RV32-NEXT: vsrl.vx v0, v8, a5 ; RV32-NEXT: sw a3, 24(sp) ; RV32-NEXT: sw a3, 28(sp) -; RV32-NEXT: vand.vx v0, v0, a4 -; RV32-NEXT: vor.vv v24, v0, v24 -; RV32-NEXT: addi a3, sp, 48 -; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vand.vx v0, v8, a4 -; RV32-NEXT: vsll.vx v0, v0, a5 -; RV32-NEXT: vor.vv v16, v16, v0 -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v24, v16, 8 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v0, (a2), zero +; RV32-NEXT: vlse64.v v8, (a5), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 24 -; RV32-NEXT: vand.vx v16, v16, a1 -; RV32-NEXT: vsrl.vi v24, v8, 8 -; RV32-NEXT: vand.vv v24, v24, v0 -; RV32-NEXT: vor.vv v16, v24, v16 -; RV32-NEXT: vand.vv v24, v8, v0 -; RV32-NEXT: vand.vx v8, v8, a1 -; RV32-NEXT: vsll.vi v8, v8, 24 -; RV32-NEXT: vsll.vi v24, v24, 8 -; RV32-NEXT: vor.vv v0, v8, v24 +; RV32-NEXT: vand.vv v24, v24, v8 +; RV32-NEXT: vor.vv v24, v24, v0 +; RV32-NEXT: vand.vv v8, v16, v8 +; RV32-NEXT: vand.vx v16, v16, a6 +; RV32-NEXT: vsll.vi v16, v16, 24 +; RV32-NEXT: vsll.vi v8, v8, 8 +; RV32-NEXT: vor.vv v0, v16, v8 ; RV32-NEXT: addi a1, sp, 48 ; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v8, v16, v8 +; RV32-NEXT: vor.vv v8, v24, v8 ; RV32-NEXT: addi a1, sp, 40 
; RV32-NEXT: addi a2, sp, 32 ; RV32-NEXT: csrr a3, vlenb @@ -2072,35 +2074,36 @@ define <16 x i64> @vp_bitreverse_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroex ; RV32-NEXT: addi a3, a4, 819 ; RV32-NEXT: sw a3, 32(sp) ; RV32-NEXT: sw a3, 36(sp) -; RV32-NEXT: addi a3, sp, 16 +; RV32-NEXT: lui a3, 4080 ; RV32-NEXT: addi a4, a5, 1365 -; RV32-NEXT: vsll.vx v16, v8, a1, v0.t -; RV32-NEXT: addi a5, a6, -256 ; RV32-NEXT: sw a4, 24(sp) ; RV32-NEXT: sw a4, 28(sp) +; RV32-NEXT: addi a4, sp, 16 +; RV32-NEXT: vsll.vx v16, v8, a1, v0.t +; RV32-NEXT: addi a5, a6, -256 ; RV32-NEXT: vand.vx v8, v8, a5, v0.t ; RV32-NEXT: vsll.vx v8, v8, a2, v0.t ; RV32-NEXT: vor.vv v8, v16, v8, v0.t +; RV32-NEXT: csrr a6, vlenb +; RV32-NEXT: slli a6, a6, 4 +; RV32-NEXT: add a6, sp, a6 +; RV32-NEXT: addi a6, a6, 48 +; RV32-NEXT: vs8r.v v8, (a6) # Unknown-size Folded Spill +; RV32-NEXT: vand.vx v16, v24, a3, v0.t +; RV32-NEXT: vsll.vi v8, v16, 24, v0.t +; RV32-NEXT: addi a6, sp, 48 +; RV32-NEXT: vs8r.v v8, (a6) # Unknown-size Folded Spill +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v8, (a4), zero ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 4 +; RV32-NEXT: slli a4, a4, 3 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 48 ; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a3), zero -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: lui a3, 4080 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vx v16, v24, a3, v0.t -; RV32-NEXT: vsll.vi v16, v16, 24, v0.t -; RV32-NEXT: addi a4, sp, 48 -; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vand.vv v16, v24, v8, v0.t ; RV32-NEXT: vsll.vi v16, v16, 8, v0.t +; RV32-NEXT: addi a4, sp, 48 ; RV32-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload ; RV32-NEXT: vor.vv v16, v8, v16, v0.t ; RV32-NEXT: csrr a4, vlenb @@ -2135,14 +2138,14 @@ define <16 x i64> @vp_bitreverse_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroex ; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: addi a1, sp, 40 ; RV32-NEXT: addi a2, sp, 32 +; RV32-NEXT: addi a3, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: addi a1, sp, 24 -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 48 +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vor.vv v8, v16, v8, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t @@ -2157,7 +2160,7 @@ define <16 x i64> @vp_bitreverse_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroex ; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: vand.vv v24, v24, v8, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a1), zero +; RV32-NEXT: vlse64.v v8, (a3), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsll.vi v24, v24, 2, v0.t ; RV32-NEXT: vor.vv v16, v16, v24, v0.t @@ -2265,59 +2268,60 @@ define <16 x i64> @vp_bitreverse_v16i64_unmasked(<16 x i64> %va, i32 zeroext %ev ; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: sub sp, sp, a1 ; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 
0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vmv8r.v v16, v8 ; RV32-NEXT: lui a1, 1044480 ; RV32-NEXT: lui a2, 61681 ; RV32-NEXT: lui a3, 209715 ; RV32-NEXT: lui a4, 349525 ; RV32-NEXT: li a5, 56 ; RV32-NEXT: lui a6, 16 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsll.vx v16, v8, a5 -; RV32-NEXT: vsrl.vx v24, v8, a5 +; RV32-NEXT: vsll.vx v8, v8, a5 +; RV32-NEXT: vsrl.vx v24, v16, a5 ; RV32-NEXT: li a5, 40 +; RV32-NEXT: addi a6, a6, -256 +; RV32-NEXT: vsrl.vx v0, v16, a5 +; RV32-NEXT: vand.vx v0, v0, a6 +; RV32-NEXT: vor.vv v24, v0, v24 +; RV32-NEXT: addi a7, sp, 48 +; RV32-NEXT: vs8r.v v24, (a7) # Unknown-size Folded Spill +; RV32-NEXT: vand.vx v0, v16, a6 +; RV32-NEXT: lui a6, 4080 +; RV32-NEXT: vsll.vx v0, v0, a5 +; RV32-NEXT: addi a5, sp, 16 +; RV32-NEXT: vor.vv v8, v8, v0 +; RV32-NEXT: csrr a7, vlenb +; RV32-NEXT: slli a7, a7, 3 +; RV32-NEXT: add a7, sp, a7 +; RV32-NEXT: addi a7, a7, 48 +; RV32-NEXT: vs8r.v v8, (a7) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v0, v16, 24 ; RV32-NEXT: sw a1, 16(sp) ; RV32-NEXT: sw zero, 20(sp) ; RV32-NEXT: addi a1, a2, -241 +; RV32-NEXT: addi a2, a3, 819 +; RV32-NEXT: addi a3, a4, 1365 +; RV32-NEXT: vand.vx v0, v0, a6 ; RV32-NEXT: sw a1, 40(sp) ; RV32-NEXT: sw a1, 44(sp) -; RV32-NEXT: lui a1, 4080 -; RV32-NEXT: addi a2, a3, 819 ; RV32-NEXT: sw a2, 32(sp) ; RV32-NEXT: sw a2, 36(sp) -; RV32-NEXT: addi a2, sp, 16 -; RV32-NEXT: addi a3, a4, 1365 -; RV32-NEXT: addi a4, a6, -256 -; RV32-NEXT: vsrl.vx v0, v8, a5 ; RV32-NEXT: sw a3, 24(sp) ; RV32-NEXT: sw a3, 28(sp) -; RV32-NEXT: vand.vx v0, v0, a4 -; RV32-NEXT: vor.vv v24, v0, v24 -; RV32-NEXT: addi a3, sp, 48 -; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vand.vx v0, v8, a4 -; RV32-NEXT: vsll.vx v0, v0, a5 -; RV32-NEXT: vor.vv v16, v16, v0 -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v24, v16, 8 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v0, (a2), zero +; RV32-NEXT: vlse64.v v8, (a5), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 24 -; RV32-NEXT: vand.vx v16, v16, a1 -; RV32-NEXT: vsrl.vi v24, v8, 8 -; RV32-NEXT: vand.vv v24, v24, v0 -; RV32-NEXT: vor.vv v16, v24, v16 -; RV32-NEXT: vand.vv v24, v8, v0 -; RV32-NEXT: vand.vx v8, v8, a1 -; RV32-NEXT: vsll.vi v8, v8, 24 -; RV32-NEXT: vsll.vi v24, v24, 8 -; RV32-NEXT: vor.vv v0, v8, v24 +; RV32-NEXT: vand.vv v24, v24, v8 +; RV32-NEXT: vor.vv v24, v24, v0 +; RV32-NEXT: vand.vv v8, v16, v8 +; RV32-NEXT: vand.vx v16, v16, a6 +; RV32-NEXT: vsll.vi v16, v16, 24 +; RV32-NEXT: vsll.vi v8, v8, 8 +; RV32-NEXT: vor.vv v0, v16, v8 ; RV32-NEXT: addi a1, sp, 48 ; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v8, v16, v8 +; RV32-NEXT: vor.vv v8, v24, v8 ; RV32-NEXT: addi a1, sp, 40 ; RV32-NEXT: addi a2, sp, 32 ; RV32-NEXT: csrr a3, vlenb diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll index d765e4c0b8f6a..a4e78a750aa64 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll @@ -768,61 +768,63 @@ define <15 x i64> @vp_bswap_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %ev ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: sub sp, sp, a1 ; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 
0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vmv8r.v v16, v8 ; RV32-NEXT: lui a1, 1044480 ; RV32-NEXT: li a2, 56 ; RV32-NEXT: lui a3, 16 ; RV32-NEXT: li a4, 40 -; RV32-NEXT: addi a5, sp, 8 +; RV32-NEXT: lui a5, 4080 +; RV32-NEXT: addi a6, sp, 8 ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsll.vx v16, v8, a2, v0.t +; RV32-NEXT: vsll.vx v8, v8, a2, v0.t ; RV32-NEXT: addi a1, a3, -256 -; RV32-NEXT: vand.vx v24, v8, a1, v0.t +; RV32-NEXT: vand.vx v24, v16, a1, v0.t ; RV32-NEXT: vsll.vx v24, v24, a4, v0.t -; RV32-NEXT: vor.vv v16, v16, v24, v0.t +; RV32-NEXT: vor.vv v8, v8, v24, v0.t ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vand.vx v8, v16, a5, v0.t +; RV32-NEXT: vsll.vi v8, v8, 24, v0.t +; RV32-NEXT: addi a3, sp, 16 +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a5), zero +; RV32-NEXT: vlse64.v v8, (a6), zero ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 3 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: lui a3, 4080 +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vx v24, v8, a3, v0.t -; RV32-NEXT: vsll.vi v24, v24, 24, v0.t +; RV32-NEXT: vand.vv v24, v16, v8, v0.t +; RV32-NEXT: vsll.vi v8, v24, 8, v0.t ; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; RV32-NEXT: vand.vv v24, v8, v16, v0.t -; RV32-NEXT: vsll.vi v16, v24, 8, v0.t ; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v16, v24, v16, v0.t +; RV32-NEXT: vor.vv v8, v24, v8, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v16, v24, v16, v0.t +; RV32-NEXT: vor.vv v8, v24, v8, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t -; RV32-NEXT: vsrl.vx v24, v8, a4, v0.t +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vx v8, v16, a2, v0.t +; RV32-NEXT: vsrl.vx v24, v16, a4, v0.t ; RV32-NEXT: vand.vx v24, v24, a1, v0.t -; RV32-NEXT: vor.vv v16, v24, v16, v0.t +; RV32-NEXT: vor.vv v8, v24, v8, v0.t ; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vi v24, v8, 24, v0.t -; RV32-NEXT: vand.vx v24, v24, a3, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v24, v16, 24, v0.t +; RV32-NEXT: vand.vx v24, v24, a5, v0.t +; RV32-NEXT: vsrl.vi v8, v16, 8, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 @@ -916,48 +918,48 @@ define <15 x i64> @vp_bswap_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: lui a3, 16 ; RV32-NEXT: li a4, 40 ; RV32-NEXT: lui a5, 4080 -; RV32-NEXT: addi a6, sp, 8 -; RV32-NEXT: sw a1, 8(sp) -; RV32-NEXT: sw zero, 
12(sp) ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsll.vx v24, v8, a2 -; RV32-NEXT: addi a1, a3, -256 ; RV32-NEXT: vsrl.vx v16, v8, a2 +; RV32-NEXT: addi a2, sp, 8 +; RV32-NEXT: addi a3, a3, -256 ; RV32-NEXT: vsrl.vx v0, v8, a4 -; RV32-NEXT: vand.vx v0, v0, a1 +; RV32-NEXT: vand.vx v0, v0, a3 ; RV32-NEXT: vor.vv v16, v0, v16 -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 3 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 16 -; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; RV32-NEXT: vand.vx v0, v8, a1 +; RV32-NEXT: csrr a6, vlenb +; RV32-NEXT: slli a6, a6, 3 +; RV32-NEXT: add a6, sp, a6 +; RV32-NEXT: addi a6, a6, 16 +; RV32-NEXT: vs8r.v v16, (a6) # Unknown-size Folded Spill +; RV32-NEXT: vand.vx v0, v8, a3 ; RV32-NEXT: vsll.vx v0, v0, a4 ; RV32-NEXT: vor.vv v16, v24, v0 -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: addi a3, sp, 16 +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v0, v8, 24 +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: sw zero, 12(sp) +; RV32-NEXT: vand.vx v0, v0, a5 +; RV32-NEXT: vsrl.vi v24, v8, 8 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v0, (a6), zero +; RV32-NEXT: vlse64.v v16, (a2), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 24 -; RV32-NEXT: vand.vx v16, v16, a5 -; RV32-NEXT: vsrl.vi v24, v8, 8 -; RV32-NEXT: vand.vv v24, v24, v0 -; RV32-NEXT: vor.vv v16, v24, v16 -; RV32-NEXT: vand.vv v24, v8, v0 +; RV32-NEXT: vand.vv v24, v24, v16 +; RV32-NEXT: vor.vv v24, v24, v0 +; RV32-NEXT: vand.vv v16, v8, v16 ; RV32-NEXT: vand.vx v8, v8, a5 ; RV32-NEXT: vsll.vi v8, v8, 24 -; RV32-NEXT: vsll.vi v24, v24, 8 -; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: vsll.vi v16, v16, 8 +; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v8, v24, v8 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v8, v16, v8 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v16, v16, v24 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v16, v24, v16 ; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 @@ -1031,61 +1033,63 @@ define <16 x i64> @vp_bswap_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %ev ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: sub sp, sp, a1 ; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vmv8r.v v16, v8 ; RV32-NEXT: lui a1, 1044480 ; RV32-NEXT: li a2, 56 ; RV32-NEXT: lui a3, 16 ; RV32-NEXT: li a4, 40 -; RV32-NEXT: addi a5, sp, 8 +; RV32-NEXT: lui a5, 4080 +; RV32-NEXT: addi a6, sp, 8 ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsll.vx v16, v8, a2, v0.t +; RV32-NEXT: vsll.vx v8, v8, a2, v0.t ; RV32-NEXT: addi a1, a3, -256 -; RV32-NEXT: vand.vx v24, v8, a1, v0.t +; RV32-NEXT: vand.vx v24, v16, a1, v0.t ; RV32-NEXT: vsll.vx v24, v24, a4, v0.t -; RV32-NEXT: vor.vv v16, v16, v24, v0.t +; RV32-NEXT: vor.vv v8, v8, v24, v0.t ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: 
vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vand.vx v8, v16, a5, v0.t +; RV32-NEXT: vsll.vi v8, v8, 24, v0.t +; RV32-NEXT: addi a3, sp, 16 +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a5), zero +; RV32-NEXT: vlse64.v v8, (a6), zero ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 3 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: lui a3, 4080 +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vx v24, v8, a3, v0.t -; RV32-NEXT: vsll.vi v24, v24, 24, v0.t +; RV32-NEXT: vand.vv v24, v16, v8, v0.t +; RV32-NEXT: vsll.vi v8, v24, 8, v0.t ; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; RV32-NEXT: vand.vv v24, v8, v16, v0.t -; RV32-NEXT: vsll.vi v16, v24, 8, v0.t ; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v16, v24, v16, v0.t +; RV32-NEXT: vor.vv v8, v24, v8, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v16, v24, v16, v0.t +; RV32-NEXT: vor.vv v8, v24, v8, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t -; RV32-NEXT: vsrl.vx v24, v8, a4, v0.t +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vx v8, v16, a2, v0.t +; RV32-NEXT: vsrl.vx v24, v16, a4, v0.t ; RV32-NEXT: vand.vx v24, v24, a1, v0.t -; RV32-NEXT: vor.vv v16, v24, v16, v0.t +; RV32-NEXT: vor.vv v8, v24, v8, v0.t ; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vi v24, v8, 24, v0.t -; RV32-NEXT: vand.vx v24, v24, a3, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v24, v16, 24, v0.t +; RV32-NEXT: vand.vx v24, v24, a5, v0.t +; RV32-NEXT: vsrl.vi v8, v16, 8, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 @@ -1179,48 +1183,48 @@ define <16 x i64> @vp_bswap_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: lui a3, 16 ; RV32-NEXT: li a4, 40 ; RV32-NEXT: lui a5, 4080 -; RV32-NEXT: addi a6, sp, 8 -; RV32-NEXT: sw a1, 8(sp) -; RV32-NEXT: sw zero, 12(sp) ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsll.vx v24, v8, a2 -; RV32-NEXT: addi a1, a3, -256 ; RV32-NEXT: vsrl.vx v16, v8, a2 +; RV32-NEXT: addi a2, sp, 8 +; RV32-NEXT: addi a3, a3, -256 ; RV32-NEXT: vsrl.vx v0, v8, a4 -; RV32-NEXT: vand.vx v0, v0, a1 +; RV32-NEXT: vand.vx v0, v0, a3 ; RV32-NEXT: vor.vv v16, v0, v16 -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 3 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 16 -; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; RV32-NEXT: vand.vx v0, v8, a1 +; RV32-NEXT: csrr a6, vlenb +; RV32-NEXT: slli a6, a6, 3 +; RV32-NEXT: add a6, sp, a6 +; RV32-NEXT: addi a6, a6, 16 +; RV32-NEXT: vs8r.v v16, (a6) # Unknown-size Folded Spill +; RV32-NEXT: vand.vx v0, v8, a3 ; RV32-NEXT: vsll.vx v0, v0, a4 ; RV32-NEXT: vor.vv v16, v24, v0 -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vs8r.v v16, (a1) # 
Unknown-size Folded Spill +; RV32-NEXT: addi a3, sp, 16 +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v0, v8, 24 +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: sw zero, 12(sp) +; RV32-NEXT: vand.vx v0, v0, a5 +; RV32-NEXT: vsrl.vi v24, v8, 8 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v0, (a6), zero +; RV32-NEXT: vlse64.v v16, (a2), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 24 -; RV32-NEXT: vand.vx v16, v16, a5 -; RV32-NEXT: vsrl.vi v24, v8, 8 -; RV32-NEXT: vand.vv v24, v24, v0 -; RV32-NEXT: vor.vv v16, v24, v16 -; RV32-NEXT: vand.vv v24, v8, v0 +; RV32-NEXT: vand.vv v24, v24, v16 +; RV32-NEXT: vor.vv v24, v24, v0 +; RV32-NEXT: vand.vv v16, v8, v16 ; RV32-NEXT: vand.vx v8, v8, a5 ; RV32-NEXT: vsll.vi v8, v8, 24 -; RV32-NEXT: vsll.vi v24, v24, 8 -; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: vsll.vi v16, v16, 8 +; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v8, v24, v8 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v8, v16, v8 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v16, v16, v24 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v16, v24, v16 ; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv-fastcc.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv-fastcc.ll index 60a9948198c8f..506d105fd4665 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv-fastcc.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv-fastcc.ll @@ -87,14 +87,14 @@ define fastcc <128 x i32> @ret_split_v128i32(ptr %x) { ; CHECK-NEXT: addi a2, a1, 256 ; CHECK-NEXT: vle32.v v16, (a2) ; CHECK-NEXT: addi a2, a1, 384 -; CHECK-NEXT: vle32.v v24, (a1) -; CHECK-NEXT: addi a1, a0, 384 -; CHECK-NEXT: vle32.v v0, (a2) -; CHECK-NEXT: addi a2, a0, 256 -; CHECK-NEXT: vse32.v v24, (a0) +; CHECK-NEXT: vle32.v v24, (a2) +; CHECK-NEXT: addi a2, a0, 384 +; CHECK-NEXT: vle32.v v0, (a1) +; CHECK-NEXT: addi a1, a0, 256 +; CHECK-NEXT: vse32.v v0, (a0) ; CHECK-NEXT: addi a0, a0, 128 -; CHECK-NEXT: vse32.v v0, (a1) -; CHECK-NEXT: vse32.v v16, (a2) +; CHECK-NEXT: vse32.v v24, (a2) +; CHECK-NEXT: vse32.v v16, (a1) ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: ret %v = load <128 x i32>, ptr %x diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll index f42b4a3a26aad..9b2c54458acbf 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll @@ -87,14 +87,14 @@ define <128 x i32> @ret_split_v128i32(ptr %x) { ; CHECK-NEXT: addi a2, a1, 256 ; CHECK-NEXT: vle32.v v16, (a2) ; CHECK-NEXT: addi a2, a1, 384 -; CHECK-NEXT: vle32.v v24, (a1) -; CHECK-NEXT: addi a1, a0, 384 -; CHECK-NEXT: vle32.v v0, (a2) -; CHECK-NEXT: addi a2, a0, 256 -; CHECK-NEXT: vse32.v v24, (a0) +; CHECK-NEXT: vle32.v v24, (a2) +; CHECK-NEXT: addi a2, a0, 384 +; CHECK-NEXT: vle32.v v0, (a1) +; CHECK-NEXT: addi a1, a0, 256 +; CHECK-NEXT: vse32.v v0, (a0) ; CHECK-NEXT: addi a0, a0, 128 -; CHECK-NEXT: vse32.v v0, (a1) -; CHECK-NEXT: vse32.v v16, (a2) +; CHECK-NEXT: vse32.v v24, (a2) +; CHECK-NEXT: 
vse32.v v16, (a1) ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: ret %v = load <128 x i32>, ptr %x diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll index 9d0d42cf754c5..3a4d9e3c11de7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll @@ -1503,38 +1503,29 @@ declare <15 x i64> @llvm.vp.ctlz.v15i64(<15 x i64>, i1 immarg, <15 x i1>, i32) define <15 x i64> @vp_ctlz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_v15i64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -48 -; RV32-NEXT: .cfi_def_cfa_offset 48 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 -; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 40(sp) -; RV32-NEXT: sw a1, 44(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: sw a1, 28(sp) ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 32(sp) -; RV32-NEXT: sw a1, 36(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: sw a1, 20(sp) ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: sw a1, 16(sp) -; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: addi a1, sp, 40 +; RV32-NEXT: sw a1, 0(sp) +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: addi a1, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vor.vv v8, v8, v16, v0.t @@ -1547,57 +1538,34 @@ define <15 x i64> @vp_ctlz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl ; RV32-NEXT: vsrl.vi v16, v8, 16, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t -; RV32-NEXT: addi a1, sp, 32 -; RV32-NEXT: vor.vv v16, v8, v16, v0.t +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: vnot.v v16, v8, v0.t +; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t +; RV32-NEXT: vand.vv v24, v8, v24, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a1), zero -; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vnot.v v16, v16, v0.t -; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v24, v0.t -; RV32-NEXT: addi a1, sp, 24 -; RV32-NEXT: vsub.vv v8, v16, v8, v0.t -; RV32-NEXT: addi a2, sp, 48 -; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v8, v24, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v24, v8, v24, v0.t +; RV32-NEXT: vsub.vv v24, v16, v24, v0.t +; RV32-NEXT: vand.vv v16, v24, v8, v0.t +; 
RV32-NEXT: vsrl.vi v24, v24, 2, v0.t +; RV32-NEXT: vand.vv v24, v24, v8, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a1), zero -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: mv a1, sp ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v24, v16, v24, v0.t +; RV32-NEXT: vadd.vv v16, v16, v24, v0.t +; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t +; RV32-NEXT: vadd.vv v16, v16, v24, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vlse64.v v24, (a1), zero +; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v24, 4, v0.t -; RV32-NEXT: vadd.vv v8, v24, v8, v0.t -; RV32-NEXT: li a0, 56 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v24, v0.t -; RV32-NEXT: vmul.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: .cfi_def_cfa sp, 48 -; RV32-NEXT: addi sp, sp, 48 +; RV32-NEXT: vand.vv v8, v16, v8, v0.t +; RV32-NEXT: vmul.vv v8, v8, v24, v0.t +; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; @@ -1775,38 +1743,29 @@ declare <16 x i64> @llvm.vp.ctlz.v16i64(<16 x i64>, i1 immarg, <16 x i1>, i32) define <16 x i64> @vp_ctlz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_v16i64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -48 -; RV32-NEXT: .cfi_def_cfa_offset 48 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 -; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 40(sp) -; RV32-NEXT: sw a1, 44(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: sw a1, 28(sp) ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 32(sp) -; RV32-NEXT: sw a1, 36(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: sw a1, 20(sp) ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: sw a1, 16(sp) -; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: addi a1, sp, 40 +; RV32-NEXT: sw a1, 0(sp) +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: addi a1, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vor.vv v8, v8, v16, v0.t @@ -1819,57 +1778,34 @@ define <16 x i64> @vp_ctlz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl ; RV32-NEXT: vsrl.vi v16, v8, 16, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vx v16, v8, 
a1, v0.t -; RV32-NEXT: addi a1, sp, 32 -; RV32-NEXT: vor.vv v16, v8, v16, v0.t +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: vnot.v v16, v8, v0.t +; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t +; RV32-NEXT: vand.vv v24, v8, v24, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a1), zero -; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vnot.v v16, v16, v0.t -; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v24, v0.t -; RV32-NEXT: addi a1, sp, 24 -; RV32-NEXT: vsub.vv v8, v16, v8, v0.t -; RV32-NEXT: addi a2, sp, 48 -; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v8, v24, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v24, v8, v24, v0.t +; RV32-NEXT: vsub.vv v24, v16, v24, v0.t +; RV32-NEXT: vand.vv v16, v24, v8, v0.t +; RV32-NEXT: vsrl.vi v24, v24, 2, v0.t +; RV32-NEXT: vand.vv v24, v24, v8, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a1), zero -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: mv a1, sp ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v24, v16, v24, v0.t +; RV32-NEXT: vadd.vv v16, v16, v24, v0.t +; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t +; RV32-NEXT: vadd.vv v16, v16, v24, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vlse64.v v24, (a1), zero +; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v24, 4, v0.t -; RV32-NEXT: vadd.vv v8, v24, v8, v0.t -; RV32-NEXT: li a0, 56 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v24, v0.t -; RV32-NEXT: vmul.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: .cfi_def_cfa sp, 48 -; RV32-NEXT: addi sp, sp, 48 +; RV32-NEXT: vand.vv v8, v16, v8, v0.t +; RV32-NEXT: vmul.vv v8, v8, v24, v0.t +; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; @@ -2055,7 +1991,8 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: sub sp, sp, a1 ; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 48 ; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill @@ -2087,7 +2024,6 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: li a1, 32 ; RV32-NEXT: addi a3, sp, 40 -; RV32-NEXT: addi a4, sp, 32 ; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t 
@@ -2102,34 +2038,25 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a3), zero ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a5, 40 -; RV32-NEXT: mul a3, a3, a5 +; RV32-NEXT: li a4, 48 +; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vnot.v v16, v8, v0.t +; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 5 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a4), zero -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 48 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 24 +; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 @@ -2137,38 +2064,41 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 24 +; RV32-NEXT: li a4, 48 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 5 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vsub.vv v16, v16, v8, v0.t +; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 5 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: addi a3, sp, 32 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v8, (a3), zero ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 5 +; RV32-NEXT: li a4, 40 +; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 48 -; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: slli a3, a3, 5 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 24 -; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill @@ -2180,61 +2110,37 
@@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t ; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 24 -; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload ; RV32-NEXT: vadd.vv v8, v8, v16, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v16, v0.t ; RV32-NEXT: addi a3, sp, 24 -; RV32-NEXT: addi a4, sp, 16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a3), zero -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a5, 24 -; RV32-NEXT: mul a3, a3, a5 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vlse64.v v8, (a4), zero +; RV32-NEXT: vlse64.v v16, (a3), zero ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 5 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: addi a3, sp, 16 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a3), zero ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 3 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: addi a2, sp, 48 -; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vadd.vv v16, v8, v16, v0.t -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 24 -; RV32-NEXT: mul a2, a2, a3 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v16, v8, v0.t -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 5 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vmul.vv v8, v16, v8, v0.t +; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: li a2, 56 ; RV32-NEXT: vsrl.vx v8, v8, a2, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 +; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill @@ -2244,7 +2150,8 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: and a0, a0, a3 ; RV32-NEXT: vmv1r.v v0, v24 ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 +; RV32-NEXT: li a4, 24 +; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload @@ -2266,18 +2173,20 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 
; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 +; RV32-NEXT: li a1, 48 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload @@ -2290,41 +2199,25 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: li a1, 48 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 -; RV32-NEXT: mul a0, a0, a1 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v16, v8, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 -; RV32-NEXT: mul a0, a0, a1 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 48 +; RV32-NEXT: li a1, 40 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: li a1, 48 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload @@ -2332,21 +2225,20 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV32-NEXT: vadd.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 24 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload @@ -2550,80 +2442,63 @@ define <32 x i64> @vp_ctlz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: vsrl.vi v0, v16, 8 ; RV32-NEXT: vor.vv v16, v16, v0 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vnot.v v0, v8 +; RV32-NEXT: vnot.v v8, v8 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v16, 16 -; RV32-NEXT: vor.vv v16, v16, v8 +; RV32-NEXT: vsrl.vi v0, v16, 16 +; RV32-NEXT: vor.vv 
v16, v16, v0 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v0, 1 -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 3 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 48 -; RV32-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v24, v8, v24 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a3), zero -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsub.vv v24, v0, v24 +; RV32-NEXT: vsrl.vi v0, v8, 1 +; RV32-NEXT: vand.vv v0, v0, v24 +; RV32-NEXT: vsub.vv v0, v8, v0 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vx v0, v16, a2 -; RV32-NEXT: vor.vv v16, v16, v0 +; RV32-NEXT: vsrl.vx v8, v16, a2 +; RV32-NEXT: vor.vv v24, v16, v8 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v8, (a3), zero ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v0, v24, v8 -; RV32-NEXT: vsrl.vi v24, v24, 2 -; RV32-NEXT: vand.vv v24, v24, v8 -; RV32-NEXT: vadd.vv v24, v0, v24 +; RV32-NEXT: vand.vv v16, v0, v8 +; RV32-NEXT: vsrl.vi v0, v0, 2 +; RV32-NEXT: vand.vv v0, v0, v8 +; RV32-NEXT: vadd.vv v16, v16, v0 ; RV32-NEXT: addi a2, sp, 48 -; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vnot.v v16, v16 -; RV32-NEXT: vsrl.vi v0, v16, 1 +; RV32-NEXT: vnot.v v24, v24 +; RV32-NEXT: vsrl.vi v0, v24, 1 ; RV32-NEXT: csrr a2, vlenb ; RV32-NEXT: slli a2, a2, 3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v0, v0, v24 +; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v0, v0, v16 ; RV32-NEXT: addi a2, sp, 24 ; RV32-NEXT: addi a3, sp, 16 -; RV32-NEXT: vsub.vv v0, v16, v0 +; RV32-NEXT: vsub.vv v24, v24, v0 ; RV32-NEXT: addi a4, sp, 48 -; RV32-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v24, 4 -; RV32-NEXT: vadd.vv v16, v24, v16 -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 3 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 48 -; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v0, v16, 4 +; RV32-NEXT: vadd.vv v16, v16, v0 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v0, v8 -; RV32-NEXT: vsrl.vi v0, v0, 2 -; RV32-NEXT: vand.vv v8, v0, v8 +; RV32-NEXT: vand.vv v0, v24, v8 +; RV32-NEXT: vsrl.vi v24, v24, 2 +; RV32-NEXT: vand.vv v8, v24, v8 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v0, (a2), zero +; RV32-NEXT: vlse64.v v24, (a2), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v8, v24, v8 +; RV32-NEXT: vadd.vv v8, v0, v8 +; RV32-NEXT: vsrl.vi v0, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v0 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v24, (a3), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v16 -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 3 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vlse64.v v0, (a3), zero ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v0 +; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma 
-; RV32-NEXT: vand.vv v8, v8, v0 +; RV32-NEXT: vand.vv v8, v8, v24 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vmul.vv v16, v16, v24 +; RV32-NEXT: vmul.vv v16, v16, v0 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vmul.vv v24, v8, v24 +; RV32-NEXT: vmul.vv v24, v8, v0 ; RV32-NEXT: li a2, 56 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vsrl.vx v8, v16, a2 @@ -4213,38 +4088,29 @@ define <8 x i64> @vp_ctlz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext % define <15 x i64> @vp_ctlz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_zero_undef_v15i64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -48 -; RV32-NEXT: .cfi_def_cfa_offset 48 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 -; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 40(sp) -; RV32-NEXT: sw a1, 44(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: sw a1, 28(sp) ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 32(sp) -; RV32-NEXT: sw a1, 36(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: sw a1, 20(sp) ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: sw a1, 16(sp) -; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: addi a1, sp, 40 +; RV32-NEXT: sw a1, 0(sp) +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: addi a1, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vor.vv v8, v8, v16, v0.t @@ -4257,57 +4123,34 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 z ; RV32-NEXT: vsrl.vi v16, v8, 16, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t -; RV32-NEXT: addi a1, sp, 32 -; RV32-NEXT: vor.vv v16, v8, v16, v0.t +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: vnot.v v16, v8, v0.t +; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t +; RV32-NEXT: vand.vv v24, v8, v24, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a1), zero -; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vnot.v v16, v16, v0.t -; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v24, v0.t -; RV32-NEXT: addi a1, sp, 24 -; RV32-NEXT: vsub.vv v8, v16, v8, v0.t -; RV32-NEXT: addi a2, sp, 48 -; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v8, v24, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v24, v8, v24, v0.t +; RV32-NEXT: vsub.vv v24, v16, v24, v0.t +; 
RV32-NEXT: vand.vv v16, v24, v8, v0.t +; RV32-NEXT: vsrl.vi v24, v24, 2, v0.t +; RV32-NEXT: vand.vv v24, v24, v8, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a1), zero -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: mv a1, sp ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v24, v16, v24, v0.t +; RV32-NEXT: vadd.vv v16, v16, v24, v0.t +; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t +; RV32-NEXT: vadd.vv v16, v16, v24, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vlse64.v v24, (a1), zero +; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v24, 4, v0.t -; RV32-NEXT: vadd.vv v8, v24, v8, v0.t -; RV32-NEXT: li a0, 56 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v24, v0.t -; RV32-NEXT: vmul.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: .cfi_def_cfa sp, 48 -; RV32-NEXT: addi sp, sp, 48 +; RV32-NEXT: vand.vv v8, v16, v8, v0.t +; RV32-NEXT: vmul.vv v8, v8, v24, v0.t +; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; @@ -4483,38 +4326,29 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex define <16 x i64> @vp_ctlz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_zero_undef_v16i64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -48 -; RV32-NEXT: .cfi_def_cfa_offset 48 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 -; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 40(sp) -; RV32-NEXT: sw a1, 44(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: sw a1, 28(sp) ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 32(sp) -; RV32-NEXT: sw a1, 36(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: sw a1, 20(sp) ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: sw a1, 16(sp) -; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: addi a1, sp, 40 +; RV32-NEXT: sw a1, 0(sp) +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: addi a1, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vor.vv v8, v8, v16, v0.t @@ -4527,57 +4361,34 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 z ; RV32-NEXT: vsrl.vi v16, v8, 16, v0.t ; 
RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t -; RV32-NEXT: addi a1, sp, 32 -; RV32-NEXT: vor.vv v16, v8, v16, v0.t +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: vnot.v v16, v8, v0.t +; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t +; RV32-NEXT: vand.vv v24, v8, v24, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a1), zero -; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vnot.v v16, v16, v0.t -; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v24, v0.t -; RV32-NEXT: addi a1, sp, 24 -; RV32-NEXT: vsub.vv v8, v16, v8, v0.t -; RV32-NEXT: addi a2, sp, 48 -; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v8, v24, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v24, v8, v24, v0.t +; RV32-NEXT: vsub.vv v24, v16, v24, v0.t +; RV32-NEXT: vand.vv v16, v24, v8, v0.t +; RV32-NEXT: vsrl.vi v24, v24, 2, v0.t +; RV32-NEXT: vand.vv v24, v24, v8, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a1), zero -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: mv a1, sp ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v24, v16, v24, v0.t +; RV32-NEXT: vadd.vv v16, v16, v24, v0.t +; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t +; RV32-NEXT: vadd.vv v16, v16, v24, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vlse64.v v24, (a1), zero +; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v24, 4, v0.t -; RV32-NEXT: vadd.vv v8, v24, v8, v0.t -; RV32-NEXT: li a0, 56 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v24, v0.t -; RV32-NEXT: vmul.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: .cfi_def_cfa sp, 48 -; RV32-NEXT: addi sp, sp, 48 +; RV32-NEXT: vand.vv v8, v16, v8, v0.t +; RV32-NEXT: vmul.vv v8, v8, v24, v0.t +; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; @@ -4761,7 +4572,8 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: sub sp, sp, a1 ; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 48 ; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill @@ -4793,7 +4605,6 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: li a1, 32 ; RV32-NEXT: addi a3, sp, 40 -; RV32-NEXT: addi a4, sp, 32 ; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; 
RV32-NEXT: vsrl.vi v16, v8, 2, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t @@ -4808,34 +4619,25 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a3), zero ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a5, 40 -; RV32-NEXT: mul a3, a3, a5 +; RV32-NEXT: li a4, 48 +; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vnot.v v16, v8, v0.t +; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 5 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a4), zero -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 48 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 24 +; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 @@ -4843,38 +4645,41 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 24 +; RV32-NEXT: li a4, 48 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 5 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vsub.vv v16, v16, v8, v0.t +; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 5 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: addi a3, sp, 32 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v8, (a3), zero ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 5 +; RV32-NEXT: li a4, 40 +; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 48 -; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: slli a3, a3, 5 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 24 -; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; 
RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill @@ -4886,61 +4691,37 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t ; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 24 -; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload ; RV32-NEXT: vadd.vv v8, v8, v16, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v16, v0.t ; RV32-NEXT: addi a3, sp, 24 -; RV32-NEXT: addi a4, sp, 16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a3), zero -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a5, 24 -; RV32-NEXT: mul a3, a3, a5 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vlse64.v v8, (a4), zero +; RV32-NEXT: vlse64.v v16, (a3), zero ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 5 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: addi a3, sp, 16 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a3), zero ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 3 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: addi a2, sp, 48 -; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vadd.vv v16, v8, v16, v0.t -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 24 -; RV32-NEXT: mul a2, a2, a3 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v16, v8, v0.t -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 5 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vmul.vv v8, v16, v8, v0.t +; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: li a2, 56 ; RV32-NEXT: vsrl.vx v8, v8, a2, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 +; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill @@ -4950,7 +4731,8 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: and a0, a0, a3 ; RV32-NEXT: vmv1r.v v0, v24 ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 +; RV32-NEXT: li a4, 24 +; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload @@ -4972,18 +4754,20 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: li a1, 24 +; 
RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 +; RV32-NEXT: li a1, 48 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload @@ -4996,41 +4780,25 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: li a1, 48 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 -; RV32-NEXT: mul a0, a0, a1 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v16, v8, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 -; RV32-NEXT: mul a0, a0, a1 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 48 +; RV32-NEXT: li a1, 40 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: li a1, 48 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload @@ -5038,21 +4806,20 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV32-NEXT: vadd.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 24 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload @@ -5256,80 +5023,63 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex ; RV32-NEXT: vsrl.vi v0, v16, 8 ; RV32-NEXT: vor.vv v16, v16, v0 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vnot.v v0, v8 +; RV32-NEXT: vnot.v v8, v8 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v16, 16 -; 
RV32-NEXT: vor.vv v16, v16, v8 +; RV32-NEXT: vsrl.vi v0, v16, 16 +; RV32-NEXT: vor.vv v16, v16, v0 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v0, 1 -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 3 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 48 -; RV32-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v24, v8, v24 +; RV32-NEXT: vsrl.vi v0, v8, 1 +; RV32-NEXT: vand.vv v0, v0, v24 +; RV32-NEXT: vsub.vv v0, v8, v0 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vx v8, v16, a2 +; RV32-NEXT: vor.vv v24, v16, v8 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a3), zero ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsub.vv v24, v0, v24 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vx v0, v16, a2 -; RV32-NEXT: vor.vv v16, v16, v0 -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v0, v24, v8 -; RV32-NEXT: vsrl.vi v24, v24, 2 -; RV32-NEXT: vand.vv v24, v24, v8 -; RV32-NEXT: vadd.vv v24, v0, v24 +; RV32-NEXT: vand.vv v16, v0, v8 +; RV32-NEXT: vsrl.vi v0, v0, 2 +; RV32-NEXT: vand.vv v0, v0, v8 +; RV32-NEXT: vadd.vv v16, v16, v0 ; RV32-NEXT: addi a2, sp, 48 -; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vnot.v v16, v16 -; RV32-NEXT: vsrl.vi v0, v16, 1 +; RV32-NEXT: vnot.v v24, v24 +; RV32-NEXT: vsrl.vi v0, v24, 1 ; RV32-NEXT: csrr a2, vlenb ; RV32-NEXT: slli a2, a2, 3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v0, v0, v24 +; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v0, v0, v16 ; RV32-NEXT: addi a2, sp, 24 ; RV32-NEXT: addi a3, sp, 16 -; RV32-NEXT: vsub.vv v0, v16, v0 +; RV32-NEXT: vsub.vv v24, v24, v0 ; RV32-NEXT: addi a4, sp, 48 -; RV32-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v24, 4 -; RV32-NEXT: vadd.vv v16, v24, v16 -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 3 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 48 -; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v0, v16, 4 +; RV32-NEXT: vadd.vv v16, v16, v0 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v0, v8 -; RV32-NEXT: vsrl.vi v0, v0, 2 -; RV32-NEXT: vand.vv v8, v0, v8 +; RV32-NEXT: vand.vv v0, v24, v8 +; RV32-NEXT: vsrl.vi v24, v24, 2 +; RV32-NEXT: vand.vv v8, v24, v8 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v0, (a2), zero +; RV32-NEXT: vlse64.v v24, (a2), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v8, v24, v8 +; RV32-NEXT: vadd.vv v8, v0, v8 +; RV32-NEXT: vsrl.vi v0, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v0 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v24, (a3), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v16 -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 3 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vlse64.v v0, (a3), zero ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v0 +; RV32-NEXT: vand.vv v16, v16, v24 ; 
RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v0 +; RV32-NEXT: vand.vv v8, v8, v24 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vmul.vv v16, v16, v24 +; RV32-NEXT: vmul.vv v16, v16, v0 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vmul.vv v24, v8, v24 +; RV32-NEXT: vmul.vv v24, v8, v0 ; RV32-NEXT: li a2, 56 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vsrl.vx v8, v16, a2 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll index a5a1061842427..d853d04093335 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll @@ -1119,70 +1119,55 @@ declare <15 x i64> @llvm.vp.ctpop.v15i64(<15 x i64>, <15 x i1>, i32) define <15 x i64> @vp_ctpop_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_ctpop_v15i64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -48 -; RV32-NEXT: .cfi_def_cfa_offset 48 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 8 * vlenb -; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 40(sp) -; RV32-NEXT: sw a1, 44(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: sw a1, 28(sp) ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 32(sp) -; RV32-NEXT: sw a1, 36(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: sw a1, 20(sp) ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: sw a1, 16(sp) -; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: addi a1, sp, 40 +; RV32-NEXT: sw a1, 0(sp) +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: addi a1, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a1), zero +; RV32-NEXT: addi a1, sp, 16 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t -; RV32-NEXT: addi a1, sp, 32 +; RV32-NEXT: vand.vv v24, v16, v24, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v24, v0.t -; RV32-NEXT: addi a1, sp, 24 -; RV32-NEXT: addi a2, sp, 48 -; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vsub.vv v8, v8, v24, v0.t ; RV32-NEXT: vand.vv v24, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t ; RV32-NEXT: vand.vv v16, v8, v16, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a1), zero -; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: mv a1, sp ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v24, v24, v16, v0.t +; RV32-NEXT: vadd.vv v16, v24, v16, v0.t +; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t +; RV32-NEXT: vadd.vv v16, v16, v24, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vlse64.v v24, (a1), zero +; 
RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v24, 4, v0.t -; RV32-NEXT: vadd.vv v8, v24, v8, v0.t -; RV32-NEXT: li a0, 56 -; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v24, v0.t -; RV32-NEXT: vmul.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 -; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: .cfi_def_cfa sp, 48 -; RV32-NEXT: addi sp, sp, 48 +; RV32-NEXT: vand.vv v8, v16, v8, v0.t +; RV32-NEXT: vmul.vv v8, v8, v24, v0.t +; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; @@ -1318,70 +1303,55 @@ declare <16 x i64> @llvm.vp.ctpop.v16i64(<16 x i64>, <16 x i1>, i32) define <16 x i64> @vp_ctpop_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_ctpop_v16i64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -48 -; RV32-NEXT: .cfi_def_cfa_offset 48 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 8 * vlenb -; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 40(sp) -; RV32-NEXT: sw a1, 44(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: sw a1, 28(sp) ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 32(sp) -; RV32-NEXT: sw a1, 36(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: sw a1, 20(sp) ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: sw a1, 16(sp) -; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: addi a1, sp, 40 +; RV32-NEXT: sw a1, 0(sp) +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: addi a1, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a1), zero +; RV32-NEXT: addi a1, sp, 16 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t -; RV32-NEXT: addi a1, sp, 32 +; RV32-NEXT: vand.vv v24, v16, v24, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v24, v0.t -; RV32-NEXT: addi a1, sp, 24 -; RV32-NEXT: addi a2, sp, 48 -; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vsub.vv v8, v8, v24, v0.t ; RV32-NEXT: vand.vv v24, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t ; RV32-NEXT: vand.vv v16, v8, v16, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a1), zero -; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: mv a1, sp ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v24, v24, v16, v0.t +; RV32-NEXT: vadd.vv v16, v24, v16, v0.t +; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t +; RV32-NEXT: vadd.vv v16, v16, v24, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vlse64.v v24, (a1), zero +; 
RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v24, 4, v0.t -; RV32-NEXT: vadd.vv v8, v24, v8, v0.t -; RV32-NEXT: li a0, 56 -; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v24, v0.t -; RV32-NEXT: vmul.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 -; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: .cfi_def_cfa sp, 48 -; RV32-NEXT: addi sp, sp, 48 +; RV32-NEXT: vand.vv v8, v16, v8, v0.t +; RV32-NEXT: vmul.vv v8, v8, v24, v0.t +; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; @@ -1525,7 +1495,8 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev ; RV32-NEXT: sub sp, sp, a1 ; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 48 * vlenb ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 48 ; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill @@ -1553,87 +1524,89 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev ; RV32-NEXT: # %bb.1: ; RV32-NEXT: li a1, 16 ; RV32-NEXT: .LBB34_2: -; RV32-NEXT: addi a2, sp, 40 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a2), zero +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: csrr a2, vlenb ; RV32-NEXT: li a3, 40 ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 ; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; RV32-NEXT: addi a2, sp, 32 +; RV32-NEXT: addi a2, sp, 40 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a2), zero ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 5 +; RV32-NEXT: slli a2, a2, 4 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 ; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t ; RV32-NEXT: csrr a2, vlenb ; RV32-NEXT: li a3, 40 ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vand.vv v16, v24, v16, v0.t ; RV32-NEXT: vsub.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a2, vlenb ; RV32-NEXT: slli a2, a2, 5 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v24, v8, v16, v0.t +; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; RV32-NEXT: addi a2, sp, 32 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a2), zero ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 24 +; RV32-NEXT: li a3, 40 ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 24 -; RV32-NEXT: mul a2, a2, a3 +; RV32-NEXT: slli a2, a2, 5 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vl8r.v 
v16, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vadd.vv v8, v16, v8, v0.t +; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a2, vlenb ; RV32-NEXT: slli a2, a2, 3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 ; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; RV32-NEXT: addi a2, sp, 24 -; RV32-NEXT: addi a3, sp, 16 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a2), zero -; RV32-NEXT: addi a2, sp, 48 -; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; RV32-NEXT: vlse64.v v8, (a3), zero ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 24 -; RV32-NEXT: mul a2, a2, a3 +; RV32-NEXT: slli a2, a2, 5 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a2, vlenb ; RV32-NEXT: slli a2, a2, 3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vadd.vv v8, v16, v8, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v16, v0.t +; RV32-NEXT: addi a2, sp, 24 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a2), zero +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 5 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 48 +; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v24, v0.t -; RV32-NEXT: vand.vv v16, v8, v16, v0.t -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 24 -; RV32-NEXT: mul a1, a1, a2 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vmul.vv v8, v16, v8, v0.t +; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: addi a2, sp, 16 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a2), zero +; RV32-NEXT: addi a2, sp, 48 +; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t ; RV32-NEXT: csrr a2, vlenb @@ -1647,32 +1620,39 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev ; RV32-NEXT: and a0, a0, a2 ; RV32-NEXT: vmv1r.v v0, v7 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 4 +; RV32-NEXT: li a3, 24 +; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v24, v16, 1, v0.t +; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v24, v16, v0.t +; RV32-NEXT: vsub.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: li a2, 40 ; RV32-NEXT: mul a0, a0, a2 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: 
vand.vv v8, v24, v8, v0.t -; RV32-NEXT: vsub.vv v8, v16, v8, v0.t +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a0, a0, a2 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v24, v8, v16, v0.t +; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vadd.vv v8, v24, v8, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t -; RV32-NEXT: addi a0, sp, 48 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a2, 40 +; RV32-NEXT: mul a0, a0, a2 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb @@ -1681,6 +1661,17 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vadd.vv v8, v16, v8, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v16, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: addi a0, sp, 48 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t ; RV32-NEXT: csrr a0, vlenb @@ -1792,12 +1783,9 @@ define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: addi sp, sp, -48 ; RV32-NEXT: .cfi_def_cfa_offset 48 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 24 -; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 24 * vlenb -; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV32-NEXT: vmv8r.v v24, v16 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: lui a2, 209715 ; RV32-NEXT: addi a1, a1, 1365 @@ -1821,107 +1809,76 @@ define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: li a1, 16 ; RV32-NEXT: .LBB35_2: ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v0, v8, 1 +; RV32-NEXT: vsrl.vi v24, v8, 1 ; RV32-NEXT: addi a2, sp, 40 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 3 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a2), zero +; RV32-NEXT: vlse64.v v8, (a2), zero ; RV32-NEXT: addi a2, a0, -16 ; RV32-NEXT: sltu a0, a0, a2 ; RV32-NEXT: addi a0, a0, -1 ; RV32-NEXT: and a0, a0, a2 -; RV32-NEXT: addi a2, sp, 32 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v0, v0, v16 +; RV32-NEXT: vand.vv v24, v24, v8 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v0, v16, 1 +; RV32-NEXT: vand.vv v0, v0, v8 +; RV32-NEXT: addi a2, sp, 32 ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 +; 
RV32-NEXT: slli a3, a3, 3 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v0, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsub.vv v8, v8, v24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v0, (a2), zero -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 3 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; RV32-NEXT: vmv8r.v v8, v24 +; RV32-NEXT: vlse64.v v24, (a2), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v24, v24, 1 -; RV32-NEXT: vand.vv v16, v24, v16 -; RV32-NEXT: addi a2, sp, 48 -; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 4 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 3 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vv v16, v16, v0 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsub.vv v16, v24, v16 +; RV32-NEXT: vand.vv v0, v8, v24 ; RV32-NEXT: addi a2, sp, 48 -; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vs8r.v v0, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v8, v8, 2 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsub.vv v8, v8, v24 -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v16, v0 +; RV32-NEXT: vand.vv v0, v16, v24 ; RV32-NEXT: csrr a2, vlenb ; RV32-NEXT: slli a2, a2, 3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v0, (a2) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vi v16, v16, 2 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v0 -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 4 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vi v8, v8, 2 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v0 +; RV32-NEXT: vand.vv v8, v8, v24 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v0, v8, v0 +; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: addi a2, sp, 24 -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; RV32-NEXT: addi a3, sp, 48 +; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v16, v8, v16 +; RV32-NEXT: vadd.vv v24, v24, v8 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a2), zero ; RV32-NEXT: addi a2, sp, 16 ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 +; RV32-NEXT: slli a3, a3, 3 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v0, (a3) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v24, v24, v0 +; RV32-NEXT: vadd.vv v16, v0, v16 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v0, v24, 4 +; RV32-NEXT: vadd.vv v24, v24, v0 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma 
; RV32-NEXT: vsrl.vi v0, v16, 4 ; RV32-NEXT: vadd.vv v16, v16, v0 -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v0, (a2), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v24, 4 -; RV32-NEXT: vadd.vv v16, v24, v16 -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 4 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vand.vv v24, v24, v8 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma @@ -1936,8 +1893,7 @@ define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsrl.vx v16, v24, a2 ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 24 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: .cfi_def_cfa sp, 48 ; RV32-NEXT: addi sp, sp, 48 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll index cd4b19f11d160..3896bb29de988 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll @@ -1263,91 +1263,59 @@ declare <15 x i64> @llvm.vp.cttz.v15i64(<15 x i64>, i1 immarg, <15 x i1>, i32) define <15 x i64> @vp_cttz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_v15i64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -48 -; RV32-NEXT: .cfi_def_cfa_offset 48 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 -; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsub.vx v16, v8, a1, v0.t ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 40(sp) -; RV32-NEXT: sw a1, 44(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: sw a1, 28(sp) ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 32(sp) -; RV32-NEXT: sw a1, 36(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: sw a1, 20(sp) ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: sw a1, 16(sp) -; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: addi a1, sp, 40 +; RV32-NEXT: sw a1, 0(sp) +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: addi a1, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a1, sp, 32 +; RV32-NEXT: addi a1, sp, 16 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: vand.vv v16, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t +; RV32-NEXT: vand.vv v24, v8, v24, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; 
RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vlse64.v v8, (a1), zero +; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v24, v0.t -; RV32-NEXT: addi a1, sp, 24 -; RV32-NEXT: vsub.vv v8, v16, v8, v0.t -; RV32-NEXT: addi a2, sp, 48 -; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v8, v24, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v24, v8, v24, v0.t +; RV32-NEXT: vsub.vv v24, v16, v24, v0.t +; RV32-NEXT: vand.vv v16, v24, v8, v0.t +; RV32-NEXT: vsrl.vi v24, v24, 2, v0.t +; RV32-NEXT: vand.vv v24, v24, v8, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a1), zero -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: mv a1, sp ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v24, v16, v24, v0.t +; RV32-NEXT: vadd.vv v16, v16, v24, v0.t +; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t +; RV32-NEXT: vadd.vv v16, v16, v24, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vlse64.v v24, (a1), zero +; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v24, 4, v0.t -; RV32-NEXT: vadd.vv v8, v24, v8, v0.t -; RV32-NEXT: li a0, 56 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v24, v0.t -; RV32-NEXT: vmul.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: .cfi_def_cfa sp, 48 -; RV32-NEXT: addi sp, sp, 48 +; RV32-NEXT: vand.vv v8, v16, v8, v0.t +; RV32-NEXT: vmul.vv v8, v8, v24, v0.t +; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; @@ -1495,91 +1463,59 @@ declare <16 x i64> @llvm.vp.cttz.v16i64(<16 x i64>, i1 immarg, <16 x i1>, i32) define <16 x i64> @vp_cttz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_v16i64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -48 -; RV32-NEXT: .cfi_def_cfa_offset 48 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 -; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsub.vx v16, v8, a1, v0.t ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 40(sp) -; RV32-NEXT: sw a1, 44(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: sw a1, 28(sp) ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 32(sp) -; RV32-NEXT: sw a1, 36(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: sw a1, 20(sp) ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: sw a1, 16(sp) -; 
RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: addi a1, sp, 40 +; RV32-NEXT: sw a1, 0(sp) +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: addi a1, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a1, sp, 32 +; RV32-NEXT: addi a1, sp, 16 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: vand.vv v16, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t +; RV32-NEXT: vand.vv v24, v8, v24, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vlse64.v v8, (a1), zero +; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v24, v0.t -; RV32-NEXT: addi a1, sp, 24 -; RV32-NEXT: vsub.vv v8, v16, v8, v0.t -; RV32-NEXT: addi a2, sp, 48 -; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v8, v24, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v24, v8, v24, v0.t +; RV32-NEXT: vsub.vv v24, v16, v24, v0.t +; RV32-NEXT: vand.vv v16, v24, v8, v0.t +; RV32-NEXT: vsrl.vi v24, v24, 2, v0.t +; RV32-NEXT: vand.vv v24, v24, v8, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a1), zero -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: mv a1, sp ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v24, v16, v24, v0.t +; RV32-NEXT: vadd.vv v16, v16, v24, v0.t +; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t +; RV32-NEXT: vadd.vv v16, v16, v24, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vlse64.v v24, (a1), zero +; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v24, 4, v0.t -; RV32-NEXT: vadd.vv v8, v24, v8, v0.t -; RV32-NEXT: li a0, 56 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v24, v0.t -; RV32-NEXT: vmul.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: .cfi_def_cfa sp, 48 -; RV32-NEXT: addi sp, sp, 48 +; RV32-NEXT: vand.vv v8, v16, v8, v0.t +; RV32-NEXT: vmul.vv v8, v8, v24, v0.t +; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; @@ -1730,17 +1666,17 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: addi sp, sp, -48 ; RV32-NEXT: .cfi_def_cfa_offset 48 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 48 +; RV32-NEXT: li a2, 56 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 48 * vlenb +; RV32-NEXT: .cfi_escape 
0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: slli a1, a1, 5 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 48 ; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v7, v0, 2 +; RV32-NEXT: vslidedown.vi v24, v0, 2 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: lui a2, 209715 ; RV32-NEXT: addi a1, a1, 1365 @@ -1770,7 +1706,7 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 24 +; RV32-NEXT: li a5, 48 ; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 48 @@ -1778,94 +1714,108 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a3), zero ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 40 +; RV32-NEXT: li a4, 24 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: addi a3, sp, 32 -; RV32-NEXT: vlse64.v v8, (a3), zero ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 5 +; RV32-NEXT: li a4, 48 +; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v16, v16, 1, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 24 +; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v24, v16, 1, v0.t +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v24, v24, v16, v0.t +; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 24 +; RV32-NEXT: li a4, 48 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vsub.vv v24, v16, v24, v0.t -; RV32-NEXT: vand.vv v16, v24, v8, v0.t +; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 24 +; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vi v16, v24, 2, v0.t +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: addi a3, sp, 32 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v8, (a3), zero +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: li a4, 48 +; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: li a4, 40 +; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vl8r.v 
v16, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 24 +; RV32-NEXT: slli a3, a3, 4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t +; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t +; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 +; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vadd.vv v16, v8, v16, v0.t +; RV32-NEXT: vsrl.vi v8, v16, 4, v0.t +; RV32-NEXT: vadd.vv v8, v16, v8, v0.t ; RV32-NEXT: addi a3, sp, 24 -; RV32-NEXT: addi a4, sp, 16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a3), zero -; RV32-NEXT: addi a3, sp, 48 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vlse64.v v8, (a4), zero ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 24 +; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: addi a3, sp, 16 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a3), zero ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 3 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v24, v0.t -; RV32-NEXT: vand.vv v16, v8, v16, v0.t -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 24 -; RV32-NEXT: mul a2, a2, a3 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vmul.vv v8, v16, v8, v0.t +; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: li a2, 56 ; RV32-NEXT: vsrl.vx v8, v8, a2, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 +; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill @@ -1873,71 +1823,88 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: sltu a0, a0, a3 ; RV32-NEXT: addi a0, a0, -1 ; RV32-NEXT: and a0, a0, a3 -; RV32-NEXT: vmv1r.v v0, v7 +; RV32-NEXT: vmv1r.v v0, v24 ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 +; RV32-NEXT: slli a3, a3, 5 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v8, v16, a1, v0.t -; RV32-NEXT: vnot.v v16, v16, v0.t -; RV32-NEXT: vand.vv v8, v16, v8, v0.t -; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t +; RV32-NEXT: vsub.vx v16, v8, a1, v0.t +; RV32-NEXT: vnot.v 
v8, v8, v0.t +; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: addi a0, sp, 48 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 +; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 24 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v24, v16, v0.t -; RV32-NEXT: vsub.vv v24, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v24, v8, v0.t +; RV32-NEXT: vand.vv v16, v8, v16, v0.t +; RV32-NEXT: addi a0, sp, 48 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 +; RV32-NEXT: li a1, 48 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vi v8, v24, 2, v0.t +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 48 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vadd.vv v8, v16, v8, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV32-NEXT: vadd.vv v8, v8, v16, v0.t -; RV32-NEXT: addi a0, sp, 48 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 40 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 24 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 48 +; RV32-NEXT: li a1, 56 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: .cfi_def_cfa sp, 48 @@ -2044,29 +2011,25 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_v32i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -48 -; RV32-NEXT: .cfi_def_cfa_offset 48 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 
0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 8 * vlenb +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: lui a2, 209715 ; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 40(sp) -; RV32-NEXT: sw a1, 44(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: sw a1, 28(sp) ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a2, a2, 819 -; RV32-NEXT: sw a2, 32(sp) -; RV32-NEXT: sw a2, 36(sp) +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: sw a2, 20(sp) ; RV32-NEXT: lui a2, 4112 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: li a3, 16 ; RV32-NEXT: addi a1, a2, 257 -; RV32-NEXT: sw a1, 16(sp) -; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: sw a1, 0(sp) +; RV32-NEXT: sw a1, 4(sp) ; RV32-NEXT: mv a1, a0 ; RV32-NEXT: bltu a0, a3, .LBB35_2 ; RV32-NEXT: # %bb.1: @@ -2075,19 +2038,20 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: li a2, 1 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vnot.v v0, v8 -; RV32-NEXT: addi a3, sp, 40 +; RV32-NEXT: addi a3, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a3), zero ; RV32-NEXT: addi a3, a0, -16 ; RV32-NEXT: sltu a0, a0, a3 ; RV32-NEXT: addi a0, a0, -1 ; RV32-NEXT: and a0, a0, a3 -; RV32-NEXT: addi a3, sp, 32 +; RV32-NEXT: addi a3, sp, 16 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vsub.vx v8, v8, a2 ; RV32-NEXT: vand.vv v8, v0, v8 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsub.vx v0, v16, a2 +; RV32-NEXT: addi a2, sp, 8 ; RV32-NEXT: vnot.v v16, v16 ; RV32-NEXT: vand.vv v16, v16, v0 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma @@ -2099,8 +2063,7 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: vand.vv v24, v8, v24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a3), zero -; RV32-NEXT: addi a2, sp, 24 -; RV32-NEXT: addi a3, sp, 16 +; RV32-NEXT: mv a3, sp ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsub.vv v16, v16, v24 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma @@ -2115,19 +2078,14 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vsrl.vi v16, v24, 4 ; RV32-NEXT: vadd.vv v16, v24, v16 -; RV32-NEXT: addi a4, sp, 48 -; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a2), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vadd.vv v8, v0, v8 +; RV32-NEXT: vsrl.vi v0, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v0 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v0, (a3), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v16 -; RV32-NEXT: addi a2, sp, 48 -; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma @@ -2141,11 +2099,7 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: vsrl.vx v8, v16, a2 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsrl.vx v16, v24, a2 -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 -; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: .cfi_def_cfa sp, 48 -; RV32-NEXT: addi 
sp, sp, 48 +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; @@ -3453,91 +3407,59 @@ define <8 x i64> @vp_cttz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext % define <15 x i64> @vp_cttz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_zero_undef_v15i64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -48 -; RV32-NEXT: .cfi_def_cfa_offset 48 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 -; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsub.vx v16, v8, a1, v0.t ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 40(sp) -; RV32-NEXT: sw a1, 44(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: sw a1, 28(sp) ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 32(sp) -; RV32-NEXT: sw a1, 36(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: sw a1, 20(sp) ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: sw a1, 16(sp) -; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: addi a1, sp, 40 +; RV32-NEXT: sw a1, 0(sp) +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: addi a1, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a1, sp, 32 +; RV32-NEXT: addi a1, sp, 16 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: vand.vv v16, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t +; RV32-NEXT: vand.vv v24, v8, v24, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vlse64.v v8, (a1), zero +; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v24, v0.t -; RV32-NEXT: addi a1, sp, 24 -; RV32-NEXT: vsub.vv v8, v16, v8, v0.t -; RV32-NEXT: addi a2, sp, 48 -; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v8, v24, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v24, v8, v24, v0.t +; RV32-NEXT: vsub.vv v24, v16, v24, v0.t +; RV32-NEXT: vand.vv v16, v24, v8, v0.t +; RV32-NEXT: vsrl.vi v24, v24, 2, v0.t +; RV32-NEXT: vand.vv v24, v24, v8, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a1), zero -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: mv a1, sp ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v24, v16, v24, v0.t +; RV32-NEXT: vadd.vv v16, v16, v24, v0.t +; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t +; RV32-NEXT: vadd.vv v16, v16, v24, 
v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vlse64.v v24, (a1), zero +; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v24, 4, v0.t -; RV32-NEXT: vadd.vv v8, v24, v8, v0.t -; RV32-NEXT: li a0, 56 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v24, v0.t -; RV32-NEXT: vmul.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: .cfi_def_cfa sp, 48 -; RV32-NEXT: addi sp, sp, 48 +; RV32-NEXT: vand.vv v8, v16, v8, v0.t +; RV32-NEXT: vmul.vv v8, v8, v24, v0.t +; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; @@ -3683,91 +3605,59 @@ define <15 x i64> @vp_cttz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex define <16 x i64> @vp_cttz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_zero_undef_v16i64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -48 -; RV32-NEXT: .cfi_def_cfa_offset 48 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 -; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsub.vx v16, v8, a1, v0.t ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 40(sp) -; RV32-NEXT: sw a1, 44(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: sw a1, 28(sp) ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 32(sp) -; RV32-NEXT: sw a1, 36(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: sw a1, 20(sp) ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: sw a1, 16(sp) -; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: addi a1, sp, 40 +; RV32-NEXT: sw a1, 0(sp) +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: addi a1, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a1, sp, 32 +; RV32-NEXT: addi a1, sp, 16 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: vand.vv v16, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t +; RV32-NEXT: vand.vv v24, v8, v24, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vlse64.v v8, (a1), zero +; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v24, v0.t -; RV32-NEXT: addi a1, sp, 24 -; RV32-NEXT: vsub.vv v8, v16, v8, v0.t -; RV32-NEXT: addi a2, sp, 48 -; 
RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v8, v24, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v24, v8, v24, v0.t +; RV32-NEXT: vsub.vv v24, v16, v24, v0.t +; RV32-NEXT: vand.vv v16, v24, v8, v0.t +; RV32-NEXT: vsrl.vi v24, v24, 2, v0.t +; RV32-NEXT: vand.vv v24, v24, v8, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a1), zero -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: mv a1, sp ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v24, v16, v24, v0.t +; RV32-NEXT: vadd.vv v16, v16, v24, v0.t +; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t +; RV32-NEXT: vadd.vv v16, v16, v24, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vlse64.v v24, (a1), zero +; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v24, 4, v0.t -; RV32-NEXT: vadd.vv v8, v24, v8, v0.t -; RV32-NEXT: li a0, 56 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v24, v0.t -; RV32-NEXT: vmul.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: .cfi_def_cfa sp, 48 -; RV32-NEXT: addi sp, sp, 48 +; RV32-NEXT: vand.vv v8, v16, v8, v0.t +; RV32-NEXT: vmul.vv v8, v8, v24, v0.t +; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; @@ -3916,17 +3806,17 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: addi sp, sp, -48 ; RV32-NEXT: .cfi_def_cfa_offset 48 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 48 +; RV32-NEXT: li a2, 56 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 48 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: slli a1, a1, 5 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 48 ; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v7, v0, 2 +; RV32-NEXT: vslidedown.vi v24, v0, 2 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: lui a2, 209715 ; RV32-NEXT: addi a1, a1, 1365 @@ -3956,7 +3846,7 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 24 +; RV32-NEXT: li a5, 48 ; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 48 @@ -3964,94 +3854,108 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a3), zero ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 40 +; RV32-NEXT: li a4, 24 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill 
-; RV32-NEXT: addi a3, sp, 32 -; RV32-NEXT: vlse64.v v8, (a3), zero ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 5 +; RV32-NEXT: li a4, 48 +; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v16, v16, 1, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 24 +; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v24, v16, 1, v0.t +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v24, v24, v16, v0.t +; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 24 +; RV32-NEXT: li a4, 48 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vsub.vv v24, v16, v24, v0.t -; RV32-NEXT: vand.vv v16, v24, v8, v0.t +; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 24 +; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vi v16, v24, 2, v0.t +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: addi a3, sp, 32 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v8, (a3), zero +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: li a4, 48 +; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: li a4, 40 +; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 24 +; RV32-NEXT: slli a3, a3, 4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t +; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t +; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 +; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vadd.vv v16, v8, v16, v0.t +; RV32-NEXT: vsrl.vi v8, v16, 4, v0.t +; RV32-NEXT: vadd.vv v8, v16, v8, v0.t ; RV32-NEXT: addi a3, sp, 24 -; RV32-NEXT: addi a4, sp, 16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a3), zero -; RV32-NEXT: addi a3, sp, 48 -; 
RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vlse64.v v8, (a4), zero ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 24 +; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: addi a3, sp, 16 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a3), zero ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 3 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v24, v0.t -; RV32-NEXT: vand.vv v16, v8, v16, v0.t -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 24 -; RV32-NEXT: mul a2, a2, a3 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vmul.vv v8, v16, v8, v0.t +; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: li a2, 56 ; RV32-NEXT: vsrl.vx v8, v8, a2, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 +; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill @@ -4059,71 +3963,88 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: sltu a0, a0, a3 ; RV32-NEXT: addi a0, a0, -1 ; RV32-NEXT: and a0, a0, a3 -; RV32-NEXT: vmv1r.v v0, v7 +; RV32-NEXT: vmv1r.v v0, v24 ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 +; RV32-NEXT: slli a3, a3, 5 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v8, v16, a1, v0.t -; RV32-NEXT: vnot.v v16, v16, v0.t -; RV32-NEXT: vand.vv v8, v16, v8, v0.t -; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t +; RV32-NEXT: vsub.vx v16, v8, a1, v0.t +; RV32-NEXT: vnot.v v8, v8, v0.t +; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: addi a0, sp, 48 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 +; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 24 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v24, v16, v0.t -; RV32-NEXT: vsub.vv v24, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v24, v8, v0.t +; RV32-NEXT: vand.vv v16, v8, v16, v0.t +; RV32-NEXT: addi a0, sp, 48 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 +; RV32-NEXT: li a1, 48 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; 
RV32-NEXT: vsrl.vi v8, v24, 2, v0.t +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 48 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vadd.vv v8, v16, v8, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV32-NEXT: vadd.vv v8, v8, v16, v0.t -; RV32-NEXT: addi a0, sp, 48 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 40 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 24 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 48 +; RV32-NEXT: li a1, 56 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: .cfi_def_cfa sp, 48 @@ -4230,29 +4151,25 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_zero_undef_v32i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -48 -; RV32-NEXT: .cfi_def_cfa_offset 48 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 8 * vlenb +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: lui a2, 209715 ; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 40(sp) -; RV32-NEXT: sw a1, 44(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: sw a1, 28(sp) ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a2, a2, 819 -; RV32-NEXT: sw a2, 32(sp) -; RV32-NEXT: sw a2, 36(sp) +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: sw a2, 20(sp) ; RV32-NEXT: lui a2, 4112 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: li a3, 16 ; RV32-NEXT: addi a1, a2, 257 -; RV32-NEXT: sw a1, 16(sp) -; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: sw a1, 0(sp) +; RV32-NEXT: sw a1, 4(sp) ; RV32-NEXT: mv a1, a0 ; RV32-NEXT: bltu a0, a3, .LBB71_2 ; RV32-NEXT: # %bb.1: @@ -4261,19 +4178,20 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex ; RV32-NEXT: li a2, 1 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vnot.v v0, v8 -; RV32-NEXT: addi a3, sp, 40 +; RV32-NEXT: addi a3, sp, 24 ; RV32-NEXT: vsetivli 
zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a3), zero ; RV32-NEXT: addi a3, a0, -16 ; RV32-NEXT: sltu a0, a0, a3 ; RV32-NEXT: addi a0, a0, -1 ; RV32-NEXT: and a0, a0, a3 -; RV32-NEXT: addi a3, sp, 32 +; RV32-NEXT: addi a3, sp, 16 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vsub.vx v8, v8, a2 ; RV32-NEXT: vand.vv v8, v0, v8 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsub.vx v0, v16, a2 +; RV32-NEXT: addi a2, sp, 8 ; RV32-NEXT: vnot.v v16, v16 ; RV32-NEXT: vand.vv v16, v16, v0 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma @@ -4285,8 +4203,7 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex ; RV32-NEXT: vand.vv v24, v8, v24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a3), zero -; RV32-NEXT: addi a2, sp, 24 -; RV32-NEXT: addi a3, sp, 16 +; RV32-NEXT: mv a3, sp ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsub.vv v16, v16, v24 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma @@ -4301,19 +4218,14 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vsrl.vi v16, v24, 4 ; RV32-NEXT: vadd.vv v16, v24, v16 -; RV32-NEXT: addi a4, sp, 48 -; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a2), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vadd.vv v8, v0, v8 +; RV32-NEXT: vsrl.vi v0, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v0 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v0, (a3), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v16 -; RV32-NEXT: addi a2, sp, 48 -; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma @@ -4327,11 +4239,7 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex ; RV32-NEXT: vsrl.vx v8, v16, a2 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsrl.vx v16, v24, a2 -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 -; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: .cfi_def_cfa sp, 48 -; RV32-NEXT: addi sp, sp, 48 +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll index 4f11e6c3c386a..95cc3ee860976 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll @@ -587,7 +587,7 @@ define <32 x double> @vfmax_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32 ; CHECK-NEXT: slli a1, a1, 5 ; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vmv1r.v v25, v0 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 4 @@ -601,29 +601,29 @@ define <32 x double> @vfmax_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: addi a1, a0, 128 +; CHECK-NEXT: vslidedown.vi v7, v0, 2 +; CHECK-NEXT: li a3, 16 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle64.v 
v16, (a1) ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: mv a1, a2 ; CHECK-NEXT: vle64.v v16, (a0) -; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: mv a0, a2 -; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vi v7, v0, 2 -; CHECK-NEXT: bltu a2, a1, .LBB24_2 +; CHECK-NEXT: bltu a2, a3, .LBB24_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a0, 16 +; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: .LBB24_2: ; CHECK-NEXT: vmv1r.v v0, v25 -; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: li a3, 24 -; CHECK-NEXT: mul a1, a1, a3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: mul a0, a0, a3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-NEXT: vmfeq.vv v26, v8, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v26 ; CHECK-NEXT: vmv8r.v v8, v16 @@ -680,10 +680,10 @@ define <32 x double> @vfmax_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32 ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vmfeq.vv v25, v8, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v25 -; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0 +; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0 ; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfmax.vv v16, v8, v16, v0.t +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmax.vv v16, v24, v8, v0.t ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: li a1, 24 ; CHECK-NEXT: mul a0, a0, a1 @@ -718,13 +718,13 @@ define <32 x double> @vfmax_vv_v32f64_unmasked(<32 x double> %va, <32 x double> ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: addi a1, a0, 128 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v16, (a1) -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vle64.v v24, (a0) +; CHECK-NEXT: vle64.v v16, (a1) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: mv a0, a2 ; CHECK-NEXT: bltu a2, a1, .LBB25_2 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll index 2e2103ad5e06d..e695819104bf6 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll @@ -587,7 +587,7 @@ define <32 x double> @vfmin_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32 ; CHECK-NEXT: slli a1, a1, 5 ; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vmv1r.v v25, v0 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 4 @@ -601,29 +601,29 @@ define <32 x double> @vfmin_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: addi a1, a0, 
128 +; CHECK-NEXT: vslidedown.vi v7, v0, 2 +; CHECK-NEXT: li a3, 16 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v16, (a1) ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: mv a1, a2 ; CHECK-NEXT: vle64.v v16, (a0) -; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: mv a0, a2 -; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vi v7, v0, 2 -; CHECK-NEXT: bltu a2, a1, .LBB24_2 +; CHECK-NEXT: bltu a2, a3, .LBB24_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a0, 16 +; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: .LBB24_2: ; CHECK-NEXT: vmv1r.v v0, v25 -; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: li a3, 24 -; CHECK-NEXT: mul a1, a1, a3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: mul a0, a0, a3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-NEXT: vmfeq.vv v26, v8, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v26 ; CHECK-NEXT: vmv8r.v v8, v16 @@ -680,10 +680,10 @@ define <32 x double> @vfmin_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32 ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vmfeq.vv v25, v8, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v25 -; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0 +; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0 ; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfmin.vv v16, v8, v16, v0.t +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmin.vv v16, v24, v8, v0.t ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: li a1, 24 ; CHECK-NEXT: mul a0, a0, a1 @@ -718,13 +718,13 @@ define <32 x double> @vfmin_vv_v32f64_unmasked(<32 x double> %va, <32 x double> ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: addi a1, a0, 128 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v16, (a1) -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vle64.v v24, (a0) +; CHECK-NEXT: vle64.v v16, (a1) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: mv a0, a2 ; CHECK-NEXT: bltu a2, a1, .LBB25_2 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index 0c7d7925edf39..4914d6393e530 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -203,67 +203,73 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vle32.v v8, (a4) -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a4, 68 -; RV32-NEXT: mul a1, a1, a4 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a5, a5, 3 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; 
RV32-NEXT: vle16.v v6, (a6) -; RV32-NEXT: vmv.s.x v0, a5 ; RV32-NEXT: lui a1, %hi(.LCPI8_1) ; RV32-NEXT: addi a1, a1, %lo(.LCPI8_1) -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmerge.vvm v16, v8, v16, v0 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vrgatherei16.vv v24, v16, v6 +; RV32-NEXT: vle32.v v24, (a4) ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 52 -; RV32-NEXT: mul a4, a4, a5 +; RV32-NEXT: li t1, 60 +; RV32-NEXT: mul a4, a4, t1 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vle32.v v16, (a3) +; RV32-NEXT: addi a5, a5, 3 +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vle16.v v8, (a6) +; RV32-NEXT: vmv.s.x v7, a7 ; RV32-NEXT: addi t0, t0, 12 -; RV32-NEXT: vmv.s.x v0, a7 -; RV32-NEXT: vmv.s.x v7, t0 -; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV32-NEXT: vle16.v v4, (a1) -; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v24, v16, 16 +; RV32-NEXT: vmv.s.x v0, a5 +; RV32-NEXT: vmv.s.x v3, t0 +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV32-NEXT: vmerge.vvm v16, v24, v16, v0 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vrgatherei16.vv v24, v16, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 60 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: li a4, 52 +; RV32-NEXT: mul a1, a1, a4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV32-NEXT: vle32.v v8, (a3) ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: li a3, 84 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; RV32-NEXT: vslidedown.vi v16, v8, 16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 68 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vmv1r.v v0, v7 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmerge.vvm v20, v24, v16, v0 +; RV32-NEXT: vmerge.vvm v8, v16, v8, v0 ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: li a3, 40 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vmv1r.v v0, v7 +; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vmv1r.v v0, v3 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 76 +; RV32-NEXT: li a3, 60 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 76 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmerge.vvm v24, v8, v16, v0 +; RV32-NEXT: vmerge.vvm v24, v16, v8, v0 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vrgatherei16.vv v8, v24, v4 ; RV32-NEXT: csrr a1, vlenb @@ -279,45 +285,44 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: slli a1, a1, 10 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vmv.s.x v0, a1 -; RV32-NEXT: vle16.v v14, (a4) -; RV32-NEXT: vmv.s.x v12, a3 +; RV32-NEXT: vle16.v v4, (a4) +; 
RV32-NEXT: vmv.s.x v7, a3 ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: li a3, 84 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vmv4r.v v8, v24 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 60 +; RV32-NEXT: li a3, 68 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; RV32-NEXT: vmerge.vvm v8, v24, v8, v0 +; RV32-NEXT: vmerge.vvm v8, v8, v24, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 24 +; RV32-NEXT: li a3, 28 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vmv1r.v v0, v12 +; RV32-NEXT: vmv1r.v v0, v7 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 68 +; RV32-NEXT: li a3, 76 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmerge.vvm v24, v24, v16, v0 +; RV32-NEXT: vmerge.vvm v24, v16, v24, v0 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vrgatherei16.vv v16, v24, v14 +; RV32-NEXT: vrgatherei16.vv v8, v24, v4 ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 5 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, 3 ; RV32-NEXT: lui a3, 786624 ; RV32-NEXT: lui a4, 12 @@ -330,20 +335,20 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi a4, a4, 12 ; RV32-NEXT: addi a5, a5, 768 ; RV32-NEXT: addi a7, a7, -1024 -; RV32-NEXT: vmv.s.x v1, a6 -; RV32-NEXT: vmv.s.x v12, t0 +; RV32-NEXT: vmv.s.x v7, a6 +; RV32-NEXT: vmv.s.x v1, t0 ; RV32-NEXT: vmv.s.x v0, a1 -; RV32-NEXT: vmv.s.x v3, a3 -; RV32-NEXT: vmv.s.x v2, a4 -; RV32-NEXT: vmv.s.x v13, a5 -; RV32-NEXT: vmv.s.x v14, a7 +; RV32-NEXT: vmv.s.x v8, a3 +; RV32-NEXT: vmv.s.x v6, a4 +; RV32-NEXT: vmv.s.x v3, a5 +; RV32-NEXT: vmv.s.x v2, a7 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 60 +; RV32-NEXT: li a3, 68 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vmv4r.v v8, v16 +; RV32-NEXT: vmv4r.v v24, v16 ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: li a3, 84 ; RV32-NEXT: mul a1, a1, a3 @@ -351,103 +356,108 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; RV32-NEXT: vmerge.vvm v20, v8, v16, v0 +; RV32-NEXT: vmerge.vvm v12, v24, v16, v0 ; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vmv1r.v v0, v3 +; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vmv1r.v v0, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 68 +; RV32-NEXT: li a3, 60 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: li a3, 76 ; RV32-NEXT: mul a1, a1, a3 
; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmerge.vvm v24, v16, v24, v0 +; RV32-NEXT: vmerge.vvm v16, v8, v16, v0 ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vmv1r.v v0, v2 +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vmv1r.v v0, v6 ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: li a3, 84 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmerge.vvm v24, v8, v24, v0 +; RV32-NEXT: vmerge.vvm v16, v24, v16, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 12 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vmv1r.v v0, v13 +; RV32-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vmv1r.v v0, v3 ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: li a3, 76 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmerge.vvm v24, v16, v24, v0 +; RV32-NEXT: vmerge.vvm v16, v8, v16, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vmv1r.v v0, v7 ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: li a3, 84 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 68 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmerge.vvm v4, v8, v24, v0 +; RV32-NEXT: vmerge.vvm v20, v24, v16, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 28 +; RV32-NEXT: li a3, 24 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v4, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vmv1r.v v0, v14 +; RV32-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vmv1r.v v0, v2 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 76 +; RV32-NEXT: li a3, 60 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 68 +; RV32-NEXT: li a3, 76 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmerge.vvm v16, v24, v16, v0 
+; RV32-NEXT: vmerge.vvm v8, v8, v16, v0 ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: li a2, 76 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vmv1r.v v0, v12 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vmv1r.v v0, v1 ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: li a2, 84 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmerge.vvm v8, v8, v16, v0 +; RV32-NEXT: vmerge.vvm v8, v24, v8, v0 ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: li a2, 68 ; RV32-NEXT: mul a1, a1, a2 @@ -485,17 +495,17 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi a2, a2, %lo(.LCPI8_3) ; RV32-NEXT: addi a1, a1, 5 ; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vle16.v v28, (a2) +; RV32-NEXT: vle16.v v24, (a2) ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v20, a1 +; RV32-NEXT: vmv.v.x v25, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 24 +; RV32-NEXT: li a2, 28 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV32-NEXT: vrgatherei16.vv v8, v12, v20 +; RV32-NEXT: vrgatherei16.vv v8, v12, v25 ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: li a2, 44 ; RV32-NEXT: mul a1, a1, a2 @@ -505,118 +515,109 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma ; RV32-NEXT: vmv.v.v v8, v16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 52 +; RV32-NEXT: li a2, 60 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV32-NEXT: vrgatherei16.vv v24, v12, v28 +; RV32-NEXT: vrgatherei16.vv v12, v8, v24 ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 5 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v24, v16 +; RV32-NEXT: vmv.v.v v12, v16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a2, 52 +; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, %hi(.LCPI8_4) ; RV32-NEXT: addi a1, a1, %lo(.LCPI8_4) ; RV32-NEXT: lui a2, %hi(.LCPI8_5) ; RV32-NEXT: addi a2, a2, %lo(.LCPI8_5) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV32-NEXT: vle16.v v12, (a1) +; RV32-NEXT: vle16.v v26, (a1) ; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vle16.v v28, (a2) +; RV32-NEXT: vle16.v v24, (a2) ; RV32-NEXT: lui a1, %hi(.LCPI8_6) ; RV32-NEXT: addi a1, a1, %lo(.LCPI8_6) ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vle16.v v30, (a1) +; RV32-NEXT: vle16.v v2, (a1) ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; RV32-NEXT: 
vrgatherei16.vv v16, v0, v12 +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v16, v8, v26 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 12 -; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v20, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV32-NEXT: vrgatherei16.vv v12, v20, v28 +; RV32-NEXT: vrgatherei16.vv v12, v8, v24 ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma ; RV32-NEXT: vmv.v.v v12, v16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vrgatherei16.vv v16, v0, v30 +; RV32-NEXT: vrgatherei16.vv v24, v16, v2 ; RV32-NEXT: lui a1, %hi(.LCPI8_7) ; RV32-NEXT: addi a1, a1, %lo(.LCPI8_7) ; RV32-NEXT: lui a2, %hi(.LCPI8_8) ; RV32-NEXT: addi a2, a2, %lo(.LCPI8_8) ; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vle16.v v20, (a1) +; RV32-NEXT: vle16.v v16, (a1) ; RV32-NEXT: lui a1, %hi(.LCPI8_9) ; RV32-NEXT: addi a1, a1, %lo(.LCPI8_9) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV32-NEXT: vle16.v v8, (a2) +; RV32-NEXT: vle16.v v18, (a2) ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV32-NEXT: vle16.v v10, (a1) +; RV32-NEXT: vle16.v v17, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 28 +; RV32-NEXT: li a2, 24 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v0, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v28, v0, v20 +; RV32-NEXT: vl4r.v v28, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v20, v28, v16 ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v28, v16 +; RV32-NEXT: vmv.v.v v20, v24 ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: li a2, 76 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vrgatherei16.vv v16, v0, v8 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 60 -; RV32-NEXT: mul a1, a1, a2 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vrgatherei16.vv v0, v24, v18 ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: li a2, 68 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl4r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV32-NEXT: vrgatherei16.vv v16, v4, v10 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 60 -; RV32-NEXT: mul a1, a1, a2 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v28, v24, v17 ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v16, v0 +; RV32-NEXT: vmv.v.v v28, v0 ; RV32-NEXT: addi a1, a0, 320 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vse32.v v16, (a1) -; RV32-NEXT: addi a1, a0, 256 ; RV32-NEXT: vse32.v v28, (a1) +; RV32-NEXT: addi a1, a0, 256 +; 
RV32-NEXT: vse32.v v20, (a1) ; RV32-NEXT: addi a1, a0, 192 ; RV32-NEXT: vse32.v v12, (a1) ; RV32-NEXT: addi a1, a0, 128 -; RV32-NEXT: vse32.v v24, (a1) -; RV32-NEXT: addi a1, a0, 64 ; RV32-NEXT: csrr a2, vlenb ; RV32-NEXT: li a3, 52 ; RV32-NEXT: mul a2, a2, a3 @@ -624,6 +625,14 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi a2, a2, 16 ; RV32-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a1) +; RV32-NEXT: addi a1, a0, 64 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: li a3, 60 +; RV32-NEXT: mul a2, a2, a3 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vse32.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: li a2, 84 ; RV32-NEXT: mul a1, a1, a2 @@ -645,162 +654,171 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: addi sp, sp, -16 ; RV64-NEXT: .cfi_def_cfa_offset 16 ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 88 +; RV64-NEXT: li a3, 96 ; RV64-NEXT: mul a2, a2, a3 ; RV64-NEXT: sub sp, sp, a2 -; RV64-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xd8, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 88 * vlenb -; RV64-NEXT: addi a3, a1, 128 -; RV64-NEXT: addi a6, a1, 256 -; RV64-NEXT: li a4, 128 -; RV64-NEXT: lui a2, 1 -; RV64-NEXT: lui a5, %hi(.LCPI8_0) -; RV64-NEXT: addi a5, a5, %lo(.LCPI8_0) +; RV64-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xe0, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 96 * vlenb +; RV64-NEXT: addi a2, a1, 256 +; RV64-NEXT: li a3, 128 ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV64-NEXT: vmv.v.i v16, 6 +; RV64-NEXT: lui a4, 16 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vle64.v v8, (a6) -; RV64-NEXT: lui a6, 16 -; RV64-NEXT: addi a6, a6, 7 +; RV64-NEXT: vle64.v v8, (a2) +; RV64-NEXT: vmv.s.x v0, a3 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 40 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs1r.v v0, (a2) # Unknown-size Folded Spill +; RV64-NEXT: addi a4, a4, 7 ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64-NEXT: vmv.v.x v17, a6 -; RV64-NEXT: addi a6, a2, 65 +; RV64-NEXT: vmv.v.x v17, a4 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV64-NEXT: vrgather.vi v24, v8, 4 -; RV64-NEXT: vrgather.vi v20, v8, 5 -; RV64-NEXT: csrr a7, vlenb -; RV64-NEXT: li t0, 68 -; RV64-NEXT: mul a7, a7, t0 -; RV64-NEXT: add a7, sp, a7 -; RV64-NEXT: addi a7, a7, 16 -; RV64-NEXT: vs4r.v v20, (a7) # Unknown-size Folded Spill -; RV64-NEXT: vrgatherei16.vv v20, v8, v16 -; RV64-NEXT: csrr a7, vlenb -; RV64-NEXT: li t0, 84 -; RV64-NEXT: mul a7, a7, t0 -; RV64-NEXT: add a7, sp, a7 -; RV64-NEXT: addi a7, a7, 16 -; RV64-NEXT: vs4r.v v20, (a7) # Unknown-size Folded Spill -; RV64-NEXT: vrgatherei16.vv v20, v8, v17 -; RV64-NEXT: csrr a7, vlenb -; RV64-NEXT: li t0, 72 -; RV64-NEXT: mul a7, a7, t0 -; RV64-NEXT: add a7, sp, a7 -; RV64-NEXT: addi a7, a7, 16 -; RV64-NEXT: vs4r.v v20, (a7) # Unknown-size Folded Spill +; RV64-NEXT: vrgather.vi v20, v8, 4 +; RV64-NEXT: vrgather.vi v24, v8, 5 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 92 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs4r.v v24, (a2) # Unknown-size Folded Spill +; RV64-NEXT: vrgatherei16.vv v24, v8, v16 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 80 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; 
RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs4r.v v24, (a2) # Unknown-size Folded Spill +; RV64-NEXT: vrgatherei16.vv v24, v8, v17 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 72 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs4r.v v24, (a2) # Unknown-size Folded Spill ; RV64-NEXT: vrgather.vi v16, v8, 2 -; RV64-NEXT: csrr a7, vlenb -; RV64-NEXT: slli a7, a7, 6 -; RV64-NEXT: add a7, sp, a7 -; RV64-NEXT: addi a7, a7, 16 -; RV64-NEXT: vs4r.v v16, (a7) # Unknown-size Folded Spill +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 88 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs4r.v v16, (a2) # Unknown-size Folded Spill ; RV64-NEXT: vrgather.vi v16, v8, 3 -; RV64-NEXT: csrr a7, vlenb -; RV64-NEXT: li t0, 56 -; RV64-NEXT: mul a7, a7, t0 -; RV64-NEXT: add a7, sp, a7 -; RV64-NEXT: addi a7, a7, 16 -; RV64-NEXT: vs4r.v v16, (a7) # Unknown-size Folded Spill +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 84 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs4r.v v16, (a2) # Unknown-size Folded Spill ; RV64-NEXT: vsetivli zero, 8, e64, m8, ta, ma -; RV64-NEXT: vslidedown.vi v16, v8, 8 -; RV64-NEXT: csrr a7, vlenb -; RV64-NEXT: li t0, 48 -; RV64-NEXT: mul a7, a7, t0 -; RV64-NEXT: add a7, sp, a7 -; RV64-NEXT: addi a7, a7, 16 -; RV64-NEXT: vs8r.v v16, (a7) # Unknown-size Folded Spill -; RV64-NEXT: vmv.s.x v21, a4 +; RV64-NEXT: vslidedown.vi v8, v8, 8 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 6 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64-NEXT: vrgather.vi v20, v8, 2, v0.t +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 76 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs4r.v v20, (a2) # Unknown-size Folded Spill +; RV64-NEXT: addi a3, a1, 128 +; RV64-NEXT: lui a2, 1 +; RV64-NEXT: lui a4, %hi(.LCPI8_0) +; RV64-NEXT: addi a4, a4, %lo(.LCPI8_0) +; RV64-NEXT: addi a5, a2, 65 +; RV64-NEXT: vmv.s.x v0, a5 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vle16.v v8, (a4) +; RV64-NEXT: csrr a4, vlenb +; RV64-NEXT: li a5, 24 +; RV64-NEXT: mul a4, a4, a5 +; RV64-NEXT: add a4, sp, a4 +; RV64-NEXT: addi a4, a4, 16 +; RV64-NEXT: vs2r.v v8, (a4) # Unknown-size Folded Spill ; RV64-NEXT: vle64.v v8, (a1) -; RV64-NEXT: vle64.v v0, (a3) -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 40 -; RV64-NEXT: mul a1, a1, a3 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs8r.v v0, (a1) # Unknown-size Folded Spill -; RV64-NEXT: vle16.v v2, (a5) -; RV64-NEXT: vmv.s.x v20, a6 -; RV64-NEXT: vmv1r.v v0, v21 -; RV64-NEXT: vmv1r.v v7, v21 -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vrgather.vi v24, v16, 2, v0.t ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 60 -; RV64-NEXT: mul a1, a1, a3 +; RV64-NEXT: li a4, 56 +; RV64-NEXT: mul a1, a1, a4 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill -; RV64-NEXT: vmv1r.v v0, v20 +; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vle64.v v16, (a3) ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 40 +; RV64-NEXT: li a3, 48 ; RV64-NEXT: mul a1, a1, a3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vsetivli 
zero, 16, e64, m8, ta, ma +; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vmerge.vvm v24, v16, v8, v0 -; RV64-NEXT: vmv8r.v v16, v8 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 76 +; RV64-NEXT: li a3, 24 ; RV64-NEXT: mul a1, a1, a3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; RV64-NEXT: vrgatherei16.vv v8, v24, v2 +; RV64-NEXT: vl2r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vrgatherei16.vv v0, v24, v16 ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: slli a1, a1, 5 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vs8r.v v0, (a1) # Unknown-size Folded Spill ; RV64-NEXT: lui a1, 2 ; RV64-NEXT: lui a3, %hi(.LCPI8_1) ; RV64-NEXT: addi a3, a3, %lo(.LCPI8_1) ; RV64-NEXT: addi a1, a1, 130 -; RV64-NEXT: vle16.v v8, (a3) +; RV64-NEXT: vle16.v v16, (a3) ; RV64-NEXT: csrr a3, vlenb ; RV64-NEXT: slli a3, a3, 4 ; RV64-NEXT: add a3, sp, a3 ; RV64-NEXT: addi a3, a3, 16 -; RV64-NEXT: vs2r.v v8, (a3) # Unknown-size Folded Spill -; RV64-NEXT: vmv.s.x v2, a1 -; RV64-NEXT: vmv1r.v v0, v7 -; RV64-NEXT: addi a1, sp, 16 -; RV64-NEXT: vs1r.v v7, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vs2r.v v16, (a3) # Unknown-size Folded Spill +; RV64-NEXT: vmv.s.x v24, a1 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 68 +; RV64-NEXT: li a3, 40 ; RV64-NEXT: mul a1, a1, a3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl4r.v v24, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 48 +; RV64-NEXT: li a3, 92 ; RV64-NEXT: mul a1, a1, a3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl4r.v v28, (a1) # Unknown-size Folded Reload +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 6 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vrgather.vi v24, v8, 3, v0.t +; RV64-NEXT: vrgather.vi v28, v16, 3, v0.t ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 68 +; RV64-NEXT: li a3, 92 ; RV64-NEXT: mul a1, a1, a3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill -; RV64-NEXT: vmv1r.v v0, v2 +; RV64-NEXT: vs4r.v v28, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vmv1r.v v0, v24 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 40 +; RV64-NEXT: li a3, 48 ; RV64-NEXT: mul a1, a1, a3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vmerge.vvm v24, v8, v16, v0 +; RV64-NEXT: vmerge.vvm v24, v16, v8, v0 ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: slli a1, a1, 4 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl2r.v v16, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vrgatherei16.vv v0, v24, v16 +; RV64-NEXT: vl2r.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vrgatherei16.vv v0, v24, v8 ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: li a3, 24 ; RV64-NEXT: mul a1, a1, a3 @@ -814,95 +832,93 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vmv.s.x v0, a1 ; RV64-NEXT: vmv.s.x v2, a3 ; RV64-NEXT: csrr a1, vlenb -; 
RV64-NEXT: li a3, 76 +; RV64-NEXT: li a3, 56 ; RV64-NEXT: mul a1, a1, a3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vmerge.vvm v24, v8, v16, v0 +; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vmerge.vvm v24, v16, v8, v0 ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: slli a1, a1, 3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; RV64-NEXT: addi a1, sp, 16 -; RV64-NEXT: vl1r.v v7, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vmv1r.v v0, v7 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 84 +; RV64-NEXT: li a3, 40 ; RV64-NEXT: mul a1, a1, a3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl4r.v v24, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vmv1r.v v7, v0 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 48 +; RV64-NEXT: li a3, 80 ; RV64-NEXT: mul a1, a1, a3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 6 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vmv4r.v v8, v24 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vrgather.vi v24, v16, 4, v0.t +; RV64-NEXT: vrgather.vi v12, v24, 4, v0.t ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 84 +; RV64-NEXT: li a3, 80 ; RV64-NEXT: mul a1, a1, a3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vmv1r.v v0, v2 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 76 +; RV64-NEXT: li a3, 56 ; RV64-NEXT: mul a1, a1, a3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vmerge.vvm v24, v8, v16, v0 +; RV64-NEXT: vmerge.vvm v24, v16, v24, v0 ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: slli a1, a1, 4 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; RV64-NEXT: vmv8r.v v16, v8 ; RV64-NEXT: vmv1r.v v0, v7 ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: li a3, 72 ; RV64-NEXT: mul a1, a1, a3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 48 -; RV64-NEXT: mul a1, a1, a3 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vmv4r.v v8, v24 +; RV64-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vrgather.vi v12, v24, 5, v0.t +; RV64-NEXT: vrgather.vi v4, v8, 5, v0.t ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: li a3, 72 ; RV64-NEXT: mul a1, a1, a3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vs4r.v v4, (a1) # Unknown-size Folded Spill ; RV64-NEXT: lui a1, 96 ; RV64-NEXT: li a3, 192 -; RV64-NEXT: vmv.s.x v3, a3 +; RV64-NEXT: vmv.s.x v1, a3 ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64-NEXT: vmv.v.x 
v12, a1 -; RV64-NEXT: vmv1r.v v0, v3 +; RV64-NEXT: vmv.v.x v3, a1 +; RV64-NEXT: vmv1r.v v0, v1 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 6 +; RV64-NEXT: li a3, 88 +; RV64-NEXT: mul a1, a1, a3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl4r.v v24, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vrgatherei16.vv v24, v8, v12, v0.t +; RV64-NEXT: vrgatherei16.vv v4, v8, v3, v0.t ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 6 +; RV64-NEXT: li a3, 88 +; RV64-NEXT: mul a1, a1, a3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vs4r.v v4, (a1) # Unknown-size Folded Spill ; RV64-NEXT: lui a1, %hi(.LCPI8_2) ; RV64-NEXT: addi a1, a1, %lo(.LCPI8_2) ; RV64-NEXT: li a3, 1040 @@ -910,34 +926,48 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: addi a4, a4, 1 ; RV64-NEXT: vmv.s.x v0, a3 ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64-NEXT: vmv.v.x v12, a4 +; RV64-NEXT: vmv.v.x v24, a4 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vle16.v v6, (a1) -; RV64-NEXT: vmv8r.v v24, v16 +; RV64-NEXT: vle16.v v2, (a1) ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 76 +; RV64-NEXT: li a3, 56 ; RV64-NEXT: mul a1, a1, a3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vmerge.vvm v16, v24, v16, v0 -; RV64-NEXT: addi a1, sp, 16 -; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV64-NEXT: vmv1r.v v0, v3 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 56 +; RV64-NEXT: li a3, 48 ; RV64-NEXT: mul a1, a1, a3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vmerge.vvm v8, v8, v16, v0 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a3, 40 +; RV64-NEXT: mul a1, a1, a3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vmv1r.v v0, v1 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a3, 84 +; RV64-NEXT: mul a1, a1, a3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 6 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vrgatherei16.vv v16, v8, v12, v0.t +; RV64-NEXT: vrgatherei16.vv v4, v8, v24, v0.t ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 56 +; RV64-NEXT: li a3, 84 ; RV64-NEXT: mul a1, a1, a3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vs4r.v v4, (a1) # Unknown-size Folded Spill ; RV64-NEXT: addi a1, a2, -2016 ; RV64-NEXT: vmv.s.x v0, a1 ; RV64-NEXT: csrr a1, vlenb @@ -946,82 +976,91 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vrgatherei16.vv v16, v8, v6 +; RV64-NEXT: vrgatherei16.vv v24, v8, v2 +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; RV64-NEXT: 
csrr a1, vlenb -; RV64-NEXT: li a2, 76 +; RV64-NEXT: li a2, 48 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vmerge.vvm v8, v24, v8, v0 +; RV64-NEXT: vmerge.vvm v8, v8, v16, v0 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 76 -; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: slli a1, a1, 6 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV64-NEXT: lui a1, %hi(.LCPI8_3) ; RV64-NEXT: addi a1, a1, %lo(.LCPI8_3) -; RV64-NEXT: vle16.v v24, (a1) +; RV64-NEXT: vle16.v v8, (a1) ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 5 +; RV64-NEXT: li a2, 76 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 60 -; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: slli a1, a1, 5 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 6, e64, m4, tu, ma -; RV64-NEXT: vmv.v.v v8, v0 +; RV64-NEXT: vmv.v.v v12, v24 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 60 +; RV64-NEXT: li a2, 76 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 68 +; RV64-NEXT: li a2, 92 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl4r.v v0, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: li a2, 24 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vmv.v.v v0, v8 +; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vmv.v.v v12, v24 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 84 +; RV64-NEXT: li a2, 92 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma -; RV64-NEXT: vmv.v.v v8, v16 +; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 84 +; RV64-NEXT: li a2, 80 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma +; RV64-NEXT: vmv.v.v v16, v24 ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: slli a1, a1, 4 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vrgatherei16.vv v16, v8, v24 +; RV64-NEXT: vrgatherei16.vv v0, v24, v8 ; RV64-NEXT: lui a1, %hi(.LCPI8_4) ; RV64-NEXT: addi a1, a1, %lo(.LCPI8_4) ; RV64-NEXT: vle16.v v8, (a1) ; RV64-NEXT: lui a1, %hi(.LCPI8_5) ; RV64-NEXT: addi a1, a1, %lo(.LCPI8_5) -; 
RV64-NEXT: vle16.v v6, (a1) +; RV64-NEXT: vle16.v v10, (a1) +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 80 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs2r.v v10, (a1) # Unknown-size Folded Spill ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: li a2, 72 ; RV64-NEXT: mul a1, a1, a2 @@ -1029,60 +1068,70 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma -; RV64-NEXT: vmv.v.v v12, v16 -; RV64-NEXT: addi a1, sp, 16 -; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vmv.v.v v12, v0 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 40 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vrgatherei16.vv v24, v16, v8 +; RV64-NEXT: vrgatherei16.vv v24, v0, v8 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 6 +; RV64-NEXT: li a2, 88 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma ; RV64-NEXT: vmv.v.v v8, v24 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 76 +; RV64-NEXT: slli a1, a1, 6 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 80 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl2r.v v20, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vrgatherei16.vv v24, v16, v6 +; RV64-NEXT: vrgatherei16.vv v24, v0, v20 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 56 +; RV64-NEXT: li a2, 84 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl4r.v v28, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma -; RV64-NEXT: vmv.v.v v16, v24 +; RV64-NEXT: vmv.v.v v28, v24 ; RV64-NEXT: addi a1, a0, 256 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV64-NEXT: vse64.v v8, (a1) ; RV64-NEXT: addi a1, a0, 320 -; RV64-NEXT: vse64.v v16, (a1) +; RV64-NEXT: vse64.v v28, (a1) ; RV64-NEXT: addi a1, a0, 192 ; RV64-NEXT: vse64.v v12, (a1) ; RV64-NEXT: addi a1, a0, 128 +; RV64-NEXT: vse64.v v16, (a1) +; RV64-NEXT: addi a1, a0, 64 ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 84 +; RV64-NEXT: li a3, 92 ; RV64-NEXT: mul a2, a2, a3 ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: addi a2, a2, 16 ; RV64-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload ; RV64-NEXT: vse64.v v8, (a1) -; RV64-NEXT: addi a1, a0, 64 -; RV64-NEXT: vse64.v v0, (a1) ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 60 +; RV64-NEXT: li a2, 76 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vse64.v v8, (a0) ; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: li a1, 88 +; RV64-NEXT: li a1, 96 ; RV64-NEXT: mul a0, a0, a1 ; RV64-NEXT: add sp, sp, a0 ; RV64-NEXT: .cfi_def_cfa sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll 
index 2b279389253b0..1cab7370e55ba 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll @@ -1344,17 +1344,17 @@ define float @vreduce_fmin_v128f32(ptr %x) { ; CHECK-LABEL: vreduce_fmin_v128f32: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: addi a2, a0, 384 +; CHECK-NEXT: addi a2, a0, 256 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; CHECK-NEXT: vle32.v v8, (a2) -; CHECK-NEXT: addi a1, a0, 256 +; CHECK-NEXT: addi a1, a0, 384 ; CHECK-NEXT: vle32.v v16, (a0) ; CHECK-NEXT: addi a0, a0, 128 -; CHECK-NEXT: vle32.v v24, (a0) -; CHECK-NEXT: vle32.v v0, (a1) -; CHECK-NEXT: vfmin.vv v8, v24, v8 -; CHECK-NEXT: vfmin.vv v16, v16, v0 +; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vle32.v v0, (a0) +; CHECK-NEXT: vfmin.vv v24, v0, v24 ; CHECK-NEXT: vfmin.vv v8, v16, v8 +; CHECK-NEXT: vfmin.vv v8, v8, v24 ; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -1591,17 +1591,17 @@ define float @vreduce_fmax_v128f32(ptr %x) { ; CHECK-LABEL: vreduce_fmax_v128f32: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: addi a2, a0, 384 +; CHECK-NEXT: addi a2, a0, 256 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; CHECK-NEXT: vle32.v v8, (a2) -; CHECK-NEXT: addi a1, a0, 256 +; CHECK-NEXT: addi a1, a0, 384 ; CHECK-NEXT: vle32.v v16, (a0) ; CHECK-NEXT: addi a0, a0, 128 -; CHECK-NEXT: vle32.v v24, (a0) -; CHECK-NEXT: vle32.v v0, (a1) -; CHECK-NEXT: vfmax.vv v8, v24, v8 -; CHECK-NEXT: vfmax.vv v16, v16, v0 +; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vle32.v v0, (a0) +; CHECK-NEXT: vfmax.vv v24, v0, v24 ; CHECK-NEXT: vfmax.vv v8, v16, v8 +; CHECK-NEXT: vfmax.vv v8, v8, v24 ; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -1997,57 +1997,64 @@ define float @vreduce_fminimum_v128f32(ptr %x) { ; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: addi a2, a0, 128 +; CHECK-NEXT: addi a2, a0, 384 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v8, (a2) -; CHECK-NEXT: addi a1, a0, 384 -; CHECK-NEXT: vle32.v v16, (a1) +; CHECK-NEXT: vle32.v v16, (a2) +; CHECK-NEXT: addi a1, a0, 128 +; CHECK-NEXT: vle32.v v24, (a1) ; CHECK-NEXT: addi a1, a0, 256 -; CHECK-NEXT: vle32.v v24, (a0) +; CHECK-NEXT: vmfeq.vv v0, v24, v24 +; CHECK-NEXT: vmfeq.vv v7, v16, v16 +; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0 +; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmin.vv v16, v16, v0 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v7, v16, v16 -; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0 +; CHECK-NEXT: vmfeq.vv v7, v24, v24 +; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; 
CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vmerge.vvm v16, v16, v8, v0 +; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfmin.vv v8, v16, v8 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmin.vv v24, v8, v24 +; CHECK-NEXT: vmfeq.vv v0, v24, v24 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmfeq.vv v7, v16, v16 +; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v7, v24, v24 -; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0 ; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0 -; CHECK-NEXT: vfmin.vv v16, v8, v16 -; CHECK-NEXT: vmfeq.vv v0, v16, v16 +; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmfeq.vv v7, v24, v24 -; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vfmin.vv v8, v8, v16 ; CHECK-NEXT: vmfne.vv v16, v8, v8 @@ -2077,17 +2084,17 @@ define float @vreduce_fminimum_v128f32_nonans(ptr %x) { ; CHECK-LABEL: vreduce_fminimum_v128f32_nonans: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: addi a2, a0, 384 +; CHECK-NEXT: addi a2, a0, 256 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; CHECK-NEXT: vle32.v v8, (a2) -; CHECK-NEXT: addi a1, a0, 256 +; CHECK-NEXT: addi a1, a0, 384 ; CHECK-NEXT: vle32.v v16, (a0) ; CHECK-NEXT: addi a0, a0, 128 -; CHECK-NEXT: vle32.v v24, (a0) -; CHECK-NEXT: vle32.v v0, (a1) -; CHECK-NEXT: vfmin.vv v8, v24, v8 -; CHECK-NEXT: vfmin.vv v16, v16, v0 +; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vle32.v v0, (a0) +; CHECK-NEXT: vfmin.vv v24, v0, v24 ; CHECK-NEXT: vfmin.vv v8, v16, v8 +; CHECK-NEXT: vfmin.vv v8, v8, v24 ; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -2314,57 +2321,64 @@ define double @vreduce_fminimum_v64f64(ptr %x) { ; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; CHECK-NEXT: addi a1, a0, 128 -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v8, (a1) ; CHECK-NEXT: addi a1, a0, 384 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v16, (a1) +; CHECK-NEXT: addi a1, a0, 128 +; CHECK-NEXT: vle64.v v24, (a1) ; CHECK-NEXT: addi a1, a0, 256 +; CHECK-NEXT: vmfeq.vv v0, v24, v24 +; CHECK-NEXT: vmfeq.vv v7, v16, v16 +; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: 
addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0 ; CHECK-NEXT: vle64.v v24, (a0) +; CHECK-NEXT: vle64.v v8, (a1) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmin.vv v16, v16, v0 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v7, v16, v16 -; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vmfeq.vv v0, v24, v24 +; CHECK-NEXT: vmfeq.vv v7, v8, v8 +; CHECK-NEXT: vmerge.vvm v16, v24, v8, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vle64.v v24, (a1) +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vmerge.vvm v16, v16, v8, v0 +; CHECK-NEXT: vmerge.vvm v8, v8, v24, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfmin.vv v8, v16, v8 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmin.vv v24, v8, v24 +; CHECK-NEXT: vmfeq.vv v0, v24, v24 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmfeq.vv v7, v16, v16 +; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v7, v24, v24 -; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0 ; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0 -; CHECK-NEXT: vfmin.vv v16, v8, v16 -; CHECK-NEXT: vmfeq.vv v0, v16, v16 +; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmfeq.vv v7, v24, v24 -; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vfmin.vv v8, v8, v16 ; CHECK-NEXT: vmfne.vv v16, v8, v8 @@ -2395,15 +2409,15 @@ define double @vreduce_fminimum_v64f64_nonans(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: addi a1, a0, 384 -; CHECK-NEXT: vle64.v v16, (a1) ; CHECK-NEXT: addi a1, a0, 256 +; CHECK-NEXT: vle64.v v16, (a1) +; CHECK-NEXT: addi a1, a0, 384 ; CHECK-NEXT: addi a0, a0, 128 -; CHECK-NEXT: vle64.v v24, (a0) -; CHECK-NEXT: vle64.v v0, (a1) -; CHECK-NEXT: vfmin.vv v16, v24, v16 -; CHECK-NEXT: vfmin.vv v8, v8, v0 +; CHECK-NEXT: vle64.v v24, (a1) +; CHECK-NEXT: vle64.v v0, (a0) +; CHECK-NEXT: vfmin.vv v24, v0, v24 ; CHECK-NEXT: vfmin.vv v8, v8, v16 +; CHECK-NEXT: vfmin.vv v8, v8, v24 ; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; 
CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -2711,57 +2725,64 @@ define float @vreduce_fmaximum_v128f32(ptr %x) { ; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: addi a2, a0, 128 +; CHECK-NEXT: addi a2, a0, 384 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v8, (a2) -; CHECK-NEXT: addi a1, a0, 384 -; CHECK-NEXT: vle32.v v16, (a1) +; CHECK-NEXT: vle32.v v16, (a2) +; CHECK-NEXT: addi a1, a0, 128 +; CHECK-NEXT: vle32.v v24, (a1) ; CHECK-NEXT: addi a1, a0, 256 -; CHECK-NEXT: vle32.v v24, (a0) +; CHECK-NEXT: vmfeq.vv v0, v24, v24 +; CHECK-NEXT: vmfeq.vv v7, v16, v16 +; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0 +; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmax.vv v16, v16, v0 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v7, v16, v16 -; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0 +; CHECK-NEXT: vmfeq.vv v7, v24, v24 +; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vmerge.vvm v16, v16, v8, v0 +; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfmax.vv v8, v16, v8 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmax.vv v24, v8, v24 +; CHECK-NEXT: vmfeq.vv v0, v24, v24 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmfeq.vv v7, v16, v16 +; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v7, v24, v24 -; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0 ; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0 -; CHECK-NEXT: vfmax.vv v16, v8, v16 -; CHECK-NEXT: vmfeq.vv v0, v16, v16 +; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmfeq.vv v7, v24, v24 -; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: 
vmerge.vvm v8, v24, v16, v0 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vfmax.vv v8, v8, v16 ; CHECK-NEXT: vmfne.vv v16, v8, v8 @@ -2791,17 +2812,17 @@ define float @vreduce_fmaximum_v128f32_nonans(ptr %x) { ; CHECK-LABEL: vreduce_fmaximum_v128f32_nonans: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: addi a2, a0, 384 +; CHECK-NEXT: addi a2, a0, 256 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; CHECK-NEXT: vle32.v v8, (a2) -; CHECK-NEXT: addi a1, a0, 256 +; CHECK-NEXT: addi a1, a0, 384 ; CHECK-NEXT: vle32.v v16, (a0) ; CHECK-NEXT: addi a0, a0, 128 -; CHECK-NEXT: vle32.v v24, (a0) -; CHECK-NEXT: vle32.v v0, (a1) -; CHECK-NEXT: vfmax.vv v8, v24, v8 -; CHECK-NEXT: vfmax.vv v16, v16, v0 +; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vle32.v v0, (a0) +; CHECK-NEXT: vfmax.vv v24, v0, v24 ; CHECK-NEXT: vfmax.vv v8, v16, v8 +; CHECK-NEXT: vfmax.vv v8, v8, v24 ; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -3028,57 +3049,64 @@ define double @vreduce_fmaximum_v64f64(ptr %x) { ; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; CHECK-NEXT: addi a1, a0, 128 -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v8, (a1) ; CHECK-NEXT: addi a1, a0, 384 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v16, (a1) +; CHECK-NEXT: addi a1, a0, 128 +; CHECK-NEXT: vle64.v v24, (a1) ; CHECK-NEXT: addi a1, a0, 256 +; CHECK-NEXT: vmfeq.vv v0, v24, v24 +; CHECK-NEXT: vmfeq.vv v7, v16, v16 +; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0 ; CHECK-NEXT: vle64.v v24, (a0) +; CHECK-NEXT: vle64.v v8, (a1) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmax.vv v16, v16, v0 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v7, v16, v16 -; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vmfeq.vv v0, v24, v24 +; CHECK-NEXT: vmfeq.vv v7, v8, v8 +; CHECK-NEXT: vmerge.vvm v16, v24, v8, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vle64.v v24, (a1) +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vmerge.vvm v16, v16, v8, v0 +; CHECK-NEXT: vmerge.vvm v8, v8, v24, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfmax.vv v8, v16, v8 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmax.vv v24, v8, v24 +; CHECK-NEXT: vmfeq.vv v0, v24, v24 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmfeq.vv v7, v16, v16 +; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 ; CHECK-NEXT: csrr a0, vlenb ; 
CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v7, v24, v24 -; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0 ; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0 -; CHECK-NEXT: vfmax.vv v16, v8, v16 -; CHECK-NEXT: vmfeq.vv v0, v16, v16 +; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmfeq.vv v7, v24, v24 -; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vfmax.vv v8, v8, v16 ; CHECK-NEXT: vmfne.vv v16, v8, v8 @@ -3109,15 +3137,15 @@ define double @vreduce_fmaximum_v64f64_nonans(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: addi a1, a0, 384 -; CHECK-NEXT: vle64.v v16, (a1) ; CHECK-NEXT: addi a1, a0, 256 +; CHECK-NEXT: vle64.v v16, (a1) +; CHECK-NEXT: addi a1, a0, 384 ; CHECK-NEXT: addi a0, a0, 128 -; CHECK-NEXT: vle64.v v24, (a0) -; CHECK-NEXT: vle64.v v0, (a1) -; CHECK-NEXT: vfmax.vv v16, v24, v16 -; CHECK-NEXT: vfmax.vv v8, v8, v0 +; CHECK-NEXT: vle64.v v24, (a1) +; CHECK-NEXT: vle64.v v0, (a0) +; CHECK-NEXT: vfmax.vv v24, v0, v24 ; CHECK-NEXT: vfmax.vv v8, v8, v16 +; CHECK-NEXT: vfmax.vv v8, v8, v24 ; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll index 707d1202aca0f..84dda70d042a4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll @@ -1471,14 +1471,14 @@ declare i64 @llvm.vector.reduce.add.v64i64(<64 x i64>) define i64 @vreduce_add_v64i64(ptr %x) nounwind { ; RV32-LABEL: vreduce_add_v64i64: ; RV32: # %bb.0: -; RV32-NEXT: addi a1, a0, 384 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vle64.v v24, (a1) -; RV32-NEXT: addi a1, a0, 128 -; RV32-NEXT: vle64.v v0, (a1) ; RV32-NEXT: vle64.v v8, (a0) -; RV32-NEXT: addi a0, a0, 256 -; RV32-NEXT: vle64.v v16, (a0) +; RV32-NEXT: addi a1, a0, 256 +; RV32-NEXT: vle64.v v16, (a1) +; RV32-NEXT: addi a1, a0, 384 +; RV32-NEXT: addi a0, a0, 128 +; RV32-NEXT: vle64.v v24, (a1) +; RV32-NEXT: vle64.v v0, (a0) ; RV32-NEXT: vadd.vv v24, v0, v24 ; RV32-NEXT: vmv.s.x v7, zero ; RV32-NEXT: li a1, 32 @@ -1495,15 +1495,15 @@ define i64 @vreduce_add_v64i64(ptr %x) nounwind { ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vle64.v v8, (a0) -; RV64-NEXT: addi a1, a0, 384 -; RV64-NEXT: vle64.v v16, (a1) ; RV64-NEXT: addi a1, a0, 256 +; RV64-NEXT: vle64.v v16, (a1) +; RV64-NEXT: addi a1, a0, 384 ; RV64-NEXT: addi a0, a0, 128 -; RV64-NEXT: vle64.v v24, (a0) -; RV64-NEXT: vle64.v v0, (a1) -; RV64-NEXT: vadd.vv v16, v24, v16 -; RV64-NEXT: vadd.vv v8, v8, v0 +; RV64-NEXT: vle64.v v24, (a1) +; RV64-NEXT: vle64.v v0, (a0) +; RV64-NEXT: vadd.vv v24, v0, v24 ; RV64-NEXT: vadd.vv v8, v8, v16 +; RV64-NEXT: vadd.vv v8, v8, v24 ; RV64-NEXT: vmv.s.x 
v16, zero ; RV64-NEXT: vredsum.vs v8, v8, v16 ; RV64-NEXT: vmv.x.s a0, v8 @@ -1550,15 +1550,15 @@ define i64 @vwreduce_add_v64i64(ptr %x) { ; RV64-NEXT: addi a1, a0, 128 ; RV64-NEXT: li a2, 32 ; RV64-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: vle32.v v16, (a1) +; RV64-NEXT: vle32.v v8, (a1) +; RV64-NEXT: vle32.v v16, (a0) ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV64-NEXT: vslidedown.vi v24, v8, 16 +; RV64-NEXT: vslidedown.vi v24, v16, 16 ; RV64-NEXT: addi a0, sp, 16 ; RV64-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; RV64-NEXT: vslidedown.vi v0, v16, 16 +; RV64-NEXT: vslidedown.vi v0, v8, 16 ; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV64-NEXT: vwadd.vv v24, v8, v16 +; RV64-NEXT: vwadd.vv v24, v16, v8 ; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV64-NEXT: vwadd.vv v8, v16, v0 ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma @@ -1616,15 +1616,15 @@ define i64 @vwreduce_uadd_v64i64(ptr %x) { ; RV64-NEXT: addi a1, a0, 128 ; RV64-NEXT: li a2, 32 ; RV64-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: vle32.v v16, (a1) +; RV64-NEXT: vle32.v v8, (a1) +; RV64-NEXT: vle32.v v16, (a0) ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV64-NEXT: vslidedown.vi v24, v8, 16 +; RV64-NEXT: vslidedown.vi v24, v16, 16 ; RV64-NEXT: addi a0, sp, 16 ; RV64-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; RV64-NEXT: vslidedown.vi v0, v16, 16 +; RV64-NEXT: vslidedown.vi v0, v8, 16 ; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV64-NEXT: vwaddu.vv v24, v8, v16 +; RV64-NEXT: vwaddu.vv v24, v16, v8 ; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV64-NEXT: vwaddu.vv v8, v16, v0 ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma @@ -2201,16 +2201,16 @@ define i64 @vreduce_and_v64i64(ptr %x) nounwind { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vle64.v v8, (a0) -; RV32-NEXT: addi a1, a0, 384 -; RV32-NEXT: vle64.v v16, (a1) ; RV32-NEXT: addi a1, a0, 256 +; RV32-NEXT: vle64.v v16, (a1) +; RV32-NEXT: addi a1, a0, 384 ; RV32-NEXT: addi a0, a0, 128 -; RV32-NEXT: vle64.v v0, (a0) ; RV32-NEXT: vle64.v v24, (a1) ; RV32-NEXT: li a1, 32 -; RV32-NEXT: vand.vv v16, v0, v16 -; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vle64.v v0, (a0) +; RV32-NEXT: vand.vv v24, v0, v24 ; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vand.vv v8, v8, v24 ; RV32-NEXT: vredand.vs v8, v8, v8 ; RV32-NEXT: vmv.x.s a0, v8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma @@ -2222,15 +2222,15 @@ define i64 @vreduce_and_v64i64(ptr %x) nounwind { ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vle64.v v8, (a0) -; RV64-NEXT: addi a1, a0, 384 -; RV64-NEXT: vle64.v v16, (a1) ; RV64-NEXT: addi a1, a0, 256 +; RV64-NEXT: vle64.v v16, (a1) +; RV64-NEXT: addi a1, a0, 384 ; RV64-NEXT: addi a0, a0, 128 -; RV64-NEXT: vle64.v v24, (a0) -; RV64-NEXT: vle64.v v0, (a1) -; RV64-NEXT: vand.vv v16, v24, v16 -; RV64-NEXT: vand.vv v8, v8, v0 +; RV64-NEXT: vle64.v v24, (a1) +; RV64-NEXT: vle64.v v0, (a0) +; RV64-NEXT: vand.vv v24, v0, v24 ; RV64-NEXT: vand.vv v8, v8, v16 +; RV64-NEXT: vand.vv v8, v8, v24 ; RV64-NEXT: vredand.vs v8, v8, v8 ; RV64-NEXT: vmv.x.s a0, v8 ; RV64-NEXT: ret @@ -2793,16 +2793,16 @@ define i64 @vreduce_or_v64i64(ptr %x) nounwind { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vle64.v v8, (a0) -; RV32-NEXT: addi a1, a0, 384 -; RV32-NEXT: vle64.v v16, (a1) ; RV32-NEXT: addi a1, a0, 256 +; RV32-NEXT: vle64.v v16, (a1) 
+; RV32-NEXT: addi a1, a0, 384 ; RV32-NEXT: addi a0, a0, 128 -; RV32-NEXT: vle64.v v0, (a0) ; RV32-NEXT: vle64.v v24, (a1) ; RV32-NEXT: li a1, 32 -; RV32-NEXT: vor.vv v16, v0, v16 -; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: vle64.v v0, (a0) +; RV32-NEXT: vor.vv v24, v0, v24 ; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vor.vv v8, v8, v24 ; RV32-NEXT: vredor.vs v8, v8, v8 ; RV32-NEXT: vmv.x.s a0, v8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma @@ -2814,15 +2814,15 @@ define i64 @vreduce_or_v64i64(ptr %x) nounwind { ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vle64.v v8, (a0) -; RV64-NEXT: addi a1, a0, 384 -; RV64-NEXT: vle64.v v16, (a1) ; RV64-NEXT: addi a1, a0, 256 +; RV64-NEXT: vle64.v v16, (a1) +; RV64-NEXT: addi a1, a0, 384 ; RV64-NEXT: addi a0, a0, 128 -; RV64-NEXT: vle64.v v24, (a0) -; RV64-NEXT: vle64.v v0, (a1) -; RV64-NEXT: vor.vv v16, v24, v16 -; RV64-NEXT: vor.vv v8, v8, v0 +; RV64-NEXT: vle64.v v24, (a1) +; RV64-NEXT: vle64.v v0, (a0) +; RV64-NEXT: vor.vv v24, v0, v24 ; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vor.vv v8, v8, v24 ; RV64-NEXT: vredor.vs v8, v8, v8 ; RV64-NEXT: vmv.x.s a0, v8 ; RV64-NEXT: ret @@ -3414,14 +3414,14 @@ declare i64 @llvm.vector.reduce.xor.v64i64(<64 x i64>) define i64 @vreduce_xor_v64i64(ptr %x) nounwind { ; RV32-LABEL: vreduce_xor_v64i64: ; RV32: # %bb.0: -; RV32-NEXT: addi a1, a0, 384 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vle64.v v24, (a1) -; RV32-NEXT: addi a1, a0, 128 -; RV32-NEXT: vle64.v v0, (a1) ; RV32-NEXT: vle64.v v8, (a0) -; RV32-NEXT: addi a0, a0, 256 -; RV32-NEXT: vle64.v v16, (a0) +; RV32-NEXT: addi a1, a0, 256 +; RV32-NEXT: vle64.v v16, (a1) +; RV32-NEXT: addi a1, a0, 384 +; RV32-NEXT: addi a0, a0, 128 +; RV32-NEXT: vle64.v v24, (a1) +; RV32-NEXT: vle64.v v0, (a0) ; RV32-NEXT: vxor.vv v24, v0, v24 ; RV32-NEXT: vmv.s.x v7, zero ; RV32-NEXT: li a1, 32 @@ -3438,15 +3438,15 @@ define i64 @vreduce_xor_v64i64(ptr %x) nounwind { ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vle64.v v8, (a0) -; RV64-NEXT: addi a1, a0, 384 -; RV64-NEXT: vle64.v v16, (a1) ; RV64-NEXT: addi a1, a0, 256 +; RV64-NEXT: vle64.v v16, (a1) +; RV64-NEXT: addi a1, a0, 384 ; RV64-NEXT: addi a0, a0, 128 -; RV64-NEXT: vle64.v v24, (a0) -; RV64-NEXT: vle64.v v0, (a1) -; RV64-NEXT: vxor.vv v16, v24, v16 -; RV64-NEXT: vxor.vv v8, v8, v0 +; RV64-NEXT: vle64.v v24, (a1) +; RV64-NEXT: vle64.v v0, (a0) +; RV64-NEXT: vxor.vv v24, v0, v24 ; RV64-NEXT: vxor.vv v8, v8, v16 +; RV64-NEXT: vxor.vv v8, v8, v24 ; RV64-NEXT: vmv.s.x v16, zero ; RV64-NEXT: vredxor.vs v8, v8, v16 ; RV64-NEXT: vmv.x.s a0, v8 @@ -4011,16 +4011,16 @@ define i64 @vreduce_smin_v64i64(ptr %x) nounwind { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vle64.v v8, (a0) -; RV32-NEXT: addi a1, a0, 384 -; RV32-NEXT: vle64.v v16, (a1) ; RV32-NEXT: addi a1, a0, 256 +; RV32-NEXT: vle64.v v16, (a1) +; RV32-NEXT: addi a1, a0, 384 ; RV32-NEXT: addi a0, a0, 128 -; RV32-NEXT: vle64.v v0, (a0) ; RV32-NEXT: vle64.v v24, (a1) ; RV32-NEXT: li a1, 32 -; RV32-NEXT: vmin.vv v16, v0, v16 -; RV32-NEXT: vmin.vv v8, v8, v24 +; RV32-NEXT: vle64.v v0, (a0) +; RV32-NEXT: vmin.vv v24, v0, v24 ; RV32-NEXT: vmin.vv v8, v8, v16 +; RV32-NEXT: vmin.vv v8, v8, v24 ; RV32-NEXT: vredmin.vs v8, v8, v8 ; RV32-NEXT: vmv.x.s a0, v8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma @@ -4032,15 +4032,15 @@ define i64 @vreduce_smin_v64i64(ptr %x) nounwind { ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: 
vle64.v v8, (a0) -; RV64-NEXT: addi a1, a0, 384 -; RV64-NEXT: vle64.v v16, (a1) ; RV64-NEXT: addi a1, a0, 256 +; RV64-NEXT: vle64.v v16, (a1) +; RV64-NEXT: addi a1, a0, 384 ; RV64-NEXT: addi a0, a0, 128 -; RV64-NEXT: vle64.v v24, (a0) -; RV64-NEXT: vle64.v v0, (a1) -; RV64-NEXT: vmin.vv v16, v24, v16 -; RV64-NEXT: vmin.vv v8, v8, v0 +; RV64-NEXT: vle64.v v24, (a1) +; RV64-NEXT: vle64.v v0, (a0) +; RV64-NEXT: vmin.vv v24, v0, v24 ; RV64-NEXT: vmin.vv v8, v8, v16 +; RV64-NEXT: vmin.vv v8, v8, v24 ; RV64-NEXT: vredmin.vs v8, v8, v8 ; RV64-NEXT: vmv.x.s a0, v8 ; RV64-NEXT: ret @@ -4604,16 +4604,16 @@ define i64 @vreduce_smax_v64i64(ptr %x) nounwind { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vle64.v v8, (a0) -; RV32-NEXT: addi a1, a0, 384 -; RV32-NEXT: vle64.v v16, (a1) ; RV32-NEXT: addi a1, a0, 256 +; RV32-NEXT: vle64.v v16, (a1) +; RV32-NEXT: addi a1, a0, 384 ; RV32-NEXT: addi a0, a0, 128 -; RV32-NEXT: vle64.v v0, (a0) ; RV32-NEXT: vle64.v v24, (a1) ; RV32-NEXT: li a1, 32 -; RV32-NEXT: vmax.vv v16, v0, v16 -; RV32-NEXT: vmax.vv v8, v8, v24 +; RV32-NEXT: vle64.v v0, (a0) +; RV32-NEXT: vmax.vv v24, v0, v24 ; RV32-NEXT: vmax.vv v8, v8, v16 +; RV32-NEXT: vmax.vv v8, v8, v24 ; RV32-NEXT: vredmax.vs v8, v8, v8 ; RV32-NEXT: vmv.x.s a0, v8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma @@ -4625,15 +4625,15 @@ define i64 @vreduce_smax_v64i64(ptr %x) nounwind { ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vle64.v v8, (a0) -; RV64-NEXT: addi a1, a0, 384 -; RV64-NEXT: vle64.v v16, (a1) ; RV64-NEXT: addi a1, a0, 256 +; RV64-NEXT: vle64.v v16, (a1) +; RV64-NEXT: addi a1, a0, 384 ; RV64-NEXT: addi a0, a0, 128 -; RV64-NEXT: vle64.v v24, (a0) -; RV64-NEXT: vle64.v v0, (a1) -; RV64-NEXT: vmax.vv v16, v24, v16 -; RV64-NEXT: vmax.vv v8, v8, v0 +; RV64-NEXT: vle64.v v24, (a1) +; RV64-NEXT: vle64.v v0, (a0) +; RV64-NEXT: vmax.vv v24, v0, v24 ; RV64-NEXT: vmax.vv v8, v8, v16 +; RV64-NEXT: vmax.vv v8, v8, v24 ; RV64-NEXT: vredmax.vs v8, v8, v8 ; RV64-NEXT: vmv.x.s a0, v8 ; RV64-NEXT: ret @@ -5197,16 +5197,16 @@ define i64 @vreduce_umin_v64i64(ptr %x) nounwind { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vle64.v v8, (a0) -; RV32-NEXT: addi a1, a0, 384 -; RV32-NEXT: vle64.v v16, (a1) ; RV32-NEXT: addi a1, a0, 256 +; RV32-NEXT: vle64.v v16, (a1) +; RV32-NEXT: addi a1, a0, 384 ; RV32-NEXT: addi a0, a0, 128 -; RV32-NEXT: vle64.v v0, (a0) ; RV32-NEXT: vle64.v v24, (a1) ; RV32-NEXT: li a1, 32 -; RV32-NEXT: vminu.vv v16, v0, v16 -; RV32-NEXT: vminu.vv v8, v8, v24 +; RV32-NEXT: vle64.v v0, (a0) +; RV32-NEXT: vminu.vv v24, v0, v24 ; RV32-NEXT: vminu.vv v8, v8, v16 +; RV32-NEXT: vminu.vv v8, v8, v24 ; RV32-NEXT: vredminu.vs v8, v8, v8 ; RV32-NEXT: vmv.x.s a0, v8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma @@ -5218,15 +5218,15 @@ define i64 @vreduce_umin_v64i64(ptr %x) nounwind { ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vle64.v v8, (a0) -; RV64-NEXT: addi a1, a0, 384 -; RV64-NEXT: vle64.v v16, (a1) ; RV64-NEXT: addi a1, a0, 256 +; RV64-NEXT: vle64.v v16, (a1) +; RV64-NEXT: addi a1, a0, 384 ; RV64-NEXT: addi a0, a0, 128 -; RV64-NEXT: vle64.v v24, (a0) -; RV64-NEXT: vle64.v v0, (a1) -; RV64-NEXT: vminu.vv v16, v24, v16 -; RV64-NEXT: vminu.vv v8, v8, v0 +; RV64-NEXT: vle64.v v24, (a1) +; RV64-NEXT: vle64.v v0, (a0) +; RV64-NEXT: vminu.vv v24, v0, v24 ; RV64-NEXT: vminu.vv v8, v8, v16 +; RV64-NEXT: vminu.vv v8, v8, v24 ; RV64-NEXT: vredminu.vs v8, v8, v8 ; RV64-NEXT: vmv.x.s a0, v8 ; 
RV64-NEXT: ret @@ -5789,16 +5789,16 @@ define i64 @vreduce_umax_v64i64(ptr %x) nounwind { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vle64.v v8, (a0) -; RV32-NEXT: addi a1, a0, 384 -; RV32-NEXT: vle64.v v16, (a1) ; RV32-NEXT: addi a1, a0, 256 +; RV32-NEXT: vle64.v v16, (a1) +; RV32-NEXT: addi a1, a0, 384 ; RV32-NEXT: addi a0, a0, 128 -; RV32-NEXT: vle64.v v0, (a0) ; RV32-NEXT: vle64.v v24, (a1) ; RV32-NEXT: li a1, 32 -; RV32-NEXT: vmaxu.vv v16, v0, v16 -; RV32-NEXT: vmaxu.vv v8, v8, v24 +; RV32-NEXT: vle64.v v0, (a0) +; RV32-NEXT: vmaxu.vv v24, v0, v24 ; RV32-NEXT: vmaxu.vv v8, v8, v16 +; RV32-NEXT: vmaxu.vv v8, v8, v24 ; RV32-NEXT: vredmaxu.vs v8, v8, v8 ; RV32-NEXT: vmv.x.s a0, v8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma @@ -5810,15 +5810,15 @@ define i64 @vreduce_umax_v64i64(ptr %x) nounwind { ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vle64.v v8, (a0) -; RV64-NEXT: addi a1, a0, 384 -; RV64-NEXT: vle64.v v16, (a1) ; RV64-NEXT: addi a1, a0, 256 +; RV64-NEXT: vle64.v v16, (a1) +; RV64-NEXT: addi a1, a0, 384 ; RV64-NEXT: addi a0, a0, 128 -; RV64-NEXT: vle64.v v24, (a0) -; RV64-NEXT: vle64.v v0, (a1) -; RV64-NEXT: vmaxu.vv v16, v24, v16 -; RV64-NEXT: vmaxu.vv v8, v8, v0 +; RV64-NEXT: vle64.v v24, (a1) +; RV64-NEXT: vle64.v v0, (a0) +; RV64-NEXT: vmaxu.vv v24, v0, v24 ; RV64-NEXT: vmaxu.vv v8, v8, v16 +; RV64-NEXT: vmaxu.vv v8, v8, v24 ; RV64-NEXT: vredmaxu.vs v8, v8, v8 ; RV64-NEXT: vmv.x.s a0, v8 ; RV64-NEXT: ret @@ -6585,15 +6585,15 @@ define i64 @vreduce_mul_v64i64(ptr %x) nounwind { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vle64.v v8, (a0) -; RV32-NEXT: addi a1, a0, 384 -; RV32-NEXT: vle64.v v16, (a1) ; RV32-NEXT: addi a1, a0, 256 +; RV32-NEXT: vle64.v v16, (a1) +; RV32-NEXT: addi a1, a0, 384 ; RV32-NEXT: addi a0, a0, 128 -; RV32-NEXT: vle64.v v24, (a0) -; RV32-NEXT: vle64.v v0, (a1) -; RV32-NEXT: vmul.vv v16, v24, v16 -; RV32-NEXT: vmul.vv v8, v8, v0 +; RV32-NEXT: vle64.v v24, (a1) +; RV32-NEXT: vle64.v v0, (a0) +; RV32-NEXT: vmul.vv v24, v0, v24 ; RV32-NEXT: vmul.vv v8, v8, v16 +; RV32-NEXT: vmul.vv v8, v8, v24 ; RV32-NEXT: vslidedown.vi v16, v8, 8 ; RV32-NEXT: vmul.vv v8, v8, v16 ; RV32-NEXT: vslidedown.vi v16, v8, 4 @@ -6612,15 +6612,15 @@ define i64 @vreduce_mul_v64i64(ptr %x) nounwind { ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vle64.v v8, (a0) -; RV64-NEXT: addi a1, a0, 384 -; RV64-NEXT: vle64.v v16, (a1) ; RV64-NEXT: addi a1, a0, 256 +; RV64-NEXT: vle64.v v16, (a1) +; RV64-NEXT: addi a1, a0, 384 ; RV64-NEXT: addi a0, a0, 128 -; RV64-NEXT: vle64.v v24, (a0) -; RV64-NEXT: vle64.v v0, (a1) -; RV64-NEXT: vmul.vv v16, v24, v16 -; RV64-NEXT: vmul.vv v8, v8, v0 +; RV64-NEXT: vle64.v v24, (a1) +; RV64-NEXT: vle64.v v0, (a0) +; RV64-NEXT: vmul.vv v24, v0, v24 ; RV64-NEXT: vmul.vv v8, v8, v16 +; RV64-NEXT: vmul.vv v8, v8, v24 ; RV64-NEXT: vslidedown.vi v16, v8, 8 ; RV64-NEXT: vmul.vv v8, v8, v16 ; RV64-NEXT: vslidedown.vi v16, v8, 4 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-addsub.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-addsub.ll index 318f38839851c..77c5d1b3bf641 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-addsub.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-addsub.ll @@ -149,18 +149,17 @@ define <64 x i32> @select_addsub_v64i32(<64 x i1> %cc, <64 x i32> %a, <64 x i32> ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 
0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; CHECK-NEXT: vmv8r.v v16, v8 -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: addi a1, a0, 128 +; CHECK-NEXT: li a2, 32 +; CHECK-NEXT: vslidedown.vi v7, v0, 4 +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 128 -; CHECK-NEXT: vle32.v v24, (a0) ; CHECK-NEXT: vrsub.vi v8, v8, 0, v0.t -; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vi v0, v0, 4 -; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vle32.v v24, (a1) ; CHECK-NEXT: vadd.vv v8, v16, v8 +; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vrsub.vi v24, v24, 0, v0.t ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll index 03d5762b4903e..c026b8427578b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll @@ -1073,19 +1073,19 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFH-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; ZVFH-NEXT: addi a1, a0, 128 ; ZVFH-NEXT: li a3, 64 +; ZVFH-NEXT: vsetivli zero, 8, e8, m1, ta, ma +; ZVFH-NEXT: vslidedown.vi v24, v0, 8 ; ZVFH-NEXT: vsetvli zero, a3, e16, m8, ta, ma ; ZVFH-NEXT: vle16.v v16, (a1) ; ZVFH-NEXT: addi a1, sp, 16 ; ZVFH-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; ZVFH-NEXT: mv a1, a2 ; ZVFH-NEXT: vle16.v v16, (a0) -; ZVFH-NEXT: mv a0, a2 -; ZVFH-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; ZVFH-NEXT: vslidedown.vi v24, v0, 8 ; ZVFH-NEXT: bltu a2, a3, .LBB43_2 ; ZVFH-NEXT: # %bb.1: -; ZVFH-NEXT: li a0, 64 +; ZVFH-NEXT: li a1, 64 ; ZVFH-NEXT: .LBB43_2: -; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma ; ZVFH-NEXT: vmfeq.vv v7, v8, v16, v0.t ; ZVFH-NEXT: addi a0, a2, -64 ; ZVFH-NEXT: sltu a1, a2, a0 @@ -1152,14 +1152,14 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN32-NEXT: addi a3, sp, 640 ; ZVFHMIN32-NEXT: addi a4, sp, 384 ; ZVFHMIN32-NEXT: addi a5, sp, 512 +; ZVFHMIN32-NEXT: addi a6, sp, 256 ; ZVFHMIN32-NEXT: vsetvli zero, a2, e16, m8, ta, ma -; ZVFHMIN32-NEXT: vle16.v v0, (a0) -; ZVFHMIN32-NEXT: addi a0, sp, 256 ; ZVFHMIN32-NEXT: vle16.v v24, (a1) +; ZVFHMIN32-NEXT: vle16.v v0, (a0) ; ZVFHMIN32-NEXT: vse16.v v8, (a3) ; ZVFHMIN32-NEXT: vse16.v v0, (a4) ; ZVFHMIN32-NEXT: vse16.v v16, (a5) -; ZVFHMIN32-NEXT: vse16.v v24, (a0) +; ZVFHMIN32-NEXT: vse16.v v24, (a6) ; ZVFHMIN32-NEXT: lh a0, 704(sp) ; ZVFHMIN32-NEXT: lh a1, 448(sp) ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 @@ -2286,14 +2286,14 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN64-NEXT: addi a3, sp, 640 ; ZVFHMIN64-NEXT: addi a4, sp, 384 ; ZVFHMIN64-NEXT: addi a5, sp, 512 +; ZVFHMIN64-NEXT: addi a6, sp, 256 ; ZVFHMIN64-NEXT: vsetvli zero, a2, e16, m8, ta, ma -; ZVFHMIN64-NEXT: vle16.v v0, (a0) -; ZVFHMIN64-NEXT: addi a0, sp, 256 ; ZVFHMIN64-NEXT: vle16.v v24, (a1) +; ZVFHMIN64-NEXT: vle16.v v0, (a0) ; ZVFHMIN64-NEXT: vse16.v v8, (a3) ; ZVFHMIN64-NEXT: vse16.v v0, (a4) ; ZVFHMIN64-NEXT: vse16.v v16, (a5) -; ZVFHMIN64-NEXT: vse16.v v24, (a0) +; ZVFHMIN64-NEXT: vse16.v v24, (a6) 
; ZVFHMIN64-NEXT: lh a0, 704(sp) ; ZVFHMIN64-NEXT: lh a1, 448(sp) ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 @@ -3953,20 +3953,20 @@ define <32 x i1> @fcmp_oeq_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32 x ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: addi a1, a0, 128 +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: li a3, 16 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v16, (a1) ; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: mv a1, a2 ; CHECK-NEXT: vle64.v v16, (a0) -; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: mv a0, a2 -; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vi v24, v0, 2 -; CHECK-NEXT: bltu a2, a1, .LBB87_2 +; CHECK-NEXT: bltu a2, a3, .LBB87_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a0, 16 +; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: .LBB87_2: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-NEXT: vmfeq.vv v7, v8, v16, v0.t ; CHECK-NEXT: addi a0, a2, -16 ; CHECK-NEXT: sltu a1, a2, a0 @@ -3977,13 +3977,13 @@ define <32 x i1> @fcmp_oeq_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32 x ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmfeq.vv v8, v16, v24, v0.t +; CHECK-NEXT: vmfeq.vv v16, v8, v24, v0.t ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vslideup.vi v7, v8, 2 +; CHECK-NEXT: vslideup.vi v7, v16, 2 ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll index 69d6ffa9f300c..56d94dd55696b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll @@ -599,31 +599,31 @@ define <256 x i1> @icmp_eq_vv_v256i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> ; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; CHECK-NEXT: vmv1r.v v7, v0 +; CHECK-NEXT: vmv1r.v v24, v0 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: li a1, 128 -; CHECK-NEXT: addi a4, a0, 128 +; CHECK-NEXT: addi a4, a3, -128 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma ; CHECK-NEXT: vlm.v v0, (a2) -; CHECK-NEXT: addi a2, a3, -128 -; CHECK-NEXT: vle8.v v8, (a4) -; CHECK-NEXT: sltu a4, a3, a2 -; CHECK-NEXT: vle8.v v24, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: addi a4, a4, -1 -; CHECK-NEXT: and a2, a4, a2 +; CHECK-NEXT: sltu a2, a3, a4 +; CHECK-NEXT: addi a2, a2, -1 +; CHECK-NEXT: and a2, a2, a4 +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: addi a4, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; CHECK-NEXT: addi a0, a0, 128 +; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; 
CHECK-NEXT: vmseq.vv v6, v16, v8, v0.t +; CHECK-NEXT: vmseq.vv v7, v16, v8, v0.t ; CHECK-NEXT: bltu a3, a1, .LBB51_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a3, 128 ; CHECK-NEXT: .LBB51_2: -; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 @@ -634,7 +634,7 @@ define <256 x i1> @icmp_eq_vv_v256i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> ; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma ; CHECK-NEXT: vmseq.vv v16, v8, v24, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vmv1r.v v8, v6 +; CHECK-NEXT: vmv1r.v v8, v7 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 @@ -1263,19 +1263,19 @@ define <64 x i1> @icmp_eq_vv_v64i32(<64 x i32> %va, <64 x i32> %vb, <64 x i1> %m ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: addi a1, a0, 128 ; CHECK-NEXT: li a3, 32 +; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vi v24, v0, 4 ; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma ; CHECK-NEXT: vle32.v v16, (a1) ; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: mv a1, a2 ; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: mv a0, a2 -; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vi v24, v0, 4 ; CHECK-NEXT: bltu a2, a3, .LBB99_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: .LBB99_2: -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; CHECK-NEXT: vmseq.vv v7, v8, v16, v0.t ; CHECK-NEXT: addi a0, a2, -32 ; CHECK-NEXT: sltu a1, a2, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll index 4b7f82f94f5e4..9e887dfa8dc06 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll @@ -636,16 +636,16 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask ; CHECK-RV32-NEXT: add a5, a1, a5 ; CHECK-RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma ; CHECK-RV32-NEXT: vlse64.v v24, (a5), a2, v0.t +; CHECK-RV32-NEXT: addi a3, a0, 128 +; CHECK-RV32-NEXT: addi a5, a0, 256 ; CHECK-RV32-NEXT: vmv1r.v v0, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e64, m8, ta, ma ; CHECK-RV32-NEXT: vlse64.v v8, (a1), a2, v0.t -; CHECK-RV32-NEXT: addi a1, a0, 128 -; CHECK-RV32-NEXT: addi a2, a0, 256 ; CHECK-RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-RV32-NEXT: vse64.v v8, (a0) -; CHECK-RV32-NEXT: vse64.v v24, (a1) +; CHECK-RV32-NEXT: vse64.v v24, (a3) ; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-RV32-NEXT: vse64.v v16, (a2) +; CHECK-RV32-NEXT: vse64.v v16, (a5) ; CHECK-RV32-NEXT: ret ; ; CHECK-RV64-LABEL: strided_load_v33f64: @@ -687,16 +687,16 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask ; CHECK-RV64-NEXT: add a5, a1, a5 ; CHECK-RV64-NEXT: vsetvli zero, a4, e64, m8, ta, ma ; CHECK-RV64-NEXT: vlse64.v v24, (a5), a2, v0.t +; CHECK-RV64-NEXT: addi a4, a0, 128 +; CHECK-RV64-NEXT: addi a5, a0, 256 ; CHECK-RV64-NEXT: vmv1r.v v0, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e64, m8, ta, ma ; CHECK-RV64-NEXT: vlse64.v v8, (a1), a2, v0.t -; CHECK-RV64-NEXT: addi a1, a0, 128 -; CHECK-RV64-NEXT: addi a2, a0, 256 ; CHECK-RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-RV64-NEXT: vse64.v v8, (a0) -; CHECK-RV64-NEXT: vse64.v v24, (a1) +; CHECK-RV64-NEXT: vse64.v v24, 
(a4) ; CHECK-RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-RV64-NEXT: vse64.v v16, (a2) +; CHECK-RV64-NEXT: vse64.v v16, (a5) ; CHECK-RV64-NEXT: ret %v = call <33 x double> @llvm.experimental.vp.strided.load.v33f64.p0.i64(ptr %ptr, i64 %stride, <33 x i1> %mask, i32 %evl) ret <33 x double> %v diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll index a91dee1cb245f..6444968a04a83 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll @@ -245,64 +245,62 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze ; CHECK-NEXT: addi a2, a2, 16 ; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: vslidedown.vi v6, v0, 8 -; CHECK-NEXT: addi a2, a1, 512 -; CHECK-NEXT: addi a3, a1, 640 -; CHECK-NEXT: addi a4, a7, -64 ; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vi v26, v0, 4 +; CHECK-NEXT: addi a3, a1, 128 +; CHECK-NEXT: addi a2, a1, 640 +; CHECK-NEXT: addi a4, a7, -64 ; CHECK-NEXT: vslidedown.vi v27, v6, 4 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v8, (a3) -; CHECK-NEXT: sltu a3, a7, a4 +; CHECK-NEXT: vle64.v v8, (a2) +; CHECK-NEXT: sltu a2, a7, a4 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vi v0, v27, 2 -; CHECK-NEXT: addi a3, a3, -1 -; CHECK-NEXT: and a4, a3, a4 -; CHECK-NEXT: addi a3, a4, -32 -; CHECK-NEXT: sltu a5, a4, a3 +; CHECK-NEXT: addi a2, a2, -1 +; CHECK-NEXT: and a4, a2, a4 +; CHECK-NEXT: addi a2, a4, -32 +; CHECK-NEXT: sltu a5, a4, a2 ; CHECK-NEXT: addi a5, a5, -1 -; CHECK-NEXT: and a3, a5, a3 -; CHECK-NEXT: addi a5, a3, -16 -; CHECK-NEXT: sltu a6, a3, a5 +; CHECK-NEXT: and a5, a5, a2 +; CHECK-NEXT: addi a2, a5, -16 +; CHECK-NEXT: sltu a6, a5, a2 ; CHECK-NEXT: addi a6, a6, -1 -; CHECK-NEXT: and a5, a6, a5 -; CHECK-NEXT: vsetvli zero, a5, e32, m4, ta, ma +; CHECK-NEXT: and a2, a6, a2 +; CHECK-NEXT: addi a6, a1, 512 +; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, ma ; CHECK-NEXT: vnsrl.wi v16, v8, 0, v0.t -; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: slli a5, a5, 4 -; CHECK-NEXT: add a5, sp, a5 -; CHECK-NEXT: addi a5, a5, 16 -; CHECK-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v8, (a2) -; CHECK-NEXT: addi a5, a1, 128 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 4 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: li a2, 16 -; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vi v26, v7, 4 -; CHECK-NEXT: bltu a3, a2, .LBB16_2 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vle64.v v8, (a6) +; CHECK-NEXT: bltu a5, a2, .LBB16_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a3, 16 +; CHECK-NEXT: li a5, 16 ; CHECK-NEXT: .LBB16_2: ; CHECK-NEXT: vmv1r.v v0, v27 -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v16, (a5) -; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: li a6, 56 -; CHECK-NEXT: mul a5, a5, a6 -; CHECK-NEXT: add a5, sp, a5 -; CHECK-NEXT: addi a5, a5, 16 -; CHECK-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill -; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vi v27, v26, 2 -; CHECK-NEXT: li a5, 64 -; CHECK-NEXT: vsetvli zero, a3, e32, m4, ta, ma -; CHECK-NEXT: vnsrl.wi v16, v8, 0, v0.t +; CHECK-NEXT: vle64.v v16, (a3) ; CHECK-NEXT: csrr a3, vlenb -; 
CHECK-NEXT: slli a3, a3, 6 +; CHECK-NEXT: li a6, 56 +; CHECK-NEXT: mul a3, a3, a6 ; CHECK-NEXT: add a3, sp, a3 ; CHECK-NEXT: addi a3, a3, 16 ; CHECK-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v27, v26, 2 +; CHECK-NEXT: li a3, 64 +; CHECK-NEXT: vsetvli zero, a5, e32, m4, ta, ma +; CHECK-NEXT: vnsrl.wi v16, v8, 0, v0.t +; CHECK-NEXT: csrr a5, vlenb +; CHECK-NEXT: slli a5, a5, 6 +; CHECK-NEXT: add a5, sp, a5 +; CHECK-NEXT: addi a5, a5, 16 +; CHECK-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill ; CHECK-NEXT: mv a6, a7 -; CHECK-NEXT: bltu a7, a5, .LBB16_4 +; CHECK-NEXT: bltu a7, a3, .LBB16_4 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: li a6, 64 ; CHECK-NEXT: .LBB16_4: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll index fa82065f3b413..0b32818dba15f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll @@ -298,46 +298,36 @@ define <32 x double> @vfsgnj_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32 ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: addi a1, a0, 128 +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v7, v0, 2 +; CHECK-NEXT: li a3, 16 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v16, (a1) -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vle64.v v16, (a0) -; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: mv a0, a2 -; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vi v24, v0, 2 -; CHECK-NEXT: bltu a2, a1, .LBB26_2 +; CHECK-NEXT: mv a1, a2 +; CHECK-NEXT: vle64.v v24, (a0) +; CHECK-NEXT: bltu a2, a3, .LBB26_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a0, 16 +; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: .LBB26_2: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfsgnj.vv v8, v8, v16, v0.t +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vfsgnj.vv v8, v8, v24, v0.t ; CHECK-NEXT: addi a0, a2, -16 ; CHECK-NEXT: sltu a1, a2, a0 ; CHECK-NEXT: addi a1, a1, -1 ; CHECK-NEXT: and a0, a1, a0 -; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfsgnj.vv v16, v16, v24, v0.t +; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -352,8 +342,8 @@ define <32 x double> 
@vfsgnj_vv_v32f64_unmasked(<32 x double> %va, <32 x double> ; CHECK: # %bb.0: ; CHECK-NEXT: addi a1, a0, 128 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v24, (a1) ; CHECK-NEXT: vle64.v v0, (a0) +; CHECK-NEXT: vle64.v v24, (a1) ; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: mv a0, a2 ; CHECK-NEXT: bltu a2, a1, .LBB27_2 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll index bde842dcc7600..a6c51ced93ddc 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll @@ -849,35 +849,35 @@ define <32 x double> @vfma_vv_v32f64(<32 x double> %va, <32 x double> %b, <32 x ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: addi a1, a2, 128 -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v24, (a2) -; CHECK-NEXT: addi a2, a0, 128 -; CHECK-NEXT: vle64.v v8, (a1) -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vle64.v v8, (a2) -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: mv a0, a4 +; CHECK-NEXT: addi a3, a2, 128 +; CHECK-NEXT: addi a5, a0, 128 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vi v7, v0, 2 -; CHECK-NEXT: bltu a4, a1, .LBB50_2 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vle64.v v16, (a2) +; CHECK-NEXT: li a2, 16 +; CHECK-NEXT: mv a1, a4 +; CHECK-NEXT: vle64.v v8, (a3) +; CHECK-NEXT: csrr a3, vlenb +; CHECK-NEXT: slli a3, a3, 3 +; CHECK-NEXT: add a3, sp, a3 +; CHECK-NEXT: addi a3, a3, 16 +; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; CHECK-NEXT: vle64.v v8, (a5) +; CHECK-NEXT: addi a3, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: bltu a4, a2, .LBB50_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a0, 16 +; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: .LBB50_2: -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfmadd.vv v8, v16, v24, v0.t +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v24, v16, v0.t ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 @@ -893,16 +893,16 @@ define <32 x double> @vfma_vv_v32f64(<32 x double> %va, <32 x double> %b, <32 x ; CHECK-NEXT: mul a1, a1, a2 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; 
CHECK-NEXT: vfmadd.vv v8, v16, v24, v0.t +; CHECK-NEXT: vfmadd.vv v8, v24, v16, v0.t ; CHECK-NEXT: vmv.v.v v16, v8 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 @@ -941,26 +941,26 @@ define <32 x double> @vfma_vv_v32f64_unmasked(<32 x double> %va, <32 x double> % ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: addi a1, a2, 128 +; CHECK-NEXT: addi a3, a0, 128 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v16, (a2) -; CHECK-NEXT: addi a2, a0, 128 +; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: vle64.v v8, (a1) ; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vle64.v v24, (a2) +; CHECK-NEXT: mv a1, a4 +; CHECK-NEXT: vle64.v v24, (a3) ; CHECK-NEXT: vle64.v v0, (a0) -; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: mv a0, a4 -; CHECK-NEXT: bltu a4, a1, .LBB51_2 +; CHECK-NEXT: bltu a4, a2, .LBB51_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a0, 16 +; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: .LBB51_2: -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-NEXT: vfmadd.vv v0, v8, v16 ; CHECK-NEXT: addi a0, a4, -16 ; CHECK-NEXT: sltu a1, a4, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll index b37c47a32ba21..16043d33fb4df 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll @@ -390,46 +390,36 @@ define <32 x double> @vfmax_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32 ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: addi a1, a0, 128 +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v7, v0, 2 +; CHECK-NEXT: li a3, 16 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v16, (a1) -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vle64.v v16, (a0) -; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: mv a0, a2 -; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vi v24, v0, 2 -; CHECK-NEXT: bltu a2, a1, .LBB26_2 +; CHECK-NEXT: mv a1, a2 +; CHECK-NEXT: vle64.v v24, (a0) +; CHECK-NEXT: bltu a2, a3, .LBB26_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a0, 16 +; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: .LBB26_2: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfmax.vv v8, v8, v16, v0.t +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vfmax.vv v8, v8, v24, v0.t ; 
CHECK-NEXT: addi a0, a2, -16 ; CHECK-NEXT: sltu a1, a2, a0 ; CHECK-NEXT: addi a1, a1, -1 ; CHECK-NEXT: and a0, a1, a0 -; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfmax.vv v16, v16, v24, v0.t +; CHECK-NEXT: vfmax.vv v16, v24, v16, v0.t ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -444,8 +434,8 @@ define <32 x double> @vfmax_vv_v32f64_unmasked(<32 x double> %va, <32 x double> ; CHECK: # %bb.0: ; CHECK-NEXT: addi a1, a0, 128 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v24, (a1) ; CHECK-NEXT: vle64.v v0, (a0) +; CHECK-NEXT: vle64.v v24, (a1) ; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: mv a0, a2 ; CHECK-NEXT: bltu a2, a1, .LBB27_2 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll index 261523e8ace50..5a4a85ca32a70 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll @@ -390,46 +390,36 @@ define <32 x double> @vfmin_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32 ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: addi a1, a0, 128 +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v7, v0, 2 +; CHECK-NEXT: li a3, 16 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v16, (a1) -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vle64.v v16, (a0) -; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: mv a0, a2 -; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vi v24, v0, 2 -; CHECK-NEXT: bltu a2, a1, .LBB26_2 +; CHECK-NEXT: mv a1, a2 +; CHECK-NEXT: vle64.v v24, (a0) +; CHECK-NEXT: bltu a2, a3, .LBB26_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a0, 16 +; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: .LBB26_2: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfmin.vv v8, v8, v16, v0.t +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vfmin.vv v8, v8, v24, v0.t ; CHECK-NEXT: addi a0, a2, -16 ; CHECK-NEXT: sltu a1, a2, a0 ; CHECK-NEXT: addi a1, a1, -1 ; CHECK-NEXT: and a0, a1, a0 -; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded 
Reload ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfmin.vv v16, v16, v24, v0.t +; CHECK-NEXT: vfmin.vv v16, v24, v16, v0.t ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -444,8 +434,8 @@ define <32 x double> @vfmin_vv_v32f64_unmasked(<32 x double> %va, <32 x double> ; CHECK: # %bb.0: ; CHECK-NEXT: addi a1, a0, 128 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v24, (a1) ; CHECK-NEXT: vle64.v v0, (a0) +; CHECK-NEXT: vle64.v v24, (a1) ; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: mv a0, a2 ; CHECK-NEXT: bltu a2, a1, .LBB27_2 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll index a5d9b3439e29b..eb4ce757a8385 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll @@ -621,35 +621,35 @@ define <32 x double> @vfma_vv_v32f64(<32 x double> %va, <32 x double> %b, <32 x ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: addi a1, a2, 128 -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v24, (a2) -; CHECK-NEXT: addi a2, a0, 128 -; CHECK-NEXT: vle64.v v8, (a1) -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vle64.v v8, (a2) -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: mv a0, a4 +; CHECK-NEXT: addi a3, a2, 128 +; CHECK-NEXT: addi a5, a0, 128 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vi v7, v0, 2 -; CHECK-NEXT: bltu a4, a1, .LBB50_2 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vle64.v v16, (a2) +; CHECK-NEXT: li a2, 16 +; CHECK-NEXT: mv a1, a4 +; CHECK-NEXT: vle64.v v8, (a3) +; CHECK-NEXT: csrr a3, vlenb +; CHECK-NEXT: slli a3, a3, 3 +; CHECK-NEXT: add a3, sp, a3 +; CHECK-NEXT: addi a3, a3, 16 +; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; CHECK-NEXT: vle64.v v8, (a5) +; CHECK-NEXT: addi a3, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: bltu a4, a2, .LBB50_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a0, 16 +; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: .LBB50_2: -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfmadd.vv v8, v16, v24, v0.t +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v24, v16, v0.t ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 @@ -665,16 +665,16 @@ define <32 x double> @vfma_vv_v32f64(<32 x double> %va, <32 x double> %b, <32 x ; CHECK-NEXT: mul a1, a1, a2 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; 
CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfmadd.vv v8, v16, v24, v0.t +; CHECK-NEXT: vfmadd.vv v8, v24, v16, v0.t ; CHECK-NEXT: vmv.v.v v16, v8 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 @@ -713,26 +713,26 @@ define <32 x double> @vfma_vv_v32f64_unmasked(<32 x double> %va, <32 x double> % ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: addi a1, a2, 128 +; CHECK-NEXT: addi a3, a0, 128 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v16, (a2) -; CHECK-NEXT: addi a2, a0, 128 +; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: vle64.v v8, (a1) ; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vle64.v v24, (a2) +; CHECK-NEXT: mv a1, a4 +; CHECK-NEXT: vle64.v v24, (a3) ; CHECK-NEXT: vle64.v v0, (a0) -; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: mv a0, a4 -; CHECK-NEXT: bltu a4, a1, .LBB51_2 +; CHECK-NEXT: bltu a4, a2, .LBB51_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a0, 16 +; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: .LBB51_2: -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-NEXT: vfmadd.vv v0, v8, v16 ; CHECK-NEXT: addi a0, a4, -16 ; CHECK-NEXT: sltu a1, a4, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwadd.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwadd.ll index f2960b9600aca..fc893ae00585d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwadd.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwadd.ll @@ -99,8 +99,8 @@ define <64 x float> @vfwadd_v64f16(ptr %x, ptr %y) { ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vle16.v v0, (a1) ; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vle16.v v0, (a1) ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; CHECK-NEXT: vslidedown.vx v16, v8, a0 ; CHECK-NEXT: vslidedown.vx v8, v0, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmul.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmul.ll index 325302cb2bb8e..350f5efa46a66 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmul.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmul.ll @@ -99,8 +99,8 @@ define <64 x float> @vfwmul_v64f16(ptr %x, ptr %y) { ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vle16.v v0, (a1) ; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vle16.v v0, (a1) ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; CHECK-NEXT: vslidedown.vx v16, v8, a0 ; CHECK-NEXT: vslidedown.vx v8, v0, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwsub.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwsub.ll index 599a3f9099403..7c901494c033b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwsub.ll +++ 
b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwsub.ll @@ -99,8 +99,8 @@ define <64 x float> @vfwsub_v64f16(ptr %x, ptr %y) { ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vle16.v v0, (a1) ; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vle16.v v0, (a1) ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; CHECK-NEXT: vslidedown.vx v16, v8, a0 ; CHECK-NEXT: vslidedown.vx v8, v0, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll index 6394542479d1b..bb364b1d4bcf5 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll @@ -1360,15 +1360,15 @@ define <32 x double> @vpmerge_vv_v32f64(<32 x double> %va, <32 x double> %vb, <3 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vmv8r.v v16, v8 ; CHECK-NEXT: addi a1, a0, 128 +; CHECK-NEXT: li a3, 16 ; CHECK-NEXT: vle64.v v24, (a1) +; CHECK-NEXT: mv a1, a2 ; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: mv a0, a2 -; CHECK-NEXT: bltu a2, a1, .LBB83_2 +; CHECK-NEXT: bltu a2, a3, .LBB83_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a0, 16 +; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: .LBB83_2: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, tu, ma +; CHECK-NEXT: vsetvli zero, a1, e64, m8, tu, ma ; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0 ; CHECK-NEXT: addi a0, a2, -16 ; CHECK-NEXT: sltu a1, a2, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll index d691dcd5c54b6..1dff4a3feeaab 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll @@ -1778,16 +1778,16 @@ define void @vpscatter_v32f64(<32 x double> %val, <32 x ptr> %ptrs, <32 x i1> %m ; RV64-NEXT: addi a1, sp, 16 ; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV64-NEXT: addi a1, a0, 128 +; RV64-NEXT: li a3, 16 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vle64.v v16, (a1) +; RV64-NEXT: mv a1, a2 ; RV64-NEXT: vle64.v v24, (a0) -; RV64-NEXT: li a1, 16 -; RV64-NEXT: mv a0, a2 -; RV64-NEXT: bltu a2, a1, .LBB83_2 +; RV64-NEXT: bltu a2, a3, .LBB83_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: li a0, 16 +; RV64-NEXT: li a1, 16 ; RV64-NEXT: .LBB83_2: -; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV64-NEXT: vsoxei64.v v8, (zero), v24, v0.t ; RV64-NEXT: addi a0, a2, -16 ; RV64-NEXT: sltu a1, a2, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vscale-range.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vscale-range.ll index dc83edba5ae8c..c5506e175ce00 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vscale-range.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vscale-range.ll @@ -8,104 +8,48 @@ define <512 x i8> @vadd_v512i8_zvl128(<512 x i8> %a, <512 x i8> %b) #0 { ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: li a4, 48 -; CHECK-NEXT: mul a2, a2, a4 +; CHECK-NEXT: slli a2, a2, 4 ; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 5 -; CHECK-NEXT: add a2, sp, a2 -; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 
0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: li a4, 40 -; CHECK-NEXT: mul a2, a2, a4 +; CHECK-NEXT: slli a2, a2, 3 ; CHECK-NEXT: add a2, sp, a2 ; CHECK-NEXT: addi a2, a2, 16 ; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: li a2, 128 -; CHECK-NEXT: addi a4, a3, 128 -; CHECK-NEXT: addi a5, a3, 384 +; CHECK-NEXT: addi a4, a3, 256 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; CHECK-NEXT: vle8.v v8, (a5) -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: li a5, 24 -; CHECK-NEXT: mul a2, a2, a5 -; CHECK-NEXT: add a2, sp, a2 -; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; CHECK-NEXT: addi a2, a1, 128 -; CHECK-NEXT: vle8.v v8, (a1) -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: addi a1, a3, 256 -; CHECK-NEXT: vle8.v v8, (a1) -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vle8.v v8, (a2) -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vle8.v v24, (a4) +; CHECK-NEXT: addi a2, a3, 384 +; CHECK-NEXT: vle8.v v0, (a1) +; CHECK-NEXT: addi a1, a1, 128 +; CHECK-NEXT: vadd.vv v8, v0, v24 +; CHECK-NEXT: addi a4, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; CHECK-NEXT: vle8.v v0, (a2) +; CHECK-NEXT: vle8.v v24, (a1) +; CHECK-NEXT: vadd.vv v24, v24, v0 +; CHECK-NEXT: addi a1, a3, 128 +; CHECK-NEXT: vle8.v v0, (a1) +; CHECK-NEXT: vadd.vv v16, v16, v0 ; CHECK-NEXT: vle8.v v0, (a3) ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vadd.vv v8, v8, v16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: li a2, 24 -; CHECK-NEXT: mul a1, a1, a2 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vadd.vv v16, v16, v8 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 5 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vadd.vv v24, v8, v24 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: li a2, 40 -; CHECK-NEXT: mul a1, a1, a2 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vadd.vv v0, v8, v0 ; CHECK-NEXT: vse8.v v0, (a0) ; CHECK-NEXT: addi a1, a0, 384 -; CHECK-NEXT: vse8.v v16, (a1) +; CHECK-NEXT: vse8.v v24, (a1) ; CHECK-NEXT: addi a1, a0, 256 -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 4 -; CHECK-NEXT: add a2, sp, a2 -; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: addi a2, sp, 16 ; CHECK-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload ; CHECK-NEXT: vse8.v 
v8, (a1) ; CHECK-NEXT: addi a0, a0, 128 -; CHECK-NEXT: vse8.v v24, (a0) +; CHECK-NEXT: vse8.v v16, (a0) ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 48 -; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -121,10 +65,10 @@ define <512 x i8> @vadd_v512i8_zvl256(<512 x i8> %a, <512 x i8> %b) #1 { ; CHECK-NEXT: addi a1, a0, 256 ; CHECK-NEXT: li a2, 256 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; CHECK-NEXT: vle8.v v24, (a0) -; CHECK-NEXT: vle8.v v0, (a1) -; CHECK-NEXT: vadd.vv v8, v8, v24 -; CHECK-NEXT: vadd.vv v16, v16, v0 +; CHECK-NEXT: vle8.v v24, (a1) +; CHECK-NEXT: vle8.v v0, (a0) +; CHECK-NEXT: vadd.vv v8, v8, v0 +; CHECK-NEXT: vadd.vv v16, v16, v24 ; CHECK-NEXT: ret %c = add <512 x i8> %a, %b ret <512 x i8> %c diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll index 05254e60b65b7..6e173b25adce9 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll @@ -167,15 +167,15 @@ define <256 x i8> @select_v256i8(<256 x i1> %a, <256 x i8> %b, <256 x i8> %c, i3 ; CHECK-NEXT: vmv1r.v v6, v8 ; CHECK-NEXT: vmv1r.v v7, v0 ; CHECK-NEXT: li a2, 128 -; CHECK-NEXT: addi a4, a1, 128 +; CHECK-NEXT: addi a4, a3, -128 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma ; CHECK-NEXT: vle8.v v24, (a0) -; CHECK-NEXT: addi a0, a3, -128 -; CHECK-NEXT: vle8.v v8, (a4) -; CHECK-NEXT: sltu a4, a3, a0 +; CHECK-NEXT: sltu a0, a3, a4 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a4 ; CHECK-NEXT: vle8.v v16, (a1) -; CHECK-NEXT: addi a4, a4, -1 -; CHECK-NEXT: and a0, a4, a0 +; CHECK-NEXT: addi a1, a1, 128 +; CHECK-NEXT: vle8.v v8, (a1) ; CHECK-NEXT: vmv1r.v v0, v6 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma ; CHECK-NEXT: vmerge.vvm v24, v8, v24, v0 @@ -206,40 +206,30 @@ define <256 x i8> @select_evl_v256i8(<256 x i1> %a, <256 x i8> %b, <256 x i8> %c ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: li a3, 24 -; CHECK-NEXT: mul a2, a2, a3 +; CHECK-NEXT: slli a2, a2, 4 ; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 4 +; CHECK-NEXT: slli a2, a2, 3 ; CHECK-NEXT: add a2, sp, a2 ; CHECK-NEXT: addi a2, a2, 16 ; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v9, v0 ; CHECK-NEXT: li a2, 128 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; CHECK-NEXT: vle8.v v16, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: addi a0, a1, 128 +; CHECK-NEXT: vle8.v v16, (a1) +; CHECK-NEXT: addi a3, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; CHECK-NEXT: addi a1, a1, 128 ; CHECK-NEXT: vle8.v v24, (a0) ; CHECK-NEXT: vle8.v v16, (a1) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vmv1r.v v9, v0 ; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; 
CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetivli zero, 1, e8, m8, ta, ma -; CHECK-NEXT: vmerge.vvm v24, v24, v16, v0 +; CHECK-NEXT: vmerge.vvm v24, v16, v24, v0 ; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload @@ -249,8 +239,7 @@ define <256 x i8> @select_evl_v256i8(<256 x i1> %a, <256 x i8> %b, <256 x i8> %c ; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0 ; CHECK-NEXT: vmv8r.v v16, v24 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 24 -; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -418,16 +407,16 @@ define <32 x i64> @select_v32i64(<32 x i1> %a, <32 x i64> %b, <32 x i64> %c, i32 ; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: addi a1, a0, 128 +; CHECK-NEXT: li a3, 16 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v16, (a1) +; CHECK-NEXT: mv a1, a2 ; CHECK-NEXT: vle64.v v24, (a0) -; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: mv a0, a2 -; CHECK-NEXT: bltu a2, a1, .LBB25_2 +; CHECK-NEXT: bltu a2, a3, .LBB25_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a0, 16 +; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: .LBB25_2: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0 ; CHECK-NEXT: addi a0, a2, -16 ; CHECK-NEXT: sltu a1, a2, a0 @@ -456,39 +445,50 @@ define <32 x i64> @select_evl_v32i64(<32 x i1> %a, <32 x i64> %b, <32 x i64> %c) ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: li a2, 24 -; CHECK-NEXT: mul a1, a1, a2 +; CHECK-NEXT: slli a1, a1, 5 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: li a2, 24 +; CHECK-NEXT: mul a1, a1, a2 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: addi a1, a0, 128 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v16, (a1) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: vsetivli zero, 1, e64, m8, ta, ma +; CHECK-NEXT: vle64.v v8, (a1) +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; 
CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0 +; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0 ; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload @@ -497,8 +497,7 @@ define <32 x i64> @select_evl_v32i64(<32 x i1> %a, <32 x i64> %b, <32 x i64> %c) ; CHECK-NEXT: vsetivli zero, 1, e64, m8, ta, ma ; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 24 -; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: slli a0, a0, 5 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -621,13 +620,13 @@ define <64 x float> @select_v64f32(<64 x i1> %a, <64 x float> %b, <64 x float> % ; CHECK-NEXT: li a3, 32 ; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma ; CHECK-NEXT: vle32.v v16, (a1) +; CHECK-NEXT: mv a1, a2 ; CHECK-NEXT: vle32.v v24, (a0) -; CHECK-NEXT: mv a0, a2 ; CHECK-NEXT: bltu a2, a3, .LBB35_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: .LBB35_2: -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0 ; CHECK-NEXT: addi a0, a2, -32 ; CHECK-NEXT: sltu a1, a2, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll index 50184796b38f5..ab44b864d68fa 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll @@ -257,8 +257,8 @@ define <128 x i16> @vwadd_v128i16(ptr %x, ptr %y) nounwind { ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vle8.v v0, (a1) ; CHECK-NEXT: li a0, 64 +; CHECK-NEXT: vle8.v v0, (a1) ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma ; CHECK-NEXT: vslidedown.vx v16, v8, a0 ; CHECK-NEXT: vslidedown.vx v8, v0, a0 @@ -302,8 +302,8 @@ define <64 x i32> @vwadd_v64i32(ptr %x, ptr %y) nounwind { ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vle16.v v0, (a1) ; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vle16.v v0, (a1) ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; CHECK-NEXT: vslidedown.vx v16, v8, a0 ; CHECK-NEXT: vslidedown.vx v8, v0, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll index 98f246b8741dc..01d38fe5daeab 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll @@ -257,8 +257,8 @@ define <128 x i16> @vwaddu_v128i16(ptr %x, ptr %y) nounwind { ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vle8.v v0, (a1) ; CHECK-NEXT: li a0, 64 +; CHECK-NEXT: vle8.v v0, (a1) ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma ; CHECK-NEXT: vslidedown.vx v16, v8, a0 ; CHECK-NEXT: vslidedown.vx v8, v0, a0 @@ -302,8 +302,8 @@ define <64 x i32> @vwaddu_v64i32(ptr %x, ptr %y) nounwind { ; CHECK-NEXT: vle16.v v8, (a0) ; 
CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vle16.v v0, (a1) ; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vle16.v v0, (a1) ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; CHECK-NEXT: vslidedown.vx v16, v8, a0 ; CHECK-NEXT: vslidedown.vx v8, v0, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll index eb7be14abe431..e92e3d554e494 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll @@ -283,8 +283,8 @@ define <128 x i16> @vwmul_v128i16(ptr %x, ptr %y) { ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vle8.v v0, (a1) ; CHECK-NEXT: li a0, 64 +; CHECK-NEXT: vle8.v v0, (a1) ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma ; CHECK-NEXT: vslidedown.vx v16, v8, a0 ; CHECK-NEXT: vslidedown.vx v8, v0, a0 @@ -332,8 +332,8 @@ define <64 x i32> @vwmul_v64i32(ptr %x, ptr %y) { ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vle16.v v0, (a1) ; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vle16.v v0, (a1) ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; CHECK-NEXT: vslidedown.vx v16, v8, a0 ; CHECK-NEXT: vslidedown.vx v8, v0, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll index 8626b25a9d323..a4b0623b06373 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll @@ -275,8 +275,8 @@ define <128 x i16> @vwmulsu_v128i16(ptr %x, ptr %y) { ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vle8.v v0, (a1) ; CHECK-NEXT: li a0, 64 +; CHECK-NEXT: vle8.v v0, (a1) ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma ; CHECK-NEXT: vslidedown.vx v16, v8, a0 ; CHECK-NEXT: vslidedown.vx v8, v0, a0 @@ -324,8 +324,8 @@ define <64 x i32> @vwmulsu_v64i32(ptr %x, ptr %y) { ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vle16.v v0, (a1) ; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vle16.v v0, (a1) ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; CHECK-NEXT: vslidedown.vx v16, v8, a0 ; CHECK-NEXT: vslidedown.vx v8, v0, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll index 007b561a2247a..1d5b255d51098 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll @@ -259,8 +259,8 @@ define <128 x i16> @vwmulu_v128i16(ptr %x, ptr %y) { ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vle8.v v0, (a1) ; CHECK-NEXT: li a0, 64 +; CHECK-NEXT: vle8.v v0, (a1) ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma ; CHECK-NEXT: vslidedown.vx v16, v8, a0 ; CHECK-NEXT: vslidedown.vx v8, v0, a0 @@ -308,8 +308,8 @@ define <64 x i32> @vwmulu_v64i32(ptr %x, ptr %y) { ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vle16.v v0, (a1) ; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vle16.v v0, (a1) ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; CHECK-NEXT: vslidedown.vx v16, v8, a0 ; CHECK-NEXT: vslidedown.vx v8, v0, 
a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll index 7a925165d9816..0654fbbd05f8c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll @@ -257,8 +257,8 @@ define <128 x i16> @vwsub_v128i16(ptr %x, ptr %y) nounwind { ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vle8.v v0, (a1) ; CHECK-NEXT: li a0, 64 +; CHECK-NEXT: vle8.v v0, (a1) ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma ; CHECK-NEXT: vslidedown.vx v16, v8, a0 ; CHECK-NEXT: vslidedown.vx v8, v0, a0 @@ -302,8 +302,8 @@ define <64 x i32> @vwsub_v64i32(ptr %x, ptr %y) nounwind { ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vle16.v v0, (a1) ; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vle16.v v0, (a1) ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; CHECK-NEXT: vslidedown.vx v16, v8, a0 ; CHECK-NEXT: vslidedown.vx v8, v0, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll index 4c08a8c15a388..57f95877b1e36 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll @@ -257,8 +257,8 @@ define <128 x i16> @vwsubu_v128i16(ptr %x, ptr %y) nounwind { ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vle8.v v0, (a1) ; CHECK-NEXT: li a0, 64 +; CHECK-NEXT: vle8.v v0, (a1) ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma ; CHECK-NEXT: vslidedown.vx v16, v8, a0 ; CHECK-NEXT: vslidedown.vx v8, v0, a0 @@ -302,8 +302,8 @@ define <64 x i32> @vwsubu_v64i32(ptr %x, ptr %y) nounwind { ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vle16.v v0, (a1) ; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vle16.v v0, (a1) ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; CHECK-NEXT: vslidedown.vx v16, v8, a0 ; CHECK-NEXT: vslidedown.vx v8, v0, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll index d56e46f7db3ab..d981e5b3ce6ef 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll @@ -1615,26 +1615,24 @@ define @vfmax_vv_nxv16f64( %va, @vfmax_vv_nxv16f64( %va, @vfmax_vv_nxv16f64_unmasked( ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: li a3, 24 -; CHECK-NEXT: mul a1, a1, a3 +; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill @@ -1727,15 +1722,10 @@ define @vfmax_vv_nxv16f64_unmasked( ; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v16, v16 ; CHECK-NEXT: vmfeq.vv v7, v24, v24 -; CHECK-NEXT: vl8re64.v v8, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 
-; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0 +; CHECK-NEXT: vl8re64.v v24, (a0) ; CHECK-NEXT: vfmax.vv v8, v16, v8 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill @@ -1744,27 +1734,21 @@ define @vfmax_vv_nxv16f64_unmasked( ; CHECK-NEXT: mv a2, a1 ; CHECK-NEXT: .LBB41_2: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmfeq.vv v7, v8, v8 -; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0 +; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v8, v8 +; CHECK-NEXT: vmfeq.vv v7, v24, v24 +; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0 ; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0 -; CHECK-NEXT: vfmax.vv v8, v8, v24 +; CHECK-NEXT: vmerge.vvm v24, v24, v8, v0 +; CHECK-NEXT: vfmax.vv v8, v24, v16 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 24 -; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll index 81e4a548f560e..d2a9916b26e22 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll @@ -1615,26 +1615,24 @@ define @vfmin_vv_nxv16f64( %va, @vfmin_vv_nxv16f64( %va, @vfmin_vv_nxv16f64_unmasked( ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: li a3, 24 -; CHECK-NEXT: mul a1, a1, a3 +; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill @@ -1727,15 +1722,10 @@ define @vfmin_vv_nxv16f64_unmasked( ; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v16, v16 ; CHECK-NEXT: vmfeq.vv v7, v24, v24 -; CHECK-NEXT: vl8re64.v v8, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0 +; CHECK-NEXT: vl8re64.v v24, (a0) ; CHECK-NEXT: vfmin.vv v8, v16, v8 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill @@ -1744,27 +1734,21 @@ define @vfmin_vv_nxv16f64_unmasked( ; CHECK-NEXT: mv a2, a1 ; CHECK-NEXT: .LBB41_2: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add a0, sp, a0 -; 
CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmfeq.vv v7, v8, v8 -; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0 +; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v8, v8 +; CHECK-NEXT: vmfeq.vv v7, v24, v24 +; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0 ; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0 -; CHECK-NEXT: vfmin.vv v8, v8, v24 +; CHECK-NEXT: vmerge.vvm v24, v24, v8, v0 +; CHECK-NEXT: vfmin.vv v8, v24, v16 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 24 -; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll index b569efc7447da..f52200b4e7c34 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll @@ -984,10 +984,10 @@ define @fshr_v16i64( %a, @fshr_v16i64( %a, @fshr_v16i64( %a, @fshr_v16i64( %a, @fshr_v16i64( %a, @fshl_v16i64( %a, @fshl_v16i64( %a, @fshl_v16i64( %a, @fshl_v16i64( %a, @fshl_v16i64( %a, @fshl_v16i64( %a, %ptrs0, %ptr ; RV64-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV64-NEXT: vsetvli a3, zero, e8, mf4, ta, ma ; RV64-NEXT: vmv8r.v v16, v8 -; RV64-NEXT: vl8re64.v v24, (a0) -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: vl8re64.v v8, (a1) -; RV64-NEXT: srli a1, a0, 3 -; RV64-NEXT: vslidedown.vx v7, v0, a1 -; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu -; RV64-NEXT: vluxei64.v v24, (zero), v16, v0.t -; RV64-NEXT: vmv1r.v v0, v7 -; RV64-NEXT: addi a1, sp, 16 -; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl8re64.v v24, (a1) +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: srli a3, a1, 3 +; RV64-NEXT: vslidedown.vx v7, v0, a3 +; RV64-NEXT: vl8re64.v v8, (a0) +; RV64-NEXT: vsetvli a0, zero, e64, m8, ta, mu ; RV64-NEXT: vluxei64.v v8, (zero), v16, v0.t -; RV64-NEXT: slli a0, a0, 3 -; RV64-NEXT: add a0, a2, a0 -; RV64-NEXT: vs8r.v v8, (a0) -; RV64-NEXT: vs8r.v v24, (a2) +; RV64-NEXT: vmv1r.v v0, v7 +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vluxei64.v v24, (zero), v16, v0.t +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: vs8r.v v24, (a1) +; RV64-NEXT: vs8r.v v8, (a2) ; RV64-NEXT: csrr a0, vlenb ; RV64-NEXT: slli a0, a0, 3 ; RV64-NEXT: add sp, sp, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll index 77a1f508d2218..6efb61f53614e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll @@ -1894,57 +1894,25 @@ define void @mscatter_nxv16f64( %val0, @bar(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, %w, %x, %y, %z) { ; CHECK-LABEL: bar: ; CHECK: # %bb.0: -; CHECK-NEXT: ld a0, 0(sp) -; CHECK-NEXT: ld a1, 8(sp) +; CHECK-NEXT: ld a0, 8(sp) +; CHECK-NEXT: ld a1, 0(sp) ; CHECK-NEXT: vl8re32.v v24, (a0) ; CHECK-NEXT: vl8re32.v v0, (a1) ; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma -; CHECK-NEXT: vadd.vv v8, v8, v24 -; CHECK-NEXT: vadd.vv 
v16, v16, v0 +; CHECK-NEXT: vadd.vv v8, v8, v0 +; CHECK-NEXT: vadd.vv v16, v16, v24 ; CHECK-NEXT: vadd.vv v8, v8, v16 ; CHECK-NEXT: ret %s0 = add %w, %y diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll index 30ef3dccd426b..9924591257f6c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll @@ -1466,13 +1466,10 @@ define @fcmp_oeq_vv_nxv64bf16( %va, @fcmp_oeq_vv_nxv64bf16( %va, @fcmp_oeq_vv_nxv64bf16( %va, @fcmp_oeq_vv_nxv64f16( %va, @fcmp_oeq_vv_nxv64f16( %va, @fcmp_oeq_vv_nxv64f16( %va, @fcmp_oeq_vv_nxv64f16( %va, @icmp_eq_vv_nxv128i8( %va, @icmp_eq_vv_nxv128i8( %va, @icmp_eq_vv_nxv32i32( %va, , } @vector_deinterleave_load_nxv8i6 ; CHECK-NEXT: li a1, 85 ; CHECK-NEXT: vsetvli a2, zero, e8, mf8, ta, ma ; CHECK-NEXT: vmv.v.x v16, a1 +; CHECK-NEXT: li a1, 170 +; CHECK-NEXT: vmv.v.x v17, a1 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: vl8re64.v v24, (a0) +; CHECK-NEXT: vl8re64.v v0, (a0) ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: li a1, 170 -; CHECK-NEXT: vl8re64.v v0, (a0) -; CHECK-NEXT: vmv.v.x v17, a1 +; CHECK-NEXT: vl8re64.v v24, (a0) ; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; CHECK-NEXT: vcompress.vm v8, v24, v16 +; CHECK-NEXT: vcompress.vm v8, v0, v16 ; CHECK-NEXT: vmv1r.v v12, v16 ; CHECK-NEXT: vmv1r.v v13, v17 -; CHECK-NEXT: vcompress.vm v16, v24, v13 -; CHECK-NEXT: vcompress.vm v24, v0, v12 +; CHECK-NEXT: vcompress.vm v16, v0, v13 +; CHECK-NEXT: vcompress.vm v0, v24, v12 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vcompress.vm v24, v0, v13 +; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vcompress.vm v0, v24, v13 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vmv4r.v v12, v24 diff --git a/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll index 5ee5d40d8313d..4316d5cb403dd 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll @@ -3576,24 +3576,17 @@ define @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64_unmasked( %va, %b, %c, i32 zeroext %evl) { ; CHECK-LABEL: vfma_vv_nxv16f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 5 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: mv a3, a1 -; CHECK-NEXT: slli a1, a1, 1 -; CHECK-NEXT: add a1, a1, a3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a3, a1, 3 ; CHECK-NEXT: add a5, a2, a3 -; CHECK-NEXT: vl8re64.v v8, (a5) -; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: slli a5, a5, 3 -; CHECK-NEXT: add a5, sp, a5 -; CHECK-NEXT: addi 
a5, a5, 16 -; CHECK-NEXT: vs8r.v v8, (a5) # Unknown-size Folded Spill +; CHECK-NEXT: vl8re64.v v0, (a5) ; CHECK-NEXT: sub a5, a4, a1 ; CHECK-NEXT: add a3, a0, a3 ; CHECK-NEXT: vl8re64.v v24, (a3) ; CHECK-NEXT: sltu a3, a4, a5 -; CHECK-NEXT: vl8re64.v v8, (a2) -; CHECK-NEXT: addi a2, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; CHECK-NEXT: vl8re64.v v0, (a0) ; CHECK-NEXT: addi a3, a3, -1 ; CHECK-NEXT: and a3, a3, a5 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma -; CHECK-NEXT: vfmadd.vv v24, v16, v8 +; CHECK-NEXT: vfmadd.vv v24, v16, v0 +; CHECK-NEXT: vl8re64.v v0, (a2) +; CHECK-NEXT: vl8re64.v v16, (a0) ; CHECK-NEXT: bltu a4, a1, .LBB129_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a4, a1 ; CHECK-NEXT: .LBB129_2: -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: mv a1, a0 -; CHECK-NEXT: slli a0, a0, 1 -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a4, e64, m8, ta, ma -; CHECK-NEXT: vfmadd.vv v0, v16, v8 -; CHECK-NEXT: vmv.v.v v8, v0 +; CHECK-NEXT: vfmadd.vv v16, v8, v0 +; CHECK-NEXT: vmv.v.v v8, v16 ; CHECK-NEXT: vmv8r.v v16, v24 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 5 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %v = call @llvm.vp.fma.nxv16f64( %va, %b, %c, splat (i1 true), i32 %evl) ret %v diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll index 901f3cd63fa9e..432994de33321 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll @@ -1108,20 +1108,15 @@ define @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64_unmasked( %va, %b, %c, i32 zeroext %evl) { ; CHECK-LABEL: vfma_vv_nxv16f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 5 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: li a3, 24 -; CHECK-NEXT: mul a1, a1, a3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a3, a1, 3 ; CHECK-NEXT: add a5, a2, a3 -; CHECK-NEXT: vl8re64.v v8, (a5) -; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: slli a5, a5, 3 -; CHECK-NEXT: add a5, sp, a5 -; CHECK-NEXT: addi a5, a5, 16 -; CHECK-NEXT: vs8r.v v8, (a5) # Unknown-size Folded Spill +; CHECK-NEXT: vl8re64.v v0, (a5) ; CHECK-NEXT: sub a5, a4, a1 ; CHECK-NEXT: add a3, a0, a3 ; CHECK-NEXT: vl8re64.v v24, (a3) ; 
CHECK-NEXT: sltu a3, a4, a5 -; CHECK-NEXT: vl8re64.v v8, (a2) -; CHECK-NEXT: addi a2, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; CHECK-NEXT: vl8re64.v v0, (a0) ; CHECK-NEXT: addi a3, a3, -1 ; CHECK-NEXT: and a3, a3, a5 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma -; CHECK-NEXT: vfmadd.vv v24, v16, v8 +; CHECK-NEXT: vfmadd.vv v24, v16, v0 +; CHECK-NEXT: vl8re64.v v0, (a2) +; CHECK-NEXT: vl8re64.v v16, (a0) ; CHECK-NEXT: bltu a4, a1, .LBB93_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a4, a1 ; CHECK-NEXT: .LBB93_2: -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 24 -; CHECK-NEXT: mul a0, a0, a1 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a4, e64, m8, ta, ma -; CHECK-NEXT: vfmadd.vv v0, v16, v8 -; CHECK-NEXT: vmv.v.v v8, v0 +; CHECK-NEXT: vfmadd.vv v16, v8, v0 +; CHECK-NEXT: vmv.v.v v8, v16 ; CHECK-NEXT: vmv8r.v v16, v24 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 5 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %v = call @llvm.vp.fmuladd.nxv16f64( %va, %b, %c, splat (i1 true), i32 %evl) ret %v diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll index 63156e1399293..30fa2a110b295 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll @@ -149,12 +149,12 @@ define @vfptrunc_nxv32f32_nxv32f64( ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma ; CHECK-NEXT: vmv1r.v v7, v0 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: srli a3, a1, 3 @@ -165,18 +165,18 @@ define @vfptrunc_nxv32f32_nxv32f64( ; CHECK-NEXT: add a6, a0, a6 ; CHECK-NEXT: sub a5, a2, a4 ; CHECK-NEXT: vl8re64.v v24, (a6) +; CHECK-NEXT: vsetvli a6, zero, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v16, a3 ; CHECK-NEXT: sltu a6, a2, a5 ; CHECK-NEXT: addi a6, a6, -1 ; CHECK-NEXT: and a5, a6, a5 ; CHECK-NEXT: sub a6, a5, a1 ; CHECK-NEXT: sltu a7, a5, a6 ; CHECK-NEXT: addi a7, a7, -1 -; CHECK-NEXT: vl8re64.v v8, (a0) -; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vx v0, v16, a3 -; CHECK-NEXT: and a0, a7, a6 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: and a6, a7, a6 +; CHECK-NEXT: vsetvli zero, a6, e32, m4, ta, ma ; CHECK-NEXT: vfncvt.f.f.w v20, v24, v0.t +; CHECK-NEXT: vl8re64.v v24, (a0) ; CHECK-NEXT: bltu a5, a1, .LBB8_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a5, a1 @@ -185,7 +185,7 @@ define 
@vfptrunc_nxv32f32_nxv32f64( ; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vx v6, v7, a3 ; CHECK-NEXT: vsetvli zero, a5, e32, m4, ta, ma -; CHECK-NEXT: vfncvt.f.f.w v16, v8, v0.t +; CHECK-NEXT: vfncvt.f.f.w v16, v24, v0.t ; CHECK-NEXT: bltu a2, a4, .LBB8_4 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: mv a2, a4 @@ -195,23 +195,22 @@ define @vfptrunc_nxv32f32_nxv32f64( ; CHECK-NEXT: addi a3, a3, -1 ; CHECK-NEXT: and a0, a3, a0 ; CHECK-NEXT: vmv1r.v v0, v6 -; CHECK-NEXT: addi a3, sp, 16 -; CHECK-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a3, vlenb +; CHECK-NEXT: slli a3, a3, 3 +; CHECK-NEXT: add a3, sp, a3 +; CHECK-NEXT: addi a3, a3, 16 +; CHECK-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; CHECK-NEXT: vfncvt.f.f.w v28, v8, v0.t +; CHECK-NEXT: vfncvt.f.f.w v12, v24, v0.t ; CHECK-NEXT: bltu a2, a1, .LBB8_6 ; CHECK-NEXT: # %bb.5: ; CHECK-NEXT: mv a2, a1 ; CHECK-NEXT: .LBB8_6: ; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, ma -; CHECK-NEXT: vfncvt.f.f.w v24, v8, v0.t -; CHECK-NEXT: vmv8r.v v8, v24 +; CHECK-NEXT: vfncvt.f.f.w v8, v24, v0.t ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll index 4cd77185e6930..cbafda801b2d3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll @@ -566,15 +566,15 @@ define @vpmerge_vv_nxv128i8( %va, %val, ; RV64-NEXT: slli a3, a1, 3 ; RV64-NEXT: add a3, a0, a3 ; RV64-NEXT: vl8re64.v v16, (a3) +; RV64-NEXT: mv a3, a2 ; RV64-NEXT: vl8re64.v v24, (a0) -; RV64-NEXT: mv a0, a2 ; RV64-NEXT: bltu a2, a1, .LBB108_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a0, a1 +; RV64-NEXT: mv a3, a1 ; RV64-NEXT: .LBB108_2: -; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, ma ; RV64-NEXT: vsoxei64.v v8, (zero), v24, v0.t ; RV64-NEXT: sub a0, a2, a1 ; RV64-NEXT: srli a1, a1, 3 diff --git a/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll index d4ebe27420d7b..3642d0fadf5f7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll @@ -365,20 +365,20 @@ define @select_nxv32i32( %a, @select_evl_nxv32i32( %a, @select_nxv16f64( %a, @vtrunc_nxv32i64_nxv32i32( %a, @vtrunc_nxv32i64_nxv32i32( %a, @vtrunc_nxv32i64_nxv32i32( %a, @vtrunc_nxv32i64_nxv32i32( %a,