diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h index b5b83c7ff1164..4a5b2fb0dd613 100644 --- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h +++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h @@ -418,6 +418,22 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo { return true; } + /// Returns true if CopyMI should be considered for register + /// definition rematerialization. Otherwise, returns false. + /// + /// Rematerialization can replace a source register with its value + /// from its definition. It's applied in the register coalescer, + /// after instruction selection and before register allocation. + /// + /// Subtargets can override this method to classify rematerialization + /// candidates. Note that this cannot be defined in tablegen because it + /// operates at a higher level. + virtual bool shouldReMaterializeTrivialRegDef(const MachineInstr *CopyMI, + const Register &DestReg, + const Register &SrcReg) const { + return true; + } + /// Re-issue the specified 'original' instruction at the /// specific location targeting a new destination register. /// The register in Orig->getOperand(0).getReg() will be substituted by diff --git a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h index 45e67d80629cb..c5a7ed19d54dd 100644 --- a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h +++ b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h @@ -185,6 +185,48 @@ class LLVM_ABI TargetSubtargetInfo : public MCSubtargetInfo { return false; } + /// Returns true if CopyMI can be lowered to a zero cycle register move. + /// Otherwise, returns false. + /// + /// Lowering to zero cycle register moves depends on the microarchitecture + /// for the specific architectural registers and instructions supported. 
+ /// Thus, currently it's applied after register allocation, + /// when `ExpandPostRAPseudos` pass calls `TargetInstrInfo::lowerCopy` + /// which in turn calls `TargetInstrInfo::copyPhysReg`. + /// + /// Subtargets can override this method to classify lowering candidates. + /// Note that this cannot be defined in tablegen because it operates at + /// a higher level. + /// + /// NOTE: Subtargets must maintain consistency between the logic here and + /// on lowering. + virtual bool canLowerToZeroCycleRegMove(const MachineInstr *CopyMI, + const Register &DestReg, + const Register &SrcReg) const { + return false; + } + + /// Returns true if CopyMI can be lowered to a zero cycle register zeroing. + /// Otherwise, returns false. + /// + /// Lowering to zero cycle register zeroing depends on the microarchitecture + /// for the specific architectural registers and instructions supported. + /// Thus, currently it takes place after register allocation, + /// when `ExpandPostRAPseudos` pass calls `TargetInstrInfo::lowerCopy` + /// which in turn calls `TargetInstrInfo::copyPhysReg`. + /// + /// Subtargets can override this method to classify lowering candidates. + /// Note that this cannot be defined in tablegen because it operates at + /// a higher level. + /// + /// NOTE: Subtargets must maintain consistency between the logic here and + /// on lowering. + virtual bool canLowerToZeroCycleRegZeroing(const MachineInstr *CopyMI, + const Register &DestReg, + const Register &SrcReg) const { + return false; + } + /// True if the subtarget should run MachineScheduler after aggressive /// coalescing. 
/// diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp index 2d25f097348af..088518914a2b1 100644 --- a/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -69,6 +69,9 @@ STATISTIC(numCrossRCs, "Number of cross class joins performed"); STATISTIC(numCommutes, "Number of instruction commuting performed"); STATISTIC(numExtends, "Number of copies extended"); STATISTIC(NumReMats, "Number of instructions re-materialized"); +STATISTIC(NumReMatsPrevented, + "Number of instruction rematerialization prevented by " + "`shouldReMaterializeTrivialRegDef` hook"); STATISTIC(NumInflated, "Number of register classes inflated"); STATISTIC(NumLaneConflicts, "Number of dead lane conflicts tested"); STATISTIC(NumLaneResolves, "Number of dead lane conflicts resolved"); @@ -1400,6 +1403,12 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP, if (!Edit.canRematerializeAt(RM, ValNo, CopyIdx)) return false; + if (!TII->shouldReMaterializeTrivialRegDef(CopyMI, DstReg, SrcReg)) { + LLVM_DEBUG(dbgs() << "Remat prevented: " << CopyIdx << "\t" << *CopyMI); + ++NumReMatsPrevented; + return false; + } + DebugLoc DL = CopyMI->getDebugLoc(); MachineBasicBlock *MBB = CopyMI->getParent(); MachineBasicBlock::iterator MII = diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td index 05562516e5198..2fd449ef3f5f8 100644 --- a/llvm/lib/Target/AArch64/AArch64Features.td +++ b/llvm/lib/Target/AArch64/AArch64Features.td @@ -624,6 +624,9 @@ def FeatureZCRegMoveFPR64 : SubtargetFeature<"zcm-fpr64", "HasZeroCycleRegMoveFP def FeatureZCRegMoveFPR32 : SubtargetFeature<"zcm-fpr32", "HasZeroCycleRegMoveFPR32", "true", "Has zero-cycle register moves for FPR32 registers">; +def FeatureZCRegMoveFPR128 : SubtargetFeature<"zcm-fpr128", "HasZeroCycleRegMoveFPR128", "true", + "Has zero-cycle register moves for FPR128 registers">; + def FeatureZCZeroingGP : 
SubtargetFeature<"zcz-gp", "HasZeroCycleZeroingGP", "true", "Has zero-cycle zeroing instructions for generic registers">; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 8847c62690714..ee178a1d64234 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -1029,6 +1029,13 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const { } } +bool AArch64InstrInfo::shouldReMaterializeTrivialRegDef( + const MachineInstr *CopyMI, const Register &DestReg, + const Register &SrcReg) const { + return !Subtarget.canLowerToZeroCycleRegMove(CopyMI, DestReg, SrcReg) && + !Subtarget.canLowerToZeroCycleRegZeroing(CopyMI, DestReg, SrcReg); +} + bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) { switch (MI.getOpcode()) { default: @@ -5025,6 +5032,9 @@ void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB, } } +/// NOTE: must maintain consistency with +/// `AArch64Subtarget::canLowerToZeroCycleRegMove` and +/// `AArch64Subtarget::canLowerToZeroCycleRegZeroing`. 
void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg, diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h index 7c255da333e4b..e858e93cab2a4 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -189,6 +189,10 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo { bool isAsCheapAsAMove(const MachineInstr &MI) const override; + bool shouldReMaterializeTrivialRegDef(const MachineInstr *CopyMI, + const Register &DestReg, + const Register &SrcReg) const override; + bool isCoalescableExtInstr(const MachineInstr &MI, Register &SrcReg, Register &DstReg, unsigned &SubIdx) const override; diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td index dcccde4a4d666..e6cc78d213e8e 100644 --- a/llvm/lib/Target/AArch64/AArch64Processors.td +++ b/llvm/lib/Target/AArch64/AArch64Processors.td @@ -313,6 +313,7 @@ def TuneAppleA7 : SubtargetFeature<"apple-a7", "ARMProcFamily", "AppleA7", FeatureStorePairSuppress, FeatureZCRegMoveGPR64, FeatureZCRegMoveFPR64, + FeatureZCRegMoveFPR128, FeatureZCZeroing, FeatureZCZeroingFPWorkaround]>; @@ -327,6 +328,7 @@ def TuneAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10", FeatureStorePairSuppress, FeatureZCRegMoveGPR64, FeatureZCRegMoveFPR64, + FeatureZCRegMoveFPR128, FeatureZCZeroing]>; def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11", @@ -340,6 +342,7 @@ def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11", FeatureStorePairSuppress, FeatureZCRegMoveGPR64, FeatureZCRegMoveFPR64, + FeatureZCRegMoveFPR128, FeatureZCZeroing]>; def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12", @@ -353,6 +356,7 @@ def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12", FeatureStorePairSuppress, FeatureZCRegMoveGPR64, 
FeatureZCRegMoveFPR64, + FeatureZCRegMoveFPR128, FeatureZCZeroing]>; def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13", @@ -366,6 +370,7 @@ def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13", FeatureStorePairSuppress, FeatureZCRegMoveGPR64, FeatureZCRegMoveFPR64, + FeatureZCRegMoveFPR128, FeatureZCZeroing]>; def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14", @@ -384,6 +389,7 @@ def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14", FeatureStorePairSuppress, FeatureZCRegMoveGPR64, FeatureZCRegMoveFPR64, + FeatureZCRegMoveFPR128, FeatureZCZeroing]>; def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15", @@ -402,6 +408,7 @@ def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15", FeatureStorePairSuppress, FeatureZCRegMoveGPR64, FeatureZCRegMoveFPR64, + FeatureZCRegMoveFPR128, FeatureZCZeroing]>; def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16", @@ -420,6 +427,7 @@ def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16", FeatureStorePairSuppress, FeatureZCRegMoveGPR64, FeatureZCRegMoveFPR64, + FeatureZCRegMoveFPR128, FeatureZCZeroing]>; def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17", @@ -438,6 +446,7 @@ def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17", FeatureStorePairSuppress, FeatureZCRegMoveGPR64, FeatureZCRegMoveFPR64, + FeatureZCRegMoveFPR128, FeatureZCZeroing]>; def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4", @@ -455,6 +464,7 @@ def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4", FeatureFuseLiterals, FeatureZCRegMoveGPR64, FeatureZCRegMoveFPR64, + FeatureZCRegMoveFPR128, FeatureZCZeroing ]>; diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp index 68ed10570a52f..015c80371e52a 100644 --- 
a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -667,3 +667,84 @@ AArch64Subtarget::getPtrAuthBlockAddressDiscriminatorIfEnabled( bool AArch64Subtarget::enableMachinePipeliner() const { return getSchedModel().hasInstrSchedModel(); } + +bool AArch64Subtarget::isRegInClass(const MachineInstr *MI, const Register &Reg, + const TargetRegisterClass *TRC) const { + if (Reg.isPhysical()) { + return TRC->contains(Reg); + } else { + const MachineRegisterInfo &MRI = MI->getMF()->getRegInfo(); + return TRC->hasSubClassEq(MRI.getRegClass(Reg)); + } +} + +/// NOTE: must maintain consistency with `AArch64InstrInfo::copyPhysReg`. +bool AArch64Subtarget::canLowerToZeroCycleRegMove( + const MachineInstr *CopyMI, const Register &DestReg, + const Register &SrcReg) const { + if (isRegInClass(CopyMI, DestReg, &AArch64::GPR32allRegClass) && + isRegInClass(CopyMI, SrcReg, &AArch64::GPR32allRegClass) && + DestReg != AArch64::WZR) { + if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP || + SrcReg != AArch64::WZR || !hasZeroCycleZeroingGP()) { + return hasZeroCycleRegMoveGPR64() || hasZeroCycleRegMoveGPR32(); + } + return false; + } + + if (isRegInClass(CopyMI, DestReg, &AArch64::GPR64allRegClass) && + isRegInClass(CopyMI, SrcReg, &AArch64::GPR64allRegClass) && + DestReg != AArch64::XZR) { + if (DestReg == AArch64::SP || SrcReg == AArch64::SP || + SrcReg != AArch64::XZR || !hasZeroCycleZeroingGP()) { + return hasZeroCycleRegMoveGPR64(); + } + return false; + } + + if (isRegInClass(CopyMI, DestReg, &AArch64::FPR128RegClass) && + isRegInClass(CopyMI, SrcReg, &AArch64::FPR128RegClass)) { + return isNeonAvailable() && hasZeroCycleRegMoveFPR128(); + } + + if (isRegInClass(CopyMI, DestReg, &AArch64::FPR64RegClass) && + isRegInClass(CopyMI, SrcReg, &AArch64::FPR64RegClass)) { + return hasZeroCycleRegMoveFPR64(); + } + + if (isRegInClass(CopyMI, DestReg, &AArch64::FPR32RegClass) && + isRegInClass(CopyMI, SrcReg, &AArch64::FPR32RegClass)) { + 
return hasZeroCycleRegMoveFPR32() || hasZeroCycleRegMoveFPR64(); + } + + if (isRegInClass(CopyMI, DestReg, &AArch64::FPR16RegClass) && + isRegInClass(CopyMI, SrcReg, &AArch64::FPR16RegClass)) { + return hasZeroCycleRegMoveFPR32() || hasZeroCycleRegMoveFPR64(); + } + + if (isRegInClass(CopyMI, DestReg, &AArch64::FPR8RegClass) && + isRegInClass(CopyMI, SrcReg, &AArch64::FPR8RegClass)) { + return hasZeroCycleRegMoveFPR32() || hasZeroCycleRegMoveFPR64(); + } + + return false; +} + +/// NOTE: must maintain consistency with `AArch64InstrInfo::copyPhysReg`. +bool AArch64Subtarget::canLowerToZeroCycleRegZeroing( + const MachineInstr *CopyMI, const Register &DestReg, + const Register &SrcReg) const { + if (isRegInClass(CopyMI, DestReg, &AArch64::GPR32allRegClass) && + isRegInClass(CopyMI, SrcReg, &AArch64::GPR32allRegClass) && + DestReg != AArch64::WZR) { + return AArch64::WZR == SrcReg && hasZeroCycleZeroingGP(); + } + + if (isRegInClass(CopyMI, DestReg, &AArch64::GPR64allRegClass) && + isRegInClass(CopyMI, SrcReg, &AArch64::GPR64allRegClass) && + DestReg != AArch64::XZR) { + return AArch64::XZR == SrcReg && hasZeroCycleZeroingGP(); + } + + return false; +} diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h index f95b0fafc607f..e08bd3e496479 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -120,6 +120,12 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo { /// Initialize properties based on the selected processor family. void initializeProperties(bool HasMinSize); + /// Returns true if Reg is virtual and is assigned to, + /// or is physical and is a member of, the TRC register class. + /// Otherwise, returns false. + bool isRegInClass(const MachineInstr *MI, const Register &Reg, + const TargetRegisterClass *TRC) const; + public: /// This constructor initializes the data members to match that /// of the specified triple. 
@@ -163,6 +169,13 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo { bool enableMachinePipeliner() const override; bool useDFAforSMS() const override { return false; } + bool canLowerToZeroCycleRegMove(const MachineInstr *CopyMI, + const Register &DestReg, + const Register &SrcReg) const override; + bool canLowerToZeroCycleRegZeroing(const MachineInstr *CopyMI, + const Register &DestReg, + const Register &SrcReg) const override; + /// Returns ARM processor family. /// Avoid this function! CPU specifics should be kept local to this class /// and preferably modeled with SubtargetFeatures or properties in diff --git a/llvm/test/CodeGen/AArch64/arm64-abi-varargs.ll b/llvm/test/CodeGen/AArch64/arm64-abi-varargs.ll index 1b22514a59d60..890367a761281 100644 --- a/llvm/test/CodeGen/AArch64/arm64-abi-varargs.ll +++ b/llvm/test/CodeGen/AArch64/arm64-abi-varargs.ll @@ -64,18 +64,18 @@ define i32 @main() nounwind ssp { ; CHECK: ; %bb.0: ; CHECK-NEXT: sub sp, sp, #96 ; CHECK-NEXT: stp x29, x30, [sp, #80] ; 16-byte Folded Spill -; CHECK-NEXT: mov w9, #1 ; =0x1 -; CHECK-NEXT: mov w8, #2 ; =0x2 -; CHECK-NEXT: stp w8, w9, [sp, #72] -; CHECK-NEXT: mov w9, #3 ; =0x3 -; CHECK-NEXT: mov w8, #4 ; =0x4 -; CHECK-NEXT: stp w8, w9, [sp, #64] -; CHECK-NEXT: mov w9, #5 ; =0x5 -; CHECK-NEXT: mov w8, #6 ; =0x6 -; CHECK-NEXT: stp w8, w9, [sp, #56] -; CHECK-NEXT: mov w9, #7 ; =0x7 -; CHECK-NEXT: mov w8, #8 ; =0x8 -; CHECK-NEXT: stp w8, w9, [sp, #48] +; CHECK-NEXT: mov w8, #1 ; =0x1 +; CHECK-NEXT: mov w1, #2 ; =0x2 +; CHECK-NEXT: stp w1, w8, [sp, #72] +; CHECK-NEXT: mov w2, #3 ; =0x3 +; CHECK-NEXT: mov w3, #4 ; =0x4 +; CHECK-NEXT: stp w3, w2, [sp, #64] +; CHECK-NEXT: mov w4, #5 ; =0x5 +; CHECK-NEXT: mov w5, #6 ; =0x6 +; CHECK-NEXT: stp w5, w4, [sp, #56] +; CHECK-NEXT: mov w6, #7 ; =0x7 +; CHECK-NEXT: mov w7, #8 ; =0x8 +; CHECK-NEXT: stp w7, w6, [sp, #48] ; CHECK-NEXT: mov w8, #9 ; =0x9 ; CHECK-NEXT: mov w9, #10 ; =0xa ; CHECK-NEXT: stp w9, w8, [sp, #40] @@ -86,13 +86,6 @@ define 
i32 @main() nounwind ssp { ; CHECK-NEXT: str x9, [sp, #8] ; CHECK-NEXT: str w8, [sp] ; CHECK-NEXT: add x0, sp, #76 -; CHECK-NEXT: mov w1, #2 ; =0x2 -; CHECK-NEXT: mov w2, #3 ; =0x3 -; CHECK-NEXT: mov w3, #4 ; =0x4 -; CHECK-NEXT: mov w4, #5 ; =0x5 -; CHECK-NEXT: mov w5, #6 ; =0x6 -; CHECK-NEXT: mov w6, #7 ; =0x7 -; CHECK-NEXT: mov w7, #8 ; =0x8 ; CHECK-NEXT: bl _fn9 ; CHECK-NEXT: mov w0, #0 ; =0x0 ; CHECK-NEXT: ldp x29, x30, [sp, #80] ; 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/arm64-reg-coalesce-remat-zero-cycle-regmov-fpr.mir b/llvm/test/CodeGen/AArch64/arm64-reg-coalesce-remat-zero-cycle-regmov-fpr.mir new file mode 100644 index 0000000000000..303e25edb2b18 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/arm64-reg-coalesce-remat-zero-cycle-regmov-fpr.mir @@ -0,0 +1,174 @@ +# RUN: llc -o - -mtriple=arm64-linux-gnu -run-pass=register-coalescer -verify-coalescing %s | FileCheck %s -check-prefixes=NOTCPU-LINUX +# RUN: llc -o - -mtriple=arm64-apple-macosx -mcpu=generic -run-pass=register-coalescer -verify-coalescing %s | FileCheck %s -check-prefixes=NOTCPU-APPLE +# RUN: llc -o - -mtriple=arm64-apple-macosx -mcpu=apple-m1 -run-pass=register-coalescer -verify-coalescing %s | FileCheck %s -check-prefixes=CPU +# RUN: llc -o - -mtriple=arm64-apple-macosx -mcpu=apple-m1 -mattr=-zcm-fpr64 -mattr=-zcm-fpr128 -run-pass=register-coalescer -verify-coalescing %s | FileCheck %s -check-prefixes=NOTATTR +# RUN: llc -o - -mtriple=arm64-apple-macosx -mattr=+zcm-fpr64 -mattr=+zcm-fpr128 -run-pass=register-coalescer -verify-coalescing %s | FileCheck %s -check-prefixes=ATTR + +--- | + define void @remat_FPR128() { + ret void + } + declare void @foo_v4i32(<4 x float>, <4 x float>) + + define void @remat_FPR64() { + ret void + } + declare void @foo_double(double, double) + + define void @remat_FPR32() { + ret void + } + declare void @foo_float(float, float) + + define void @remat_FPR16() { + ret void + } + declare void @foo_half(half, half) +... 
+--- +name: remat_FPR128 +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: remat_FPR128 + + ; NOTCPU-LINUX: %0:fpr128 = MOVIv2d_ns 64 + ; NOTCPU-LINUX-NEXT: %1:fpr128 = MOVIv2d_ns 64 + ; NOTCPU-LINUX: BL @foo_v4i32 + + ; NOTCPU-APPLE: %0:fpr128 = MOVIv2d_ns 64 + ; NOTCPU-APPLE-NEXT: %1:fpr128 = MOVIv2d_ns 64 + ; NOTCPU-APPLE: BL @foo_v4i32 + + ; CPU: %0:fpr128 = MOVIv2d_ns 64 + ; CPU-NEXT: %1:fpr128 = COPY %0 + ; CPU: BL @foo_v4i32 + + ; NOTATTR: %0:fpr128 = MOVIv2d_ns 64 + ; NOTATTR-NEXT: %1:fpr128 = MOVIv2d_ns 64 + ; NOTATTR: BL @foo_v4i32 + + ; ATTR: %0:fpr128 = MOVIv2d_ns 64 + ; ATTR-NEXT: %1:fpr128 = COPY %0 + ; ATTR: BL @foo_v4i32 + + %0:fpr128 = MOVIv2d_ns 64 + %1:fpr128 = COPY %0 + + ; Creates a live range interference to prevent coalescing and force + ; trying to rematerialize the previous COPY. + %1 = ADDv4i32 %1, %1 + + BL @foo_v4i32, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit %0, implicit %1 + RET_ReallyLR + +--- +name: remat_FPR64 +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: remat_FPR64 + + ; NOTCPU-LINUX: %0:fpr64 = FMOVDi 64 + ; NOTCPU-LINUX-NEXT: %1:fpr64 = FMOVDi 64 + ; NOTCPU-LINUX: BL @foo_double + + ; NOTCPU-APPLE: %0:fpr64 = FMOVDi 64 + ; NOTCPU-APPLE-NEXT: %1:fpr64 = FMOVDi 64 + ; NOTCPU-APPLE: BL @foo_double + + ; CPU: %0:fpr64 = FMOVDi 64 + ; CPU-NEXT: %1:fpr64 = COPY %0 + ; CPU: BL @foo_double + + ; NOTATTR: %0:fpr64 = FMOVDi 64 + ; NOTATTR-NEXT: %1:fpr64 = FMOVDi 64 + ; NOTATTR: BL @foo_double + + ; ATTR: %0:fpr64 = FMOVDi 64 + ; ATTR-NEXT: %1:fpr64 = COPY %0 + ; ATTR: BL @foo_double + + %0:fpr64 = FMOVDi 64 + %1:fpr64 = COPY %0 + + ; Creates a live range interference to prevent coalescing and force + ; trying to rematerialize the previous COPY. 
+ %1 = FADDDrr %1, %1, implicit $fpcr + + BL @foo_double, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit %0, implicit %1 + RET_ReallyLR + +--- +name: remat_FPR32 +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: remat_FPR32 + + ; NOTCPU-LINUX: %0:fpr32 = FMOVSi 64 + ; NOTCPU-LINUX-NEXT: %1:fpr32 = FMOVSi 64 + ; NOTCPU-LINUX: BL @foo_float + + ; NOTCPU-APPLE: %0:fpr32 = FMOVSi 64 + ; NOTCPU-APPLE-NEXT: %1:fpr32 = FMOVSi 64 + ; NOTCPU-APPLE: BL @foo_float + + ; CPU: %0:fpr32 = FMOVSi 64 + ; CPU-NEXT: %1:fpr32 = COPY %0 + ; CPU: BL @foo_float + + ; NOTATTR: %0:fpr32 = FMOVSi 64 + ; NOTATTR-NEXT: %1:fpr32 = FMOVSi 64 + ; NOTATTR: BL @foo_float + + ; ATTR: %0:fpr32 = FMOVSi 64 + ; ATTR-NEXT: %1:fpr32 = COPY %0 + ; ATTR: BL @foo_float + + %0:fpr32 = FMOVSi 64 + %1:fpr32 = COPY %0 + + ; Creates a live range interference to prevent coalescing and force + ; trying to rematerialize the previous COPY. + %1 = FADDSrr %1, %1, implicit $fpcr + + BL @foo_float, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit %0, implicit %1 + RET_ReallyLR + +--- +name: remat_FPR16 +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: remat_FPR16 + + ; NOTCPU-LINUX: %0:fpr16 = FMOVHi 64 + ; NOTCPU-LINUX-NEXT: %1:fpr16 = FMOVHi 64 + ; NOTCPU-LINUX: BL @foo_half + + ; NOTCPU-APPLE: %0:fpr16 = FMOVHi 64 + ; NOTCPU-APPLE-NEXT: %1:fpr16 = FMOVHi 64 + ; NOTCPU-APPLE: BL @foo_half + + ; CPU: %0:fpr16 = FMOVHi 64 + ; CPU-NEXT: %1:fpr16 = COPY %0 + ; CPU: BL @foo_half + + ; NOTATTR: %0:fpr16 = FMOVHi 64 + ; NOTATTR-NEXT: %1:fpr16 = FMOVHi 64 + ; NOTATTR: BL @foo_half + + ; ATTR: %0:fpr16 = FMOVHi 64 + ; ATTR-NEXT: %1:fpr16 = COPY %0 + ; ATTR: BL @foo_half + + %0:fpr16 = FMOVHi 64 + %1:fpr16 = COPY %0 + + ; Creates a live range interference to prevent coalescing and force + ; trying to rematerialize the previous COPY. 
+ %1 = FADDHrr %1, %1, implicit $fpcr + + BL @foo_half, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit %0, implicit %1 + RET_ReallyLR diff --git a/llvm/test/CodeGen/AArch64/arm64-reg-coalesce-remat-zero-cycle-regmov-gpr.mir b/llvm/test/CodeGen/AArch64/arm64-reg-coalesce-remat-zero-cycle-regmov-gpr.mir new file mode 100644 index 0000000000000..6247572b2cf2c --- /dev/null +++ b/llvm/test/CodeGen/AArch64/arm64-reg-coalesce-remat-zero-cycle-regmov-gpr.mir @@ -0,0 +1,90 @@ +# RUN: llc -o - -mtriple=arm64-linux-gnu -run-pass=register-coalescer -verify-coalescing %s | FileCheck %s -check-prefixes=NOTCPU-LINUX +# RUN: llc -o - -mtriple=arm64-apple-macosx -mcpu=generic -run-pass=register-coalescer -verify-coalescing %s | FileCheck %s -check-prefixes=NOTCPU-APPLE +# RUN: llc -o - -mtriple=arm64-apple-macosx -mcpu=apple-m1 -run-pass=register-coalescer -verify-coalescing %s | FileCheck %s -check-prefixes=CPU +# RUN: llc -o - -mtriple=arm64-apple-macosx -mcpu=apple-m1 -mattr=-zcm-gpr64 -run-pass=register-coalescer -verify-coalescing %s | FileCheck %s -check-prefixes=NOTATTR +# RUN: llc -o - -mtriple=arm64-apple-macosx -mattr=+zcm-gpr64 -run-pass=register-coalescer -verify-coalescing %s | FileCheck %s -check-prefixes=ATTR + +--- | + define void @remat_GPR32() { + ret void + } + declare void @foo_i32(i32, i32) + + define void @remat_GPR64() { + ret void + } + declare void @foo_i64(i64, i64) +... 
+--- +name: remat_GPR32 +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: remat_GPR32 + + ; NOTCPU-LINUX: %0:gpr32 = MOVi32imm 32 + ; NOTCPU-LINUX-NEXT: %1:gpr32common = MOVi32imm 32 + ; NOTCPU-LINUX: BL @foo_i32 + + ; NOTCPU-APPLE: %0:gpr32 = MOVi32imm 32 + ; NOTCPU-APPLE-NEXT: %1:gpr32common = MOVi32imm 32 + ; NOTCPU-APPLE: BL @foo_i32 + + ; CPU: %0:gpr32 = MOVi32imm 32 + ; CPU-NEXT: %1:gpr32sp = COPY %0 + ; CPU: BL @foo_i32 + + ; NOTATTR: %0:gpr32 = MOVi32imm 32 + ; NOTATTR-NEXT: %1:gpr32common = MOVi32imm 32 + ; NOTATTR: BL @foo_i32 + + ; ATTR: %0:gpr32 = MOVi32imm 32 + ; ATTR-NEXT: %1:gpr32sp = COPY %0 + ; ATTR: BL @foo_i32 + + %0:gpr32 = MOVi32imm 32 + %1:gpr32sp = COPY %0 + + ; Creates a live range interference to prevent coalescing and force + ; trying to rematerialize the previous COPY. + %1 = ADDWri %1, 1, 0 + + BL @foo_i32, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit %0, implicit %1 + RET_ReallyLR + +--- +name: remat_GPR64 +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: remat_GPR64 + + ; NOTCPU-LINUX: %0:gpr64 = MOVi64imm 64 + ; NOTCPU-LINUX-NEXT: %1:gpr64common = MOVi64imm 64 + ; NOTCPU-LINUX: BL @foo_i64 + + ; NOTCPU-APPLE: %0:gpr64 = MOVi64imm 64 + ; NOTCPU-APPLE-NEXT: %1:gpr64common = MOVi64imm 64 + ; NOTCPU-APPLE: BL @foo_i64 + + ; CPU: %0:gpr64 = MOVi64imm 64 + ; CPU-NEXT: %1:gpr64sp = COPY %0 + ; CPU: BL @foo_i64 + + ; NOTATTR: %0:gpr64 = MOVi64imm 64 + ; NOTATTR-NEXT: %1:gpr64common = MOVi64imm 64 + ; NOTATTR: BL @foo_i64 + + ; ATTR: %0:gpr64 = MOVi64imm 64 + ; ATTR-NEXT: %1:gpr64sp = COPY %0 + ; ATTR: BL @foo_i64 + + %0:gpr64 = MOVi64imm 64 + %1:gpr64sp = COPY %0 + + ; Creates a live range interference to prevent coalescing and force + ; trying to rematerialize the previous COPY. 
+ %1 = ADDXri %1, 1, 0 + + BL @foo_i64, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit %0, implicit %1 + RET_ReallyLR diff --git a/llvm/test/CodeGen/AArch64/arm64-vshuffle.ll b/llvm/test/CodeGen/AArch64/arm64-vshuffle.ll index b225d9a1acaf5..3edec9c9d8fc6 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vshuffle.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vshuffle.ll @@ -4,6 +4,7 @@ define <8 x i1> @test1() { ; CHECK-LABEL: test1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: movi.16b v0, #0 +; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret entry: %Shuff = shufflevector <8 x i1> @test4(ptr %ptr, i32 %v) { -; CHECK-LABEL: _test4: -; CHECK: adrp x[[REG3:[0-9]+]], lCPI3_0@PAGE -; CHECK: ldr q[[REG2:[0-9]+]], [x[[REG3]], lCPI3_0@PAGEOFF] +; CHECK-LABEL: test4: +; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT: Lloh0: +; CHECK-NEXT: adrp x8, lCPI3_0@PAGE +; CHECK-NEXT: Lloh1: +; CHECK-NEXT: ldr q0, [x8, lCPI3_0@PAGEOFF] +; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpLdr Lloh0, Lloh1 bb: %Shuff = shufflevector <16 x i1> zeroinitializer, <16 x i1>