Skip to content

Commit 579bf00

Browse files
committed
[ARM][AArch64] Allow the CSE to take into consideration uses of the carry and overflow flags in ARM and AArch64
On both of these targets, the cmp can be removed even when the carry or overflow flags are used afterwards: the flags can only differ from what the cmp would have produced in cases where the result is already poison, and in the ANDS case the V flag is always cleared by the ANDS.
1 parent 82e4b83 commit 579bf00

File tree

3 files changed

+268
-71
lines changed

3 files changed

+268
-71
lines changed

llvm/lib/Target/AArch64/AArch64InstrInfo.cpp

Lines changed: 137 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -1745,8 +1745,24 @@ static unsigned sForm(MachineInstr &Instr) {
17451745
return AArch64::SBCSXr;
17461746
case AArch64::ANDWri:
17471747
return AArch64::ANDSWri;
1748+
case AArch64::ANDWrr:
1749+
return AArch64::ANDSWrr;
1750+
case AArch64::ANDWrs:
1751+
return AArch64::ANDSWrs;
1752+
case AArch64::BICWrr:
1753+
return AArch64::BICSWrr;
1754+
case AArch64::BICWrs:
1755+
return AArch64::BICSWrs;
17481756
case AArch64::ANDXri:
17491757
return AArch64::ANDSXri;
1758+
case AArch64::ANDXrr:
1759+
return AArch64::ANDSXrr;
1760+
case AArch64::ANDXrs:
1761+
return AArch64::ANDSXrs;
1762+
case AArch64::BICXrr:
1763+
return AArch64::BICSXrr;
1764+
case AArch64::BICXrs:
1765+
return AArch64::BICSXrs;
17501766
}
17511767
}
17521768

@@ -1884,6 +1900,24 @@ static bool isSUBSRegImm(unsigned Opcode) {
18841900
return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
18851901
}
18861902

1903+
static bool isANDSOpcode(MachineInstr &MI) {
1904+
switch (sForm(MI)) {
1905+
case AArch64::ANDSWri:
1906+
case AArch64::ANDSWrr:
1907+
case AArch64::ANDSWrs:
1908+
case AArch64::ANDSXri:
1909+
case AArch64::ANDSXrr:
1910+
case AArch64::ANDSXrs:
1911+
case AArch64::BICSWrr:
1912+
case AArch64::BICSWrs:
1913+
case AArch64::BICSXrr:
1914+
case AArch64::BICSXrs:
1915+
return true;
1916+
default:
1917+
return false;
1918+
}
1919+
}
1920+
18871921
/// Check if CmpInstr can be substituted by MI.
18881922
///
18891923
/// CmpInstr can be substituted:
@@ -1892,11 +1926,11 @@ static bool isSUBSRegImm(unsigned Opcode) {
18921926
/// - and, condition flags are not alive in successors of the CmpInstr parent
18931927
/// - and, if MI opcode is the S form there must be no defs of flags between
18941928
/// MI and CmpInstr
1895-
/// or if MI opcode is not the S form there must be neither defs of flags
1896-
/// nor uses of flags between MI and CmpInstr.
1929+
/// or if MI opcode is not the S form there must be neither defs of
1930+
/// flags nor uses of flags between MI and CmpInstr.
18971931
/// - and, if C/V flags are not used after CmpInstr
1898-
/// or if N flag is used but MI produces poison value if signed overflow
1899-
/// occurs.
1932+
/// or if N flag is used but MI produces poison value if signed
1933+
/// overflow occurs.
19001934
static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr,
19011935
const TargetRegisterInfo &TRI) {
19021936
// NOTE this assertion guarantees that MI.getOpcode() is add or subtraction
@@ -1912,7 +1946,17 @@ static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr,
19121946
"Caller guarantees that CmpInstr compares with constant 0");
19131947

19141948
std::optional<UsedNZCV> NZVCUsed = examineCFlagsUse(MI, CmpInstr, TRI);
1915-
if (!NZVCUsed || NZVCUsed->C)
1949+
if (!NZVCUsed)
1950+
return false;
1951+
1952+
// CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0', and MI is either
1953+
// '%vreg = add ...' or '%vreg = sub ...'.
1954+
// Condition flag C is used to indicate unsigned overflow.
1955+
// 1) MI and CmpInstr set N and C to the same value if CmpInstr is an ADDS.
1956+
// 2) ADDS x, 0, always sets C to 0.
1957+
// In practice we should not really get here, as an unsigned comparison with
1958+
// 0 should have been optimized out anyway, but just in case.
1959+
if (NZVCUsed->C && !isADDSRegImm(CmpOpcode))
19161960
return false;
19171961

19181962
// CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0', and MI is either
@@ -1921,7 +1965,8 @@ static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr,
19211965
// 1) MI and CmpInstr set N and V to the same value.
19221966
// 2) If MI is add/sub with no-signed-wrap, it produces a poison value when
19231967
// signed overflow occurs, so CmpInstr could still be simplified away.
1924-
if (NZVCUsed->V && !MI.getFlag(MachineInstr::NoSWrap))
1968+
// 3) ANDS also always sets V to 0.
1969+
if (NZVCUsed->V && !MI.getFlag(MachineInstr::NoSWrap) && !isANDSOpcode(MI))
19251970
return false;
19261971

19271972
AccessKind AccessToCheck = AK_Write;
@@ -2099,8 +2144,7 @@ bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
20992144

21002145
if (MI.getOpcode() == AArch64::CATCHRET) {
21012146
// Skip to the first instruction before the epilog.
2102-
const TargetInstrInfo *TII =
2103-
MBB.getParent()->getSubtarget().getInstrInfo();
2147+
const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo();
21042148
MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
21052149
auto MBBI = MachineBasicBlock::iterator(MI);
21062150
MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI);
@@ -2168,16 +2212,16 @@ bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
21682212
.addUse(Reg, RegState::Kill)
21692213
.addImm(0);
21702214
} else {
2171-
// Cases that are larger than +/- 4095 and not a multiple of 8, or larger
2172-
// than 23760.
2173-
// It might be nice to use AArch64::MOVi32imm here, which would get
2174-
// expanded in PreSched2 after PostRA, but our lone scratch Reg already
2175-
// contains the MRS result. findScratchNonCalleeSaveRegister() in
2176-
// AArch64FrameLowering might help us find such a scratch register
2177-
// though. If we failed to find a scratch register, we could emit a
2178-
// stream of add instructions to build up the immediate. Or, we could try
2179-
// to insert a AArch64::MOVi32imm before register allocation so that we
2180-
// didn't need to scavenge for a scratch register.
2215+
// Cases that are larger than +/- 4095 and not a multiple of 8, or
2216+
// larger than 23760. It might be nice to use AArch64::MOVi32imm here,
2217+
// which would get expanded in PreSched2 after PostRA, but our lone
2218+
// scratch Reg already contains the MRS result.
2219+
// findScratchNonCalleeSaveRegister() in AArch64FrameLowering might help
2220+
// us find such a scratch register though. If we failed to find a
2221+
// scratch register, we could emit a stream of add instructions to build
2222+
// up the immediate. Or, we could try to insert an AArch64::MOVi32imm
2223+
// before register allocation so that we didn't need to scavenge for a
2224+
// scratch register.
21812225
report_fatal_error("Unable to encode Stack Protector Guard Offset");
21822226
}
21832227
MBB.erase(MI);
@@ -2437,31 +2481,56 @@ bool AArch64InstrInfo::hasUnscaledLdStOffset(unsigned Opc) {
24372481

24382482
std::optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) {
24392483
switch (Opc) {
2440-
default: return {};
2441-
case AArch64::PRFMui: return AArch64::PRFUMi;
2442-
case AArch64::LDRXui: return AArch64::LDURXi;
2443-
case AArch64::LDRWui: return AArch64::LDURWi;
2444-
case AArch64::LDRBui: return AArch64::LDURBi;
2445-
case AArch64::LDRHui: return AArch64::LDURHi;
2446-
case AArch64::LDRSui: return AArch64::LDURSi;
2447-
case AArch64::LDRDui: return AArch64::LDURDi;
2448-
case AArch64::LDRQui: return AArch64::LDURQi;
2449-
case AArch64::LDRBBui: return AArch64::LDURBBi;
2450-
case AArch64::LDRHHui: return AArch64::LDURHHi;
2451-
case AArch64::LDRSBXui: return AArch64::LDURSBXi;
2452-
case AArch64::LDRSBWui: return AArch64::LDURSBWi;
2453-
case AArch64::LDRSHXui: return AArch64::LDURSHXi;
2454-
case AArch64::LDRSHWui: return AArch64::LDURSHWi;
2455-
case AArch64::LDRSWui: return AArch64::LDURSWi;
2456-
case AArch64::STRXui: return AArch64::STURXi;
2457-
case AArch64::STRWui: return AArch64::STURWi;
2458-
case AArch64::STRBui: return AArch64::STURBi;
2459-
case AArch64::STRHui: return AArch64::STURHi;
2460-
case AArch64::STRSui: return AArch64::STURSi;
2461-
case AArch64::STRDui: return AArch64::STURDi;
2462-
case AArch64::STRQui: return AArch64::STURQi;
2463-
case AArch64::STRBBui: return AArch64::STURBBi;
2464-
case AArch64::STRHHui: return AArch64::STURHHi;
2484+
default:
2485+
return {};
2486+
case AArch64::PRFMui:
2487+
return AArch64::PRFUMi;
2488+
case AArch64::LDRXui:
2489+
return AArch64::LDURXi;
2490+
case AArch64::LDRWui:
2491+
return AArch64::LDURWi;
2492+
case AArch64::LDRBui:
2493+
return AArch64::LDURBi;
2494+
case AArch64::LDRHui:
2495+
return AArch64::LDURHi;
2496+
case AArch64::LDRSui:
2497+
return AArch64::LDURSi;
2498+
case AArch64::LDRDui:
2499+
return AArch64::LDURDi;
2500+
case AArch64::LDRQui:
2501+
return AArch64::LDURQi;
2502+
case AArch64::LDRBBui:
2503+
return AArch64::LDURBBi;
2504+
case AArch64::LDRHHui:
2505+
return AArch64::LDURHHi;
2506+
case AArch64::LDRSBXui:
2507+
return AArch64::LDURSBXi;
2508+
case AArch64::LDRSBWui:
2509+
return AArch64::LDURSBWi;
2510+
case AArch64::LDRSHXui:
2511+
return AArch64::LDURSHXi;
2512+
case AArch64::LDRSHWui:
2513+
return AArch64::LDURSHWi;
2514+
case AArch64::LDRSWui:
2515+
return AArch64::LDURSWi;
2516+
case AArch64::STRXui:
2517+
return AArch64::STURXi;
2518+
case AArch64::STRWui:
2519+
return AArch64::STURWi;
2520+
case AArch64::STRBui:
2521+
return AArch64::STURBi;
2522+
case AArch64::STRHui:
2523+
return AArch64::STURHi;
2524+
case AArch64::STRSui:
2525+
return AArch64::STURSi;
2526+
case AArch64::STRDui:
2527+
return AArch64::STURDi;
2528+
case AArch64::STRQui:
2529+
return AArch64::STURQi;
2530+
case AArch64::STRBBui:
2531+
return AArch64::STURBBi;
2532+
case AArch64::STRHHui:
2533+
return AArch64::STURHHi;
24652534
}
24662535
}
24672536

@@ -2909,8 +2978,8 @@ bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {
29092978
MI.getOperand(IsPreLdSt ? 2 : 1).isFI()) &&
29102979
"Expected a reg or frame index operand.");
29112980

2912-
// For Pre-indexed addressing quadword instructions, the third operand is the
2913-
// immediate value.
2981+
// For Pre-indexed addressing quadword instructions, the third operand is
2982+
// the immediate value.
29142983
bool IsImmPreLdSt = IsPreLdSt && MI.getOperand(3).isImm();
29152984

29162985
if (!MI.getOperand(2).isImm() && !IsImmPreLdSt)
@@ -2951,17 +3020,18 @@ bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {
29513020
return false;
29523021

29533022
// Do not pair any callee-save store/reload instructions in the
2954-
// prologue/epilogue if the CFI information encoded the operations as separate
2955-
// instructions, as that will cause the size of the actual prologue to mismatch
2956-
// with the prologue size recorded in the Windows CFI.
3023+
// prologue/epilogue if the CFI information encoded the operations as
3024+
// separate instructions, as that will cause the size of the actual prologue
3025+
// to mismatch with the prologue size recorded in the Windows CFI.
29573026
const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo();
29583027
bool NeedsWinCFI = MAI->usesWindowsCFI() &&
29593028
MI.getMF()->getFunction().needsUnwindTableEntry();
29603029
if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) ||
29613030
MI.getFlag(MachineInstr::FrameDestroy)))
29623031
return false;
29633032

2964-
// On some CPUs quad load/store pairs are slower than two single load/stores.
3033+
// On some CPUs quad load/store pairs are slower than two single
3034+
// load/stores.
29653035
if (Subtarget.isPaired128Slow()) {
29663036
switch (MI.getOpcode()) {
29673037
default:
@@ -3138,8 +3208,8 @@ bool AArch64InstrInfo::canFoldIntoAddrMode(const MachineInstr &MemI,
31383208
OffsetScale = 1;
31393209

31403210
// If the address instruction is folded into the base register, then the
3141-
// addressing mode must not have a scale. Then we can swap the base and the
3142-
// scaled registers.
3211+
// addressing mode must not have a scale. Then we can swap the base and
3212+
// the scaled registers.
31433213
if (MemI.getOperand(1).getReg() == Reg && OffsetScale != 1)
31443214
return false;
31453215

@@ -3344,8 +3414,8 @@ bool AArch64InstrInfo::canFoldIntoAddrMode(const MachineInstr &MemI,
33443414
}
33453415

33463416
// Given an opcode for an instruction with a [Reg, #Imm] addressing mode,
3347-
// return the opcode of an instruction performing the same operation, but using
3348-
// the [Reg, Reg] addressing mode.
3417+
// return the opcode of an instruction performing the same operation, but
3418+
// using the [Reg, Reg] addressing mode.
33493419
static unsigned regOffsetOpcode(unsigned Opcode) {
33503420
switch (Opcode) {
33513421
default:
@@ -3417,9 +3487,9 @@ static unsigned regOffsetOpcode(unsigned Opcode) {
34173487
}
34183488
}
34193489

3420-
// Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3421-
// the opcode of an instruction performing the same operation, but using the
3422-
// [Reg, #Imm] addressing mode with scaled offset.
3490+
// Given an opcode for an instruction with a [Reg, #Imm] addressing mode,
3491+
// return the opcode of an instruction performing the same operation, but
3492+
// using the [Reg, #Imm] addressing mode with scaled offset.
34233493
unsigned scaledOffsetOpcode(unsigned Opcode, unsigned &Scale) {
34243494
switch (Opcode) {
34253495
default:
@@ -3522,9 +3592,9 @@ unsigned scaledOffsetOpcode(unsigned Opcode, unsigned &Scale) {
35223592
}
35233593
}
35243594

3525-
// Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3526-
// the opcode of an instruction performing the same operation, but using the
3527-
// [Reg, #Imm] addressing mode with unscaled offset.
3595+
// Given an opcode for an instruction with a [Reg, #Imm] addressing mode,
3596+
// return the opcode of an instruction performing the same operation, but
3597+
// using the [Reg, #Imm] addressing mode with unscaled offset.
35283598
unsigned unscaledOffsetOpcode(unsigned Opcode) {
35293599
switch (Opcode) {
35303600
default:
@@ -3597,10 +3667,9 @@ unsigned unscaledOffsetOpcode(unsigned Opcode) {
35973667
}
35983668
}
35993669

3600-
// Given the opcode of a memory load/store instruction, return the opcode of an
3601-
// instruction performing the same operation, but using
3602-
// the [Reg, Reg, {s,u}xtw #N] addressing mode with sign-/zero-extend of the
3603-
// offset register.
3670+
// Given the opcode of a memory load/store instruction, return the opcode of
3671+
// an instruction performing the same operation, but using the [Reg, Reg,
3672+
// {s,u}xtw #N] addressing mode with sign-/zero-extend of the offset register.
36043673
static unsigned offsetExtendOpcode(unsigned Opcode) {
36053674
switch (Opcode) {
36063675
default:
@@ -3740,7 +3809,8 @@ MachineInstr *AArch64InstrInfo::emitLdStWithAddr(MachineInstr &MemI,
37403809

37413810
if (AM.Form == ExtAddrMode::Formula::SExtScaledReg ||
37423811
AM.Form == ExtAddrMode::Formula::ZExtScaledReg) {
3743-
// The new instruction will be in the form `ldr Rt, [Xn, Wm, {s,u}xtw #N]`.
3812+
// The new instruction will be in the form `ldr Rt, [Xn, Wm, {s,u}xtw
3813+
// #N]`.
37443814
assert(AM.ScaledReg && !AM.Displacement &&
37453815
"Address offset can be a register or an immediate, but not both");
37463816
unsigned Opcode = offsetExtendOpcode(MemI.getOpcode());
@@ -4023,8 +4093,8 @@ bool AArch64InstrInfo::getMemOperandWithOffsetWidth(
40234093
return false;
40244094

40254095
// Compute the offset. Offset is calculated as the immediate operand
4026-
// multiplied by the scaling factor. Unscaled instructions have scaling factor
4027-
// set to 1. Postindex are a special case which have an offset of 0.
4096+
// multiplied by the scaling factor. Unscaled instructions have scaling
4097+
// factor set to 1. Postindex are a special case which have an offset of 0.
40284098
if (isPostIndexLdStOpcode(LdSt.getOpcode())) {
40294099
BaseOp = &LdSt.getOperand(2);
40304100
Offset = 0;
@@ -4728,8 +4798,7 @@ bool AArch64InstrInfo::isHForm(const MachineInstr &MI) {
47284798
if (Reg.isPhysical())
47294799
return AArch64::FPR16RegClass.contains(Reg);
47304800
const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
4731-
return TRC == &AArch64::FPR16RegClass ||
4732-
TRC == &AArch64::FPR16_loRegClass;
4801+
return TRC == &AArch64::FPR16RegClass || TRC == &AArch64::FPR16_loRegClass;
47334802
};
47344803
return llvm::any_of(MI.operands(), IsHFPR);
47354804
}

llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3089,17 +3089,41 @@ bool ARMBaseInstrInfo::optimizeCompareInstr(
30893089
break;
30903090
case ARMCC::HS: // C
30913091
case ARMCC::LO: // C
3092-
case ARMCC::VS: // V
3093-
case ARMCC::VC: // V
30943092
case ARMCC::HI: // C Z
30953093
case ARMCC::LS: // C Z
3094+
// The instruction uses the C bit which is not safe.
3095+
return false;
3096+
case ARMCC::VS: // V
3097+
case ARMCC::VC: // V
30963098
case ARMCC::GE: // N V
30973099
case ARMCC::LT: // N V
30983100
case ARMCC::GT: // Z N V
30993101
case ARMCC::LE: // Z N V
3100-
// The instruction uses the V bit or C bit which is not safe.
3102+
{
3103+
// We MAY be able to do this if signed overflow is
3104+
// poison.
3105+
3106+
if (I->getFlag(MachineInstr::NoSWrap)) {
3107+
// Only adds and subs can set the V bit.
3108+
unsigned Opc = I->getOpcode();
3109+
bool IsSub = Opc == ARM::SUBrr || Opc == ARM::t2SUBrr ||
3110+
Opc == ARM::SUBri || Opc == ARM::t2SUBri ||
3111+
Opc == ARM::tSUBrr || Opc == ARM::tSUBi3 ||
3112+
Opc == ARM::tSUBi8;
3113+
3114+
bool IsAdd = Opc == ARM::ADDrr || Opc == ARM::t2ADDrr ||
3115+
Opc == ARM::ADDri || Opc == ARM::t2ADDri ||
3116+
Opc == ARM::tADDrr || Opc == ARM::tADDi3 ||
3117+
Opc == ARM::tADDi8;
3118+
3119+
if (IsSub || IsAdd)
3120+
break;
3121+
}
3122+
3123+
// The instruction uses the V bit which is not safe.
31013124
return false;
31023125
}
3126+
}
31033127
}
31043128
}
31053129
}

0 commit comments

Comments
 (0)