
Commit 626656a

reapply the patch reverted in r116033:
"Reimplement (part of) the or -> add optimization. Matching 'or' into 'add'"

With a critical fix: the add pseudos clobber EFLAGS.

llvm-svn: 116039
1 parent: 00ee155

4 files changed: +126, -65 lines changed

llvm/lib/Target/X86/X86InstrCompiler.td

Lines changed: 59 additions & 21 deletions
@@ -997,6 +997,63 @@ def def32 : PatLeaf<(i32 GR32:$src), [{
 def : Pat<(i64 (zext def32:$src)),
           (SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>;

+//===----------------------------------------------------------------------===//
+// Pattern match OR as ADD
+//===----------------------------------------------------------------------===//
+
+// If safe, we prefer to pattern match OR as ADD at isel time. ADD can be
+// 3-addressified into an LEA instruction to avoid copies. However, we also
+// want to finally emit these instructions as an or at the end of the code
+// generator to make the generated code easier to read. To do this, we select
+// into "disjoint bits" pseudo ops.
+
+// Treat an 'or' node is as an 'add' if the or'ed bits are known to be zero.
+def or_is_add : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),[{
+  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1)))
+    return CurDAG->MaskedValueIsZero(N->getOperand(0), CN->getAPIntValue());
+
+  unsigned BitWidth = N->getValueType(0).getScalarType().getSizeInBits();
+  APInt Mask = APInt::getAllOnesValue(BitWidth);
+  APInt KnownZero0, KnownOne0;
+  CurDAG->ComputeMaskedBits(N->getOperand(0), Mask, KnownZero0, KnownOne0, 0);
+  APInt KnownZero1, KnownOne1;
+  CurDAG->ComputeMaskedBits(N->getOperand(1), Mask, KnownZero1, KnownOne1, 0);
+  return (~KnownZero0 & ~KnownZero1) == 0;
+}]>;
+
+
+// (or x1, x2) -> (add x1, x2) if two operands are known not to share bits.
+let AddedComplexity = 5 in { // Try this before the selecting to OR
+
+let isCommutable = 1, isConvertibleToThreeAddress = 1,
+    Constraints = "$src1 = $dst", Defs = [EFLAGS] in {
+def ADD16rr_DB : I<0, Pseudo, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+                   "", // orw/addw REG, REG
+                   [(set GR16:$dst, (or_is_add GR16:$src1, GR16:$src2))]>;
+def ADD32rr_DB : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+                   "", // orl/addl REG, REG
+                   [(set GR32:$dst, (or_is_add GR32:$src1, GR32:$src2))]>;
+def ADD64rr_DB : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+                   "", // orq/addq REG, REG
+                   [(set GR64:$dst, (or_is_add GR64:$src1, GR64:$src2))]>;
+}
+
+def : Pat<(or_is_add GR16:$src1, imm:$src2),
+          (ADD16ri GR16:$src1, imm:$src2)>;
+def : Pat<(or_is_add GR32:$src1, imm:$src2),
+          (ADD32ri GR32:$src1, imm:$src2)>;
+def : Pat<(or_is_add GR64:$src1, i64immSExt32:$src2),
+          (ADD64ri32 GR64:$src1, i64immSExt32:$src2)>;
+
+def : Pat<(or_is_add GR16:$src1, i16immSExt8:$src2),
+          (ADD16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(or_is_add GR32:$src1, i32immSExt8:$src2),
+          (ADD32ri8 GR32:$src1, i32immSExt8:$src2)>;
+def : Pat<(or_is_add GR64:$src1, i64immSExt8:$src2),
+          (ADD64ri8 GR64:$src1, i64immSExt8:$src2)>;
+} // AddedComplexity
+
+
 //===----------------------------------------------------------------------===//
 // Some peepholes
 //===----------------------------------------------------------------------===//
@@ -1309,27 +1366,8 @@ def : Pat<(i32 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
 def : Pat<(i32 (anyext (i16 (X86setcc_c X86_COND_B, EFLAGS)))),
           (SETB_C32r)>;

-// (or x1, x2) -> (add x1, x2) if two operands are known not to share bits.
-let AddedComplexity = 5 in { // Try this before the selecting to OR
-def : Pat<(or_is_add GR16:$src1, imm:$src2),
-          (ADD16ri GR16:$src1, imm:$src2)>;
-def : Pat<(or_is_add GR32:$src1, imm:$src2),
-          (ADD32ri GR32:$src1, imm:$src2)>;
-def : Pat<(or_is_add GR16:$src1, i16immSExt8:$src2),
-          (ADD16ri8 GR16:$src1, i16immSExt8:$src2)>;
-def : Pat<(or_is_add GR32:$src1, i32immSExt8:$src2),
-          (ADD32ri8 GR32:$src1, i32immSExt8:$src2)>;
-def : Pat<(or_is_add GR16:$src1, GR16:$src2),
-          (ADD16rr GR16:$src1, GR16:$src2)>;
-def : Pat<(or_is_add GR32:$src1, GR32:$src2),
-          (ADD32rr GR32:$src1, GR32:$src2)>;
-def : Pat<(or_is_add GR64:$src1, i64immSExt8:$src2),
-          (ADD64ri8 GR64:$src1, i64immSExt8:$src2)>;
-def : Pat<(or_is_add GR64:$src1, i64immSExt32:$src2),
-          (ADD64ri32 GR64:$src1, i64immSExt32:$src2)>;
-def : Pat<(or_is_add GR64:$src1, GR64:$src2),
-          (ADD64rr GR64:$src1, GR64:$src2)>;
-} // AddedComplexity
+
+

 //===----------------------------------------------------------------------===//
 // EFLAGS-defining Patterns

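Aside (not part of the commit, just an illustration): the or_is_add predicate above relies on a simple identity. When the two operands have no set bits in common, no carries can occur, so x | y and x + y produce the same value; that is what lets isel pick the ADD*rr_DB pseudos, which can later be 3-addressified into LEA. A minimal standalone C++ sketch of the disjoint-bits condition:

// Standalone illustration (not LLVM code): when two values share no set bits,
// OR and ADD agree, which is the condition or_is_add checks via known-zero bits.
#include <cassert>
#include <cstdint>

// True when no bit position is set in both operands; in that case no carries
// can be generated, so X | Y == X + Y.
static bool disjointBits(uint64_t X, uint64_t Y) {
  return (X & Y) == 0;
}

int main() {
  uint64_t Base = 0xFFF0; // low 4 bits known to be zero
  uint64_t Index = 0x7;   // fits entirely within those low bits
  assert(disjointBits(Base, Index));
  assert((Base | Index) == (Base + Index)); // safe to select the OR as an ADD/LEA
  return 0;
}

The Defs = [EFLAGS] on the pseudos is the "critical fix" from the commit message: whether the pseudo is ultimately emitted as an add or an or, the real instruction clobbers the flags, so the pseudo must model that clobber.
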
llvm/lib/Target/X86/X86InstrInfo.cpp

Lines changed: 59 additions & 30 deletions
@@ -54,6 +54,11 @@ ReMatPICStubLoad("remat-pic-stub-load",
 X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
   : TargetInstrInfoImpl(X86Insts, array_lengthof(X86Insts)),
     TM(tm), RI(tm, *this) {
+  enum {
+    TB_NOT_REVERSABLE = 1U << 31,
+    TB_FLAGS = TB_NOT_REVERSABLE
+  };
+
   static const unsigned OpTbl2Addr[][2] = {
     { X86::ADC32ri, X86::ADC32mi },
     { X86::ADC32ri8, X86::ADC32mi8 },
@@ -64,12 +69,15 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
     { X86::ADD16ri, X86::ADD16mi },
     { X86::ADD16ri8, X86::ADD16mi8 },
     { X86::ADD16rr, X86::ADD16mr },
+    { X86::ADD16rr_DB, X86::ADD16mr | TB_NOT_REVERSABLE },
     { X86::ADD32ri, X86::ADD32mi },
     { X86::ADD32ri8, X86::ADD32mi8 },
     { X86::ADD32rr, X86::ADD32mr },
+    { X86::ADD32rr_DB, X86::ADD32mr | TB_NOT_REVERSABLE },
     { X86::ADD64ri32, X86::ADD64mi32 },
     { X86::ADD64ri8, X86::ADD64mi8 },
     { X86::ADD64rr, X86::ADD64mr },
+    { X86::ADD64rr_DB, X86::ADD64mr | TB_NOT_REVERSABLE },
     { X86::ADD8ri, X86::ADD8mi },
     { X86::ADD8rr, X86::ADD8mr },
     { X86::AND16ri, X86::AND16mi },
@@ -214,16 +222,21 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)

   for (unsigned i = 0, e = array_lengthof(OpTbl2Addr); i != e; ++i) {
     unsigned RegOp = OpTbl2Addr[i][0];
-    unsigned MemOp = OpTbl2Addr[i][1];
-    if (!RegOp2MemOpTable2Addr.insert(std::make_pair(RegOp,
-                                                     std::make_pair(MemOp,0))).second)
-      assert(false && "Duplicated entries?");
+    unsigned MemOp = OpTbl2Addr[i][1] & ~TB_FLAGS;
+    assert(!RegOp2MemOpTable2Addr.count(RegOp) && "Duplicated entries?");
+    RegOp2MemOpTable2Addr[RegOp] = std::make_pair(MemOp, 0U);
+
+    // If this is not a reversable operation (because there is a many->one)
+    // mapping, don't insert the reverse of the operation into MemOp2RegOpTable.
+    if (OpTbl2Addr[i][1] & TB_NOT_REVERSABLE)
+      continue;
+
     // Index 0, folded load and store, no alignment requirement.
     unsigned AuxInfo = 0 | (1 << 4) | (1 << 5);
-    if (!MemOp2RegOpTable.insert(std::make_pair(MemOp,
-                                                std::make_pair(RegOp,
-                                                               AuxInfo))).second)
-      assert(false && "Duplicated entries in unfolding maps?");
+
+    assert(!MemOp2RegOpTable.count(MemOp) &&
+           "Duplicated entries in unfolding maps?");
+    MemOp2RegOpTable[MemOp] = std::make_pair(RegOp, AuxInfo);
   }

   // If the third value is 1, then it's folding either a load or a store.
@@ -453,8 +466,11 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
     { X86::ADC32rr, X86::ADC32rm, 0 },
     { X86::ADC64rr, X86::ADC64rm, 0 },
     { X86::ADD16rr, X86::ADD16rm, 0 },
+    { X86::ADD16rr_DB, X86::ADD16rm | TB_NOT_REVERSABLE, 0 },
     { X86::ADD32rr, X86::ADD32rm, 0 },
+    { X86::ADD32rr_DB, X86::ADD32rm | TB_NOT_REVERSABLE, 0 },
     { X86::ADD64rr, X86::ADD64rm, 0 },
+    { X86::ADD64rr_DB, X86::ADD64rm | TB_NOT_REVERSABLE, 0 },
     { X86::ADD8rr, X86::ADD8rm, 0 },
     { X86::ADDPDrr, X86::ADDPDrm, 16 },
     { X86::ADDPSrr, X86::ADDPSrm, 16 },
@@ -649,16 +665,23 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)

   for (unsigned i = 0, e = array_lengthof(OpTbl2); i != e; ++i) {
     unsigned RegOp = OpTbl2[i][0];
-    unsigned MemOp = OpTbl2[i][1];
+    unsigned MemOp = OpTbl2[i][1] & ~TB_FLAGS;
     unsigned Align = OpTbl2[i][2];
-    if (!RegOp2MemOpTable2.insert(std::make_pair(RegOp,
-                                                 std::make_pair(MemOp,Align))).second)
-      assert(false && "Duplicated entries?");
+
+    assert(!RegOp2MemOpTable2.count(RegOp) && "Duplicate entry!");
+    RegOp2MemOpTable2[RegOp] = std::make_pair(MemOp, Align);
+
+
+    // If this is not a reversable operation (because there is a many->one)
+    // mapping, don't insert the reverse of the operation into MemOp2RegOpTable.
+    if (OpTbl2[i][1] & TB_NOT_REVERSABLE)
+      continue;
+
     // Index 2, folded load
     unsigned AuxInfo = 2 | (1 << 4);
-    if (!MemOp2RegOpTable.insert(std::make_pair(MemOp,
-                                                std::make_pair(RegOp, AuxInfo))).second)
-      assert(false && "Duplicated entries in unfolding maps?");
+    assert(!MemOp2RegOpTable.count(MemOp) &&
+           "Duplicated entries in unfolding maps?");
+    MemOp2RegOpTable[MemOp] = std::make_pair(RegOp, AuxInfo);
   }
 }

@@ -1133,7 +1156,8 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
   case X86::ADD16ri8:
     addRegOffset(MIB, leaInReg, true, MI->getOperand(2).getImm());
     break;
-  case X86::ADD16rr: {
+  case X86::ADD16rr:
+  case X86::ADD16rr_DB: {
     unsigned Src2 = MI->getOperand(2).getReg();
     bool isKill2 = MI->getOperand(2).isKill();
     unsigned leaInReg2 = 0;
@@ -1346,18 +1370,27 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
                               Src, isKill, -1);
     break;
   case X86::ADD64rr:
-  case X86::ADD32rr: {
+  case X86::ADD64rr_DB:
+  case X86::ADD32rr:
+  case X86::ADD32rr_DB: {
     assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
-    unsigned Opc = MIOpc == X86::ADD64rr ? X86::LEA64r
-      : (is64Bit ? X86::LEA64_32r : X86::LEA32r);
+    unsigned Opc;
+    TargetRegisterClass *RC;
+    if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB) {
+      Opc = X86::LEA64r;
+      RC = X86::GR64_NOSPRegisterClass;
+    } else {
+      Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
+      RC = X86::GR32_NOSPRegisterClass;
+    }
+
+
     unsigned Src2 = MI->getOperand(2).getReg();
     bool isKill2 = MI->getOperand(2).isKill();

     // LEA can't handle RSP.
     if (TargetRegisterInfo::isVirtualRegister(Src2) &&
-        !MF.getRegInfo().constrainRegClass(Src2,
-                        MIOpc == X86::ADD64rr ? X86::GR64_NOSPRegisterClass :
-                        X86::GR32_NOSPRegisterClass))
+        !MF.getRegInfo().constrainRegClass(Src2, RC))
       return 0;

     NewMI = addRegReg(BuildMI(MF, MI->getDebugLoc(), get(Opc))
@@ -1368,7 +1401,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
       LV->replaceKillInstruction(Src2, MI, NewMI);
     break;
   }
-  case X86::ADD16rr: {
+  case X86::ADD16rr:
+  case X86::ADD16rr_DB: {
     if (DisableLEA16)
       return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
     assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
@@ -2596,13 +2630,8 @@ bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI,
     OpcodeTablePtr = &RegOp2MemOpTable2;
   }

-  if (OpcodeTablePtr) {
-    // Find the Opcode to fuse
-    DenseMap<unsigned, std::pair<unsigned,unsigned> >::const_iterator I =
-      OpcodeTablePtr->find(Opc);
-    if (I != OpcodeTablePtr->end())
-      return true;
-  }
+  if (OpcodeTablePtr && OpcodeTablePtr->count(Opc))
+    return true;
   return TargetInstrInfoImpl::canFoldMemoryOperand(MI, Ops);
 }


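Aside (a sketch with stand-in values, not the actual LLVM tables): the new TB_NOT_REVERSABLE bit exists because the folding tables are now many-to-one — X86::ADD32rr and X86::ADD32rr_DB both fold to X86::ADD32mr — so only the untagged entry may be mirrored into the unfold map, otherwise the duplicate-entry assert would fire. Roughly:

// Hypothetical opcodes and maps, mimicking the table-construction loops above.
#include <cassert>
#include <map>

enum { TB_NOT_REVERSABLE = 1u << 31, TB_FLAGS = TB_NOT_REVERSABLE };

int main() {
  // { register-form opcode, memory-form opcode, possibly tagged with flags }
  const unsigned FoldTable[][2] = {
    { /*ADD32rr*/    1, /*ADD32mr*/ 100 },
    { /*ADD32rr_DB*/ 2, /*ADD32mr*/ 100 | TB_NOT_REVERSABLE },
  };

  std::map<unsigned, unsigned> RegToMem; // fold direction: every entry goes in
  std::map<unsigned, unsigned> MemToReg; // unfold direction: reversible entries only

  for (const auto &Entry : FoldTable) {
    unsigned RegOp = Entry[0];
    unsigned MemOp = Entry[1] & ~TB_FLAGS;   // strip the flag bits
    RegToMem[RegOp] = MemOp;
    if (Entry[1] & TB_NOT_REVERSABLE)
      continue;                              // skip the many->one reverse mapping
    assert(!MemToReg.count(MemOp) && "Duplicated entries in unfolding maps?");
    MemToReg[MemOp] = RegOp;
  }

  assert(MemToReg.at(100) == 1); // the memory form unfolds back to ADD32rr only
  return 0;
}
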
llvm/lib/Target/X86/X86InstrInfo.td

Lines changed: 0 additions & 14 deletions
@@ -544,20 +544,6 @@ def trunc_su : PatFrag<(ops node:$src), (trunc node:$src), [{
   return N->hasOneUse();
 }]>;

-// Treat an 'or' node is as an 'add' if the or'ed bits are known to be zero.
-def or_is_add : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),[{
-  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1)))
-    return CurDAG->MaskedValueIsZero(N->getOperand(0), CN->getAPIntValue());
-
-  unsigned BitWidth = N->getValueType(0).getScalarType().getSizeInBits();
-  APInt Mask = APInt::getAllOnesValue(BitWidth);
-  APInt KnownZero0, KnownOne0;
-  CurDAG->ComputeMaskedBits(N->getOperand(0), Mask, KnownZero0, KnownOne0, 0);
-  APInt KnownZero1, KnownOne1;
-  CurDAG->ComputeMaskedBits(N->getOperand(1), Mask, KnownZero1, KnownOne1, 0);
-  return (~KnownZero0 & ~KnownZero1) == 0;
-}]>;
-
 //===----------------------------------------------------------------------===//
 // Instruction list.
 //

llvm/lib/Target/X86/X86MCInstLower.cpp

Lines changed: 8 additions & 0 deletions
@@ -347,6 +347,7 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
   }

   // Handle a few special cases to eliminate operand modifiers.
+ReSimplify:
   switch (OutMI.getOpcode()) {
   case X86::LEA64_32r: // Handle 'subreg rewriting' for the lea64_32mem operand.
     lower_lea64_32mem(&OutMI, 1);
@@ -433,6 +434,13 @@
     break;
   }

+  // These are pseudo-ops for OR to help with the OR->ADD transformation. We do
+  // this with an ugly goto in case the resultant OR uses EAX and needs the
+  // short form.
+  case X86::ADD16rr_DB: OutMI.setOpcode(X86::OR16rr); goto ReSimplify;
+  case X86::ADD32rr_DB: OutMI.setOpcode(X86::OR32rr); goto ReSimplify;
+  case X86::ADD64rr_DB: OutMI.setOpcode(X86::OR64rr); goto ReSimplify;
+
   // The assembler backend wants to see branches in their small form and relax
   // them to their large form. The JIT can only handle the large form because
   // it does not do relaxation. For now, translate the large form to the

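Aside (simplified sketch with made-up opcodes, not the actual lowering code): the ReSimplify label plus goto lets a _DB pseudo be rewritten to a plain OR opcode and then re-enter the same switch, so any later special case for the new opcode (such as a short form) still gets a chance to fire. The shape of that control flow:

// Hypothetical opcodes; only the control-flow pattern mirrors X86MCInstLower.
#include <cassert>

enum Opcode { ADD32rr_DB, OR32rr, OR32rr_ShortForm };

static Opcode lower(Opcode Opc, bool UsesEAX) {
ReSimplify:
  switch (Opc) {
  case ADD32rr_DB:
    Opc = OR32rr;             // emit the pseudo as a plain OR
    goto ReSimplify;          // re-run the simplifications on the new opcode
  case OR32rr:
    if (UsesEAX)
      Opc = OR32rr_ShortForm; // stand-in for a later special case
    return Opc;
  default:
    return Opc;
  }
}

int main() {
  assert(lower(ADD32rr_DB, /*UsesEAX=*/true) == OR32rr_ShortForm);
  assert(lower(ADD32rr_DB, /*UsesEAX=*/false) == OR32rr);
  return 0;
}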