Skip to content

Commit 25c79d9

Browse files
committed
Merging r354128 and r354131:
------------------------------------------------------------------------ r354128 | courbet | 2019-02-15 13:58:06 +0100 (Fri, 15 Feb 2019) | 1 line [MergeICmps][NFC] Improve doc. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r354131 | courbet | 2019-02-15 15:17:17 +0100 (Fri, 15 Feb 2019) | 15 lines [MergeICmps] Make base ordering really deterministic. Summary: The idea is that we now manipulate bases through a `unsigned BaseID` based on order of appearance in the comparison chain rather than through the `Value*`. Fixes 40714. Reviewers: gchatelet Subscribers: mgrang, jfb, jdoerfert, llvm-commits, hans Tags: #llvm Differential Revision: https://reviews.llvm.org/D58274 ------------------------------------------------------------------------ llvm-svn: 354249
1 parent 07a7439 commit 25c79d9

File tree

1 file changed

+135
-92
lines changed

1 file changed

+135
-92
lines changed

llvm/lib/Transforms/Scalar/MergeICmps.cpp

Lines changed: 135 additions & 92 deletions
Original file line numberDiff line numberDiff line change
@@ -11,21 +11,37 @@
1111
// later typically inlined as a chain of efficient hardware comparisons). This
1212
// typically benefits c++ member or nonmember operator==().
1313
//
14-
// The basic idea is to replace a larger chain of integer comparisons loaded
15-
// from contiguous memory locations into a smaller chain of such integer
14+
// The basic idea is to replace a longer chain of integer comparisons loaded
15+
// from contiguous memory locations into a shorter chain of larger integer
1616
// comparisons. Benefits are double:
1717
// - There are less jumps, and therefore less opportunities for mispredictions
1818
// and I-cache misses.
1919
// - Code size is smaller, both because jumps are removed and because the
2020
// encoding of a 2*n byte compare is smaller than that of two n-byte
2121
// compares.
22-
22+
//
23+
// Example:
24+
//
25+
// struct S {
26+
// int a;
27+
// char b;
28+
// char c;
29+
// uint16_t d;
30+
// bool operator==(const S& o) const {
31+
// return a == o.a && b == o.b && c == o.c && d == o.d;
32+
// }
33+
// };
34+
//
35+
// Is optimized as :
36+
//
37+
// bool S::operator==(const S& o) const {
38+
// return memcmp(this, &o, 8) == 0;
39+
// }
40+
//
41+
// Which will later be expanded (ExpandMemCmp) as a single 8-bytes icmp.
42+
//
2343
//===----------------------------------------------------------------------===//
2444

25-
#include <algorithm>
26-
#include <numeric>
27-
#include <utility>
28-
#include <vector>
2945
#include "llvm/Analysis/Loads.h"
3046
#include "llvm/Analysis/TargetLibraryInfo.h"
3147
#include "llvm/Analysis/TargetTransformInfo.h"
@@ -34,6 +50,10 @@
3450
#include "llvm/Pass.h"
3551
#include "llvm/Transforms/Scalar.h"
3652
#include "llvm/Transforms/Utils/BuildLibCalls.h"
53+
#include <algorithm>
54+
#include <numeric>
55+
#include <utility>
56+
#include <vector>
3757

3858
using namespace llvm;
3959

@@ -50,76 +70,95 @@ static bool isSimpleLoadOrStore(const Instruction *I) {
5070
return false;
5171
}
5272

53-
// A BCE atom.
73+
// A BCE atom "Binary Compare Expression Atom" represents an integer load
74+
// that is a constant offset from a base value, e.g. `a` or `o.c` in the example
75+
// at the top.
5476
struct BCEAtom {
55-
BCEAtom() : GEP(nullptr), LoadI(nullptr), Offset() {}
56-
57-
const Value *Base() const { return GEP ? GEP->getPointerOperand() : nullptr; }
58-
77+
BCEAtom() = default;
78+
BCEAtom(GetElementPtrInst *GEP, LoadInst *LoadI, int BaseId, APInt Offset)
79+
: GEP(GEP), LoadI(LoadI), BaseId(BaseId), Offset(Offset) {}
80+
81+
// We want to order BCEAtoms by (Base, Offset). However we cannot use
82+
// the pointer values for Base because these are non-deterministic.
83+
// To make sure that the sort order is stable, we first assign to each atom
84+
// base value an index based on its order of appearance in the chain of
85+
// comparisons. We call this index `BaseOrdering`. For example, for:
86+
// b[3] == c[2] && a[1] == d[1] && b[4] == c[3]
87+
// | block 1 | | block 2 | | block 3 |
88+
// b gets assigned index 0 and a index 1, because b appears as LHS in block 1,
89+
// which is before block 2.
90+
// We then sort by (BaseOrdering[LHS.Base()], LHS.Offset), which is stable.
5991
bool operator<(const BCEAtom &O) const {
60-
assert(Base() && "invalid atom");
61-
assert(O.Base() && "invalid atom");
62-
// Just ordering by (Base(), Offset) is sufficient. However because this
63-
// means that the ordering will depend on the addresses of the base
64-
// values, which are not reproducible from run to run. To guarantee
65-
// stability, we use the names of the values if they exist; we sort by:
66-
// (Base.getName(), Base(), Offset).
67-
const int NameCmp = Base()->getName().compare(O.Base()->getName());
68-
if (NameCmp == 0) {
69-
if (Base() == O.Base()) {
70-
return Offset.slt(O.Offset);
71-
}
72-
return Base() < O.Base();
73-
}
74-
return NameCmp < 0;
92+
return BaseId != O.BaseId ? BaseId < O.BaseId : Offset.slt(O.Offset);
7593
}
7694

77-
GetElementPtrInst *GEP;
78-
LoadInst *LoadI;
95+
GetElementPtrInst *GEP = nullptr;
96+
LoadInst *LoadI = nullptr;
97+
unsigned BaseId = 0;
7998
APInt Offset;
8099
};
81100

101+
// A class that assigns increasing ids to values in the order in which they are
102+
// seen. See comment in `BCEAtom::operator<()``.
103+
class BaseIdentifier {
104+
public:
105+
// Returns the id for value `Base`, after assigning one if `Base` has not been
106+
// seen before.
107+
int getBaseId(const Value *Base) {
108+
assert(Base && "invalid base");
109+
const auto Insertion = BaseToIndex.try_emplace(Base, Order);
110+
if (Insertion.second)
111+
++Order;
112+
return Insertion.first->second;
113+
}
114+
115+
private:
116+
unsigned Order = 1;
117+
DenseMap<const Value*, int> BaseToIndex;
118+
};
119+
82120
// If this value is a load from a constant offset w.r.t. a base address, and
83121
// there are no other users of the load or address, returns the base address and
84122
// the offset.
85-
BCEAtom visitICmpLoadOperand(Value *const Val) {
86-
BCEAtom Result;
87-
if (auto *const LoadI = dyn_cast<LoadInst>(Val)) {
88-
LLVM_DEBUG(dbgs() << "load\n");
89-
if (LoadI->isUsedOutsideOfBlock(LoadI->getParent())) {
90-
LLVM_DEBUG(dbgs() << "used outside of block\n");
91-
return {};
92-
}
93-
// Do not optimize atomic loads to non-atomic memcmp
94-
if (!LoadI->isSimple()) {
95-
LLVM_DEBUG(dbgs() << "volatile or atomic\n");
96-
return {};
97-
}
98-
Value *const Addr = LoadI->getOperand(0);
99-
if (auto *const GEP = dyn_cast<GetElementPtrInst>(Addr)) {
100-
LLVM_DEBUG(dbgs() << "GEP\n");
101-
if (GEP->isUsedOutsideOfBlock(LoadI->getParent())) {
102-
LLVM_DEBUG(dbgs() << "used outside of block\n");
103-
return {};
104-
}
105-
const auto &DL = GEP->getModule()->getDataLayout();
106-
if (!isDereferenceablePointer(GEP, DL)) {
107-
LLVM_DEBUG(dbgs() << "not dereferenceable\n");
108-
// We need to make sure that we can do comparison in any order, so we
109-
// require memory to be unconditionnally dereferencable.
110-
return {};
111-
}
112-
Result.Offset = APInt(DL.getPointerTypeSizeInBits(GEP->getType()), 0);
113-
if (GEP->accumulateConstantOffset(DL, Result.Offset)) {
114-
Result.GEP = GEP;
115-
Result.LoadI = LoadI;
116-
}
117-
}
123+
BCEAtom visitICmpLoadOperand(Value *const Val, BaseIdentifier &BaseId) {
124+
auto *const LoadI = dyn_cast<LoadInst>(Val);
125+
if (!LoadI)
126+
return {};
127+
LLVM_DEBUG(dbgs() << "load\n");
128+
if (LoadI->isUsedOutsideOfBlock(LoadI->getParent())) {
129+
LLVM_DEBUG(dbgs() << "used outside of block\n");
130+
return {};
131+
}
132+
// Do not optimize atomic loads to non-atomic memcmp
133+
if (!LoadI->isSimple()) {
134+
LLVM_DEBUG(dbgs() << "volatile or atomic\n");
135+
return {};
118136
}
119-
return Result;
137+
Value *const Addr = LoadI->getOperand(0);
138+
auto *const GEP = dyn_cast<GetElementPtrInst>(Addr);
139+
if (!GEP)
140+
return {};
141+
LLVM_DEBUG(dbgs() << "GEP\n");
142+
if (GEP->isUsedOutsideOfBlock(LoadI->getParent())) {
143+
LLVM_DEBUG(dbgs() << "used outside of block\n");
144+
return {};
145+
}
146+
const auto &DL = GEP->getModule()->getDataLayout();
147+
if (!isDereferenceablePointer(GEP, DL)) {
148+
LLVM_DEBUG(dbgs() << "not dereferenceable\n");
149+
// We need to make sure that we can do comparison in any order, so we
150+
// require memory to be unconditionnally dereferencable.
151+
return {};
152+
}
153+
APInt Offset = APInt(DL.getPointerTypeSizeInBits(GEP->getType()), 0);
154+
if (!GEP->accumulateConstantOffset(DL, Offset))
155+
return {};
156+
return BCEAtom(GEP, LoadI, BaseId.getBaseId(GEP->getPointerOperand()),
157+
Offset);
120158
}
121159

122-
// A basic block with a comparison between two BCE atoms.
160+
// A basic block with a comparison between two BCE atoms, e.g. `a == o.a` in the
161+
// example at the top.
123162
// The block might do extra work besides the atom comparison, in which case
124163
// doesOtherWork() returns true. Under some conditions, the block can be
125164
// split into the atom comparison part and the "other work" part
@@ -137,9 +176,7 @@ class BCECmpBlock {
137176
if (Rhs_ < Lhs_) std::swap(Rhs_, Lhs_);
138177
}
139178

140-
bool IsValid() const {
141-
return Lhs_.Base() != nullptr && Rhs_.Base() != nullptr;
142-
}
179+
bool IsValid() const { return Lhs_.BaseId != 0 && Rhs_.BaseId != 0; }
143180

144181
// Assert the block is consistent: If valid, it should also have
145182
// non-null members besides Lhs_ and Rhs_.
@@ -265,7 +302,8 @@ bool BCECmpBlock::doesOtherWork() const {
265302
// Visit the given comparison. If this is a comparison between two valid
266303
// BCE atoms, returns the comparison.
267304
BCECmpBlock visitICmp(const ICmpInst *const CmpI,
268-
const ICmpInst::Predicate ExpectedPredicate) {
305+
const ICmpInst::Predicate ExpectedPredicate,
306+
BaseIdentifier &BaseId) {
269307
// The comparison can only be used once:
270308
// - For intermediate blocks, as a branch condition.
271309
// - For the final block, as an incoming value for the Phi.
@@ -275,25 +313,27 @@ BCECmpBlock visitICmp(const ICmpInst *const CmpI,
275313
LLVM_DEBUG(dbgs() << "cmp has several uses\n");
276314
return {};
277315
}
278-
if (CmpI->getPredicate() == ExpectedPredicate) {
279-
LLVM_DEBUG(dbgs() << "cmp "
280-
<< (ExpectedPredicate == ICmpInst::ICMP_EQ ? "eq" : "ne")
281-
<< "\n");
282-
auto Lhs = visitICmpLoadOperand(CmpI->getOperand(0));
283-
if (!Lhs.Base()) return {};
284-
auto Rhs = visitICmpLoadOperand(CmpI->getOperand(1));
285-
if (!Rhs.Base()) return {};
286-
const auto &DL = CmpI->getModule()->getDataLayout();
287-
return BCECmpBlock(std::move(Lhs), std::move(Rhs),
288-
DL.getTypeSizeInBits(CmpI->getOperand(0)->getType()));
289-
}
290-
return {};
316+
if (CmpI->getPredicate() != ExpectedPredicate)
317+
return {};
318+
LLVM_DEBUG(dbgs() << "cmp "
319+
<< (ExpectedPredicate == ICmpInst::ICMP_EQ ? "eq" : "ne")
320+
<< "\n");
321+
auto Lhs = visitICmpLoadOperand(CmpI->getOperand(0), BaseId);
322+
if (!Lhs.BaseId)
323+
return {};
324+
auto Rhs = visitICmpLoadOperand(CmpI->getOperand(1), BaseId);
325+
if (!Rhs.BaseId)
326+
return {};
327+
const auto &DL = CmpI->getModule()->getDataLayout();
328+
return BCECmpBlock(std::move(Lhs), std::move(Rhs),
329+
DL.getTypeSizeInBits(CmpI->getOperand(0)->getType()));
291330
}
292331

293332
// Visit the given comparison block. If this is a comparison between two valid
294333
// BCE atoms, returns the comparison.
295334
BCECmpBlock visitCmpBlock(Value *const Val, BasicBlock *const Block,
296-
const BasicBlock *const PhiBlock) {
335+
const BasicBlock *const PhiBlock,
336+
BaseIdentifier &BaseId) {
297337
if (Block->empty()) return {};
298338
auto *const BranchI = dyn_cast<BranchInst>(Block->getTerminator());
299339
if (!BranchI) return {};
@@ -306,7 +346,7 @@ BCECmpBlock visitCmpBlock(Value *const Val, BasicBlock *const Block,
306346
auto *const CmpI = dyn_cast<ICmpInst>(Val);
307347
if (!CmpI) return {};
308348
LLVM_DEBUG(dbgs() << "icmp\n");
309-
auto Result = visitICmp(CmpI, ICmpInst::ICMP_EQ);
349+
auto Result = visitICmp(CmpI, ICmpInst::ICMP_EQ, BaseId);
310350
Result.CmpI = CmpI;
311351
Result.BranchI = BranchI;
312352
return Result;
@@ -323,7 +363,8 @@ BCECmpBlock visitCmpBlock(Value *const Val, BasicBlock *const Block,
323363
assert(BranchI->getNumSuccessors() == 2 && "expecting a cond branch");
324364
BasicBlock *const FalseBlock = BranchI->getSuccessor(1);
325365
auto Result = visitICmp(
326-
CmpI, FalseBlock == PhiBlock ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE);
366+
CmpI, FalseBlock == PhiBlock ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE,
367+
BaseId);
327368
Result.CmpI = CmpI;
328369
Result.BranchI = BranchI;
329370
return Result;
@@ -335,9 +376,9 @@ static inline void enqueueBlock(std::vector<BCECmpBlock> &Comparisons,
335376
BCECmpBlock &Comparison) {
336377
LLVM_DEBUG(dbgs() << "Block '" << Comparison.BB->getName()
337378
<< "': Found cmp of " << Comparison.SizeBits()
338-
<< " bits between " << Comparison.Lhs().Base() << " + "
379+
<< " bits between " << Comparison.Lhs().BaseId << " + "
339380
<< Comparison.Lhs().Offset << " and "
340-
<< Comparison.Rhs().Base() << " + "
381+
<< Comparison.Rhs().BaseId << " + "
341382
<< Comparison.Rhs().Offset << "\n");
342383
LLVM_DEBUG(dbgs() << "\n");
343384
Comparisons.push_back(Comparison);
@@ -360,8 +401,8 @@ class BCECmpChain {
360401
private:
361402
static bool IsContiguous(const BCECmpBlock &First,
362403
const BCECmpBlock &Second) {
363-
return First.Lhs().Base() == Second.Lhs().Base() &&
364-
First.Rhs().Base() == Second.Rhs().Base() &&
404+
return First.Lhs().BaseId == Second.Lhs().BaseId &&
405+
First.Rhs().BaseId == Second.Rhs().BaseId &&
365406
First.Lhs().Offset + First.SizeBits() / 8 == Second.Lhs().Offset &&
366407
First.Rhs().Offset + First.SizeBits() / 8 == Second.Rhs().Offset;
367408
}
@@ -385,11 +426,12 @@ BCECmpChain::BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi,
385426
assert(!Blocks.empty() && "a chain should have at least one block");
386427
// Now look inside blocks to check for BCE comparisons.
387428
std::vector<BCECmpBlock> Comparisons;
429+
BaseIdentifier BaseId;
388430
for (size_t BlockIdx = 0; BlockIdx < Blocks.size(); ++BlockIdx) {
389431
BasicBlock *const Block = Blocks[BlockIdx];
390432
assert(Block && "invalid block");
391433
BCECmpBlock Comparison = visitCmpBlock(Phi.getIncomingValueForBlock(Block),
392-
Block, Phi.getParent());
434+
Block, Phi.getParent(), BaseId);
393435
Comparison.BB = Block;
394436
if (!Comparison.IsValid()) {
395437
LLVM_DEBUG(dbgs() << "chain with invalid BCECmpBlock, no merge.\n");
@@ -466,9 +508,10 @@ BCECmpChain::BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi,
466508
#endif // MERGEICMPS_DOT_ON
467509
// Reorder blocks by LHS. We can do that without changing the
468510
// semantics because we are only accessing dereferencable memory.
469-
llvm::sort(Comparisons_, [](const BCECmpBlock &a, const BCECmpBlock &b) {
470-
return a.Lhs() < b.Lhs();
471-
});
511+
llvm::sort(Comparisons_,
512+
[](const BCECmpBlock &LhsBlock, const BCECmpBlock &RhsBlock) {
513+
return LhsBlock.Lhs() < RhsBlock.Lhs();
514+
});
472515
#ifdef MERGEICMPS_DOT_ON
473516
errs() << "AFTER REORDERING:\n\n";
474517
dump();

0 commit comments

Comments
 (0)