11
11
// later typically inlined as a chain of efficient hardware comparisons). This
12
12
// typically benefits c++ member or nonmember operator==().
13
13
//
14
- // The basic idea is to replace a larger chain of integer comparisons loaded
15
- // from contiguous memory locations into a smaller chain of such integer
14
+ // The basic idea is to replace a longer chain of integer comparisons loaded
15
+ // from contiguous memory locations into a shorter chain of larger integer
16
16
// comparisons. Benefits are double:
17
17
// - There are less jumps, and therefore less opportunities for mispredictions
18
18
// and I-cache misses.
19
19
// - Code size is smaller, both because jumps are removed and because the
20
20
// encoding of a 2*n byte compare is smaller than that of two n-byte
21
21
// compares.
22
-
22
+ //
23
+ // Example:
24
+ //
25
+ // struct S {
26
+ // int a;
27
+ // char b;
28
+ // char c;
29
+ // uint16_t d;
30
+ // bool operator==(const S& o) const {
31
+ // return a == o.a && b == o.b && c == o.c && d == o.d;
32
+ // }
33
+ // };
34
+ //
35
+ // Is optimized as :
36
+ //
37
+ // bool S::operator==(const S& o) const {
38
+ // return memcmp(this, &o, 8) == 0;
39
+ // }
40
+ //
41
+ // Which will later be expanded (ExpandMemCmp) as a single 8-bytes icmp.
42
+ //
23
43
// ===----------------------------------------------------------------------===//
24
44
25
- #include < algorithm>
26
- #include < numeric>
27
- #include < utility>
28
- #include < vector>
29
45
#include " llvm/Analysis/Loads.h"
30
46
#include " llvm/Analysis/TargetLibraryInfo.h"
31
47
#include " llvm/Analysis/TargetTransformInfo.h"
34
50
#include " llvm/Pass.h"
35
51
#include " llvm/Transforms/Scalar.h"
36
52
#include " llvm/Transforms/Utils/BuildLibCalls.h"
53
+ #include < algorithm>
54
+ #include < numeric>
55
+ #include < utility>
56
+ #include < vector>
37
57
38
58
using namespace llvm ;
39
59
@@ -50,76 +70,95 @@ static bool isSimpleLoadOrStore(const Instruction *I) {
50
70
return false ;
51
71
}
52
72
53
- // A BCE atom.
73
+ // A BCE atom "Binary Compare Expression Atom" represents an integer load
74
+ // that is a constant offset from a base value, e.g. `a` or `o.c` in the example
75
+ // at the top.
54
76
struct BCEAtom {
55
- BCEAtom () : GEP(nullptr ), LoadI(nullptr ), Offset() {}
56
-
57
- const Value *Base () const { return GEP ? GEP->getPointerOperand () : nullptr ; }
58
-
77
+ BCEAtom () = default ;
78
+ BCEAtom (GetElementPtrInst *GEP, LoadInst *LoadI, int BaseId, APInt Offset)
79
+ : GEP(GEP), LoadI(LoadI), BaseId(BaseId), Offset(Offset) {}
80
+
81
+ // We want to order BCEAtoms by (Base, Offset). However we cannot use
82
+ // the pointer values for Base because these are non-deterministic.
83
+ // To make sure that the sort order is stable, we first assign to each atom
84
+ // base value an index based on its order of appearance in the chain of
85
+ // comparisons. We call this index `BaseOrdering`. For example, for:
86
+ // b[3] == c[2] && a[1] == d[1] && b[4] == c[3]
87
+ // | block 1 | | block 2 | | block 3 |
88
+ // b gets assigned index 0 and a index 1, because b appears as LHS in block 1,
89
+ // which is before block 2.
90
+ // We then sort by (BaseOrdering[LHS.Base()], LHS.Offset), which is stable.
59
91
bool operator <(const BCEAtom &O) const {
60
- assert (Base () && " invalid atom" );
61
- assert (O.Base () && " invalid atom" );
62
- // Just ordering by (Base(), Offset) is sufficient. However because this
63
- // means that the ordering will depend on the addresses of the base
64
- // values, which are not reproducible from run to run. To guarantee
65
- // stability, we use the names of the values if they exist; we sort by:
66
- // (Base.getName(), Base(), Offset).
67
- const int NameCmp = Base ()->getName ().compare (O.Base ()->getName ());
68
- if (NameCmp == 0 ) {
69
- if (Base () == O.Base ()) {
70
- return Offset.slt (O.Offset );
71
- }
72
- return Base () < O.Base ();
73
- }
74
- return NameCmp < 0 ;
92
+ return BaseId != O.BaseId ? BaseId < O.BaseId : Offset.slt (O.Offset );
75
93
}
76
94
77
- GetElementPtrInst *GEP;
78
- LoadInst *LoadI;
95
+ GetElementPtrInst *GEP = nullptr ;
96
+ LoadInst *LoadI = nullptr ;
97
+ unsigned BaseId = 0 ;
79
98
APInt Offset;
80
99
};
81
100
101
+ // A class that assigns increasing ids to values in the order in which they are
102
+ // seen. See comment in `BCEAtom::operator<()``.
103
+ class BaseIdentifier {
104
+ public:
105
+ // Returns the id for value `Base`, after assigning one if `Base` has not been
106
+ // seen before.
107
+ int getBaseId (const Value *Base) {
108
+ assert (Base && " invalid base" );
109
+ const auto Insertion = BaseToIndex.try_emplace (Base, Order);
110
+ if (Insertion.second )
111
+ ++Order;
112
+ return Insertion.first ->second ;
113
+ }
114
+
115
+ private:
116
+ unsigned Order = 1 ;
117
+ DenseMap<const Value*, int > BaseToIndex;
118
+ };
119
+
82
120
// If this value is a load from a constant offset w.r.t. a base address, and
83
121
// there are no other users of the load or address, returns the base address and
84
122
// the offset.
85
- BCEAtom visitICmpLoadOperand (Value *const Val) {
86
- BCEAtom Result;
87
- if (auto *const LoadI = dyn_cast<LoadInst>(Val)) {
88
- LLVM_DEBUG (dbgs () << " load\n " );
89
- if (LoadI->isUsedOutsideOfBlock (LoadI->getParent ())) {
90
- LLVM_DEBUG (dbgs () << " used outside of block\n " );
91
- return {};
92
- }
93
- // Do not optimize atomic loads to non-atomic memcmp
94
- if (!LoadI->isSimple ()) {
95
- LLVM_DEBUG (dbgs () << " volatile or atomic\n " );
96
- return {};
97
- }
98
- Value *const Addr = LoadI->getOperand (0 );
99
- if (auto *const GEP = dyn_cast<GetElementPtrInst>(Addr)) {
100
- LLVM_DEBUG (dbgs () << " GEP\n " );
101
- if (GEP->isUsedOutsideOfBlock (LoadI->getParent ())) {
102
- LLVM_DEBUG (dbgs () << " used outside of block\n " );
103
- return {};
104
- }
105
- const auto &DL = GEP->getModule ()->getDataLayout ();
106
- if (!isDereferenceablePointer (GEP, DL)) {
107
- LLVM_DEBUG (dbgs () << " not dereferenceable\n " );
108
- // We need to make sure that we can do comparison in any order, so we
109
- // require memory to be unconditionnally dereferencable.
110
- return {};
111
- }
112
- Result.Offset = APInt (DL.getPointerTypeSizeInBits (GEP->getType ()), 0 );
113
- if (GEP->accumulateConstantOffset (DL, Result.Offset )) {
114
- Result.GEP = GEP;
115
- Result.LoadI = LoadI;
116
- }
117
- }
123
+ BCEAtom visitICmpLoadOperand (Value *const Val, BaseIdentifier &BaseId) {
124
+ auto *const LoadI = dyn_cast<LoadInst>(Val);
125
+ if (!LoadI)
126
+ return {};
127
+ LLVM_DEBUG (dbgs () << " load\n " );
128
+ if (LoadI->isUsedOutsideOfBlock (LoadI->getParent ())) {
129
+ LLVM_DEBUG (dbgs () << " used outside of block\n " );
130
+ return {};
131
+ }
132
+ // Do not optimize atomic loads to non-atomic memcmp
133
+ if (!LoadI->isSimple ()) {
134
+ LLVM_DEBUG (dbgs () << " volatile or atomic\n " );
135
+ return {};
118
136
}
119
- return Result;
137
+ Value *const Addr = LoadI->getOperand (0 );
138
+ auto *const GEP = dyn_cast<GetElementPtrInst>(Addr);
139
+ if (!GEP)
140
+ return {};
141
+ LLVM_DEBUG (dbgs () << " GEP\n " );
142
+ if (GEP->isUsedOutsideOfBlock (LoadI->getParent ())) {
143
+ LLVM_DEBUG (dbgs () << " used outside of block\n " );
144
+ return {};
145
+ }
146
+ const auto &DL = GEP->getModule ()->getDataLayout ();
147
+ if (!isDereferenceablePointer (GEP, DL)) {
148
+ LLVM_DEBUG (dbgs () << " not dereferenceable\n " );
149
+ // We need to make sure that we can do comparison in any order, so we
150
+ // require memory to be unconditionnally dereferencable.
151
+ return {};
152
+ }
153
+ APInt Offset = APInt (DL.getPointerTypeSizeInBits (GEP->getType ()), 0 );
154
+ if (!GEP->accumulateConstantOffset (DL, Offset))
155
+ return {};
156
+ return BCEAtom (GEP, LoadI, BaseId.getBaseId (GEP->getPointerOperand ()),
157
+ Offset);
120
158
}
121
159
122
- // A basic block with a comparison between two BCE atoms.
160
+ // A basic block with a comparison between two BCE atoms, e.g. `a == o.a` in the
161
+ // example at the top.
123
162
// The block might do extra work besides the atom comparison, in which case
124
163
// doesOtherWork() returns true. Under some conditions, the block can be
125
164
// split into the atom comparison part and the "other work" part
@@ -137,9 +176,7 @@ class BCECmpBlock {
137
176
if (Rhs_ < Lhs_) std::swap (Rhs_, Lhs_);
138
177
}
139
178
140
- bool IsValid () const {
141
- return Lhs_.Base () != nullptr && Rhs_.Base () != nullptr ;
142
- }
179
+ bool IsValid () const { return Lhs_.BaseId != 0 && Rhs_.BaseId != 0 ; }
143
180
144
181
// Assert the block is consistent: If valid, it should also have
145
182
// non-null members besides Lhs_ and Rhs_.
@@ -265,7 +302,8 @@ bool BCECmpBlock::doesOtherWork() const {
265
302
// Visit the given comparison. If this is a comparison between two valid
266
303
// BCE atoms, returns the comparison.
267
304
BCECmpBlock visitICmp (const ICmpInst *const CmpI,
268
- const ICmpInst::Predicate ExpectedPredicate) {
305
+ const ICmpInst::Predicate ExpectedPredicate,
306
+ BaseIdentifier &BaseId) {
269
307
// The comparison can only be used once:
270
308
// - For intermediate blocks, as a branch condition.
271
309
// - For the final block, as an incoming value for the Phi.
@@ -275,25 +313,27 @@ BCECmpBlock visitICmp(const ICmpInst *const CmpI,
275
313
LLVM_DEBUG (dbgs () << " cmp has several uses\n " );
276
314
return {};
277
315
}
278
- if (CmpI->getPredicate () == ExpectedPredicate) {
279
- LLVM_DEBUG (dbgs () << " cmp "
280
- << (ExpectedPredicate == ICmpInst::ICMP_EQ ? " eq" : " ne" )
281
- << " \n " );
282
- auto Lhs = visitICmpLoadOperand (CmpI->getOperand (0 ));
283
- if (!Lhs.Base ()) return {};
284
- auto Rhs = visitICmpLoadOperand (CmpI->getOperand (1 ));
285
- if (!Rhs.Base ()) return {};
286
- const auto &DL = CmpI->getModule ()->getDataLayout ();
287
- return BCECmpBlock (std::move (Lhs), std::move (Rhs),
288
- DL.getTypeSizeInBits (CmpI->getOperand (0 )->getType ()));
289
- }
290
- return {};
316
+ if (CmpI->getPredicate () != ExpectedPredicate)
317
+ return {};
318
+ LLVM_DEBUG (dbgs () << " cmp "
319
+ << (ExpectedPredicate == ICmpInst::ICMP_EQ ? " eq" : " ne" )
320
+ << " \n " );
321
+ auto Lhs = visitICmpLoadOperand (CmpI->getOperand (0 ), BaseId);
322
+ if (!Lhs.BaseId )
323
+ return {};
324
+ auto Rhs = visitICmpLoadOperand (CmpI->getOperand (1 ), BaseId);
325
+ if (!Rhs.BaseId )
326
+ return {};
327
+ const auto &DL = CmpI->getModule ()->getDataLayout ();
328
+ return BCECmpBlock (std::move (Lhs), std::move (Rhs),
329
+ DL.getTypeSizeInBits (CmpI->getOperand (0 )->getType ()));
291
330
}
292
331
293
332
// Visit the given comparison block. If this is a comparison between two valid
294
333
// BCE atoms, returns the comparison.
295
334
BCECmpBlock visitCmpBlock (Value *const Val, BasicBlock *const Block,
296
- const BasicBlock *const PhiBlock) {
335
+ const BasicBlock *const PhiBlock,
336
+ BaseIdentifier &BaseId) {
297
337
if (Block->empty ()) return {};
298
338
auto *const BranchI = dyn_cast<BranchInst>(Block->getTerminator ());
299
339
if (!BranchI) return {};
@@ -306,7 +346,7 @@ BCECmpBlock visitCmpBlock(Value *const Val, BasicBlock *const Block,
306
346
auto *const CmpI = dyn_cast<ICmpInst>(Val);
307
347
if (!CmpI) return {};
308
348
LLVM_DEBUG (dbgs () << " icmp\n " );
309
- auto Result = visitICmp (CmpI, ICmpInst::ICMP_EQ);
349
+ auto Result = visitICmp (CmpI, ICmpInst::ICMP_EQ, BaseId );
310
350
Result.CmpI = CmpI;
311
351
Result.BranchI = BranchI;
312
352
return Result;
@@ -323,7 +363,8 @@ BCECmpBlock visitCmpBlock(Value *const Val, BasicBlock *const Block,
323
363
assert (BranchI->getNumSuccessors () == 2 && " expecting a cond branch" );
324
364
BasicBlock *const FalseBlock = BranchI->getSuccessor (1 );
325
365
auto Result = visitICmp (
326
- CmpI, FalseBlock == PhiBlock ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE);
366
+ CmpI, FalseBlock == PhiBlock ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE,
367
+ BaseId);
327
368
Result.CmpI = CmpI;
328
369
Result.BranchI = BranchI;
329
370
return Result;
@@ -335,9 +376,9 @@ static inline void enqueueBlock(std::vector<BCECmpBlock> &Comparisons,
335
376
BCECmpBlock &Comparison) {
336
377
LLVM_DEBUG (dbgs () << " Block '" << Comparison.BB ->getName ()
337
378
<< " ': Found cmp of " << Comparison.SizeBits ()
338
- << " bits between " << Comparison.Lhs ().Base () << " + "
379
+ << " bits between " << Comparison.Lhs ().BaseId << " + "
339
380
<< Comparison.Lhs ().Offset << " and "
340
- << Comparison.Rhs ().Base () << " + "
381
+ << Comparison.Rhs ().BaseId << " + "
341
382
<< Comparison.Rhs ().Offset << " \n " );
342
383
LLVM_DEBUG (dbgs () << " \n " );
343
384
Comparisons.push_back (Comparison);
@@ -360,8 +401,8 @@ class BCECmpChain {
360
401
private:
361
402
static bool IsContiguous (const BCECmpBlock &First,
362
403
const BCECmpBlock &Second) {
363
- return First.Lhs ().Base () == Second.Lhs ().Base () &&
364
- First.Rhs ().Base () == Second.Rhs ().Base () &&
404
+ return First.Lhs ().BaseId == Second.Lhs ().BaseId &&
405
+ First.Rhs ().BaseId == Second.Rhs ().BaseId &&
365
406
First.Lhs ().Offset + First.SizeBits () / 8 == Second.Lhs ().Offset &&
366
407
First.Rhs ().Offset + First.SizeBits () / 8 == Second.Rhs ().Offset ;
367
408
}
@@ -385,11 +426,12 @@ BCECmpChain::BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi,
385
426
assert (!Blocks.empty () && " a chain should have at least one block" );
386
427
// Now look inside blocks to check for BCE comparisons.
387
428
std::vector<BCECmpBlock> Comparisons;
429
+ BaseIdentifier BaseId;
388
430
for (size_t BlockIdx = 0 ; BlockIdx < Blocks.size (); ++BlockIdx) {
389
431
BasicBlock *const Block = Blocks[BlockIdx];
390
432
assert (Block && " invalid block" );
391
433
BCECmpBlock Comparison = visitCmpBlock (Phi.getIncomingValueForBlock (Block),
392
- Block, Phi.getParent ());
434
+ Block, Phi.getParent (), BaseId );
393
435
Comparison.BB = Block;
394
436
if (!Comparison.IsValid ()) {
395
437
LLVM_DEBUG (dbgs () << " chain with invalid BCECmpBlock, no merge.\n " );
@@ -466,9 +508,10 @@ BCECmpChain::BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi,
466
508
#endif // MERGEICMPS_DOT_ON
467
509
// Reorder blocks by LHS. We can do that without changing the
468
510
// semantics because we are only accessing dereferencable memory.
469
- llvm::sort (Comparisons_, [](const BCECmpBlock &a, const BCECmpBlock &b) {
470
- return a.Lhs () < b.Lhs ();
471
- });
511
+ llvm::sort (Comparisons_,
512
+ [](const BCECmpBlock &LhsBlock, const BCECmpBlock &RhsBlock) {
513
+ return LhsBlock.Lhs () < RhsBlock.Lhs ();
514
+ });
472
515
#ifdef MERGEICMPS_DOT_ON
473
516
errs () << " AFTER REORDERING:\n\n " ;
474
517
dump ();
0 commit comments