//
// Performs general IR level optimizations on SVE intrinsics.
//
// This pass performs the following optimizations:
//
// - removes unnecessary reinterpret intrinsics
//   (llvm.aarch64.sve.convert.[to|from].svbool), e.g:
//     %1 = @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
//     %2 = @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
//
// - removes unnecessary ptrue intrinsics (llvm.aarch64.sve.ptrue), e.g:
//     %1 = @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
//     %2 = @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
//     ; (%1 can be replaced with a reinterpret of %2)
//
// - optimizes ptest intrinsics and phi instructions where the operands are
//   being needlessly converted to and from svbool_t.
//
//===----------------------------------------------------------------------===//
@@ -56,8 +62,17 @@ struct SVEIntrinsicOpts : public ModulePass {
56
62
private:
57
63
static IntrinsicInst *isReinterpretToSVBool (Value *V);
58
64
65
+ bool coalescePTrueIntrinsicCalls (BasicBlock &BB,
66
+ SmallSetVector<IntrinsicInst *, 4 > &PTrues);
67
+ bool optimizePTrueIntrinsicCalls (SmallSetVector<Function *, 4 > &Functions);
68
+
69
+ // / Operates at the instruction-scope. I.e., optimizations are applied local
70
+ // / to individual instructions.
59
71
static bool optimizeIntrinsic (Instruction *I);
72
+ bool optimizeIntrinsicCalls (SmallSetVector<Function *, 4 > &Functions);
60
73
74
+ // / Operates at the function-scope. I.e., optimizations are applied local to
75
+ // / the functions themselves.
61
76
bool optimizeFunctions (SmallSetVector<Function *, 4 > &Functions);
62
77
63
78
static bool optimizeConvertFromSVBool (IntrinsicInst *I);
@@ -95,6 +110,188 @@ IntrinsicInst *SVEIntrinsicOpts::isReinterpretToSVBool(Value *V) {
95
110
return I;
96
111
}
97
112
113
+ // / Checks if a ptrue intrinsic call is promoted. The act of promoting a
114
+ // / ptrue will introduce zeroing. For example:
115
+ // /
116
+ // / %1 = <vscale x 4 x i1> call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
117
+ // / %2 = <vscale x 16 x i1> call @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %1)
118
+ // / %3 = <vscale x 8 x i1> call @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %2)
119
+ // /
120
+ // / %1 is promoted, because it is converted:
121
+ // /
122
+ // / <vscale x 4 x i1> => <vscale x 16 x i1> => <vscale x 8 x i1>
123
+ // /
124
+ // / via a sequence of the SVE reinterpret intrinsics convert.{to,from}.svbool.
125
+ bool isPTruePromoted (IntrinsicInst *PTrue) {
126
+ // Find all users of this intrinsic that are calls to convert-to-svbool
127
+ // reinterpret intrinsics.
128
+ SmallVector<IntrinsicInst *, 4 > ConvertToUses;
129
+ for (User *User : PTrue->users ()) {
130
+ if (match (User, m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>())) {
131
+ ConvertToUses.push_back (cast<IntrinsicInst>(User));
132
+ }
133
+ }
134
+
135
+ // If no such calls were found, this is ptrue is not promoted.
136
+ if (ConvertToUses.empty ())
137
+ return false ;
138
+
139
+ // Otherwise, try to find users of the convert-to-svbool intrinsics that are
140
+ // calls to the convert-from-svbool intrinsic, and would result in some lanes
141
+ // being zeroed.
142
+ const auto *PTrueVTy = cast<ScalableVectorType>(PTrue->getType ());
143
+ for (IntrinsicInst *ConvertToUse : ConvertToUses) {
144
+ for (User *User : ConvertToUse->users ()) {
145
+ auto *IntrUser = dyn_cast<IntrinsicInst>(User);
146
+ if (IntrUser && IntrUser->getIntrinsicID () ==
147
+ Intrinsic::aarch64_sve_convert_from_svbool) {
148
+ const auto *IntrUserVTy = cast<ScalableVectorType>(IntrUser->getType ());
149
+
150
+ // Would some lanes become zeroed by the conversion?
151
+ if (IntrUserVTy->getElementCount ().getKnownMinValue () >
152
+ PTrueVTy->getElementCount ().getKnownMinValue ())
153
+ // This is a promoted ptrue.
154
+ return true ;
155
+ }
156
+ }
157
+ }
158
+
159
+ // If no matching calls were found, this is not a promoted ptrue.
160
+ return false ;
161
+ }
162
+
163
+ // / Attempts to coalesce ptrues in a basic block.
164
+ bool SVEIntrinsicOpts::coalescePTrueIntrinsicCalls (
165
+ BasicBlock &BB, SmallSetVector<IntrinsicInst *, 4 > &PTrues) {
166
+ if (PTrues.size () <= 1 )
167
+ return false ;
168
+
169
+ // Find the ptrue with the most lanes.
170
+ auto *MostEncompassingPTrue = *std::max_element (
171
+ PTrues.begin (), PTrues.end (), [](auto *PTrue1, auto *PTrue2) {
172
+ auto *PTrue1VTy = cast<ScalableVectorType>(PTrue1->getType ());
173
+ auto *PTrue2VTy = cast<ScalableVectorType>(PTrue2->getType ());
174
+ return PTrue1VTy->getElementCount ().getKnownMinValue () <
175
+ PTrue2VTy->getElementCount ().getKnownMinValue ();
176
+ });
177
+
178
+ // Remove the most encompassing ptrue, as well as any promoted ptrues, leaving
179
+ // behind only the ptrues to be coalesced.
180
+ PTrues.remove (MostEncompassingPTrue);
181
+ PTrues.remove_if ([](auto *PTrue) { return isPTruePromoted (PTrue); });
182
+
183
+ // Hoist MostEncompassingPTrue to the start of the basic block. It is always
184
+ // safe to do this, since ptrue intrinsic calls are guaranteed to have no
185
+ // predecessors.
186
+ MostEncompassingPTrue->moveBefore (BB, BB.getFirstInsertionPt ());
187
+
188
+ LLVMContext &Ctx = BB.getContext ();
189
+ IRBuilder<> Builder (Ctx);
190
+ Builder.SetInsertPoint (&BB, ++MostEncompassingPTrue->getIterator ());
191
+
192
+ auto *MostEncompassingPTrueVTy =
193
+ cast<VectorType>(MostEncompassingPTrue->getType ());
194
+ auto *ConvertToSVBool = Builder.CreateIntrinsic (
195
+ Intrinsic::aarch64_sve_convert_to_svbool, {MostEncompassingPTrueVTy},
196
+ {MostEncompassingPTrue});
197
+
198
+ for (auto *PTrue : PTrues) {
199
+ auto *PTrueVTy = cast<VectorType>(PTrue->getType ());
200
+
201
+ Builder.SetInsertPoint (&BB, ++ConvertToSVBool->getIterator ());
202
+ auto *ConvertFromSVBool =
203
+ Builder.CreateIntrinsic (Intrinsic::aarch64_sve_convert_from_svbool,
204
+ {PTrueVTy}, {ConvertToSVBool});
205
+ PTrue->replaceAllUsesWith (ConvertFromSVBool);
206
+ PTrue->eraseFromParent ();
207
+ }
208
+
209
+ return true ;
210
+ }
211
+
212
+ // / The goal of this function is to remove redundant calls to the SVE ptrue
213
+ // / intrinsic in each basic block within the given functions.
214
+ // /
215
+ // / SVE ptrues have two representations in LLVM IR:
216
+ // / - a logical representation -- an arbitrary-width scalable vector of i1s,
217
+ // / i.e. <vscale x N x i1>.
218
+ // / - a physical representation (svbool, <vscale x 16 x i1>) -- a 16-element
219
+ // / scalable vector of i1s, i.e. <vscale x 16 x i1>.
220
+ // /
221
+ // / The SVE ptrue intrinsic is used to create a logical representation of an SVE
222
+ // / predicate. Suppose that we have two SVE ptrue intrinsic calls: P1 and P2. If
223
+ // / P1 creates a logical SVE predicate that is at least as wide as the logical
224
+ // / SVE predicate created by P2, then all of the bits that are true in the
225
+ // / physical representation of P2 are necessarily also true in the physical
226
+ // / representation of P1. P1 'encompasses' P2, therefore, the intrinsic call to
227
+ // / P2 is redundant and can be replaced by an SVE reinterpret of P1 via
228
+ // / convert.{to,from}.svbool.
229
+ // /
230
+ // / Currently, this pass only coalesces calls to SVE ptrue intrinsics
231
+ // / if they match the following conditions:
232
+ // /
233
+ // / - the call to the intrinsic uses either the SV_ALL or SV_POW2 patterns.
234
+ // / SV_ALL indicates that all bits of the predicate vector are to be set to
235
+ // / true. SV_POW2 indicates that all bits of the predicate vector up to the
236
+ // / largest power-of-two are to be set to true.
237
+ // / - the result of the call to the intrinsic is not promoted to a wider
238
+ // / predicate. In this case, keeping the extra ptrue leads to better codegen
239
+ // / -- coalescing here would create an irreducible chain of SVE reinterprets
240
+ // / via convert.{to,from}.svbool.
241
+ // /
242
+ // / EXAMPLE:
243
+ // /
244
+ // / %1 = <vscale x 8 x i1> ptrue(i32 SV_ALL)
245
+ // / ; Logical: <1, 1, 1, 1, 1, 1, 1, 1>
246
+ // / ; Physical: <1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0>
247
+ // / ...
248
+ // /
249
+ // / %2 = <vscale x 4 x i1> ptrue(i32 SV_ALL)
250
+ // / ; Logical: <1, 1, 1, 1>
251
+ // / ; Physical: <1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0>
252
+ // / ...
253
+ // /
254
+ // / Here, %2 can be replaced by an SVE reinterpret of %1, giving, for instance:
255
+ // /
256
+ // / %1 = <vscale x 8 x i1> ptrue(i32 i31)
257
+ // / %2 = <vscale x 16 x i1> convert.to.svbool(<vscale x 8 x i1> %1)
258
+ // / %3 = <vscale x 4 x i1> convert.from.svbool(<vscale x 16 x i1> %2)
259
+ // /
260
+ bool SVEIntrinsicOpts::optimizePTrueIntrinsicCalls (
261
+ SmallSetVector<Function *, 4 > &Functions) {
262
+ bool Changed = false ;
263
+
264
+ for (auto *F : Functions) {
265
+ for (auto &BB : *F) {
266
+ SmallSetVector<IntrinsicInst *, 4 > SVAllPTrues;
267
+ SmallSetVector<IntrinsicInst *, 4 > SVPow2PTrues;
268
+
269
+ // For each basic block, collect the used ptrues and try to coalesce them.
270
+ for (Instruction &I : BB) {
271
+ if (I.use_empty ())
272
+ continue ;
273
+
274
+ auto *IntrI = dyn_cast<IntrinsicInst>(&I);
275
+ if (!IntrI || IntrI->getIntrinsicID () != Intrinsic::aarch64_sve_ptrue)
276
+ continue ;
277
+
278
+ const auto PTruePattern =
279
+ cast<ConstantInt>(IntrI->getOperand (0 ))->getZExtValue ();
280
+
281
+ if (PTruePattern == AArch64SVEPredPattern::all)
282
+ SVAllPTrues.insert (IntrI);
283
+ if (PTruePattern == AArch64SVEPredPattern::pow2)
284
+ SVPow2PTrues.insert (IntrI);
285
+ }
286
+
287
+ Changed |= coalescePTrueIntrinsicCalls (BB, SVAllPTrues);
288
+ Changed |= coalescePTrueIntrinsicCalls (BB, SVPow2PTrues);
289
+ }
290
+ }
291
+
292
+ return Changed;
293
+ }
294
+
98
295
// / The function will remove redundant reinterprets casting in the presence
99
296
// / of the control flow
100
297
bool SVEIntrinsicOpts::processPhiNode (IntrinsicInst *X) {
@@ -243,7 +440,7 @@ bool SVEIntrinsicOpts::optimizeIntrinsic(Instruction *I) {
243
440
return true ;
244
441
}
245
442
246
- bool SVEIntrinsicOpts::optimizeFunctions (
443
+ bool SVEIntrinsicOpts::optimizeIntrinsicCalls (
247
444
SmallSetVector<Function *, 4 > &Functions) {
248
445
bool Changed = false ;
249
446
for (auto *F : Functions) {
@@ -260,6 +457,16 @@ bool SVEIntrinsicOpts::optimizeFunctions(
260
457
return Changed;
261
458
}
262
459
460
+ bool SVEIntrinsicOpts::optimizeFunctions (
461
+ SmallSetVector<Function *, 4 > &Functions) {
462
+ bool Changed = false ;
463
+
464
+ Changed |= optimizePTrueIntrinsicCalls (Functions);
465
+ Changed |= optimizeIntrinsicCalls (Functions);
466
+
467
+ return Changed;
468
+ }
469
+
263
470
bool SVEIntrinsicOpts::runOnModule (Module &M) {
264
471
bool Changed = false ;
265
472
SmallSetVector<Function *, 4 > Functions;
@@ -276,6 +483,7 @@ bool SVEIntrinsicOpts::runOnModule(Module &M) {
276
483
case Intrinsic::aarch64_sve_ptest_any:
277
484
case Intrinsic::aarch64_sve_ptest_first:
278
485
case Intrinsic::aarch64_sve_ptest_last:
486
+ case Intrinsic::aarch64_sve_ptrue:
279
487
for (User *U : F.users ())
280
488
Functions.insert (cast<Instruction>(U)->getFunction ());
281
489
break ;
0 commit comments