
Commit c9439ca

Authored and committed by Joe Ellis
[AArch64][SVE] Coalesce calls to the SVE ptrue intrinsic where possible
It is possible to eliminate redundant calls to the SVE ptrue intrinsic. For example, suppose that we have two SVE ptrue intrinsic calls, P1 and P2. If P1 is at least as wide as P2, then P2 can be written as a reinterpret of P1 using the SVE reinterpret intrinsics.

Coalescing ptrue intrinsics can result in fewer ptrue instructions in the codegen, and is conducive to better analysis further down the line.

This commit extends the aarch64-sve-intrinsic-opts pass to support coalescing ptrue intrinsic calls.

Differential Revision: https://reviews.llvm.org/D94230
1 parent 5626adc commit c9439ca
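
As an illustrative IR sketch of the rewrite (value names and widths here are chosen for illustration and are not taken from the patch), a narrower <vscale x 4 x i1> ptrue can be rebuilt as a reinterpret of a wider <vscale x 8 x i1> ptrue that uses the same pattern:

```llvm
; Before: two ptrue calls in the same block, both using pattern 31 (SV_ALL).
%p1 = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
%p2 = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)

; After: the narrower %p2 is recreated as a reinterpret of the wider %p1, so
; only one ptrue remains.
%p1 = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
%b  = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %p1)
%p2 = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %b)
```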

File tree

2 files changed: +404 -7 lines


llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp

Lines changed: 215 additions & 7 deletions
@@ -9,14 +9,20 @@
 //
 // Performs general IR level optimizations on SVE intrinsics.
 //
-// The main goal of this pass is to remove unnecessary reinterpret
-// intrinsics (llvm.aarch64.sve.convert.[to|from].svbool), e.g:
+// This pass performs the following optimizations:
 //
-// %1 = @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
-// %2 = @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
+// - removes unnecessary reinterpret intrinsics
+//   (llvm.aarch64.sve.convert.[to|from].svbool), e.g:
+//     %1 = @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
+//     %2 = @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
 //
-// This pass also looks for ptest intrinsics & phi instructions where the
-// operands are being needlessly converted to and from svbool_t.
+// - removes unnecessary ptrue intrinsics (llvm.aarch64.sve.ptrue), e.g:
+//     %1 = @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+//     %2 = @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+//     ; (%1 can be replaced with a reinterpret of %2)
+//
+// - optimizes ptest intrinsics and phi instructions where the operands are
+//   being needlessly converted to and from svbool_t.
 //
 //===----------------------------------------------------------------------===//
 
@@ -56,8 +62,17 @@ struct SVEIntrinsicOpts : public ModulePass {
 private:
   static IntrinsicInst *isReinterpretToSVBool(Value *V);
 
+  bool coalescePTrueIntrinsicCalls(BasicBlock &BB,
+                                   SmallSetVector<IntrinsicInst *, 4> &PTrues);
+  bool optimizePTrueIntrinsicCalls(SmallSetVector<Function *, 4> &Functions);
+
+  /// Operates at the instruction-scope. I.e., optimizations are applied local
+  /// to individual instructions.
   static bool optimizeIntrinsic(Instruction *I);
+  bool optimizeIntrinsicCalls(SmallSetVector<Function *, 4> &Functions);
 
+  /// Operates at the function-scope. I.e., optimizations are applied local to
+  /// the functions themselves.
   bool optimizeFunctions(SmallSetVector<Function *, 4> &Functions);
 
   static bool optimizeConvertFromSVBool(IntrinsicInst *I);
@@ -95,6 +110,188 @@ IntrinsicInst *SVEIntrinsicOpts::isReinterpretToSVBool(Value *V) {
   return I;
 }
 
+/// Checks if a ptrue intrinsic call is promoted. The act of promoting a
+/// ptrue will introduce zeroing. For example:
+///
+///     %1 = <vscale x 4 x i1> call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+///     %2 = <vscale x 16 x i1> call @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %1)
+///     %3 = <vscale x 8 x i1> call @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %2)
+///
+/// %1 is promoted, because it is converted:
+///
+///     <vscale x 4 x i1> => <vscale x 16 x i1> => <vscale x 8 x i1>
+///
+/// via a sequence of the SVE reinterpret intrinsics convert.{to,from}.svbool.
+bool isPTruePromoted(IntrinsicInst *PTrue) {
+  // Find all users of this intrinsic that are calls to convert-to-svbool
+  // reinterpret intrinsics.
+  SmallVector<IntrinsicInst *, 4> ConvertToUses;
+  for (User *User : PTrue->users()) {
+    if (match(User, m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>())) {
+      ConvertToUses.push_back(cast<IntrinsicInst>(User));
+    }
+  }
+
+  // If no such calls were found, this ptrue is not promoted.
+  if (ConvertToUses.empty())
+    return false;
+
+  // Otherwise, try to find users of the convert-to-svbool intrinsics that are
+  // calls to the convert-from-svbool intrinsic, and would result in some lanes
+  // being zeroed.
+  const auto *PTrueVTy = cast<ScalableVectorType>(PTrue->getType());
+  for (IntrinsicInst *ConvertToUse : ConvertToUses) {
+    for (User *User : ConvertToUse->users()) {
+      auto *IntrUser = dyn_cast<IntrinsicInst>(User);
+      if (IntrUser && IntrUser->getIntrinsicID() ==
+                          Intrinsic::aarch64_sve_convert_from_svbool) {
+        const auto *IntrUserVTy = cast<ScalableVectorType>(IntrUser->getType());
+
+        // Would some lanes become zeroed by the conversion?
+        if (IntrUserVTy->getElementCount().getKnownMinValue() >
+            PTrueVTy->getElementCount().getKnownMinValue())
+          // This is a promoted ptrue.
+          return true;
+      }
+    }
+  }
+
+  // If no matching calls were found, this is not a promoted ptrue.
+  return false;
+}
+
+/// Attempts to coalesce ptrues in a basic block.
+bool SVEIntrinsicOpts::coalescePTrueIntrinsicCalls(
+    BasicBlock &BB, SmallSetVector<IntrinsicInst *, 4> &PTrues) {
+  if (PTrues.size() <= 1)
+    return false;
+
+  // Find the ptrue with the most lanes.
+  auto *MostEncompassingPTrue = *std::max_element(
+      PTrues.begin(), PTrues.end(), [](auto *PTrue1, auto *PTrue2) {
+        auto *PTrue1VTy = cast<ScalableVectorType>(PTrue1->getType());
+        auto *PTrue2VTy = cast<ScalableVectorType>(PTrue2->getType());
+        return PTrue1VTy->getElementCount().getKnownMinValue() <
+               PTrue2VTy->getElementCount().getKnownMinValue();
+      });
+
+  // Remove the most encompassing ptrue, as well as any promoted ptrues, leaving
+  // behind only the ptrues to be coalesced.
+  PTrues.remove(MostEncompassingPTrue);
+  PTrues.remove_if([](auto *PTrue) { return isPTruePromoted(PTrue); });
+
+  // Hoist MostEncompassingPTrue to the start of the basic block. It is always
+  // safe to do this, since ptrue intrinsic calls are guaranteed to have no
+  // predecessors.
+  MostEncompassingPTrue->moveBefore(BB, BB.getFirstInsertionPt());
+
+  LLVMContext &Ctx = BB.getContext();
+  IRBuilder<> Builder(Ctx);
+  Builder.SetInsertPoint(&BB, ++MostEncompassingPTrue->getIterator());
+
+  auto *MostEncompassingPTrueVTy =
+      cast<VectorType>(MostEncompassingPTrue->getType());
+  auto *ConvertToSVBool = Builder.CreateIntrinsic(
+      Intrinsic::aarch64_sve_convert_to_svbool, {MostEncompassingPTrueVTy},
+      {MostEncompassingPTrue});
+
+  for (auto *PTrue : PTrues) {
+    auto *PTrueVTy = cast<VectorType>(PTrue->getType());
+
+    Builder.SetInsertPoint(&BB, ++ConvertToSVBool->getIterator());
+    auto *ConvertFromSVBool =
+        Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
+                                {PTrueVTy}, {ConvertToSVBool});
+    PTrue->replaceAllUsesWith(ConvertFromSVBool);
+    PTrue->eraseFromParent();
+  }
+
+  return true;
+}
+
+/// The goal of this function is to remove redundant calls to the SVE ptrue
+/// intrinsic in each basic block within the given functions.
+///
+/// SVE ptrues have two representations in LLVM IR:
+/// - a logical representation -- an arbitrary-width scalable vector of i1s,
+///   i.e. <vscale x N x i1>.
+/// - a physical representation (svbool, <vscale x 16 x i1>) -- a 16-element
+///   scalable vector of i1s, i.e. <vscale x 16 x i1>.
+///
+/// The SVE ptrue intrinsic is used to create a logical representation of an SVE
+/// predicate. Suppose that we have two SVE ptrue intrinsic calls: P1 and P2. If
+/// P1 creates a logical SVE predicate that is at least as wide as the logical
+/// SVE predicate created by P2, then all of the bits that are true in the
+/// physical representation of P2 are necessarily also true in the physical
+/// representation of P1. P1 'encompasses' P2; therefore, the intrinsic call to
+/// P2 is redundant and can be replaced by an SVE reinterpret of P1 via
+/// convert.{to,from}.svbool.
+///
+/// Currently, this pass only coalesces calls to SVE ptrue intrinsics
+/// if they match the following conditions:
+///
+/// - the call to the intrinsic uses either the SV_ALL or SV_POW2 patterns.
+///   SV_ALL indicates that all bits of the predicate vector are to be set to
+///   true. SV_POW2 indicates that all bits of the predicate vector up to the
+///   largest power-of-two are to be set to true.
+/// - the result of the call to the intrinsic is not promoted to a wider
+///   predicate. In this case, keeping the extra ptrue leads to better codegen
+///   -- coalescing here would create an irreducible chain of SVE reinterprets
+///   via convert.{to,from}.svbool.
+///
+/// EXAMPLE:
+///
+///     %1 = <vscale x 8 x i1> ptrue(i32 SV_ALL)
+///     ; Logical:  <1, 1, 1, 1, 1, 1, 1, 1>
+///     ; Physical: <1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0>
+///     ...
+///
+///     %2 = <vscale x 4 x i1> ptrue(i32 SV_ALL)
+///     ; Logical:  <1, 1, 1, 1>
+///     ; Physical: <1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0>
+///     ...
+///
+/// Here, %2 can be replaced by an SVE reinterpret of %1, giving, for instance:
+///
+///     %1 = <vscale x 8 x i1> ptrue(i32 31)
+///     %2 = <vscale x 16 x i1> convert.to.svbool(<vscale x 8 x i1> %1)
+///     %3 = <vscale x 4 x i1> convert.from.svbool(<vscale x 16 x i1> %2)
+///
+bool SVEIntrinsicOpts::optimizePTrueIntrinsicCalls(
+    SmallSetVector<Function *, 4> &Functions) {
+  bool Changed = false;
+
+  for (auto *F : Functions) {
+    for (auto &BB : *F) {
+      SmallSetVector<IntrinsicInst *, 4> SVAllPTrues;
+      SmallSetVector<IntrinsicInst *, 4> SVPow2PTrues;
+
+      // For each basic block, collect the used ptrues and try to coalesce them.
+      for (Instruction &I : BB) {
+        if (I.use_empty())
+          continue;
+
+        auto *IntrI = dyn_cast<IntrinsicInst>(&I);
+        if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
+          continue;
+
+        const auto PTruePattern =
+            cast<ConstantInt>(IntrI->getOperand(0))->getZExtValue();
+
+        if (PTruePattern == AArch64SVEPredPattern::all)
+          SVAllPTrues.insert(IntrI);
+        if (PTruePattern == AArch64SVEPredPattern::pow2)
+          SVPow2PTrues.insert(IntrI);
+      }
+
+      Changed |= coalescePTrueIntrinsicCalls(BB, SVAllPTrues);
+      Changed |= coalescePTrueIntrinsicCalls(BB, SVPow2PTrues);
+    }
+  }
+
+  return Changed;
+}
+
 /// The function will remove redundant reinterprets casting in the presence
 /// of the control flow
 bool SVEIntrinsicOpts::processPhiNode(IntrinsicInst *X) {
@@ -243,7 +440,7 @@ bool SVEIntrinsicOpts::optimizeIntrinsic(Instruction *I) {
   return true;
 }
 
-bool SVEIntrinsicOpts::optimizeFunctions(
+bool SVEIntrinsicOpts::optimizeIntrinsicCalls(
     SmallSetVector<Function *, 4> &Functions) {
   bool Changed = false;
   for (auto *F : Functions) {
@@ -260,6 +457,16 @@ bool SVEIntrinsicOpts::optimizeFunctions(
   return Changed;
 }
 
+bool SVEIntrinsicOpts::optimizeFunctions(
+    SmallSetVector<Function *, 4> &Functions) {
+  bool Changed = false;
+
+  Changed |= optimizePTrueIntrinsicCalls(Functions);
+  Changed |= optimizeIntrinsicCalls(Functions);
+
+  return Changed;
+}
+
 bool SVEIntrinsicOpts::runOnModule(Module &M) {
   bool Changed = false;
   SmallSetVector<Function *, 4> Functions;
@@ -276,6 +483,7 @@ bool SVEIntrinsicOpts::runOnModule(Module &M) {
     case Intrinsic::aarch64_sve_ptest_any:
     case Intrinsic::aarch64_sve_ptest_first:
     case Intrinsic::aarch64_sve_ptest_last:
+    case Intrinsic::aarch64_sve_ptrue:
      for (User *U : F.users())
        Functions.insert(cast<Instruction>(U)->getFunction());
      break;

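The second changed file (the accompanying regression test) is not shown above. As a rough sketch of the kind of input the pass now coalesces, a reduced case might look like the following; the function name, the select/store uses, and the suggested invocation (something like `opt -S -aarch64-sve-intrinsic-opts`, using the pass name from the commit message) are illustrative assumptions, not contents of the commit.

```llvm
; Two ptrues with the SV_ALL pattern (i32 31) in one basic block. After the
; pass runs, the narrower %p2 should become a convert.{to,from}.svbool
; reinterpret of the wider %p1, leaving a single ptrue.
define <vscale x 4 x i32> @coalesce_ptrues(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b,
                                           <vscale x 4 x i32> %c, <vscale x 4 x i32> %d,
                                           <vscale x 8 x i16>* %out) {
  %p1 = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
  %p2 = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
  ; Keep both predicates alive so neither ptrue is trivially dead.
  %sel8 = select <vscale x 8 x i1> %p1, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b
  store <vscale x 8 x i16> %sel8, <vscale x 8 x i16>* %out
  %sel4 = select <vscale x 4 x i1> %p2, <vscale x 4 x i32> %c, <vscale x 4 x i32> %d
  ret <vscale x 4 x i32> %sel4
}

declare <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32)
declare <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32)
```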