Skip to content

Commit f9b8786

Browse files
committed
[AArch64][ARM] Optimize more tbl/tbx calls into shufflevector
1 parent a6f4448 commit f9b8786

File tree

6 files changed

+151
-60
lines changed

6 files changed

+151
-60
lines changed

llvm/include/llvm/Transforms/Utils/ARMCommonInstCombineIntrinsic.h

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,10 @@ namespace llvm {
2626

2727
namespace ARMCommon {
2828

29-
/// Convert a table lookup to shufflevector if the mask is constant.
30-
/// This could benefit tbl1 if the mask is { 7,6,5,4,3,2,1,0 }, in
31-
/// which case we could lower the shufflevector with rev64 instructions
32-
/// as it's actually a byte reverse.
33-
Instruction *simplifyNeonTbl1(IntrinsicInst &II, InstCombiner &IC);
29+
/// Convert `tbl`/`tbx` intrinsics to shufflevector if the mask is constant, and
30+
/// at most two source operands are actually referenced.
31+
Instruction *simplifyNeonTbl(IntrinsicInst &II, InstCombiner &IC,
32+
bool IsExtension);
3433

3534
/// Simplify NEON multiply-long intrinsics (smull, umull).
3635
/// These intrinsics perform widening multiplies: they multiply two vectors of

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2858,7 +2858,15 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
28582858
case Intrinsic::aarch64_neon_fminnm:
28592859
return instCombineMaxMinNM(IC, II);
28602860
case Intrinsic::aarch64_neon_tbl1:
2861-
return ARMCommon::simplifyNeonTbl1(II, IC);
2861+
case Intrinsic::aarch64_neon_tbl2:
2862+
case Intrinsic::aarch64_neon_tbl3:
2863+
case Intrinsic::aarch64_neon_tbl4:
2864+
return ARMCommon::simplifyNeonTbl(II, IC, /*IsExtension=*/false);
2865+
case Intrinsic::aarch64_neon_tbx1:
2866+
case Intrinsic::aarch64_neon_tbx2:
2867+
case Intrinsic::aarch64_neon_tbx3:
2868+
case Intrinsic::aarch64_neon_tbx4:
2869+
return ARMCommon::simplifyNeonTbl(II, IC, /*IsExtension=*/true);
28622870
case Intrinsic::aarch64_neon_smull:
28632871
case Intrinsic::aarch64_neon_umull: {
28642872
bool IsSigned = IID == Intrinsic::aarch64_neon_smull;

llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,16 @@ ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
184184
}
185185

186186
case Intrinsic::arm_neon_vtbl1:
187-
return ARMCommon::simplifyNeonTbl1(II, IC);
187+
case Intrinsic::arm_neon_vtbl2:
188+
case Intrinsic::arm_neon_vtbl3:
189+
case Intrinsic::arm_neon_vtbl4:
190+
return ARMCommon::simplifyNeonTbl(II, IC, /*IsExtension=*/false);
191+
192+
case Intrinsic::arm_neon_vtbx1:
193+
case Intrinsic::arm_neon_vtbx2:
194+
case Intrinsic::arm_neon_vtbx3:
195+
case Intrinsic::arm_neon_vtbx4:
196+
return ARMCommon::simplifyNeonTbl(II, IC, /*IsExtension=*/true);
188197

189198
case Intrinsic::arm_neon_vmulls:
190199
case Intrinsic::arm_neon_vmullu: {

llvm/lib/Transforms/Utils/ARMCommonInstCombineIntrinsic.cpp

Lines changed: 100 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
#include "llvm/Transforms/Utils/ARMCommonInstCombineIntrinsic.h"
1919
#include "llvm/IR/Constants.h"
20+
#include "llvm/IR/DerivedTypes.h"
2021
#include "llvm/IR/IntrinsicInst.h"
2122
#include "llvm/IR/Value.h"
2223
#include "llvm/Transforms/InstCombine/InstCombiner.h"
@@ -27,41 +28,121 @@ using namespace llvm::PatternMatch;
2728
namespace llvm {
2829
namespace ARMCommon {
2930

30-
/// Convert a table lookup to shufflevector if the mask is constant.
31-
/// This could benefit tbl1 if the mask is { 7,6,5,4,3,2,1,0 }, in
32-
/// which case we could lower the shufflevector with rev64 instructions
33-
/// as it's actually a byte reverse.
34-
Instruction *simplifyNeonTbl1(IntrinsicInst &II, InstCombiner &IC) {
31+
/// Convert `tbl`/`tbx` intrinsics to shufflevector if the mask is constant, and
32+
/// at most two source operands are actually referenced.
33+
Instruction *simplifyNeonTbl(IntrinsicInst &II, InstCombiner &IC,
34+
bool IsExtension) {
3535
// Bail out if the mask is not a constant.
36-
auto *C = dyn_cast<Constant>(II.getArgOperand(1));
36+
auto *C = dyn_cast<Constant>(II.getArgOperand(II.arg_size() - 1));
3737
if (!C)
3838
return nullptr;
3939

40-
auto *VecTy = cast<FixedVectorType>(II.getType());
41-
unsigned NumElts = VecTy->getNumElements();
40+
auto *RetTy = cast<FixedVectorType>(II.getType());
41+
unsigned NumIndexes = RetTy->getNumElements();
4242

43-
// Only perform this transformation for <8 x i8> vector types.
44-
if (!VecTy->getElementType()->isIntegerTy(8) || NumElts != 8)
43+
// Only perform this transformation for <8 x i8> and <16 x i8> vector types.
44+
if (!(RetTy->getElementType()->isIntegerTy(8) &&
45+
(NumIndexes == 8 || NumIndexes == 16)))
4546
return nullptr;
4647

47-
int Indexes[8];
48+
// For tbx instructions, the first argument is the "fallback" vector, which
49+
// has the same length as the mask and return type.
50+
unsigned int StartIndex = (unsigned)IsExtension;
51+
auto *SourceTy =
52+
cast<FixedVectorType>(II.getArgOperand(StartIndex)->getType());
53+
// Note that the element count of each source vector does *not* need to be the
54+
// same as the element count of the return type and mask! All source vectors
55+
// must have the same element count as each other, though.
56+
unsigned NumElementsPerSource = SourceTy->getNumElements();
57+
58+
// There are no tbl/tbx intrinsics for which the destination size exceeds the
59+
// source size. However, our definitions of the intrinsics, at least in
60+
// IntrinsicsAArch64.td, allow for arbitrary destination vector sizes, so it
61+
// *could* technically happen.
62+
if (NumIndexes > NumElementsPerSource) {
63+
return nullptr;
64+
}
65+
66+
// The tbl/tbx intrinsics take several source operands followed by a mask
67+
// operand.
68+
unsigned int NumSourceOperands = II.arg_size() - 1 - (unsigned)IsExtension;
69+
70+
// Map input operands to shuffle indices. This also helpfully deduplicates the
71+
// input arguments, in case the same value is passed as an argument multiple
72+
// times.
73+
SmallDenseMap<Value *, unsigned, 2> ValueToShuffleSlot;
74+
Value *ShuffleOperands[2] = {PoisonValue::get(SourceTy),
75+
PoisonValue::get(SourceTy)};
4876

49-
for (unsigned I = 0; I < NumElts; ++I) {
77+
int Indexes[16];
78+
for (unsigned I = 0; I < NumIndexes; ++I) {
5079
Constant *COp = C->getAggregateElement(I);
5180

52-
if (!COp || !isa<ConstantInt>(COp))
81+
if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
5382
return nullptr;
5483

55-
Indexes[I] = cast<ConstantInt>(COp)->getLimitedValue();
84+
if (isa<UndefValue>(COp)) {
85+
Indexes[I] = -1;
86+
continue;
87+
}
88+
89+
uint64_t Index = cast<ConstantInt>(COp)->getZExtValue();
90+
// The index of the input argument that this index references (0 = first
91+
// source argument, etc).
92+
unsigned SourceOperandIndex = Index / NumElementsPerSource;
93+
// The index of the element at that source operand.
94+
unsigned SourceOperandElementIndex = Index % NumElementsPerSource;
95+
96+
Value *SourceOperand;
97+
if (SourceOperandIndex >= NumSourceOperands) {
98+
// This index is out of bounds. Map it to index into either the fallback
99+
// vector (tbx) or a vector of zeroes (tbl).
100+
SourceOperandIndex = NumSourceOperands;
101+
if (IsExtension) {
102+
// For out-of-bounds indices in tbx, choose the `I`th element of the
103+
// fallback.
104+
SourceOperand = II.getArgOperand(0);
105+
SourceOperandElementIndex = I;
106+
} else {
107+
// Otherwise, choose some element from the dummy vector of zeroes (we'll
108+
// always choose the first).
109+
SourceOperand = Constant::getNullValue(SourceTy);
110+
SourceOperandElementIndex = 0;
111+
}
112+
} else {
113+
SourceOperand = II.getArgOperand(SourceOperandIndex + StartIndex);
114+
}
56115

57-
// Make sure the mask indices are in range.
58-
if ((unsigned)Indexes[I] >= NumElts)
116+
// The source operand may be the fallback vector, which may not have the
117+
// same number of elements as the source vector. In that case, we *could*
118+
// choose to extend its length with another shufflevector, but it's simpler
119+
// to just bail instead.
120+
if (cast<FixedVectorType>(SourceOperand->getType())->getNumElements() !=
121+
NumElementsPerSource) {
59122
return nullptr;
123+
}
124+
125+
// We now know the source operand referenced by this index. Make it a
126+
// shufflevector operand, if it isn't already.
127+
unsigned NumSlots = ValueToShuffleSlot.size();
128+
// This shuffle references more than two sources, and hence cannot be
129+
// represented as a shufflevector.
130+
if (NumSlots == 2 && !ValueToShuffleSlot.contains(SourceOperand)) {
131+
return nullptr;
132+
}
133+
auto [It, Inserted] =
134+
ValueToShuffleSlot.try_emplace(SourceOperand, NumSlots);
135+
if (Inserted) {
136+
ShuffleOperands[It->getSecond()] = SourceOperand;
137+
}
138+
139+
unsigned RemappedIndex =
140+
(It->getSecond() * NumElementsPerSource) + SourceOperandElementIndex;
141+
Indexes[I] = RemappedIndex;
60142
}
61143

62-
auto *V1 = II.getArgOperand(0);
63-
auto *V2 = Constant::getNullValue(V1->getType());
64-
Value *Shuf = IC.Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes));
144+
Value *Shuf = IC.Builder.CreateShuffleVector(
145+
ShuffleOperands[0], ShuffleOperands[1], ArrayRef(Indexes, NumIndexes));
65146
return IC.replaceInstUsesWith(II, Shuf);
66147
}
67148

0 commit comments

Comments
 (0)