Skip to content

Commit b05dd6e

Browse files
committed
[AArch64][ARM] Optimize more tbl/tbx calls into shufflevector
1 parent 14ae19c commit b05dd6e

File tree

6 files changed

+153
-60
lines changed

6 files changed

+153
-60
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2858,7 +2858,15 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
28582858
case Intrinsic::aarch64_neon_fminnm:
28592859
return instCombineMaxMinNM(IC, II);
28602860
case Intrinsic::aarch64_neon_tbl1:
2861-
return ARMCommon::simplifyNeonTbl1(II, IC);
2861+
case Intrinsic::aarch64_neon_tbl2:
2862+
case Intrinsic::aarch64_neon_tbl3:
2863+
case Intrinsic::aarch64_neon_tbl4:
2864+
return ARMCommon::simplifyNeonTbl(II, IC, /*IsExtension=*/false);
2865+
case Intrinsic::aarch64_neon_tbx1:
2866+
case Intrinsic::aarch64_neon_tbx2:
2867+
case Intrinsic::aarch64_neon_tbx3:
2868+
case Intrinsic::aarch64_neon_tbx4:
2869+
return ARMCommon::simplifyNeonTbl(II, IC, /*IsExtension=*/true);
28622870
case Intrinsic::aarch64_neon_smull:
28632871
case Intrinsic::aarch64_neon_umull: {
28642872
bool IsSigned = IID == Intrinsic::aarch64_neon_smull;

llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -184,7 +184,16 @@ ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
184184
}
185185

186186
case Intrinsic::arm_neon_vtbl1:
187-
return ARMCommon::simplifyNeonTbl1(II, IC);
187+
case Intrinsic::arm_neon_vtbl2:
188+
case Intrinsic::arm_neon_vtbl3:
189+
case Intrinsic::arm_neon_vtbl4:
190+
return ARMCommon::simplifyNeonTbl(II, IC, /*IsExtension=*/false);
191+
192+
case Intrinsic::arm_neon_vtbx1:
193+
case Intrinsic::arm_neon_vtbx2:
194+
case Intrinsic::arm_neon_vtbx3:
195+
case Intrinsic::arm_neon_vtbx4:
196+
return ARMCommon::simplifyNeonTbl(II, IC, /*IsExtension=*/true);
188197

189198
case Intrinsic::arm_neon_vmulls:
190199
case Intrinsic::arm_neon_vmullu: {

llvm/lib/Target/ARMCommon/ARMCommonInstCombineIntrinsic.cpp

Lines changed: 102 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,7 @@
1818

1919
#include "ARMCommonInstCombineIntrinsic.h"
2020
#include "llvm/IR/Constants.h"
21+
#include "llvm/IR/DerivedTypes.h"
2122
#include "llvm/IR/IntrinsicInst.h"
2223
#include "llvm/IR/Value.h"
2324
#include "llvm/Transforms/InstCombine/InstCombiner.h"
@@ -28,41 +29,123 @@ using namespace llvm::PatternMatch;
2829
namespace llvm {
2930
namespace ARMCommon {
3031

31-
/// Convert a table lookup to shufflevector if the mask is constant.
32-
/// This could benefit tbl1 if the mask is { 7,6,5,4,3,2,1,0 }, in
33-
/// which case we could lower the shufflevector with rev64 instructions
34-
/// as it's actually a byte reverse.
35-
Instruction *simplifyNeonTbl1(IntrinsicInst &II, InstCombiner &IC) {
32+
/// Convert `tbl`/`tbx` intrinsics to shufflevector if the mask is constant, and
33+
/// at most two source operands are actually referenced.
34+
Instruction *simplifyNeonTbl(IntrinsicInst &II, InstCombiner &IC,
35+
bool IsExtension) {
3636
// Bail out if the mask is not a constant.
37-
auto *C = dyn_cast<Constant>(II.getArgOperand(1));
37+
auto *C = dyn_cast<Constant>(II.getArgOperand(II.arg_size() - 1));
3838
if (!C)
3939
return nullptr;
4040

41-
auto *VecTy = cast<FixedVectorType>(II.getType());
42-
unsigned NumElts = VecTy->getNumElements();
41+
auto *RetTy = cast<FixedVectorType>(II.getType());
42+
unsigned NumIndexes = RetTy->getNumElements();
4343

44-
// Only perform this transformation for <8 x i8> vector types.
45-
if (!VecTy->getElementType()->isIntegerTy(8) || NumElts != 8)
44+
// Only perform this transformation for <8 x i8> and <16 x i8> vector types.
45+
// Even the language-level intrinsics that operate on u8/p8 should lower to an
46+
// LLVM intrinsic that operates on i8.
47+
if (!(RetTy->getElementType()->isIntegerTy(8) &&
48+
(NumIndexes == 8 || NumIndexes == 16)))
4649
return nullptr;
4750

48-
int Indexes[8];
51+
// For tbx instructions, the first argument is the "fallback" vector, which
52+
// has the same length as the mask and return type.
53+
unsigned int StartIndex = (unsigned)IsExtension;
54+
auto *SourceTy =
55+
cast<FixedVectorType>(II.getArgOperand(StartIndex)->getType());
56+
// Note that the element count of each source vector does *not* need to be the
57+
// same as the element count of the return type and mask! All source vectors
58+
// must have the same element count as each other, though.
59+
unsigned NumElementsPerSource = SourceTy->getNumElements();
60+
61+
// There are no tbl/tbx intrinsics for which the destination size exceeds the
62+
// source size. However, our definitions of the intrinsics, at least in
63+
// IntrinsicsAArch64.td, allow for arbitrary destination vector sizes, so it
64+
// *could* technically happen.
65+
if (NumIndexes > NumElementsPerSource) {
66+
return nullptr;
67+
}
68+
69+
// The tbl/tbx intrinsics take several source operands followed by a mask
70+
// operand.
71+
unsigned int NumSourceOperands = II.arg_size() - 1 - (unsigned)IsExtension;
72+
73+
// Map input operands to shuffle indices. This also helpfully deduplicates the
74+
// input arguments, in case the same value is passed as an argument multiple
75+
// times.
76+
SmallDenseMap<Value *, unsigned, 2> ValueToShuffleSlot;
77+
Value *ShuffleOperands[2] = {PoisonValue::get(SourceTy),
78+
PoisonValue::get(SourceTy)};
4979

50-
for (unsigned I = 0; I < NumElts; ++I) {
80+
int Indexes[16];
81+
for (unsigned I = 0; I < NumIndexes; ++I) {
5182
Constant *COp = C->getAggregateElement(I);
5283

53-
if (!COp || !isa<ConstantInt>(COp))
84+
if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
5485
return nullptr;
5586

56-
Indexes[I] = cast<ConstantInt>(COp)->getLimitedValue();
87+
if (isa<UndefValue>(COp)) {
88+
Indexes[I] = -1;
89+
continue;
90+
}
91+
92+
uint64_t Index = cast<ConstantInt>(COp)->getZExtValue();
93+
// The index of the input argument that this index references (0 = first
94+
// source argument, etc).
95+
unsigned SourceOperandIndex = Index / NumElementsPerSource;
96+
// The index of the element at that source operand.
97+
unsigned SourceOperandElementIndex = Index % NumElementsPerSource;
98+
99+
Value *SourceOperand;
100+
if (SourceOperandIndex >= NumSourceOperands) {
101+
// This index is out of bounds. Map it to index into either the fallback
102+
// vector (tbx) or vector of zeroes (tbl).
103+
SourceOperandIndex = NumSourceOperands;
104+
if (IsExtension) {
105+
// For out-of-bounds indices in tbx, choose the `I`th element of the
106+
// fallback.
107+
SourceOperand = II.getArgOperand(0);
108+
SourceOperandElementIndex = I;
109+
} else {
110+
// Otherwise, choose some element from the dummy vector of zeroes (we'll
111+
// always choose the first).
112+
SourceOperand = Constant::getNullValue(SourceTy);
113+
SourceOperandElementIndex = 0;
114+
}
115+
} else {
116+
SourceOperand = II.getArgOperand(SourceOperandIndex + StartIndex);
117+
}
57118

58-
// Make sure the mask indices are in range.
59-
if ((unsigned)Indexes[I] >= NumElts)
119+
// The source operand may be the fallback vector, which may not have the
120+
// same number of elements as the source vector. In that case, we *could*
121+
// choose to extend its length with another shufflevector, but it's simpler
122+
// to just bail instead.
123+
if (cast<FixedVectorType>(SourceOperand->getType())->getNumElements() !=
124+
NumElementsPerSource) {
60125
return nullptr;
126+
}
127+
128+
// We now know the source operand referenced by this index. Make it a
129+
// shufflevector operand, if it isn't already.
130+
unsigned NumSlots = ValueToShuffleSlot.size();
131+
// This shuffle references more than two sources, and hence cannot be
132+
// represented as a shufflevector.
133+
if (NumSlots == 2 && !ValueToShuffleSlot.contains(SourceOperand)) {
134+
return nullptr;
135+
}
136+
auto [It, Inserted] =
137+
ValueToShuffleSlot.try_emplace(SourceOperand, NumSlots);
138+
if (Inserted) {
139+
ShuffleOperands[It->getSecond()] = SourceOperand;
140+
}
141+
142+
unsigned RemappedIndex =
143+
(It->getSecond() * NumElementsPerSource) + SourceOperandElementIndex;
144+
Indexes[I] = RemappedIndex;
61145
}
62146

63-
auto *V1 = II.getArgOperand(0);
64-
auto *V2 = Constant::getNullValue(V1->getType());
65-
Value *Shuf = IC.Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes));
147+
Value *Shuf = IC.Builder.CreateShuffleVector(
148+
ShuffleOperands[0], ShuffleOperands[1], ArrayRef(Indexes, NumIndexes));
66149
return IC.replaceInstUsesWith(II, Shuf);
67150
}
68151

llvm/lib/Target/ARMCommon/ARMCommonInstCombineIntrinsic.h

Lines changed: 4 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -27,11 +27,10 @@ namespace llvm {
2727

2828
namespace ARMCommon {
2929

30-
/// Convert a table lookup to shufflevector if the mask is constant.
31-
/// This could benefit tbl1 if the mask is { 7,6,5,4,3,2,1,0 }, in
32-
/// which case we could lower the shufflevector with rev64 instructions
33-
/// as it's actually a byte reverse.
34-
Instruction *simplifyNeonTbl1(IntrinsicInst &II, InstCombiner &IC);
30+
/// Convert `tbl`/`tbx` intrinsics to shufflevector if the mask is constant, and
31+
/// at most two source operands are actually referenced.
32+
Instruction *simplifyNeonTbl(IntrinsicInst &II, InstCombiner &IC,
33+
bool IsExtension);
3534

3635
/// Simplify NEON multiply-long intrinsics (smull, umull).
3736
/// These intrinsics perform widening multiplies: they multiply two vectors of

0 commit comments

Comments (0)