Skip to content

Commit b05dd6e

Browse files
committed
[AArch64][ARM] Optimize more tbl/tbx calls into shufflevector
1 parent 14ae19c commit b05dd6e

File tree

6 files changed

+153
-60
lines changed

6 files changed

+153
-60
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2858,7 +2858,15 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
28582858
case Intrinsic::aarch64_neon_fminnm:
28592859
return instCombineMaxMinNM(IC, II);
28602860
case Intrinsic::aarch64_neon_tbl1:
2861-
return ARMCommon::simplifyNeonTbl1(II, IC);
2861+
case Intrinsic::aarch64_neon_tbl2:
2862+
case Intrinsic::aarch64_neon_tbl3:
2863+
case Intrinsic::aarch64_neon_tbl4:
2864+
return ARMCommon::simplifyNeonTbl(II, IC, /*IsExtension=*/false);
2865+
case Intrinsic::aarch64_neon_tbx1:
2866+
case Intrinsic::aarch64_neon_tbx2:
2867+
case Intrinsic::aarch64_neon_tbx3:
2868+
case Intrinsic::aarch64_neon_tbx4:
2869+
return ARMCommon::simplifyNeonTbl(II, IC, /*IsExtension=*/true);
28622870
case Intrinsic::aarch64_neon_smull:
28632871
case Intrinsic::aarch64_neon_umull: {
28642872
bool IsSigned = IID == Intrinsic::aarch64_neon_smull;

llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -184,7 +184,16 @@ ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
184184
}
185185

186186
case Intrinsic::arm_neon_vtbl1:
187-
return ARMCommon::simplifyNeonTbl1(II, IC);
187+
case Intrinsic::arm_neon_vtbl2:
188+
case Intrinsic::arm_neon_vtbl3:
189+
case Intrinsic::arm_neon_vtbl4:
190+
return ARMCommon::simplifyNeonTbl(II, IC, /*IsExtension=*/false);
191+
192+
case Intrinsic::arm_neon_vtbx1:
193+
case Intrinsic::arm_neon_vtbx2:
194+
case Intrinsic::arm_neon_vtbx3:
195+
case Intrinsic::arm_neon_vtbx4:
196+
return ARMCommon::simplifyNeonTbl(II, IC, /*IsExtension=*/true);
188197

189198
case Intrinsic::arm_neon_vmulls:
190199
case Intrinsic::arm_neon_vmullu: {

llvm/lib/Target/ARMCommon/ARMCommonInstCombineIntrinsic.cpp

Lines changed: 102 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,7 @@
1818

1919
#include "ARMCommonInstCombineIntrinsic.h"
2020
#include "llvm/IR/Constants.h"
21+
#include "llvm/IR/DerivedTypes.h"
2122
#include "llvm/IR/IntrinsicInst.h"
2223
#include "llvm/IR/Value.h"
2324
#include "llvm/Transforms/InstCombine/InstCombiner.h"
@@ -28,41 +29,123 @@ using namespace llvm::PatternMatch;
2829
namespace llvm {
2930
namespace ARMCommon {
3031

31-
/// Convert a table lookup to shufflevector if the mask is constant.
32-
/// This could benefit tbl1 if the mask is { 7,6,5,4,3,2,1,0 }, in
33-
/// which case we could lower the shufflevector with rev64 instructions
34-
/// as it's actually a byte reverse.
35-
Instruction *simplifyNeonTbl1(IntrinsicInst &II, InstCombiner &IC) {
32+
/// Convert `tbl`/`tbx` intrinsics to shufflevector if the mask is constant, and
33+
/// at most two source operands are actually referenced.
34+
Instruction *simplifyNeonTbl(IntrinsicInst &II, InstCombiner &IC,
35+
bool IsExtension) {
3636
// Bail out if the mask is not a constant.
37-
auto *C = dyn_cast<Constant>(II.getArgOperand(1));
37+
auto *C = dyn_cast<Constant>(II.getArgOperand(II.arg_size() - 1));
3838
if (!C)
3939
return nullptr;
4040

41-
auto *VecTy = cast<FixedVectorType>(II.getType());
42-
unsigned NumElts = VecTy->getNumElements();
41+
auto *RetTy = cast<FixedVectorType>(II.getType());
42+
unsigned NumIndexes = RetTy->getNumElements();
4343

44-
// Only perform this transformation for <8 x i8> vector types.
45-
if (!VecTy->getElementType()->isIntegerTy(8) || NumElts != 8)
44+
// Only perform this transformation for <8 x i8> and <16 x i8> vector types.
45+
// Even the language-level intrinsics that operate on u8/p8 should lower to an
46+
// LLVM intrinsic that operates on i8.
47+
if (!(RetTy->getElementType()->isIntegerTy(8) &&
48+
(NumIndexes == 8 || NumIndexes == 16)))
4649
return nullptr;
4750

48-
int Indexes[8];
51+
// For tbx instructions, the first argument is the "fallback" vector, which
52+
// has the same length as the mask and return type.
53+
unsigned int StartIndex = (unsigned)IsExtension;
54+
auto *SourceTy =
55+
cast<FixedVectorType>(II.getArgOperand(StartIndex)->getType());
56+
// Note that the element count of each source vector does *not* need to be the
57+
// same as the element count of the return type and mask! All source vectors
58+
// must have the same element count as each other, though.
59+
unsigned NumElementsPerSource = SourceTy->getNumElements();
60+
61+
// There are no tbl/tbx intrinsics for which the destination size exceeds the
62+
// source size. However, our definitions of the intrinsics, at least in
63+
// IntrinsicsAArch64.td, allow for arbitrary destination vector sizes, so it
64+
// *could* technically happen.
65+
if (NumIndexes > NumElementsPerSource) {
66+
return nullptr;
67+
}
68+
69+
// The tbl/tbx intrinsics take several source operands followed by a mask
70+
// operand.
71+
unsigned int NumSourceOperands = II.arg_size() - 1 - (unsigned)IsExtension;
72+
73+
// Map input operands to shuffle indices. This also helpfully deduplicates the
74+
// input arguments, in case the same value is passed as an argument multiple
75+
// times.
76+
SmallDenseMap<Value *, unsigned, 2> ValueToShuffleSlot;
77+
Value *ShuffleOperands[2] = {PoisonValue::get(SourceTy),
78+
PoisonValue::get(SourceTy)};
4979

50-
for (unsigned I = 0; I < NumElts; ++I) {
80+
int Indexes[16];
81+
for (unsigned I = 0; I < NumIndexes; ++I) {
5182
Constant *COp = C->getAggregateElement(I);
5283

53-
if (!COp || !isa<ConstantInt>(COp))
84+
if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
5485
return nullptr;
5586

56-
Indexes[I] = cast<ConstantInt>(COp)->getLimitedValue();
87+
if (isa<UndefValue>(COp)) {
88+
Indexes[I] = -1;
89+
continue;
90+
}
91+
92+
uint64_t Index = cast<ConstantInt>(COp)->getZExtValue();
93+
// The index of the input argument that this index references (0 = first
94+
// source argument, etc).
95+
unsigned SourceOperandIndex = Index / NumElementsPerSource;
96+
// The index of the element at that source operand.
97+
unsigned SourceOperandElementIndex = Index % NumElementsPerSource;
98+
99+
Value *SourceOperand;
100+
if (SourceOperandIndex >= NumSourceOperands) {
101+
// This index is out of bounds. Map it to index into either the fallback
102+
// vector (tbx) or vector of zeroes (tbl).
103+
SourceOperandIndex = NumSourceOperands;
104+
if (IsExtension) {
105+
// For out-of-bounds indices in tbx, choose the `I`th element of the
106+
// fallback.
107+
SourceOperand = II.getArgOperand(0);
108+
SourceOperandElementIndex = I;
109+
} else {
110+
// Otherwise, choose some element from the dummy vector of zeroes (we'll
111+
// always choose the first).
112+
SourceOperand = Constant::getNullValue(SourceTy);
113+
SourceOperandElementIndex = 0;
114+
}
115+
} else {
116+
SourceOperand = II.getArgOperand(SourceOperandIndex + StartIndex);
117+
}
57118

58-
// Make sure the mask indices are in range.
59-
if ((unsigned)Indexes[I] >= NumElts)
119+
// The source operand may be the fallback vector, which may not have the
120+
// same number of elements as the source vector. In that case, we *could*
121+
// choose to extend its length with another shufflevector, but it's simpler
122+
// to just bail instead.
123+
if (cast<FixedVectorType>(SourceOperand->getType())->getNumElements() !=
124+
NumElementsPerSource) {
60125
return nullptr;
126+
}
127+
128+
// We now know the source operand referenced by this index. Make it a
129+
// shufflevector operand, if it isn't already.
130+
unsigned NumSlots = ValueToShuffleSlot.size();
131+
// This shuffle references more than two sources, and hence cannot be
132+
// represented as a shufflevector.
133+
if (NumSlots == 2 && !ValueToShuffleSlot.contains(SourceOperand)) {
134+
return nullptr;
135+
}
136+
auto [It, Inserted] =
137+
ValueToShuffleSlot.try_emplace(SourceOperand, NumSlots);
138+
if (Inserted) {
139+
ShuffleOperands[It->getSecond()] = SourceOperand;
140+
}
141+
142+
unsigned RemappedIndex =
143+
(It->getSecond() * NumElementsPerSource) + SourceOperandElementIndex;
144+
Indexes[I] = RemappedIndex;
61145
}
62146

63-
auto *V1 = II.getArgOperand(0);
64-
auto *V2 = Constant::getNullValue(V1->getType());
65-
Value *Shuf = IC.Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes));
147+
Value *Shuf = IC.Builder.CreateShuffleVector(
148+
ShuffleOperands[0], ShuffleOperands[1], ArrayRef(Indexes, NumIndexes));
66149
return IC.replaceInstUsesWith(II, Shuf);
67150
}
68151

llvm/lib/Target/ARMCommon/ARMCommonInstCombineIntrinsic.h

Lines changed: 4 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -27,11 +27,10 @@ namespace llvm {
2727

2828
namespace ARMCommon {
2929

30-
/// Convert a table lookup to shufflevector if the mask is constant.
31-
/// This could benefit tbl1 if the mask is { 7,6,5,4,3,2,1,0 }, in
32-
/// which case we could lower the shufflevector with rev64 instructions
33-
/// as it's actually a byte reverse.
34-
Instruction *simplifyNeonTbl1(IntrinsicInst &II, InstCombiner &IC);
30+
/// Convert `tbl`/`tbx` intrinsics to shufflevector if the mask is constant, and
31+
/// at most two source operands are actually referenced.
32+
Instruction *simplifyNeonTbl(IntrinsicInst &II, InstCombiner &IC,
33+
bool IsExtension);
3534

3635
/// Simplify NEON multiply-long intrinsics (smull, umull).
3736
/// These intrinsics perform widening multiplies: they multiply two vectors of

0 commit comments

Comments (0)