
Commit ec4373b

[InstCombine] Fold bitcast (extelt (bitcast X), Idx) into bitcast+shuffle.
Fold sequences such as:

```llvm
%bc = bitcast <8 x i32> %v to <2 x i128>
%ext = extractelement <2 x i128> %bc, i64 0
%res = bitcast i128 %ext to <2 x i64>
```

into:

```llvm
%bc = bitcast <8 x i32> %v to <4 x i64>
%res = shufflevector <4 x i64> %bc, <4 x i64> poison, <2 x i32> <i32 0, i32 1>
```

The motivation for this is a long-standing regression affecting SIMDe on AArch64, introduced indirectly by the AlwaysInliner (1a2e77c).

Some reproducers:
* https://godbolt.org/z/53qx18s6M
* https://godbolt.org/z/o5e43h5M7
1 parent 13aae75 commit ec4373b
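
To make the before/after equivalence above concrete, here is a small standalone C++ sketch (illustration only, not part of the patch; it models the little-endian in-memory lane layout with plain `memcpy` rather than LLVM IR) showing that extracting the i128 element and re-splitting it into i64 lanes reads exactly the lanes that the shuffle of the `<4 x i64>` view selects:

```cpp
// Standalone illustration only; uses plain memory operations instead of LLVM IR.
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  uint32_t V[8] = {0, 1, 2, 3, 4, 5, 6, 7}; // the <8 x i32> source value
  const unsigned Idx = 0;                   // extracted <2 x i128> element

  // extractelement <2 x i128> + bitcast i128 to <2 x i64>:
  // take 16 bytes at offset Idx * 16 and view them as two i64 lanes.
  uint64_t Ext[2];
  std::memcpy(Ext, reinterpret_cast<const char *>(V) + Idx * 16, 16);

  // shufflevector of the <4 x i64> view of V with mask {2*Idx, 2*Idx + 1}.
  uint64_t As64[4];
  std::memcpy(As64, V, sizeof(V));
  uint64_t Shuf[2] = {As64[2 * Idx], As64[2 * Idx + 1]};

  assert(std::memcmp(Ext, Shuf, sizeof(Ext)) == 0); // same bytes either way
  return 0;
}
```

With `Idx = 0` this is the commit-message example (lanes 0 and 1); with `Idx = 1` the same comparison selects lanes 2 and 3, which is the case exercised by the updated bitcast_extelt9 test below.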

2 files changed (+52, -6 lines)


llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp

Lines changed: 48 additions & 0 deletions
```diff
@@ -2380,6 +2380,51 @@ static Value *optimizeIntegerToVectorInsertions(BitCastInst &CI,
   return Result;
 }
 
+/// If the input is (extractelement (bitcast X), Idx) and the source and
+/// destination types are vectors, we are performing a vector extract from X. We
+/// can replace the extractelement+bitcast with a shufflevector, avoiding the
+/// final scalar->vector bitcast. This pattern is usually handled better by the
+/// backend.
+///
+/// Example:
+///   %bc = bitcast <8 x i32> %X to <2 x i128>
+///   %ext = extractelement <2 x i128> %bc1, i64 1
+///   bitcast i128 %ext to <2 x i64>
+/// --->
+///   %bc = bitcast <8 x i32> %X to <4 x i64>
+///   shufflevector <4 x i64> %bc, <4 x i64> poison, <2 x i32> <i32 2, i32 3>
+static Instruction *foldBitCastExtElt(BitCastInst &BitCast,
+                                      InstCombiner::BuilderTy &Builder) {
+  Value *X;
+  uint64_t Index;
+  if (!match(
+          BitCast.getOperand(0),
+          m_OneUse(m_ExtractElt(m_BitCast(m_Value(X)), m_ConstantInt(Index)))))
+    return nullptr;
+
+  auto *SrcTy = dyn_cast<FixedVectorType>(X->getType());
+  auto *DstTy = dyn_cast<FixedVectorType>(BitCast.getType());
+  if (!SrcTy || !DstTy)
+    return nullptr;
+
+  // Check if the mask indices would overflow.
+  unsigned NumElts = DstTy->getNumElements();
+  if (Index > INT32_MAX || NumElts > INT32_MAX ||
+      (Index + 1) * NumElts - 1 > INT32_MAX)
+    return nullptr;
+
+  unsigned DstEltWidth = DstTy->getScalarSizeInBits();
+  unsigned SrcVecWidth = SrcTy->getPrimitiveSizeInBits();
+  assert((SrcVecWidth % DstEltWidth == 0) && "Invalid types.");
+  auto *NewVecTy =
+      FixedVectorType::get(DstTy->getElementType(), SrcVecWidth / DstEltWidth);
+  auto *NewBC = Builder.CreateBitCast(X, NewVecTy, "bc");
+
+  unsigned StartIdx = Index * NumElts;
+  auto Mask = llvm::to_vector<16>(llvm::seq<int>(StartIdx, StartIdx + NumElts));
+  return new ShuffleVectorInst(NewBC, Mask);
+}
+
 /// Canonicalize scalar bitcasts of extracted elements into a bitcast of the
 /// vector followed by extract element. The backend tends to handle bitcasts of
 /// vectors better than bitcasts of scalars because vector registers are
@@ -2866,6 +2911,9 @@ Instruction *InstCombinerImpl::visitBitCast(BitCastInst &CI) {
   if (Instruction *I = canonicalizeBitCastExtElt(CI, *this))
     return I;
 
+  if (Instruction *I = foldBitCastExtElt(CI, Builder))
+    return I;
+
   if (Instruction *I = foldBitCastBitwiseLogic(CI, Builder))
     return I;
 
```

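The doc comment's example (source `<8 x i32>`, wide view `<2 x i128>`, extract index 1, destination `<2 x i64>`) works through the helper's arithmetic as follows; this is a standalone sketch that mirrors the patch's variable names but uses no LLVM APIs:

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  // Values for the doc-comment example: bitcast <8 x i32> to <2 x i128>,
  // extractelement index 1, then bitcast the i128 to <2 x i64>.
  const uint64_t Index = 1;         // extracted element of the wide view
  const unsigned NumElts = 2;       // elements in the destination <2 x i64>
  const unsigned DstEltWidth = 64;  // bits per destination element
  const unsigned SrcVecWidth = 256; // total bits of the <8 x i32> source

  // New vector type the source is bitcast to: <SrcVecWidth / DstEltWidth x i64>.
  const unsigned NewVecElts = SrcVecWidth / DstEltWidth;
  assert(NewVecElts == 4); // <4 x i64>

  // Shuffle mask: NumElts consecutive lanes starting at Index * NumElts.
  const unsigned StartIdx = Index * NumElts;
  std::vector<int> Mask;
  for (unsigned I = 0; I != NumElts; ++I)
    Mask.push_back(static_cast<int>(StartIdx + I));
  assert(Mask[0] == 2 && Mask[1] == 3); // matches the doc comment's <i32 2, i32 3>
  return 0;
}
```

The early `INT32_MAX` guards in the patch exist because shuffle mask elements are 32-bit integers, so the largest mask index, `(Index + 1) * NumElts - 1`, must stay representable.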
llvm/test/Transforms/InstCombine/bitcast.ll

Lines changed: 4 additions & 6 deletions
```diff
@@ -484,9 +484,8 @@ define double @bitcast_extelt8(<1 x i64> %A) {
 
 define <2 x i64> @bitcast_extelt9(<8 x i32> %A) {
 ; CHECK-LABEL: @bitcast_extelt9(
-; CHECK-NEXT:    [[BC1:%.*]] = bitcast <8 x i32> [[A:%.*]] to <2 x i128>
-; CHECK-NEXT:    [[EXT:%.*]] = extractelement <2 x i128> [[BC1]], i64 1
-; CHECK-NEXT:    [[BC2:%.*]] = bitcast i128 [[EXT]] to <2 x i64>
+; CHECK-NEXT:    [[BC:%.*]] = bitcast <8 x i32> [[A:%.*]] to <4 x i64>
+; CHECK-NEXT:    [[BC2:%.*]] = shufflevector <4 x i64> [[BC]], <4 x i64> poison, <2 x i32> <i32 2, i32 3>
 ; CHECK-NEXT:    ret <2 x i64> [[BC2]]
 ;
   %bc1 = bitcast <8 x i32> %A to <2 x i128>
@@ -499,9 +498,8 @@ define <2 x i64> @bitcast_extelt9(<8 x i32> %A) {
 
 define <2 x i8> @bitcast_extelt10(<8 x i32> %A) {
 ; CHECK-LABEL: @bitcast_extelt10(
-; CHECK-NEXT:    [[BC1:%.*]] = bitcast <8 x i32> [[A:%.*]] to <16 x i16>
-; CHECK-NEXT:    [[EXT:%.*]] = extractelement <16 x i16> [[BC1]], i64 3
-; CHECK-NEXT:    [[BC2:%.*]] = bitcast i16 [[EXT]] to <2 x i8>
+; CHECK-NEXT:    [[BC:%.*]] = bitcast <8 x i32> [[A:%.*]] to <32 x i8>
+; CHECK-NEXT:    [[BC2:%.*]] = shufflevector <32 x i8> [[BC]], <32 x i8> poison, <2 x i32> <i32 6, i32 7>
 ; CHECK-NEXT:    ret <2 x i8> [[BC2]]
 ;
   %bc1 = bitcast <8 x i32> %A to <16 x i16>
```

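For the updated CHECK lines, the same `Index * NumElts` arithmetic predicts the expected masks; here is a minimal standalone check of the bitcast_extelt10 numbers (illustration only, not part of the test suite):

```cpp
#include <cassert>

int main() {
  // bitcast_extelt10: %A is <8 x i32> (256 bits), the wide view is <16 x i16>,
  // the extracted element index is 3, and the final type is <2 x i8>.
  const unsigned SrcVecWidth = 256; // bits in <8 x i32>
  const unsigned DstEltWidth = 8;   // bits per element of <2 x i8>
  const unsigned NumElts = 2;       // elements of <2 x i8>
  const unsigned Index = 3;         // extractelement index into <16 x i16>

  // The re-bitcast type has 256 / 8 = 32 lanes, i.e. the <32 x i8> in the
  // expected CHECK line.
  assert(SrcVecWidth / DstEltWidth == 32);

  // Mask starts at Index * NumElts and covers NumElts lanes: <i32 6, i32 7>.
  assert(Index * NumElts == 6 && Index * NumElts + NumElts - 1 == 7);
  return 0;
}
```

Note that the destination element width (i8), not the source element width (i32) or the wide view's i16, determines the re-bitcast vector type in the expected output.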