[VectorCombine] Allow shuffling with bitcast for not multiple offset for loadsize #119139
base: main
Conversation
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-llvm-transforms

Author: hanbeom (ParkHanbum)

Changes

Previously, vectorization for load-insert failed when the Offset was not a multiple of the Load type size.

This patch allows it in two steps:
1. Vectorize it using a common multiple of Offset and LoadSize.
2. Bitcast to fit.

Alive2: https://alive2.llvm.org/ce/z/Kgr9HQ

Patch is 25.07 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/119139.diff

3 Files Affected:
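For reference, a minimal standalone sketch (not part of the patch) of the offset arithmetic this change performs, using one of the cases from the new tests: an i32 loaded at byte offset 1 and inserted into a <4 x i32>, with a 128-bit minimum vector width. Plain integer types and a std::vector mask stand in for APInt and SmallVector; values are illustrative only.

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Mirrors the patch's MaxCommonDivisor lambda: pick the widest of 4/2/1
  // bytes that evenly divides the unaligned remainder.
  auto MaxCommonDivisor = [](uint64_t N) -> uint64_t {
    if (N % 4 == 0)
      return 4;
    if (N % 2 == 0)
      return 2;
    return 1;
  };

  uint64_t Offset = 1;            // byte offset of the scalar load from SrcPtr
  uint64_t ScalarSizeInBytes = 4; // loading an i32
  uint64_t MinVectorSize = 128;   // minimum vector register width in bits

  uint64_t UnalignedBytes = Offset % ScalarSizeInBytes;          // 1 -> not element-aligned
  uint64_t NewScalarBytes = MaxCommonDivisor(UnalignedBytes);    // shuffle in i8 units
  uint64_t VectorRange = ScalarSizeInBytes / NewScalarBytes;     // 4 i8 lanes cover one i32
  uint64_t MinVecNumElts = MinVectorSize / (NewScalarBytes * 8); // wide load is <16 x i8>
  uint64_t OffsetEltIndex = Offset / NewScalarBytes;             // first lane to keep

  // Build the shuffle mask: lanes 1..4 of the wide load, remaining lanes poison (-1).
  std::vector<int> Mask(MinVecNumElts, -1);
  for (uint64_t I = 0; I < VectorRange; ++I)
    Mask[I] = static_cast<int>(OffsetEltIndex + I);

  std::cout << "shufflevector <16 x i8> mask:";
  for (int M : Mask)
    std::cout << ' ' << M;
  std::cout << "\nthen bitcast the result to <4 x i32>\n";
  return 0;
}
```

The printed mask (1 2 3 4 followed by poison lanes) matches the AVX2 CHECK lines of the gep01_bitcast_load_i32_from_v16i8_insert_v4i32 test in the diff below.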
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index b9caf8c0df9be1..85b9feb6c51e03 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -190,6 +190,15 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
if (!canWidenLoad(Load, TTI))
return false;
+ auto MaxCommonDivisor = [](int n) {
+ if (n % 4 == 0)
+ return 4;
+ if (n % 2 == 0)
+ return 2;
+ else
+ return 1;
+ };
+
Type *ScalarTy = Scalar->getType();
uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
@@ -204,6 +213,8 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
unsigned MinVecNumElts = MinVectorSize / ScalarSize;
auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
unsigned OffsetEltIndex = 0;
+ unsigned VectorRange = 0;
+ bool NeedCast = false;
Align Alignment = Load->getAlign();
if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
&DT)) {
@@ -220,15 +231,27 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
if (Offset.isNegative())
return false;
- // The offset must be a multiple of the scalar element to shuffle cleanly
- // in the element's size.
+ // If Offset is multiple of a Scalar element, it can be shuffled to the
+ // element's size; otherwise, Offset and Scalar must be shuffled to the
+ // appropriate element size for both.
uint64_t ScalarSizeInBytes = ScalarSize / 8;
- if (Offset.urem(ScalarSizeInBytes) != 0)
- return false;
+ if (auto UnalignedBytes = Offset.urem(ScalarSizeInBytes);
+ UnalignedBytes != 0) {
+ uint64_t OldScalarSizeInBytes = ScalarSizeInBytes;
+ // Assign the greatest common divisor between UnalignedBytes and Offset to
+ // ScalarSizeInBytes
+ ScalarSizeInBytes = MaxCommonDivisor(UnalignedBytes);
+ ScalarSize = ScalarSizeInBytes * 8;
+ VectorRange = OldScalarSizeInBytes / ScalarSizeInBytes;
+ MinVecNumElts = MinVectorSize / ScalarSize;
+ ScalarTy = Type::getIntNTy(I.getContext(), ScalarSize);
+ MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
+ NeedCast = true;
+ }
- // If we load MinVecNumElts, will our target element still be loaded?
OffsetEltIndex = Offset.udiv(ScalarSizeInBytes).getZExtValue();
- if (OffsetEltIndex >= MinVecNumElts)
+ // If we load MinVecNumElts, will our target element still be loaded?
+ if (OffsetEltIndex + VectorRange >= MinVecNumElts)
return false;
if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
@@ -246,12 +269,15 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
Type *LoadTy = Load->getType();
unsigned AS = Load->getPointerAddressSpace();
+ auto VecTy = cast<InsertElementInst>(&I)->getType();
+
InstructionCost OldCost =
TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS);
- APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
+ APInt DemandedElts =
+ APInt::getOneBitSet(VecTy->getElementCount().getFixedValue(), 0);
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
OldCost +=
- TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
+ TTI.getScalarizationOverhead(VecTy, DemandedElts,
/* Insert */ true, HasExtract, CostKind);
// New pattern: load VecPtr
@@ -264,14 +290,28 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
// We assume this operation has no cost in codegen if there was no offset.
// Note that we could use freeze to avoid poison problems, but then we might
// still need a shuffle to change the vector size.
- auto *Ty = cast<FixedVectorType>(I.getType());
- unsigned OutputNumElts = Ty->getNumElements();
- SmallVector<int, 16> Mask(OutputNumElts, PoisonMaskElem);
- assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
- Mask[0] = OffsetEltIndex;
+ SmallVector<int> Mask;
+ assert(OffsetEltIndex + VectorRange < MinVecNumElts &&
+ "Address offset too big");
+ if (!NeedCast) {
+ auto *Ty = cast<FixedVectorType>(I.getType());
+ unsigned OutputNumElts = Ty->getNumElements();
+ Mask.assign(OutputNumElts, PoisonMaskElem);
+ Mask[0] = OffsetEltIndex;
+ } else {
+ Mask.assign(MinVecNumElts, PoisonMaskElem);
+ for (unsigned InsertPos = 0; InsertPos < VectorRange; InsertPos++)
+ Mask[InsertPos] = OffsetEltIndex++;
+ }
+
if (OffsetEltIndex)
NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, MinVecTy, Mask);
+ if (NeedCast)
+ NewCost += TTI.getCastInstrCost(Instruction::BitCast, I.getType(), MinVecTy,
+ TargetTransformInfo::CastContextHint::None,
+ TargetTransformInfo::TCK_RecipThroughput);
+
// We can aggressively convert to the vector form because the backend can
// invert this transform if it does not result in a performance win.
if (OldCost < NewCost || !NewCost.isValid())
@@ -280,12 +320,16 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
// It is safe and potentially profitable to load a vector directly:
// inselt undef, load Scalar, 0 --> load VecPtr
IRBuilder<> Builder(Load);
+ Value *Result;
Value *CastedPtr =
Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
- Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
- VecLd = Builder.CreateShuffleVector(VecLd, Mask);
+ Result = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
+ Result = Builder.CreateShuffleVector(Result, Mask);
- replaceValue(I, *VecLd);
+ if (NeedCast)
+ Result = Builder.CreateBitOrPointerCast(Result, I.getType());
+
+ replaceValue(I, *Result);
++NumVecLoad;
return true;
}
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
index 937d4043adc0c4..5c3615cffd8e43 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
@@ -302,23 +302,133 @@ define <8 x i16> @gep01_load_i16_insert_v8i16_deref_minalign(ptr align 2 derefer
ret <8 x i16> %r
}
-; Negative test - if we are shuffling a load from the base pointer, the address offset
-; must be a multiple of element size.
-; TODO: Could bitcast around this limitation.
+define <4 x i32> @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
+; SSE2-LABEL: @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(
+; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
+; SSE2-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
+; SSE2-NEXT: [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
+; SSE2-NEXT: ret <4 x i32> [[R]]
+;
+; AVX2-LABEL: @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(
+; AVX2-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[P:%.*]], align 1
+; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT: [[R:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+; AVX2-NEXT: ret <4 x i32> [[R]]
+;
+ %gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 1
+ %s = load i32, ptr %gep, align 1
+ %r = insertelement <4 x i32> poison, i32 %s, i64 0
+ ret <4 x i32> %r
+}
-define <4 x i32> @gep01_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceable(16) %p) nofree nosync {
-; CHECK-LABEL: @gep01_bitcast_load_i32_insert_v4i32(
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
-; CHECK-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
-; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
-; CHECK-NEXT: ret <4 x i32> [[R]]
+define <2 x i64> @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(ptr align 1 dereferenceable(16) %p) {
+; SSE2-LABEL: @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(
+; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
+; SSE2-NEXT: [[S:%.*]] = load i64, ptr [[GEP]], align 1
+; SSE2-NEXT: [[R:%.*]] = insertelement <2 x i64> poison, i64 [[S]], i64 0
+; SSE2-NEXT: ret <2 x i64> [[R]]
+;
+; AVX2-LABEL: @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(
+; AVX2-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[P:%.*]], align 1
+; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT: [[R:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+; AVX2-NEXT: ret <2 x i64> [[R]]
;
%gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 1
+ %s = load i64, ptr %gep, align 1
+ %r = insertelement <2 x i64> poison, i64 %s, i64 0
+ ret <2 x i64> %r
+}
+
+define <4 x i32> @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
+; SSE2-LABEL: @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(
+; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 11
+; SSE2-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
+; SSE2-NEXT: [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
+; SSE2-NEXT: ret <4 x i32> [[R]]
+;
+; AVX2-LABEL: @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(
+; AVX2-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[P:%.*]], align 1
+; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT: [[R:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+; AVX2-NEXT: ret <4 x i32> [[R]]
+;
+ %gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 11
+ %s = load i32, ptr %gep, align 1
+ %r = insertelement <4 x i32> poison, i32 %s, i64 0
+ ret <4 x i32> %r
+}
+
+define <4 x i32> @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
+; SSE2-LABEL: @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(
+; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
+; SSE2-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
+; SSE2-NEXT: [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
+; SSE2-NEXT: ret <4 x i32> [[R]]
+;
+; AVX2-LABEL: @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(
+; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 1
+; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 1, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT: [[R:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32>
+; AVX2-NEXT: ret <4 x i32> [[R]]
+;
+ %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
+ %s = load i32, ptr %gep, align 1
+ %r = insertelement <4 x i32> poison, i32 %s, i64 0
+ ret <4 x i32> %r
+}
+
+define <2 x i64> @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(ptr align 1 dereferenceable(16) %p) {
+; SSE2-LABEL: @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(
+; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
+; SSE2-NEXT: [[S:%.*]] = load i64, ptr [[GEP]], align 1
+; SSE2-NEXT: [[R:%.*]] = insertelement <2 x i64> poison, i64 [[S]], i64 0
+; SSE2-NEXT: ret <2 x i64> [[R]]
+;
+; AVX2-LABEL: @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(
+; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 1
+; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT: [[R:%.*]] = bitcast <8 x i16> [[TMP2]] to <2 x i64>
+; AVX2-NEXT: ret <2 x i64> [[R]]
+;
+ %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
+ %s = load i64, ptr %gep, align 1
+ %r = insertelement <2 x i64> poison, i64 %s, i64 0
+ ret <2 x i64> %r
+}
+
+define <4 x i32> @gep05_bitcast_load_i32_from_v8i16_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
+; SSE2-LABEL: @gep05_bitcast_load_i32_from_v8i16_insert_v4i32(
+; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 5
+; SSE2-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
+; SSE2-NEXT: [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
+; SSE2-NEXT: ret <4 x i32> [[R]]
+;
+; AVX2-LABEL: @gep05_bitcast_load_i32_from_v8i16_insert_v4i32(
+; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 1
+; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 5, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT: [[R:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32>
+; AVX2-NEXT: ret <4 x i32> [[R]]
+;
+ %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 5
%s = load i32, ptr %gep, align 1
%r = insertelement <4 x i32> poison, i32 %s, i64 0
ret <4 x i32> %r
}
+define <2 x i64> @gep01_bitcast_load_i32_from_v4i32_insert_v2i64(ptr align 1 dereferenceable(16) %p) {
+; CHECK-LABEL: @gep01_bitcast_load_i32_from_v4i32_insert_v2i64(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 1
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
+; CHECK-NEXT: [[R:%.*]] = bitcast <4 x i32> [[TMP2]] to <2 x i64>
+; CHECK-NEXT: ret <2 x i64> [[R]]
+;
+ %gep = getelementptr inbounds <4 x i32>, ptr %p, i64 0, i64 1
+ %s = load i64, ptr %gep, align 1
+ %r = insertelement <2 x i64> poison, i64 %s, i64 0
+ ret <2 x i64> %r
+}
+
define <4 x i32> @gep012_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceable(20) %p) nofree nosync {
; CHECK-LABEL: @gep012_bitcast_load_i32_insert_v4i32(
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 1
diff --git a/llvm/test/Transforms/VectorCombine/X86/load.ll b/llvm/test/Transforms/VectorCombine/X86/load.ll
index bdd05a1a37c70f..994ef1f9c66d89 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load.ll
@@ -285,23 +285,133 @@ define <8 x i16> @gep01_load_i16_insert_v8i16_deref_minalign(ptr align 2 derefer
ret <8 x i16> %r
}
-; Negative test - if we are shuffling a load from the base pointer, the address offset
-; must be a multiple of element size.
-; TODO: Could bitcast around this limitation.
+define <4 x i32> @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
+; SSE2-LABEL: @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(
+; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
+; SSE2-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
+; SSE2-NEXT: [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i64 0
+; SSE2-NEXT: ret <4 x i32> [[R]]
+;
+; AVX2-LABEL: @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(
+; AVX2-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[P:%.*]], align 1
+; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT: [[R:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+; AVX2-NEXT: ret <4 x i32> [[R]]
+;
+ %gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 1
+ %s = load i32, ptr %gep, align 1
+ %r = insertelement <4 x i32> undef, i32 %s, i64 0
+ ret <4 x i32> %r
+}
-define <4 x i32> @gep01_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
-; CHECK-LABEL: @gep01_bitcast_load_i32_insert_v4i32(
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
-; CHECK-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
-; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i64 0
-; CHECK-NEXT: ret <4 x i32> [[R]]
+define <2 x i64> @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(ptr align 1 dereferenceable(16) %p) {
+; SSE2-LABEL: @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(
+; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
+; SSE2-NEXT: [[S:%.*]] = load i64, ptr [[GEP]], align 1
+; SSE2-NEXT: [[R:%.*]] = insertelement <2 x i64> undef, i64 [[S]], i64 0
+; SSE2-NEXT: ret <2 x i64> [[R]]
+;
+; AVX2-LABEL: @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(
+; AVX2-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[P:%.*]], align 1
+; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT: [[R:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+; AVX2-NEXT: ret <2 x i64> [[R]]
;
%gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 1
+ %s = load i64, ptr %gep, align 1
+ %r = insertelement <2 x i64> undef, i64 %s, i64 0
+ ret <2 x i64> %r
+}
+
+define <4 x i32> @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
+; SSE2-LABEL: @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(
+; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 11
+; SSE2-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
+; SSE2-NEXT: [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i64 0
+; SSE2-NEXT: ret <4 x i32> [[R]]
+;
+; AVX2-LABEL: @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(
+; AVX2-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[P:%.*]], align 1
+; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT: [[R:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+; AVX2-NEXT: ret <4 x i32> [[R]]
+;
+ %gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 11
%s = load i32, ptr %gep, align 1
%r = insertelement <4 x i32> undef, i32 %s, i64 0
ret <4 x i32> %r
}
+define <4 x i32> @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
+; SSE2-LABEL: @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(
+; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
+; SSE2-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
+; SSE2-NEXT: [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i64 0
+; SSE2-NEXT: ret <4 x i32> [[R]]
+;
+; AVX2-LABEL: @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(
+; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 1
+; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 1, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT: [[R:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32>
+; AVX2-NEXT: ret <4 x i32> [[R]]
+;
+ %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
+ %s = load i32, ptr %gep, align 1
+ %r = insertelement <4 x i32> undef, i32 %s, i64 0
+ ret <4 x i32> %r
+}
+
+define <2 x i64> @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(ptr align 1 dereferenceable(16) %p) {
+; SSE2-LABEL: @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(
+; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
+; SSE2-NEXT: [[S:%.*]] = load i64, ptr [[GEP]], align 1
+; SSE2-NEXT: [[R:%.*]] = insertelement <2 x i64> undef, i64 [[S]], i64 0
+; SSE2-NEXT: ret <2 x i64> [[R]]
+;
+; AVX2-LABEL: @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(
+; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 1
+; AVX2-NE...
[truncated]
Force-pushed from 9d438e1 to ecc814b
@RKSimon could you review this patch please?
Does this work with big endian?
@RKSimon Probably not. Is it possible to separate the logic according to endianness?
Yes - use |
Force-pushed from ecc814b to 4af373f
✅ With the latest revision this PR passed the undef deprecator.
@RKSimon This time, it's a little difficult, so I'm asking you a question.
Force-pushed from 0e72791 to 9bfc721
  SmallVector<int> Mask;
  assert(OffsetEltIndex + VectorRange < MinVecNumElts &&
         "Address offset too big");
  if (!NeedCast) {
(style) Use if (NeedCast) {} else {} ordering
  if (Offset.urem(ScalarSizeInBytes) != 0)
    return false;
  if (auto UnalignedBytes = Offset.urem(ScalarSizeInBytes);
      UnalignedBytes != 0) {
Why not this?
if (uint64_t UnalignedBytes = Offset.urem(ScalarSizeInBytes)) {
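For illustration, a small standalone example of the suggested C++17 if-with-initializer form, with % on plain integers standing in for APInt::urem: the declared variable is scoped to the if/else, and the branch is taken only when the remainder is non-zero.

```cpp
#include <cstdint>
#include <iostream>

int main() {
  uint64_t Offset = 11;           // e.g. the gep11 test case
  uint64_t ScalarSizeInBytes = 4; // i32

  if (uint64_t UnalignedBytes = Offset % ScalarSizeInBytes) {
    // Taken when the offset is not a multiple of the element size.
    std::cout << "unaligned by " << UnalignedBytes << " bytes\n"; // prints 3
  } else {
    std::cout << "offset is element-aligned\n";
  }
  return 0;
}
```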
    Mask[0] = OffsetEltIndex;
  } else {
    Mask.assign(MinVecNumElts, PoisonMaskElem);
    for (unsigned InsertPos = 0; InsertPos < VectorRange; InsertPos++)
std::iota(Mask.begin(), Mask.begin() + VectorRange, OffsetEltIndex);
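A small standalone check that the suggested std::iota call fills the same mask as the patch's loop; std::vector stands in for SmallVector and -1 for PoisonMaskElem.

```cpp
#include <cassert>
#include <numeric>
#include <vector>

int main() {
  const int PoisonMaskElem = -1;
  unsigned MinVecNumElts = 16, VectorRange = 4, OffsetEltIndex = 1;

  // Loop form from the patch.
  std::vector<int> Loop(MinVecNumElts, PoisonMaskElem);
  unsigned Idx = OffsetEltIndex;
  for (unsigned InsertPos = 0; InsertPos < VectorRange; InsertPos++)
    Loop[InsertPos] = Idx++;

  // Suggested std::iota form.
  std::vector<int> Iota(MinVecNumElts, PoisonMaskElem);
  std::iota(Iota.begin(), Iota.begin() + VectorRange, OffsetEltIndex);

  assert(Loop == Iota); // both yield 1 2 3 4 -1 ... -1
  return 0;
}
```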
  Worklist.pushValue(Result);
  if (NeedCast) {
    Result = Builder.CreateBitOrPointerCast(Result, I.getType());
    Worklist.pushValue(Result);
replaceValue should handle WorkList additions - you just need to add the instructions you created earlier
@ParkHanbum I'm sorry but this fell off my radar - please can you merge to latest on trunk?
@RKSimon I've never done that, so I'm a little worried. Can we try it first in another PR with one commit?
In many circumstances you can just use the "Update branch" button on this webpage - I've used it to merge to latest, and it restarts the CI as well.
@RKSimon Ah! That's what you meant; I misunderstood it as something else.
@RKSimon Would you be able to review this PR when you have time?
…for loadsize

Previously, vectorization for load-insert failed when the Offset was not a multiple of the Load type size.

This patch allows it in two steps:
1. Vectorize it using a common multiple of Offset and LoadSize.
2. Bitcast to fit.

Alive2: https://alive2.llvm.org/ce/z/Kgr9HQ
replaceValue adds the new instruction to the worklist internally, so there is no need to push it to the worklist separately.
Force-pushed from cfae42a to 772eb2f
    Mask.assign(OutputNumElts, PoisonMaskElem);
    Mask[0] = OffsetEltIndex;
  }

  if (OffsetEltIndex)
    NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, MinVecTy, Mask,
llvm-project/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp:351:61: error: use of undeclared identifier 'Ty'
I apologize. I thought nothing had changed when I requested the review, but there were changes, and I'm currently addressing this issue.
✅ With the latest revision this PR passed the C/C++ code formatter.
Force-pushed from 8c8013c to 7a3a634
Building on Linux failed, but I can't figure out why.
If you need a cast - shouldn't there be different vector types?
I will explain based on the following IR.
+define <2 x i64> @gep01_bitcast_load_i32_from_v4i32_insert_v2i64(ptr align 1 dereferenceable(16) %p) {
; CHECK-LABEL: @gep01_bitcast_load_i32_from_v4i32_insert_v2i64(
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <4 x i32>, ptr [[P:%.*]], i64 0, i64 1
-; CHECK-NEXT: [[S:%.*]] = load i64, ptr [[GEP]], align 1
-; CHECK-NEXT: [[R:%.*]] = insertelement <2 x i64> poison, i64 [[S]], i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 1
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
+; CHECK-NEXT: [[R:%.*]] = bitcast <4 x i32> [[TMP2]] to <2 x i64>
; CHECK-NEXT: ret <2 x i64> [[R]]
;
We first create a vector identical to the one the original code builds up to the insertelement, by performing a self-shuffle.
After that, a bitcast is used to match the type returned by the original code.
So when a bitcast is required, the insertelement is replaced by a self-shuffle, and in that case the source and destination types of the shuffle are the same MinVecTy.
So why do you need different getShuffleCost calls for if (NeedCast) {} else {}?
define <2 x i32> @gep012_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceable(20) %p) nofree nosync {
%gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 12
%s = load i32, ptr %gep, align 1
%r = insertelement <2 x i32> poison, i32 %s, i64 0
ret <2 x i32> %r
}
The above IR is optimized as follows.
; Function Attrs: nofree nosync
define <2 x i32> @gep012_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceable(20) %p) #0 {
%1 = load <4 x i32>, ptr %p, align 1
%r = shufflevector <4 x i32> %1, <4 x i32> poison, <2 x i32> <i32 3, i32 poison>
ret <2 x i32> %r
}
Here, Ty is <2 x i32> and MinVecTy is <4 x i32>, so shouldn't they be distinguished?
@RKSimon Thank you. I probably wouldn't have found it if you hadn't told me.
Previously, vectorization for load-insert failed when the Offset was not a multiple of the Load type size.

This patch allows it in two steps:
1. Vectorize it using a common multiple of Offset and LoadSize.
2. Bitcast to fit.

Alive2: https://alive2.llvm.org/ce/z/Kgr9HQ