[VectorCombine] Allow shuffling with bitcast for not multiple offset for loadsize #119139
base: main
Conversation
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-llvm-transforms

Author: hanbeom (ParkHanbum)

Changes

Previously, vectorization for load-insert failed when the Offset was not a multiple of the Load type size.

This patch allows it in two steps:
1. Vectorize it using a common multiple of Offset and LoadSize.
2. Bitcast to fit.

Alive2: https://alive2.llvm.org/ce/z/Kgr9HQ

Patch is 25.07 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/119139.diff

3 Files Affected:
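For reference, a minimal standalone sketch (not part of the patch) of the offset arithmetic this change performs, using one of the cases from the new tests: an i32 loaded at byte offset 1 and inserted into a <4 x i32>, with a 128-bit minimum vector width. Plain integer types and a std::vector mask stand in for APInt and SmallVector; values are illustrative only.

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Mirrors the patch's MaxCommonDivisor lambda: pick the widest of 4/2/1
  // bytes that evenly divides the unaligned remainder.
  auto MaxCommonDivisor = [](uint64_t N) -> uint64_t {
    if (N % 4 == 0)
      return 4;
    if (N % 2 == 0)
      return 2;
    return 1;
  };

  uint64_t Offset = 1;            // byte offset of the scalar load from SrcPtr
  uint64_t ScalarSizeInBytes = 4; // loading an i32
  uint64_t MinVectorSize = 128;   // minimum vector register width in bits

  uint64_t UnalignedBytes = Offset % ScalarSizeInBytes;          // 1 -> not element-aligned
  uint64_t NewScalarBytes = MaxCommonDivisor(UnalignedBytes);    // shuffle in i8 units
  uint64_t VectorRange = ScalarSizeInBytes / NewScalarBytes;     // 4 i8 lanes cover one i32
  uint64_t MinVecNumElts = MinVectorSize / (NewScalarBytes * 8); // wide load is <16 x i8>
  uint64_t OffsetEltIndex = Offset / NewScalarBytes;             // first lane to keep

  // Build the shuffle mask: lanes 1..4 of the wide load, remaining lanes poison (-1).
  std::vector<int> Mask(MinVecNumElts, -1);
  for (uint64_t I = 0; I < VectorRange; ++I)
    Mask[I] = static_cast<int>(OffsetEltIndex + I);

  std::cout << "shufflevector <16 x i8> mask:";
  for (int M : Mask)
    std::cout << ' ' << M;
  std::cout << "\nthen bitcast the result to <4 x i32>\n";
  return 0;
}
```

The printed mask (1 2 3 4 followed by poison lanes) matches the AVX2 CHECK lines of the gep01_bitcast_load_i32_from_v16i8_insert_v4i32 test in the diff below.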
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index b9caf8c0df9be1..85b9feb6c51e03 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -190,6 +190,15 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
if (!canWidenLoad(Load, TTI))
return false;
+ auto MaxCommonDivisor = [](int n) {
+ if (n % 4 == 0)
+ return 4;
+ if (n % 2 == 0)
+ return 2;
+ else
+ return 1;
+ };
+
Type *ScalarTy = Scalar->getType();
uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
@@ -204,6 +213,8 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
unsigned MinVecNumElts = MinVectorSize / ScalarSize;
auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
unsigned OffsetEltIndex = 0;
+ unsigned VectorRange = 0;
+ bool NeedCast = false;
Align Alignment = Load->getAlign();
if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
&DT)) {
@@ -220,15 +231,27 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
if (Offset.isNegative())
return false;
- // The offset must be a multiple of the scalar element to shuffle cleanly
- // in the element's size.
+ // If Offset is multiple of a Scalar element, it can be shuffled to the
+ // element's size; otherwise, Offset and Scalar must be shuffled to the
+ // appropriate element size for both.
uint64_t ScalarSizeInBytes = ScalarSize / 8;
- if (Offset.urem(ScalarSizeInBytes) != 0)
- return false;
+ if (auto UnalignedBytes = Offset.urem(ScalarSizeInBytes);
+ UnalignedBytes != 0) {
+ uint64_t OldScalarSizeInBytes = ScalarSizeInBytes;
+ // Assign the greatest common divisor between UnalignedBytes and Offset to
+ // ScalarSizeInBytes
+ ScalarSizeInBytes = MaxCommonDivisor(UnalignedBytes);
+ ScalarSize = ScalarSizeInBytes * 8;
+ VectorRange = OldScalarSizeInBytes / ScalarSizeInBytes;
+ MinVecNumElts = MinVectorSize / ScalarSize;
+ ScalarTy = Type::getIntNTy(I.getContext(), ScalarSize);
+ MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
+ NeedCast = true;
+ }
- // If we load MinVecNumElts, will our target element still be loaded?
OffsetEltIndex = Offset.udiv(ScalarSizeInBytes).getZExtValue();
- if (OffsetEltIndex >= MinVecNumElts)
+ // If we load MinVecNumElts, will our target element still be loaded?
+ if (OffsetEltIndex + VectorRange >= MinVecNumElts)
return false;
if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
@@ -246,12 +269,15 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
Type *LoadTy = Load->getType();
unsigned AS = Load->getPointerAddressSpace();
+ auto VecTy = cast<InsertElementInst>(&I)->getType();
+
InstructionCost OldCost =
TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS);
- APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
+ APInt DemandedElts =
+ APInt::getOneBitSet(VecTy->getElementCount().getFixedValue(), 0);
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
OldCost +=
- TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
+ TTI.getScalarizationOverhead(VecTy, DemandedElts,
/* Insert */ true, HasExtract, CostKind);
// New pattern: load VecPtr
@@ -264,14 +290,28 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
// We assume this operation has no cost in codegen if there was no offset.
// Note that we could use freeze to avoid poison problems, but then we might
// still need a shuffle to change the vector size.
- auto *Ty = cast<FixedVectorType>(I.getType());
- unsigned OutputNumElts = Ty->getNumElements();
- SmallVector<int, 16> Mask(OutputNumElts, PoisonMaskElem);
- assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
- Mask[0] = OffsetEltIndex;
+ SmallVector<int> Mask;
+ assert(OffsetEltIndex + VectorRange < MinVecNumElts &&
+ "Address offset too big");
+ if (!NeedCast) {
+ auto *Ty = cast<FixedVectorType>(I.getType());
+ unsigned OutputNumElts = Ty->getNumElements();
+ Mask.assign(OutputNumElts, PoisonMaskElem);
+ Mask[0] = OffsetEltIndex;
+ } else {
+ Mask.assign(MinVecNumElts, PoisonMaskElem);
+ for (unsigned InsertPos = 0; InsertPos < VectorRange; InsertPos++)
+ Mask[InsertPos] = OffsetEltIndex++;
+ }
+
if (OffsetEltIndex)
NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, MinVecTy, Mask);
+ if (NeedCast)
+ NewCost += TTI.getCastInstrCost(Instruction::BitCast, I.getType(), MinVecTy,
+ TargetTransformInfo::CastContextHint::None,
+ TargetTransformInfo::TCK_RecipThroughput);
+
// We can aggressively convert to the vector form because the backend can
// invert this transform if it does not result in a performance win.
if (OldCost < NewCost || !NewCost.isValid())
@@ -280,12 +320,16 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
// It is safe and potentially profitable to load a vector directly:
// inselt undef, load Scalar, 0 --> load VecPtr
IRBuilder<> Builder(Load);
+ Value *Result;
Value *CastedPtr =
Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
- Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
- VecLd = Builder.CreateShuffleVector(VecLd, Mask);
+ Result = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
+ Result = Builder.CreateShuffleVector(Result, Mask);
- replaceValue(I, *VecLd);
+ if (NeedCast)
+ Result = Builder.CreateBitOrPointerCast(Result, I.getType());
+
+ replaceValue(I, *Result);
++NumVecLoad;
return true;
}
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
index 937d4043adc0c4..5c3615cffd8e43 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
@@ -302,23 +302,133 @@ define <8 x i16> @gep01_load_i16_insert_v8i16_deref_minalign(ptr align 2 derefer
ret <8 x i16> %r
}
-; Negative test - if we are shuffling a load from the base pointer, the address offset
-; must be a multiple of element size.
-; TODO: Could bitcast around this limitation.
+define <4 x i32> @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
+; SSE2-LABEL: @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(
+; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
+; SSE2-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
+; SSE2-NEXT: [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
+; SSE2-NEXT: ret <4 x i32> [[R]]
+;
+; AVX2-LABEL: @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(
+; AVX2-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[P:%.*]], align 1
+; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT: [[R:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+; AVX2-NEXT: ret <4 x i32> [[R]]
+;
+ %gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 1
+ %s = load i32, ptr %gep, align 1
+ %r = insertelement <4 x i32> poison, i32 %s, i64 0
+ ret <4 x i32> %r
+}
-define <4 x i32> @gep01_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceable(16) %p) nofree nosync {
-; CHECK-LABEL: @gep01_bitcast_load_i32_insert_v4i32(
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
-; CHECK-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
-; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
-; CHECK-NEXT: ret <4 x i32> [[R]]
+define <2 x i64> @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(ptr align 1 dereferenceable(16) %p) {
+; SSE2-LABEL: @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(
+; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
+; SSE2-NEXT: [[S:%.*]] = load i64, ptr [[GEP]], align 1
+; SSE2-NEXT: [[R:%.*]] = insertelement <2 x i64> poison, i64 [[S]], i64 0
+; SSE2-NEXT: ret <2 x i64> [[R]]
+;
+; AVX2-LABEL: @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(
+; AVX2-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[P:%.*]], align 1
+; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT: [[R:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+; AVX2-NEXT: ret <2 x i64> [[R]]
;
%gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 1
+ %s = load i64, ptr %gep, align 1
+ %r = insertelement <2 x i64> poison, i64 %s, i64 0
+ ret <2 x i64> %r
+}
+
+define <4 x i32> @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
+; SSE2-LABEL: @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(
+; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 11
+; SSE2-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
+; SSE2-NEXT: [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
+; SSE2-NEXT: ret <4 x i32> [[R]]
+;
+; AVX2-LABEL: @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(
+; AVX2-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[P:%.*]], align 1
+; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT: [[R:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+; AVX2-NEXT: ret <4 x i32> [[R]]
+;
+ %gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 11
+ %s = load i32, ptr %gep, align 1
+ %r = insertelement <4 x i32> poison, i32 %s, i64 0
+ ret <4 x i32> %r
+}
+
+define <4 x i32> @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
+; SSE2-LABEL: @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(
+; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
+; SSE2-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
+; SSE2-NEXT: [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
+; SSE2-NEXT: ret <4 x i32> [[R]]
+;
+; AVX2-LABEL: @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(
+; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 1
+; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 1, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT: [[R:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32>
+; AVX2-NEXT: ret <4 x i32> [[R]]
+;
+ %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
+ %s = load i32, ptr %gep, align 1
+ %r = insertelement <4 x i32> poison, i32 %s, i64 0
+ ret <4 x i32> %r
+}
+
+define <2 x i64> @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(ptr align 1 dereferenceable(16) %p) {
+; SSE2-LABEL: @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(
+; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
+; SSE2-NEXT: [[S:%.*]] = load i64, ptr [[GEP]], align 1
+; SSE2-NEXT: [[R:%.*]] = insertelement <2 x i64> poison, i64 [[S]], i64 0
+; SSE2-NEXT: ret <2 x i64> [[R]]
+;
+; AVX2-LABEL: @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(
+; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 1
+; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT: [[R:%.*]] = bitcast <8 x i16> [[TMP2]] to <2 x i64>
+; AVX2-NEXT: ret <2 x i64> [[R]]
+;
+ %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
+ %s = load i64, ptr %gep, align 1
+ %r = insertelement <2 x i64> poison, i64 %s, i64 0
+ ret <2 x i64> %r
+}
+
+define <4 x i32> @gep05_bitcast_load_i32_from_v8i16_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
+; SSE2-LABEL: @gep05_bitcast_load_i32_from_v8i16_insert_v4i32(
+; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 5
+; SSE2-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
+; SSE2-NEXT: [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
+; SSE2-NEXT: ret <4 x i32> [[R]]
+;
+; AVX2-LABEL: @gep05_bitcast_load_i32_from_v8i16_insert_v4i32(
+; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 1
+; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 5, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT: [[R:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32>
+; AVX2-NEXT: ret <4 x i32> [[R]]
+;
+ %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 5
%s = load i32, ptr %gep, align 1
%r = insertelement <4 x i32> poison, i32 %s, i64 0
ret <4 x i32> %r
}
+define <2 x i64> @gep01_bitcast_load_i32_from_v4i32_insert_v2i64(ptr align 1 dereferenceable(16) %p) {
+; CHECK-LABEL: @gep01_bitcast_load_i32_from_v4i32_insert_v2i64(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 1
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
+; CHECK-NEXT: [[R:%.*]] = bitcast <4 x i32> [[TMP2]] to <2 x i64>
+; CHECK-NEXT: ret <2 x i64> [[R]]
+;
+ %gep = getelementptr inbounds <4 x i32>, ptr %p, i64 0, i64 1
+ %s = load i64, ptr %gep, align 1
+ %r = insertelement <2 x i64> poison, i64 %s, i64 0
+ ret <2 x i64> %r
+}
+
define <4 x i32> @gep012_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceable(20) %p) nofree nosync {
; CHECK-LABEL: @gep012_bitcast_load_i32_insert_v4i32(
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 1
diff --git a/llvm/test/Transforms/VectorCombine/X86/load.ll b/llvm/test/Transforms/VectorCombine/X86/load.ll
index bdd05a1a37c70f..994ef1f9c66d89 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load.ll
@@ -285,23 +285,133 @@ define <8 x i16> @gep01_load_i16_insert_v8i16_deref_minalign(ptr align 2 derefer
ret <8 x i16> %r
}
-; Negative test - if we are shuffling a load from the base pointer, the address offset
-; must be a multiple of element size.
-; TODO: Could bitcast around this limitation.
+define <4 x i32> @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
+; SSE2-LABEL: @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(
+; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
+; SSE2-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
+; SSE2-NEXT: [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i64 0
+; SSE2-NEXT: ret <4 x i32> [[R]]
+;
+; AVX2-LABEL: @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(
+; AVX2-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[P:%.*]], align 1
+; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT: [[R:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+; AVX2-NEXT: ret <4 x i32> [[R]]
+;
+ %gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 1
+ %s = load i32, ptr %gep, align 1
+ %r = insertelement <4 x i32> undef, i32 %s, i64 0
+ ret <4 x i32> %r
+}
-define <4 x i32> @gep01_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
-; CHECK-LABEL: @gep01_bitcast_load_i32_insert_v4i32(
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
-; CHECK-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
-; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i64 0
-; CHECK-NEXT: ret <4 x i32> [[R]]
+define <2 x i64> @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(ptr align 1 dereferenceable(16) %p) {
+; SSE2-LABEL: @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(
+; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
+; SSE2-NEXT: [[S:%.*]] = load i64, ptr [[GEP]], align 1
+; SSE2-NEXT: [[R:%.*]] = insertelement <2 x i64> undef, i64 [[S]], i64 0
+; SSE2-NEXT: ret <2 x i64> [[R]]
+;
+; AVX2-LABEL: @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(
+; AVX2-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[P:%.*]], align 1
+; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT: [[R:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+; AVX2-NEXT: ret <2 x i64> [[R]]
;
%gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 1
+ %s = load i64, ptr %gep, align 1
+ %r = insertelement <2 x i64> undef, i64 %s, i64 0
+ ret <2 x i64> %r
+}
+
+define <4 x i32> @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
+; SSE2-LABEL: @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(
+; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 11
+; SSE2-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
+; SSE2-NEXT: [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i64 0
+; SSE2-NEXT: ret <4 x i32> [[R]]
+;
+; AVX2-LABEL: @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(
+; AVX2-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[P:%.*]], align 1
+; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT: [[R:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+; AVX2-NEXT: ret <4 x i32> [[R]]
+;
+ %gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 11
%s = load i32, ptr %gep, align 1
%r = insertelement <4 x i32> undef, i32 %s, i64 0
ret <4 x i32> %r
}
+define <4 x i32> @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
+; SSE2-LABEL: @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(
+; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
+; SSE2-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
+; SSE2-NEXT: [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i64 0
+; SSE2-NEXT: ret <4 x i32> [[R]]
+;
+; AVX2-LABEL: @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(
+; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 1
+; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 1, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT: [[R:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32>
+; AVX2-NEXT: ret <4 x i32> [[R]]
+;
+ %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
+ %s = load i32, ptr %gep, align 1
+ %r = insertelement <4 x i32> undef, i32 %s, i64 0
+ ret <4 x i32> %r
+}
+
+define <2 x i64> @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(ptr align 1 dereferenceable(16) %p) {
+; SSE2-LABEL: @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(
+; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
+; SSE2-NEXT: [[S:%.*]] = load i64, ptr [[GEP]], align 1
+; SSE2-NEXT: [[R:%.*]] = insertelement <2 x i64> undef, i64 [[S]], i64 0
+; SSE2-NEXT: ret <2 x i64> [[R]]
+;
+; AVX2-LABEL: @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(
+; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 1
+; AVX2-NE...
[truncated]
Force-pushed from 9d438e1 to ecc814b
@RKSimon could you review this patch please?
Does this work with big endian?
@RKSimon Probably not. Is it possible to separate the logic according to endianness?
Yes - use |
Force-pushed from ecc814b to 4af373f
✅ With the latest revision this PR passed the undef deprecator.
@RKSimon This time, it's a little difficult, so I'm asking you a question.
Force-pushed from 0e72791 to 9bfc721
  SmallVector<int> Mask;
  assert(OffsetEltIndex + VectorRange < MinVecNumElts &&
         "Address offset too big");
  if (!NeedCast) {
(style) Use if (NeedCast) {} else {} ordering
  if (Offset.urem(ScalarSizeInBytes) != 0)
    return false;
  if (auto UnalignedBytes = Offset.urem(ScalarSizeInBytes);
      UnalignedBytes != 0) {
Why not this?
if (uint64_t UnalignedBytes = Offset.urem(ScalarSizeInBytes)) {
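For illustration, a small standalone example of the suggested C++17 if-with-initializer form, with % on plain integers standing in for APInt::urem: the declared variable is scoped to the if/else, and the branch is taken only when the remainder is non-zero.

```cpp
#include <cstdint>
#include <iostream>

int main() {
  uint64_t Offset = 11;           // e.g. the gep11 test case
  uint64_t ScalarSizeInBytes = 4; // i32

  if (uint64_t UnalignedBytes = Offset % ScalarSizeInBytes) {
    // Taken when the offset is not a multiple of the element size.
    std::cout << "unaligned by " << UnalignedBytes << " bytes\n"; // prints 3
  } else {
    std::cout << "offset is element-aligned\n";
  }
  return 0;
}
```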
    Mask[0] = OffsetEltIndex;
  } else {
    Mask.assign(MinVecNumElts, PoisonMaskElem);
    for (unsigned InsertPos = 0; InsertPos < VectorRange; InsertPos++)
std::iota(Mask.begin(), Mask.begin() + VectorRange, OffsetEltIndex);
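A small standalone check that the suggested std::iota call fills the same mask as the patch's loop; std::vector stands in for SmallVector and -1 for PoisonMaskElem.

```cpp
#include <cassert>
#include <numeric>
#include <vector>

int main() {
  const int PoisonMaskElem = -1;
  unsigned MinVecNumElts = 16, VectorRange = 4, OffsetEltIndex = 1;

  // Loop form from the patch.
  std::vector<int> Loop(MinVecNumElts, PoisonMaskElem);
  unsigned Idx = OffsetEltIndex;
  for (unsigned InsertPos = 0; InsertPos < VectorRange; InsertPos++)
    Loop[InsertPos] = Idx++;

  // Suggested std::iota form.
  std::vector<int> Iota(MinVecNumElts, PoisonMaskElem);
  std::iota(Iota.begin(), Iota.begin() + VectorRange, OffsetEltIndex);

  assert(Loop == Iota); // both yield 1 2 3 4 -1 ... -1
  return 0;
}
```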
  Worklist.pushValue(Result);
  if (NeedCast) {
    Result = Builder.CreateBitOrPointerCast(Result, I.getType());
    Worklist.pushValue(Result);
replaceValue should handle WorkList additions - you just need to add the instructions you created earlier
@ParkHanbum I'm sorry but this fell off my radar - please can you merge to latest on trunk?
@RKSimon I've never done that, so I'm a little worried. Can we try it first in another PR with one commit?
In many circumstances you can just use the "Update branch" button on this webpage - I've used it to merge to latest, and it restarts the CI as well.
@RKSimon Ah! That's what you meant; I misunderstood it as something else.
@RKSimon Would you be able to review this PR when you have time?
…for loadsize

Previously, vectorization for load-insert failed when the Offset was not a multiple of the Load type size.

This patch allows it in two steps:
1. Vectorize it using a common multiple of Offset and LoadSize.
2. Bitcast to fit.

Alive2: https://alive2.llvm.org/ce/z/Kgr9HQ
replaceValue adds the new instruction to the worklist internally, so there is no need to push it to the worklist separately.
Force-pushed from cfae42a to 772eb2f
    Mask.assign(OutputNumElts, PoisonMaskElem);
    Mask[0] = OffsetEltIndex;
  }

  if (OffsetEltIndex)
    NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, MinVecTy, Mask,
llvm-project/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp:351:61: error: use of undeclared identifier 'Ty'
I apologize. I thought nothing had changed when I requested the review, but there were changes, and I'm currently addressing this issue.
✅ With the latest revision this PR passed the C/C++ code formatter.
Force-pushed from 8c8013c to 7a3a634
Building on Linux failed, but I can't figure out why.
If you need a cast - shouldn't there be different vector types?
I will explain based on the following IR.
+define <2 x i64> @gep01_bitcast_load_i32_from_v4i32_insert_v2i64(ptr align 1 dereferenceable(16) %p) {
; CHECK-LABEL: @gep01_bitcast_load_i32_from_v4i32_insert_v2i64(
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <4 x i32>, ptr [[P:%.*]], i64 0, i64 1
-; CHECK-NEXT: [[S:%.*]] = load i64, ptr [[GEP]], align 1
-; CHECK-NEXT: [[R:%.*]] = insertelement <2 x i64> poison, i64 [[S]], i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 1
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
+; CHECK-NEXT: [[R:%.*]] = bitcast <4 x i32> [[TMP2]] to <2 x i64>
; CHECK-NEXT: ret <2 x i64> [[R]]
;
We first create a vector identical to the one the original code builds up to the insertelement, by performing a self-shuffle.
After that, a bitcast is used to match the type returned by the original code.
So when a bitcast is required, the insertelement is replaced by a self-shuffle, and in that case the source and destination types of the shuffle are the same MinVecTy.
So why do you need different getShuffleCost calls for if (NeedCast) {} else {}?
define <2 x i32> @gep012_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceable(20) %p) nofree nosync {
%gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 12
%s = load i32, ptr %gep, align 1
%r = insertelement <2 x i32> poison, i32 %s, i64 0
ret <2 x i32> %r
}
The above IR is optimized as follows.
; Function Attrs: nofree nosync
define <2 x i32> @gep012_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceable(20) %p) #0 {
%1 = load <4 x i32>, ptr %p, align 1
%r = shufflevector <4 x i32> %1, <4 x i32> poison, <2 x i32> <i32 3, i32 poison>
ret <2 x i32> %r
}
Here, Ty is <2 x i32> and MinVecTy is <4 x i32>, so shouldn't they be distinguished?
@RKSimon Thank you. I probably wouldn't have found it if you hadn't told me.
Previously, vectorization for load-insert failed when the Offset was not a multiple of the Load type size.

This patch allows it in two steps:
1. Vectorize it using a common multiple of Offset and LoadSize.
2. Bitcast to fit.

Alive2: https://alive2.llvm.org/ce/z/Kgr9HQ