[InterleavedAccess] Construct interleaved access store with shuffles #164000
Conversation
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-backend-aarch64

Author: Ramkrishnan (ram-NK)

Changes

Patch is 34.24 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/164000.diff

9 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 73f2c55a71125..86956d1c64451 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3206,6 +3206,11 @@ class LLVM_ABI TargetLoweringBase {
/// Default to be the minimum interleave factor: 2.
virtual unsigned getMaxSupportedInterleaveFactor() const { return 2; }
+ /// Return true if the target can interleave data with shuffles.
+ virtual bool isProfitableToInterleaveWithGatherScatter() const {
+ return false;
+ }
+
/// Lower an interleaved load to target specific intrinsics. Return
/// true on success.
///
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index a6a9b5058ad94..c7d44c01f99f3 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -239,7 +239,8 @@ static bool isDeInterleaveMask(ArrayRef<int> Mask, unsigned &Factor,
/// I.e. <0, LaneLen, ... , LaneLen*(Factor - 1), 1, LaneLen + 1, ...>
/// E.g. For a Factor of 2 (LaneLen=4): <0, 4, 1, 5, 2, 6, 3, 7>
static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor,
- unsigned MaxFactor) {
+ unsigned MaxFactor,
+ bool InterleaveWithShuffles) {
unsigned NumElts = SVI->getShuffleMask().size();
if (NumElts < 4)
return false;
@@ -250,6 +251,13 @@ static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor,
return true;
}
+ if (InterleaveWithShuffles) {
+ for (unsigned i = 1; MaxFactor * i <= 16; i *= 2) {
+ Factor = i * MaxFactor;
+ if (SVI->isInterleave(Factor))
+ return true;
+ }
+ }
return false;
}
@@ -530,7 +538,8 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
cast<FixedVectorType>(SVI->getType())->getNumElements();
// Check if the shufflevector is RE-interleave shuffle.
unsigned Factor;
- if (!isReInterleaveMask(SVI, Factor, MaxFactor))
+ if (!isReInterleaveMask(SVI, Factor, MaxFactor,
+ TLI->isProfitableToInterleaveWithGatherScatter()))
return false;
assert(NumStoredElements % Factor == 0 &&
"number of stored element should be a multiple of Factor");
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 662d84b7a60a8..f26eef3ab61e6 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -18023,11 +18023,17 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
unsigned Factor,
const APInt &GapMask) const {
- assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
- "Invalid interleave factor");
auto *SI = dyn_cast<StoreInst>(Store);
if (!SI)
return false;
+
+ if (isProfitableToInterleaveWithGatherScatter() &&
+ Factor > getMaxSupportedInterleaveFactor())
+ return lowerInterleavedStoreWithShuffle(SI, SVI, Factor);
+
+ assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
+ "Invalid interleave factor");
+
assert(!LaneMask && GapMask.popcount() == Factor &&
"Unexpected mask on store");
@@ -18173,6 +18179,136 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
return true;
}
+/// If the interleave factor is greater than the supported MaxFactor, the
+/// data can be interleaved with additional shuffles to achieve the same
+/// result.
+/// The diagram below shows how 8 vectors are shuffled so that they can be
+/// stored with stN instructions. The data must be stored in the order
+/// v0,v1,v2,v3,v4,v5,v6,v7.
+/// v0 v4 v2 v6 v1 v5 v3 v7
+/// | | | | | | | |
+/// \ / \ / \ / \ /
+/// [zip v0,v4] [zip v2,v6] [zip v1,v5] [zip v3,v7]==> stN = 4
+/// | | | |
+/// \ / \ /
+/// \ / \ /
+/// \ / \ /
+/// [zip [v0,v2,v4,v6]] [zip [v1,v3,v5,v7]] ==> stN = 2
+///
+/// At the stN = 4 level, the upper half of the interleaved data
+/// (V0,V1,V2,V3) is stored with one st4 instruction and the lower half
+/// (V4,V5,V6,V7) with another st4.
+///
+/// At the stN = 2 level, the first pair of interleaved data (V0,V1) is
+/// stored with one st2 instruction, the second pair (V2,V3) with another
+/// st2, and so on; a total of 4 st2 instructions are required.
+bool AArch64TargetLowering::lowerInterleavedStoreWithShuffle(
+ StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const {
+ unsigned MaxSupportedFactor = getMaxSupportedInterleaveFactor();
+
+ auto *VecTy = cast<FixedVectorType>(SVI->getType());
+ assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
+
+ unsigned LaneLen = VecTy->getNumElements() / Factor;
+ Type *EltTy = VecTy->getElementType();
+ auto *SubVecTy = FixedVectorType::get(EltTy, Factor);
+
+ const DataLayout &DL = SI->getModule()->getDataLayout();
+ bool UseScalable;
+
+ // Skip if we do not have NEON and skip illegal vector types. We can
+ // "legalize" wide vector types into multiple interleaved accesses as long as
+ // the vector types are divisible by 128.
+ if (!Subtarget->hasNEON() ||
+ !isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
+ return false;
+
+ if (UseScalable)
+ return false;
+
+ SmallVector<Value *, 8> Shufflelist;
+ Shufflelist.push_back(SVI);
+ unsigned ConcatLevel = Factor;
+ while (ConcatLevel > 1) {
+ SmallVector<Value *, 8> ShufflelistIntermediate;
+ ShufflelistIntermediate = Shufflelist;
+ Shufflelist.clear();
+ while (!ShufflelistIntermediate.empty()) {
+ ShuffleVectorInst *SFL =
+ dyn_cast<ShuffleVectorInst>(ShufflelistIntermediate[0]);
+ if (!SFL)
+ break;
+ ShufflelistIntermediate.erase(ShufflelistIntermediate.begin());
+
+ Value *Op0 = SFL->getOperand(0);
+ Value *Op1 = SFL->getOperand(1);
+
+ Shufflelist.push_back(dyn_cast<Value>(Op0));
+ Shufflelist.push_back(dyn_cast<Value>(Op1));
+ }
+ if (!ShufflelistIntermediate.empty()) {
+ Shufflelist = ShufflelistIntermediate;
+ break;
+ }
+ ConcatLevel = ConcatLevel >> 1;
+ }
+
+ if (Shufflelist.size() != Factor)
+ return false;
+
+ IRBuilder<> Builder(SI);
+ auto Mask = createInterleaveMask(LaneLen, 2);
+ SmallVector<int, 16> UpperHalfMask, LowerHalfMask;
+ for (unsigned i = 0; i < (2 * LaneLen); i++)
+ if (i < LaneLen)
+ LowerHalfMask.push_back(Mask[i]);
+ else
+ UpperHalfMask.push_back(Mask[i]);
+
+ unsigned InterleaveFactor = Factor >> 1;
+ while (InterleaveFactor >= MaxSupportedFactor) {
+ SmallVector<Value *, 8> ShufflelistIntermediate;
+ for (unsigned j = 0; j < Factor; j += (InterleaveFactor * 2)) {
+ for (unsigned i = 0; i < InterleaveFactor; i++) {
+ auto *Shuffle = Builder.CreateShuffleVector(
+ Shufflelist[i + j], Shufflelist[i + j + InterleaveFactor],
+ LowerHalfMask);
+ ShufflelistIntermediate.push_back(Shuffle);
+ }
+ for (unsigned i = 0; i < InterleaveFactor; i++) {
+ auto *Shuffle = Builder.CreateShuffleVector(
+ Shufflelist[i + j], Shufflelist[i + j + InterleaveFactor],
+ UpperHalfMask);
+ ShufflelistIntermediate.push_back(Shuffle);
+ }
+ }
+
+ Shufflelist = ShufflelistIntermediate;
+ InterleaveFactor = InterleaveFactor >> 1;
+ }
+
+ Type *PtrTy = SI->getPointerOperandType();
+ auto *STVTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
+
+ Value *BaseAddr = SI->getPointerOperand();
+ Function *StNFunc = getStructuredStoreFunction(
+ SI->getModule(), MaxSupportedFactor, UseScalable, STVTy, PtrTy);
+ for (unsigned i = 0; i < (Factor / MaxSupportedFactor); i++) {
+ SmallVector<Value *, 5> Ops;
+ for (unsigned j = 0; j < MaxSupportedFactor; j++)
+ Ops.push_back(Shufflelist[i * MaxSupportedFactor + j]);
+
+ if (i > 0) {
+ // We will compute the pointer operand of each store from the original
+ // base address using GEPs. Cast the base address to a pointer to the
+ // scalar element type.
+ BaseAddr = Builder.CreateConstGEP1_32(
+ SubVecTy->getElementType(), BaseAddr, LaneLen * MaxSupportedFactor);
+ }
+ Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy));
+ Builder.CreateCall(StNFunc, Ops);
+ }
+ return true;
+}
+
bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
Instruction *Load, Value *Mask, IntrinsicInst *DI) const {
const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 9495c9ffc47aa..867e01664eaae 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -229,6 +229,10 @@ class AArch64TargetLowering : public TargetLowering {
bool hasPairedLoad(EVT LoadedType, Align &RequiredAlignment) const override;
+ bool isProfitableToInterleaveWithGatherScatter() const override {
+ return true;
+ }
+
unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
@@ -239,6 +243,9 @@ class AArch64TargetLowering : public TargetLowering {
ShuffleVectorInst *SVI, unsigned Factor,
const APInt &GapMask) const override;
+ bool lowerInterleavedStoreWithShuffle(StoreInst *SI, ShuffleVectorInst *SVI,
+ unsigned Factor) const;
+
bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask,
IntrinsicInst *DI) const override;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 479e34515fc8a..25055598a58f5 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4801,11 +4801,35 @@ InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
return InstructionCost::getInvalid();
- if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
+ unsigned NumLoadStores = 1;
+ InstructionCost ShuffleCost = 0;
+ bool isInterleaveWithShuffle = false;
+ unsigned MaxSupportedFactor = TLI->getMaxSupportedInterleaveFactor();
+
+ auto *SubVecTy =
+ VectorType::get(VecVTy->getElementType(),
+ VecVTy->getElementCount().divideCoefficientBy(Factor));
+
+ if (TLI->isProfitableToInterleaveWithGatherScatter() &&
+ Opcode == Instruction::Store && (0 == Factor % MaxSupportedFactor) &&
+ Factor > MaxSupportedFactor) {
+ isInterleaveWithShuffle = true;
+ SmallVector<int, 16> Mask;
+ // preparing interleave Mask.
+ for (unsigned i = 0; i < VecVTy->getElementCount().getKnownMinValue() / 2;
+ i++)
+ for (unsigned j = 0; j < 2; j++)
+ Mask.push_back(j * Factor + i);
+
+ NumLoadStores = Factor / MaxSupportedFactor;
+ ShuffleCost =
+ (Factor * getShuffleCost(TargetTransformInfo::SK_Splice, VecVTy, VecVTy,
+ Mask, CostKind, 0, SubVecTy));
+ }
+
+ if (!UseMaskForGaps &&
+ (Factor <= MaxSupportedFactor || isInterleaveWithShuffle)) {
unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
- auto *SubVecTy =
- VectorType::get(VecVTy->getElementType(),
- VecVTy->getElementCount().divideCoefficientBy(Factor));
// ldN/stN only support legal vector types of size 64 or 128 in bits.
// Accesses having vector types that are a multiple of 128 bits can be
@@ -4813,7 +4837,10 @@ InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
bool UseScalable;
if (MinElts % Factor == 0 &&
TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
- return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
+ return (Factor *
+ TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable) *
+ NumLoadStores) +
+ ShuffleCost;
}
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
diff --git a/llvm/test/CodeGen/AArch64/vldn_shuffle.ll b/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
index 3685e9cf85bd6..6d0a0300e0a91 100644
--- a/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
@@ -730,6 +730,109 @@ entry:
ret void
}
+define void @store_factor8(ptr %ptr, <4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3,
+ <4 x i32> %a4, <4 x i32> %a5, <4 x i32> %a6, <4 x i32> %a7) {
+; CHECK-LABEL: store_factor8:
+; CHECK: .Lfunc_begin17:
+; CHECK-NEXT: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK: zip1 [[V1:.*s]], [[I1:.*s]], [[I5:.*s]]
+; CHECK-NEXT: zip2 [[V5:.*s]], [[I1]], [[I5]]
+; CHECK-NEXT: zip1 [[V2:.*s]], [[I2:.*s]], [[I6:.*s]]
+; CHECK-NEXT: zip2 [[V6:.*s]], [[I2]], [[I6]]
+; CHECK-NEXT: zip1 [[V3:.*s]], [[I3:.*s]], [[I7:.*s]]
+; CHECK-NEXT: zip2 [[V7:.*s]], [[I3]], [[I7]]
+; CHECK-NEXT: zip1 [[V4:.*s]], [[I4:.*s]], [[I8:.*s]]
+; CHECK-NEXT: zip2 [[V8:.*s]], [[I4]], [[I8]]
+; CHECK-NEXT: st4 { [[V1]], [[V2]], [[V3]], [[V4]] }, [x0], #64
+; CHECK-NEXT: st4 { [[V5]], [[V6]], [[V7]], [[V8]] }, [x0]
+; CHECK-NEXT: ret
+
+ %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v1 = shufflevector <4 x i32> %a2, <4 x i32> %a3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v2 = shufflevector <4 x i32> %a4, <4 x i32> %a5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v3 = shufflevector <4 x i32> %a6, <4 x i32> %a7, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+
+ %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %s1 = shufflevector <8 x i32> %v2, <8 x i32> %v3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+
+ %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <32 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+ store <32 x i32> %interleaved.vec, ptr %ptr, align 4
+ ret void
+}
+
+define void @store_factor16(ptr %ptr, <4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3,
+ <4 x i32> %a4, <4 x i32> %a5, <4 x i32> %a6, <4 x i32> %a7,
+ <4 x i32> %a8, <4 x i32> %a9, <4 x i32> %a10, <4 x i32> %a11,
+ <4 x i32> %a12, <4 x i32> %a13, <4 x i32> %a14, <4 x i32> %a15) {
+; CHECK-LABEL: store_factor16:
+; CHECK: .Lfunc_begin18:
+; CHECK-NEXT: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK: zip1 [[V05:.*s]], [[I05:.*s]], [[I13:.*s]]
+; CHECK-NEXT: zip1 [[V01:.*s]], [[I01:.*s]], [[I09:.*s]]
+; CHECK-NEXT: zip1 [[V02:.*s]], [[I02:.*s]], [[I10:.*s]]
+; CHECK-NEXT: zip1 [[V06:.*s]], [[I06:.*s]], [[I14:.*s]]
+; CHECK-NEXT: zip1 [[V07:.*s]], [[I07:.*s]], [[I15:.*s]]
+; CHECK-NEXT: zip1 [[V08:.*s]], [[I08:.*s]], [[I16:.*s]]
+; CHECK-NEXT: zip2 [[V09:.*s]], [[I01]], [[I09]]
+; CHECK-NEXT: zip1 [[V03:.*s]], [[I03:.*s]], [[I11:.*s]]
+; CHECK-NEXT: zip1 [[V04:.*s]], [[I04:.*s]], [[I12:.*s]]
+; CHECK-NEXT: zip2 [[V11:.*s]], [[I03]], [[I11]]
+; CHECK-NEXT: zip2 [[V12:.*s]], [[I04]], [[I12]]
+; CHECK-NEXT: zip2 [[V13:.*s]], [[I05]], [[I13]]
+; CHECK-NEXT: zip2 [[V10:.*s]], [[I02]], [[I10]]
+; CHECK-NEXT: zip1 [[V17:.*s]], [[V01]], [[V05]]
+; CHECK-NEXT: zip2 [[V21:.*s]], [[V01]], [[V05]]
+; CHECK-NEXT: zip2 [[V14:.*s]], [[I06]], [[I14]]
+; CHECK-NEXT: zip1 [[V18:.*s]], [[V02]], [[V06]]
+; CHECK-NEXT: zip2 [[V22:.*s]], [[V02]], [[V06]]
+; CHECK-NEXT: zip2 [[V15:.*s]], [[I07]], [[I15]]
+; CHECK-NEXT: zip1 [[V19:.*s]], [[V03]], [[V07]]
+; CHECK-NEXT: zip2 [[V23:.*s]], [[V03]], [[V07]]
+; CHECK-NEXT: zip2 [[V16:.*s]], [[I08]], [[I16]]
+; CHECK-NEXT: zip1 [[V20:.*s]], [[V04]], [[V08]]
+; CHECK-NEXT: zip2 [[V24:.*s]], [[V04]], [[V08]]
+; CHECK-NEXT: zip1 [[V25:.*s]], [[V09]], [[V13]]
+; CHECK-NEXT: zip1 [[V26:.*s]], [[V10]], [[V14]]
+; CHECK-NEXT: zip1 [[V27:.*s]], [[V11]], [[V15]]
+; CHECK-NEXT: zip1 [[V28:.*s]], [[V12]], [[V16]]
+; CHECK-NEXT: st4 { [[V17]], [[V18]], [[V19]], [[V20]] }, [x8], #64
+; CHECK-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: st4 { [[V21]], [[V22]], [[V23]], [[V24]] }, [x8]
+; CHECK-NEXT: zip2 [[V29:.*s]], [[V09]], [[V13]]
+; CHECK-NEXT: add x8, x0, #128
+; CHECK-NEXT: zip2 [[V30:.*s]], [[V10]], [[V14]]
+; CHECK-NEXT: zip2 [[V31:.*s]], [[V11]], [[V15]]
+; CHECK-NEXT: zip2 [[V32:.*s]], [[V12]], [[V16]]
+; CHECK-NEXT: st4 { [[V25]], [[V26]], [[V27]], [[V28]] }, [x8]
+; CHECK-NEXT: add x8, x0, #192
+; CHECK-NEXT: st4 { [[V29]], [[V30]], [[V31]], [[V32]] }, [x8]
+; CHECK-NEXT: ldp d11, d10, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+
+ %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v1 = shufflevector <4 x i32> %a2, <4 x i32> %a3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v2 = shufflevector <4 x i32> %a4, <4 x i32> %a5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v3 = shufflevector <4 x i32> %a6, <4 x i32> %a7, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v4 = shufflevector <4 x i32> %a8, <4 x i32> %a9, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v5 = shufflevector <4 x i32> %a10, <4 x i32> %a11, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v6 = shufflevector <4 x i32> %a12, <4 x i32> %a13, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v7 = shufflevector <4 x i32> %a14, <4 x i32> %a15, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+
+ %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %s1 = shufflevector <8 x i32> %v2, <8 x i32> %v3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %s2 = shufflevector <8 x i32> %v4, <8 x i32> %v5, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %s3 = shufflevector <8 x i32> %v6, <8 x i32> %v7, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+
+ %d0 = shufflevector <16 x i32> %s0, <16 x i32> %s1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %d1 = shufflevector <16 x i32> %s2, <16 x i32> %s3, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+
+ %interleaved.vec = shufflevector <32 x i32> %d0, <32 x i32> %d1, <64 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, ...
[truncated]
Rajveer100
left a comment
I haven't taken a deep look yet, but some initial thoughts.
Also, is there a particular motivating use case where this pattern is used extensively?
I added details in the PR description.
@paulwalker-arm
Happy to, but it's likely to be a couple of days before I can get to it.
Rajveer100
left a comment
This looks good to me, I will leave the rest to other reviewers in case I missed something. Thanks!
- [AArch64] An interleaved-access store can handle more elements than the
  target-supported maximum interleave factor by using shuffles.
@@ -0,0 +1,117 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
; RUN: opt -passes=loop-vectorize -enable-interleaved-mem-accesses=true -max-interleave-group-factor=16 -S < %s | FileCheck %s
-enable-interleaved-mem-accesses=true should not be needed; why does the max factor need to be capped?
Interleaved memory access is disabled in the LoopVectorize pass by default: enable-interleaved-mem-accesses is false and max-interleave-group-factor is 8 by default. To enable interleaving by 16, both options are needed.
define dso_local void @_Z6unpackPhS_(ptr noalias noundef readonly captures(none) %in, ptr noalias noundef writeonly captures(none) %out) {
Would be good to have a comment about what this tests and to clean up the naming in the function.
if (InterleaveWithShuffles) {
  for (unsigned i = 1; MaxFactor * i <= 16; i *= 2) {
LLVM style uses upper case for variable names. Why cap at 16?
The next interleaving factor would be 32. For efficient interleaving at that factor, more than 32 registers are required, which is not possible on AArch64 without spilling, so it stops at 16.
For all the corrections, I will create a follow-up PR.
/// Default to be the minimum interleave factor: 2.
virtual unsigned getMaxSupportedInterleaveFactor() const { return 2; }

/// Return true if the target interleave with shuffles are cheaper
This seems like a very specific hook, targeted at InterleavedAccessPass. Would be good to clarify in the comment what it means precisely, possibly with an example.
If it is about profitability relative to gathers/scatters, can it be checked by comparing against the cost of the gather/scatter sequence?
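One possible direction for the requested clarification is sketched below: a more explicit doc comment for the hook. The wording is a suggestion based on how the patch uses the hook, not text taken from the PR.

```cpp
  /// Return true if, for an interleaved store whose factor exceeds
  /// getMaxSupportedInterleaveFactor(), this target prefers to build the
  /// interleaving out of extra shufflevector (zip) steps feeding the widest
  /// supported stN intrinsic, rather than falling back to gather/scatter or
  /// scalarized stores. Defaults to false.
  virtual bool isProfitableToInterleaveWithGatherScatter() const {
    return false;
  }
```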
This commit is causing failed asserts for me; it breaks compiling ffmpeg and libaom. Two reduced reproducers for the issue:

```c
int a, b;
void c(char e) {
  char *d = 0;
  while (--b) {
    d[b * 8 + 7] = d[b * 8 + 6] = e;
    d[b * 8 + 5] = d[b * 8 + 4] = e & 1;
    d[b * 8 + 3] = e;
    d[b * 8 + 2] = a;
    d[b * 8 + 1] = d[b * 8] = e;
  }
}
```

```c
typedef __attribute__((neon_vector_type(16))) unsigned char a;
typedef __attribute__((neon_vector_type(4))) short b;
typedef __attribute__((neon_vector_type(2))) int c;
typedef __attribute__((neon_vector_type(4))) int d;
typedef struct {
  b e[2]
} f;
typedef struct {
  c e[2]
} g;
f h;
b i;
a k;
g m, n;
c o, p, q, r;
f j(b s) {
  __builtin_neon_vzip_v(&h, i, s, 17);
  return h;
}
b l();
void t() {
  a u;
  f v = j(l()), w = j(l()), a = h, b = j(l());
  r = v.e[0];
  q = w.e[0];
  g c;
  __builtin_neon_vzip_v(&c, r, q, 18);
  n = c;
  p = a.e[0];
  o = b.e[0];
  __builtin_neon_vzip_v(&c, p, o, 18);
  m = c;
  {
    d d = __builtin_shufflevector(n.e[1], m.e[1], 0, 1, 2, 3);
    u = d;
  }
  k = u;
}
```

I will push a revert shortly.
Thanks for working on this PR. I'm aware this was given an LGTM, but I think in this case it would be best to wait for a second approval from someone who maintains InterleavedAccessPass and AArch64 before this relands, since it touches a significant amount of code there.
…tore with shuffles" This reverts commit 78d6491. That commit caused failed asserts, see llvm/llvm-project#164000 for details.
I put this in the debugger locally:

```
(lldb) p Shuffles
(std::deque<llvm::Value *>) size=8 {
  [0] = 0x0000000a0af0d560
  [1] = 0x0000000a0b08d120
  [2] = 0x0000000a0af0d480
  [3] = 0x0000000a0af0d600
  [4] = 0x0000000a0ae55440
  [5] = 0x0000000a0b08c9c0
  [6] = 0x0000000a0af0d560
  [7] = 0x0000000a0b08d120
}
(lldb) p Shuffles[0]
(std::deque<llvm::Value *>::value_type) 0x0000000a0af0d560
(lldb) p Shuffles[0]->dump()
%broadcast.splatinsert64 = insertelement <8 x i8> poison, i8 %e, i64 0
(lldb) p Shuffles[1]->dump()
<8 x i8> poison
(lldb) p Shuffles[2]->dump()
%broadcast.splat63 = shufflevector <8 x i8> %broadcast.splatinsert62, <8 x i8> poison, <8 x i32> zeroinitializer
(lldb) p Shuffles[3]->dump()
%broadcast.splat65 = shufflevector <8 x i8> %broadcast.splatinsert64, <8 x i8> poison, <8 x i32> zeroinitializer
(lldb) p Shuffles[4]->dump()
----------->> %2 = and <4 x i8> %1, <i8 1, i8 1, i8 -1, i8 -1> <<----------------------------------
(lldb) p Shuffles[5]->dump()
<4 x i8> poison
(lldb) p Shuffles[6]->dump()
%broadcast.splatinsert64 = insertelement <8 x i8> poison, i8 %e, i64 0
(lldb) p Shuffles[7]->dump()
<8 x i8> poison
```

So it looks like we need a check, something like:

```cpp
    Shuffles.push_back(dyn_cast<Value>(Op0));
    Shuffles.push_back(dyn_cast<Value>(Op1));
  }
  ConcatLevel >>= 1;
}
....
for (Value *V : Shuffles) {
  if (!isa<ShuffleVectorInst>(V))
    return false;
}
```

to ensure we indeed have all of them as shuffles (we can use llvm::all_of for this). Here some of the collected values are an insertelement / and / poison instead of shuffles. This happens because … @mstorsjo
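For reference, a compact form of the check suggested above using llvm::all_of; this is a sketch only, and it assumes placement right after the operand-collection loop in lowerInterleavedStoreWithShuffle (the list is named Shufflelist in the posted diff, Shuffles in the debugger session).

```cpp
// Compact form of the check suggested above: require that every collected
// operand is itself a shufflevector before building the zip tree; splats,
// insertelement chains or arithmetic feeding the store are rejected.
if (!llvm::all_of(Shufflelist,
                  [](Value *V) { return isa<ShuffleVectorInst>(V); }))
  return false;
```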
I am working on all these issues. @mstorsjo Thanks for the two failure cases. @Rajveer100 will check your solutions.
ShuffleVectorInst::isConcat() will ensure the correctness of this interleaving logic. Two of the test cases were filtered out. Works fine for me.
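A minimal sketch of how such a filter might look inside the operand-collection loop of lowerInterleavedStoreWithShuffle; the exact placement and the special-casing of the root interleave shuffle are assumptions, since the updated code is not shown in this thread.

```cpp
auto *SFL = dyn_cast<ShuffleVectorInst>(ShufflelistIntermediate[0]);
// Every node we descend into must be a shufflevector, and (below the root
// interleave shuffle SVI) a plain concatenation of its two operands;
// broadcast splats or partial shuffles would later be zipped with
// mismatched vector types.
if (!SFL || (SFL != SVI && !SFL->isConcat()))
  return false;
```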
Motivation:
Given the following pack_LUT() function, the IR right before loop vectorization is shown below. Here the inner loop is fully unrolled: 16 consecutive memory locations are stored with 16 separate store instructions.
If these 16 stores can be interleaved, the outer loop can be vectorized, which improves the performance of the loop by more than 2%.
Assembly of the loop after it is vectorized and interleaved by 16.
For the M == 16 case of the loop:
the non-vectorized loop needs 16 x 16 = 256 stores, while the vectorized loop needs 16 zip1 + 16 zip2 + 4 st4 instructions (two zip levels of 16 shuffles each, followed by Factor / MaxFactor = 4 st4 calls).