[InterleavedAccess] Construct interleaved access store with shuffles #164000
Conversation
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-backend-aarch64

Author: Ramkrishnan (ram-NK)

Changes
Patch is 34.24 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/164000.diff

9 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 73f2c55a71125..86956d1c64451 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3206,6 +3206,11 @@ class LLVM_ABI TargetLoweringBase {
/// Default to be the minimum interleave factor: 2.
virtual unsigned getMaxSupportedInterleaveFactor() const { return 2; }
+ /// Return true if the target can interleave data with shuffles.
+ virtual bool isProfitableToInterleaveWithGatherScatter() const {
+ return false;
+ }
+
/// Lower an interleaved load to target specific intrinsics. Return
/// true on success.
///
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index a6a9b5058ad94..c7d44c01f99f3 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -239,7 +239,8 @@ static bool isDeInterleaveMask(ArrayRef<int> Mask, unsigned &Factor,
/// I.e. <0, LaneLen, ... , LaneLen*(Factor - 1), 1, LaneLen + 1, ...>
/// E.g. For a Factor of 2 (LaneLen=4): <0, 4, 1, 5, 2, 6, 3, 7>
static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor,
- unsigned MaxFactor) {
+ unsigned MaxFactor,
+ bool InterleaveWithShuffles) {
unsigned NumElts = SVI->getShuffleMask().size();
if (NumElts < 4)
return false;
@@ -250,6 +251,13 @@ static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor,
return true;
}
+ if (InterleaveWithShuffles) {
+ for (unsigned i = 1; MaxFactor * i <= 16; i *= 2) {
+ Factor = i * MaxFactor;
+ if (SVI->isInterleave(Factor))
+ return true;
+ }
+ }
return false;
}
@@ -530,7 +538,8 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
cast<FixedVectorType>(SVI->getType())->getNumElements();
// Check if the shufflevector is RE-interleave shuffle.
unsigned Factor;
- if (!isReInterleaveMask(SVI, Factor, MaxFactor))
+ if (!isReInterleaveMask(SVI, Factor, MaxFactor,
+ TLI->isProfitableToInterleaveWithGatherScatter()))
return false;
assert(NumStoredElements % Factor == 0 &&
"number of stored element should be a multiple of Factor");
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 662d84b7a60a8..f26eef3ab61e6 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -18023,11 +18023,17 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
unsigned Factor,
const APInt &GapMask) const {
- assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
- "Invalid interleave factor");
auto *SI = dyn_cast<StoreInst>(Store);
if (!SI)
return false;
+
+ if (isProfitableToInterleaveWithGatherScatter() &&
+ Factor > getMaxSupportedInterleaveFactor())
+ return lowerInterleavedStoreWithShuffle(SI, SVI, Factor);
+
+ assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
+ "Invalid interleave factor");
+
assert(!LaneMask && GapMask.popcount() == Factor &&
"Unexpected mask on store");
@@ -18173,6 +18179,136 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
return true;
}
+/// If the interleaved vector elements are greter than supported MaxFactor
+/// then, interleaving the data with additional shuffles can be used to
+/// achieve the same.
+/// Below shows how 8 interleaved data are shuffled to store with stN
+/// instructions. Data need store in this order v0,v1,v2,v3,v4,v5,v6,v7
+/// v0 v4 v2 v6 v1 v5 v3 v7
+/// | | | | | | | |
+/// \ / \ / \ / \ /
+/// [zip v0,v4] [zip v2,v6] [zip v1,v5] [zip v3,v7]==> stN = 4
+/// | | | |
+/// \ / \ /
+/// \ / \ /
+/// \ / \ /
+/// [zip [v0,v2,v4,v6]] [zip [v1,v3,v5,v7]] ==> stN = 2
+///
+/// In stN = 4 level upper half of interleaved data V0,V1,V2,V3 is store
+/// withone st4 instruction. Lower half V4,V5,V6,V7 store with another st4.
+///
+/// In stN = 2 level first upper half of interleaved data V0,V1 is store
+/// with one st2 instruction. Second set V2,V3 with store with another st2.
+/// Total of 4 st2 are required.
+bool AArch64TargetLowering::lowerInterleavedStoreWithShuffle(
+ StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const {
+ unsigned MaxSupportedFactor = getMaxSupportedInterleaveFactor();
+
+ auto *VecTy = cast<FixedVectorType>(SVI->getType());
+ assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
+
+ unsigned LaneLen = VecTy->getNumElements() / Factor;
+ Type *EltTy = VecTy->getElementType();
+ auto *SubVecTy = FixedVectorType::get(EltTy, Factor);
+
+ const DataLayout &DL = SI->getModule()->getDataLayout();
+ bool UseScalable;
+
+ // Skip if we do not have NEON and skip illegal vector types. We can
+ // "legalize" wide vector types into multiple interleaved accesses as long as
+ // the vector types are divisible by 128.
+ if (!Subtarget->hasNEON() ||
+ !isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
+ return false;
+
+ if (UseScalable)
+ return false;
+
+ SmallVector<Value *, 8> Shufflelist;
+ Shufflelist.push_back(SVI);
+ unsigned ConcatLevel = Factor;
+ while (ConcatLevel > 1) {
+ SmallVector<Value *, 8> ShufflelistIntermediate;
+ ShufflelistIntermediate = Shufflelist;
+ Shufflelist.clear();
+ while (!ShufflelistIntermediate.empty()) {
+ ShuffleVectorInst *SFL =
+ dyn_cast<ShuffleVectorInst>(ShufflelistIntermediate[0]);
+ if (!SFL)
+ break;
+ ShufflelistIntermediate.erase(ShufflelistIntermediate.begin());
+
+ Value *Op0 = SFL->getOperand(0);
+ Value *Op1 = SFL->getOperand(1);
+
+ Shufflelist.push_back(dyn_cast<Value>(Op0));
+ Shufflelist.push_back(dyn_cast<Value>(Op1));
+ }
+ if (!ShufflelistIntermediate.empty()) {
+ Shufflelist = ShufflelistIntermediate;
+ break;
+ }
+ ConcatLevel = ConcatLevel >> 1;
+ }
+
+ if (Shufflelist.size() != Factor)
+ return false;
+
+ IRBuilder<> Builder(SI);
+ auto Mask = createInterleaveMask(LaneLen, 2);
+ SmallVector<int, 16> UpperHalfMask, LowerHalfMask;
+ for (unsigned i = 0; i < (2 * LaneLen); i++)
+ if (i < LaneLen)
+ LowerHalfMask.push_back(Mask[i]);
+ else
+ UpperHalfMask.push_back(Mask[i]);
+
+ unsigned InterleaveFactor = Factor >> 1;
+ while (InterleaveFactor >= MaxSupportedFactor) {
+ SmallVector<Value *, 8> ShufflelistIntermediate;
+ for (unsigned j = 0; j < Factor; j += (InterleaveFactor * 2)) {
+ for (unsigned i = 0; i < InterleaveFactor; i++) {
+ auto *Shuffle = Builder.CreateShuffleVector(
+ Shufflelist[i + j], Shufflelist[i + j + InterleaveFactor],
+ LowerHalfMask);
+ ShufflelistIntermediate.push_back(Shuffle);
+ }
+ for (unsigned i = 0; i < InterleaveFactor; i++) {
+ auto *Shuffle = Builder.CreateShuffleVector(
+ Shufflelist[i + j], Shufflelist[i + j + InterleaveFactor],
+ UpperHalfMask);
+ ShufflelistIntermediate.push_back(Shuffle);
+ }
+ }
+
+ Shufflelist = ShufflelistIntermediate;
+ InterleaveFactor = InterleaveFactor >> 1;
+ }
+
+ Type *PtrTy = SI->getPointerOperandType();
+ auto *STVTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
+
+ Value *BaseAddr = SI->getPointerOperand();
+ Function *StNFunc = getStructuredStoreFunction(
+ SI->getModule(), MaxSupportedFactor, UseScalable, STVTy, PtrTy);
+ for (unsigned i = 0; i < (Factor / MaxSupportedFactor); i++) {
+ SmallVector<Value *, 5> Ops;
+ for (unsigned j = 0; j < MaxSupportedFactor; j++)
+ Ops.push_back(Shufflelist[i * MaxSupportedFactor + j]);
+
+ if (i > 0) {
+ // We will compute the pointer operand of each store from the original
+ // base address using GEPs. Cast the base address to a pointer to the
+ // scalar element type.
+ BaseAddr = Builder.CreateConstGEP1_32(
+ SubVecTy->getElementType(), BaseAddr, LaneLen * MaxSupportedFactor);
+ }
+ Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy));
+ Builder.CreateCall(StNFunc, Ops);
+ }
+ return true;
+}
+
bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
Instruction *Load, Value *Mask, IntrinsicInst *DI) const {
const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 9495c9ffc47aa..867e01664eaae 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -229,6 +229,10 @@ class AArch64TargetLowering : public TargetLowering {
bool hasPairedLoad(EVT LoadedType, Align &RequiredAlignment) const override;
+ bool isProfitableToInterleaveWithGatherScatter() const override {
+ return true;
+ }
+
unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
@@ -239,6 +243,9 @@ class AArch64TargetLowering : public TargetLowering {
ShuffleVectorInst *SVI, unsigned Factor,
const APInt &GapMask) const override;
+ bool lowerInterleavedStoreWithShuffle(StoreInst *SI, ShuffleVectorInst *SVI,
+ unsigned Factor) const;
+
bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask,
IntrinsicInst *DI) const override;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 479e34515fc8a..25055598a58f5 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4801,11 +4801,35 @@ InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
return InstructionCost::getInvalid();
- if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
+ unsigned NumLoadStores = 1;
+ InstructionCost ShuffleCost = 0;
+ bool isInterleaveWithShuffle = false;
+ unsigned MaxSupportedFactor = TLI->getMaxSupportedInterleaveFactor();
+
+ auto *SubVecTy =
+ VectorType::get(VecVTy->getElementType(),
+ VecVTy->getElementCount().divideCoefficientBy(Factor));
+
+ if (TLI->isProfitableToInterleaveWithGatherScatter() &&
+ Opcode == Instruction::Store && (0 == Factor % MaxSupportedFactor) &&
+ Factor > MaxSupportedFactor) {
+ isInterleaveWithShuffle = true;
+ SmallVector<int, 16> Mask;
+ // preparing interleave Mask.
+ for (unsigned i = 0; i < VecVTy->getElementCount().getKnownMinValue() / 2;
+ i++)
+ for (unsigned j = 0; j < 2; j++)
+ Mask.push_back(j * Factor + i);
+
+ NumLoadStores = Factor / MaxSupportedFactor;
+ ShuffleCost =
+ (Factor * getShuffleCost(TargetTransformInfo::SK_Splice, VecVTy, VecVTy,
+ Mask, CostKind, 0, SubVecTy));
+ }
+
+ if (!UseMaskForGaps &&
+ (Factor <= MaxSupportedFactor || isInterleaveWithShuffle)) {
unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
- auto *SubVecTy =
- VectorType::get(VecVTy->getElementType(),
- VecVTy->getElementCount().divideCoefficientBy(Factor));
// ldN/stN only support legal vector types of size 64 or 128 in bits.
// Accesses having vector types that are a multiple of 128 bits can be
@@ -4813,7 +4837,10 @@ InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
bool UseScalable;
if (MinElts % Factor == 0 &&
TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
- return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
+ return (Factor *
+ TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable) *
+ NumLoadStores) +
+ ShuffleCost;
}
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
diff --git a/llvm/test/CodeGen/AArch64/vldn_shuffle.ll b/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
index 3685e9cf85bd6..6d0a0300e0a91 100644
--- a/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
@@ -730,6 +730,109 @@ entry:
ret void
}
+define void @store_factor8(ptr %ptr, <4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3,
+ <4 x i32> %a4, <4 x i32> %a5, <4 x i32> %a6, <4 x i32> %a7) {
+; CHECK-LABEL: store_factor8:
+; CHECK: .Lfunc_begin17:
+; CHECK-NEXT: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK: zip1 [[V1:.*s]], [[I1:.*s]], [[I5:.*s]]
+; CHECK-NEXT: zip2 [[V5:.*s]], [[I1]], [[I5]]
+; CHECK-NEXT: zip1 [[V2:.*s]], [[I2:.*s]], [[I6:.*s]]
+; CHECK-NEXT: zip2 [[V6:.*s]], [[I2]], [[I6]]
+; CHECK-NEXT: zip1 [[V3:.*s]], [[I3:.*s]], [[I7:.*s]]
+; CHECK-NEXT: zip2 [[V7:.*s]], [[I3]], [[I7]]
+; CHECK-NEXT: zip1 [[V4:.*s]], [[I4:.*s]], [[I8:.*s]]
+; CHECK-NEXT: zip2 [[V8:.*s]], [[I4]], [[I8]]
+; CHECK-NEXT: st4 { [[V1]], [[V2]], [[V3]], [[V4]] }, [x0], #64
+; CHECK-NEXT: st4 { [[V5]], [[V6]], [[V7]], [[V8]] }, [x0]
+; CHECK-NEXT: ret
+
+ %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v1 = shufflevector <4 x i32> %a2, <4 x i32> %a3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v2 = shufflevector <4 x i32> %a4, <4 x i32> %a5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v3 = shufflevector <4 x i32> %a6, <4 x i32> %a7, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+
+ %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %s1 = shufflevector <8 x i32> %v2, <8 x i32> %v3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+
+ %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <32 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+ store <32 x i32> %interleaved.vec, ptr %ptr, align 4
+ ret void
+}
+
+define void @store_factor16(ptr %ptr, <4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3,
+ <4 x i32> %a4, <4 x i32> %a5, <4 x i32> %a6, <4 x i32> %a7,
+ <4 x i32> %a8, <4 x i32> %a9, <4 x i32> %a10, <4 x i32> %a11,
+ <4 x i32> %a12, <4 x i32> %a13, <4 x i32> %a14, <4 x i32> %a15) {
+; CHECK-LABEL: store_factor16:
+; CHECK: .Lfunc_begin18:
+; CHECK-NEXT: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK: zip1 [[V05:.*s]], [[I05:.*s]], [[I13:.*s]]
+; CHECK-NEXT: zip1 [[V01:.*s]], [[I01:.*s]], [[I09:.*s]]
+; CHECK-NEXT: zip1 [[V02:.*s]], [[I02:.*s]], [[I10:.*s]]
+; CHECK-NEXT: zip1 [[V06:.*s]], [[I06:.*s]], [[I14:.*s]]
+; CHECK-NEXT: zip1 [[V07:.*s]], [[I07:.*s]], [[I15:.*s]]
+; CHECK-NEXT: zip1 [[V08:.*s]], [[I08:.*s]], [[I16:.*s]]
+; CHECK-NEXT: zip2 [[V09:.*s]], [[I01]], [[I09]]
+; CHECK-NEXT: zip1 [[V03:.*s]], [[I03:.*s]], [[I11:.*s]]
+; CHECK-NEXT: zip1 [[V04:.*s]], [[I04:.*s]], [[I12:.*s]]
+; CHECK-NEXT: zip2 [[V11:.*s]], [[I03]], [[I11]]
+; CHECK-NEXT: zip2 [[V12:.*s]], [[I04]], [[I12]]
+; CHECK-NEXT: zip2 [[V13:.*s]], [[I05]], [[I13]]
+; CHECK-NEXT: zip2 [[V10:.*s]], [[I02]], [[I10]]
+; CHECK-NEXT: zip1 [[V17:.*s]], [[V01]], [[V05]]
+; CHECK-NEXT: zip2 [[V21:.*s]], [[V01]], [[V05]]
+; CHECK-NEXT: zip2 [[V14:.*s]], [[I06]], [[I14]]
+; CHECK-NEXT: zip1 [[V18:.*s]], [[V02]], [[V06]]
+; CHECK-NEXT: zip2 [[V22:.*s]], [[V02]], [[V06]]
+; CHECK-NEXT: zip2 [[V15:.*s]], [[I07]], [[I15]]
+; CHECK-NEXT: zip1 [[V19:.*s]], [[V03]], [[V07]]
+; CHECK-NEXT: zip2 [[V23:.*s]], [[V03]], [[V07]]
+; CHECK-NEXT: zip2 [[V16:.*s]], [[I08]], [[I16]]
+; CHECK-NEXT: zip1 [[V20:.*s]], [[V04]], [[V08]]
+; CHECK-NEXT: zip2 [[V24:.*s]], [[V04]], [[V08]]
+; CHECK-NEXT: zip1 [[V25:.*s]], [[V09]], [[V13]]
+; CHECK-NEXT: zip1 [[V26:.*s]], [[V10]], [[V14]]
+; CHECK-NEXT: zip1 [[V27:.*s]], [[V11]], [[V15]]
+; CHECK-NEXT: zip1 [[V28:.*s]], [[V12]], [[V16]]
+; CHECK-NEXT: st4 { [[V17]], [[V18]], [[V19]], [[V20]] }, [x8], #64
+; CHECK-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: st4 { [[V21]], [[V22]], [[V23]], [[V24]] }, [x8]
+; CHECK-NEXT: zip2 [[V29:.*s]], [[V09]], [[V13]]
+; CHECK-NEXT: add x8, x0, #128
+; CHECK-NEXT: zip2 [[V30:.*s]], [[V10]], [[V14]]
+; CHECK-NEXT: zip2 [[V31:.*s]], [[V11]], [[V15]]
+; CHECK-NEXT: zip2 [[V32:.*s]], [[V12]], [[V16]]
+; CHECK-NEXT: st4 { [[V25]], [[V26]], [[V27]], [[V28]] }, [x8]
+; CHECK-NEXT: add x8, x0, #192
+; CHECK-NEXT: st4 { [[V29]], [[V30]], [[V31]], [[V32]] }, [x8]
+; CHECK-NEXT: ldp d11, d10, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+
+ %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v1 = shufflevector <4 x i32> %a2, <4 x i32> %a3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v2 = shufflevector <4 x i32> %a4, <4 x i32> %a5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v3 = shufflevector <4 x i32> %a6, <4 x i32> %a7, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v4 = shufflevector <4 x i32> %a8, <4 x i32> %a9, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v5 = shufflevector <4 x i32> %a10, <4 x i32> %a11, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v6 = shufflevector <4 x i32> %a12, <4 x i32> %a13, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v7 = shufflevector <4 x i32> %a14, <4 x i32> %a15, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+
+ %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %s1 = shufflevector <8 x i32> %v2, <8 x i32> %v3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %s2 = shufflevector <8 x i32> %v4, <8 x i32> %v5, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %s3 = shufflevector <8 x i32> %v6, <8 x i32> %v7, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+
+ %d0 = shufflevector <16 x i32> %s0, <16 x i32> %s1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %d1 = shufflevector <16 x i32> %s2, <16 x i32> %s3, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+
+ %interleaved.vec = shufflevector <32 x i32> %d0, <32 x i32> %d1, <64 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, ...
[truncated]
I haven't taken a deep look yet, but here are some initial thoughts.
Also, is there any particular motivating code where this pattern is used extensively?
Force-pushed (53cb483 to 45eb570): [AArch64] Interleaved access store can handle more elements than the target-supported maximum interleave factor, with shuffles.
I added details in the PR description.
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");
I think we can get rid of this condition, since we already check Factor > getMaxSupportedInterleaveFactor() above.
Suggested change:

  assert(Factor >= 2 && "Invalid interleave factor");
/// If the interleaved vector elements are greter than supported MaxFactor
/// then, interleaving the data with additional shuffles can be used to
/// achieve the same.
/// Below shows how 8 interleaved data are shuffled to store with stN
/// instructions. Data need store in this order v0,v1,v2,v3,v4,v5,v6,v7
///  v0 v4 v2 v6 v1 v5 v3 v7
///  |  |  |  |  |  |  |  |
///  \  /  \  /  \  /  \  /
/// [zip v0,v4] [zip v2,v6] [zip v1,v5] [zip v3,v7] ==> stN = 4
///    |           |           |           |
///     \         /             \         /
///      \       /               \       /
///       \     /                 \     /
/// [zip [v0,v2,v4,v6]]       [zip [v1,v3,v5,v7]]   ==> stN = 2
///
/// In stN = 4 level upper half of interleaved data V0,V1,V2,V3 is store
/// withone st4 instruction. Lower half V4,V5,V6,V7 store with another st4.
///
/// In stN = 2 level first upper half of interleaved data V0,V1 is store
/// with one st2 instruction. Second set V2,V3 with store with another st2.
/// Total of 4 st2 are required.
Nit.
Suggested change:

/// If the interleaved vector elements are greater than supported MaxFactor,
/// interleaving the data with additional shuffles can be used to
/// achieve the same.
///
/// Consider the following data with 8 interleaves which are shuffled to store
/// stN instructions. Data needs to be stored in this order:
/// [v0, v1, v2, v3, v4, v5, v6, v7]
///
///  v0 v4 v2 v6 v1 v5 v3 v7
///  |  |  |  |  |  |  |  |
///  \  /  \  /  \  /  \  /
/// [zip v0,v4] [zip v2,v6] [zip v1,v5] [zip v3,v7]  ==> stN = 4
///    |           |           |           |
///     \         /             \         /
///      \       /               \       /
///       \     /                 \     /
/// [zip [v0,v2,v4,v6]]       [zip [v1,v3,v5,v7]]    ==> stN = 2
///
/// For stN = 4, upper half of interleaved data V0, V1, V2, V3 is stored
/// with one st4 instruction. Lower half, i.e., V4, V5, V6, V7 is stored with
/// another st4.
///
/// For stN = 2, upper half of interleaved data V0, V1 is stored
/// with one st2 instruction. Second set V2, V3 is stored with another st2.
/// Total of 4 st2's are required here.
  unsigned MaxSupportedFactor = getMaxSupportedInterleaveFactor();

  auto *VecTy = cast<FixedVectorType>(SVI->getType());
  assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
VecTy could be nullptr here.
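A guarded variant along those lines might look like this (a sketch of the reviewer's point, not the patch's code; it replaces the asserting cast<> with a dyn_cast<> and bails out on failure):

  // Sketch: bail out instead of asserting when the shuffle result is not a
  // fixed-width vector or its element count is not a multiple of Factor.
  auto *VecTy = dyn_cast<FixedVectorType>(SVI->getType());
  if (!VecTy || VecTy->getNumElements() % Factor != 0)
    return false;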
  std::deque<Value *> Shuffles;
  Shuffles.push_back(SVI);
  unsigned ConcatLevel = Factor;
  while (ConcatLevel > 1) {
    std::deque<Value *> ShufflesIntermediate;
    ShufflesIntermediate = Shuffles;
    Shuffles.clear();
    while (!ShufflesIntermediate.empty()) {
      ShuffleVectorInst *SFL =
          dyn_cast<ShuffleVectorInst>(ShufflesIntermediate.front());
      if (!SFL)
        break;
      ShufflesIntermediate.pop_front();

      Value *Op0 = SFL->getOperand(0);
      Value *Op1 = SFL->getOperand(1);

      Shuffles.push_back(dyn_cast<Value>(Op0));
      Shuffles.push_back(dyn_cast<Value>(Op1));
    }
    if (!ShufflesIntermediate.empty()) {
      Shuffles = ShufflesIntermediate;
      break;
    }
    ConcatLevel = ConcatLevel >> 1;
  }
I think we can do this with just one deque here and the last row would be in hand at the end?
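A single-queue variant of that idea might look like the following (an untested sketch, not the patch's code): pop one value per step and, if it is still a shufflevector, push its two operands to the back; once the queue holds Factor values, the last row of the concat tree is in hand.

  // Sketch: peel the concat-shuffle tree with one work queue.
  std::deque<Value *> Worklist;
  Worklist.push_back(SVI);
  while (Worklist.size() < Factor) {
    auto *SFL = dyn_cast<ShuffleVectorInst>(Worklist.front());
    if (!SFL)
      return false; // not a full binary concat tree of shuffles
    Worklist.pop_front();
    Worklist.push_back(SFL->getOperand(0));
    Worklist.push_back(SFL->getOperand(1));
  }
  // Worklist now holds the Factor leaf vectors in level order.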
  for (unsigned j = 0; j < Factor; j += (InterleaveFactor * 2)) {
    for (unsigned i = 0; i < InterleaveFactor; i++) {
      auto *Shuffle = Builder.CreateShuffleVector(
          Shuffles[i + j], Shuffles[i + j + InterleaveFactor], LowerHalfMask);
      ShufflesIntermediate.push_back(Shuffle);
    }
    for (unsigned i = 0; i < InterleaveFactor; i++) {
      auto *Shuffle = Builder.CreateShuffleVector(
          Shuffles[i + j], Shuffles[i + j + InterleaveFactor], UpperHalfMask);
      ShufflesIntermediate.push_back(Shuffle);
    }
  }
Inner two loops could be done with one like you did above.
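Folding the two inner loops into one, as suggested, might look like this (an untested sketch): select the half mask based on which batch of outputs is being produced.

  for (unsigned j = 0; j < Factor; j += InterleaveFactor * 2) {
    for (unsigned i = 0; i < 2 * InterleaveFactor; i++) {
      // The first InterleaveFactor outputs take the lower half mask,
      // the remaining ones the upper half mask.
      ArrayRef<int> HalfMask =
          (i < InterleaveFactor) ? LowerHalfMask : UpperHalfMask;
      unsigned Idx = j + (i % InterleaveFactor);
      ShufflesIntermediate.push_back(Builder.CreateShuffleVector(
          Shuffles[Idx], Shuffles[Idx + InterleaveFactor], HalfMask));
    }
  }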
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "scalar.ph:" --version 6
-; RUN: opt -p loop-vectorize -S %s | FileCheck %s
+; RUN: opt -p loop-vectorize -max-interleave-group-factor=4 -S %s | FileCheck %s
Any particular reason for changing this here, since tests remain unchanged?
  /// Return true if the target can interleave data with shuffles.
  virtual bool isProfitableToInterleaveWithGatherScatter() const {
    return false;
  }
Does this function implement the cost model, or does it just indicate availability of the instruction? The name sounds like a cost model; the simple implementation and the comment sound like it indicates availability of the functionality.
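Purely as a hypothetical illustration of that distinction (names invented here, not part of the patch), the hook could be split so its name matches what it does:

  /// Hypothetical: availability only; "can this target interleave wide
  /// factors with extra shuffles?"
  virtual bool supportsShuffleBasedInterleaveStore() const { return false; }
  // Profitability would then be decided by the cost model, e.g. in
  // TTI's getInterleavedMemoryOpCost(), rather than implied by this
  // hook's name.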
Motivation:
Given the following pack_LUT() function, the IR just before loop vectorization is shown below. Here the inner loop is fully unrolled: 16 consecutive memory locations are stored with 16 separate store instructions.
If these 16 stores can be interleaved, the outer loop can be vectorized, which improves the performance of the loop by more than 2%.
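The actual pack_LUT() source and its IR are elided in the truncated view above; as a rough, hypothetical illustration of the shape being described (an inner loop that fully unrolls into 16 consecutive stores per outer iteration), it could look something like:

// Hypothetical loop shape; not the actual pack_LUT() code.
void pack_LUT_like(int *dst, const int *src, int n) {
  for (int i = 0; i < n; i++)       // outer loop: vectorizable once the
    for (int m = 0; m < 16; m++)    // inner stores can be interleaved
      dst[16 * i + m] = src[m * n + i]; // unrolls into 16 consecutive stores
}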
Assembly of the loop after it is vectorized and interleaved by 16:
For the M == 16 case:
- The non-vectorized loop needs 16 x 16 = 256 store instructions.
- The vectorized loop needs 16 zip1 + 16 zip2 + 4 st4 instructions.