diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index cc7a65c0fd70b..918d7663548f5 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -9771,6 +9771,28 @@ void BoUpSLP::transformNodes() {
           // Strided store is more profitable than reverse + consecutive store -
           // transform the node to strided store.
           E.State = TreeEntry::StridedVectorize;
+      } else if (!E.ReorderIndices.empty()) {
+        // Check for interleaved stores.
+        auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
+          auto *BaseSI = cast<StoreInst>(E.Scalars.front());
+          assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
+          if (Mask.size() < 4)
+            return 0u;
+          for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
+            if (ShuffleVectorInst::isInterleaveMask(
+                    Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
+                TTI.isLegalInterleavedAccessType(
+                    VecTy, Factor, BaseSI->getAlign(),
+                    BaseSI->getPointerAddressSpace()))
+              return Factor;
+          }
+
+          return 0u;
+        };
+        SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
+        unsigned InterleaveFactor = IsInterleaveMask(Mask);
+        if (InterleaveFactor != 0)
+          E.setInterleave(InterleaveFactor);
       }
       break;
     }
@@ -11441,10 +11463,19 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
       } else {
         assert(E->State == TreeEntry::Vectorize &&
                "Expected either strided or consecutive stores.");
-        TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
-        VecStCost = TTI->getMemoryOpCost(
-            Instruction::Store, VecTy, BaseSI->getAlign(),
-            BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
+        if (unsigned Factor = E->getInterleaveFactor()) {
+          assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
+                 "No reused shuffles expected");
+          CommonCost = 0;
+          VecStCost = TTI->getInterleavedMemoryOpCost(
+              Instruction::Store, VecTy, Factor, std::nullopt,
+              BaseSI->getAlign(), BaseSI->getPointerAddressSpace(), CostKind);
+        } else {
+          TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
+          VecStCost = TTI->getMemoryOpCost(
+              Instruction::Store, VecTy, BaseSI->getAlign(),
+              BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
+        }
       }
       return VecStCost + CommonCost;
     };
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-stores.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-stores.ll
index ae1c3e1ee0da2..071d0b972f23a 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-stores.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-stores.ll
@@ -8,7 +8,7 @@
 ; YAML-NEXT: Function: test
 ; YAML-NEXT: Args:
 ; YAML-NEXT: - String: 'Stores SLP vectorized with cost '
-; YAML-NEXT: - Cost: '-1'
+; YAML-NEXT: - Cost: '-2'
 ; YAML-NEXT: - String: ' and with tree size '
 ; YAML-NEXT: - TreeSize: '2'
 define void @test(ptr %h) {
@@ -17,16 +17,9 @@ define void @test(ptr %h) {
 ; CHECK-NEXT: [[ENTRY:.*:]]
 ; CHECK-NEXT: [[DCT2X211:%.*]] = alloca [0 x [0 x [8 x i64]]], i32 0, align 16
 ; CHECK-NEXT: [[CHROMA_DC209:%.*]] = getelementptr i8, ptr [[H]], i64 0
-; CHECK-NEXT: [[ARRAYIDX33_I:%.*]] = getelementptr i8, ptr [[DCT2X211]], i64 8
-; CHECK-NEXT: [[ARRAYIDX36_I181:%.*]] = getelementptr i8, ptr [[DCT2X211]], i64 24
-; CHECK-NEXT: [[TMP0:%.*]] = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i64(ptr align 4 [[DCT2X211]], i64 16, <2 x i1> splat (i1 true), i32 2)
-; CHECK-NEXT: store <2 x i64> [[TMP0]], ptr [[CHROMA_DC209]], align 2
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[ARRAYIDX33_I]], align 2
-; CHECK-NEXT: [[ARRAYIDX5_I226:%.*]] = getelementptr i8, ptr [[H]], i64 16
-; CHECK-NEXT: store i64 [[TMP2]], ptr [[ARRAYIDX5_I226]], align 2
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr [[ARRAYIDX36_I181]], align 2
-; CHECK-NEXT: [[ARRAYIDX7_I228:%.*]] = getelementptr i8, ptr [[H]], i64 24
-; CHECK-NEXT: store i64 [[TMP3]], ptr [[ARRAYIDX7_I228]], align 2
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr [[DCT2X211]], align 16
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[CHROMA_DC209]], align 2
 ; CHECK-NEXT: ret void
 ;
 entry: