From 88c8eae3602e61b1b65457fcddd649645e2b8f93 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev@outlook.com>
Date: Thu, 7 Nov 2024 18:05:25 +0000
Subject: [PATCH] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20initia?=
 =?UTF-8?q?l=20version?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Created using spr 1.3.5
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 40 +++++++++++++++++--
 .../SLPVectorizer/RISCV/segmented-stores.ll   | 15 ++-----
 2 files changed, 40 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 184413b420089..bff0684a2e6f1 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -9759,6 +9759,29 @@ void BoUpSLP::transformNodes() {
           // Strided store is more profitable than reverse + consecutive store -
           // transform the node to strided store.
           E.State = TreeEntry::StridedVectorize;
+      } else if (!E.ReorderIndices.empty()) {
+        // Check for interleaved stores.
+        auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
+          assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
+          if (Mask.size() < 4)
+            return 0u;
+          for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
+            if (ShuffleVectorInst::isInterleaveMask(
+                    Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
+                TTI.isLegalInterleavedAccessType(
+                    VecTy, Factor,
+                    cast<StoreInst>(E.Scalars.front())->getAlign(),
+                    cast<StoreInst>(E.Scalars.front())
+                        ->getPointerAddressSpace()))
+              return Factor;
+          }
+
+          return 0u;
+        };
+        SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
+        unsigned InterleaveFactor = IsInterleaveMask(Mask);
+        if (InterleaveFactor != 0)
+          E.setInterleave(InterleaveFactor);
       }
       break;
     }
@@ -11428,10 +11451,19 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
       } else {
         assert(E->State == TreeEntry::Vectorize &&
                "Expected either strided or consecutive stores.");
-        TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
-        VecStCost = TTI->getMemoryOpCost(
-            Instruction::Store, VecTy, BaseSI->getAlign(),
-            BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
+        if (unsigned Factor = E->getInterleaveFactor()) {
+          assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
+                 "No reused shuffles expected");
+          CommonCost = 0;
+          VecStCost = TTI->getInterleavedMemoryOpCost(
+              Instruction::Store, VecTy, Factor, std::nullopt,
+              BaseSI->getAlign(), BaseSI->getPointerAddressSpace(), CostKind);
+        } else {
+          TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
+          VecStCost = TTI->getMemoryOpCost(
+              Instruction::Store, VecTy, BaseSI->getAlign(),
+              BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
+        }
       }
       return VecStCost + CommonCost;
     };
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-stores.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-stores.ll
index ae1c3e1ee0da2..071d0b972f23a 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-stores.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-stores.ll
@@ -8,7 +8,7 @@
 ; YAML-NEXT: Function: test
 ; YAML-NEXT: Args:
 ; YAML-NEXT: - String: 'Stores SLP vectorized with cost '
-; YAML-NEXT: - Cost: '-1'
+; YAML-NEXT: - Cost: '-2'
 ; YAML-NEXT: - String: ' and with tree size '
 ; YAML-NEXT: - TreeSize: '2'
 define void @test(ptr %h) {
@@ -17,16 +17,9 @@ define void @test(ptr %h) {
 ; CHECK-LABEL: define void @test(
 ; CHECK-NEXT: [[ENTRY:.*:]]
 ; CHECK-NEXT: [[DCT2X211:%.*]] = alloca [0 x [0 x [8 x i64]]], i32 0, align 16
 ; CHECK-NEXT: [[CHROMA_DC209:%.*]] = getelementptr i8, ptr [[H]], i64 0
-; CHECK-NEXT: [[ARRAYIDX33_I:%.*]] = getelementptr i8, ptr [[DCT2X211]], i64 8
-; CHECK-NEXT: [[ARRAYIDX36_I181:%.*]] = getelementptr i8, ptr [[DCT2X211]], i64 24
-; CHECK-NEXT: [[TMP0:%.*]] = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i64(ptr align 4 [[DCT2X211]], i64 16, <2 x i1> splat (i1 true), i32 2)
-; CHECK-NEXT: store <2 x i64> [[TMP0]], ptr [[CHROMA_DC209]], align 2
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[ARRAYIDX33_I]], align 2
-; CHECK-NEXT: [[ARRAYIDX5_I226:%.*]] = getelementptr i8, ptr [[H]], i64 16
-; CHECK-NEXT: store i64 [[TMP2]], ptr [[ARRAYIDX5_I226]], align 2
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr [[ARRAYIDX36_I181]], align 2
-; CHECK-NEXT: [[ARRAYIDX7_I228:%.*]] = getelementptr i8, ptr [[H]], i64 24
-; CHECK-NEXT: store i64 [[TMP3]], ptr [[ARRAYIDX7_I228]], align 2
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr [[DCT2X211]], align 16
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[CHROMA_DC209]], align 2
 ; CHECK-NEXT: ret void
 ;
 entry:
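
Note for reviewers unfamiliar with interleave masks: the reorder <0, 2, 1, 3>
checked above is the factor-2 interleave of the two 2-lane runs {0, 1} and
{2, 3}, which is why the node can be costed and emitted as a single segmented
(interleaved) store instead of a shuffle plus a plain store. The snippet below
is a minimal standalone sketch of that shape test, assuming the canonical
pattern Mask[j] == (j % Factor) * Lanes + j / Factor; the names
isCanonicalInterleaveMask and Lanes are hypothetical, and this is an
illustration of the idea, not LLVM's ShuffleVectorInst::isInterleaveMask
implementation.

// Standalone sketch (not LLVM code): does Mask interleave `Factor`
// equal-length contiguous runs of a single input vector?
#include <cstdio>
#include <vector>

static bool isCanonicalInterleaveMask(const std::vector<int> &Mask,
                                      unsigned Factor) {
  // Fewer than 2 runs, or a size that does not split evenly, cannot match.
  if (Factor < 2 || Mask.empty() || Mask.size() % Factor != 0)
    return false;
  unsigned Lanes = Mask.size() / Factor; // lanes per interleaved run
  for (unsigned J = 0; J < Mask.size(); ++J)
    if (Mask[J] != int((J % Factor) * Lanes + J / Factor))
      return false;
  return true;
}

int main() {
  std::vector<int> Mask = {0, 2, 1, 3}; // the reorder from the test above
  // Mirrors the seq<unsigned>(2, Mask.size() / 2 + 1) search in the patch,
  // which takes the first factor that is both an interleave and legal.
  for (unsigned Factor = 2; Factor <= Mask.size() / 2; ++Factor)
    std::printf("factor %u: %s\n", Factor,
                isCanonicalInterleaveMask(Mask, Factor) ? "yes" : "no");
  // Prints "factor 2: yes", matching the Factor passed to
  // getInterleavedMemoryOpCost in the cost-model hunk.
  return 0;
}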