diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index d033b7c2ef4a9..687c8052fde45 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -9616,8 +9616,20 @@ void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
       Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {},
                                CostKind, Idx, getWidenedType(ScalarTy, Sz));
     }
-    Cost += TTI->getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
-                                          /*Extract=*/false, CostKind);
+    if (auto *FTy = dyn_cast<FixedVectorType>(ScalarTy)) {
+      assert(SLPReVec && "Only supported by REVEC.");
+      // If ScalarTy is FixedVectorType, we should use CreateInsertVector instead
+      // of CreateInsertElement.
+      unsigned ScalarTyNumElements = getNumElements(ScalarTy);
+      for (unsigned I : seq<unsigned>(TE.Scalars.size()))
+        if (DemandedElts[I])
+          Cost +=
+              TTI->getShuffleCost(TTI::SK_InsertSubvector, VecTy, std::nullopt,
+                                  CostKind, I * ScalarTyNumElements, FTy);
+    } else {
+      Cost += TTI->getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
+                                            /*Extract=*/false, CostKind);
+    }
     int Sz = TE.Scalars.size();
     SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
                                  TE.ReorderIndices.end());
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/revec-fix-117393.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/revec-fix-117393.ll
new file mode 100644
index 0000000000000..c40e32baad7b3
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/revec-fix-117393.ll
@@ -0,0 +1,30 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=systemz-unknown -mcpu=z15 -passes=slp-vectorizer -S -slp-revec %s | FileCheck %s
+
+define void @h() {
+; CHECK-LABEL: @h(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = shl <4 x i32> zeroinitializer, zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = or <4 x i32> [[TMP0]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = or <4 x i32> splat (i32 1), zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = shl <4 x i32> zeroinitializer, zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = or <4 x i32> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = and <4 x i32> [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = and <4 x i32> zeroinitializer, [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = and <4 x i32> [[TMP4]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP7]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = shl <4 x i32> zeroinitializer, zeroinitializer
+  %1 = or <4 x i32> %0, zeroinitializer
+  %2 = or <4 x i32> splat (i32 1), zeroinitializer
+  %3 = or <4 x i32> zeroinitializer, zeroinitializer
+  %4 = shl <4 x i32> zeroinitializer, zeroinitializer
+  %5 = or <4 x i32> %4, zeroinitializer
+  %6 = and <4 x i32> %2, %1
+  %7 = and <4 x i32> %3, %6
+  %8 = and <4 x i32> %5, %7
+  %9 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %8)
+  ret void
+}