Commit ead3a2f

[SLP][REVEC] getScalarizationOverhead should not be used when ScalarTy is FixedVectorType. (#117536)
1 parent 537343d commit ead3a2f

2 files changed: +44 −2 lines


llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 14 additions & 2 deletions
@@ -9614,8 +9614,20 @@ void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
     Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {}, CostKind,
                              Idx, getWidenedType(ScalarTy, Sz));
   }
-  Cost += TTI->getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
-                                        /*Extract=*/false, CostKind);
+  if (auto *FTy = dyn_cast<FixedVectorType>(ScalarTy)) {
+    assert(SLPReVec && "Only supported by REVEC.");
+    // If ScalarTy is FixedVectorType, we should use CreateInsertVector instead
+    // of CreateInsertElement.
+    unsigned ScalarTyNumElements = getNumElements(ScalarTy);
+    for (unsigned I : seq<unsigned>(TE.Scalars.size()))
+      if (DemandedElts[I])
+        Cost +=
+            TTI->getShuffleCost(TTI::SK_InsertSubvector, VecTy, std::nullopt,
+                                CostKind, I * ScalarTyNumElements, FTy);
+  } else {
+    Cost += TTI->getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
+                                          /*Extract=*/false, CostKind);
+  }
   int Sz = TE.Scalars.size();
   SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
                                TE.ReorderIndices.end());
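For intuition, here is a minimal standalone sketch of the cost-accounting split made above. It does not use the real TTI interface; gatherInsertCost, InsertElementCost, and InsertSubvectorCost are made-up placeholders. The point it illustrates: when each SLP "scalar" is itself a fixed-width vector (the REVEC case), building the gathered vector is charged as one insert-subvector shuffle per demanded element (at lane offset I * ScalarTyNumElements in the real code), rather than the per-lane insertelement scalarization overhead that getScalarizationOverhead models.

```cpp
// Standalone illustration only; not the LLVM API. Placeholder costs below
// stand in for the values the real TTI hooks would return.
#include <cstdio>
#include <vector>

constexpr unsigned InsertElementCost = 1;   // one insertelement (scalarization)
constexpr unsigned InsertSubvectorCost = 1; // one SK_InsertSubvector shuffle

// ScalarTyNumElements == 1 models a plain scalar element; > 1 models REVEC,
// where each SLP "scalar" is a whole fixed-width subvector.
unsigned gatherInsertCost(const std::vector<bool> &DemandedElts,
                          unsigned ScalarTyNumElements) {
  unsigned Cost = 0;
  if (ScalarTyNumElements > 1) {
    // REVEC path: charge one subvector insert per demanded element.
    for (bool Demanded : DemandedElts)
      if (Demanded)
        Cost += InsertSubvectorCost;
  } else {
    // Non-REVEC path: classic scalarization overhead, one insertelement
    // per demanded lane.
    for (bool Demanded : DemandedElts)
      if (Demanded)
        Cost += InsertElementCost;
  }
  return Cost;
}

int main() {
  std::vector<bool> Demanded = {true, false, true, true};
  std::printf("scalar elements:        cost = %u\n",
              gatherInsertCost(Demanded, 1));
  std::printf("REVEC <4 x i32> elements: cost = %u\n",
              gatherInsertCost(Demanded, 4));
  return 0;
}
```

The design choice mirrors the inline comment in the patch: with REVEC, the gather is materialized with CreateInsertVector rather than CreateInsertElement, so the cost model should price insert-subvector shuffles, not per-lane scalarization.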
Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=systemz-unknown -mcpu=z15 -passes=slp-vectorizer -S -slp-revec %s | FileCheck %s
+
+define void @h() {
+; CHECK-LABEL: @h(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = shl <4 x i32> zeroinitializer, zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = or <4 x i32> [[TMP0]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = or <4 x i32> splat (i32 1), zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = shl <4 x i32> zeroinitializer, zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = or <4 x i32> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = and <4 x i32> [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = and <4 x i32> zeroinitializer, [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = and <4 x i32> [[TMP4]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP7]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = shl <4 x i32> zeroinitializer, zeroinitializer
+  %1 = or <4 x i32> %0, zeroinitializer
+  %2 = or <4 x i32> splat (i32 1), zeroinitializer
+  %3 = or <4 x i32> zeroinitializer, zeroinitializer
+  %4 = shl <4 x i32> zeroinitializer, zeroinitializer
+  %5 = or <4 x i32> %4, zeroinitializer
+  %6 = and <4 x i32> %2, %1
+  %7 = and <4 x i32> %3, %6
+  %8 = and <4 x i32> %5, %7
+  %9 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %8)
+  ret void
+}
