Commit adb6efe

[SLP] Fix cost estimation of external uses with wrong VF (#148185)
The external-use cost estimation in BoUpSLP::getTreeCost() assumed that the VF remains constant throughout the tree. That is not always true, so we could end up querying the extraction cost for a lane that is out of bounds. We ran into this while experimenting with re-vectorisation for AArch64; a proper AArch64 test cannot be added yet because it would require more changes to be brought in. This commit only fixes the computation of the VF and adds an assert.

Some tests were failing after adding the assert:
- foo() in llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll
- test() in llvm/test/Transforms/SLPVectorizer/X86/reduction-with-removed-extracts.ll
- test_with_extract() in llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll
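To make the failure mode concrete, here is a minimal, self-contained C++ sketch. The TreeEntry/ExternalUse structs and the extractCost() helper below are simplified stand-ins rather than the real SLP data structures: they only illustrate how taking the bundle width once from the root entry lets an external use that lives in a wider entry be costed with an out-of-bounds lane, which is what the new per-entry width and assert guard against.

// Minimal sketch of the failure mode; TreeEntry, ExternalUse and extractCost
// are simplified stand-ins for the SLP structures, not LLVM code.
#include <cassert>
#include <cstdio>

struct TreeEntry {
  unsigned VectorFactor; // number of lanes this entry is vectorized to
};

struct ExternalUse {
  const TreeEntry *E; // entry the externally used scalar lives in
  unsigned Lane;      // lane of that scalar within its entry
};

// Stand-in for an extract-element cost query; only meaningful when the lane
// exists in a vector of the given width.
static unsigned extractCost(unsigned Width, unsigned Lane) {
  assert(Lane < Width && "Extracted lane out of bounds.");
  return 1; // placeholder cost
}

int main() {
  TreeEntry Root{4};        // root of the tree, VF = 4
  TreeEntry Wide{8};        // another entry in the same tree, VF = 8
  ExternalUse EU{&Wide, 7}; // externally used scalar in lane 7 of the wide entry

  // Before the fix: the width came from the root, so lane 7 was queried
  // against a 4-wide vector type (out of bounds, silently mis-costed).
  // unsigned Bad = extractCost(Root.VectorFactor, EU.Lane); // would assert

  // After the fix: the width comes from the entry the use actually belongs to.
  unsigned Good = extractCost(EU.E->VectorFactor, EU.Lane);
  std::printf("extract cost: %u\n", Good);
  (void)Root;
  return 0;
}

In the patch itself, the same idea appears as BundleWidth = EU.E.getVectorFactor() plus the assert on EU.Lane.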
1 parent 97d44e3 commit adb6efe

File tree: 2 files changed, +42, -2 lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 11 additions & 2 deletions

@@ -14560,8 +14560,6 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
   LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
                     << VectorizableTree.size() << ".\n");
 
-  unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
-
   SmallPtrSet<Value *, 4> CheckedExtracts;
   for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
     TreeEntry &TE = *VectorizableTree[I];
@@ -14624,6 +14622,11 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
   }
   SmallDenseSet<std::pair<Value *, Value *>, 8> CheckedScalarUser;
   for (ExternalUser &EU : ExternalUses) {
+    LLVM_DEBUG(dbgs() << "SLP: Computing cost for external use of TreeEntry "
+                      << EU.E.Idx << " in lane " << EU.Lane << "\n");
+    LLVM_DEBUG(dbgs() << " User:" << *EU.User << "\n");
+    LLVM_DEBUG(dbgs() << " Use: " << EU.Scalar->getNameOrAsOperand() << "\n");
+
     // Uses by ephemeral values are free (because the ephemeral value will be
     // removed prior to code generation, and so the extraction will be
     // removed as well).
@@ -14731,6 +14734,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
     // for the extract and the added cost of the sign extend if needed.
     InstructionCost ExtraCost = TTI::TCC_Free;
     auto *ScalarTy = EU.Scalar->getType();
+    const unsigned BundleWidth = EU.E.getVectorFactor();
+    assert(EU.Lane < BundleWidth && "Extracted lane out of bounds.");
     auto *VecTy = getWidenedType(ScalarTy, BundleWidth);
     const TreeEntry *Entry = &EU.E;
     auto It = MinBWs.find(Entry);
@@ -14744,10 +14749,14 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
       VecTy = getWidenedType(MinTy, BundleWidth);
       ExtraCost =
           getExtractWithExtendCost(*TTI, Extend, ScalarTy, VecTy, EU.Lane);
+      LLVM_DEBUG(dbgs() << " ExtractExtend or ExtractSubvec cost: "
+                        << ExtraCost << "\n");
     } else {
       ExtraCost =
           getVectorInstrCost(*TTI, ScalarTy, Instruction::ExtractElement, VecTy,
                              CostKind, EU.Lane, EU.Scalar, ScalarUserAndIdx);
+      LLVM_DEBUG(dbgs() << " ExtractElement cost for " << *ScalarTy << " from "
+                        << *VecTy << ": " << ExtraCost << "\n");
     }
     // Leave the scalar instructions as is if they are cheaper than extracts.
     if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
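A note on exercising the new diagnostics: the added LLVM_DEBUG statements only print in an assertions-enabled build and are gated on the pass's debug type, so an invocation along the lines of opt -passes=slp-vectorizer -debug-only=SLP -S file.ll should dump the per-external-use cost breakdown (check DEBUG_TYPE in SLPVectorizer.cpp for the exact -debug-only tag).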

llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll

Lines changed: 31 additions & 0 deletions

@@ -31,3 +31,34 @@ define void @test() {
   store double %res4, ptr getelementptr inbounds ([8 x double], ptr @dst, i32 0, i64 3), align 8
   ret void
 }
+
+; Same as above, but %a7 is also used as a scalar and must be extracted from
+; the wide load. (Or in this case, kept as a scalar load).
+define double @test_with_extract() {
+; CHECK-LABEL: @test_with_extract(
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x double>, ptr @src, align 8
+; CHECK-NEXT:    [[A7:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 7), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = fsub fast <4 x double> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    store <4 x double> [[TMP4]], ptr @dst, align 8
+; CHECK-NEXT:    ret double [[A7]]
+;
+  %a0 = load double, ptr @src, align 8
+  %a1 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 1), align 8
+  %a2 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 2), align 8
+  %a3 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 3), align 8
+  %a4 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 4), align 8
+  %a5 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 5), align 8
+  %a6 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 6), align 8
+  %a7 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 7), align 8
+  %res1 = fsub fast double %a0, %a1
+  %res2 = fsub fast double %a2, %a3
+  %res3 = fsub fast double %a4, %a5
+  %res4 = fsub fast double %a6, %a7
+  store double %res1, ptr @dst, align 8
+  store double %res2, ptr getelementptr inbounds ([8 x double], ptr @dst, i32 0, i64 1), align 8
+  store double %res3, ptr getelementptr inbounds ([8 x double], ptr @dst, i32 0, i64 2), align 8
+  store double %res4, ptr getelementptr inbounds ([8 x double], ptr @dst, i32 0, i64 3), align 8
+  ret double %a7
+}
