Skip to content

Commit fa3ae38

Browse files
committed
[LV] Re-compute cost of scalarized load users.
If there are direct memory op users of the newly scalarized load, their cost may have changed because there's no scalarization overhead for the operand. Update it. This ensures assigning consistent costs to scalarized memory instructions that themselves have scalarized memory instructions as operands. (cherry picked from commit 1a85027)
1 parent 5d30383 commit fa3ae38

File tree

3 files changed

+490
-6
lines changed

3 files changed

+490
-6
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5739,6 +5739,20 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
57395739
Worklist.push_back(InstOp);
57405740
}
57415741

5742+
auto UpdateMemOpUserCost = [this, VF](LoadInst *LI) {
5743+
// If there are direct memory op users of the newly scalarized load,
5744+
// their cost may have changed because there's no scalarization
5745+
// overhead for the operand. Update it.
5746+
for (User *U : LI->users()) {
5747+
if (!isa<LoadInst, StoreInst>(U))
5748+
continue;
5749+
if (getWideningDecision(cast<Instruction>(U), VF) != CM_Scalarize)
5750+
continue;
5751+
setWideningDecision(
5752+
cast<Instruction>(U), VF, CM_Scalarize,
5753+
getMemInstScalarizationCost(cast<Instruction>(U), VF));
5754+
}
5755+
};
57425756
for (auto *I : AddrDefs) {
57435757
if (isa<LoadInst>(I)) {
57445758
// Setting the desired widening decision should ideally be handled in
@@ -5748,21 +5762,24 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
57485762
InstWidening Decision = getWideningDecision(I, VF);
57495763
if (Decision == CM_Widen || Decision == CM_Widen_Reverse ||
57505764
(!isPredicatedInst(I) && !Legal->isUniformMemOp(*I, VF) &&
5751-
Decision == CM_Scalarize))
5765+
Decision == CM_Scalarize)) {
57525766
// Scalarize a widened load of address or update the cost of a scalar
57535767
// load of an address.
57545768
setWideningDecision(
57555769
I, VF, CM_Scalarize,
57565770
(VF.getKnownMinValue() *
57575771
getMemoryInstructionCost(I, ElementCount::getFixed(1))));
5758-
else if (const auto *Group = getInterleavedAccessGroup(I)) {
5772+
UpdateMemOpUserCost(cast<LoadInst>(I));
5773+
} else if (const auto *Group = getInterleavedAccessGroup(I)) {
57595774
// Scalarize an interleave group of address loads.
57605775
for (unsigned I = 0; I < Group->getFactor(); ++I) {
5761-
if (Instruction *Member = Group->getMember(I))
5776+
if (Instruction *Member = Group->getMember(I)) {
57625777
setWideningDecision(
57635778
Member, VF, CM_Scalarize,
57645779
(VF.getKnownMinValue() *
57655780
getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
5781+
UpdateMemOpUserCost(cast<LoadInst>(Member));
5782+
}
57665783
}
57675784
}
57685785
} else {

llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ target triple = "aarch64-unknown-linux"
77
; Test case from https://github.com/llvm/llvm-project/issues/148431.
88
define void @test_predicated_load_cast_hint(ptr %dst.1, ptr %dst.2, ptr %src, i8 %n, i64 %off) #0 {
99
; CHECK-LABEL: define void @test_predicated_load_cast_hint(
10+
; CHECK-SAME: ptr [[DST_1:%.*]], ptr [[DST_2:%.*]], ptr [[SRC:%.*]], i8 [[N:%.*]], i64 [[OFF:%.*]]) #[[ATTR0:[0-9]+]] {
11+
; CHECK-NEXT: [[ENTRY:.*]]:
1012
; CHECK-NEXT: [[N_EXT:%.*]] = sext i8 [[N]] to i32
1113
; CHECK-NEXT: [[N_SUB:%.*]] = add i32 [[N_EXT]], -15
1214
; CHECK-NEXT: [[SMAX16:%.*]] = call i32 @llvm.smax.i32(i32 [[N_SUB]], i32 4)
@@ -65,6 +67,9 @@ define void @test_predicated_load_cast_hint(ptr %dst.1, ptr %dst.2, ptr %src, i8
6567
; CHECK-NEXT: [[CONFLICT_RDX15:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT14]]
6668
; CHECK-NEXT: br i1 [[CONFLICT_RDX15]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
6769
; CHECK: [[VECTOR_PH]]:
70+
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP2]], 3
71+
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4
72+
; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
6873
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 [[TMP2]])
6974
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
7075
; CHECK: [[VECTOR_BODY]]:
@@ -119,10 +124,10 @@ define void @test_predicated_load_cast_hint(ptr %dst.1, ptr %dst.2, ptr %src, i8
119124
; CHECK-NEXT: store i8 0, ptr [[DST_2]], align 1, !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]]
120125
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
121126
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX_NEXT]], i32 [[TMP2]])
122-
; CHECK-NEXT: [[TMP47:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
123-
; CHECK-NEXT: [[TMP48:%.*]] = xor i1 [[TMP47]], true
127+
; CHECK-NEXT: [[TMP48:%.*]] = xor <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
124128
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 16)
125-
; CHECK-NEXT: br i1 [[TMP48]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
129+
; CHECK-NEXT: [[TMP49:%.*]] = extractelement <4 x i1> [[TMP48]], i32 0
130+
; CHECK-NEXT: br i1 [[TMP49]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
126131
; CHECK: [[MIDDLE_BLOCK]]:
127132
; CHECK-NEXT: br label %[[EXIT:.*]]
128133
; CHECK: [[SCALAR_PH]]:

0 commit comments

Comments
 (0)