Skip to content

Commit 92c8faa

Browse files
committed
[VPlan] Consider address computation cost in VPInterleaveRecipe.
As a side-effect of #106431, we started vectorising loops such as: ```cpp void s351(float *a, float *b, float *c, long n) { float alpha = c[0]; for (int i = 0; i < n; i += 5) { a[i] += alpha * b[i]; a[i + 1] += alpha * b[i + 1]; a[i + 2] += alpha * b[i + 2]; a[i + 3] += alpha * b[i + 3]; a[i + 4] += alpha * b[i + 4]; } } ``` (https://godbolt.org/z/o1fGfj6j7), which resulted in a slowdown of 54%. The estimated cost of the scalar and vectorised versions is currently very close (51 vs 50.5 per lane), in favour of the vectorised version. This patch helps VPlan favour the scalar version by taking into account the cost of address computations in VPInterleaveRecipe, as is already done for the scalar version in LoopVectorizationCostModel::getMemoryInstructionCost and for other recipes such as VPWidenMemoryRecipe. This affects ``` LLVM :: Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll ``` due to the new cost estimate. I've updated the test accordingly, but please let me know if this is not appropriate. As mentioned in #82218, ideally the loop above should be rerolled before attempting vectorisation. I'm hoping the approach in this patch is a reasonble alternative in its own right in the meantime. If there's a better suggestion to achieve this, please let me know. :)
1 parent def7bbb commit 92c8faa

File tree

2 files changed

+30
-50
lines changed

2 files changed

+30
-50
lines changed

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3666,6 +3666,9 @@ InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF,
36663666
InsertPos->getOpcode(), WideVecTy, IG->getFactor(), Indices,
36673667
IG->getAlign(), AS, Ctx.CostKind, getMask(), NeedsMaskForGaps);
36683668

3669+
// Add the address computation cost.
3670+
Cost += Ctx.TTI.getAddressComputationCost(WideVecTy);
3671+
36693672
if (!IG->isReverse())
36703673
return Cost;
36713674

llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll

Lines changed: 27 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -8,46 +8,31 @@ define void @test_complex_add_float(ptr %res, ptr noalias %A, ptr noalias %B, i6
88
; CHECK-LABEL: define void @test_complex_add_float(
99
; CHECK-SAME: ptr [[RES:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) {
1010
; CHECK-NEXT: [[ENTRY:.*:]]
11-
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
11+
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
1212
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
1313
; CHECK: [[VECTOR_PH]]:
14-
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
14+
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
1515
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
1616
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
1717
; CHECK: [[VECTOR_BODY]]:
1818
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
19-
; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
20-
; CHECK-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[A]], i64 [[INDEX]]
21-
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[A]], i64 [[TMP1]]
22-
; CHECK-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[B]], i64 [[INDEX]]
23-
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[B]], i64 [[TMP1]]
24-
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x float>, ptr [[GEP_A_0]], align 4
19+
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[A]], i64 [[INDEX]]
20+
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[B]], i64 [[INDEX]]
21+
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x float>, ptr [[TMP0]], align 4
2522
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x float> [[WIDE_VEC]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
2623
; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x float> [[WIDE_VEC]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
27-
; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <8 x float>, ptr [[TMP3]], align 4
24+
; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <8 x float>, ptr [[TMP1]], align 4
2825
; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x float> [[WIDE_VEC2]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
2926
; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x float> [[WIDE_VEC2]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
30-
; CHECK-NEXT: [[WIDE_VEC5:%.*]] = load <8 x float>, ptr [[GEP_B_0]], align 4
31-
; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <8 x float> [[WIDE_VEC5]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
32-
; CHECK-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <8 x float> [[WIDE_VEC5]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
33-
; CHECK-NEXT: [[WIDE_VEC8:%.*]] = load <8 x float>, ptr [[TMP5]], align 4
34-
; CHECK-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <8 x float> [[WIDE_VEC8]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
35-
; CHECK-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <8 x float> [[WIDE_VEC8]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
36-
; CHECK-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[STRIDED_VEC]], [[STRIDED_VEC6]]
37-
; CHECK-NEXT: [[TMP7:%.*]] = fadd <4 x float> [[STRIDED_VEC3]], [[STRIDED_VEC9]]
38-
; CHECK-NEXT: [[TMP8:%.*]] = fadd <4 x float> [[STRIDED_VEC1]], [[STRIDED_VEC7]]
39-
; CHECK-NEXT: [[TMP9:%.*]] = fadd <4 x float> [[STRIDED_VEC4]], [[STRIDED_VEC10]]
40-
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[RES]], i64 [[INDEX]]
41-
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[RES]], i64 [[TMP1]]
42-
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
43-
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
44-
; CHECK-NEXT: store <8 x float> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 4
45-
; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
46-
; CHECK-NEXT: [[INTERLEAVED_VEC11:%.*]] = shufflevector <8 x float> [[TMP13]], <8 x float> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
47-
; CHECK-NEXT: store <8 x float> [[INTERLEAVED_VEC11]], ptr [[TMP11]], align 4
48-
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
49-
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
50-
; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
27+
; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[STRIDED_VEC]], [[STRIDED_VEC3]]
28+
; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[STRIDED_VEC1]], [[STRIDED_VEC4]]
29+
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[RES]], i64 [[INDEX]]
30+
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
31+
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
32+
; CHECK-NEXT: store <8 x float> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 4
33+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
34+
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
35+
; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
5136
; CHECK: [[MIDDLE_BLOCK]]:
5237
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
5338
; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
@@ -84,32 +69,24 @@ define void @test_complex_add_double(ptr %res, ptr noalias %A, ptr noalias %B, i
8469
; CHECK-LABEL: define void @test_complex_add_double(
8570
; CHECK-SAME: ptr [[RES:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) {
8671
; CHECK-NEXT: [[ENTRY:.*:]]
87-
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
72+
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
8873
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
8974
; CHECK: [[VECTOR_PH]]:
90-
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
75+
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
9176
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
9277
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
9378
; CHECK: [[VECTOR_BODY]]:
9479
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
95-
; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
96-
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[INDEX]]
97-
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[TMP1]]
98-
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i64 [[INDEX]]
99-
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i64 [[TMP1]]
100-
; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = load <2 x double>, ptr [[TMP2]], align 4
101-
; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = load <2 x double>, ptr [[TMP3]], align 4
102-
; CHECK-NEXT: [[STRIDED_VEC7:%.*]] = load <2 x double>, ptr [[TMP4]], align 4
103-
; CHECK-NEXT: [[STRIDED_VEC11:%.*]] = load <2 x double>, ptr [[TMP5]], align 4
104-
; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x double> [[STRIDED_VEC1]], [[STRIDED_VEC7]]
105-
; CHECK-NEXT: [[TMP15:%.*]] = fadd <2 x double> [[STRIDED_VEC5]], [[STRIDED_VEC11]]
106-
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[INDEX]]
107-
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[TMP1]]
108-
; CHECK-NEXT: store <2 x double> [[TMP8]], ptr [[TMP10]], align 4
109-
; CHECK-NEXT: store <2 x double> [[TMP15]], ptr [[TMP11]], align 4
110-
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
111-
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
112-
; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
80+
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[INDEX]]
81+
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i64 [[INDEX]]
82+
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 4
83+
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x double>, ptr [[TMP1]], align 4
84+
; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x double> [[WIDE_LOAD]], [[WIDE_LOAD1]]
85+
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[INDEX]]
86+
; CHECK-NEXT: store <2 x double> [[TMP2]], ptr [[TMP3]], align 4
87+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
88+
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
89+
; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
11390
; CHECK: [[MIDDLE_BLOCK]]:
11491
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
11592
; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]

0 commit comments

Comments
 (0)