Skip to content

Commit ff9ba7c

Browse files
committed
[LoopVectorize][NFC] Rewrite tests to check output of vplan cost model
Currently it's not possible to improve the cost model for tail-folded loops: as soon as you add a VPInstruction::computeCost function that accounts for the costs of instructions such as VPInstruction::ActiveLaneMask and VPInstruction::ExplicitVectorLength, the assert in LoopVectorizationPlanner::computeBestVF fails for some tests. This is because the VF chosen by the legacy cost model no longer matches the VF chosen by the VPlan cost model. See PR llvm#90191.

This assert is currently inhibiting attempts to improve the cost model. I would like to remove the assert, since we've been using the VPlan cost model for 2 months now and that feels long enough to me. However, in order to do that we have to fix up a whole bunch of tests that rely upon the legacy cost model output. I've tried my best to update these tests to check the VPlan output instead.

There are still a whole bunch of vectoriser tests in Analysis/CostModel/X86 that depend upon the legacy cost model; I also feel they shouldn't really live there. These can be fixed up in a separate patch!
1 parent 6bac414 commit ff9ba7c

25 files changed

+362
-277
lines changed

llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ target triple = "aarch64--linux-gnu"
1313
; %var4 a lower scalarization overhead.
1414
;
1515
; COST-LABEL: predicated_udiv_scalarized_operand
16-
; COST: LV: Found an estimated cost of 5 for VF 2 For instruction: %var4 = udiv i64 %var2, %var3
16+
; COST: Cost of 5 for VF 2: profitable to scalarize %var4 = udiv i64 %var2, %var3
1717
;
1818
;
1919
define i64 @predicated_udiv_scalarized_operand(ptr %a, i64 %x) optsize {

llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,14 @@
1111
; CM: LV: Found uniform instruction: %a = extractvalue { i64, i64 } %sv, 0
1212
; CM: LV: Found uniform instruction: %b = extractvalue { i64, i64 } %sv, 1
1313

14+
; Ensure the extractvalue + add instructions are hoisted out
15+
; CM: vector.ph:
16+
; CM: CLONE ir<%a> = extractvalue ir<%sv>
17+
; CM: CLONE ir<%b> = extractvalue ir<%sv>
18+
; CM: WIDEN ir<%add> = add ir<%a>, ir<%b>
19+
; CM: Successor(s): vector loop
20+
1421
; CM: LV: Scalar loop costs: 5.
15-
; CM: LV: Found an estimated cost of 0 for VF 2 For instruction: %a = extractvalue { i64, i64 } %sv, 0
16-
; CM-NEXT: LV: Found an estimated cost of 0 for VF 2 For instruction: %b = extractvalue { i64, i64 } %sv, 1
1722

1823
; Check that the extractvalue operands are actually free in vector code.
1924

@@ -58,12 +63,14 @@ exit:
5863
; Similar to the test case above, but checks getVectorCallCost as well.
5964
declare float @powf(float, float) readnone nounwind
6065

61-
; CM: LV: Found uniform instruction: %a = extractvalue { float, float } %sv, 0
62-
; CM: LV: Found uniform instruction: %b = extractvalue { float, float } %sv, 1
66+
; Ensure the extractvalue + add instructions are hoisted out
67+
; CM: vector.ph:
68+
; CM: CLONE ir<%a> = extractvalue ir<%sv>
69+
; CM: CLONE ir<%b> = extractvalue ir<%sv>
70+
; CM: WIDEN ir<%add> = add ir<%a>, ir<%b>
71+
; CM: Successor(s): vector loop
6372

6473
; CM: LV: Scalar loop costs: 14.
65-
; CM: LV: Found an estimated cost of 0 for VF 2 For instruction: %a = extractvalue { float, float } %sv, 0
66-
; CM-NEXT: LV: Found an estimated cost of 0 for VF 2 For instruction: %b = extractvalue { float, float } %sv, 1
6774

6875
; FORCED-LABEL: define void @test_getVectorCallCost
6976

llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,9 @@ target triple = "aarch64--linux-gnu"
88

99
; CHECK-COST-LABEL: sadd
1010
; CHECK-COST: Found an estimated cost of 6 for VF 1 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
11-
; CHECK-COST: Found an estimated cost of 4 for VF 2 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
12-
; CHECK-COST: Found an estimated cost of 1 for VF 4 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
13-
; CHECK-COST: Found an estimated cost of 1 for VF 8 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
11+
; CHECK-COST: Cost of 4 for VF 2: WIDEN-INTRINSIC ir<%1> = call llvm.sadd.sat(ir<%0>, ir<%offset>)
12+
; CHECK-COST: Cost of 1 for VF 4: WIDEN-INTRINSIC ir<%1> = call llvm.sadd.sat(ir<%0>, ir<%offset>)
13+
; CHECK-COST: Cost of 1 for VF 8: WIDEN-INTRINSIC ir<%1> = call llvm.sadd.sat(ir<%0>, ir<%offset>)
1414

1515
define void @saddsat(ptr nocapture readonly %pSrc, i16 signext %offset, ptr nocapture noalias %pDst, i32 %blockSize) #0 {
1616
; CHECK-LABEL: @saddsat(
@@ -95,10 +95,10 @@ while.end: ; preds = %while.body, %entry
9595

9696
; CHECK-COST-LABEL: umin
9797
; CHECK-COST: Found an estimated cost of 2 for VF 1 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
98-
; CHECK-COST: Found an estimated cost of 1 for VF 2 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
99-
; CHECK-COST: Found an estimated cost of 1 for VF 4 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
100-
; CHECK-COST: Found an estimated cost of 1 for VF 8 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
101-
; CHECK-COST: Found an estimated cost of 1 for VF 16 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
98+
; CHECK-COST: Cost of 1 for VF 2: WIDEN-INTRINSIC ir<%1> = call llvm.umin(ir<%0>, ir<%offset>)
99+
; CHECK-COST: Cost of 1 for VF 4: WIDEN-INTRINSIC ir<%1> = call llvm.umin(ir<%0>, ir<%offset>)
100+
; CHECK-COST: Cost of 1 for VF 8: WIDEN-INTRINSIC ir<%1> = call llvm.umin(ir<%0>, ir<%offset>)
101+
; CHECK-COST: Cost of 1 for VF 16: WIDEN-INTRINSIC ir<%1> = call llvm.umin(ir<%0>, ir<%offset>)
102102

103103
define void @umin(ptr nocapture readonly %pSrc, i8 signext %offset, ptr nocapture noalias %pDst, i32 %blockSize) #0 {
104104
; CHECK-LABEL: @umin(

llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@
55
target triple = "aarch64-unknown-linux-gnu"
66

77
; CHECK-COST: Checking a loop in 'fixed_width'
8-
; CHECK-COST: Found an estimated cost of 10 for VF 2 For instruction: store i32 2, ptr %arrayidx1, align 4
9-
; CHECK-COST: Found an estimated cost of 20 for VF 4 For instruction: store i32 2, ptr %arrayidx1, align 4
8+
; CHECK-COST: Cost of 10 for VF 2: WIDEN store vp<%6>, ir<2>, vp<%5>
9+
; CHECK-COST: Cost of 20 for VF 4: WIDEN store vp<%6>, ir<2>, vp<%5>
1010
; CHECK-COST: Selecting VF: 1.
1111

1212
; We should decide this loop is not worth vectorising using fixed width vectors

llvm/test/Transforms/LoopVectorize/AArch64/maximize-bandwidth-invalidate.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,10 @@ target triple = "aarch64"
1010
; due to invalid cost decisions. The loop below has a low maximum trip count,
1111
; so will be masked.
1212

13-
; COST: LV: Found an estimated cost of 3000000 for VF 2 For instruction: %0 = load
14-
; COST: LV: Found an estimated cost of 3000000 for VF 4 For instruction: %0 = load
15-
; COST: LV: Found an estimated cost of 3000000 for VF 8 For instruction: %0 = load
16-
; COST: LV: Found an estimated cost of 3000000 for VF 16 For instruction: %0 = load
13+
; COST: Cost of 3000000 for VF 2: REPLICATE ir<%0> = load
14+
; COST: Cost of 3000000 for VF 4: REPLICATE ir<%0> = load
15+
; COST: Cost of 3000000 for VF 8: REPLICATE ir<%0> = load
16+
; COST: Cost of 3000000 for VF 16: REPLICATE ir<%0> = load
1717
; COST: LV: Selecting VF: 1.
1818

1919
define i32 @test(ptr nocapture noundef readonly %pInVec, ptr nocapture noundef readonly %pInA1, ptr nocapture noundef readonly %pInA2, ptr nocapture noundef readonly %pInA3, ptr nocapture noundef readonly %pInA4, i32 noundef %numCols) {

llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@ target triple = "aarch64--linux-gnu"
66

77
; CHECK-LABEL: all_scalar
88
; CHECK: LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 2
9-
; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %i.next = add nuw nsw i64 %i, 2
109
; CHECK: LV: Not considering vector loop of width 2 because it will not generate any vector instructions
1110
;
1211
define void @all_scalar(ptr %a, i64 %n) {
@@ -27,7 +26,6 @@ for.end:
2726

2827
; CHECK-LABEL: PR33193
2928
; CHECK: LV: Found scalar instruction: %i.next = zext i32 %j.next to i64
30-
; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction: %i.next = zext i32 %j.next to i64
3129
; CHECK: LV: Not considering vector loop of width 8 because it will not generate any vector instructions
3230
%struct.a = type { i32, i8 }
3331
define void @PR33193(ptr %a, i64 %n) {

llvm/test/Transforms/LoopVectorize/AArch64/scalable-fp-ext-trunc-illegal-type.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ target triple = "aarch64-unknown-linux-gnu"
99
;; registers required for a <vscale x 4 x fp128> when trying to maximize
1010
;; vector bandwidth with SVE.
1111

12-
; CHECK: LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %load.ext = fpext double %load.in to fp128
12+
; CHECK: Cost of Invalid for VF vscale x 2: WIDEN-CAST ir<%load.ext> = fpext ir<%load.in> to fp128
1313

1414
define void @load_ext_trunc_store(ptr readonly %in, ptr noalias %out, i64 %N) {
1515
; CHECK-LABEL: define void @load_ext_trunc_store(

llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll

Lines changed: 26 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,54 +1,59 @@
11
; REQUIRES: asserts
22
; RUN: opt -mtriple=aarch64 -mattr=+sve \
33
; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
4-
; RUN: | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE4
4+
; RUN: | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE16
55

66
; RUN: opt -mtriple=aarch64 -mattr=+sve -mcpu=generic \
77
; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
8-
; RUN: | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE4
8+
; RUN: | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE16
99

1010
; RUN: opt -mtriple=aarch64 -mcpu=neoverse-v1 \
1111
; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
12-
; RUN: | FileCheck %s --check-prefixes=NEOVERSE-V1,VF-VSCALE4
12+
; RUN: | FileCheck %s --check-prefixes=NEOVERSE-V1,VF-VSCALE16
1313

1414
; RUN: opt -mtriple=aarch64 -mcpu=neoverse-n2 \
1515
; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
16-
; RUN: | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE4
16+
; RUN: | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE16
1717

18-
; RUN: opt -mtriple=aarch64 -mcpu=neoverse-n2 \
18+
; RUN: opt -mtriple=aarch64 -mcpu=neoverse-v2 \
1919
; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
20-
; RUN: | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE4
20+
; RUN: | FileCheck %s --check-prefixes=NEOVERSE-V2,VF-16
21+
22+
; GENERIC: Cost for VF vscale x 2: 11
23+
; GENERIC: Cost for VF vscale x 4: 11
24+
; GENERIC: LV: Selecting VF: vscale x 16
2125

22-
; GENERIC: LV: Vector loop of width vscale x 2 costs: 3 (assuming a minimum vscale of 2).
23-
; GENERIC: LV: Vector loop of width vscale x 4 costs: 1 (assuming a minimum vscale of 2).
26+
; NEOVERSE-V1: Cost for VF vscale x 2: 11
27+
; NEOVERSE-V1: Cost for VF vscale x 4: 11
28+
; NEOVERSE-V1: LV: Selecting VF: vscale x 16
2429

25-
; NEOVERSE-V1: LV: Vector loop of width vscale x 2 costs: 3 (assuming a minimum vscale of 2).
26-
; NEOVERSE-V1: LV: Vector loop of width vscale x 4 costs: 1 (assuming a minimum vscale of 2).
30+
; NEOVERSE-N2: Cost for VF vscale x 2: 11
31+
; NEOVERSE-N2: Cost for VF vscale x 4: 11
32+
; NEOVERSE-N2: LV: Selecting VF: vscale x 16
2733

28-
; NEOVERSE-N2: LV: Vector loop of width vscale x 2 costs: 6 (assuming a minimum vscale of 1).
29-
; NEOVERSE-N2: LV: Vector loop of width vscale x 4 costs: 3 (assuming a minimum vscale of 1).
34+
; NEOVERSE-V2: Cost for VF vscale x 2: 11
35+
; NEOVERSE-V2: Cost for VF vscale x 4: 11
36+
; NEOVERSE-V2: LV: Selecting VF: 16
3037

31-
; VF-4: <4 x i32>
32-
; VF-VSCALE4: <16 x i32>
38+
; VF-16: <16 x i8>
39+
; VF-VSCALE16: <vscale x 16 x i8>
3340
define void @test0(ptr %a, ptr %b, ptr %c) #0 {
3441
entry:
3542
br label %loop
3643

3744
loop:
3845
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
39-
%arrayidx = getelementptr inbounds i32, ptr %c, i64 %iv
40-
%0 = load i32, ptr %arrayidx, align 4
46+
%arrayidx = getelementptr inbounds i8, ptr %c, i64 %iv
47+
%0 = load i8, ptr %arrayidx, align 4
4148
%arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %iv
4249
%1 = load i8, ptr %arrayidx2, align 4
43-
%zext = zext i8 %1 to i32
44-
%add = add nsw i32 %zext, %0
45-
%arrayidx5 = getelementptr inbounds i32, ptr %a, i64 %iv
46-
store i32 %add, ptr %arrayidx5, align 4
50+
%add = add nsw i8 %0, %1
51+
%arrayidx5 = getelementptr inbounds i8, ptr %a, i64 %iv
52+
store i8 %add, ptr %arrayidx5, align 4
4753
%iv.next = add nuw nsw i64 %iv, 1
4854
%exitcond.not = icmp eq i64 %iv.next, 1024
4955
br i1 %exitcond.not, label %exit, label %loop
5056

5157
exit:
5258
ret void
5359
}
54-

llvm/test/Transforms/LoopVectorize/AArch64/select-costs.ll

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,15 @@ target triple = "arm64-apple-ios5.0.0"
66

77
define void @selects_1(ptr nocapture %dst, i32 %A, i32 %B, i32 %C, i32 %N) {
88
; CHECK: LV: Checking a loop in 'selects_1'
9-
; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %cond = select i1 %cmp1, i32 10, i32 %and
10-
; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %cond6 = select i1 %cmp2, i32 30, i32 %and
11-
; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %cond11 = select i1 %cmp7, i32 %cond, i32 %cond6
129

13-
; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction: %cond = select i1 %cmp1, i32 10, i32 %and
14-
; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction: %cond6 = select i1 %cmp2, i32 30, i32 %and
15-
; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction: %cond11 = select i1 %cmp7, i32 %cond, i32 %cond6
10+
; CHECK: Cost of 1 for VF 2: WIDEN-SELECT ir<%cond> = select ir<%cmp1>, ir<10>, ir<%and>
11+
; CHECK: Cost of 1 for VF 2: WIDEN-SELECT ir<%cond6> = select ir<%cmp2>, ir<30>, ir<%and>
12+
; CHECK: Cost of 1 for VF 2: WIDEN-SELECT ir<%cond11> = select ir<%cmp7>, ir<%cond>, ir<%cond6>
13+
14+
; CHECK: Cost of 1 for VF 4: WIDEN-SELECT ir<%cond> = select ir<%cmp1>, ir<10>, ir<%and>
15+
; CHECK: Cost of 1 for VF 4: WIDEN-SELECT ir<%cond6> = select ir<%cmp2>, ir<30>, ir<%and>
16+
; CHECK: Cost of 1 for VF 4: WIDEN-SELECT ir<%cond11> = select ir<%cmp7>, ir<%cond>, ir<%cond6>
17+
1618
; CHECK: LV: Selecting VF: 4
1719

1820
entry:
@@ -48,9 +50,11 @@ for.cond.cleanup: ; preds = %for.cond.cleanup.lo
4850

4951
define i32 @multi_user_cmp(ptr readonly %a, i64 noundef %n) {
5052
; CHECK: LV: Checking a loop in 'multi_user_cmp'
51-
; CHECK: LV: Found an estimated cost of 4 for VF 16 For instruction: %cmp1 = fcmp olt float %load1, 0.000000e+00
52-
; CHECK: LV: Found an estimated cost of 1 for VF 16 For instruction: %.any.0.off0 = select i1 %cmp1, i1 true, i1 %any.0.off09
53-
; CHECK: LV: Found an estimated cost of 1 for VF 16 For instruction: %all.off = select i1 %cmp1, i1 %all.off.next, i1 false
53+
; CHECK: Cost of 1 for VF 16:
54+
; CHECK: any-of reduction %all.off = select i1 %cmp1, i1 %all.off.next, i1 false
55+
; CHECK: Cost of 1 for VF 16:
56+
; CHECK: any-of reduction %.any.0.off0 = select i1 %cmp1, i1 true, i1 %any.0.off09
57+
; CHECK: Cost of 4 for VF 16: WIDEN ir<%cmp1> = fcmp olt ir<%load1>, ir<0.000000e+00>
5458
; CHECK: LV: Selecting VF: 16.
5559
entry:
5660
br label %for.body

llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,9 @@
44

55
target triple = "aarch64-unknown-linux-gnu"
66

7-
; DEBUG: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %indvars.iv1294 = phi i7 [ %indvars.iv.next1295, %for.body ], [ 0, %entry ]
8-
; DEBUG: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %addi7 = add i7 %indvars.iv1294, 0
9-
; DEBUG: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %indvars.iv.next1295 = add i7 %indvars.iv1294, 1
7+
; DEBUG: Cost of Invalid for VF vscale x 1: induction instruction %indvars.iv.next1295 = add i7 %indvars.iv1294, 1
8+
; DEBUG: Cost of Invalid for VF vscale x 1: induction instruction %indvars.iv1294 = phi i7 [ %indvars.iv.next1295, %for.body ], [ 0, %entry ]
9+
; DEBUG: Cost of Invalid for VF vscale x 1: WIDEN ir<%addi7> = add ir<%indvars.iv1294>, ir<0>
1010

1111
define void @induction_i7(ptr %dst) #0 {
1212
; CHECK-LABEL: define void @induction_i7(
@@ -71,9 +71,9 @@ for.end: ; preds = %for.body
7171
}
7272

7373

74-
; DEBUG: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %indvars.iv1294 = phi i3 [ %indvars.iv.next1295, %for.body ], [ 0, %entry ]
75-
; DEBUG: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %zexti3 = zext i3 %indvars.iv1294 to i64
76-
; DEBUG: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %indvars.iv.next1295 = add i3 %indvars.iv1294, 1
74+
; DEBUG: Cost of Invalid for VF vscale x 1: induction instruction %indvars.iv.next1295 = add i3 %indvars.iv1294, 1
75+
; DEBUG: Cost of Invalid for VF vscale x 1: induction instruction %indvars.iv1294 = phi i3 [ %indvars.iv.next1295, %for.body ], [ 0, %entry ]
76+
; DEBUG: Cost of Invalid for VF vscale x 1: WIDEN-CAST ir<%zexti3> = zext ir<%indvars.iv1294> to i64
7777

7878
define void @induction_i3_zext(ptr %dst) #0 {
7979
; CHECK-LABEL: define void @induction_i3_zext(

0 commit comments

Comments
 (0)