Skip to content

Commit 2c9e9ff

Browse files
authored
[SCCP] Handle llvm.experimental.get.vector.length calls (llvm#169527)
As noted in the reproducer provided in llvm#164762 (comment), on RISC-V after LTO we sometimes have trip counts exposed to vectorized loops. The loop vectorizer will have generated calls to @llvm.experimental.get.vector.length, but there are [some properties](https://llvm.org/docs/LangRef.html#id2399) of the intrinsic that we can use to simplify it:

- The result is always less than or equal to both Count and MaxLanes.
- If Count <= MaxLanes, then the result is Count.

This teaches SCCP to handle these cases with the intrinsic, which allows some single-iteration-after-LTO loops to be unfolded.

llvm#169293 is related and also simplifies the intrinsic in InstCombine via computeKnownBits, but it can't fully remove the loop since computeKnownBits only does limited reasoning on recurrences.
1 parent 8ec2112 commit 2c9e9ff

File tree

2 files changed

+179
-0
lines changed

2 files changed

+179
-0
lines changed

llvm/lib/Transforms/Utils/SCCPSolver.cpp

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2098,6 +2098,38 @@ void SCCPInstVisitor::handleCallResult(CallBase &CB) {
20982098
return (void)mergeInValue(ValueState[II], II,
20992099
ValueLatticeElement::getRange(Result));
21002100
}
2101+
if (II->getIntrinsicID() == Intrinsic::experimental_get_vector_length) {
2102+
Value *CountArg = II->getArgOperand(0);
2103+
Value *VF = II->getArgOperand(1);
2104+
bool Scalable = cast<ConstantInt>(II->getArgOperand(2))->isOne();
2105+
2106+
// Computation happens in the larger type.
2107+
unsigned BitWidth = std::max(CountArg->getType()->getScalarSizeInBits(),
2108+
VF->getType()->getScalarSizeInBits());
2109+
2110+
ConstantRange Count = getValueState(CountArg)
2111+
.asConstantRange(CountArg->getType(), false)
2112+
.zextOrTrunc(BitWidth);
2113+
ConstantRange MaxLanes = getValueState(VF)
2114+
.asConstantRange(VF->getType(), false)
2115+
.zextOrTrunc(BitWidth);
2116+
if (Scalable)
2117+
MaxLanes =
2118+
MaxLanes.multiply(getVScaleRange(II->getFunction(), BitWidth));
2119+
2120+
// The result is always less than or equal to both Count and MaxLanes.
2121+
ConstantRange Result(
2122+
APInt::getZero(BitWidth),
2123+
APIntOps::umin(Count.getUpper(), MaxLanes.getUpper()));
2124+
2125+
// If Count <= MaxLanes, getvectorlength(Count, MaxLanes) = Count
2126+
if (Count.icmp(CmpInst::ICMP_ULE, MaxLanes))
2127+
Result = Count;
2128+
2129+
Result = Result.zextOrTrunc(II->getType()->getScalarSizeInBits());
2130+
return (void)mergeInValue(ValueState[II], II,
2131+
ValueLatticeElement::getRange(Result));
2132+
}
21012133

21022134
if (ConstantRange::isIntrinsicSupported(II->getIntrinsicID())) {
21032135
// Compute result range for intrinsics supported by ConstantRange.
Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
2+
; RUN: opt < %s -p sccp -S | FileCheck %s
3+
4+
define i1 @result_le_count() {
5+
; CHECK-LABEL: define i1 @result_le_count() {
6+
; CHECK-NEXT: ret i1 true
7+
;
8+
%x = call i32 @llvm.experimental.get.vector.length(i32 3, i32 4, i1 false)
9+
%res = icmp ule i32 %x, 3
10+
ret i1 %res
11+
}
12+
13+
define i1 @result_le_max_lanes(i32 %count) {
14+
; CHECK-LABEL: define i1 @result_le_max_lanes(
15+
; CHECK-SAME: i32 [[COUNT:%.*]]) {
16+
; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[COUNT]], i32 3, i1 false)
17+
; CHECK-NEXT: ret i1 true
18+
;
19+
%x = call i32 @llvm.experimental.get.vector.length(i32 %count, i32 3, i1 false)
20+
%res = icmp ule i32 %x, 3
21+
ret i1 %res
22+
}
23+
24+
define i1 @result_le_max_lanes_scalable(i32 %count) vscale_range(2, 4) {
25+
; CHECK-LABEL: define i1 @result_le_max_lanes_scalable(
26+
; CHECK-SAME: i32 [[COUNT:%.*]]) #[[ATTR0:[0-9]+]] {
27+
; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[COUNT]], i32 4, i1 true)
28+
; CHECK-NEXT: ret i1 true
29+
;
30+
%x = call i32 @llvm.experimental.get.vector.length(i32 %count, i32 4, i1 true)
31+
%res = icmp ule i32 %x, 16
32+
ret i1 %res
33+
}
34+
35+
define i32 @count_le_max_lanes() {
36+
; CHECK-LABEL: define i32 @count_le_max_lanes() {
37+
; CHECK-NEXT: [[ENTRY:.*:]]
38+
; CHECK-NEXT: br label %[[LOOP:.*]]
39+
; CHECK: [[LOOP]]:
40+
; CHECK-NEXT: br label %[[EXIT:.*]]
41+
; CHECK: [[EXIT]]:
42+
; CHECK-NEXT: ret i32 4
43+
;
44+
entry:
45+
br label %loop
46+
47+
loop:
48+
%iv = phi i32 [4, %entry], [%iv.next, %loop]
49+
%x = call i32 @llvm.experimental.get.vector.length(i32 %iv, i32 4, i1 false)
50+
%iv.next = sub i32 %iv, %x
51+
%ec = icmp eq i32 %iv.next, 0
52+
br i1 %ec, label %exit, label %loop
53+
54+
exit:
55+
ret i32 %x
56+
}
57+
58+
; Can't simplify because %iv isn't <= max lanes.
59+
define i32 @count_not_le_max_lanes() {
60+
; CHECK-LABEL: define range(i32 0, 5) i32 @count_not_le_max_lanes() {
61+
; CHECK-NEXT: [[ENTRY:.*]]:
62+
; CHECK-NEXT: br label %[[LOOP:.*]]
63+
; CHECK: [[LOOP]]:
64+
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 6, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
65+
; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[IV]], i32 4, i1 false)
66+
; CHECK-NEXT: [[IV_NEXT]] = sub i32 [[IV]], [[X]]
67+
; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 0
68+
; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
69+
; CHECK: [[EXIT]]:
70+
; CHECK-NEXT: ret i32 [[X]]
71+
;
72+
entry:
73+
br label %loop
74+
75+
loop:
76+
%iv = phi i32 [6, %entry], [%iv.next, %loop]
77+
%x = call i32 @llvm.experimental.get.vector.length(i32 %iv, i32 4, i1 false)
78+
%iv.next = sub i32 %iv, %x
79+
%ec = icmp eq i32 %iv.next, 0
80+
br i1 %ec, label %exit, label %loop
81+
82+
exit:
83+
ret i32 %x
84+
}
85+
86+
define i32 @count_le_max_lanes_scalable_known() vscale_range(4, 8) {
87+
; CHECK-LABEL: define i32 @count_le_max_lanes_scalable_known(
88+
; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
89+
; CHECK-NEXT: [[ENTRY:.*:]]
90+
; CHECK-NEXT: br label %[[LOOP:.*]]
91+
; CHECK: [[LOOP]]:
92+
; CHECK-NEXT: br label %[[EXIT:.*]]
93+
; CHECK: [[EXIT]]:
94+
; CHECK-NEXT: ret i32 16
95+
;
96+
entry:
97+
br label %loop
98+
99+
loop:
100+
%iv = phi i32 [16, %entry], [%iv.next, %loop]
101+
%x = call i32 @llvm.experimental.get.vector.length(i32 %iv, i32 4, i1 true)
102+
%iv.next = sub i32 %iv, %x
103+
%ec = icmp eq i32 %iv.next, 0
104+
br i1 %ec, label %exit, label %loop
105+
106+
exit:
107+
ret i32 %x
108+
}
109+
110+
; Can't simplify because %iv isn't guaranteed <= max lanes.
111+
define i32 @count_le_max_lanes_scalable_unknown() {
112+
; CHECK-LABEL: define range(i32 0, -1) i32 @count_le_max_lanes_scalable_unknown() {
113+
; CHECK-NEXT: [[ENTRY:.*]]:
114+
; CHECK-NEXT: br label %[[LOOP:.*]]
115+
; CHECK: [[LOOP]]:
116+
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 16, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
117+
; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[IV]], i32 4, i1 true)
118+
; CHECK-NEXT: [[IV_NEXT]] = sub i32 [[IV]], [[X]]
119+
; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 0
120+
; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
121+
; CHECK: [[EXIT]]:
122+
; CHECK-NEXT: ret i32 [[X]]
123+
;
124+
entry:
125+
br label %loop
126+
127+
loop:
128+
%iv = phi i32 [16, %entry], [%iv.next, %loop]
129+
%x = call i32 @llvm.experimental.get.vector.length(i32 %iv, i32 4, i1 true)
130+
%iv.next = sub i32 %iv, %x
131+
%ec = icmp eq i32 %iv.next, 0
132+
br i1 %ec, label %exit, label %loop
133+
134+
exit:
135+
ret i32 %x
136+
}
137+
138+
define i1 @result_le_overflow() {
139+
; CHECK-LABEL: define i1 @result_le_overflow() {
140+
; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 4294967296, i32 4, i1 false)
141+
; CHECK-NEXT: [[RES:%.*]] = icmp ule i32 [[X]], 3
142+
; CHECK-NEXT: ret i1 [[RES]]
143+
;
144+
%x = call i32 @llvm.experimental.get.vector.length(i64 u0x100000000, i32 4, i1 false)
145+
%res = icmp ule i32 %x, 3
146+
ret i1 %res
147+
}

0 commit comments

Comments
 (0)