Skip to content

Commit bb9449d

Browse files
authored
[InstCombine] Fold @llvm.experimental.get.vector.length when cnt <= max_lanes (#169293)
On RISC-V, some loops that the loop vectorizer vectorizes pre-LTO may turn out to have their exact trip count exposed after LTO; see #164762. If the trip count is small enough, we can fold away the @llvm.experimental.get.vector.length intrinsic based on this corollary from the LangRef: "If %cnt is less than or equal to %max_lanes, the return value is equal to %cnt." This on its own doesn't remove the @llvm.experimental.get.vector.length call in #164762, since we also need to teach computeKnownBits about @llvm.experimental.get.vector.length and the sub recurrence, but this PR is a starting point. The fold is added in InstCombine rather than InstSimplify because it may need to insert a truncation: @llvm.experimental.get.vector.length can take an i64 %cnt argument, while the result is always i32. Note that something similar was done in VPlan in #167647 for the case where the loop vectorizer itself knows the trip count.
1 parent f1ddb2f commit bb9449d

File tree

2 files changed

+110
-0
lines changed

2 files changed

+110
-0
lines changed

llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4016,6 +4016,27 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
40164016
}
40174017
break;
40184018
}
4019+
case Intrinsic::experimental_get_vector_length: {
  // Per the LangRef, get.vector.length(%cnt, %max_lanes) returns %cnt
  // whenever %cnt <= %max_lanes, so in that case the call folds to a
  // zext/trunc of its first operand.
  Value *Cnt = II->getArgOperand(0);
  // Compare in the wider of the %cnt type and the (always i32) result type
  // so that neither side is truncated before the comparison.
  unsigned BW = std::max(Cnt->getType()->getScalarSizeInBits(),
                         II->getType()->getScalarSizeInBits());
  ConstantRange CntRange =
      computeConstantRangeIncludingKnownBits(Cnt, /*ForSigned=*/false,
                                             SQ.getWithInstruction(II))
          .zextOrTrunc(BW);
  ConstantRange MaxLanes(
      cast<ConstantInt>(II->getArgOperand(1))->getValue().zextOrTrunc(BW));
  // If the scalable flag is set, the true lane bound is %max_lanes * vscale.
  if (cast<ConstantInt>(II->getArgOperand(2))->isOne())
    MaxLanes = MaxLanes.multiply(getVScaleRange(II->getFunction(), BW));

  if (CntRange.icmp(CmpInst::ICMP_ULE, MaxLanes))
    return replaceInstUsesWith(*II,
                               Builder.CreateZExtOrTrunc(Cnt, II->getType()));
  return nullptr;
}
40194040
default: {
40204041
// Handle target specific intrinsics
40214042
std::optional<Instruction *> V = targetInstCombineIntrinsic(*II);
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
2+
; RUN: opt < %s -passes=instcombine,verify -S | FileCheck %s
3+
4+
; cnt (1) is a compile-time constant <= the fixed max_lanes (2), so the
; intrinsic folds to its first operand.
define i32 @cnt_known_lt() {
; CHECK-LABEL: define i32 @cnt_known_lt() {
; CHECK-NEXT: ret i32 1
;
  %x = call i32 @llvm.experimental.get.vector.length(i32 1, i32 2, i1 false)
  ret i32 %x
}
11+
12+
; cnt (2) exceeds max_lanes (1), so the fold must not fire and the call
; remains.
define i32 @cnt_not_known_lt() {
; CHECK-LABEL: define i32 @cnt_not_known_lt() {
; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 2, i32 1, i1 false)
; CHECK-NEXT: ret i32 [[X]]
;
  %x = call i32 @llvm.experimental.get.vector.length(i32 2, i32 1, i1 false)
  ret i32 %x
}
20+
21+
; Scalable case: with vscale_range(2, 4) the effective bound is
; max_lanes * vscale >= 1 * 2 = 2, so cnt (2) is known <= the bound and the
; call folds to cnt.
define i32 @cnt_known_lt_scalable() vscale_range(2, 4) {
; CHECK-LABEL: define i32 @cnt_known_lt_scalable(
; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: ret i32 2
;
  %x = call i32 @llvm.experimental.get.vector.length(i32 2, i32 1, i1 true)
  ret i32 %x
}
29+
30+
; Same as above but with no vscale_range attribute: vscale could be 1, so
; cnt (2) is not known <= max_lanes * vscale and the call must remain.
define i32 @cnt_not_known_lt_scalable() {
; CHECK-LABEL: define i32 @cnt_not_known_lt_scalable() {
; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 2, i32 1, i1 true)
; CHECK-NEXT: ret i32 [[X]]
;
  %x = call i32 @llvm.experimental.get.vector.length(i32 2, i32 1, i1 true)
  ret i32 %x
}
38+
39+
; Runtime cnt bounded via an assume: KnownBits/constant-range analysis sees
; x <= 3 == max_lanes, so the call folds to x. The ule-3 compare is
; canonicalized to ult 4 by InstCombine.
define i32 @cnt_known_lt_runtime(i32 %x) {
; CHECK-LABEL: define i32 @cnt_known_lt_runtime(
; CHECK-SAME: i32 [[X:%.*]]) {
; CHECK-NEXT: [[ICMP:%.*]] = icmp ult i32 [[X]], 4
; CHECK-NEXT: call void @llvm.assume(i1 [[ICMP]])
; CHECK-NEXT: ret i32 [[X]]
;
  %icmp = icmp ule i32 %x, 3
  call void @llvm.assume(i1 %icmp)
  %y = call i32 @llvm.experimental.get.vector.length(i32 %x, i32 3, i1 false)
  ret i32 %y
}
51+
52+
; Same as the runtime case but with an i64 cnt: the fold must insert a
; truncation to the i32 result type (this is why the fold lives in
; InstCombine rather than InstSimplify).
define i32 @cnt_known_lt_runtime_trunc(i64 %x) {
; CHECK-LABEL: define i32 @cnt_known_lt_runtime_trunc(
; CHECK-SAME: i64 [[X:%.*]]) {
; CHECK-NEXT: [[ICMP:%.*]] = icmp ult i64 [[X]], 4
; CHECK-NEXT: call void @llvm.assume(i1 [[ICMP]])
; CHECK-NEXT: [[Y:%.*]] = trunc nuw nsw i64 [[X]] to i32
; CHECK-NEXT: ret i32 [[Y]]
;
  %icmp = icmp ule i64 %x, 3
  call void @llvm.assume(i1 %icmp)
  %y = call i32 @llvm.experimental.get.vector.length(i64 %x, i32 3, i1 false)
  ret i32 %y
}
65+
66+
; FIXME: We should be able to deduce the constant range from AssumptionCache
; rather than relying on KnownBits, which in this case only knows x <= 3.
; (x <= 2 would satisfy the fold, but KnownBits can only express power-of-two
; style bounds, so the call currently remains.)
define i32 @cnt_known_lt_runtime_assumption(i32 %x) {
; CHECK-LABEL: define i32 @cnt_known_lt_runtime_assumption(
; CHECK-SAME: i32 [[X:%.*]]) {
; CHECK-NEXT: [[ICMP:%.*]] = icmp ult i32 [[X]], 3
; CHECK-NEXT: call void @llvm.assume(i1 [[ICMP]])
; CHECK-NEXT: [[Y:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[X]], i32 2, i1 false)
; CHECK-NEXT: ret i32 [[Y]]
;
  %icmp = icmp ule i32 %x, 2
  call void @llvm.assume(i1 %icmp)
  %y = call i32 @llvm.experimental.get.vector.length(i32 %x, i32 2, i1 false)
  ret i32 %y
}
81+
82+
83+
; cnt narrower (i16) than the i32 result: the comparison is done in the wider
; bit width and the fold zero-extends cnt to i32.
define i32 @cnt_known_lt_i16() {
; CHECK-LABEL: define i32 @cnt_known_lt_i16() {
; CHECK-NEXT: ret i32 1
;
  %x = call i32 @llvm.experimental.get.vector.length(i16 1, i32 2, i1 false)
  ret i32 %x
}

0 commit comments

Comments
 (0)