
Commit 275076d

[RISCV] Don't use EVL/Mask for vid when lowering vp.reverse
vp.reverse intrinsics are emitted by the loop vectorizer when EVL tail folding is enabled, and currently end up generating code like this:

.LBB0_1:                                # %loop
                                        # =>This Inner Loop Header: Depth=1
        sub a3, a2, a1
        slli a4, a1, 3
        vsetvli a3, a3, e64, m1, ta, ma
        add a4, a0, a4
        vle64.v v8, (a4)
        addi a5, a3, -1
        vid.v v9
        vrsub.vx v9, v9, a5
        vrgather.vv v10, v8, v9
        add a1, a1, a3
        vse64.v v10, (a4)
        bltu a1, a2, .LBB0_1

The vid.v needed for the indices is recalculated on every loop iteration, but because its AVL is the EVL computed by get.vector.length inside the loop, it isn't hoisted out. This changes the AVL used to VLMAX so the vid.v can be made loop invariant:

        vsetvli a3, zero, e64, m1, ta, ma
        vid.v v8
.LBB0_1:                                # %loop
                                        # =>This Inner Loop Header: Depth=1
        sub a3, a2, a1
        slli a4, a1, 3
        vsetvli a3, a3, e64, m1, ta, ma
        add a4, a0, a4
        vle64.v v9, (a4)
        addi a5, a3, -1
        vrsub.vx v10, v8, a5
        vrgather.vv v11, v9, v10
        add a1, a1, a3
        vse64.v v11, (a4)
        bltu a1, a2, .LBB0_1

Now that we have RISCVVLOptimizer, it shouldn't increase the number of vsetvlis for straight-line code. This also removes the mask, which isn't needed, in case it too prevents hoisting.
1 parent f9d4785 commit 275076d
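For context, the first assembly listing above corresponds roughly to an EVL tail-folded loop like the following hand-written LLVM IR sketch. The function name, element type, and loop structure are illustrative assumptions rather than anything taken from the commit; the sketch only shows where the get.vector.length and vp.reverse calls that drive this codegen come from:

declare i32 @llvm.experimental.get.vector.length.i64(i64, i32 immarg, i1 immarg)
declare <vscale x 1 x i64> @llvm.vp.load.nxv1i64.p0(ptr, <vscale x 1 x i1>, i32)
declare <vscale x 1 x i64> @llvm.experimental.vp.reverse.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i1>, i32)
declare void @llvm.vp.store.nxv1i64.p0(<vscale x 1 x i64>, ptr, <vscale x 1 x i1>, i32)

; Hypothetical example: reverse each chunk of an i64 buffer in place, evl elements at a time.
define void @reverse_chunks(ptr %p, i64 %n) {
entry:
  br label %loop

loop:
  %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
  %remaining = sub i64 %n, %i
  ; EVL for this iteration
  %evl = call i32 @llvm.experimental.get.vector.length.i64(i64 %remaining, i32 1, i1 true)
  %addr = getelementptr inbounds i64, ptr %p, i64 %i
  %v = call <vscale x 1 x i64> @llvm.vp.load.nxv1i64.p0(ptr %addr, <vscale x 1 x i1> splat (i1 true), i32 %evl)
  ; the intrinsic whose lowering this commit changes
  %rev = call <vscale x 1 x i64> @llvm.experimental.vp.reverse.nxv1i64(<vscale x 1 x i64> %v, <vscale x 1 x i1> splat (i1 true), i32 %evl)
  call void @llvm.vp.store.nxv1i64.p0(<vscale x 1 x i64> %rev, ptr %addr, <vscale x 1 x i1> splat (i1 true), i32 %evl)
  %evl.ext = zext i32 %evl to i64
  %i.next = add i64 %i, %evl.ext
  %done = icmp uge i64 %i.next, %n
  br i1 %done, label %exit, label %loop

exit:
  ret void
}

With the old lowering, the vid.v produced inside the lowered vp.reverse used %evl as its AVL, so it could not be treated as loop invariant; with a VLMAX AVL it can be hoisted into the preheader, as the second listing shows.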

File tree: 8 files changed (+180, -153 lines)


llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 5 additions & 1 deletion
@@ -12430,7 +12430,11 @@ RISCVTargetLowering::lowerVPReverseExperimental(SDValue Op,
     GatherOpc = RISCVISD::VRGATHEREI16_VV_VL;
   }
 
-  SDValue VID = DAG.getNode(RISCVISD::VID_VL, DL, IndicesVT, Mask, EVL);
+  // Don't use EVL or Mask for vid so it can be hoisted out of loops.
+  auto [TrueMask, VLMAX] =
+      getDefaultScalableVLOps(IndicesVT, DL, DAG, Subtarget);
+  SDValue VID = DAG.getNode(RISCVISD::VID_VL, DL, IndicesVT, TrueMask, VLMAX);
+
   SDValue VecLen =
       DAG.getNode(ISD::SUB, DL, XLenVT, EVL, DAG.getConstant(1, DL, XLenVT));
   SDValue VecLenSplat = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, IndicesVT,

llvm/test/CodeGen/RISCV/rvv/vp-reverse-float-fixed-vectors.ll

Lines changed: 6 additions & 6 deletions
@@ -5,10 +5,10 @@
 define <2 x double> @test_vp_reverse_v2f64_masked(<2 x double> %src, <2 x i1> %mask, i32 zeroext %evl) {
 ; CHECK-LABEL: test_vp_reverse_v2f64_masked:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a1, a0, -1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT:    vid.v v9, v0.t
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    vrsub.vx v10, v9, a0, v0.t
+; CHECK-NEXT:    vid.v v9
+; CHECK-NEXT:    vrsub.vx v10, v9, a1, v0.t
 ; CHECK-NEXT:    vrgather.vv v9, v8, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v8, v9
 ; CHECK-NEXT:    ret
@@ -34,10 +34,10 @@ define <2 x double> @test_vp_reverse_v2f64(<2 x double> %src, i32 zeroext %evl)
 define <4 x float> @test_vp_reverse_v4f32_masked(<4 x float> %src, <4 x i1> %mask, i32 zeroext %evl) {
 ; CHECK-LABEL: test_vp_reverse_v4f32_masked:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a1, a0, -1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vid.v v9, v0.t
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    vrsub.vx v10, v9, a0, v0.t
+; CHECK-NEXT:    vid.v v9
+; CHECK-NEXT:    vrsub.vx v10, v9, a1, v0.t
 ; CHECK-NEXT:    vrgather.vv v9, v8, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v8, v9
 ; CHECK-NEXT:    ret
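The function bodies are outside these hunks, but, assuming the files follow the usual pattern for vp intrinsic tests, each masked test is just a direct call to the intrinsic with the mask and EVL passed straight through, along the lines of this sketch:

declare <2 x double> @llvm.experimental.vp.reverse.v2f64(<2 x double>, <2 x i1>, i32)

define <2 x double> @test_vp_reverse_v2f64_masked(<2 x double> %src, <2 x i1> %mask, i32 zeroext %evl) {
  ; %evl becomes the AVL of the vsetvli; %mask stays on vrsub/vrgather but is no longer applied to vid.v
  %dst = call <2 x double> @llvm.experimental.vp.reverse.v2f64(<2 x double> %src, <2 x i1> %mask, i32 %evl)
  ret <2 x double> %dst
}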

llvm/test/CodeGen/RISCV/rvv/vp-reverse-float.ll

Lines changed: 24 additions & 24 deletions
@@ -4,10 +4,10 @@
 define <vscale x 1 x double> @test_vp_reverse_nxv1f64_masked(<vscale x 1 x double> %src, <vscale x 1 x i1> %mask, i32 zeroext %evl) {
 ; CHECK-LABEL: test_vp_reverse_nxv1f64_masked:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a1, a0, -1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT:    vid.v v9, v0.t
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    vrsub.vx v10, v9, a0, v0.t
+; CHECK-NEXT:    vid.v v9
+; CHECK-NEXT:    vrsub.vx v10, v9, a1, v0.t
 ; CHECK-NEXT:    vrgather.vv v9, v8, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v8, v9
 ; CHECK-NEXT:    ret
@@ -33,10 +33,10 @@ define <vscale x 1 x double> @test_vp_reverse_nxv1f64(<vscale x 1 x double> %src
 define <vscale x 2 x float> @test_vp_reverse_nxv2f32_masked(<vscale x 2 x float> %src, <vscale x 2 x i1> %mask, i32 zeroext %evl) {
 ; CHECK-LABEL: test_vp_reverse_nxv2f32_masked:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a1, a0, -1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vid.v v9, v0.t
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    vrsub.vx v10, v9, a0, v0.t
+; CHECK-NEXT:    vid.v v9
+; CHECK-NEXT:    vrsub.vx v10, v9, a1, v0.t
 ; CHECK-NEXT:    vrgather.vv v9, v8, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v8, v9
 ; CHECK-NEXT:    ret
@@ -62,10 +62,10 @@ define <vscale x 2 x float> @test_vp_reverse_nxv2f32(<vscale x 2 x float> %src,
 define <vscale x 2 x double> @test_vp_reverse_nxv2f64_masked(<vscale x 2 x double> %src, <vscale x 2 x i1> %mask, i32 zeroext %evl) {
 ; CHECK-LABEL: test_vp_reverse_nxv2f64_masked:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a1, a0, -1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vid.v v10, v0.t
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    vrsub.vx v12, v10, a0, v0.t
+; CHECK-NEXT:    vid.v v10
+; CHECK-NEXT:    vrsub.vx v12, v10, a1, v0.t
 ; CHECK-NEXT:    vrgather.vv v10, v8, v12, v0.t
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
@@ -91,10 +91,10 @@ define <vscale x 2 x double> @test_vp_reverse_nxv2f64(<vscale x 2 x double> %src
 define <vscale x 4 x float> @test_vp_reverse_nxv4f32_masked(<vscale x 4 x float> %src, <vscale x 4 x i1> %mask, i32 zeroext %evl) {
 ; CHECK-LABEL: test_vp_reverse_nxv4f32_masked:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a1, a0, -1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vid.v v10, v0.t
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    vrsub.vx v12, v10, a0, v0.t
+; CHECK-NEXT:    vid.v v10
+; CHECK-NEXT:    vrsub.vx v12, v10, a1, v0.t
 ; CHECK-NEXT:    vrgather.vv v10, v8, v12, v0.t
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
@@ -120,10 +120,10 @@ define <vscale x 4 x float> @test_vp_reverse_nxv4f32(<vscale x 4 x float> %src,
 define <vscale x 4 x double> @test_vp_reverse_nxv4f64_masked(<vscale x 4 x double> %src, <vscale x 4 x i1> %mask, i32 zeroext %evl) {
 ; CHECK-LABEL: test_vp_reverse_nxv4f64_masked:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a1, a0, -1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    vid.v v12, v0.t
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    vrsub.vx v16, v12, a0, v0.t
+; CHECK-NEXT:    vid.v v12
+; CHECK-NEXT:    vrsub.vx v16, v12, a1, v0.t
 ; CHECK-NEXT:    vrgather.vv v12, v8, v16, v0.t
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
@@ -149,10 +149,10 @@ define <vscale x 4 x double> @test_vp_reverse_nxv4f64(<vscale x 4 x double> %src
 define <vscale x 8 x float> @test_vp_reverse_nxv8f32_masked(<vscale x 8 x float> %src, <vscale x 8 x i1> %mask, i32 zeroext %evl) {
 ; CHECK-LABEL: test_vp_reverse_nxv8f32_masked:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a1, a0, -1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vid.v v12, v0.t
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    vrsub.vx v16, v12, a0, v0.t
+; CHECK-NEXT:    vid.v v12
+; CHECK-NEXT:    vrsub.vx v16, v12, a1, v0.t
 ; CHECK-NEXT:    vrgather.vv v12, v8, v16, v0.t
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
@@ -178,10 +178,10 @@ define <vscale x 8 x float> @test_vp_reverse_nxv8f32(<vscale x 8 x float> %src,
 define <vscale x 8 x double> @test_vp_reverse_nxv8f64_masked(<vscale x 8 x double> %src, <vscale x 8 x i1> %mask, i32 zeroext %evl) {
 ; CHECK-LABEL: test_vp_reverse_nxv8f64_masked:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a1, a0, -1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vid.v v16, v0.t
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    vrsub.vx v24, v16, a0, v0.t
+; CHECK-NEXT:    vid.v v16
+; CHECK-NEXT:    vrsub.vx v24, v16, a1, v0.t
 ; CHECK-NEXT:    vrgather.vv v16, v8, v24, v0.t
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
@@ -207,10 +207,10 @@ define <vscale x 8 x double> @test_vp_reverse_nxv8f64(<vscale x 8 x double> %src
 define <vscale x 16 x float> @test_vp_reverse_nxv16f32_masked(<vscale x 16 x float> %src, <vscale x 16 x i1> %mask, i32 zeroext %evl) {
 ; CHECK-LABEL: test_vp_reverse_nxv16f32_masked:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a1, a0, -1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vid.v v16, v0.t
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    vrsub.vx v24, v16, a0, v0.t
+; CHECK-NEXT:    vid.v v16
+; CHECK-NEXT:    vrsub.vx v24, v16, a1, v0.t
 ; CHECK-NEXT:    vrgather.vv v16, v8, v24, v0.t
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret

llvm/test/CodeGen/RISCV/rvv/vp-reverse-int-fixed-vectors.ll

Lines changed: 12 additions & 12 deletions
@@ -5,10 +5,10 @@
 define <2 x i64> @test_vp_reverse_v2i64_masked(<2 x i64> %src, <2 x i1> %mask, i32 zeroext %evl) {
 ; CHECK-LABEL: test_vp_reverse_v2i64_masked:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a1, a0, -1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT:    vid.v v9, v0.t
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    vrsub.vx v10, v9, a0, v0.t
+; CHECK-NEXT:    vid.v v9
+; CHECK-NEXT:    vrsub.vx v10, v9, a1, v0.t
 ; CHECK-NEXT:    vrgather.vv v9, v8, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v8, v9
 ; CHECK-NEXT:    ret
@@ -34,10 +34,10 @@ define <2 x i64> @test_vp_reverse_v2i64(<2 x i64> %src, i32 zeroext %evl) {
 define <4 x i32> @test_vp_reverse_v4i32_masked(<4 x i32> %src, <4 x i1> %mask, i32 zeroext %evl) {
 ; CHECK-LABEL: test_vp_reverse_v4i32_masked:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a1, a0, -1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vid.v v9, v0.t
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    vrsub.vx v10, v9, a0, v0.t
+; CHECK-NEXT:    vid.v v9
+; CHECK-NEXT:    vrsub.vx v10, v9, a1, v0.t
 ; CHECK-NEXT:    vrgather.vv v9, v8, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v8, v9
 ; CHECK-NEXT:    ret
@@ -63,10 +63,10 @@ define <4 x i32> @test_vp_reverse_v4i32(<4 x i32> %src, i32 zeroext %evl) {
 define <8 x i16> @test_vp_reverse_v8i16_masked(<8 x i16> %src, <8 x i1> %mask, i32 zeroext %evl) {
 ; CHECK-LABEL: test_vp_reverse_v8i16_masked:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a1, a0, -1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vid.v v9, v0.t
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    vrsub.vx v10, v9, a0, v0.t
+; CHECK-NEXT:    vid.v v9
+; CHECK-NEXT:    vrsub.vx v10, v9, a1, v0.t
 ; CHECK-NEXT:    vrgather.vv v9, v8, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v8, v9
 ; CHECK-NEXT:    ret
@@ -92,10 +92,10 @@ define <8 x i16> @test_vp_reverse_v8i16(<8 x i16> %src, i32 zeroext %evl) {
 define <16 x i8> @test_vp_reverse_v16i8_masked(<16 x i8> %src, <16 x i1> %mask, i32 zeroext %evl) {
 ; CHECK-LABEL: test_vp_reverse_v16i8_masked:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a1, a0, -1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vid.v v10, v0.t
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    vrsub.vx v10, v10, a0, v0.t
+; CHECK-NEXT:    vid.v v10
+; CHECK-NEXT:    vrsub.vx v10, v10, a1, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vrgatherei16.vv v9, v8, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v8, v9
