
Commit 485b3af

[RISCV] Reduce minimum VL needed for vslidedown.vx in RISCVVLOptimizer (#168392)
Once #149042 is relanded, we will start EVL tail folding vectorized loops that have live-outs, e.g.:

```c
int f(int *x, int n) {
  int y = 0;
  for (int i = 0; i < n; i++) {
    y = x[i] + 1;
    x[y] = y;
  }
  return y;
}
```

These are vectorized by extracting the last "active lane" in the loop's exit:

```llvm
loop:
  %vl = call i32 @llvm.experimental.get.vector.length(i64 %avl, i32 4, i1 true)
  ...
exit:
  %lastidx = sub i64 %vl, 1
  %lastelt = extractelement <vscale x 4 x i32> %y, i64 %lastidx
```

On RISC-V this lowers to a vslidedown.vx with a VL of 1:

```llvm
bb.loop:
  %vl:gprnox0 = PseudoVSETVLI ...
  %y:vr = PseudoVADD_VI_M1 $noreg, %x, 1, AVL=-1
  ...
bb.exit:
  %lastidx:gprnox0 = ADDI %vl, -1
  %w:vr = PseudoVSLIDEDOWN_VX_M1 $noreg, %y, %lastidx, AVL=1
```

Today, however, we fail to reduce the VL of %y in the loop and end up with two extra VL toggles. The reason is that RISCVVLOptimizer is currently conservative with vslidedown.vx, since it can read lanes of %y past its own VL, so in `getMinimumVLForUser` we say that vslidedown.vx demands the entirety of %y.

The key observation is that the sequence above only needs to read the first %vl lanes of %y: with an AVL of 1, the last lane of vs2 that vslidedown.vx reads sits at the slide offset, so only the first offset + 1 lanes are demanded. In this case that's `%lastidx + 1 = %vl - 1 + 1 = %vl` lanes.

This PR teaches RISCVVLOptimizer about this case in `getMinimumVLForVSLIDEDOWN_VX`, and in doing so removes the VL toggles.

The one case I had to think about for a bit was when `ADDI %vl, -1` wraps, i.e. when %vl = 0 and the resulting offset is all ones. That offset is always larger than the largest VLMAX, so vs2 is completely slid down and absent from the output, and nothing needs to be read from it.

On its own, this patch has no observable effect on llvm-test-suite or SPEC CPU 2017 with rva23u64 today.
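To make the lane arithmetic concrete, here is a minimal scalar model of the slide (an illustrative sketch for this writeup, not code from the patch and a simplification of the RVV semantics; the `slidedown` name and zero-fill of out-of-range lanes are assumptions for illustration). Result lane i is vs2[i + offset], so with a result VL of 1 the only source lane read is the one at the slide offset, i.e. offset + 1 leading lanes of vs2 are demanded, which for offset = %vl - 1 is exactly %vl lanes:

```cpp
#include <cstdint>
#include <vector>

// Scalar model of vslidedown: result lane i is vs2[i + offset]. With vl = 1
// the only source lane read is `offset`, so offset + 1 leading lanes of vs2
// are demanded.
std::vector<uint32_t> slidedown(const std::vector<uint32_t> &vs2,
                                uint64_t offset, uint64_t vl) {
  std::vector<uint32_t> vd(vl, 0);
  for (uint64_t i = 0; i < vl; ++i) {
    uint64_t srcIdx = i + offset;
    // A wrapped srcIdx (srcIdx < i) is past any real source length and reads
    // as zero, matching the "offset is all ones" wrap case in the patch:
    // nothing is demanded from vs2 at all.
    if (srcIdx >= i && srcIdx < vs2.size())
      vd[i] = vs2[srcIdx];
  }
  return vd;
}
```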
1 parent ea26d92

3 files changed: +120 -1 lines changed

llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp

Lines changed: 41 additions & 1 deletion
```diff
@@ -62,7 +62,7 @@ struct DemandedVL {
 };
 
 class RISCVVLOptimizer : public MachineFunctionPass {
-  const MachineRegisterInfo *MRI;
+  MachineRegisterInfo *MRI;
   const MachineDominatorTree *MDT;
   const TargetInstrInfo *TII;
 
@@ -1392,6 +1392,42 @@ bool RISCVVLOptimizer::isCandidate(const MachineInstr &MI) const {
   return true;
 }
 
+/// Given a vslidedown.vx like:
+///
+///   %slideamt = ADDI %x, -1
+///   %v = PseudoVSLIDEDOWN_VX %passthru, %src, %slideamt, avl=1
+///
+/// %v will only read the first %slideamt + 1 lanes of %src, which = %x.
+/// This is a common case when lowering extractelement.
+///
+/// Note that if %x is 0, %slideamt will be all ones. In this case %src will be
+/// completely slid down and none of its lanes will be read (since %slideamt is
+/// greater than the largest VLMAX of 65536) so we can demand any minimum VL.
+static std::optional<DemandedVL>
+getMinimumVLForVSLIDEDOWN_VX(const MachineOperand &UserOp,
+                             const MachineRegisterInfo *MRI) {
+  const MachineInstr &MI = *UserOp.getParent();
+  if (RISCV::getRVVMCOpcode(MI.getOpcode()) != RISCV::VSLIDEDOWN_VX)
+    return std::nullopt;
+  // We're looking at what lanes are used from the src operand.
+  if (UserOp.getOperandNo() != 2)
+    return std::nullopt;
+  // For now, the AVL must be 1.
+  const MachineOperand &AVL = MI.getOperand(4);
+  if (!AVL.isImm() || AVL.getImm() != 1)
+    return std::nullopt;
+  // The slide amount must be %x - 1.
+  const MachineOperand &SlideAmt = MI.getOperand(3);
+  if (!SlideAmt.getReg().isVirtual())
+    return std::nullopt;
+  MachineInstr *SlideAmtDef = MRI->getUniqueVRegDef(SlideAmt.getReg());
+  if (SlideAmtDef->getOpcode() != RISCV::ADDI ||
+      SlideAmtDef->getOperand(2).getImm() != -AVL.getImm() ||
+      !SlideAmtDef->getOperand(1).getReg().isVirtual())
+    return std::nullopt;
+  return SlideAmtDef->getOperand(1);
+}
+
 DemandedVL
 RISCVVLOptimizer::getMinimumVLForUser(const MachineOperand &UserOp) const {
   const MachineInstr &UserMI = *UserOp.getParent();
@@ -1406,6 +1442,9 @@ RISCVVLOptimizer::getMinimumVLForUser(const MachineOperand &UserOp) const {
     return DemandedVL::vlmax();
   }
 
+  if (auto VL = getMinimumVLForVSLIDEDOWN_VX(UserOp, MRI))
+    return *VL;
+
   if (RISCVII::readsPastVL(
           TII->get(RISCV::getRVVMCOpcode(UserMI.getOpcode())).TSFlags)) {
     LLVM_DEBUG(dbgs() << "  Abort because used by unsafe instruction\n");
@@ -1624,6 +1663,7 @@ bool RISCVVLOptimizer::tryReduceVL(MachineInstr &MI) const {
 
   // All our checks passed. We can reduce VL.
   VLOp.ChangeToRegister(CommonVL->getReg(), false);
+  MRI->constrainRegClass(CommonVL->getReg(), &RISCV::GPRNoX0RegClass);
   return true;
 }
```
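A note on the two one-line hunks (my reading of the diff, not text from the commit message): the register substituted into the VL operand (%x in the MIR test below) may live in the plain GPR class, while VL operands require GPRNoX0. Hence the new `constrainRegClass` call after `ChangeToRegister`; and because `MachineRegisterInfo::constrainRegClass` is a mutating member function, the pass's `MRI` pointer loses its `const` in the first hunk. Annotated excerpt from `tryReduceVL` above:

```cpp
// All our checks passed. We can reduce VL.
VLOp.ChangeToRegister(CommonVL->getReg(), /*isDef=*/false);
// The substituted register may have been created as plain `gpr` (see the
// `%x:gpr = COPY $x8` MIR test, where %x becomes gprnox0). VL operands must
// not be x0, so constrain the class; this call mutates MRI.
MRI->constrainRegClass(CommonVL->getReg(), &RISCV::GPRNoX0RegClass);
```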

Lines changed: 44 additions & 0 deletions
```llvm
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s

define i32 @loop_live_out(ptr %p, i64 %n) {
; CHECK-LABEL: loop_live_out:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    mv a2, a0
; CHECK-NEXT:  .LBB0_1: # %loop
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vsetvli a3, a1, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a2)
; CHECK-NEXT:    sub a1, a1, a3
; CHECK-NEXT:    vadd.vi v8, v8, 1
; CHECK-NEXT:    vse32.v v8, (a2)
; CHECK-NEXT:    slli a2, a3, 2
; CHECK-NEXT:    add a2, a0, a2
; CHECK-NEXT:    bnez a1, .LBB0_1
; CHECK-NEXT:  # %bb.2: # %exit
; CHECK-NEXT:    addi a3, a3, -1
; CHECK-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
; CHECK-NEXT:    vslidedown.vx v8, v8, a3
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
entry:
  br label %loop

loop:
  %avl = phi i64 [%n, %entry], [%avl.next, %loop]
  %gep = phi ptr [%p, %entry], [%gep.next, %loop]
  %vl = call i32 @llvm.experimental.get.vector.length(i64 %avl, i32 4, i1 true)
  %x = call <vscale x 4 x i32> @llvm.vp.load(ptr %gep, <vscale x 4 x i1> splat (i1 true), i32 %vl)
  %y = add <vscale x 4 x i32> %x, splat (i32 1)
  call void @llvm.vp.store(<vscale x 4 x i32> %y, ptr %gep, <vscale x 4 x i1> splat (i1 true), i32 %vl)
  %vl.zext = zext i32 %vl to i64
  %avl.next = sub i64 %avl, %vl.zext
  %gep.next = getelementptr i32, ptr %p, i32 %vl
  %ec = icmp eq i64 %avl.next, 0
  br i1 %ec, label %exit, label %loop

exit:
  %lastidx = sub i64 %vl.zext, 1
  %lastelt = extractelement <vscale x 4 x i32> %y, i64 %lastidx
  ret i32 %lastelt
}
```

llvm/test/CodeGen/RISCV/rvv/vl-opt.mir

Lines changed: 35 additions & 0 deletions
```diff
@@ -778,3 +778,38 @@ body: |
     ; CHECK: DBG_VALUE %0:vr
     DBG_VALUE %0:vr
 ...
+---
+name: vslidedown_vx
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $x8
+    ; CHECK-LABEL: name: vslidedown_vx
+    ; CHECK: liveins: $x8
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: %x:gprnox0 = COPY $x8
+    ; CHECK-NEXT: %y:gprnox0 = ADDI %x, -1
+    ; CHECK-NEXT: %v:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, %x, 5 /* e32 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %w:vr = PseudoVSLIDEDOWN_VX_M1 $noreg, %v, %y, 1, 5 /* e32 */, 0 /* tu, mu */
+    %x:gpr = COPY $x8
+    %y:gprnox0 = ADDI %x, -1
+    %v:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 5 /* e32 */, 0 /* tu, mu */
+    %w:vr = PseudoVSLIDEDOWN_VX_M1 $noreg, %v, %y, 1, 5 /* e32 */, 0 /* tu, mu */
+...
+---
+# Make sure we ignore LIs (ADDI $x0, -1)
+name: vslidedown_vx_li
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $x8
+    ; CHECK-LABEL: name: vslidedown_vx_li
+    ; CHECK: liveins: $x8
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: %y:gprnox0 = ADDI $x0, -1
+    ; CHECK-NEXT: %v:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 5 /* e32 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %w:vr = PseudoVSLIDEDOWN_VX_M1 $noreg, %v, %y, 1, 5 /* e32 */, 0 /* tu, mu */
+    %y:gprnox0 = ADDI $x0, -1
+    %v:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 5 /* e32 */, 0 /* tu, mu */
+    %w:vr = PseudoVSLIDEDOWN_VX_M1 $noreg, %v, %y, 1, 5 /* e32 */, 0 /* tu, mu */
+...
```
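Why the second test must not be "optimized" (my note, inferred from the tests above): `ADDI $x0, -1` is the canonical LI idiom for the constant -1, not `%x - 1` for some VL-producing `%x`, so there is no register whose value equals offset + 1 that the producer's VL could be shrunk to, and the CHECK lines show %v keeping its VLMAX AVL of -1. The pattern catches this because $x0 is a physical register; the relevant guard, quoted from the patch:

```cpp
// Reject ADDIs whose source is not a virtual register, e.g. the LI idiom
// `ADDI $x0, -1`: a constant slide amount carries no VL we can demand.
if (SlideAmtDef->getOpcode() != RISCV::ADDI ||
    SlideAmtDef->getOperand(2).getImm() != -AVL.getImm() ||
    !SlideAmtDef->getOperand(1).getReg().isVirtual())
  return std::nullopt;
```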
