Commit 727770a

[RISCV] Adjust RISCVVectorMaskDAGMutation to look for copy to V0
This mutation was introduced in 01a15dc with the goal of avoiding many copies from V1-V31 to V0 immediately before a mask-consuming instruction. I noticed in a workload that it was not applying to vmv.s.x (which we use for short vector masks), because we had been matching against a whitelist of mask-producing instructions. Instead, we can directly inspect the user of the current node to see whether it is a copy to V0. This isn't quite precise (the mask-producing instruction could already be scheduled fairly far from its single use), but it is probably good enough.

As with all scheduling changes, results are mixed: some significant improvements, some regressions.
1 parent 248be98 commit 727770a
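
For illustration, the pattern this targets looks roughly like the following (a sketch distilled from the test diffs below; register numbers are hypothetical and vsetvli bookkeeping is omitted):

# Before: the short mask is built early into a scratch register, then
# copied whole-register into v0 just before the masked instruction.
vmv.s.x    v9, a1           # materialize the short mask bits
...                         # unrelated work scheduled in between
vmv1r.v    v0, v9           # extra copy into v0
vmerge.vim v8, v8, 2, v0    # mask-consuming instruction

# After: the producer is kept adjacent to its sole use, so the mask
# can be written into v0 directly and the copy disappears.
vmv.s.x    v0, a1
vmerge.vim v8, v8, 2, v0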

22 files changed (+812, −975 lines)

llvm/lib/Target/RISCV/RISCVVectorMaskDAGMutation.cpp

Lines changed: 21 additions & 68 deletions
@@ -13,11 +13,8 @@
 // The reason why we need to do this:
 // 1. When tracking register pressure, we don't track physical registers.
 // 2. We have a RegisterClass for mask register (which is `VMV0`), but we don't
-//    use it in most RVV pseudos (only used in inline asm constraint and add/sub
-//    with carry instructions). Instead, we use physical register V0 directly
-//    and insert a `$v0 = COPY ...` before the use. And, there is a fundamental
-//    issue in register allocator when handling RegisterClass with only one
-//    physical register, so we can't simply replace V0 with VMV0.
+//    use it by the time we reach scheduling. Instead, we use physical
+//    register V0 directly and insert a `$v0 = COPY ...` before the use.
 // 3. For mask producers, we are using VR RegisterClass (we can allocate V0-V31
 //    to it). So if V0 is not available, there are still 31 available registers
 //    out there.
@@ -43,83 +40,39 @@
 
 namespace llvm {
 
-static inline bool isVectorMaskProducer(const MachineInstr *MI) {
-  switch (RISCV::getRVVMCOpcode(MI->getOpcode())) {
-  // Vector Mask Instructions
-  case RISCV::VMAND_MM:
-  case RISCV::VMNAND_MM:
-  case RISCV::VMANDN_MM:
-  case RISCV::VMXOR_MM:
-  case RISCV::VMOR_MM:
-  case RISCV::VMNOR_MM:
-  case RISCV::VMORN_MM:
-  case RISCV::VMXNOR_MM:
-  case RISCV::VMSBF_M:
-  case RISCV::VMSIF_M:
-  case RISCV::VMSOF_M:
-  // Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions
-  case RISCV::VMADC_VV:
-  case RISCV::VMADC_VX:
-  case RISCV::VMADC_VI:
-  case RISCV::VMADC_VVM:
-  case RISCV::VMADC_VXM:
-  case RISCV::VMADC_VIM:
-  case RISCV::VMSBC_VV:
-  case RISCV::VMSBC_VX:
-  case RISCV::VMSBC_VVM:
-  case RISCV::VMSBC_VXM:
-  // Vector Integer Compare Instructions
-  case RISCV::VMSEQ_VV:
-  case RISCV::VMSEQ_VX:
-  case RISCV::VMSEQ_VI:
-  case RISCV::VMSNE_VV:
-  case RISCV::VMSNE_VX:
-  case RISCV::VMSNE_VI:
-  case RISCV::VMSLT_VV:
-  case RISCV::VMSLT_VX:
-  case RISCV::VMSLTU_VV:
-  case RISCV::VMSLTU_VX:
-  case RISCV::VMSLE_VV:
-  case RISCV::VMSLE_VX:
-  case RISCV::VMSLE_VI:
-  case RISCV::VMSLEU_VV:
-  case RISCV::VMSLEU_VX:
-  case RISCV::VMSLEU_VI:
-  case RISCV::VMSGTU_VX:
-  case RISCV::VMSGTU_VI:
-  case RISCV::VMSGT_VX:
-  case RISCV::VMSGT_VI:
-  // Vector Floating-Point Compare Instructions
-  case RISCV::VMFEQ_VV:
-  case RISCV::VMFEQ_VF:
-  case RISCV::VMFNE_VV:
-  case RISCV::VMFNE_VF:
-  case RISCV::VMFLT_VV:
-  case RISCV::VMFLT_VF:
-  case RISCV::VMFLE_VV:
-  case RISCV::VMFLE_VF:
-  case RISCV::VMFGT_VF:
-  case RISCV::VMFGE_VF:
-    return true;
-  }
-  return false;
-}
-
 class RISCVVectorMaskDAGMutation : public ScheduleDAGMutation {
 private:
   const TargetRegisterInfo *TRI;
 
 public:
   RISCVVectorMaskDAGMutation(const TargetRegisterInfo *TRI) : TRI(TRI) {}
 
+  bool isSoleUseCopyToV0(SUnit &SU) {
+    if (SU.Succs.size() != 1)
+      return false;
+    SDep &Dep = SU.Succs[0];
+    // Ignore dependencies other than data or strong ordering.
+    if (Dep.isWeak())
+      return false;
+
+    SUnit &DepSU = *Dep.getSUnit();
+    if (DepSU.isBoundaryNode())
+      return false;
+
+    const MachineInstr *DepMI = DepSU.getInstr();
+    return DepMI->isCopy() && DepMI->getOperand(0).getReg() == RISCV::V0 &&
+           DepMI->getOperand(1).getReg().isVirtual() &&
+           DepMI->getOperand(1).getSubReg() == RISCV::NoSubRegister;
+  }
+
   void apply(ScheduleDAGInstrs *DAG) override {
     SUnit *NearestUseV0SU = nullptr;
     for (SUnit &SU : DAG->SUnits) {
       const MachineInstr *MI = SU.getInstr();
       if (MI->findRegisterUseOperand(RISCV::V0, TRI))
         NearestUseV0SU = &SU;
 
-      if (NearestUseV0SU && NearestUseV0SU != &SU && isVectorMaskProducer(MI) &&
+      if (NearestUseV0SU && NearestUseV0SU != &SU && isSoleUseCopyToV0(SU) &&
           // For LMUL=8 cases, there will be more possibilities to spill.
          // FIXME: We should use RegPressureTracker to do fine-grained
          // controls.
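
The hunk ends mid-condition. For context, a sketch of how a helper like this is typically consumed by the mutation (assumed from the surrounding code, not quoted from this commit; isLowLMUL is a hypothetical stand-in for the LMUL=8 guard the comment describes):

      if (NearestUseV0SU && NearestUseV0SU != &SU && isSoleUseCopyToV0(SU) &&
          // Hypothetical guard: skip high-LMUL producers, where pinning the
          // producer next to the V0 use is more likely to cause spills.
          isLowLMUL(MI))
        // Add an artificial ordering edge so the mask producer cannot be
        // scheduled above the previous V0 use; this keeps it adjacent to
        // the copy into V0 that consumes it.
        DAG->addEdge(&SU, SDep(NearestUseV0SU, SDep::Artificial));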

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll

Lines changed: 3 additions & 2 deletions
@@ -46,10 +46,11 @@ define <4 x float> @hang_when_merging_stores_after_legalization(<8 x float> %x,
 ; CHECK-NEXT:    vslideup.vi v12, v10, 2, v0.t
 ; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v0, 2
-; CHECK-NEXT:    vmv.v.i v10, 12
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 6, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; CHECK-NEXT:    vmv.v.i v0, 12
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    vmerge.vvm v8, v8, v12, v0
 ; CHECK-NEXT:    ret
   %z = shufflevector <8 x float> %x, <8 x float> %y, <4 x i32> <i32 0, i32 7, i32 8, i32 15>

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll

Lines changed: 39 additions & 40 deletions
@@ -698,15 +698,16 @@ define void @buildvec_seq_v9i8(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 73
 ; CHECK-NEXT:    vsetivli zero, 9, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v9, 3
+; CHECK-NEXT:    vmv.v.i v8, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vmv.s.x v0, a1
 ; CHECK-NEXT:    li a1, 146
-; CHECK-NEXT:    vmv.s.x v8, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v9, 2, v0
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vmv.s.x v0, a1
+; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vmerge.vim v8, v8, 2, v0
 ; CHECK-NEXT:    vse8.v v8, (a0)
 ; CHECK-NEXT:    ret
   store <9 x i8> <i8 1, i8 2, i8 3, i8 1, i8 2, i8 3, i8 1, i8 2, i8 3>, ptr %x
@@ -973,27 +974,27 @@ define <512 x i8> @buildvec_not_vid_v512i8_indices_overflow_2() vscale_range(16,
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 16, e32, mf2, ta, ma
 ; RV32-NEXT:    vmv.v.i v0, 15
-; RV32-NEXT:    vmv.v.i v9, 0
+; RV32-NEXT:    vmv.v.i v8, 0
 ; RV32-NEXT:    li a0, 512
 ; RV32-NEXT:    li a1, 240
-; RV32-NEXT:    vmv.s.x v8, a1
-; RV32-NEXT:    li a1, 15
-; RV32-NEXT:    vmerge.vim v10, v9, -1, v0
+; RV32-NEXT:    vmerge.vim v9, v8, -1, v0
 ; RV32-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; RV32-NEXT:    vmv.v.i v12, 3
-; RV32-NEXT:    slli a1, a1, 8
-; RV32-NEXT:    vmv1r.v v0, v10
+; RV32-NEXT:    vmv1r.v v0, v9
 ; RV32-NEXT:    vmerge.vim v12, v12, 0, v0
-; RV32-NEXT:    vmv1r.v v0, v8
+; RV32-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
+; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    li a1, 15
 ; RV32-NEXT:    vsetivli zero, 16, e32, mf2, ta, ma
-; RV32-NEXT:    vmerge.vim v10, v9, -1, v0
-; RV32-NEXT:    vmv.s.x v8, a1
-; RV32-NEXT:    vmv1r.v v0, v10
+; RV32-NEXT:    vmerge.vim v9, v8, -1, v0
+; RV32-NEXT:    slli a1, a1, 8
+; RV32-NEXT:    vmv1r.v v0, v9
 ; RV32-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; RV32-NEXT:    vmerge.vim v12, v12, 1, v0
-; RV32-NEXT:    vmv1r.v v0, v8
+; RV32-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
+; RV32-NEXT:    vmv.s.x v0, a1
 ; RV32-NEXT:    vsetivli zero, 16, e32, mf2, ta, ma
-; RV32-NEXT:    vmerge.vim v8, v9, -1, v0
+; RV32-NEXT:    vmerge.vim v8, v8, -1, v0
 ; RV32-NEXT:    vmv1r.v v0, v8
 ; RV32-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; RV32-NEXT:    vmerge.vim v8, v12, 2, v0
@@ -1003,25 +1004,23 @@ define <512 x i8> @buildvec_not_vid_v512i8_indices_overflow_2() vscale_range(16,
 ; RV64V:       # %bb.0:
 ; RV64V-NEXT:    vsetivli zero, 8, e64, m1, ta, ma
 ; RV64V-NEXT:    vmv.v.i v0, 3
-; RV64V-NEXT:    vmv.v.i v9, 0
+; RV64V-NEXT:    vmv.v.i v8, 0
 ; RV64V-NEXT:    li a0, 512
-; RV64V-NEXT:    vmv.v.i v8, 12
-; RV64V-NEXT:    li a1, 48
-; RV64V-NEXT:    vmerge.vim v10, v9, -1, v0
+; RV64V-NEXT:    vmerge.vim v9, v8, -1, v0
 ; RV64V-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; RV64V-NEXT:    vmv.v.i v12, 3
-; RV64V-NEXT:    vmv1r.v v0, v10
+; RV64V-NEXT:    vmv1r.v v0, v9
 ; RV64V-NEXT:    vmerge.vim v12, v12, 0, v0
-; RV64V-NEXT:    vmv1r.v v0, v8
 ; RV64V-NEXT:    vsetivli zero, 8, e64, m1, ta, ma
-; RV64V-NEXT:    vmerge.vim v10, v9, -1, v0
-; RV64V-NEXT:    vmv.s.x v8, a1
-; RV64V-NEXT:    vmv.v.v v0, v10
+; RV64V-NEXT:    vmv.v.i v0, 12
+; RV64V-NEXT:    vmerge.vim v9, v8, -1, v0
+; RV64V-NEXT:    li a1, 48
+; RV64V-NEXT:    vmv.v.v v0, v9
 ; RV64V-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; RV64V-NEXT:    vmerge.vim v12, v12, 1, v0
-; RV64V-NEXT:    vmv1r.v v0, v8
+; RV64V-NEXT:    vmv.s.x v0, a1
 ; RV64V-NEXT:    vsetivli zero, 8, e64, m1, ta, ma
-; RV64V-NEXT:    vmerge.vim v8, v9, -1, v0
+; RV64V-NEXT:    vmerge.vim v8, v8, -1, v0
 ; RV64V-NEXT:    vmv.v.v v0, v8
 ; RV64V-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; RV64V-NEXT:    vmerge.vim v8, v12, 2, v0
@@ -1031,27 +1030,27 @@ define <512 x i8> @buildvec_not_vid_v512i8_indices_overflow_2() vscale_range(16,
 ; RV64ZVE32:       # %bb.0:
 ; RV64ZVE32-NEXT:    vsetivli zero, 16, e32, m1, ta, ma
 ; RV64ZVE32-NEXT:    vmv.v.i v0, 15
-; RV64ZVE32-NEXT:    vmv.v.i v9, 0
+; RV64ZVE32-NEXT:    vmv.v.i v8, 0
 ; RV64ZVE32-NEXT:    li a0, 512
 ; RV64ZVE32-NEXT:    li a1, 240
-; RV64ZVE32-NEXT:    vmv.s.x v8, a1
-; RV64ZVE32-NEXT:    li a1, 15
-; RV64ZVE32-NEXT:    vmerge.vim v10, v9, -1, v0
+; RV64ZVE32-NEXT:    vmerge.vim v9, v8, -1, v0
 ; RV64ZVE32-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; RV64ZVE32-NEXT:    vmv.v.i v12, 3
-; RV64ZVE32-NEXT:    slli a1, a1, 8
-; RV64ZVE32-NEXT:    vmv1r.v v0, v10
+; RV64ZVE32-NEXT:    vmv1r.v v0, v9
 ; RV64ZVE32-NEXT:    vmerge.vim v12, v12, 0, v0
-; RV64ZVE32-NEXT:    vmv1r.v v0, v8
+; RV64ZVE32-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
+; RV64ZVE32-NEXT:    vmv.s.x v0, a1
+; RV64ZVE32-NEXT:    li a1, 15
 ; RV64ZVE32-NEXT:    vsetivli zero, 16, e32, m1, ta, ma
-; RV64ZVE32-NEXT:    vmerge.vim v10, v9, -1, v0
-; RV64ZVE32-NEXT:    vmv.s.x v8, a1
-; RV64ZVE32-NEXT:    vmv.v.v v0, v10
+; RV64ZVE32-NEXT:    vmerge.vim v9, v8, -1, v0
+; RV64ZVE32-NEXT:    slli a1, a1, 8
+; RV64ZVE32-NEXT:    vmv.v.v v0, v9
 ; RV64ZVE32-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; RV64ZVE32-NEXT:    vmerge.vim v12, v12, 1, v0
-; RV64ZVE32-NEXT:    vmv1r.v v0, v8
+; RV64ZVE32-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
+; RV64ZVE32-NEXT:    vmv.s.x v0, a1
 ; RV64ZVE32-NEXT:    vsetivli zero, 16, e32, m1, ta, ma
-; RV64ZVE32-NEXT:    vmerge.vim v8, v9, -1, v0
+; RV64ZVE32-NEXT:    vmerge.vim v8, v8, -1, v0
 ; RV64ZVE32-NEXT:    vmv.v.v v0, v8
 ; RV64ZVE32-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; RV64ZVE32-NEXT:    vmerge.vim v8, v12, 2, v0

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll

Lines changed: 7 additions & 9 deletions
@@ -105,11 +105,10 @@ define <4 x i16> @vrgather_shuffle_vx_v4i16(<4 x i16> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vmv.v.i v0, 2
-; CHECK-NEXT:    vmv.v.i v9, 3
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vmv.v.i v10, 5
-; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vmerge.vvm v8, v10, v8, v0
+; CHECK-NEXT:    vmv.v.i v0, 3
+; CHECK-NEXT:    vmv.v.i v9, 5
+; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; CHECK-NEXT:    ret
   %s = shufflevector <4 x i16> %x, <4 x i16> <i16 5, i16 5, i16 5, i16 5>, <4 x i32> <i32 0, i32 3, i32 6, i32 5>
   ret <4 x i16> %s
@@ -971,13 +970,12 @@ define <8 x i32> @shuffle_repeat3_singlesrc_e32(<8 x i32> %v) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.i v0, 7
-; CHECK-NEXT:    vmv.v.i v10, 1
+; CHECK-NEXT:    vmv.v.i v9, 1
 ; CHECK-NEXT:    li a0, 192
-; CHECK-NEXT:    vmv.s.x v9, a0
+; CHECK-NEXT:    vmerge.vim v9, v9, 0, v0
+; CHECK-NEXT:    vmv.s.x v0, a0
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    vmerge.vim v10, v10, 0, v0
-; CHECK-NEXT:    vmv.v.v v0, v9
-; CHECK-NEXT:    vmerge.vim v9, v10, 2, v0
+; CHECK-NEXT:    vmerge.vim v9, v9, 2, v0
 ; CHECK-NEXT:    srli a0, a0, 2
 ; CHECK-NEXT:    vslidedown.vx v10, v9, a0
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
