Skip to content

Commit 116ca95

Browse files
authored
Greedy: Take copy hints involving subregisters (#159570)
Previously this would only accept full copy hints. This change relaxes that restriction to accept some subregister copies. Specifically, it now accepts:
- Copies to/from physical registers if there is a compatible super register
- Subreg-to-subreg copies

This has the potential to repeatedly add the same hint to the hint vector, but it is not clear whether that is a real problem.
1 parent 33e8e5a commit 116ca95

File tree

9 files changed

+89
-89
lines changed

9 files changed

+89
-89
lines changed

llvm/lib/CodeGen/RegAllocGreedy.cpp

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2387,19 +2387,42 @@ void RAGreedy::initializeCSRCost() {
23872387
/// The results are stored into \p Out.
23882388
/// \p Out is not cleared before being populated.
23892389
void RAGreedy::collectHintInfo(Register Reg, HintsInfo &Out) {
2390+
const TargetRegisterClass *RC = MRI->getRegClass(Reg);
2391+
23902392
for (const MachineInstr &Instr : MRI->reg_nodbg_instructions(Reg)) {
2391-
if (!TII->isFullCopyInstr(Instr))
2393+
if (!Instr.isCopy())
23922394
continue;
2395+
23932396
// Look for the other end of the copy.
23942397
Register OtherReg = Instr.getOperand(0).getReg();
2398+
unsigned OtherSubReg = Instr.getOperand(0).getSubReg();
2399+
unsigned SubReg = Instr.getOperand(1).getSubReg();
2400+
23952401
if (OtherReg == Reg) {
23962402
OtherReg = Instr.getOperand(1).getReg();
2403+
OtherSubReg = Instr.getOperand(1).getSubReg();
2404+
SubReg = Instr.getOperand(0).getSubReg();
23972405
if (OtherReg == Reg)
23982406
continue;
23992407
}
2408+
24002409
// Get the current assignment.
24012410
MCRegister OtherPhysReg =
24022411
OtherReg.isPhysical() ? OtherReg.asMCReg() : VRM->getPhys(OtherReg);
2412+
if (OtherSubReg) {
2413+
if (OtherReg.isPhysical()) {
2414+
MCRegister Tuple =
2415+
TRI->getMatchingSuperReg(OtherPhysReg, OtherSubReg, RC);
2416+
if (!Tuple)
2417+
continue;
2418+
OtherPhysReg = Tuple;
2419+
} else {
2420+
// TODO: There should be a hinting mechanism for subregisters
2421+
if (SubReg != OtherSubReg)
2422+
continue;
2423+
}
2424+
}
2425+
24032426
// Push the collected information.
24042427
Out.push_back(HintInfo(MBFI->getBlockFreq(Instr.getParent()), OtherReg,
24052428
OtherPhysReg));

llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -159246,7 +159246,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
159246159246
; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v61
159247159247
; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v61
159248159248
; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v60
159249-
; GFX9-NEXT: v_mov_b32_e32 v33, v60
159249+
; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v60
159250159250
; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
159251159251
; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
159252159252
; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
@@ -159259,7 +159259,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
159259159259
; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v48
159260159260
; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v58
159261159261
; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v58
159262-
; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v33
159263159262
; GFX9-NEXT: s_waitcnt vmcnt(3)
159264159263
; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v61
159265159264
; GFX9-NEXT: s_waitcnt vmcnt(2)

llvm/test/CodeGen/AMDGPU/load-global-i16.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7398,7 +7398,7 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
73987398
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15
73997399
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v36, 16, v17
74007400
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(2)
7401-
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 16, v20
7401+
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 16, v20
74027402
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16
74037403
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v14
74047404
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v14
@@ -7413,7 +7413,6 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
74137413
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v10, 16, v18
74147414
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v18
74157415
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v20
7416-
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, v5
74177416
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v19
74187417
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v19
74197418
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v44, 16, v21

llvm/test/CodeGen/AMDGPU/load-local-i16.ll

Lines changed: 10 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -3851,9 +3851,9 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
38513851
; VI-DS128-NEXT: v_and_b32_e32 v32, 0xffff, v24
38523852
; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:80
38533853
; VI-DS128-NEXT: ds_read_b128 v[55:58], v0 offset:96
3854-
; VI-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v11
3854+
; VI-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v11
38553855
; VI-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v10
3856-
; VI-DS128-NEXT: v_mov_b32_e32 v31, v15
3856+
; VI-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v9
38573857
; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
38583858
; VI-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v27
38593859
; VI-DS128-NEXT: v_lshrrev_b32_e32 v48, 16, v26
@@ -3864,17 +3864,16 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
38643864
; VI-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v25
38653865
; VI-DS128-NEXT: v_and_b32_e32 v51, 0xffff, v24
38663866
; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:112
3867-
; VI-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v9
38683867
; VI-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v8
38693868
; VI-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v11
38703869
; VI-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v10
3870+
; VI-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v9
38713871
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
38723872
; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v25
38733873
; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v24
38743874
; VI-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v25
38753875
; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v24
38763876
; VI-DS128-NEXT: v_mov_b32_e32 v24, s0
3877-
; VI-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v9
38783877
; VI-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v8
38793878
; VI-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v39
38803879
; VI-DS128-NEXT: v_lshrrev_b32_e32 v40, 16, v38
@@ -3944,7 +3943,7 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
39443943
; GFX9-DS128-NEXT: s_addc_u32 s13, s13, 0
39453944
; GFX9-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32
39463945
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2)
3947-
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v11
3946+
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v11
39483947
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
39493948
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v19
39503949
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v18
@@ -3992,8 +3991,8 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
39923991
; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:80
39933992
; GFX9-DS128-NEXT: ds_read_b128 v[55:58], v0 offset:96
39943993
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v10
3995-
; GFX9-DS128-NEXT: v_mov_b32_e32 v31, v15
39963994
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v9
3995+
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v8
39973996
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
39983997
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v27
39993998
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v48, 16, v26
@@ -4004,17 +4003,16 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
40044003
; GFX9-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v25
40054004
; GFX9-DS128-NEXT: v_and_b32_e32 v51, 0xffff, v24
40064005
; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:112
4007-
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v8
40084006
; GFX9-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v11
40094007
; GFX9-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v10
40104008
; GFX9-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v9
4009+
; GFX9-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v8
40114010
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
40124011
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v25
40134012
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v24
40144013
; GFX9-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v25
40154014
; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v24
40164015
; GFX9-DS128-NEXT: v_mov_b32_e32 v24, s0
4017-
; GFX9-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v8
40184016
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v39
40194017
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v40, 16, v38
40204018
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37
@@ -4890,7 +4888,7 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
48904888
; VI-DS128-NEXT: s_waitcnt lgkmcnt(2)
48914889
; VI-DS128-NEXT: v_ashrrev_i32_e32 v53, 16, v40
48924890
; VI-DS128-NEXT: v_bfe_i32 v52, v40, 0, 16
4893-
; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v11
4891+
; VI-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v11
48944892
; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
48954893
; VI-DS128-NEXT: v_ashrrev_i32_e32 v47, 16, v39
48964894
; VI-DS128-NEXT: v_ashrrev_i32_e32 v45, 16, v38
@@ -4901,14 +4899,13 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
49014899
; VI-DS128-NEXT: ds_read_b128 v[37:40], v32 offset:112
49024900
; VI-DS128-NEXT: v_mov_b32_e32 v32, s0
49034901
; VI-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v10
4904-
; VI-DS128-NEXT: v_mov_b32_e32 v23, v15
49054902
; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v9
4903+
; VI-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v8
49064904
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
49074905
; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v38
49084906
; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v37
49094907
; VI-DS128-NEXT: v_bfe_i32 v2, v38, 0, 16
49104908
; VI-DS128-NEXT: v_bfe_i32 v0, v37, 0, 16
4911-
; VI-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v8
49124909
; VI-DS128-NEXT: v_bfe_i32 v22, v11, 0, 16
49134910
; VI-DS128-NEXT: v_bfe_i32 v20, v10, 0, 16
49144911
; VI-DS128-NEXT: v_bfe_i32 v14, v9, 0, 16
@@ -4986,7 +4983,7 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
49864983
; GFX9-DS128-NEXT: s_addc_u32 s13, s13, 0
49874984
; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32
49884985
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2)
4989-
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v11
4986+
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v11
49904987
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
49914988
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v19
49924989
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v18
@@ -5031,15 +5028,14 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
50315028
; GFX9-DS128-NEXT: v_bfe_i32 v50, v37, 0, 16
50325029
; GFX9-DS128-NEXT: ds_read_b128 v[37:40], v32 offset:112
50335030
; GFX9-DS128-NEXT: v_mov_b32_e32 v32, s0
5034-
; GFX9-DS128-NEXT: v_mov_b32_e32 v23, v15
50355031
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v9
50365032
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v8
5033+
; GFX9-DS128-NEXT: v_bfe_i32 v22, v11, 0, 16
50375034
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
50385035
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v38
50395036
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v37
50405037
; GFX9-DS128-NEXT: v_bfe_i32 v2, v38, 0, 16
50415038
; GFX9-DS128-NEXT: v_bfe_i32 v0, v37, 0, 16
5042-
; GFX9-DS128-NEXT: v_bfe_i32 v22, v11, 0, 16
50435039
; GFX9-DS128-NEXT: v_bfe_i32 v20, v10, 0, 16
50445040
; GFX9-DS128-NEXT: v_bfe_i32 v14, v9, 0, 16
50455041
; GFX9-DS128-NEXT: v_bfe_i32 v12, v8, 0, 16

llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ define void @eliminate_spill_after_mfma_rewrite(i32 %x, i32 %y, <4 x i32> %arg,
101101
; CHECK-NEXT: v_accvgpr_read_b32 v2, a2
102102
; CHECK-NEXT: v_accvgpr_read_b32 v3, a3
103103
; CHECK-NEXT: ;;#ASMSTART
104-
; CHECK-NEXT: ; def v[10:13]
104+
; CHECK-NEXT: ; def v[6:9]
105105
; CHECK-NEXT: ;;#ASMEND
106106
; CHECK-NEXT: v_mov_b32_e32 v0, 0
107107
; CHECK-NEXT: ;;#ASMSTART
@@ -142,7 +142,7 @@ define void @eliminate_spill_after_mfma_rewrite(i32 %x, i32 %y, <4 x i32> %arg,
142142
; CHECK-NEXT: s_waitcnt vmcnt(0)
143143
; CHECK-NEXT: global_store_dwordx4 v0, a[36:39], s[16:17] offset:16
144144
; CHECK-NEXT: s_waitcnt vmcnt(0)
145-
; CHECK-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17]
145+
; CHECK-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17]
146146
; CHECK-NEXT: s_waitcnt vmcnt(0)
147147
; CHECK-NEXT: buffer_load_dword a63, off, s[0:3], s32 ; 4-byte Folded Reload
148148
; CHECK-NEXT: buffer_load_dword a62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -306,10 +306,10 @@ define void @eliminate_spill_after_mfma_rewrite_x2(i32 %x, i32 %y, <4 x i32> %ar
306306
; CHECK-NEXT: v_accvgpr_read_b32 v5, a1
307307
; CHECK-NEXT: v_accvgpr_read_b32 v4, a0
308308
; CHECK-NEXT: ;;#ASMSTART
309-
; CHECK-NEXT: ; def v[10:13]
309+
; CHECK-NEXT: ; def v[8:11]
310310
; CHECK-NEXT: ;;#ASMEND
311311
; CHECK-NEXT: ;;#ASMSTART
312-
; CHECK-NEXT: ; def v[14:17]
312+
; CHECK-NEXT: ; def v[12:15]
313313
; CHECK-NEXT: ;;#ASMEND
314314
; CHECK-NEXT: ;;#ASMSTART
315315
; CHECK-NEXT: ; def a[0:31]
@@ -349,9 +349,9 @@ define void @eliminate_spill_after_mfma_rewrite_x2(i32 %x, i32 %y, <4 x i32> %ar
349349
; CHECK-NEXT: s_waitcnt vmcnt(0)
350350
; CHECK-NEXT: global_store_dwordx4 v0, a[36:39], s[16:17] offset:16
351351
; CHECK-NEXT: s_waitcnt vmcnt(0)
352-
; CHECK-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17]
352+
; CHECK-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17]
353353
; CHECK-NEXT: s_waitcnt vmcnt(0)
354-
; CHECK-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17]
354+
; CHECK-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17]
355355
; CHECK-NEXT: s_waitcnt vmcnt(0)
356356
; CHECK-NEXT: buffer_load_dword a63, off, s[0:3], s32 ; 4-byte Folded Reload
357357
; CHECK-NEXT: buffer_load_dword a62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -415,8 +415,7 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
415415
; RV32-NEXT: mul a4, a4, a5
416416
; RV32-NEXT: add a4, sp, a4
417417
; RV32-NEXT: addi a4, a4, 16
418-
; RV32-NEXT: vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
419-
; RV32-NEXT: vmv4r.v v8, v24
418+
; RV32-NEXT: vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
420419
; RV32-NEXT: csrr a4, vlenb
421420
; RV32-NEXT: slli a4, a4, 4
422421
; RV32-NEXT: add a4, sp, a4
@@ -726,8 +725,7 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
726725
; RV64-NEXT: mul a4, a4, a5
727726
; RV64-NEXT: add a4, sp, a4
728727
; RV64-NEXT: addi a4, a4, 32
729-
; RV64-NEXT: vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
730-
; RV64-NEXT: vmv4r.v v8, v24
728+
; RV64-NEXT: vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
731729
; RV64-NEXT: csrr a4, vlenb
732730
; RV64-NEXT: slli a4, a4, 4
733731
; RV64-NEXT: add a4, sp, a4

llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll

Lines changed: 8 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -8831,8 +8831,7 @@ define <vscale x 32 x half> @vfnmadd_vv_nxv32f16(<vscale x 32 x half> %va, <vsca
88318831
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
88328832
; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16, v0.t
88338833
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
8834-
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24, v0.t
8835-
; ZVFHMIN-NEXT: vmv.v.v v4, v12
8834+
; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v24, v0.t
88368835
; ZVFHMIN-NEXT: bltu a1, a0, .LBB286_2
88378836
; ZVFHMIN-NEXT: # %bb.1:
88388837
; ZVFHMIN-NEXT: mv a1, a0
@@ -9460,8 +9459,7 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_commute(<vscale x 32 x half> %v
94609459
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
94619460
; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v24, v0.t
94629461
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
9463-
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t
9464-
; ZVFHMIN-NEXT: vmv.v.v v4, v12
9462+
; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v16, v0.t
94659463
; ZVFHMIN-NEXT: bltu a0, a1, .LBB291_2
94669464
; ZVFHMIN-NEXT: # %bb.1:
94679465
; ZVFHMIN-NEXT: mv a0, a1
@@ -9832,8 +9830,7 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat(<vscale x 32 x half>
98329830
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
98339831
; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v24, v0.t
98349832
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
9835-
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t
9836-
; ZVFHMIN-NEXT: vmv.v.v v4, v12
9833+
; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v16, v0.t
98379834
; ZVFHMIN-NEXT: bltu a0, a1, .LBB294_2
98389835
; ZVFHMIN-NEXT: # %bb.1:
98399836
; ZVFHMIN-NEXT: mv a0, a1
@@ -10347,8 +10344,7 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vsca
1034710344
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
1034810345
; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16, v0.t
1034910346
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
10350-
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24, v0.t
10351-
; ZVFHMIN-NEXT: vmv.v.v v4, v12
10347+
; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v24, v0.t
1035210348
; ZVFHMIN-NEXT: bltu a1, a0, .LBB298_2
1035310349
; ZVFHMIN-NEXT: # %bb.1:
1035410350
; ZVFHMIN-NEXT: mv a1, a0
@@ -10975,8 +10971,7 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_commute(<vscale x 32 x half> %v
1097510971
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
1097610972
; ZVFHMIN-NEXT: vfmadd.vv v24, v16, v8, v0.t
1097710973
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
10978-
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24, v0.t
10979-
; ZVFHMIN-NEXT: vmv.v.v v4, v12
10974+
; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v24, v0.t
1098010975
; ZVFHMIN-NEXT: bltu a0, a1, .LBB303_2
1098110976
; ZVFHMIN-NEXT: # %bb.1:
1098210977
; ZVFHMIN-NEXT: mv a0, a1
@@ -11343,8 +11338,7 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat(<vscale x 32 x half>
1134311338
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
1134411339
; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16, v0.t
1134511340
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
11346-
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24, v0.t
11347-
; ZVFHMIN-NEXT: vmv.v.v v4, v12
11341+
; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v24, v0.t
1134811342
; ZVFHMIN-NEXT: bltu a0, a1, .LBB306_2
1134911343
; ZVFHMIN-NEXT: # %bb.1:
1135011344
; ZVFHMIN-NEXT: mv a0, a1
@@ -11453,12 +11447,11 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat_commute(<vscale x 32
1145311447
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
1145411448
; ZVFHMIN-NEXT: addi a2, sp, 16
1145511449
; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
11456-
; ZVFHMIN-NEXT: vmv4r.v v8, v24
1145711450
; ZVFHMIN-NEXT: csrr a2, vlenb
1145811451
; ZVFHMIN-NEXT: slli a2, a2, 4
1145911452
; ZVFHMIN-NEXT: add a2, sp, a2
1146011453
; ZVFHMIN-NEXT: addi a2, a2, 16
11461-
; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
11454+
; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
1146211455
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28, v0.t
1146311456
; ZVFHMIN-NEXT: csrr a2, vlenb
1146411457
; ZVFHMIN-NEXT: slli a2, a2, 3
@@ -11580,12 +11573,11 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat_unmasked(<vscale x 32
1158011573
; ZVFHMIN-NEXT: sltu a3, a0, a2
1158111574
; ZVFHMIN-NEXT: addi a3, a3, -1
1158211575
; ZVFHMIN-NEXT: and a2, a3, a2
11583-
; ZVFHMIN-NEXT: vmv4r.v v8, v16
1158411576
; ZVFHMIN-NEXT: csrr a3, vlenb
1158511577
; ZVFHMIN-NEXT: slli a3, a3, 3
1158611578
; ZVFHMIN-NEXT: add a3, sp, a3
1158711579
; ZVFHMIN-NEXT: addi a3, a3, 16
11588-
; ZVFHMIN-NEXT: vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
11580+
; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
1158911581
; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
1159011582
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
1159111583
; ZVFHMIN-NEXT: csrr a2, vlenb

0 commit comments

Comments
 (0)