Skip to content

Commit bc323b6

Browse files
authored
AMDGPU: Stop implementing shouldCoalesce (#168988)
Use the default shouldCoalesce implementation, which freely coalesces anything it can. This mostly shows improvements, with a handful of regressions. The main concern is whether introducing wider register tuples makes it more likely that register usage gets pushed into the next occupancy tier.
1 parent 77c329f commit bc323b6

File tree

66 files changed

+61640
-83855
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

66 files changed

+61640
-83855
lines changed

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -3755,20 +3755,6 @@ bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI,
37553755
return RC && isAGPRClass(RC);
37563756
}
37573757

3758-
bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
3759-
const TargetRegisterClass *SrcRC,
3760-
unsigned SubReg,
3761-
const TargetRegisterClass *DstRC,
3762-
unsigned DstSubReg,
3763-
const TargetRegisterClass *NewRC,
3764-
LiveIntervals &LIS) const {
3765-
// TODO: This should be more aggressive, but be more cautious with very wide
3766-
// tuples.
3767-
unsigned NewSize = getRegSizeInBits(*NewRC);
3768-
return NewSize <= 128 || NewSize <= getRegSizeInBits(*SrcRC) ||
3769-
NewSize <= getRegSizeInBits(*DstRC);
3770-
}
3771-
37723758
unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
37733759
MachineFunction &MF) const {
37743760
unsigned MinOcc = ST.getOccupancyWithWorkGroupSizes(MF).first;

llvm/lib/Target/AMDGPU/SIRegisterInfo.h

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -346,14 +346,6 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
346346
ArrayRef<int16_t> getRegSplitParts(const TargetRegisterClass *RC,
347347
unsigned EltSize) const;
348348

349-
bool shouldCoalesce(MachineInstr *MI,
350-
const TargetRegisterClass *SrcRC,
351-
unsigned SubReg,
352-
const TargetRegisterClass *DstRC,
353-
unsigned DstSubReg,
354-
const TargetRegisterClass *NewRC,
355-
LiveIntervals &LIS) const override;
356-
357349
unsigned getRegPressureLimit(const TargetRegisterClass *RC,
358350
MachineFunction &MF) const override;
359351

llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll

Lines changed: 340 additions & 341 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll

Lines changed: 112 additions & 120 deletions
Original file line numberDiff line numberDiff line change
@@ -2853,52 +2853,50 @@ define inreg <4 x i32> @bitcast_v8bf16_to_v4i32_scalar(<8 x bfloat> inreg %a, i3
28532853
; SI: ; %bb.0:
28542854
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28552855
; SI-NEXT: s_cmp_lg_u32 s24, 0
2856-
; SI-NEXT: v_mul_f32_e64 v15, 1.0, s17
2857-
; SI-NEXT: v_mul_f32_e64 v10, 1.0, s16
2858-
; SI-NEXT: v_mul_f32_e64 v14, 1.0, s19
2859-
; SI-NEXT: v_mul_f32_e64 v8, 1.0, s18
2860-
; SI-NEXT: v_mul_f32_e64 v13, 1.0, s21
2861-
; SI-NEXT: v_mul_f32_e64 v6, 1.0, s20
2862-
; SI-NEXT: v_mul_f32_e64 v12, 1.0, s23
2863-
; SI-NEXT: v_mul_f32_e64 v4, 1.0, s22
2856+
; SI-NEXT: v_mul_f32_e64 v16, 1.0, s17
2857+
; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16
2858+
; SI-NEXT: v_mul_f32_e64 v15, 1.0, s19
2859+
; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18
2860+
; SI-NEXT: v_mul_f32_e64 v14, 1.0, s21
2861+
; SI-NEXT: v_mul_f32_e64 v7, 1.0, s20
2862+
; SI-NEXT: v_mul_f32_e64 v13, 1.0, s23
2863+
; SI-NEXT: v_mul_f32_e64 v5, 1.0, s22
28642864
; SI-NEXT: s_cbranch_scc0 .LBB23_4
28652865
; SI-NEXT: ; %bb.1: ; %cmp.false
2866-
; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15
2867-
; SI-NEXT: v_lshr_b64 v[0:1], v[10:11], 16
2868-
; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v14
2869-
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12
2870-
; SI-NEXT: v_lshr_b64 v[1:2], v[8:9], 16
2871-
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v13
2872-
; SI-NEXT: v_lshr_b64 v[16:17], v[4:5], 16
2873-
; SI-NEXT: v_lshr_b64 v[2:3], v[6:7], 16
2874-
; SI-NEXT: v_mov_b32_e32 v3, v16
2866+
; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v16
2867+
; SI-NEXT: v_lshr_b64 v[0:1], v[11:12], 16
2868+
; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v15
2869+
; SI-NEXT: v_lshr_b64 v[1:2], v[9:10], 16
2870+
; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v14
2871+
; SI-NEXT: v_lshr_b64 v[2:3], v[7:8], 16
2872+
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v13
2873+
; SI-NEXT: v_lshr_b64 v[3:4], v[5:6], 16
28752874
; SI-NEXT: s_cbranch_execnz .LBB23_3
28762875
; SI-NEXT: .LBB23_2: ; %cmp.true
2877-
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15
2878-
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10
2876+
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16
2877+
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11
28792878
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
28802879
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
28812880
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
28822881
; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
2883-
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v14
2884-
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8
2882+
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v15
2883+
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9
28852884
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
28862885
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
28872886
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
2888-
; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v12
28892887
; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
2890-
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v13
2891-
; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
2892-
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
2893-
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
2888+
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v14
2889+
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7
28942890
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
2895-
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
2896-
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
28972891
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
28982892
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
2899-
; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16
29002893
; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16
2901-
; SI-NEXT: v_mov_b32_e32 v3, v4
2894+
; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v13
2895+
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
2896+
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
2897+
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
2898+
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
2899+
; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16
29022900
; SI-NEXT: .LBB23_3: ; %end
29032901
; SI-NEXT: s_setpc_b64 s[30:31]
29042902
; SI-NEXT: .LBB23_4:
@@ -7396,52 +7394,50 @@ define inreg <4 x float> @bitcast_v8bf16_to_v4f32_scalar(<8 x bfloat> inreg %a,
73967394
; SI: ; %bb.0:
73977395
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
73987396
; SI-NEXT: s_cmp_lg_u32 s24, 0
7399-
; SI-NEXT: v_mul_f32_e64 v15, 1.0, s17
7400-
; SI-NEXT: v_mul_f32_e64 v10, 1.0, s16
7401-
; SI-NEXT: v_mul_f32_e64 v14, 1.0, s19
7402-
; SI-NEXT: v_mul_f32_e64 v8, 1.0, s18
7403-
; SI-NEXT: v_mul_f32_e64 v13, 1.0, s21
7404-
; SI-NEXT: v_mul_f32_e64 v6, 1.0, s20
7405-
; SI-NEXT: v_mul_f32_e64 v12, 1.0, s23
7406-
; SI-NEXT: v_mul_f32_e64 v4, 1.0, s22
7397+
; SI-NEXT: v_mul_f32_e64 v16, 1.0, s17
7398+
; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16
7399+
; SI-NEXT: v_mul_f32_e64 v15, 1.0, s19
7400+
; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18
7401+
; SI-NEXT: v_mul_f32_e64 v14, 1.0, s21
7402+
; SI-NEXT: v_mul_f32_e64 v7, 1.0, s20
7403+
; SI-NEXT: v_mul_f32_e64 v13, 1.0, s23
7404+
; SI-NEXT: v_mul_f32_e64 v5, 1.0, s22
74077405
; SI-NEXT: s_cbranch_scc0 .LBB47_4
74087406
; SI-NEXT: ; %bb.1: ; %cmp.false
7409-
; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15
7410-
; SI-NEXT: v_lshr_b64 v[0:1], v[10:11], 16
7411-
; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v14
7412-
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12
7413-
; SI-NEXT: v_lshr_b64 v[1:2], v[8:9], 16
7414-
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v13
7415-
; SI-NEXT: v_lshr_b64 v[16:17], v[4:5], 16
7416-
; SI-NEXT: v_lshr_b64 v[2:3], v[6:7], 16
7417-
; SI-NEXT: v_mov_b32_e32 v3, v16
7407+
; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v16
7408+
; SI-NEXT: v_lshr_b64 v[0:1], v[11:12], 16
7409+
; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v15
7410+
; SI-NEXT: v_lshr_b64 v[1:2], v[9:10], 16
7411+
; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v14
7412+
; SI-NEXT: v_lshr_b64 v[2:3], v[7:8], 16
7413+
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v13
7414+
; SI-NEXT: v_lshr_b64 v[3:4], v[5:6], 16
74187415
; SI-NEXT: s_cbranch_execnz .LBB47_3
74197416
; SI-NEXT: .LBB47_2: ; %cmp.true
7420-
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15
7421-
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10
7417+
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16
7418+
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11
74227419
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
74237420
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
74247421
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
74257422
; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
7426-
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v14
7427-
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8
7423+
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v15
7424+
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9
74287425
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
74297426
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
74307427
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
7431-
; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v12
74327428
; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
7433-
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v13
7434-
; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
7435-
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
7436-
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
7429+
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v14
7430+
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7
74377431
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
7438-
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
7439-
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
74407432
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
74417433
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
7442-
; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16
74437434
; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16
7444-
; SI-NEXT: v_mov_b32_e32 v3, v4
7435+
; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v13
7436+
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
7437+
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
7438+
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
7439+
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
7440+
; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16
74457441
; SI-NEXT: .LBB47_3: ; %end
74467442
; SI-NEXT: s_setpc_b64 s[30:31]
74477443
; SI-NEXT: .LBB47_4:
@@ -11589,52 +11585,50 @@ define inreg <2 x i64> @bitcast_v8bf16_to_v2i64_scalar(<8 x bfloat> inreg %a, i3
1158911585
; SI: ; %bb.0:
1159011586
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1159111587
; SI-NEXT: s_cmp_lg_u32 s24, 0
11592-
; SI-NEXT: v_mul_f32_e64 v15, 1.0, s17
11593-
; SI-NEXT: v_mul_f32_e64 v10, 1.0, s16
11594-
; SI-NEXT: v_mul_f32_e64 v14, 1.0, s19
11595-
; SI-NEXT: v_mul_f32_e64 v8, 1.0, s18
11596-
; SI-NEXT: v_mul_f32_e64 v13, 1.0, s21
11597-
; SI-NEXT: v_mul_f32_e64 v6, 1.0, s20
11598-
; SI-NEXT: v_mul_f32_e64 v12, 1.0, s23
11599-
; SI-NEXT: v_mul_f32_e64 v4, 1.0, s22
11588+
; SI-NEXT: v_mul_f32_e64 v16, 1.0, s17
11589+
; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16
11590+
; SI-NEXT: v_mul_f32_e64 v15, 1.0, s19
11591+
; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18
11592+
; SI-NEXT: v_mul_f32_e64 v14, 1.0, s21
11593+
; SI-NEXT: v_mul_f32_e64 v7, 1.0, s20
11594+
; SI-NEXT: v_mul_f32_e64 v13, 1.0, s23
11595+
; SI-NEXT: v_mul_f32_e64 v5, 1.0, s22
1160011596
; SI-NEXT: s_cbranch_scc0 .LBB67_4
1160111597
; SI-NEXT: ; %bb.1: ; %cmp.false
11602-
; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15
11603-
; SI-NEXT: v_lshr_b64 v[0:1], v[10:11], 16
11604-
; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v14
11605-
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12
11606-
; SI-NEXT: v_lshr_b64 v[1:2], v[8:9], 16
11607-
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v13
11608-
; SI-NEXT: v_lshr_b64 v[16:17], v[4:5], 16
11609-
; SI-NEXT: v_lshr_b64 v[2:3], v[6:7], 16
11610-
; SI-NEXT: v_mov_b32_e32 v3, v16
11598+
; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v16
11599+
; SI-NEXT: v_lshr_b64 v[0:1], v[11:12], 16
11600+
; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v15
11601+
; SI-NEXT: v_lshr_b64 v[1:2], v[9:10], 16
11602+
; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v14
11603+
; SI-NEXT: v_lshr_b64 v[2:3], v[7:8], 16
11604+
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v13
11605+
; SI-NEXT: v_lshr_b64 v[3:4], v[5:6], 16
1161111606
; SI-NEXT: s_cbranch_execnz .LBB67_3
1161211607
; SI-NEXT: .LBB67_2: ; %cmp.true
11613-
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15
11614-
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10
11608+
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16
11609+
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11
1161511610
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
1161611611
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
1161711612
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1161811613
; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
11619-
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v14
11620-
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8
11614+
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v15
11615+
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9
1162111616
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
1162211617
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
1162311618
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
11624-
; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v12
1162511619
; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
11626-
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v13
11627-
; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
11628-
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
11629-
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
11620+
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v14
11621+
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7
1163011622
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
11631-
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
11632-
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
1163311623
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
1163411624
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
11635-
; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16
1163611625
; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16
11637-
; SI-NEXT: v_mov_b32_e32 v3, v4
11626+
; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v13
11627+
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
11628+
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
11629+
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
11630+
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
11631+
; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16
1163811632
; SI-NEXT: .LBB67_3: ; %end
1163911633
; SI-NEXT: s_setpc_b64 s[30:31]
1164011634
; SI-NEXT: .LBB67_4:
@@ -15361,52 +15355,50 @@ define inreg <2 x double> @bitcast_v8bf16_to_v2f64_scalar(<8 x bfloat> inreg %a,
1536115355
; SI: ; %bb.0:
1536215356
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1536315357
; SI-NEXT: s_cmp_lg_u32 s24, 0
15364-
; SI-NEXT: v_mul_f32_e64 v15, 1.0, s17
15365-
; SI-NEXT: v_mul_f32_e64 v10, 1.0, s16
15366-
; SI-NEXT: v_mul_f32_e64 v14, 1.0, s19
15367-
; SI-NEXT: v_mul_f32_e64 v8, 1.0, s18
15368-
; SI-NEXT: v_mul_f32_e64 v13, 1.0, s21
15369-
; SI-NEXT: v_mul_f32_e64 v6, 1.0, s20
15370-
; SI-NEXT: v_mul_f32_e64 v12, 1.0, s23
15371-
; SI-NEXT: v_mul_f32_e64 v4, 1.0, s22
15358+
; SI-NEXT: v_mul_f32_e64 v16, 1.0, s17
15359+
; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16
15360+
; SI-NEXT: v_mul_f32_e64 v15, 1.0, s19
15361+
; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18
15362+
; SI-NEXT: v_mul_f32_e64 v14, 1.0, s21
15363+
; SI-NEXT: v_mul_f32_e64 v7, 1.0, s20
15364+
; SI-NEXT: v_mul_f32_e64 v13, 1.0, s23
15365+
; SI-NEXT: v_mul_f32_e64 v5, 1.0, s22
1537215366
; SI-NEXT: s_cbranch_scc0 .LBB83_4
1537315367
; SI-NEXT: ; %bb.1: ; %cmp.false
15374-
; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15
15375-
; SI-NEXT: v_lshr_b64 v[0:1], v[10:11], 16
15376-
; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v14
15377-
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12
15378-
; SI-NEXT: v_lshr_b64 v[1:2], v[8:9], 16
15379-
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v13
15380-
; SI-NEXT: v_lshr_b64 v[16:17], v[4:5], 16
15381-
; SI-NEXT: v_lshr_b64 v[2:3], v[6:7], 16
15382-
; SI-NEXT: v_mov_b32_e32 v3, v16
15368+
; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v16
15369+
; SI-NEXT: v_lshr_b64 v[0:1], v[11:12], 16
15370+
; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v15
15371+
; SI-NEXT: v_lshr_b64 v[1:2], v[9:10], 16
15372+
; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v14
15373+
; SI-NEXT: v_lshr_b64 v[2:3], v[7:8], 16
15374+
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v13
15375+
; SI-NEXT: v_lshr_b64 v[3:4], v[5:6], 16
1538315376
; SI-NEXT: s_cbranch_execnz .LBB83_3
1538415377
; SI-NEXT: .LBB83_2: ; %cmp.true
15385-
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15
15386-
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10
15378+
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16
15379+
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11
1538715380
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
1538815381
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
1538915382
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1539015383
; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
15391-
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v14
15392-
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8
15384+
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v15
15385+
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9
1539315386
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
1539415387
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
1539515388
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
15396-
; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v12
1539715389
; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
15398-
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v13
15399-
; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
15400-
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
15401-
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
15390+
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v14
15391+
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7
1540215392
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
15403-
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
15404-
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
1540515393
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
1540615394
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
15407-
; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16
1540815395
; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16
15409-
; SI-NEXT: v_mov_b32_e32 v3, v4
15396+
; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v13
15397+
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
15398+
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
15399+
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
15400+
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
15401+
; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16
1541015402
; SI-NEXT: .LBB83_3: ; %end
1541115403
; SI-NEXT: s_setpc_b64 s[30:31]
1541215404
; SI-NEXT: .LBB83_4:

0 commit comments

Comments
 (0)