Skip to content

Commit 702bd09

Browse files
authored
[AMDGPU] Add freeze for LowerSELECT (llvm#148796) (llvm#3267)
2 parents 48581ba + 49ee831 commit 702bd09

11 files changed

+808
-765
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -11186,7 +11186,7 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
1118611186
assert(VT.getSizeInBits() == 64);
1118711187

1118811188
SDLoc DL(Op);
11189-
SDValue Cond = Op.getOperand(0);
11189+
SDValue Cond = DAG.getFreeze(Op.getOperand(0));
1119011190

1119111191
SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
1119211192
SDValue One = DAG.getConstant(1, DL, MVT::i32);

llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll

Lines changed: 13 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -7791,7 +7791,7 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
77917791
;
77927792
; GFX6-LABEL: sdiv_i64_pow2_shl_denom:
77937793
; GFX6: ; %bb.0:
7794-
; GFX6-NEXT: s_load_dword s0, s[4:5], 0xd
7794+
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
77957795
; GFX6-NEXT: s_mov_b32 s7, 0xf000
77967796
; GFX6-NEXT: s_mov_b32 s6, -1
77977797
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
@@ -7927,7 +7927,7 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
79277927
;
79287928
; GFX9-LABEL: sdiv_i64_pow2_shl_denom:
79297929
; GFX9: ; %bb.0:
7930-
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x34
7930+
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
79317931
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
79327932
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
79337933
; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s0
@@ -8982,7 +8982,7 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
89828982
;
89838983
; GFX6-LABEL: srem_i64_pow2_shl_denom:
89848984
; GFX6: ; %bb.0:
8985-
; GFX6-NEXT: s_load_dword s0, s[4:5], 0xd
8985+
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
89868986
; GFX6-NEXT: s_mov_b32 s7, 0xf000
89878987
; GFX6-NEXT: s_mov_b32 s6, -1
89888988
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
@@ -9116,7 +9116,7 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
91169116
;
91179117
; GFX9-LABEL: srem_i64_pow2_shl_denom:
91189118
; GFX9: ; %bb.0:
9119-
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x34
9119+
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
91209120
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
91219121
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
91229122
; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s0
@@ -10096,9 +10096,15 @@ define i64 @udiv_i64_9divbits(i8 %size) {
1009610096
}
1009710097

1009810098
define <2 x i64> @srem_zero_zero() {
10099-
; GCN-LABEL: kernel:
10100-
; GCN: ; %bb.0: ; %entry
10101-
; GCN-NEXT: s_endpgm
10099+
; GFX6-LABEL: srem_zero_zero:
10100+
; GFX6: ; %bb.0: ; %entry
10101+
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10102+
; GFX6-NEXT: s_setpc_b64 s[30:31]
10103+
;
10104+
; GFX9-LABEL: srem_zero_zero:
10105+
; GFX9: ; %bb.0: ; %entry
10106+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10107+
; GFX9-NEXT: s_setpc_b64 s[30:31]
1010210108
entry:
1010310109
%B = srem <2 x i64> zeroinitializer, zeroinitializer
1010410110
ret <2 x i64> %B

llvm/test/CodeGen/AMDGPU/div_i128.ll

Lines changed: 12 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -513,16 +513,19 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
513513
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
514514
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6
515515
; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[4:5], s[6:7]
516+
; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9]
516517
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11
517-
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[8:9]
518+
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[12:13]
519+
; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9]
518520
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10
519-
; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[8:9]
521+
; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[12:13]
520522
; GFX9-O0-NEXT: ; implicit-def: $sgpr12
521523
; GFX9-O0-NEXT: ; implicit-def: $sgpr12
522524
; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
523525
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4
526+
; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9]
524527
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11
525-
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[8:9]
528+
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[12:13]
526529
; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10
527530
; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[8:9]
528531
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
@@ -2694,16 +2697,19 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
26942697
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
26952698
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6
26962699
; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[4:5], s[6:7]
2700+
; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9]
26972701
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11
2698-
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[8:9]
2702+
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[12:13]
2703+
; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9]
26992704
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10
2700-
; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[8:9]
2705+
; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[12:13]
27012706
; GFX9-O0-NEXT: ; implicit-def: $sgpr12
27022707
; GFX9-O0-NEXT: ; implicit-def: $sgpr12
27032708
; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
27042709
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4
2710+
; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9]
27052711
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11
2706-
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[8:9]
2712+
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[12:13]
27072713
; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10
27082714
; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[8:9]
27092715
; GFX9-O0-NEXT: ; implicit-def: $sgpr8

llvm/test/CodeGen/AMDGPU/isel-whole-wave-functions.ll

Lines changed: 10 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -141,18 +141,19 @@ define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) {
141141
; DAGISEL-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
142142
; DAGISEL-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
143143
; DAGISEL-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
144-
; DAGISEL-NEXT: [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32_xm0_xexec = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
145-
; DAGISEL-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
144+
; DAGISEL-NEXT: [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32 = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
145+
; DAGISEL-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[SI_WHOLE_WAVE_FUNC_SETUP]]
146+
; DAGISEL-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
146147
; DAGISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
147-
; DAGISEL-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[S_MOV_B32_]], 0, killed [[COPY4]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec
148-
; DAGISEL-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
148+
; DAGISEL-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[S_MOV_B32_]], 0, killed [[COPY5]], [[COPY4]], implicit $exec
149+
; DAGISEL-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
149150
; DAGISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 5
150-
; DAGISEL-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[S_MOV_B32_1]], 0, killed [[COPY5]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec
151-
; DAGISEL-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
152-
; DAGISEL-NEXT: [[V_CNDMASK_B32_e64_2:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[S_MOV_B32_]], 0, killed [[COPY6]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec
153-
; DAGISEL-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
151+
; DAGISEL-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[S_MOV_B32_1]], 0, killed [[COPY6]], [[COPY4]], implicit $exec
152+
; DAGISEL-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
153+
; DAGISEL-NEXT: [[V_CNDMASK_B32_e64_2:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[S_MOV_B32_]], 0, killed [[COPY7]], [[COPY4]], implicit $exec
154+
; DAGISEL-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
154155
; DAGISEL-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 3
155-
; DAGISEL-NEXT: [[V_CNDMASK_B32_e64_3:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[S_MOV_B32_2]], 0, killed [[COPY7]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec
156+
; DAGISEL-NEXT: [[V_CNDMASK_B32_e64_3:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[S_MOV_B32_2]], 0, killed [[COPY8]], [[COPY4]], implicit $exec
156157
; DAGISEL-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_CNDMASK_B32_e64_1]], killed [[V_CNDMASK_B32_e64_3]], 1, 1, 1, 0, implicit $exec
157158
; DAGISEL-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_CNDMASK_B32_e64_]], killed [[V_CNDMASK_B32_e64_2]], 1, 1, 1, 0, implicit $exec
158159
; DAGISEL-NEXT: $vgpr0 = COPY [[V_MOV_B32_dpp]]

llvm/test/CodeGen/AMDGPU/rem_i128.ll

Lines changed: 12 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -551,16 +551,19 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
551551
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
552552
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6
553553
; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[4:5], s[6:7]
554+
; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9]
554555
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11
555-
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[8:9]
556+
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[12:13]
557+
; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9]
556558
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10
557-
; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[8:9]
559+
; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[12:13]
558560
; GFX9-O0-NEXT: ; implicit-def: $sgpr12
559561
; GFX9-O0-NEXT: ; implicit-def: $sgpr12
560562
; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
561563
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4
564+
; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9]
562565
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11
563-
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[8:9]
566+
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[12:13]
564567
; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10
565568
; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[8:9]
566569
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
@@ -1927,16 +1930,19 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
19271930
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
19281931
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6
19291932
; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[4:5], s[6:7]
1933+
; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9]
19301934
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11
1931-
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[8:9]
1935+
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[12:13]
1936+
; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9]
19321937
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10
1933-
; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[8:9]
1938+
; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[12:13]
19341939
; GFX9-O0-NEXT: ; implicit-def: $sgpr12
19351940
; GFX9-O0-NEXT: ; implicit-def: $sgpr12
19361941
; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
19371942
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4
1943+
; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9]
19381944
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11
1939-
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[8:9]
1945+
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[12:13]
19401946
; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10
19411947
; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[8:9]
19421948
; GFX9-O0-NEXT: ; implicit-def: $sgpr8

llvm/test/CodeGen/AMDGPU/select-undef.ll

Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -841,3 +841,23 @@ ret:
841841
ret void
842842
}
843843

844+
define i64 @poison_should_freeze(i1 %cond1, i32 %val, i16 %val2, i64 %a, i64 %b) {
845+
; GCN-LABEL: poison_should_freeze:
846+
; GCN: ; %bb.0:
847+
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
848+
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
849+
; GCN-NEXT: v_mov_b32_e32 v7, 0x5040100
850+
; GCN-NEXT: v_perm_b32 v2, v2, s4, v7
851+
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
852+
; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
853+
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
854+
; GCN-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc
855+
; GCN-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc
856+
; GCN-NEXT: s_setpc_b64 s[30:31]
857+
%poisonv = insertelement <2 x i16> poison, i16 %val2, i32 1
858+
%poison = bitcast <2 x i16> %poisonv to i32
859+
%cond2 = select i1 %cond1, i32 %poison, i32 %val
860+
%cmp = icmp eq i32 %cond2, 0
861+
%select = select i1 %cmp, i64 %a, i64 %b
862+
ret i64 %select
863+
}

llvm/test/CodeGen/AMDGPU/srem64.ll

Lines changed: 44 additions & 44 deletions
Original file line number | Diff line number | Diff line change
@@ -921,58 +921,60 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
921921
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
922922
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
923923
; GCN-NEXT: s_waitcnt lgkmcnt(0)
924-
; GCN-NEXT: s_ashr_i64 s[10:11], s[2:3], 31
925-
; GCN-NEXT: s_ashr_i64 s[6:7], s[4:5], 31
926-
; GCN-NEXT: s_ashr_i32 s4, s5, 31
927-
; GCN-NEXT: s_add_u32 s6, s6, s4
928-
; GCN-NEXT: s_mov_b32 s5, s4
929-
; GCN-NEXT: s_addc_u32 s7, s7, s4
930-
; GCN-NEXT: s_xor_b64 s[8:9], s[6:7], s[4:5]
924+
; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 31
925+
; GCN-NEXT: s_ashr_i64 s[4:5], s[4:5], 31
926+
; GCN-NEXT: s_ashr_i32 s6, s5, 31
927+
; GCN-NEXT: s_add_u32 s4, s4, s6
928+
; GCN-NEXT: s_mov_b32 s7, s6
929+
; GCN-NEXT: s_addc_u32 s5, s5, s6
930+
; GCN-NEXT: s_xor_b64 s[8:9], s[4:5], s[6:7]
931931
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8
932932
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9
933-
; GCN-NEXT: s_sub_u32 s2, 0, s8
934-
; GCN-NEXT: s_subb_u32 s4, 0, s9
935-
; GCN-NEXT: s_ashr_i32 s12, s3, 31
933+
; GCN-NEXT: s_sub_u32 s4, 0, s8
934+
; GCN-NEXT: s_subb_u32 s5, 0, s9
935+
; GCN-NEXT: s_ashr_i32 s10, s3, 31
936936
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
937937
; GCN-NEXT: v_rcp_f32_e32 v0, v0
938-
; GCN-NEXT: s_mov_b32 s13, s12
939-
; GCN-NEXT: s_mov_b32 s5, s1
940-
; GCN-NEXT: s_mov_b32 s7, 0xf000
938+
; GCN-NEXT: s_add_u32 s2, s2, s10
939+
; GCN-NEXT: s_mov_b32 s11, s10
940+
; GCN-NEXT: s_addc_u32 s3, s3, s10
941941
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
942942
; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
943943
; GCN-NEXT: v_trunc_f32_e32 v1, v1
944944
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
945945
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
946946
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
947+
; GCN-NEXT: s_xor_b64 s[12:13], s[2:3], s[10:11]
948+
; GCN-NEXT: s_mov_b32 s7, 0xf000
949+
; GCN-NEXT: v_mul_lo_u32 v2, s4, v1
950+
; GCN-NEXT: v_mul_hi_u32 v3, s4, v0
951+
; GCN-NEXT: v_mul_lo_u32 v5, s5, v0
952+
; GCN-NEXT: v_mul_lo_u32 v4, s4, v0
947953
; GCN-NEXT: s_mov_b32 s6, -1
948-
; GCN-NEXT: v_mul_lo_u32 v2, s2, v1
949-
; GCN-NEXT: v_mul_hi_u32 v3, s2, v0
950-
; GCN-NEXT: v_mul_lo_u32 v5, s4, v0
951-
; GCN-NEXT: v_mul_lo_u32 v4, s2, v0
952954
; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3
953955
; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5
954956
; GCN-NEXT: v_mul_hi_u32 v3, v0, v4
955957
; GCN-NEXT: v_mul_lo_u32 v5, v0, v2
956-
; GCN-NEXT: v_mul_hi_u32 v6, v0, v2
957-
; GCN-NEXT: v_mul_hi_u32 v7, v1, v2
958-
; GCN-NEXT: v_mul_lo_u32 v2, v1, v2
959-
; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5
960-
; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc
958+
; GCN-NEXT: v_mul_hi_u32 v7, v0, v2
961959
; GCN-NEXT: v_mul_lo_u32 v6, v1, v4
962960
; GCN-NEXT: v_mul_hi_u32 v4, v1, v4
961+
; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5
962+
; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
963+
; GCN-NEXT: v_mul_hi_u32 v7, v1, v2
964+
; GCN-NEXT: v_mul_lo_u32 v2, v1, v2
963965
; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6
964966
; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc
965967
; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc
966968
; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2
967969
; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
968970
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2
969971
; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
970-
; GCN-NEXT: v_mul_lo_u32 v2, s2, v1
971-
; GCN-NEXT: v_mul_hi_u32 v3, s2, v0
972-
; GCN-NEXT: v_mul_lo_u32 v4, s4, v0
973-
; GCN-NEXT: s_mov_b32 s4, s0
972+
; GCN-NEXT: v_mul_lo_u32 v2, s4, v1
973+
; GCN-NEXT: v_mul_hi_u32 v3, s4, v0
974+
; GCN-NEXT: v_mul_lo_u32 v4, s5, v0
975+
; GCN-NEXT: s_mov_b32 s5, s1
974976
; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3
975-
; GCN-NEXT: v_mul_lo_u32 v3, s2, v0
977+
; GCN-NEXT: v_mul_lo_u32 v3, s4, v0
976978
; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4
977979
; GCN-NEXT: v_mul_lo_u32 v6, v0, v2
978980
; GCN-NEXT: v_mul_hi_u32 v7, v0, v3
@@ -988,20 +990,18 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
988990
; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
989991
; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2
990992
; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
991-
; GCN-NEXT: s_add_u32 s2, s10, s12
992993
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2
993-
; GCN-NEXT: s_addc_u32 s3, s11, s12
994994
; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
995-
; GCN-NEXT: s_xor_b64 s[10:11], s[2:3], s[12:13]
996-
; GCN-NEXT: v_mul_lo_u32 v2, s10, v1
997-
; GCN-NEXT: v_mul_hi_u32 v3, s10, v0
998-
; GCN-NEXT: v_mul_hi_u32 v4, s10, v1
999-
; GCN-NEXT: v_mul_hi_u32 v5, s11, v1
1000-
; GCN-NEXT: v_mul_lo_u32 v1, s11, v1
995+
; GCN-NEXT: v_mul_lo_u32 v2, s12, v1
996+
; GCN-NEXT: v_mul_hi_u32 v3, s12, v0
997+
; GCN-NEXT: v_mul_hi_u32 v4, s12, v1
998+
; GCN-NEXT: v_mul_hi_u32 v5, s13, v1
999+
; GCN-NEXT: v_mul_lo_u32 v1, s13, v1
10011000
; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2
10021001
; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
1003-
; GCN-NEXT: v_mul_lo_u32 v4, s11, v0
1004-
; GCN-NEXT: v_mul_hi_u32 v0, s11, v0
1002+
; GCN-NEXT: v_mul_lo_u32 v4, s13, v0
1003+
; GCN-NEXT: v_mul_hi_u32 v0, s13, v0
1004+
; GCN-NEXT: s_mov_b32 s4, s0
10051005
; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4
10061006
; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc
10071007
; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc
@@ -1013,9 +1013,9 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
10131013
; GCN-NEXT: v_mul_lo_u32 v0, s8, v0
10141014
; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2
10151015
; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1
1016-
; GCN-NEXT: v_sub_i32_e32 v2, vcc, s11, v1
1016+
; GCN-NEXT: v_sub_i32_e32 v2, vcc, s13, v1
10171017
; GCN-NEXT: v_mov_b32_e32 v3, s9
1018-
; GCN-NEXT: v_sub_i32_e32 v0, vcc, s10, v0
1018+
; GCN-NEXT: v_sub_i32_e32 v0, vcc, s12, v0
10191019
; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
10201020
; GCN-NEXT: v_subrev_i32_e64 v4, s[0:1], s8, v0
10211021
; GCN-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
@@ -1030,7 +1030,7 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
10301030
; GCN-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
10311031
; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6
10321032
; GCN-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[0:1]
1033-
; GCN-NEXT: v_mov_b32_e32 v4, s11
1033+
; GCN-NEXT: v_mov_b32_e32 v4, s13
10341034
; GCN-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc
10351035
; GCN-NEXT: v_cmp_le_u32_e32 vcc, s9, v1
10361036
; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc
@@ -1042,10 +1042,10 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
10421042
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
10431043
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
10441044
; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
1045-
; GCN-NEXT: v_xor_b32_e32 v0, s12, v0
1046-
; GCN-NEXT: v_xor_b32_e32 v1, s12, v1
1047-
; GCN-NEXT: v_mov_b32_e32 v2, s12
1048-
; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0
1045+
; GCN-NEXT: v_xor_b32_e32 v0, s10, v0
1046+
; GCN-NEXT: v_xor_b32_e32 v1, s10, v1
1047+
; GCN-NEXT: v_mov_b32_e32 v2, s10
1048+
; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s10, v0
10491049
; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc
10501050
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
10511051
; GCN-NEXT: s_endpgm

0 commit comments

Comments
 (0)