Skip to content

Commit 5fb2f95

Browse files
committed
Revert "[AMDGPU] Add freeze for LowerSELECT (llvm#148796)"
This causes the rocPRIM build to run indefinitely. This reverts commit f761d73.
1 parent 281b161 commit 5fb2f95

13 files changed

+759
-803
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11133,7 +11133,7 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
1113311133
assert(VT.getSizeInBits() == 64);
1113411134

1113511135
SDLoc DL(Op);
11136-
SDValue Cond = DAG.getFreeze(Op.getOperand(0));
11136+
SDValue Cond = Op.getOperand(0);
1113711137

1113811138
SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
1113911139
SDValue One = DAG.getConstant(1, DL, MVT::i32);

llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll

Lines changed: 7 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -7791,7 +7791,7 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
77917791
;
77927792
; GFX6-LABEL: sdiv_i64_pow2_shl_denom:
77937793
; GFX6: ; %bb.0:
7794-
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
7794+
; GFX6-NEXT: s_load_dword s0, s[4:5], 0xd
77957795
; GFX6-NEXT: s_mov_b32 s7, 0xf000
77967796
; GFX6-NEXT: s_mov_b32 s6, -1
77977797
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
@@ -7927,7 +7927,7 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
79277927
;
79287928
; GFX9-LABEL: sdiv_i64_pow2_shl_denom:
79297929
; GFX9: ; %bb.0:
7930-
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
7930+
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x34
79317931
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
79327932
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
79337933
; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s0
@@ -8982,7 +8982,7 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
89828982
;
89838983
; GFX6-LABEL: srem_i64_pow2_shl_denom:
89848984
; GFX6: ; %bb.0:
8985-
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
8985+
; GFX6-NEXT: s_load_dword s0, s[4:5], 0xd
89868986
; GFX6-NEXT: s_mov_b32 s7, 0xf000
89878987
; GFX6-NEXT: s_mov_b32 s6, -1
89888988
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
@@ -9116,7 +9116,7 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
91169116
;
91179117
; GFX9-LABEL: srem_i64_pow2_shl_denom:
91189118
; GFX9: ; %bb.0:
9119-
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
9119+
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x34
91209120
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
91219121
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
91229122
; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s0
@@ -10096,15 +10096,9 @@ define i64 @udiv_i64_9divbits(i8 %size) {
1009610096
}
1009710097

1009810098
define <2 x i64> @srem_zero_zero() {
10099-
; GFX6-LABEL: srem_zero_zero:
10100-
; GFX6: ; %bb.0: ; %entry
10101-
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10102-
; GFX6-NEXT: s_setpc_b64 s[30:31]
10103-
;
10104-
; GFX9-LABEL: srem_zero_zero:
10105-
; GFX9: ; %bb.0: ; %entry
10106-
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10107-
; GFX9-NEXT: s_setpc_b64 s[30:31]
10099+
; GCN-LABEL: kernel:
10100+
; GCN: ; %bb.0: ; %entry
10101+
; GCN-NEXT: s_endpgm
1010810102
entry:
1010910103
%B = srem <2 x i64> zeroinitializer, zeroinitializer
1011010104
ret <2 x i64> %B

llvm/test/CodeGen/AMDGPU/div_i128.ll

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -521,19 +521,16 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
521521
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
522522
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6
523523
; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[4:5], s[6:7]
524-
; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9]
525524
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11
526-
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[12:13]
527-
; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9]
525+
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[8:9]
528526
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10
529-
; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[12:13]
527+
; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[8:9]
530528
; GFX9-O0-NEXT: ; implicit-def: $sgpr12
531529
; GFX9-O0-NEXT: ; implicit-def: $sgpr12
532530
; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
533531
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4
534-
; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9]
535532
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11
536-
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[12:13]
533+
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[8:9]
537534
; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10
538535
; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[8:9]
539536
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
@@ -2713,19 +2710,16 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
27132710
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
27142711
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6
27152712
; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[4:5], s[6:7]
2716-
; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9]
27172713
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11
2718-
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[12:13]
2719-
; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9]
2714+
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[8:9]
27202715
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10
2721-
; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[12:13]
2716+
; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[8:9]
27222717
; GFX9-O0-NEXT: ; implicit-def: $sgpr12
27232718
; GFX9-O0-NEXT: ; implicit-def: $sgpr12
27242719
; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
27252720
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4
2726-
; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9]
27272721
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11
2728-
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[12:13]
2722+
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[8:9]
27292723
; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10
27302724
; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[8:9]
27312725
; GFX9-O0-NEXT: ; implicit-def: $sgpr8

llvm/test/CodeGen/AMDGPU/llvm.rint.f64.ll

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
21
; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
32
; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
43
; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s

llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,13 +76,12 @@ define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) %
7676
; SI-NEXT: s_waitcnt lgkmcnt(0)
7777
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
7878
; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
79-
; SI-NEXT: s_movk_i32 s4, 0xfc01
8079
; SI-NEXT: s_mov_b32 s2, -1
8180
; SI-NEXT: s_mov_b32 s3, 0xfffff
8281
; SI-NEXT: v_mov_b32_e32 v8, 0x3ff00000
8382
; SI-NEXT: s_waitcnt vmcnt(0)
8483
; SI-NEXT: v_bfe_u32 v4, v3, 20, 11
85-
; SI-NEXT: v_add_i32_e32 v6, vcc, s4, v4
84+
; SI-NEXT: v_add_i32_e32 v6, vcc, 0xfffffc01, v4
8685
; SI-NEXT: v_lshr_b64 v[4:5], s[2:3], v6
8786
; SI-NEXT: v_and_b32_e32 v7, 0x80000000, v3
8887
; SI-NEXT: v_not_b32_e32 v5, v5

llvm/test/CodeGen/AMDGPU/rem_i128.ll

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -559,19 +559,16 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
559559
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
560560
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6
561561
; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[4:5], s[6:7]
562-
; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9]
563562
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11
564-
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[12:13]
565-
; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9]
563+
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[8:9]
566564
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10
567-
; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[12:13]
565+
; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[8:9]
568566
; GFX9-O0-NEXT: ; implicit-def: $sgpr12
569567
; GFX9-O0-NEXT: ; implicit-def: $sgpr12
570568
; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
571569
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4
572-
; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9]
573570
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11
574-
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[12:13]
571+
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[8:9]
575572
; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10
576573
; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[8:9]
577574
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
@@ -1946,19 +1943,16 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
19461943
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
19471944
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6
19481945
; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[4:5], s[6:7]
1949-
; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9]
19501946
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11
1951-
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[12:13]
1952-
; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9]
1947+
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[8:9]
19531948
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10
1954-
; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[12:13]
1949+
; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[8:9]
19551950
; GFX9-O0-NEXT: ; implicit-def: $sgpr12
19561951
; GFX9-O0-NEXT: ; implicit-def: $sgpr12
19571952
; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
19581953
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4
1959-
; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9]
19601954
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11
1961-
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[12:13]
1955+
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[8:9]
19621956
; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10
19631957
; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[8:9]
19641958
; GFX9-O0-NEXT: ; implicit-def: $sgpr8

llvm/test/CodeGen/AMDGPU/select-undef.ll

Lines changed: 0 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -841,23 +841,3 @@ ret:
841841
ret void
842842
}
843843

844-
define i64 @poison_should_freeze(i1 %cond1, i32 %val, i16 %val2, i64 %a, i64 %b) {
845-
; GCN-LABEL: poison_should_freeze:
846-
; GCN: ; %bb.0:
847-
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
848-
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
849-
; GCN-NEXT: v_mov_b32_e32 v7, 0x5040100
850-
; GCN-NEXT: v_perm_b32 v2, v2, s4, v7
851-
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
852-
; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
853-
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
854-
; GCN-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc
855-
; GCN-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc
856-
; GCN-NEXT: s_setpc_b64 s[30:31]
857-
%poisonv = insertelement <2 x i16> poison, i16 %val2, i32 1
858-
%poison = bitcast <2 x i16> %poisonv to i32
859-
%cond2 = select i1 %cond1, i32 %poison, i32 %val
860-
%cmp = icmp eq i32 %cond2, 0
861-
%select = select i1 %cmp, i64 %a, i64 %b
862-
ret i64 %select
863-
}

llvm/test/CodeGen/AMDGPU/srem.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1819,7 +1819,7 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
18191819
; TAHITI-NEXT: v_mul_hi_u32 v1, v0, v1
18201820
; TAHITI-NEXT: v_mul_lo_u32 v1, v1, v2
18211821
; TAHITI-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
1822-
; TAHITI-NEXT: v_sub_i32_e32 v1, vcc, v0, v2
1822+
; TAHITI-NEXT: v_subrev_i32_e32 v1, vcc, v2, v0
18231823
; TAHITI-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
18241824
; TAHITI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
18251825
; TAHITI-NEXT: v_sub_i32_e32 v1, vcc, v0, v2
@@ -6232,7 +6232,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
62326232
; TONGA-NEXT: v_mul_hi_u32 v8, v14, v8
62336233
; TONGA-NEXT: v_mul_lo_u32 v8, v8, v10
62346234
; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v14, v8
6235-
; TONGA-NEXT: v_sub_u32_e32 v9, vcc, v8, v10
6235+
; TONGA-NEXT: v_subrev_u32_e32 v9, vcc, v10, v8
62366236
; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v8, v10
62376237
; TONGA-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc
62386238
; TONGA-NEXT: v_sub_u32_e32 v9, vcc, v8, v10

llvm/test/CodeGen/AMDGPU/srem64.ll

Lines changed: 44 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -921,60 +921,58 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
921921
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
922922
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
923923
; GCN-NEXT: s_waitcnt lgkmcnt(0)
924-
; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 31
925-
; GCN-NEXT: s_ashr_i64 s[4:5], s[4:5], 31
926-
; GCN-NEXT: s_ashr_i32 s6, s5, 31
927-
; GCN-NEXT: s_add_u32 s4, s4, s6
928-
; GCN-NEXT: s_mov_b32 s7, s6
929-
; GCN-NEXT: s_addc_u32 s5, s5, s6
930-
; GCN-NEXT: s_xor_b64 s[8:9], s[4:5], s[6:7]
924+
; GCN-NEXT: s_ashr_i64 s[10:11], s[2:3], 31
925+
; GCN-NEXT: s_ashr_i64 s[6:7], s[4:5], 31
926+
; GCN-NEXT: s_ashr_i32 s4, s5, 31
927+
; GCN-NEXT: s_add_u32 s6, s6, s4
928+
; GCN-NEXT: s_mov_b32 s5, s4
929+
; GCN-NEXT: s_addc_u32 s7, s7, s4
930+
; GCN-NEXT: s_xor_b64 s[8:9], s[6:7], s[4:5]
931931
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8
932932
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9
933-
; GCN-NEXT: s_sub_u32 s4, 0, s8
934-
; GCN-NEXT: s_subb_u32 s5, 0, s9
935-
; GCN-NEXT: s_ashr_i32 s10, s3, 31
933+
; GCN-NEXT: s_sub_u32 s2, 0, s8
934+
; GCN-NEXT: s_subb_u32 s4, 0, s9
935+
; GCN-NEXT: s_ashr_i32 s12, s3, 31
936936
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
937937
; GCN-NEXT: v_rcp_f32_e32 v0, v0
938-
; GCN-NEXT: s_add_u32 s2, s2, s10
939-
; GCN-NEXT: s_mov_b32 s11, s10
940-
; GCN-NEXT: s_addc_u32 s3, s3, s10
938+
; GCN-NEXT: s_mov_b32 s13, s12
939+
; GCN-NEXT: s_mov_b32 s5, s1
940+
; GCN-NEXT: s_mov_b32 s7, 0xf000
941941
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
942942
; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
943943
; GCN-NEXT: v_trunc_f32_e32 v1, v1
944944
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
945945
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
946946
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
947-
; GCN-NEXT: s_xor_b64 s[12:13], s[2:3], s[10:11]
948-
; GCN-NEXT: s_mov_b32 s7, 0xf000
949-
; GCN-NEXT: v_mul_lo_u32 v2, s4, v1
950-
; GCN-NEXT: v_mul_hi_u32 v3, s4, v0
951-
; GCN-NEXT: v_mul_lo_u32 v5, s5, v0
952-
; GCN-NEXT: v_mul_lo_u32 v4, s4, v0
953947
; GCN-NEXT: s_mov_b32 s6, -1
948+
; GCN-NEXT: v_mul_lo_u32 v2, s2, v1
949+
; GCN-NEXT: v_mul_hi_u32 v3, s2, v0
950+
; GCN-NEXT: v_mul_lo_u32 v5, s4, v0
951+
; GCN-NEXT: v_mul_lo_u32 v4, s2, v0
954952
; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3
955953
; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5
956954
; GCN-NEXT: v_mul_hi_u32 v3, v0, v4
957955
; GCN-NEXT: v_mul_lo_u32 v5, v0, v2
958-
; GCN-NEXT: v_mul_hi_u32 v7, v0, v2
959-
; GCN-NEXT: v_mul_lo_u32 v6, v1, v4
960-
; GCN-NEXT: v_mul_hi_u32 v4, v1, v4
961-
; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5
962-
; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
956+
; GCN-NEXT: v_mul_hi_u32 v6, v0, v2
963957
; GCN-NEXT: v_mul_hi_u32 v7, v1, v2
964958
; GCN-NEXT: v_mul_lo_u32 v2, v1, v2
959+
; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5
960+
; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc
961+
; GCN-NEXT: v_mul_lo_u32 v6, v1, v4
962+
; GCN-NEXT: v_mul_hi_u32 v4, v1, v4
965963
; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6
966964
; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc
967965
; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc
968966
; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2
969967
; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
970968
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2
971969
; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
972-
; GCN-NEXT: v_mul_lo_u32 v2, s4, v1
973-
; GCN-NEXT: v_mul_hi_u32 v3, s4, v0
974-
; GCN-NEXT: v_mul_lo_u32 v4, s5, v0
975-
; GCN-NEXT: s_mov_b32 s5, s1
970+
; GCN-NEXT: v_mul_lo_u32 v2, s2, v1
971+
; GCN-NEXT: v_mul_hi_u32 v3, s2, v0
972+
; GCN-NEXT: v_mul_lo_u32 v4, s4, v0
973+
; GCN-NEXT: s_mov_b32 s4, s0
976974
; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3
977-
; GCN-NEXT: v_mul_lo_u32 v3, s4, v0
975+
; GCN-NEXT: v_mul_lo_u32 v3, s2, v0
978976
; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4
979977
; GCN-NEXT: v_mul_lo_u32 v6, v0, v2
980978
; GCN-NEXT: v_mul_hi_u32 v7, v0, v3
@@ -990,18 +988,20 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
990988
; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
991989
; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2
992990
; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
991+
; GCN-NEXT: s_add_u32 s2, s10, s12
993992
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2
993+
; GCN-NEXT: s_addc_u32 s3, s11, s12
994994
; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
995-
; GCN-NEXT: v_mul_lo_u32 v2, s12, v1
996-
; GCN-NEXT: v_mul_hi_u32 v3, s12, v0
997-
; GCN-NEXT: v_mul_hi_u32 v4, s12, v1
998-
; GCN-NEXT: v_mul_hi_u32 v5, s13, v1
999-
; GCN-NEXT: v_mul_lo_u32 v1, s13, v1
995+
; GCN-NEXT: s_xor_b64 s[10:11], s[2:3], s[12:13]
996+
; GCN-NEXT: v_mul_lo_u32 v2, s10, v1
997+
; GCN-NEXT: v_mul_hi_u32 v3, s10, v0
998+
; GCN-NEXT: v_mul_hi_u32 v4, s10, v1
999+
; GCN-NEXT: v_mul_hi_u32 v5, s11, v1
1000+
; GCN-NEXT: v_mul_lo_u32 v1, s11, v1
10001001
; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2
10011002
; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
1002-
; GCN-NEXT: v_mul_lo_u32 v4, s13, v0
1003-
; GCN-NEXT: v_mul_hi_u32 v0, s13, v0
1004-
; GCN-NEXT: s_mov_b32 s4, s0
1003+
; GCN-NEXT: v_mul_lo_u32 v4, s11, v0
1004+
; GCN-NEXT: v_mul_hi_u32 v0, s11, v0
10051005
; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4
10061006
; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc
10071007
; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc
@@ -1013,9 +1013,9 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
10131013
; GCN-NEXT: v_mul_lo_u32 v0, s8, v0
10141014
; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2
10151015
; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1
1016-
; GCN-NEXT: v_sub_i32_e32 v2, vcc, s13, v1
1016+
; GCN-NEXT: v_sub_i32_e32 v2, vcc, s11, v1
10171017
; GCN-NEXT: v_mov_b32_e32 v3, s9
1018-
; GCN-NEXT: v_sub_i32_e32 v0, vcc, s12, v0
1018+
; GCN-NEXT: v_sub_i32_e32 v0, vcc, s10, v0
10191019
; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
10201020
; GCN-NEXT: v_subrev_i32_e64 v4, s[0:1], s8, v0
10211021
; GCN-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
@@ -1030,7 +1030,7 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
10301030
; GCN-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
10311031
; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6
10321032
; GCN-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[0:1]
1033-
; GCN-NEXT: v_mov_b32_e32 v4, s13
1033+
; GCN-NEXT: v_mov_b32_e32 v4, s11
10341034
; GCN-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc
10351035
; GCN-NEXT: v_cmp_le_u32_e32 vcc, s9, v1
10361036
; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc
@@ -1042,10 +1042,10 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
10421042
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
10431043
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
10441044
; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
1045-
; GCN-NEXT: v_xor_b32_e32 v0, s10, v0
1046-
; GCN-NEXT: v_xor_b32_e32 v1, s10, v1
1047-
; GCN-NEXT: v_mov_b32_e32 v2, s10
1048-
; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s10, v0
1045+
; GCN-NEXT: v_xor_b32_e32 v0, s12, v0
1046+
; GCN-NEXT: v_xor_b32_e32 v1, s12, v1
1047+
; GCN-NEXT: v_mov_b32_e32 v2, s12
1048+
; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0
10491049
; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc
10501050
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
10511051
; GCN-NEXT: s_endpgm

0 commit comments

Comments
 (0)