Skip to content

Commit c37942d

Browse files
authored
[DAG] visitFREEZE - limit freezing of multiple operands (llvm#149797)
This is a partial revert of llvm#145939 (I've kept the BUILD_VECTOR(FREEZE(UNDEF), FREEZE(UNDEF), elt2, ...) canonicalization), as we're getting reports of infinite loops (llvm#148084). The issue appears to be due to deep chains of nodes and how visitFREEZE replaces all instances of an operand with a common frozen version: other users of the original frozen node then get added back to the worklist but might no longer be able to confirm a node isn't poison due to recursion depth limits on isGuaranteedNotToBeUndefOrPoison. The issue still exists with the old implementation, but only allowing a single frozen operand helps prevent cases of interdependent frozen nodes. I'm still working on supporting multiple operands, as it's critical for topological DAG handling, but I need to get a fix in for trunk and 21.x. Fixes llvm#148084
1 parent eaa67a3 commit c37942d

18 files changed

+1847
-1730
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16738,14 +16738,27 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
1673816738
// Fold freeze(op(x, ...)) -> op(freeze(x), ...).
1673916739
// Try to push freeze through instructions that propagate but don't produce
1674016740
// poison as far as possible. If an operand of freeze follows three
16741-
// conditions 1) one-use, and 2) does not produce poison then push
16741+
// conditions 1) one-use, 2) does not produce poison, and 3) has all but one
16742+
// guaranteed-non-poison operands (or is a BUILD_VECTOR or similar) then push
1674216743
// the freeze through to the operands that are not guaranteed non-poison.
1674316744
// NOTE: we will strip poison-generating flags, so ignore them here.
1674416745
if (DAG.canCreateUndefOrPoison(N0, /*PoisonOnly*/ false,
1674516746
/*ConsiderFlags*/ false) ||
1674616747
N0->getNumValues() != 1 || !N0->hasOneUse())
1674716748
return SDValue();
1674816749

16750+
// TODO: we should always allow multiple operands, however this increases the
16751+
// likelihood of infinite loops due to the ReplaceAllUsesOfValueWith call
16752+
// below causing later nodes that share frozen operands to fold again and no
16753+
// longer being able to confirm other operands are not poison due to recursion
16754+
// depth limits on isGuaranteedNotToBeUndefOrPoison.
16755+
bool AllowMultipleMaybePoisonOperands =
16756+
N0.getOpcode() == ISD::SELECT_CC || N0.getOpcode() == ISD::SETCC ||
16757+
N0.getOpcode() == ISD::BUILD_VECTOR ||
16758+
N0.getOpcode() == ISD::BUILD_PAIR ||
16759+
N0.getOpcode() == ISD::VECTOR_SHUFFLE ||
16760+
N0.getOpcode() == ISD::CONCAT_VECTORS || N0.getOpcode() == ISD::FMUL;
16761+
1674916762
// Avoid turning a BUILD_VECTOR that can be recognized as "all zeros", "all
1675016763
// ones" or "constant" into something that depends on FrozenUndef. We can
1675116764
// instead pick undef values to keep those properties, while at the same time
@@ -16772,8 +16785,16 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
1677216785
if (DAG.isGuaranteedNotToBeUndefOrPoison(Op, /*PoisonOnly*/ false,
1677316786
/*Depth*/ 1))
1677416787
continue;
16775-
if (MaybePoisonOperands.insert(Op).second)
16788+
bool HadMaybePoisonOperands = !MaybePoisonOperands.empty();
16789+
bool IsNewMaybePoisonOperand = MaybePoisonOperands.insert(Op).second;
16790+
if (IsNewMaybePoisonOperand)
1677616791
MaybePoisonOperandNumbers.push_back(OpNo);
16792+
if (!HadMaybePoisonOperands)
16793+
continue;
16794+
if (IsNewMaybePoisonOperand && !AllowMultipleMaybePoisonOperands) {
16795+
// Multiple maybe-poison ops when not allowed - bail out.
16796+
return SDValue();
16797+
}
1677716798
}
1677816799
// NOTE: the whole op may be not guaranteed to not be undef or poison because
1677916800
// it could create undef or poison due to its poison-generating flags.

llvm/test/CodeGen/AMDGPU/div_i128.ll

Lines changed: 24 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -475,28 +475,21 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
475475
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
476476
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
477477
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8
478+
; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
479+
; GFX9-O0-NEXT: s_nop 0
480+
; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
478481
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
479482
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
480483
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
481484
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6
482-
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8
483-
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
484-
; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
485-
; GFX9-O0-NEXT: s_nop 0
486-
; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
487-
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5
488-
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4
489-
; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
485+
; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
490486
; GFX9-O0-NEXT: s_nop 0
491-
; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
492-
; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7]
493-
; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9]
487+
; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
488+
; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7]
494489
; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f
495-
; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13]
496-
; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15]
490+
; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13]
497491
; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15]
498-
; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7]
499-
; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15]
492+
; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7]
500493
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15]
501494
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9]
502495
; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6
@@ -507,7 +500,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
507500
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
508501
; GFX9-O0-NEXT: s_mov_b32 s14, s13
509502
; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14
510-
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
511503
; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
512504
; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12
513505
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
@@ -1046,10 +1038,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
10461038
; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
10471039
; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
10481040
; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
1049-
; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
1050-
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
1051-
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
1052-
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
1041+
; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
1042+
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
1043+
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
1044+
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
10531045
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1
10541046
; GFX9-O0-NEXT: s_mov_b32 s5, s6
10551047
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
@@ -2667,28 +2659,21 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
26672659
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
26682660
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
26692661
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8
2662+
; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
2663+
; GFX9-O0-NEXT: s_nop 0
2664+
; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
26702665
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
26712666
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
26722667
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
26732668
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6
2674-
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8
2675-
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
2676-
; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
2677-
; GFX9-O0-NEXT: s_nop 0
2678-
; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
2679-
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5
2680-
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4
2681-
; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
2669+
; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
26822670
; GFX9-O0-NEXT: s_nop 0
2683-
; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
2684-
; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7]
2685-
; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9]
2671+
; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
2672+
; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7]
26862673
; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f
2687-
; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13]
2688-
; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15]
2674+
; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13]
26892675
; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15]
2690-
; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7]
2691-
; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15]
2676+
; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7]
26922677
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15]
26932678
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9]
26942679
; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6
@@ -2699,7 +2684,6 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
26992684
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
27002685
; GFX9-O0-NEXT: s_mov_b32 s14, s13
27012686
; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14
2702-
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
27032687
; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
27042688
; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12
27052689
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
@@ -3238,10 +3222,10 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
32383222
; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
32393223
; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
32403224
; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
3241-
; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
3242-
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
3243-
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
3244-
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
3225+
; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
3226+
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
3227+
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
3228+
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
32453229
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1
32463230
; GFX9-O0-NEXT: s_mov_b32 s5, s6
32473231
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)

llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,13 +76,12 @@ define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) %
7676
; SI-NEXT: s_waitcnt lgkmcnt(0)
7777
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
7878
; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
79-
; SI-NEXT: s_movk_i32 s4, 0xfc01
8079
; SI-NEXT: s_mov_b32 s2, -1
8180
; SI-NEXT: s_mov_b32 s3, 0xfffff
8281
; SI-NEXT: v_mov_b32_e32 v8, 0x3ff00000
8382
; SI-NEXT: s_waitcnt vmcnt(0)
8483
; SI-NEXT: v_bfe_u32 v4, v3, 20, 11
85-
; SI-NEXT: v_add_i32_e32 v6, vcc, s4, v4
84+
; SI-NEXT: v_add_i32_e32 v6, vcc, 0xfffffc01, v4
8685
; SI-NEXT: v_lshr_b64 v[4:5], s[2:3], v6
8786
; SI-NEXT: v_and_b32_e32 v7, 0x80000000, v3
8887
; SI-NEXT: v_not_b32_e32 v5, v5

llvm/test/CodeGen/AMDGPU/rem_i128.ll

Lines changed: 24 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -513,28 +513,21 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
513513
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
514514
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
515515
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8
516+
; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
517+
; GFX9-O0-NEXT: s_nop 0
518+
; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
516519
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
517520
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
518521
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
519522
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6
520-
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8
521-
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
522-
; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
523-
; GFX9-O0-NEXT: s_nop 0
524-
; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
525-
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5
526-
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4
527-
; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
523+
; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
528524
; GFX9-O0-NEXT: s_nop 0
529-
; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
530-
; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7]
531-
; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9]
525+
; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
526+
; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7]
532527
; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f
533-
; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13]
534-
; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15]
528+
; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13]
535529
; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15]
536-
; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7]
537-
; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15]
530+
; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7]
538531
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15]
539532
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9]
540533
; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6
@@ -545,7 +538,6 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
545538
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
546539
; GFX9-O0-NEXT: s_mov_b32 s14, s13
547540
; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14
548-
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
549541
; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
550542
; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12
551543
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
@@ -1084,10 +1076,10 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
10841076
; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
10851077
; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
10861078
; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
1087-
; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
1088-
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
1089-
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
1090-
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
1079+
; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
1080+
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
1081+
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
1082+
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
10911083
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1
10921084
; GFX9-O0-NEXT: s_mov_b32 s5, s6
10931085
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
@@ -1900,28 +1892,21 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
19001892
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
19011893
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
19021894
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8
1895+
; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
1896+
; GFX9-O0-NEXT: s_nop 0
1897+
; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
19031898
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
19041899
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
19051900
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
19061901
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6
1907-
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8
1908-
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
1909-
; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
1910-
; GFX9-O0-NEXT: s_nop 0
1911-
; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
1912-
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5
1913-
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4
1914-
; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
1902+
; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
19151903
; GFX9-O0-NEXT: s_nop 0
1916-
; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
1917-
; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7]
1918-
; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9]
1904+
; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
1905+
; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7]
19191906
; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f
1920-
; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13]
1921-
; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15]
1907+
; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13]
19221908
; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15]
1923-
; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7]
1924-
; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15]
1909+
; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7]
19251910
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15]
19261911
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9]
19271912
; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6
@@ -1932,7 +1917,6 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
19321917
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
19331918
; GFX9-O0-NEXT: s_mov_b32 s14, s13
19341919
; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14
1935-
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
19361920
; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
19371921
; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12
19381922
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
@@ -2471,10 +2455,10 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
24712455
; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
24722456
; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
24732457
; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
2474-
; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
2475-
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
2476-
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
2477-
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
2458+
; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
2459+
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
2460+
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
2461+
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
24782462
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1
24792463
; GFX9-O0-NEXT: s_mov_b32 s5, s6
24802464
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)

llvm/test/CodeGen/AMDGPU/srem.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1819,7 +1819,7 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
18191819
; TAHITI-NEXT: v_mul_hi_u32 v1, v0, v1
18201820
; TAHITI-NEXT: v_mul_lo_u32 v1, v1, v2
18211821
; TAHITI-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
1822-
; TAHITI-NEXT: v_sub_i32_e32 v1, vcc, v0, v2
1822+
; TAHITI-NEXT: v_subrev_i32_e32 v1, vcc, v2, v0
18231823
; TAHITI-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
18241824
; TAHITI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
18251825
; TAHITI-NEXT: v_sub_i32_e32 v1, vcc, v0, v2
@@ -6232,7 +6232,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
62326232
; TONGA-NEXT: v_mul_hi_u32 v8, v14, v8
62336233
; TONGA-NEXT: v_mul_lo_u32 v8, v8, v10
62346234
; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v14, v8
6235-
; TONGA-NEXT: v_sub_u32_e32 v9, vcc, v8, v10
6235+
; TONGA-NEXT: v_subrev_u32_e32 v9, vcc, v10, v8
62366236
; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v8, v10
62376237
; TONGA-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc
62386238
; TONGA-NEXT: v_sub_u32_e32 v9, vcc, v8, v10

0 commit comments

Comments
 (0)