Skip to content

Commit 8fc553e

Browse files
committed
[WIP][DAG] visitFREEZE - always allow freezing multiple operands
Remove the limited multiple-operand freeze handling: always freeze all operands, and rely on later visitFREEZE calls to merge the frozen/unfrozen versions of each node to prevent infinite loops. Fixes #149798
1 parent 9315d70 commit 8fc553e

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

46 files changed

+2493
-2640
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 4 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -16810,18 +16810,6 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
1681016810
N0->getNumValues() != 1 || !N0->hasOneUse())
1681116811
return SDValue();
1681216812

16813-
// TOOD: we should always allow multiple operands, however this increases the
16814-
// likelihood of infinite loops due to the ReplaceAllUsesOfValueWith call
16815-
// below causing later nodes that share frozen operands to fold again and no
16816-
// longer being able to confirm other operands are not poison due to recursion
16817-
// depth limits on isGuaranteedNotToBeUndefOrPoison.
16818-
bool AllowMultipleMaybePoisonOperands =
16819-
N0.getOpcode() == ISD::SELECT_CC || N0.getOpcode() == ISD::SETCC ||
16820-
N0.getOpcode() == ISD::BUILD_VECTOR ||
16821-
N0.getOpcode() == ISD::BUILD_PAIR ||
16822-
N0.getOpcode() == ISD::VECTOR_SHUFFLE ||
16823-
N0.getOpcode() == ISD::CONCAT_VECTORS || N0.getOpcode() == ISD::FMUL;
16824-
1682516813
// Avoid turning a BUILD_VECTOR that can be recognized as "all zeros", "all
1682616814
// ones" or "constant" into something that depends on FrozenUndef. We can
1682716815
// instead pick undef values to keep those properties, while at the same time
@@ -16842,74 +16830,13 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
1684216830
}
1684316831
}
1684416832

16845-
SmallSet<SDValue, 8> MaybePoisonOperands;
16846-
SmallVector<unsigned, 8> MaybePoisonOperandNumbers;
16847-
for (auto [OpNo, Op] : enumerate(N0->ops())) {
16848-
if (DAG.isGuaranteedNotToBeUndefOrPoison(Op, /*PoisonOnly=*/false))
16849-
continue;
16850-
bool HadMaybePoisonOperands = !MaybePoisonOperands.empty();
16851-
bool IsNewMaybePoisonOperand = MaybePoisonOperands.insert(Op).second;
16852-
if (IsNewMaybePoisonOperand)
16853-
MaybePoisonOperandNumbers.push_back(OpNo);
16854-
if (!HadMaybePoisonOperands)
16855-
continue;
16856-
if (IsNewMaybePoisonOperand && !AllowMultipleMaybePoisonOperands) {
16857-
// Multiple maybe-poison ops when not allowed - bail out.
16858-
return SDValue();
16859-
}
16860-
}
16861-
// NOTE: the whole op may be not guaranteed to not be undef or poison because
16862-
// it could create undef or poison due to it's poison-generating flags.
16863-
// So not finding any maybe-poison operands is fine.
16864-
16865-
for (unsigned OpNo : MaybePoisonOperandNumbers) {
16866-
// N0 can mutate during iteration, so make sure to refetch the maybe poison
16867-
// operands via the operand numbers. The typical scenario is that we have
16868-
// something like this
16869-
// t262: i32 = freeze t181
16870-
// t150: i32 = ctlz_zero_undef t262
16871-
// t184: i32 = ctlz_zero_undef t181
16872-
// t268: i32 = select_cc t181, Constant:i32<0>, t184, t186, setne:ch
16873-
// When freezing the t181 operand we get t262 back, and then the
16874-
// ReplaceAllUsesOfValueWith call will not only replace t181 by t262, but
16875-
// also recursively replace t184 by t150.
16876-
SDValue MaybePoisonOperand = N->getOperand(0).getOperand(OpNo);
16877-
// Don't replace every single UNDEF everywhere with frozen UNDEF, though.
16878-
if (MaybePoisonOperand.isUndef())
16879-
continue;
16880-
// First, freeze each offending operand.
16881-
SDValue FrozenMaybePoisonOperand = DAG.getFreeze(MaybePoisonOperand);
16882-
// Then, change all other uses of unfrozen operand to use frozen operand.
16883-
DAG.ReplaceAllUsesOfValueWith(MaybePoisonOperand, FrozenMaybePoisonOperand);
16884-
if (FrozenMaybePoisonOperand.getOpcode() == ISD::FREEZE &&
16885-
FrozenMaybePoisonOperand.getOperand(0) == FrozenMaybePoisonOperand) {
16886-
// But, that also updated the use in the freeze we just created, thus
16887-
// creating a cycle in a DAG. Let's undo that by mutating the freeze.
16888-
DAG.UpdateNodeOperands(FrozenMaybePoisonOperand.getNode(),
16889-
MaybePoisonOperand);
16890-
}
16891-
16892-
// This node has been merged with another.
16893-
if (N->getOpcode() == ISD::DELETED_NODE)
16894-
return SDValue(N, 0);
16895-
}
16896-
16897-
assert(N->getOpcode() != ISD::DELETED_NODE && "Node was deleted!");
16898-
16899-
// The whole node may have been updated, so the value we were holding
16900-
// may no longer be valid. Re-fetch the operand we're `freeze`ing.
16901-
N0 = N->getOperand(0);
16833+
// Collect and freeze all operands.
16834+
SmallVector<SDValue> Ops(N0->ops());
16835+
for (auto &Op : Ops)
16836+
Op = DAG.getFreeze(Op);
1690216837

1690316838
// Finally, recreate the node, its operands were updated to use
1690416839
// frozen operands, so we just need to use its "original" operands.
16905-
SmallVector<SDValue> Ops(N0->ops());
16906-
// TODO: ISD::UNDEF and ISD::POISON should get separate handling, but best
16907-
// leave for a future patch.
16908-
for (SDValue &Op : Ops) {
16909-
if (Op.isUndef())
16910-
Op = DAG.getFreeze(Op);
16911-
}
16912-
1691316840
SDLoc DL(N0);
1691416841

1691516842
// Special case handling for ShuffleVectorSDNode nodes.

llvm/test/CodeGen/AMDGPU/bf16-conversions.ll

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -188,7 +188,7 @@ define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v(<2 x double> %src) {
188188
; GFX-950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v7
189189
; GFX-950-NEXT: v_cndmask_b32_e64 v2, -1, 1, s[2:3]
190190
; GFX-950-NEXT: v_add_u32_e32 v2, v6, v2
191-
; GFX-950-NEXT: s_or_b64 vcc, s[0:1], vcc
191+
; GFX-950-NEXT: s_or_b64 vcc, vcc, s[0:1]
192192
; GFX-950-NEXT: v_cvt_f32_f64_e32 v5, v[0:1]
193193
; GFX-950-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
194194
; GFX-950-NEXT: v_cvt_f64_f32_e32 v[2:3], v5
@@ -225,7 +225,7 @@ define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v(<2 x double> %src) {
225225
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
226226
; GFX1250-NEXT: v_add_nc_u32_e32 v0, v9, v0
227227
; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, 1, v11
228-
; GFX1250-NEXT: s_or_b32 vcc_lo, s1, vcc_lo
228+
; GFX1250-NEXT: s_or_b32 vcc_lo, vcc_lo, s1
229229
; GFX1250-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo
230230
; GFX1250-NEXT: s_or_b32 vcc_lo, s2, s0
231231
; GFX1250-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo
@@ -344,7 +344,7 @@ define amdgpu_ps void @fptrunc_f32_to_bf16(float %a, ptr %out) {
344344
; GFX1250: ; %bb.0: ; %entry
345345
; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
346346
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
347-
; GFX1250-NEXT: flat_store_b16 v[2:3], v0
347+
; GFX1250-NEXT: flat_store_b16 v[2:3], v0 scope:SCOPE_SE
348348
; GFX1250-NEXT: s_endpgm
349349
entry:
350350
%a.cvt = fptrunc float %a to bfloat
@@ -380,7 +380,7 @@ define amdgpu_ps void @fptrunc_f32_to_bf16_abs(float %a, ptr %out) {
380380
; GFX1250: ; %bb.0: ; %entry
381381
; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
382382
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, |v0|, s0
383-
; GFX1250-NEXT: flat_store_b16 v[2:3], v0
383+
; GFX1250-NEXT: flat_store_b16 v[2:3], v0 scope:SCOPE_SE
384384
; GFX1250-NEXT: s_endpgm
385385
entry:
386386
%a.abs = call float @llvm.fabs.f32(float %a)
@@ -417,7 +417,7 @@ define amdgpu_ps void @fptrunc_f32_to_bf16_neg(float %a, ptr %out) {
417417
; GFX1250: ; %bb.0: ; %entry
418418
; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
419419
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, -v0, s0
420-
; GFX1250-NEXT: flat_store_b16 v[2:3], v0
420+
; GFX1250-NEXT: flat_store_b16 v[2:3], v0 scope:SCOPE_SE
421421
; GFX1250-NEXT: s_endpgm
422422
entry:
423423
%a.neg = fneg float %a
@@ -480,7 +480,7 @@ define amdgpu_ps void @fptrunc_f64_to_bf16(double %a, ptr %out) {
480480
; GFX1250-NEXT: s_or_b32 vcc_lo, vcc_lo, s0
481481
; GFX1250-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
482482
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
483-
; GFX1250-NEXT: flat_store_b16 v[2:3], v0
483+
; GFX1250-NEXT: flat_store_b16 v[2:3], v0 scope:SCOPE_SE
484484
; GFX1250-NEXT: s_endpgm
485485
entry:
486486
%a.cvt = fptrunc double %a to bfloat
@@ -543,7 +543,7 @@ define amdgpu_ps void @fptrunc_f64_to_bf16_neg(double %a, ptr %out) {
543543
; GFX1250-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
544544
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
545545
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
546-
; GFX1250-NEXT: flat_store_b16 v[2:3], v0
546+
; GFX1250-NEXT: flat_store_b16 v[2:3], v0 scope:SCOPE_SE
547547
; GFX1250-NEXT: s_endpgm
548548
entry:
549549
%a.neg = fneg double %a
@@ -607,7 +607,7 @@ define amdgpu_ps void @fptrunc_f64_to_bf16_abs(double %a, ptr %out) {
607607
; GFX1250-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
608608
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
609609
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
610-
; GFX1250-NEXT: flat_store_b16 v[2:3], v0
610+
; GFX1250-NEXT: flat_store_b16 v[2:3], v0 scope:SCOPE_SE
611611
; GFX1250-NEXT: s_endpgm
612612
entry:
613613
%a.abs = call double @llvm.fabs.f64(double %a)

llvm/test/CodeGen/AMDGPU/div_i128.ll

Lines changed: 49 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -475,21 +475,28 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
475475
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
476476
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
477477
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8
478-
; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
479-
; GFX9-O0-NEXT: s_nop 0
480-
; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
481478
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
482479
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
483480
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
484481
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6
485-
; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
482+
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8
483+
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
484+
; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
485+
; GFX9-O0-NEXT: s_nop 0
486+
; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
487+
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5
488+
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4
489+
; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
486490
; GFX9-O0-NEXT: s_nop 0
487-
; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
488-
; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7]
491+
; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
492+
; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7]
493+
; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9]
489494
; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f
490-
; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13]
495+
; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13]
496+
; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15]
491497
; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15]
492-
; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7]
498+
; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7]
499+
; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15]
493500
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15]
494501
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9]
495502
; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6
@@ -501,6 +508,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
501508
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
502509
; GFX9-O0-NEXT: s_mov_b32 s14, s13
503510
; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14
511+
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
504512
; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
505513
; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12
506514
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
@@ -1039,10 +1047,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
10391047
; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
10401048
; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
10411049
; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
1042-
; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
1043-
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
1044-
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
1045-
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
1050+
; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
1051+
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
1052+
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
1053+
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
10461054
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1
10471055
; GFX9-O0-NEXT: s_mov_b32 s5, s6
10481056
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
@@ -2660,31 +2668,40 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
26602668
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
26612669
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
26622670
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8
2663-
; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
2664-
; GFX9-O0-NEXT: s_nop 0
2665-
; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
26662671
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
26672672
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
26682673
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
26692674
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6
2670-
; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
2675+
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8
2676+
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
2677+
; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
26712678
; GFX9-O0-NEXT: s_nop 0
2672-
; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
2673-
; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7]
2679+
; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
2680+
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5
2681+
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4
2682+
; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
2683+
; GFX9-O0-NEXT: s_nop 0
2684+
; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
2685+
; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7]
2686+
; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9]
26742687
; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f
2675-
; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13]
2688+
; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13]
2689+
; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15]
26762690
; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15]
2677-
; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7]
2691+
; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7]
2692+
; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15]
26782693
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15]
26792694
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9]
26802695
; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6
26812696
; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[8:9], v6, 1
26822697
; GFX9-O0-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9]
2683-
; GFX9-O0-NEXT: s_mov_b64 s[4:5], -1
2684-
; GFX9-O0-NEXT: s_xor_b64 s[4:5], s[8:9], s[4:5]
2698+
; GFX9-O0-NEXT: s_mov_b64 s[14:15], -1
2699+
; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[8:9]
2700+
; GFX9-O0-NEXT: s_xor_b64 s[4:5], s[4:5], s[14:15]
26852701
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
26862702
; GFX9-O0-NEXT: s_mov_b32 s14, s13
26872703
; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14
2704+
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
26882705
; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
26892706
; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12
26902707
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
@@ -2698,16 +2715,19 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
26982715
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
26992716
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6
27002717
; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[4:5], s[6:7]
2718+
; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9]
27012719
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11
2702-
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[8:9]
2720+
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[12:13]
2721+
; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9]
27032722
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10
2704-
; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[8:9]
2723+
; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[12:13]
27052724
; GFX9-O0-NEXT: ; implicit-def: $sgpr12
27062725
; GFX9-O0-NEXT: ; implicit-def: $sgpr12
27072726
; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
27082727
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4
2728+
; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9]
27092729
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11
2710-
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[8:9]
2730+
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[12:13]
27112731
; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10
27122732
; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[8:9]
27132733
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
@@ -3220,10 +3240,10 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
32203240
; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
32213241
; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
32223242
; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
3223-
; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
3224-
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
3225-
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
3226-
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
3243+
; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
3244+
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
3245+
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
3246+
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
32273247
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1
32283248
; GFX9-O0-NEXT: s_mov_b32 s5, s6
32293249
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)

0 commit comments

Comments
 (0)