Skip to content

Commit 135d3a0

Browse files
committed
Always copy from carry-in operand to VCC
1 parent d027b65 commit 135d3a0

13 files changed

+407
-368
lines changed

llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp

Lines changed: 4 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1065,11 +1065,9 @@ void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
10651065

10661066
/// Try to convert an \p MI in VOP3 which takes an src2 carry-in
10671067
/// operand into the corresponding VOP2 form which expects the
1068-
/// argument in VCC. To this end, either try to change the definition
1069-
/// of the carry-in operand to write to VCC or add an instruction that
1070-
/// copies from the carry-in to VCC. The conversion will only be
1071-
/// applied if \p MI can be shrunk to VOP2 and if VCC can be proven to
1072-
/// be dead before \p MI.
1068+
/// argument in VCC. To this end, add a copy from the carry-in to
1069+
/// VCC. The conversion will only be applied if \p MI can be shrunk
1070+
/// to VOP2 and if VCC can be proven to be dead before \p MI.
10731071
void SIPeepholeSDWA::convertVcndmaskToVOP2(MachineInstr &MI,
10741072
const GCNSubtarget &ST) const {
10751073
assert(MI.getOpcode() == AMDGPU::V_CNDMASK_B32_e64);
@@ -1099,16 +1097,7 @@ void SIPeepholeSDWA::convertVcndmaskToVOP2(MachineInstr &MI,
10991097
return;
11001098
}
11011099

1102-
// Change destination of compare instruction to VCC
1103-
// or copy to VCC if carry-in is not a compare inst.
1104-
if (TII->isVOP3(*CarryDef) &&
1105-
TII->isVOPC(AMDGPU::getVOPe32(CarryDef->getOpcode())) &&
1106-
MRI->hasOneUse(CarryIn.getReg()))
1107-
CarryDef->substituteRegister(CarryIn.getReg(), Vcc, 0, *TRI);
1108-
else {
1109-
BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), Vcc)
1110-
.add(CarryIn);
1111-
}
1100+
BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), Vcc).add(CarryIn);
11121101

11131102
auto Converted = BuildMI(MBB, MI, MI.getDebugLoc(),
11141103
TII->get(AMDGPU::getVOPe32(MI.getOpcode())))

llvm/test/CodeGen/AMDGPU/bf16.ll

Lines changed: 49 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -40777,39 +40777,41 @@ define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bflo
4077740777
; GFX9: ; %bb.0:
4077840778
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4077940779
; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
40780-
; GFX9-NEXT: v_and_b32_e32 v2, 1, v2
40781-
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
40780+
; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1
4078240781
; GFX9-NEXT: v_and_b32_e32 v1, 1, v3
40783-
; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v2
4078440782
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
40785-
; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, v5, s[4:5]
40786-
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v5
40787-
; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v7
40788-
; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1
40789-
; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, v3, s[4:5]
40790-
; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
40791-
; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, v4, s[4:5]
40783+
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
40784+
; GFX9-NEXT: v_and_b32_e32 v1, 1, v2
40785+
; GFX9-NEXT: v_cndmask_b32_sdwa v2, v7, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
40786+
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
40787+
; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v1
40788+
; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc
40789+
; GFX9-NEXT: s_mov_b64 vcc, s[4:5]
40790+
; GFX9-NEXT: v_cndmask_b32_e64 v1, v7, v5, s[6:7]
4079240791
; GFX9-NEXT: v_cndmask_b32_sdwa v3, v6, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
4079340792
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
4079440793
; GFX9-NEXT: v_perm_b32 v0, v3, v0, s4
40795-
; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4
40794+
; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4
4079640795
; GFX9-NEXT: s_setpc_b64 s[30:31]
4079740796
;
4079840797
; GFX10-LABEL: v_vselect_v4bf16:
4079940798
; GFX10: ; %bb.0:
4080040799
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40801-
; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
40802-
; GFX10-NEXT: v_and_b32_e32 v1, 1, v3
40800+
; GFX10-NEXT: v_and_b32_e32 v3, 1, v3
40801+
; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
4080340802
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
40804-
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v2
40803+
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
40804+
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v1
40805+
; GFX10-NEXT: v_and_b32_e32 v1, 1, v2
40806+
; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 1, v0
40807+
; GFX10-NEXT: v_cndmask_b32_sdwa v2, v7, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
40808+
; GFX10-NEXT: s_mov_b32 vcc_lo, s4
40809+
; GFX10-NEXT: v_cndmask_b32_sdwa v3, v6, v4, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
4080540810
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
40806-
; GFX10-NEXT: v_cndmask_b32_e64 v1, v7, v5, s4
40807-
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v0
40808-
; GFX10-NEXT: v_cndmask_b32_sdwa v2, v6, v4, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
40809-
; GFX10-NEXT: v_cndmask_b32_sdwa v3, v7, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
40810-
; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v4, s4
40811-
; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x5040100
40812-
; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
40811+
; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v4, s5
40812+
; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc_lo
40813+
; GFX10-NEXT: v_perm_b32 v0, v3, v0, 0x5040100
40814+
; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
4081340815
; GFX10-NEXT: s_setpc_b64 s[30:31]
4081440816
;
4081540817
; GFX11TRUE16-LABEL: v_vselect_v4bf16:
@@ -41058,39 +41060,36 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bflo
4105841060
; GFX10: ; %bb.0:
4105941061
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4106041062
; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
41061-
; GFX10-NEXT: v_and_b32_e32 v4, 1, v4
41062-
; GFX10-NEXT: v_and_b32_e32 v5, 1, v5
41063+
; GFX10-NEXT: v_and_b32_e32 v7, 1, v7
41064+
; GFX10-NEXT: v_and_b32_e32 v3, 1, v3
41065+
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
4106341066
; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
41064-
; GFX10-NEXT: v_lshrrev_b32_e32 v16, 16, v14
41065-
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
41066-
; GFX10-NEXT: v_and_b32_e32 v1, 1, v3
41067+
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v1
41068+
; GFX10-NEXT: v_and_b32_e32 v1, 1, v5
41069+
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
41070+
; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 1, v3
4106741071
; GFX10-NEXT: v_and_b32_e32 v3, 1, v6
41068-
; GFX10-NEXT: v_and_b32_e32 v6, 1, v7
41069-
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v11
41070-
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
41071-
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v3
41072-
; GFX10-NEXT: v_cndmask_b32_e64 v3, v15, v11, s4
41073-
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v4
41074-
; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v10
41075-
; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v15
41076-
; GFX10-NEXT: v_cndmask_b32_e64 v4, v14, v10, s4
41077-
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v5
41078-
; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v13
41079-
; GFX10-NEXT: v_cndmask_b32_e64 v5, v16, v11, s4
41080-
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v2
41081-
; GFX10-NEXT: v_cndmask_b32_e64 v2, v13, v9, s4
41082-
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v0
41083-
; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v9
41084-
; GFX10-NEXT: v_cndmask_b32_e64 v0, v12, v8, s4
41085-
; GFX10-NEXT: v_cndmask_b32_sdwa v8, v12, v8, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
41072+
; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 1, v1
41073+
; GFX10-NEXT: v_and_b32_e32 v1, 1, v4
41074+
; GFX10-NEXT: v_cndmask_b32_sdwa v4, v15, v11, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
4108641075
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
41087-
; GFX10-NEXT: v_perm_b32 v0, v8, v0, 0x5040100
41088-
; GFX10-NEXT: v_cndmask_b32_e32 v1, v10, v9, vcc_lo
41089-
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
41076+
; GFX10-NEXT: v_cndmask_b32_e32 v5, v14, v10, vcc_lo
41077+
; GFX10-NEXT: s_mov_b32 vcc_lo, s6
41078+
; GFX10-NEXT: v_cndmask_b32_sdwa v6, v14, v10, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
41079+
; GFX10-NEXT: s_mov_b32 vcc_lo, s5
41080+
; GFX10-NEXT: v_cndmask_b32_sdwa v1, v13, v9, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
41081+
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
41082+
; GFX10-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc_lo
41083+
; GFX10-NEXT: s_mov_b32 vcc_lo, s4
41084+
; GFX10-NEXT: v_cndmask_b32_sdwa v7, v12, v8, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
41085+
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
41086+
; GFX10-NEXT: v_perm_b32 v0, v7, v0, 0x5040100
41087+
; GFX10-NEXT: v_cndmask_b32_e32 v2, v13, v9, vcc_lo
41088+
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
4109041089
; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x5040100
41091-
; GFX10-NEXT: v_cndmask_b32_e32 v6, v15, v7, vcc_lo
41092-
; GFX10-NEXT: v_perm_b32 v2, v5, v4, 0x5040100
41093-
; GFX10-NEXT: v_perm_b32 v3, v6, v3, 0x5040100
41090+
; GFX10-NEXT: v_cndmask_b32_e32 v3, v15, v11, vcc_lo
41091+
; GFX10-NEXT: v_perm_b32 v2, v6, v5, 0x5040100
41092+
; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x5040100
4109441093
; GFX10-NEXT: s_setpc_b64 s[30:31]
4109541094
;
4109641095
; GFX11TRUE16-LABEL: v_vselect_v8bf16:

llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -652,13 +652,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
652652
; GFX9-GISEL: ; %bb.0:
653653
; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
654654
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
655+
; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 32
655656
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
656657
; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3]
657658
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
658-
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v1
659+
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v1
659660
; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
660-
; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v2
661-
; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 32
661+
; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v3
662662
; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
663663
; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1]
664664
; GFX9-GISEL-NEXT: s_endpgm
@@ -760,6 +760,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
760760
; GFX9-GISEL: ; %bb.0:
761761
; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
762762
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
763+
; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 32
763764
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
764765
; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3]
765766
; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1
@@ -768,8 +769,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
768769
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v1
769770
; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
770771
; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v2
771-
; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 32
772-
; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
772+
; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v3, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
773773
; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1]
774774
; GFX9-GISEL-NEXT: s_endpgm
775775
%val = load i16, ptr addrspace(1) %arrayidx, align 1

llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1503,13 +1503,13 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
15031503
; GFX9-GISEL: ; %bb.0:
15041504
; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
15051505
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
1506+
; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xff
15061507
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
15071508
; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3]
15081509
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
1509-
; GFX9-GISEL-NEXT: v_or_b32_e32 v2, 0x100, v1
1510+
; GFX9-GISEL-NEXT: v_or_b32_e32 v3, 0x100, v1
15101511
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
1511-
; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v1, v2
1512-
; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xff
1512+
; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v1, v3
15131513
; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
15141514
; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1]
15151515
; GFX9-GISEL-NEXT: s_endpgm
@@ -1604,6 +1604,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
16041604
; GFX9-GISEL: ; %bb.0:
16051605
; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
16061606
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
1607+
; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xffff
16071608
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
16081609
; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3]
16091610
; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1
@@ -1612,8 +1613,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
16121613
; GFX9-GISEL-NEXT: v_or_b32_e32 v2, 0x10000, v1
16131614
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
16141615
; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v1, v2
1615-
; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xffff
1616-
; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1616+
; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v1, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
16171617
; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1]
16181618
; GFX9-GISEL-NEXT: s_endpgm
16191619
%val = load i16, ptr addrspace(1) %arrayidx, align 1

0 commit comments

Comments
 (0)