Skip to content
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
f0877bb
add commute for some VOP3 inst, allow commute for both inline constan…
Shoreshen Dec 30, 2024
ae5f1e8
Merge branch 'llvm:main' into Add-isCommutable-attribute-to-VOP3-inst…
Shoreshen Dec 31, 2024
df6903c
Merge remote-tracking branch 'origin/main' into Add-isCommutable-attr…
Shoreshen Dec 31, 2024
ada83d6
add inline constant case & merge main
Shoreshen Dec 31, 2024
c599af0
Merge branch 'llvm:main' into Add-isCommutable-attribute-to-VOP3-inst…
Shoreshen Jan 2, 2025
3a83ae5
fix lit change
Shoreshen Jan 2, 2025
5c9d065
fix lit change
Shoreshen Jan 2, 2025
d3f00dc
Update llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Shoreshen Jan 2, 2025
0e60298
Merge branch 'llvm:main' into Add-isCommutable-attribute-to-VOP3-inst…
Shoreshen Jan 2, 2025
707474f
fix comments
Shoreshen Jan 2, 2025
79219d1
fix
Shoreshen Jan 2, 2025
288ead2
Merge branch 'llvm:main' into Add-isCommutable-attribute-to-VOP3-inst…
Shoreshen Jan 4, 2025
4becc58
Merge branch 'llvm:main' into Add-isCommutable-attribute-to-VOP3-inst…
Shoreshen Jan 5, 2025
53b370a
Add legal check for swap
Shoreshen Jan 6, 2025
0b746a8
Merge branch 'llvm:main' into Add-isCommutable-attribute-to-VOP3-inst…
Shoreshen Jan 6, 2025
785921f
Merge branch 'llvm:main' into Add-isCommutable-attribute-to-VOP3-inst…
Shoreshen Jan 7, 2025
6955a86
Merge remote-tracking branch 'origin/main' into Add-isCommutable-attr…
Shoreshen Jan 8, 2025
5e15e72
add tests & merge_main
Shoreshen Jan 8, 2025
004a82d
fix lit.cfg.py
Shoreshen Jan 8, 2025
dc2739f
fix lit.cfg.py
Shoreshen Jan 8, 2025
378c02c
Merge branch 'llvm:main' into Add-isCommutable-attribute-to-VOP3-inst…
Shoreshen Jan 8, 2025
49ae569
Merge remote-tracking branch 'origin/main' into Add-isCommutable-attr…
Shoreshen Jan 9, 2025
161a2b9
adjust comment & merge main
Shoreshen Jan 9, 2025
4d569ae
Merge remote-tracking branch 'origin/main' into Add-isCommutable-attr…
Shoreshen Jan 10, 2025
1689c1e
adjust case & merge main
Shoreshen Jan 10, 2025
328e566
Merge branch 'llvm:main' into Add-isCommutable-attribute-to-VOP3-inst…
Shoreshen Jan 13, 2025
9faf423
fix inconsistent capitalization
Shoreshen Jan 13, 2025
19b8ad4
fix test case
Shoreshen Jan 13, 2025
cc3a125
Merge remote-tracking branch 'origin/main' into Add-isCommutable-attr…
Shoreshen Jan 20, 2025
0a89dc9
merge main
Shoreshen Jan 20, 2025
d8e6cb7
fix format
Shoreshen Jan 20, 2025
3c7bd89
Merge branch 'llvm:main' into Add-isCommutable-attribute-to-VOP3-inst…
Shoreshen Jan 21, 2025
bf1da57
remove special handling for VOPD, since no commutable VOPD instruction
Shoreshen Jan 21, 2025
789092d
Merge branch 'llvm:main' into Add-isCommutable-attribute-to-VOP3-inst…
Shoreshen Jan 21, 2025
6c35280
Merge branch 'main' into Add-isCommutable-attribute-to-VOP3-instructions
Shoreshen Jan 22, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2749,6 +2749,19 @@ static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
return &MI;
}

/// Exchange the immediate values and the target flags of two immediate
/// operands of \p MI in place.
///
/// \param MI        Instruction owning both operands; it is only returned.
/// \param NonRegOp1 First immediate operand.
/// \param NonRegOp2 Second immediate operand.
/// \returns The address of \p MI.
static MachineInstr *swapInlineConstOperands(MachineInstr &MI,
                                             MachineOperand &NonRegOp1,
                                             MachineOperand &NonRegOp2) {
  // Snapshot both sides up front so that no write can clobber a pending read.
  const int64_t FirstImm = NonRegOp1.getImm();
  const int64_t SecondImm = NonRegOp2.getImm();
  const unsigned FirstFlags = NonRegOp1.getTargetFlags();
  const unsigned SecondFlags = NonRegOp2.getTargetFlags();

  // Cross-assign the immediates first, then the target flags.
  NonRegOp1.setImm(SecondImm);
  NonRegOp2.setImm(FirstImm);
  NonRegOp1.setTargetFlags(SecondFlags);
  NonRegOp2.setTargetFlags(FirstFlags);
  return &MI;
}

MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
unsigned Src0Idx,
unsigned Src1Idx) const {
Expand Down Expand Up @@ -2785,6 +2798,10 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
} else if (!Src0.isReg() && Src1.isReg()) {
if (isOperandLegal(MI, Src1Idx, &Src0))
CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
} else if (isInlineConstant(Src1)) {
// If Src1 is inline constant and Src0 is not, then isOperandLegal rejects
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This isn't explaining why to do this, but this is also an API flaw that's always been there. We need an isOperandLegal that doesn't account for the context of the other operands

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Or rather, one that takes the full set of operands that need to be considered for the result instruction

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @arsenm

This isn't explaining why to do this, but this is also an API flaw that's always been there. We need an isOperandLegal that doesn't account for the context of the other operands

isOperandLegal also checks the literal and constant-bus (literal or SGPR) limits; I think it may be better to separate those checks from the function. It also causes some inconsistency with the ISA (e.g. rejecting instructions with one literal and one immediate) if OpIdx and MO's index are different.

Or rather, one that takes the full set of operands that need to be considered for the result instruction

We can do that, but creating the result instruction actually swaps the operands on the original instruction too. So if I do that and the isOperandLegal check fails, I would need to swap them back.

Copy link
Contributor Author

@Shoreshen Shoreshen Jan 3, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Or rather, one that takes the full set of operands that need to be considered for the result instruction

Hi @arsenm, I tried checking the swapped instruction with isOperandLegal, and it fails 100+ test cases. The following is the change:

/// Swap the two source operands \p Src0 and \p Src1 of \p MI, choosing the
/// swap helper from the kinds of the two operands.
///
/// No operand-legality check is performed in this function.
///
/// \param MI      Instruction whose operands are swapped.
/// \param NewMI   Forwarded unchanged to
///                TargetInstrInfo::commuteInstructionImpl in the reg/reg case.
/// \param Src0Idx Operand index of \p Src0 (used only in the reg/reg case).
/// \param Src1Idx Operand index of \p Src1 (used only in the reg/reg case).
/// \param Src0    First operand to swap.
/// \param Src1    Second operand to swap.
/// \returns Whatever the selected helper returns, or nullptr when none of the
///          four operand-kind combinations below matches.
MachineInstr* SIInstrInfo::swapOperands(MachineInstr &MI, bool NewMI, 
                                        unsigned Src0Idx,
                                        unsigned Src1Idx,
                                        MachineOperand &Src0,
                                        MachineOperand &Src1) const {
  MachineInstr *CommutedMI = nullptr;

  if (Src0.isReg() && Src1.isReg()) {
    // Be sure to copy the source modifiers to the right place.
    CommutedMI
        = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
  } else if (Src0.isReg() && !Src1.isReg()) {
    // Register in src0, non-register in src1: the helper takes the register
    // operand first.
    CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
  } else if (!Src0.isReg() && Src1.isReg()) {
    // Mirror case: the arguments are passed reversed so that the register
    // operand is again first.
    CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
  } else if (Src0.isImm() && Src1.isImm()) {
    // NOTE(review): swapImmOperands is not defined in the visible diff;
    // presumably it is the helper named swapInlineConstOperands there --
    // confirm.
    CommutedMI = swapImmOperands(MI, Src0, Src1);
  }

  return CommutedMI;
}

/// Commute the src0 and src1 operands of \p MI and switch it to its commuted
/// opcode.
///
/// In this variant the operands are swapped first, the legality of the
/// operand now at \p Src1Idx is checked on the swapped instruction, and the
/// swap is undone if that check fails.
///
/// \param MI      Instruction to commute.
/// \param NewMI   Must be false (asserted).
/// \param Src0Idx Operand index of one source; must name src0/src1 together
///                with \p Src1Idx (asserted after ordering).
/// \param Src1Idx Operand index of the other source.
/// \returns The commuted instruction, or nullptr when there is no commuted
///          opcode, when swapOperands cannot swap the operands, or when the
///          swapped src1 operand is rejected by isOperandLegal.
MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
                                                  unsigned Src0Idx,
                                                  unsigned Src1Idx) const {
  assert(!NewMI && "this should never be used");

  unsigned Opc = MI.getOpcode();
  // -1 is treated as "no commuted form of this opcode".
  int CommutedOpcode = commuteOpcode(Opc);
  if (CommutedOpcode == -1)
    return nullptr;

  // Order the indices so that Src0Idx is the lower of the two.
  if (Src0Idx > Src1Idx)
    std::swap(Src0Idx, Src1Idx);

  assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
           static_cast<int>(Src0Idx) &&
         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
           static_cast<int>(Src1Idx) &&
         "inconsistency with findCommutedOpIndices");

  MachineOperand &Src0 = MI.getOperand(Src0Idx);
  MachineOperand &Src1 = MI.getOperand(Src1Idx);
  MachineInstr *CommutedMI = swapOperands(MI, NewMI, Src0Idx, Src1Idx, Src0, Src1);
  if (!CommutedMI)
    return nullptr;
  // Only the operand now sitting at Src1Idx is validated.
  // NOTE(review): the operand at Src0Idx is not re-checked after the swap --
  // confirm that this is sufficient.
  if (!isOperandLegal(*CommutedMI, Src1Idx, &CommutedMI->getOperand(Src1Idx))) {
    // swap back if failed check
    swapOperands(MI, NewMI, Src0Idx, Src1Idx, Src0, Src1);
    return nullptr;
  }

  // CommutedMI cannot be null here (the null case returned above), so this
  // guard is always true.
  if (CommutedMI) {
    swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
                        Src1, AMDGPU::OpName::src1_modifiers);

    swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_sel, Src1,
                        AMDGPU::OpName::src1_sel);

    // Switch to the commuted opcode only after the operand swap has been
    // accepted.
    CommutedMI->setDesc(get(CommutedOpcode));
  }

  return CommutedMI;
}

The reason for that is VALU instructions with a literal in the first input operand.

The commuteInstruction function is called during the shrink-instructions pass, and a mismatched OpIdx and MO (the first operand) are passed as the parameters of isOperandLegal. It returns false because MO is counted twice against the literal limit (explained in the other reply).

Now, if the swapped instruction is used instead, the mismatch no longer exists, so it will move every literal constant to the second input operand for all VALU instructions.

Shall I fix all the cases in this PR, or fix them in a new issue?

if (isOperandLegal(MI, Src1Idx, &Src0))
CommutedMI = swapInlineConstOperands(MI, Src0, Src1);
} else {
// FIXME: Found two non registers to commute. This does happen.
return nullptr;
Expand Down
15 changes: 10 additions & 5 deletions llvm/lib/Target/AMDGPU/VOP3Instructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -335,7 +335,9 @@ let isCommutable = 1, SchedRW = [WriteIntMul, WriteSALU] in {
let FPDPRounding = 1 in {
let Predicates = [Has16BitInsts, isGFX8Only] in {
defm V_DIV_FIXUP_F16 : VOP3Inst <"v_div_fixup_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUdiv_fixup>;
defm V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, any_fma>;
let isCommutable = 1 in {
defm V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, any_fma>;
} // End isCommutable = 1
} // End Predicates = [Has16BitInsts, isGFX8Only]

let SubtargetPredicate = isGFX9Plus in {
Expand Down Expand Up @@ -639,8 +641,10 @@ let SubtargetPredicate = HasMinimum3Maximum3F16, ReadsModeReg = 0 in {
defm V_ADD_I16 : VOP3Inst_t16 <"v_add_i16", VOP_I16_I16_I16>;
defm V_SUB_I16 : VOP3Inst_t16 <"v_sub_i16", VOP_I16_I16_I16>;

defm V_MAD_U32_U16 : VOP3Inst <"v_mad_u32_u16", VOP3_Profile<VOP_I32_I16_I16_I32, VOP3_OPSEL>>;
defm V_MAD_I32_I16 : VOP3Inst <"v_mad_i32_i16", VOP3_Profile<VOP_I32_I16_I16_I32, VOP3_OPSEL>>;
let isCommutable = 1 in {
defm V_MAD_U32_U16 : VOP3Inst <"v_mad_u32_u16", VOP3_Profile<VOP_I32_I16_I16_I32, VOP3_OPSEL>>;
defm V_MAD_I32_I16 : VOP3Inst <"v_mad_i32_i16", VOP3_Profile<VOP_I32_I16_I16_I32, VOP3_OPSEL>>;
} // End isCommutable = 1

defm V_CVT_PKNORM_I16_F16 : VOP3Inst_t16 <"v_cvt_pknorm_i16_f16", VOP_B32_F16_F16>;
defm V_CVT_PKNORM_U16_F16 : VOP3Inst_t16 <"v_cvt_pknorm_u16_f16", VOP_B32_F16_F16>;
Expand Down Expand Up @@ -1254,8 +1258,9 @@ let SubtargetPredicate = isGFX10Plus in {
def : PermlanePat<int_amdgcn_permlane16, V_PERMLANE16_B32_e64, vt>;
def : PermlanePat<int_amdgcn_permlanex16, V_PERMLANEX16_B32_e64, vt>;
}

defm V_ADD_NC_U16 : VOP3Inst_t16 <"v_add_nc_u16", VOP_I16_I16_I16, add>;
let isCommutable = 1 in {
defm V_ADD_NC_U16 : VOP3Inst_t16 <"v_add_nc_u16", VOP_I16_I16_I16, add>;
} // End isCommutable = 1
defm V_SUB_NC_U16 : VOP3Inst_t16 <"v_sub_nc_u16", VOP_I16_I16_I16, sub>;

} // End SubtargetPredicate = isGFX10Plus
Expand Down
18 changes: 14 additions & 4 deletions llvm/test/CodeGen/AMDGPU/commute-op-sel.mir
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -run-pass=machine-cse -verify-machineinstrs %s -o - 2>&1 | FileCheck --check-prefix=GCN %s

# GCN-LABEL: name: test_machine_cse_op_sel
# GCN: %2:vgpr_32 = V_ADD_NC_U16_e64 0, %0, 0, %1, 1, 0, implicit $mode, implicit $exec
# GCN: %3:vgpr_32 = V_ADD_NC_U16_e64 0, %1, 0, %0, 1, 0, implicit $mode, implicit $exec
# GCN: DS_WRITE2_B32_gfx9 undef %4:vgpr_32, %2, %3, 0, 1, 0, implicit $exec
---
name: test_machine_cse_op_sel
body: |
; GCN-LABEL: name: test_machine_cse_op_sel
; GCN: %2:vgpr_32 = V_ADD_NC_U16_e64 0, %0, 0, %1, 1, 0, implicit $mode, implicit $exec
; GCN-NEXT: DS_WRITE2_B32_gfx9 undef %4:vgpr_32, %2, %2, 0, 1, 0, implicit $exec
bb.0:
%0:vgpr_32 = IMPLICIT_DEF
%1:vgpr_32 = IMPLICIT_DEF
Expand All @@ -15,3 +14,14 @@ body: |
DS_WRITE2_B32_gfx9 undef %4:vgpr_32, %2, %3, 0, 1, 0, implicit $exec
...

---
name: test_machine_cse_op_inline_const
body: |
; GCN-LABEL: name: test_machine_cse_op_inline_const
; GCN: %0:vgpr_32 = V_ADD_NC_U16_e64 0, 64, 0, -3, 1, 0, implicit $mode, implicit $exec
; GCN-NEXT: DS_WRITE2_B32_gfx9 undef %2:vgpr_32, %0, %0, 0, 1, 0, implicit $exec
bb.0:
%1:vgpr_32 = V_ADD_NC_U16_e64 0, 64, 0, -3, 1, 0, implicit $mode, implicit $exec
%2:vgpr_32 = V_ADD_NC_U16_e64 0, -3, 0, 64, 1, 0, implicit $mode, implicit $exec
DS_WRITE2_B32_gfx9 undef %4:vgpr_32, %1, %2, 0, 1, 0, implicit $exec
...
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There should be a dedicated commute test for every changed opcode like this. What about the _e32 forms?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi, no e32 instruction is created for V_ADD_NC_U16; in the ISA it is listed only under the VOP3 encoding, in section "15.3.4. VOP3".

4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/ctlz.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1566,7 +1566,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1
; GFX10-GISEL-NEXT: v_add_nc_u16 v1, v1, 0xffe8
; GFX10-GISEL-NEXT: v_add_nc_u16 v1, 0xffe8, v1
; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0xffff, vcc_lo
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[0:1]
Expand Down Expand Up @@ -1807,7 +1807,7 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out,
; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1
; GFX10-GISEL-NEXT: v_add_nc_u16 v1, v1, 0xffe7
; GFX10-GISEL-NEXT: v_add_nc_u16 v1, 0xffe7, v1
; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1657,8 +1657,8 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
; GFX10-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
; GFX10-NEXT: v_add_nc_u16 v1, v1, 0x900
; GFX10-NEXT: v_add_nc_u16 v5, v2, 0x900
; GFX10-NEXT: v_add_nc_u16 v1, 0x900, v1
; GFX10-NEXT: v_add_nc_u16 v5, 0x900, v2
; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1
; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
Expand Down Expand Up @@ -1723,10 +1723,10 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_or_b32_e32 v1, v1, v3
; GFX11-NEXT: v_add_nc_u16 v2, v2, 0x900
; GFX11-NEXT: v_add_nc_u16 v2, 0x900, v2
; GFX11-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add_nc_u16 v1, v1, 0x900
; GFX11-NEXT: v_add_nc_u16 v1, 0x900, v1
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v2
; GFX11-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
Expand Down
32 changes: 16 additions & 16 deletions llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -397,7 +397,7 @@ define i1 @posnormal_bf16(bfloat %x) nounwind {
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0
; GFX10CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, -1, v0
; GFX10CHECK-NEXT: v_add_nc_u16 v1, v1, 0xff80
; GFX10CHECK-NEXT: v_add_nc_u16 v1, 0xff80, v1
; GFX10CHECK-NEXT: v_cmp_gt_u16_e64 s4, 0x7f00, v1
; GFX10CHECK-NEXT: s_and_b32 s4, s4, vcc_lo
; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
Expand All @@ -408,7 +408,7 @@ define i1 @posnormal_bf16(bfloat %x) nounwind {
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0
; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, -1, v0
; GFX11CHECK-NEXT: v_add_nc_u16 v1, v1, 0xff80
; GFX11CHECK-NEXT: v_add_nc_u16 v1, 0xff80, v1
; GFX11CHECK-NEXT: v_cmp_gt_u16_e64 s0, 0x7f00, v1
; GFX11CHECK-NEXT: s_and_b32 s0, s0, vcc_lo
; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
Expand Down Expand Up @@ -462,7 +462,7 @@ define i1 @negnormal_bf16(bfloat %x) nounwind {
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0
; GFX10CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0
; GFX10CHECK-NEXT: v_add_nc_u16 v1, v1, 0xff80
; GFX10CHECK-NEXT: v_add_nc_u16 v1, 0xff80, v1
; GFX10CHECK-NEXT: v_cmp_gt_u16_e64 s4, 0x7f00, v1
; GFX10CHECK-NEXT: s_and_b32 s4, s4, vcc_lo
; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
Expand All @@ -473,7 +473,7 @@ define i1 @negnormal_bf16(bfloat %x) nounwind {
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0
; GFX11CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0
; GFX11CHECK-NEXT: v_add_nc_u16 v1, v1, 0xff80
; GFX11CHECK-NEXT: v_add_nc_u16 v1, 0xff80, v1
; GFX11CHECK-NEXT: v_cmp_gt_u16_e64 s0, 0x7f00, v1
; GFX11CHECK-NEXT: s_and_b32 s0, s0, vcc_lo
; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
Expand Down Expand Up @@ -1348,7 +1348,7 @@ define i1 @isnormal_bf16(bfloat %x) {
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX10CHECK-NEXT: v_add_nc_u16 v0, v0, 0xff80
; GFX10CHECK-NEXT: v_add_nc_u16 v0, 0xff80, v0
; GFX10CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f00, v0
; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
Expand All @@ -1357,7 +1357,7 @@ define i1 @isnormal_bf16(bfloat %x) {
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX11CHECK-NEXT: v_add_nc_u16 v0, v0, 0xff80
; GFX11CHECK-NEXT: v_add_nc_u16 v0, 0xff80, v0
; GFX11CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f00, v0
; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
Expand Down Expand Up @@ -1402,7 +1402,7 @@ define i1 @not_isnormal_bf16(bfloat %x) {
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX10CHECK-NEXT: v_add_nc_u16 v0, v0, 0xff80
; GFX10CHECK-NEXT: v_add_nc_u16 v0, 0xff80, v0
; GFX10CHECK-NEXT: v_cmp_lt_u16_e32 vcc_lo, 0x7eff, v0
; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10CHECK-NEXT: s_setpc_b64 s[30:31]
Expand All @@ -1411,7 +1411,7 @@ define i1 @not_isnormal_bf16(bfloat %x) {
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX11CHECK-NEXT: v_add_nc_u16 v0, v0, 0xff80
; GFX11CHECK-NEXT: v_add_nc_u16 v0, 0xff80, v0
; GFX11CHECK-NEXT: v_cmp_lt_u16_e32 vcc_lo, 0x7eff, v0
; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11CHECK-NEXT: s_setpc_b64 s[30:31]
Expand Down Expand Up @@ -1464,7 +1464,7 @@ define i1 @not_is_plus_normal_bf16(bfloat %x) {
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0
; GFX10CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0
; GFX10CHECK-NEXT: v_add_nc_u16 v1, v1, 0xff80
; GFX10CHECK-NEXT: v_add_nc_u16 v1, 0xff80, v1
; GFX10CHECK-NEXT: v_cmp_lt_u16_e64 s4, 0x7eff, v1
; GFX10CHECK-NEXT: s_or_b32 s4, s4, vcc_lo
; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
Expand All @@ -1475,7 +1475,7 @@ define i1 @not_is_plus_normal_bf16(bfloat %x) {
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0
; GFX11CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0
; GFX11CHECK-NEXT: v_add_nc_u16 v1, v1, 0xff80
; GFX11CHECK-NEXT: v_add_nc_u16 v1, 0xff80, v1
; GFX11CHECK-NEXT: v_cmp_lt_u16_e64 s0, 0x7eff, v1
; GFX11CHECK-NEXT: s_or_b32 s0, s0, vcc_lo
; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
Expand Down Expand Up @@ -1529,7 +1529,7 @@ define i1 @not_is_neg_normal_bf16(bfloat %x) {
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0
; GFX10CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, -1, v0
; GFX10CHECK-NEXT: v_add_nc_u16 v1, v1, 0xff80
; GFX10CHECK-NEXT: v_add_nc_u16 v1, 0xff80, v1
; GFX10CHECK-NEXT: v_cmp_lt_u16_e64 s4, 0x7eff, v1
; GFX10CHECK-NEXT: s_or_b32 s4, s4, vcc_lo
; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
Expand All @@ -1540,7 +1540,7 @@ define i1 @not_is_neg_normal_bf16(bfloat %x) {
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0
; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, -1, v0
; GFX11CHECK-NEXT: v_add_nc_u16 v1, v1, 0xff80
; GFX11CHECK-NEXT: v_add_nc_u16 v1, 0xff80, v1
; GFX11CHECK-NEXT: v_cmp_lt_u16_e64 s0, 0x7eff, v1
; GFX11CHECK-NEXT: s_or_b32 s0, s0, vcc_lo
; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
Expand Down Expand Up @@ -2569,7 +2569,7 @@ define i1 @not_iszero_or_qnan_bf16(bfloat %x) {
; GFX10CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7fc0, v0
; GFX10CHECK-NEXT: v_cmp_lt_i16_e64 s4, 0x7f80, v0
; GFX10CHECK-NEXT: v_cmp_eq_u16_e64 s5, 0x7f80, v0
; GFX10CHECK-NEXT: v_add_nc_u16 v0, v0, 0xff80
; GFX10CHECK-NEXT: v_add_nc_u16 v0, 0xff80, v0
; GFX10CHECK-NEXT: v_cmp_gt_u16_e64 s6, 0x7f, v1
; GFX10CHECK-NEXT: s_and_b32 s4, s4, vcc_lo
; GFX10CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f00, v0
Expand All @@ -2587,7 +2587,7 @@ define i1 @not_iszero_or_qnan_bf16(bfloat %x) {
; GFX11CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7fc0, v0
; GFX11CHECK-NEXT: v_cmp_lt_i16_e64 s0, 0x7f80, v0
; GFX11CHECK-NEXT: v_cmp_eq_u16_e64 s1, 0x7f80, v0
; GFX11CHECK-NEXT: v_add_nc_u16 v0, v0, 0xff80
; GFX11CHECK-NEXT: v_add_nc_u16 v0, 0xff80, v0
; GFX11CHECK-NEXT: v_cmp_gt_u16_e64 s2, 0x7f, v1
; GFX11CHECK-NEXT: s_and_b32 s0, s0, vcc_lo
; GFX11CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f00, v0
Expand Down Expand Up @@ -2669,7 +2669,7 @@ define i1 @not_iszero_or_snan_bf16(bfloat %x) {
; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX10CHECK-NEXT: v_add_nc_u16 v1, v0, -1
; GFX10CHECK-NEXT: v_add_nc_u16 v2, v0, 0xff80
; GFX10CHECK-NEXT: v_add_nc_u16 v2, 0xff80, v0
; GFX10CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0
; GFX10CHECK-NEXT: v_cmp_lt_i16_e64 s5, 0x7fbf, v0
; GFX10CHECK-NEXT: v_cmp_gt_u16_e64 s4, 0x7f, v1
Expand All @@ -2685,7 +2685,7 @@ define i1 @not_iszero_or_snan_bf16(bfloat %x) {
; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX11CHECK-NEXT: v_add_nc_u16 v1, v0, -1
; GFX11CHECK-NEXT: v_add_nc_u16 v2, v0, 0xff80
; GFX11CHECK-NEXT: v_add_nc_u16 v2, 0xff80, v0
; GFX11CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0
; GFX11CHECK-NEXT: v_cmp_lt_i16_e64 s1, 0x7fbf, v0
; GFX11CHECK-NEXT: v_cmp_gt_u16_e64 s0, 0x7f, v1
Expand Down
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1327,7 +1327,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: v_add_nc_u16 v1, v1, 0xffc0
; GFX10-GISEL-NEXT: v_add_nc_u16 v1, 0xffc0, v1
; GFX10-GISEL-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-GISEL-NEXT: s_endpgm
;
Expand All @@ -1353,7 +1353,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: global_load_u16 v1, v0, s[2:3]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_add_nc_u16 v1, v1, 0xffc0
; GFX11-GISEL-NEXT: v_add_nc_u16 v1, 0xffc0, v1
; GFX11-GISEL-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
Expand Down Expand Up @@ -1486,7 +1486,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT: global_load_ushort v1, v1, s[2:3]
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: v_add_nc_u16 v1, v1, 0xffc0
; GFX10-GISEL-NEXT: v_add_nc_u16 v1, 0xffc0, v1
; GFX10-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-GISEL-NEXT: s_endpgm
Expand Down Expand Up @@ -1517,7 +1517,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: global_load_u16 v1, v1, s[2:3]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_add_nc_u16 v1, v1, 0xffc0
; GFX11-GISEL-NEXT: v_add_nc_u16 v1, 0xffc0, v1
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
Expand Down Expand Up @@ -1686,8 +1686,8 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out,
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: v_add_nc_u16 v1, v1, 0xffc0
; GFX10-GISEL-NEXT: v_add_nc_u16 v2, v2, 0xffc0
; GFX10-GISEL-NEXT: v_add_nc_u16 v1, 0xffc0, v1
; GFX10-GISEL-NEXT: v_add_nc_u16 v2, 0xffc0, v2
; GFX10-GISEL-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-GISEL-NEXT: global_store_short v0, v2, s[0:1]
Expand Down Expand Up @@ -1724,8 +1724,8 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out,
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_add_nc_u16 v1, v1, 0xffc0
; GFX11-GISEL-NEXT: v_add_nc_u16 v2, v2, 0xffc0
; GFX11-GISEL-NEXT: v_add_nc_u16 v1, 0xffc0, v1
; GFX11-GISEL-NEXT: v_add_nc_u16 v2, 0xffc0, v2
; GFX11-GISEL-NEXT: global_store_b16 v0, v1, s[0:1] dlc
; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-GISEL-NEXT: global_store_b16 v0, v2, s[0:1] dlc
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
Original file line number Diff line number Diff line change
Expand Up @@ -397,7 +397,7 @@ define amdgpu_kernel void @no_widen_i16_constant_divergent_load(ptr addrspace(4)
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_u16 v0, v0, s[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_nc_u16 v2, v0, 0x3e7
; GFX11-NEXT: v_add_nc_u16 v2, 0x3e7, v0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
Expand Down