From f0877bb77c9134803c2647124f0650a720f87f71 Mon Sep 17 00:00:00 2001 From: shore <372660931@qq.com> Date: Mon, 30 Dec 2024 16:41:38 +0800 Subject: [PATCH 01/18] add commute for some VOP3 inst, allow commute for both inline constant operand, adjust tests --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 17 ++++++++++ llvm/lib/Target/AMDGPU/VOP3Instructions.td | 9 ++++-- llvm/test/CodeGen/AMDGPU/commute-op-sel.mir | 3 +- llvm/test/CodeGen/AMDGPU/ctlz.ll | 4 +-- llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll | 8 ++--- .../CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll | 32 +++++++++---------- .../CodeGen/AMDGPU/shrink-add-sub-constant.ll | 16 +++++----- llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll | 2 +- llvm/test/lit.cfg.py | 2 +- 9 files changed, 56 insertions(+), 37 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index f97ea40caa670..a7a384a3dbf3d 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2749,6 +2749,20 @@ static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI, return &MI; } +static MachineInstr *swapInlineConstOperands(MachineInstr &MI, + MachineOperand &NonRegOp1, + MachineOperand &NonRegOp2) { + + auto TargetFlags = NonRegOp1.getTargetFlags(); + auto NonRegVal = NonRegOp1.getImm(); + + NonRegOp1.setImm(NonRegOp2.getImm()); + NonRegOp2.setImm(NonRegVal); + NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags()); + NonRegOp2.setTargetFlags(TargetFlags); + return &MI; +} + MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned Src0Idx, unsigned Src1Idx) const { @@ -2785,6 +2799,9 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, } else if (!Src0.isReg() && Src1.isReg()) { if (isOperandLegal(MI, Src1Idx, &Src0)) CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0); + } else if (isInlineConstant(Src0) && isInlineConstant(Src1)) { + if (isOperandLegal(MI, Src1Idx, &Src0)) + CommutedMI = 
swapInlineConstOperands(MI, Src0, Src1); } else { // FIXME: Found two non registers to commute. This does happen. return nullptr; diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 22e457674c07a..a01fcf308c83b 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -335,6 +335,7 @@ let isCommutable = 1, SchedRW = [WriteIntMul, WriteSALU] in { let FPDPRounding = 1 in { let Predicates = [Has16BitInsts, isGFX8Only] in { defm V_DIV_FIXUP_F16 : VOP3Inst <"v_div_fixup_f16", VOP3_Profile, AMDGPUdiv_fixup>; + let isCommutable = 1 in defm V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile, any_fma>; } // End Predicates = [Has16BitInsts, isGFX8Only] @@ -639,8 +640,10 @@ let SubtargetPredicate = HasMinimum3Maximum3F16, ReadsModeReg = 0 in { defm V_ADD_I16 : VOP3Inst_t16 <"v_add_i16", VOP_I16_I16_I16>; defm V_SUB_I16 : VOP3Inst_t16 <"v_sub_i16", VOP_I16_I16_I16>; -defm V_MAD_U32_U16 : VOP3Inst <"v_mad_u32_u16", VOP3_Profile>; -defm V_MAD_I32_I16 : VOP3Inst <"v_mad_i32_i16", VOP3_Profile>; +let isCommutable = 1 in { + defm V_MAD_U32_U16 : VOP3Inst <"v_mad_u32_u16", VOP3_Profile>; + defm V_MAD_I32_I16 : VOP3Inst <"v_mad_i32_i16", VOP3_Profile>; +} // End isCommutable = 1 defm V_CVT_PKNORM_I16_F16 : VOP3Inst_t16 <"v_cvt_pknorm_i16_f16", VOP_B32_F16_F16>; defm V_CVT_PKNORM_U16_F16 : VOP3Inst_t16 <"v_cvt_pknorm_u16_f16", VOP_B32_F16_F16>; @@ -1254,7 +1257,7 @@ let SubtargetPredicate = isGFX10Plus in { def : PermlanePat; def : PermlanePat; } - + let isCommutable = 1 in defm V_ADD_NC_U16 : VOP3Inst_t16 <"v_add_nc_u16", VOP_I16_I16_I16, add>; defm V_SUB_NC_U16 : VOP3Inst_t16 <"v_sub_nc_u16", VOP_I16_I16_I16, sub>; diff --git a/llvm/test/CodeGen/AMDGPU/commute-op-sel.mir b/llvm/test/CodeGen/AMDGPU/commute-op-sel.mir index b9397f9d5d4dd..01595ce04313c 100644 --- a/llvm/test/CodeGen/AMDGPU/commute-op-sel.mir +++ b/llvm/test/CodeGen/AMDGPU/commute-op-sel.mir @@ -2,8 +2,7 @@ # 
GCN-LABEL: name: test_machine_cse_op_sel # GCN: %2:vgpr_32 = V_ADD_NC_U16_e64 0, %0, 0, %1, 1, 0, implicit $mode, implicit $exec -# GCN: %3:vgpr_32 = V_ADD_NC_U16_e64 0, %1, 0, %0, 1, 0, implicit $mode, implicit $exec -# GCN: DS_WRITE2_B32_gfx9 undef %4:vgpr_32, %2, %3, 0, 1, 0, implicit $exec +# GCN: DS_WRITE2_B32_gfx9 undef %4:vgpr_32, %2, %2, 0, 1, 0, implicit $exec --- name: test_machine_cse_op_sel body: | diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll index 3019d4d298eb4..b4d450a90d595 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -1566,7 +1566,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 -; GFX10-GISEL-NEXT: v_add_nc_u16 v1, v1, 0xffe8 +; GFX10-GISEL-NEXT: v_add_nc_u16 v1, 0xffe8, v1 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0xffff, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[0:1] @@ -1807,7 +1807,7 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 -; GFX10-GISEL-NEXT: v_add_nc_u16 v1, v1, 0xffe7 +; GFX10-GISEL-NEXT: v_add_nc_u16 v1, 0xffe7, v1 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0 diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll index b897e1feed5d5..fec020a296b9b 100644 --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -1657,8 +1657,8 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; GFX10-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 -; GFX10-NEXT: v_add_nc_u16 v1, v1, 0x900 -; GFX10-NEXT: v_add_nc_u16 v5, v2, 0x900 +; GFX10-NEXT: v_add_nc_u16 v1, 0x900, v1 +; GFX10-NEXT: v_add_nc_u16 v5, 0x900, v2 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 @@ -1723,10 +1723,10 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX11-NEXT: v_add_nc_u16 v2, v2, 0x900 +; GFX11-NEXT: v_add_nc_u16 v2, 0x900, v2 ; GFX11-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_nc_u16 v1, v1, 0x900 +; GFX11-NEXT: v_add_nc_u16 v1, 0x900, v1 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v2 ; GFX11-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll index f416131e3d314..480d978fa530b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll @@ -397,7 +397,7 @@ define i1 @posnormal_bf16(bfloat %x) nounwind { ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX10CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, -1, v0 -; GFX10CHECK-NEXT: v_add_nc_u16 v1, v1, 0xff80 +; GFX10CHECK-NEXT: v_add_nc_u16 v1, 0xff80, v1 ; GFX10CHECK-NEXT: v_cmp_gt_u16_e64 s4, 0x7f00, v1 ; GFX10CHECK-NEXT: s_and_b32 s4, s4, vcc_lo ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 @@ -408,7 +408,7 @@ define i1 @posnormal_bf16(bfloat %x) nounwind { ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX11CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, -1, v0 -; GFX11CHECK-NEXT: v_add_nc_u16 v1, v1, 0xff80 +; GFX11CHECK-NEXT: v_add_nc_u16 v1, 0xff80, v1 ; GFX11CHECK-NEXT: v_cmp_gt_u16_e64 s0, 0x7f00, v1 ; GFX11CHECK-NEXT: s_and_b32 s0, s0, vcc_lo ; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 @@ -462,7 +462,7 @@ define i1 @negnormal_bf16(bfloat %x) nounwind { ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX10CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0 -; GFX10CHECK-NEXT: v_add_nc_u16 v1, v1, 0xff80 +; GFX10CHECK-NEXT: v_add_nc_u16 v1, 0xff80, v1 ; GFX10CHECK-NEXT: v_cmp_gt_u16_e64 s4, 0x7f00, v1 ; GFX10CHECK-NEXT: s_and_b32 s4, s4, vcc_lo ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 @@ -473,7 +473,7 @@ define i1 @negnormal_bf16(bfloat %x) nounwind { ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX11CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0 -; GFX11CHECK-NEXT: v_add_nc_u16 v1, v1, 0xff80 +; GFX11CHECK-NEXT: v_add_nc_u16 v1, 0xff80, v1 ; GFX11CHECK-NEXT: v_cmp_gt_u16_e64 s0, 0x7f00, v1 ; GFX11CHECK-NEXT: s_and_b32 s0, s0, vcc_lo ; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 @@ -1348,7 +1348,7 @@ define i1 @isnormal_bf16(bfloat %x) { ; GFX10CHECK: ; %bb.0: ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX10CHECK-NEXT: v_add_nc_u16 v0, v0, 0xff80 +; GFX10CHECK-NEXT: v_add_nc_u16 v0, 0xff80, v0 ; GFX10CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f00, v0 ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] @@ -1357,7 +1357,7 @@ define i1 @isnormal_bf16(bfloat %x) { ; GFX11CHECK: ; %bb.0: ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX11CHECK-NEXT: v_add_nc_u16 v0, v0, 0xff80 +; GFX11CHECK-NEXT: v_add_nc_u16 
v0, 0xff80, v0 ; GFX11CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f00, v0 ; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] @@ -1402,7 +1402,7 @@ define i1 @not_isnormal_bf16(bfloat %x) { ; GFX10CHECK: ; %bb.0: ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX10CHECK-NEXT: v_add_nc_u16 v0, v0, 0xff80 +; GFX10CHECK-NEXT: v_add_nc_u16 v0, 0xff80, v0 ; GFX10CHECK-NEXT: v_cmp_lt_u16_e32 vcc_lo, 0x7eff, v0 ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] @@ -1411,7 +1411,7 @@ define i1 @not_isnormal_bf16(bfloat %x) { ; GFX11CHECK: ; %bb.0: ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX11CHECK-NEXT: v_add_nc_u16 v0, v0, 0xff80 +; GFX11CHECK-NEXT: v_add_nc_u16 v0, 0xff80, v0 ; GFX11CHECK-NEXT: v_cmp_lt_u16_e32 vcc_lo, 0x7eff, v0 ; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] @@ -1464,7 +1464,7 @@ define i1 @not_is_plus_normal_bf16(bfloat %x) { ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX10CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0 -; GFX10CHECK-NEXT: v_add_nc_u16 v1, v1, 0xff80 +; GFX10CHECK-NEXT: v_add_nc_u16 v1, 0xff80, v1 ; GFX10CHECK-NEXT: v_cmp_lt_u16_e64 s4, 0x7eff, v1 ; GFX10CHECK-NEXT: s_or_b32 s4, s4, vcc_lo ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 @@ -1475,7 +1475,7 @@ define i1 @not_is_plus_normal_bf16(bfloat %x) { ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX11CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0 -; GFX11CHECK-NEXT: v_add_nc_u16 v1, v1, 0xff80 +; GFX11CHECK-NEXT: v_add_nc_u16 v1, 0xff80, v1 ; GFX11CHECK-NEXT: v_cmp_lt_u16_e64 s0, 0x7eff, v1 ; GFX11CHECK-NEXT: s_or_b32 s0, s0, vcc_lo ; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 @@ -1529,7 +1529,7 @@ 
define i1 @not_is_neg_normal_bf16(bfloat %x) { ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX10CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, -1, v0 -; GFX10CHECK-NEXT: v_add_nc_u16 v1, v1, 0xff80 +; GFX10CHECK-NEXT: v_add_nc_u16 v1, 0xff80, v1 ; GFX10CHECK-NEXT: v_cmp_lt_u16_e64 s4, 0x7eff, v1 ; GFX10CHECK-NEXT: s_or_b32 s4, s4, vcc_lo ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 @@ -1540,7 +1540,7 @@ define i1 @not_is_neg_normal_bf16(bfloat %x) { ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, -1, v0 -; GFX11CHECK-NEXT: v_add_nc_u16 v1, v1, 0xff80 +; GFX11CHECK-NEXT: v_add_nc_u16 v1, 0xff80, v1 ; GFX11CHECK-NEXT: v_cmp_lt_u16_e64 s0, 0x7eff, v1 ; GFX11CHECK-NEXT: s_or_b32 s0, s0, vcc_lo ; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 @@ -2569,7 +2569,7 @@ define i1 @not_iszero_or_qnan_bf16(bfloat %x) { ; GFX10CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7fc0, v0 ; GFX10CHECK-NEXT: v_cmp_lt_i16_e64 s4, 0x7f80, v0 ; GFX10CHECK-NEXT: v_cmp_eq_u16_e64 s5, 0x7f80, v0 -; GFX10CHECK-NEXT: v_add_nc_u16 v0, v0, 0xff80 +; GFX10CHECK-NEXT: v_add_nc_u16 v0, 0xff80, v0 ; GFX10CHECK-NEXT: v_cmp_gt_u16_e64 s6, 0x7f, v1 ; GFX10CHECK-NEXT: s_and_b32 s4, s4, vcc_lo ; GFX10CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f00, v0 @@ -2587,7 +2587,7 @@ define i1 @not_iszero_or_qnan_bf16(bfloat %x) { ; GFX11CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7fc0, v0 ; GFX11CHECK-NEXT: v_cmp_lt_i16_e64 s0, 0x7f80, v0 ; GFX11CHECK-NEXT: v_cmp_eq_u16_e64 s1, 0x7f80, v0 -; GFX11CHECK-NEXT: v_add_nc_u16 v0, v0, 0xff80 +; GFX11CHECK-NEXT: v_add_nc_u16 v0, 0xff80, v0 ; GFX11CHECK-NEXT: v_cmp_gt_u16_e64 s2, 0x7f, v1 ; GFX11CHECK-NEXT: s_and_b32 s0, s0, vcc_lo ; GFX11CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f00, v0 @@ -2669,7 +2669,7 @@ define i1 @not_iszero_or_snan_bf16(bfloat %x) { ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX10CHECK-NEXT: v_add_nc_u16 v1, v0, -1 -; GFX10CHECK-NEXT: v_add_nc_u16 v2, v0, 0xff80 +; GFX10CHECK-NEXT: v_add_nc_u16 v2, 0xff80, v0 ; GFX10CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0 ; GFX10CHECK-NEXT: v_cmp_lt_i16_e64 s5, 0x7fbf, v0 ; GFX10CHECK-NEXT: v_cmp_gt_u16_e64 s4, 0x7f, v1 @@ -2685,7 +2685,7 @@ define i1 @not_iszero_or_snan_bf16(bfloat %x) { ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX11CHECK-NEXT: v_add_nc_u16 v1, v0, -1 -; GFX11CHECK-NEXT: v_add_nc_u16 v2, v0, 0xff80 +; GFX11CHECK-NEXT: v_add_nc_u16 v2, 0xff80, v0 ; GFX11CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0 ; GFX11CHECK-NEXT: v_cmp_lt_i16_e64 s1, 0x7fbf, v0 ; GFX11CHECK-NEXT: v_cmp_gt_u16_e64 s0, 0x7f, v1 diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll index a1a466fb04440..22996eda955be 100644 --- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll @@ -1327,7 +1327,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_add_nc_u16 v1, v1, 0xffc0 +; GFX10-GISEL-NEXT: v_add_nc_u16 v1, 0xffc0, v1 ; GFX10-GISEL-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; @@ -1353,7 +1353,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_u16 v1, v0, s[2:3] ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_add_nc_u16 v1, v1, 0xffc0 +; GFX11-GISEL-NEXT: v_add_nc_u16 v1, 0xffc0, v1 ; GFX11-GISEL-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1486,7 +1486,7 @@ define 
amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_ushort v1, v1, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_add_nc_u16 v1, v1, 0xffc0 +; GFX10-GISEL-NEXT: v_add_nc_u16 v1, 0xffc0, v1 ; GFX10-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm @@ -1517,7 +1517,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_u16 v1, v1, s[2:3] ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_add_nc_u16 v1, v1, 0xffc0 +; GFX11-GISEL-NEXT: v_add_nc_u16 v1, 0xffc0, v1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1686,8 +1686,8 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out, ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_add_nc_u16 v1, v1, 0xffc0 -; GFX10-GISEL-NEXT: v_add_nc_u16 v2, v2, 0xffc0 +; GFX10-GISEL-NEXT: v_add_nc_u16 v1, 0xffc0, v1 +; GFX10-GISEL-NEXT: v_add_nc_u16 v2, 0xffc0, v2 ; GFX10-GISEL-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-GISEL-NEXT: global_store_short v0, v2, s[0:1] @@ -1724,8 +1724,8 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out, ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_add_nc_u16 v1, v1, 0xffc0 -; GFX11-GISEL-NEXT: v_add_nc_u16 v2, v2, 0xffc0 +; GFX11-GISEL-NEXT: v_add_nc_u16 v1, 0xffc0, v1 +; GFX11-GISEL-NEXT: v_add_nc_u16 v2, 0xffc0, v2 ; GFX11-GISEL-NEXT: global_store_b16 v0, v1, 
s[0:1] dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: global_store_b16 v0, v2, s[0:1] dlc diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll index dd03fb62b8ebb..82fae44e20818 100644 --- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll +++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll @@ -397,7 +397,7 @@ define amdgpu_kernel void @no_widen_i16_constant_divergent_load(ptr addrspace(4) ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v0, v0, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_add_nc_u16 v2, v0, 0x3e7 +; GFX11-NEXT: v_add_nc_u16 v2, 0x3e7, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py index 5a03a85386e0a..9839f823ac9f4 100644 --- a/llvm/test/lit.cfg.py +++ b/llvm/test/lit.cfg.py @@ -463,7 +463,7 @@ def have_cxx_shared_library(): print("could not exec llvm-readobj") return False - readobj_out = readobj_cmd.stdout.read().decode("ascii") + readobj_out = readobj_cmd.stdout.read().decode("utf-8") readobj_cmd.wait() regex = re.compile(r"(libc\+\+|libstdc\+\+|msvcp).*\.(so|dylib|dll)") From ada83d6dabada48641d7e22e08a3650b0b7a3a03 Mon Sep 17 00:00:00 2001 From: shore <372660931@qq.com> Date: Tue, 31 Dec 2024 23:18:16 +0800 Subject: [PATCH 02/18] add inline constant case & merge main --- llvm/test/CodeGen/AMDGPU/commute-op-sel.mir | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/commute-op-sel.mir b/llvm/test/CodeGen/AMDGPU/commute-op-sel.mir index 01595ce04313c..e332ba4cd672c 100644 --- a/llvm/test/CodeGen/AMDGPU/commute-op-sel.mir +++ b/llvm/test/CodeGen/AMDGPU/commute-op-sel.mir @@ -1,11 +1,11 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -run-pass=machine-cse -verify-machineinstrs %s -o - 2>&1 | FileCheck --check-prefix=GCN %s -# GCN-LABEL: name: test_machine_cse_op_sel -# 
GCN: %2:vgpr_32 = V_ADD_NC_U16_e64 0, %0, 0, %1, 1, 0, implicit $mode, implicit $exec -# GCN: DS_WRITE2_B32_gfx9 undef %4:vgpr_32, %2, %2, 0, 1, 0, implicit $exec --- name: test_machine_cse_op_sel body: | + ; GCN-LABEL: name: test_machine_cse_op_sel + ; GCN: %2:vgpr_32 = V_ADD_NC_U16_e64 0, %0, 0, %1, 1, 0, implicit $mode, implicit $exec + ; GCN-NEXT: DS_WRITE2_B32_gfx9 undef %4:vgpr_32, %2, %2, 0, 1, 0, implicit $exec bb.0: %0:vgpr_32 = IMPLICIT_DEF %1:vgpr_32 = IMPLICIT_DEF @@ -14,3 +14,14 @@ body: | DS_WRITE2_B32_gfx9 undef %4:vgpr_32, %2, %3, 0, 1, 0, implicit $exec ... +--- +name: test_machine_cse_op_inline_const +body: | + ; GCN-LABEL: name: test_machine_cse_op_inline_const + ; GCN: %0:vgpr_32 = V_ADD_NC_U16_e64 0, 64, 0, -3, 1, 0, implicit $mode, implicit $exec + ; GCN-NEXT: DS_WRITE2_B32_gfx9 undef %2:vgpr_32, %0, %0, 0, 1, 0, implicit $exec + bb.0: + %1:vgpr_32 = V_ADD_NC_U16_e64 0, 64, 0, -3, 1, 0, implicit $mode, implicit $exec + %2:vgpr_32 = V_ADD_NC_U16_e64 0, -3, 0, 64, 1, 0, implicit $mode, implicit $exec + DS_WRITE2_B32_gfx9 undef %4:vgpr_32, %1, %2, 0, 1, 0, implicit $exec +... 
From 3a83ae5ab6f954d40690a84370104560875c82d2 Mon Sep 17 00:00:00 2001 From: shore <372660931@qq.com> Date: Thu, 2 Jan 2025 14:05:03 +0800 Subject: [PATCH 03/18] fix lit change --- llvm/test/lit.cfg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py index 9839f823ac9f4..2adba3dc8d693 100644 --- a/llvm/test/lit.cfg.py +++ b/llvm/test/lit.cfg.py @@ -463,7 +463,7 @@ def have_cxx_shared_library(): print("could not exec llvm-readobj") return False - readobj_out = readobj_cmd.stdout.read().decode("utf-8") + readobj_out = readobj_cmd.stdout.read().decode("acsii") readobj_cmd.wait() regex = re.compile(r"(libc\+\+|libstdc\+\+|msvcp).*\.(so|dylib|dll)") From 5c9d065fa8a58687116007bc08d399ee465e8c66 Mon Sep 17 00:00:00 2001 From: shore <372660931@qq.com> Date: Thu, 2 Jan 2025 14:06:13 +0800 Subject: [PATCH 04/18] fix lit change --- llvm/test/lit.cfg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py index 2adba3dc8d693..5a03a85386e0a 100644 --- a/llvm/test/lit.cfg.py +++ b/llvm/test/lit.cfg.py @@ -463,7 +463,7 @@ def have_cxx_shared_library(): print("could not exec llvm-readobj") return False - readobj_out = readobj_cmd.stdout.read().decode("acsii") + readobj_out = readobj_cmd.stdout.read().decode("ascii") readobj_cmd.wait() regex = re.compile(r"(libc\+\+|libstdc\+\+|msvcp).*\.(so|dylib|dll)") From d3f00dc1fbd59a1c9544f9441bb41dbe4493385b Mon Sep 17 00:00:00 2001 From: Shoreshen <372660931@qq.com> Date: Thu, 2 Jan 2025 21:53:47 +0800 Subject: [PATCH 05/18] Update llvm/lib/Target/AMDGPU/SIInstrInfo.cpp Co-authored-by: Matt Arsenault --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index a7a384a3dbf3d..dfe5911450a09 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2752,7 +2752,6 @@ 
static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI, static MachineInstr *swapInlineConstOperands(MachineInstr &MI, MachineOperand &NonRegOp1, MachineOperand &NonRegOp2) { - auto TargetFlags = NonRegOp1.getTargetFlags(); auto NonRegVal = NonRegOp1.getImm(); From 707474fcf909f7abe04663f6a9903e20cfd99a9c Mon Sep 17 00:00:00 2001 From: shore <372660931@qq.com> Date: Fri, 3 Jan 2025 00:03:53 +0800 Subject: [PATCH 06/18] fix comments --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 7 ++++--- llvm/lib/Target/AMDGPU/VOP3Instructions.td | 10 ++++++---- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index dfe5911450a09..18e227b6a3802 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2752,8 +2752,8 @@ static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI, static MachineInstr *swapInlineConstOperands(MachineInstr &MI, MachineOperand &NonRegOp1, MachineOperand &NonRegOp2) { - auto TargetFlags = NonRegOp1.getTargetFlags(); - auto NonRegVal = NonRegOp1.getImm(); + unsigned TargetFlags = NonRegOp1.getTargetFlags(); + int64_t NonRegVal = NonRegOp1.getImm(); NonRegOp1.setImm(NonRegOp2.getImm()); NonRegOp2.setImm(NonRegVal); @@ -2798,7 +2798,8 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, } else if (!Src0.isReg() && Src1.isReg()) { if (isOperandLegal(MI, Src1Idx, &Src0)) CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0); - } else if (isInlineConstant(Src0) && isInlineConstant(Src1)) { + } else if (isInlineConstant(Src1)) { + // If Src1 is inline constant and Src0 is not, then isOperandLegal rejects if (isOperandLegal(MI, Src1Idx, &Src0)) CommutedMI = swapInlineConstOperands(MI, Src0, Src1); } else { diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index a01fcf308c83b..fb56fd5c97039 100644 --- 
a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -335,8 +335,9 @@ let isCommutable = 1, SchedRW = [WriteIntMul, WriteSALU] in { let FPDPRounding = 1 in { let Predicates = [Has16BitInsts, isGFX8Only] in { defm V_DIV_FIXUP_F16 : VOP3Inst <"v_div_fixup_f16", VOP3_Profile, AMDGPUdiv_fixup>; - let isCommutable = 1 in - defm V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile, any_fma>; + let isCommutable = 1 in { + defm V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile, any_fma>; + } // End isCommutable = 1 } // End Predicates = [Has16BitInsts, isGFX8Only] let SubtargetPredicate = isGFX9Plus in { @@ -1257,8 +1258,9 @@ let SubtargetPredicate = isGFX10Plus in { def : PermlanePat; def : PermlanePat; } - let isCommutable = 1 in - defm V_ADD_NC_U16 : VOP3Inst_t16 <"v_add_nc_u16", VOP_I16_I16_I16, add>; + let isCommutable = 1 in { + defm V_ADD_NC_U16 : VOP3Inst_t16 <"v_add_nc_u16", VOP_I16_I16_I16, add>; + } // End isCommutable = 1 defm V_SUB_NC_U16 : VOP3Inst_t16 <"v_sub_nc_u16", VOP_I16_I16_I16, sub>; } // End SubtargetPredicate = isGFX10Plus From 79219d10c8b252a6ac337ece1cee358ffa3b8cc1 Mon Sep 17 00:00:00 2001 From: shore <372660931@qq.com> Date: Fri, 3 Jan 2025 00:06:19 +0800 Subject: [PATCH 07/18] fix --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 18e227b6a3802..b9445a750bf84 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2798,8 +2798,7 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, } else if (!Src0.isReg() && Src1.isReg()) { if (isOperandLegal(MI, Src1Idx, &Src0)) CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0); - } else if (isInlineConstant(Src1)) { - // If Src1 is inline constant and Src0 is not, then isOperandLegal rejects + } else if (isInlineConstant(Src0) && isInlineConstant(Src1)) 
{ if (isOperandLegal(MI, Src1Idx, &Src0)) CommutedMI = swapInlineConstOperands(MI, Src0, Src1); } else { From 53b370a2012b36ae48e0a3580bd19bfd79d9b219 Mon Sep 17 00:00:00 2001 From: shore <372660931@qq.com> Date: Mon, 6 Jan 2025 18:40:09 +0800 Subject: [PATCH 08/18] Add legal check for swap --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 155 ++++++++++++------ llvm/lib/Target/AMDGPU/SIInstrInfo.h | 11 +- .../test/CodeGen/AMDGPU/carryout-selection.ll | 8 +- llvm/test/CodeGen/AMDGPU/cmp_shrink.mir | 2 +- llvm/test/CodeGen/AMDGPU/commute-op-sel.mir | 40 ++++- 5 files changed, 155 insertions(+), 61 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index b9445a750bf84..31a183940f70d 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2749,7 +2749,7 @@ static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI, return &MI; } -static MachineInstr *swapInlineConstOperands(MachineInstr &MI, +static MachineInstr *swapImmOperands(MachineInstr &MI, MachineOperand &NonRegOp1, MachineOperand &NonRegOp2) { unsigned TargetFlags = NonRegOp1.getTargetFlags(); @@ -2762,6 +2762,54 @@ static MachineInstr *swapInlineConstOperands(MachineInstr &MI, return &MI; } +bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, + unsigned OpIdx0, const MachineOperand *MO0, + unsigned OpIdx1, const MachineOperand *MO1) const { + const MCInstrDesc &InstDesc = MI.getDesc(); + const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0]; + const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1]; + const TargetRegisterClass *DefinedRC1 = + OpInfo1.RegClass != -1 ? RI.getRegClass(OpInfo1.RegClass) : nullptr; + const TargetRegisterClass *DefinedRC0 = + OpInfo0.RegClass != -1 ?
RI.getRegClass(OpInfo0.RegClass) : nullptr; + + unsigned Opc = MI.getOpcode(); + int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); + if (Src0Idx == -1) { + // VOPD V_DUAL_* instructions use different operand names. + Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0X); + } + + // Swapping does not breach the constant bus or literal limits, but it may + // move a literal to a position other than src0, which is not allowed + // pre-gfx10. For VALU, reject when src0 holds a non-register operand that is not an inline constant. + // FIXME: After gfx9, a literal can be in a place other than Src0 + if (isVALU(MI)) { + if ((int)OpIdx0 == Src0Idx && + !MO0->isReg() && !isInlineConstant(*MO0, OpInfo1)) + return false; + if ((int)OpIdx1 == Src0Idx && + !MO1->isReg() && !isInlineConstant(*MO1, OpInfo0)) + return false; + } + + if ((int)OpIdx1 != Src0Idx && MO0->isReg()) { + if (!DefinedRC1) + return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN; + return isLegalRegOperand(MI, OpIdx1, *MO0); + } + if ((int)OpIdx0 != Src0Idx && MO1->isReg()) { + if (!DefinedRC0) + return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN; + return isLegalRegOperand(MI, OpIdx0, *MO1); + } + + // No need to check 64bit literals since swapping does not bring new + // 64bit literals into current instruction to fold to 32bit + + return isImmOperandLegal(MI, OpIdx1, *MO0); +} + MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned Src0Idx, unsigned Src1Idx) const { @@ -2783,24 +2831,20 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, MachineOperand &Src0 = MI.getOperand(Src0Idx); MachineOperand &Src1 = MI.getOperand(Src1Idx); - + if (!isLegalToSwap(MI, Src0Idx, &Src0, Src1Idx, &Src1)) { + return nullptr; + } MachineInstr *CommutedMI = nullptr; if (Src0.isReg() && Src1.isReg()) { - if (isOperandLegal(MI, Src1Idx, &Src0)) { - // Be sure to copy the source modifiers to the right place.
- CommutedMI - = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx); - } - + // Be sure to copy the source modifiers to the right place. + CommutedMI + = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx); } else if (Src0.isReg() && !Src1.isReg()) { - if (isOperandLegal(MI, Src1Idx, &Src0)) - CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1); + CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1); } else if (!Src0.isReg() && Src1.isReg()) { - if (isOperandLegal(MI, Src1Idx, &Src0)) - CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0); - } else if (isInlineConstant(Src0) && isInlineConstant(Src1)) { - if (isOperandLegal(MI, Src1Idx, &Src0)) - CommutedMI = swapInlineConstOperands(MI, Src0, Src1); + CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0); + } else if (Src0.isImm() && Src1.isImm()) { + CommutedMI = swapImmOperands(MI, Src0, Src1); } else { // FIXME: Found two non registers to commute. This does happen. return nullptr; @@ -5809,6 +5853,52 @@ bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI, return RC->hasSuperClassEq(DRC); } +bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, + unsigned OpIdx, + const MachineOperand &MO) const { + const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx]; + unsigned Opc = MI.getOpcode(); + + if (!isLegalRegOperand(MRI, OpInfo, MO)) + return false; + + // check Accumulate GPR operand + bool IsAGPR = RI.isAGPR(MRI, MO.getReg()); + if (IsAGPR && !ST.hasMAIInsts()) + return false; + if (IsAGPR && + (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) && + (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc))) + return false; + // Atomics should have both vdst and vdata either vgpr or agpr. + const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); + const int DataIdx = AMDGPU::getNamedOperandIdx(Opc, + isDS(Opc) ? 
AMDGPU::OpName::data0 : AMDGPU::OpName::vdata); + if ((int)OpIdx == VDstIdx && DataIdx != -1 && + MI.getOperand(DataIdx).isReg() && + RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR) + return false; + if ((int)OpIdx == DataIdx) { + if (VDstIdx != -1 && + RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR) + return false; + // DS instructions with 2 src operands also must have tied RC. + const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc, + AMDGPU::OpName::data1); + if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() && + RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR) + return false; + } + + // Check V_ACCVGPR_WRITE_B32_e64 + if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() && + (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) && + RI.isSGPRReg(MRI, MO.getReg())) + return false; + return true; +} + bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const { @@ -5871,40 +5961,7 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, if (MO->isReg()) { if (!DefinedRC) return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN; - if (!isLegalRegOperand(MRI, OpInfo, *MO)) - return false; - bool IsAGPR = RI.isAGPR(MRI, MO->getReg()); - if (IsAGPR && !ST.hasMAIInsts()) - return false; - unsigned Opc = MI.getOpcode(); - if (IsAGPR && - (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) && - (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc))) - return false; - // Atomics should have both vdst and vdata either vgpr or agpr. - const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); - const int DataIdx = AMDGPU::getNamedOperandIdx(Opc, - isDS(Opc) ? 
AMDGPU::OpName::data0 : AMDGPU::OpName::vdata); - if ((int)OpIdx == VDstIdx && DataIdx != -1 && - MI.getOperand(DataIdx).isReg() && - RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR) - return false; - if ((int)OpIdx == DataIdx) { - if (VDstIdx != -1 && - RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR) - return false; - // DS instructions with 2 src operands also must have tied RC. - const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc, - AMDGPU::OpName::data1); - if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() && - RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR) - return false; - } - if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() && - (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) && - RI.isSGPRReg(MRI, MO->getReg())) - return false; - return true; + return isLegalRegOperand(MI, OpIdx, *MO); } if (MO->isImm()) { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 8f9ca6141816d..a05f8d5efff3a 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -193,7 +193,9 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { bool swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, unsigned Src0OpName, MachineOperand &Src1, unsigned Src1OpName) const; - + bool isLegalToSwap(const MachineInstr &MI, + unsigned fromIdx, const MachineOperand *fromMO, + unsigned toIdx, const MachineOperand *toMO) const; MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override; @@ -1209,11 +1211,14 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { const MachineOperand &MO) const; /// Check if \p MO (a register operand) is a legal register for the - /// given operand description. + /// given operand description or operand index. 
+ /// The operand index version provide more legality checks bool isLegalRegOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const; - + bool isLegalRegOperand(const MachineInstr &MI, + unsigned OpIdx, + const MachineOperand &MO) const; /// Legalize operands in \p MI by either commuting it or inserting a /// copy of src1. void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const; diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll index fc89615059152..cdea4fd158b04 100644 --- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll +++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll @@ -355,7 +355,7 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) { ; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1010-NEXT: v_add_co_u32 v0, s2, 0x56789876, v0 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 -; GFX1010-NEXT: v_add_co_ci_u32_e64 v1, s2, 0, 0x1234, s2 +; GFX1010-NEXT: v_add_co_ci_u32_e64 v1, s2, 0x1234, 0, s2 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1010-NEXT: s_endpgm @@ -365,7 +365,7 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) { ; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1030W32-NEXT: v_add_co_u32 v0, s2, 0x56789876, v0 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 -; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0x1234, s2 +; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v1, null, 0x1234, 0, s2 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1030W32-NEXT: s_endpgm @@ -375,7 +375,7 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) { ; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1030W64-NEXT: v_add_co_u32 v0, s[2:3], 0x56789876, v0 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 -; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0x1234, s[2:3] +; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v1, null, 
0x1234, 0, s[2:3] ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1030W64-NEXT: s_endpgm @@ -387,7 +387,7 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) { ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v0, s2, 0x56789876, v0 -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0x1234, s2 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0x1234, 0, s2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/cmp_shrink.mir b/llvm/test/CodeGen/AMDGPU/cmp_shrink.mir index 9b3579b43a38a..ae3fa153f381a 100644 --- a/llvm/test/CodeGen/AMDGPU/cmp_shrink.mir +++ b/llvm/test/CodeGen/AMDGPU/cmp_shrink.mir @@ -7,6 +7,6 @@ name: not_shrink_icmp body: | bb.0: ; GCN-LABEL: name: not_shrink_icmp - ; GCN: S_CMP_GT_I32 1, 65, implicit-def $scc + ; GCN: S_CMP_LT_I32 65, 1, implicit-def $scc S_CMP_GT_I32 1, 65, implicit-def $scc ... diff --git a/llvm/test/CodeGen/AMDGPU/commute-op-sel.mir b/llvm/test/CodeGen/AMDGPU/commute-op-sel.mir index e332ba4cd672c..7344e08f679b8 100644 --- a/llvm/test/CodeGen/AMDGPU/commute-op-sel.mir +++ b/llvm/test/CodeGen/AMDGPU/commute-op-sel.mir @@ -1,9 +1,9 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -run-pass=machine-cse -verify-machineinstrs %s -o - 2>&1 | FileCheck --check-prefix=GCN %s --- -name: test_machine_cse_op_sel +name: test_machine_cse_op_sel_v_add_nc_u16 body: | - ; GCN-LABEL: name: test_machine_cse_op_sel + ; GCN-LABEL: name: test_machine_cse_op_sel_v_add_nc_u16 ; GCN: %2:vgpr_32 = V_ADD_NC_U16_e64 0, %0, 0, %1, 1, 0, implicit $mode, implicit $exec ; GCN-NEXT: DS_WRITE2_B32_gfx9 undef %4:vgpr_32, %2, %2, 0, 1, 0, implicit $exec bb.0: @@ -15,9 +15,9 @@ body: | ... 
--- -name: test_machine_cse_op_inline_const +name: test_machine_cse_op_const_v_add_nc_u16 body: | - ; GCN-LABEL: name: test_machine_cse_op_inline_const + ; GCN-LABEL: name: test_machine_cse_op_const_v_add_nc_u16 ; GCN: %0:vgpr_32 = V_ADD_NC_U16_e64 0, 64, 0, -3, 1, 0, implicit $mode, implicit $exec ; GCN-NEXT: DS_WRITE2_B32_gfx9 undef %2:vgpr_32, %0, %0, 0, 1, 0, implicit $exec bb.0: @@ -25,3 +25,35 @@ body: | %2:vgpr_32 = V_ADD_NC_U16_e64 0, -3, 0, 64, 1, 0, implicit $mode, implicit $exec DS_WRITE2_B32_gfx9 undef %4:vgpr_32, %1, %2, 0, 1, 0, implicit $exec ... + +--- +name: test_machine_cse_op_v_fma_f16 +tracksRegLiveness: true +body: | + ; GCN-LABEL: name: test_machine_cse_op_v_fma_f16 + ; GCN: %3:vgpr_32 = nofpexcept V_FMA_F16_e64 0, %0, 0, %1, 0, %2, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: DS_WRITE2_B32_gfx9 undef %5:vgpr_32, %3, %3, 0, 1, 0, implicit $exec + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = COPY $vgpr2 + %3:vgpr_32 = nofpexcept V_FMA_F16_e64 0, %0, 0, %1, 0, %2, 0, 0, implicit $mode, implicit $exec + %4:vgpr_32 = nofpexcept V_FMA_F16_e64 0, %1, 0, %0, 0, %2, 0, 0, implicit $mode, implicit $exec + DS_WRITE2_B32_gfx9 undef %5:vgpr_32, %3, %4, 0, 1, 0, implicit $exec +... + +--- +name: test_machine_cse_op_const_v_fma_f16 +tracksRegLiveness: true +body: | + ; GCN-LABEL: name: test_machine_cse_op_const_v_fma_f16 + ; GCN: %1:vgpr_32 = nofpexcept V_FMA_F16_e64 0, 3481272320, 0, 1, 0, %0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: DS_WRITE2_B32_gfx9 undef %3:vgpr_32, %1, %1, 0, 1, 0, implicit $exec + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = nofpexcept V_FMA_F16_e64 0, 1, 0, 3481272320, 0, %0, 0, 0, implicit $mode, implicit $exec + %2:vgpr_32 = nofpexcept V_FMA_F16_e64 0, 3481272320, 0, 1, 0, %0, 0, 0, implicit $mode, implicit $exec + DS_WRITE2_B32_gfx9 undef %3:vgpr_32, %1, %2, 0, 1, 0, implicit $exec +... 
From 5e15e728eb38187c5dfe39cb44c59edc6bddbb49 Mon Sep 17 00:00:00 2001 From: shore <372660931@qq.com> Date: Wed, 8 Jan 2025 11:27:50 +0800 Subject: [PATCH 09/18] add tests & merge_main --- llvm/test/CodeGen/AMDGPU/commute-op-sel.mir | 66 ++++++++++++++++++++- llvm/test/lit.cfg.py | 2 +- 2 files changed, 66 insertions(+), 2 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/commute-op-sel.mir b/llvm/test/CodeGen/AMDGPU/commute-op-sel.mir index 7344e08f679b8..823d8d496ea56 100644 --- a/llvm/test/CodeGen/AMDGPU/commute-op-sel.mir +++ b/llvm/test/CodeGen/AMDGPU/commute-op-sel.mir @@ -51,9 +51,73 @@ body: | ; GCN: %1:vgpr_32 = nofpexcept V_FMA_F16_e64 0, 3481272320, 0, 1, 0, %0, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: DS_WRITE2_B32_gfx9 undef %3:vgpr_32, %1, %1, 0, 1, 0, implicit $exec bb.0: - liveins: $vgpr0, $vgpr1, $vgpr2 + liveins: $vgpr0 %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = nofpexcept V_FMA_F16_e64 0, 1, 0, 3481272320, 0, %0, 0, 0, implicit $mode, implicit $exec %2:vgpr_32 = nofpexcept V_FMA_F16_e64 0, 3481272320, 0, 1, 0, %0, 0, 0, implicit $mode, implicit $exec DS_WRITE2_B32_gfx9 undef %3:vgpr_32, %1, %2, 0, 1, 0, implicit $exec ... + +--- +name: test_machine_cse_op_v_MAD_u16 +tracksRegLiveness: true +body: | + ; GCN-LABEL: name: test_machine_cse_op_v_MAD_u16 + ; GCN: %3:vgpr_32 = nofpexcept V_MAD_U32_U16_e64 0, %0, 0, %1, 0, %2, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: DS_WRITE2_B32_gfx9 undef %5:vgpr_32, %3, %3, 0, 1, 0, implicit $exec + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = COPY $vgpr2 + %3:vgpr_32 = nofpexcept V_MAD_U32_U16_e64 0, %0, 0, %1, 0, %2, 0, 0, implicit $mode, implicit $exec + %4:vgpr_32 = nofpexcept V_MAD_U32_U16_e64 0, %1, 0, %0, 0, %2, 0, 0, implicit $mode, implicit $exec + DS_WRITE2_B32_gfx9 undef %5:vgpr_32, %3, %4, 0, 1, 0, implicit $exec +... 
+ +--- +name: test_machine_cse_op_const_v_MAD_u16 +tracksRegLiveness: true +body: | + ; GCN-LABEL: name: test_machine_cse_op_const_v_MAD_u16 + ; GCN: %1:vgpr_32 = nofpexcept V_MAD_U32_U16_e64 0, 1, 0, 64, 0, %0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: DS_WRITE2_B32_gfx9 undef %3:vgpr_32, %1, %1, 0, 1, 0, implicit $exec + bb.0: + liveins: $vgpr0 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = nofpexcept V_MAD_U32_U16_e64 0, 1, 0, 64, 0, %0, 0, 0, implicit $mode, implicit $exec + %2:vgpr_32 = nofpexcept V_MAD_U32_U16_e64 0, 64, 0, 1, 0, %0, 0, 0, implicit $mode, implicit $exec + DS_WRITE2_B32_gfx9 undef %3:vgpr_32, %1, %2, 0, 1, 0, implicit $exec +... + +--- +name: test_machine_cse_op_v_MAD_i16 +tracksRegLiveness: true +body: | + ; GCN-LABEL: name: test_machine_cse_op_v_MAD_i16 + ; GCN: %3:vgpr_32 = nofpexcept V_MAD_I32_I16_e64 0, %0, 0, %1, 0, %2, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: DS_WRITE2_B32_gfx9 undef %5:vgpr_32, %3, %3, 0, 1, 0, implicit $exec + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = COPY $vgpr2 + %3:vgpr_32 = nofpexcept V_MAD_I32_I16_e64 0, %0, 0, %1, 0, %2, 0, 0, implicit $mode, implicit $exec + %4:vgpr_32 = nofpexcept V_MAD_I32_I16_e64 0, %1, 0, %0, 0, %2, 0, 0, implicit $mode, implicit $exec + DS_WRITE2_B32_gfx9 undef %5:vgpr_32, %3, %4, 0, 1, 0, implicit $exec +... 
+ +--- +name: test_machine_cse_op_const_v_MAD_i16 +tracksRegLiveness: true +body: | + ; GCN-LABEL: name: test_machine_cse_op_const_v_MAD_i16 + ; GCN: %1:vgpr_32 = nofpexcept V_MAD_I32_I16_e64 0, 1, 0, 64, 0, %0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: DS_WRITE2_B32_gfx9 undef %3:vgpr_32, %1, %1, 0, 1, 0, implicit $exec + bb.0: + liveins: $vgpr0 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = nofpexcept V_MAD_I32_I16_e64 0, 1, 0, 64, 0, %0, 0, 0, implicit $mode, implicit $exec + %2:vgpr_32 = nofpexcept V_MAD_I32_I16_e64 0, 64, 0, 1, 0, %0, 0, 0, implicit $mode, implicit $exec + DS_WRITE2_B32_gfx9 undef %3:vgpr_32, %1, %2, 0, 1, 0, implicit $exec +... diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py index 5a03a85386e0a..9839f823ac9f4 100644 --- a/llvm/test/lit.cfg.py +++ b/llvm/test/lit.cfg.py @@ -463,7 +463,7 @@ def have_cxx_shared_library(): print("could not exec llvm-readobj") return False - readobj_out = readobj_cmd.stdout.read().decode("ascii") + readobj_out = readobj_cmd.stdout.read().decode("utf-8") readobj_cmd.wait() regex = re.compile(r"(libc\+\+|libstdc\+\+|msvcp).*\.(so|dylib|dll)") From 004a82d671b72e5989680baa289f546cccbd201b Mon Sep 17 00:00:00 2001 From: shore <372660931@qq.com> Date: Wed, 8 Jan 2025 21:06:11 +0800 Subject: [PATCH 10/18] fix lit.cfg.py --- llvm/test/lit.cfg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py index 9839f823ac9f4..2adba3dc8d693 100644 --- a/llvm/test/lit.cfg.py +++ b/llvm/test/lit.cfg.py @@ -463,7 +463,7 @@ def have_cxx_shared_library(): print("could not exec llvm-readobj") return False - readobj_out = readobj_cmd.stdout.read().decode("utf-8") + readobj_out = readobj_cmd.stdout.read().decode("acsii") readobj_cmd.wait() regex = re.compile(r"(libc\+\+|libstdc\+\+|msvcp).*\.(so|dylib|dll)") From dc2739f135fa7a69b26875ad4e178fc52ff1c392 Mon Sep 17 00:00:00 2001 From: shore <372660931@qq.com> Date: Wed, 8 Jan 2025 21:07:49 +0800 Subject: [PATCH 11/18] 
fix lit.cfg.py --- llvm/test/lit.cfg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py index 2adba3dc8d693..5a03a85386e0a 100644 --- a/llvm/test/lit.cfg.py +++ b/llvm/test/lit.cfg.py @@ -463,7 +463,7 @@ def have_cxx_shared_library(): print("could not exec llvm-readobj") return False - readobj_out = readobj_cmd.stdout.read().decode("acsii") + readobj_out = readobj_cmd.stdout.read().decode("ascii") readobj_cmd.wait() regex = re.compile(r"(libc\+\+|libstdc\+\+|msvcp).*\.(so|dylib|dll)") From 161a2b971ad24d02ba6582f008d403f1f269631f Mon Sep 17 00:00:00 2001 From: shore <372660931@qq.com> Date: Thu, 9 Jan 2025 10:47:57 +0800 Subject: [PATCH 12/18] adjust comment & merge main --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 224a5e81b1125..8d03ccf5c0c1c 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2780,10 +2780,10 @@ bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0X); } - // Swap doesn't breach constantbus or literal limits + // Swap doesn't breach constant bus or literal limits // It may move literal to position other than src0, this is not allowed pre-gfx10 // However, most test cases need literals in Src0 for VOP - // FIX-ME: After gfx9, literal can be in place other than Src0 + // FIXME: After gfx9, literal can be in place other than Src0 if (isVALU(MI)){ if ((int)OpIdx0 == Src0Idx && !MO0->isReg() && !isInlineConstant(*MO0, OpInfo1)) @@ -2804,8 +2804,8 @@ bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, return isLegalRegOperand(MI, OpIdx0, *MO1); } - // No need to check 64bit literals since swapping does not bring new - // 64bit literals into current instruction to fold to 32bit + // No need to check 64-bit literals since 
swapping does not bring new + // 64-bit literals into current instruction to fold to 32-bit return isImmOperandLegal(MI, OpIdx1, *MO0); } From 1689c1e84c34862aade60fbcbc7a42a3fa3d7b39 Mon Sep 17 00:00:00 2001 From: ShoreShen <372660931@qq.com> Date: Fri, 10 Jan 2025 17:51:40 +0800 Subject: [PATCH 13/18] adjust case & merge main --- llvm/test/CodeGen/AMDGPU/commute-op-sel.mir | 52 ++++++++++----------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/commute-op-sel.mir b/llvm/test/CodeGen/AMDGPU/commute-op-sel.mir index 823d8d496ea56..61cd634938bd4 100644 --- a/llvm/test/CodeGen/AMDGPU/commute-op-sel.mir +++ b/llvm/test/CodeGen/AMDGPU/commute-op-sel.mir @@ -15,9 +15,9 @@ body: | ... --- -name: test_machine_cse_op_const_v_add_nc_u16 +name: test_machine_cse_op_sel_const_v_add_nc_u16 body: | - ; GCN-LABEL: name: test_machine_cse_op_const_v_add_nc_u16 + ; GCN-LABEL: name: test_machine_cse_op_sel_const_v_add_nc_u16 ; GCN: %0:vgpr_32 = V_ADD_NC_U16_e64 0, 64, 0, -3, 1, 0, implicit $mode, implicit $exec ; GCN-NEXT: DS_WRITE2_B32_gfx9 undef %2:vgpr_32, %0, %0, 0, 1, 0, implicit $exec bb.0: @@ -27,10 +27,10 @@ body: | ... --- -name: test_machine_cse_op_v_fma_f16 +name: test_machine_cse_op_sel_v_fma_f16 tracksRegLiveness: true body: | - ; GCN-LABEL: name: test_machine_cse_op_v_fma_f16 + ; GCN-LABEL: name: test_machine_cse_op_sel_v_fma_f16 ; GCN: %3:vgpr_32 = nofpexcept V_FMA_F16_e64 0, %0, 0, %1, 0, %2, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: DS_WRITE2_B32_gfx9 undef %5:vgpr_32, %3, %3, 0, 1, 0, implicit $exec bb.0: @@ -44,10 +44,10 @@ body: | ... 
--- -name: test_machine_cse_op_const_v_fma_f16 +name: test_machine_cse_op_sel_const_v_fma_f16 tracksRegLiveness: true body: | - ; GCN-LABEL: name: test_machine_cse_op_const_v_fma_f16 + ; GCN-LABEL: name: test_machine_cse_op_sel_const_v_fma_f16 ; GCN: %1:vgpr_32 = nofpexcept V_FMA_F16_e64 0, 3481272320, 0, 1, 0, %0, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: DS_WRITE2_B32_gfx9 undef %3:vgpr_32, %1, %1, 0, 1, 0, implicit $exec bb.0: @@ -59,65 +59,65 @@ body: | ... --- -name: test_machine_cse_op_v_MAD_u16 +name: test_machine_cse_op_sel_v_MAD_u16 tracksRegLiveness: true body: | - ; GCN-LABEL: name: test_machine_cse_op_v_MAD_u16 - ; GCN: %3:vgpr_32 = nofpexcept V_MAD_U32_U16_e64 0, %0, 0, %1, 0, %2, 0, 0, implicit $mode, implicit $exec + ; GCN-LABEL: name: test_machine_cse_op_sel_v_MAD_u16 + ; GCN: %3:vgpr_32 = V_MAD_U32_U16_e64 0, %0, 0, %1, 0, %2, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: DS_WRITE2_B32_gfx9 undef %5:vgpr_32, %3, %3, 0, 1, 0, implicit $exec bb.0: liveins: $vgpr0, $vgpr1, $vgpr2 %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = COPY $vgpr1 %2:vgpr_32 = COPY $vgpr2 - %3:vgpr_32 = nofpexcept V_MAD_U32_U16_e64 0, %0, 0, %1, 0, %2, 0, 0, implicit $mode, implicit $exec - %4:vgpr_32 = nofpexcept V_MAD_U32_U16_e64 0, %1, 0, %0, 0, %2, 0, 0, implicit $mode, implicit $exec + %3:vgpr_32 = V_MAD_U32_U16_e64 0, %0, 0, %1, 0, %2, 0, 0, implicit $mode, implicit $exec + %4:vgpr_32 = V_MAD_U32_U16_e64 0, %1, 0, %0, 0, %2, 0, 0, implicit $mode, implicit $exec DS_WRITE2_B32_gfx9 undef %5:vgpr_32, %3, %4, 0, 1, 0, implicit $exec ... 
--- -name: test_machine_cse_op_const_v_MAD_u16 +name: test_machine_cse_op_sel_const_v_MAD_u16 tracksRegLiveness: true body: | - ; GCN-LABEL: name: test_machine_cse_op_const_v_MAD_u16 - ; GCN: %1:vgpr_32 = nofpexcept V_MAD_U32_U16_e64 0, 1, 0, 64, 0, %0, 0, 0, implicit $mode, implicit $exec + ; GCN-LABEL: name: test_machine_cse_op_sel_const_v_MAD_u16 + ; GCN: %1:vgpr_32 = V_MAD_U32_U16_e64 0, 1, 0, 64, 0, %0, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: DS_WRITE2_B32_gfx9 undef %3:vgpr_32, %1, %1, 0, 1, 0, implicit $exec bb.0: liveins: $vgpr0 %0:vgpr_32 = COPY $vgpr0 - %1:vgpr_32 = nofpexcept V_MAD_U32_U16_e64 0, 1, 0, 64, 0, %0, 0, 0, implicit $mode, implicit $exec - %2:vgpr_32 = nofpexcept V_MAD_U32_U16_e64 0, 64, 0, 1, 0, %0, 0, 0, implicit $mode, implicit $exec + %1:vgpr_32 = V_MAD_U32_U16_e64 0, 1, 0, 64, 0, %0, 0, 0, implicit $mode, implicit $exec + %2:vgpr_32 = V_MAD_U32_U16_e64 0, 64, 0, 1, 0, %0, 0, 0, implicit $mode, implicit $exec DS_WRITE2_B32_gfx9 undef %3:vgpr_32, %1, %2, 0, 1, 0, implicit $exec ... 
--- -name: test_machine_cse_op_v_MAD_i16 +name: test_machine_cse_op_sel_v_MAD_i16 tracksRegLiveness: true body: | - ; GCN-LABEL: name: test_machine_cse_op_v_MAD_i16 - ; GCN: %3:vgpr_32 = nofpexcept V_MAD_I32_I16_e64 0, %0, 0, %1, 0, %2, 0, 0, implicit $mode, implicit $exec + ; GCN-LABEL: name: test_machine_cse_op_sel_v_MAD_i16 + ; GCN: %3:vgpr_32 = V_MAD_I32_I16_e64 0, %0, 0, %1, 0, %2, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: DS_WRITE2_B32_gfx9 undef %5:vgpr_32, %3, %3, 0, 1, 0, implicit $exec bb.0: liveins: $vgpr0, $vgpr1, $vgpr2 %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = COPY $vgpr1 %2:vgpr_32 = COPY $vgpr2 - %3:vgpr_32 = nofpexcept V_MAD_I32_I16_e64 0, %0, 0, %1, 0, %2, 0, 0, implicit $mode, implicit $exec - %4:vgpr_32 = nofpexcept V_MAD_I32_I16_e64 0, %1, 0, %0, 0, %2, 0, 0, implicit $mode, implicit $exec + %3:vgpr_32 = V_MAD_I32_I16_e64 0, %0, 0, %1, 0, %2, 0, 0, implicit $mode, implicit $exec + %4:vgpr_32 = V_MAD_I32_I16_e64 0, %1, 0, %0, 0, %2, 0, 0, implicit $mode, implicit $exec DS_WRITE2_B32_gfx9 undef %5:vgpr_32, %3, %4, 0, 1, 0, implicit $exec ... 
--- -name: test_machine_cse_op_const_v_MAD_i16 +name: test_machine_cse_op_sel_const_v_MAD_i16 tracksRegLiveness: true body: | - ; GCN-LABEL: name: test_machine_cse_op_const_v_MAD_i16 - ; GCN: %1:vgpr_32 = nofpexcept V_MAD_I32_I16_e64 0, 1, 0, 64, 0, %0, 0, 0, implicit $mode, implicit $exec + ; GCN-LABEL: name: test_machine_cse_op_sel_const_v_MAD_i16 + ; GCN: %1:vgpr_32 = V_MAD_I32_I16_e64 0, 1, 0, 64, 0, %0, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: DS_WRITE2_B32_gfx9 undef %3:vgpr_32, %1, %1, 0, 1, 0, implicit $exec bb.0: liveins: $vgpr0 %0:vgpr_32 = COPY $vgpr0 - %1:vgpr_32 = nofpexcept V_MAD_I32_I16_e64 0, 1, 0, 64, 0, %0, 0, 0, implicit $mode, implicit $exec - %2:vgpr_32 = nofpexcept V_MAD_I32_I16_e64 0, 64, 0, 1, 0, %0, 0, 0, implicit $mode, implicit $exec + %1:vgpr_32 = V_MAD_I32_I16_e64 0, 1, 0, 64, 0, %0, 0, 0, implicit $mode, implicit $exec + %2:vgpr_32 = V_MAD_I32_I16_e64 0, 64, 0, 1, 0, %0, 0, 0, implicit $mode, implicit $exec DS_WRITE2_B32_gfx9 undef %3:vgpr_32, %1, %2, 0, 1, 0, implicit $exec ... From 9faf423ab9e38232d65809bb6a214beda0881c38 Mon Sep 17 00:00:00 2001 From: shore <372660931@qq.com> Date: Mon, 13 Jan 2025 12:27:26 +0800 Subject: [PATCH 14/18] fix inconsistent capitalization --- llvm/test/CodeGen/AMDGPU/commute-op-sel.mir | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/commute-op-sel.mir b/llvm/test/CodeGen/AMDGPU/commute-op-sel.mir index 61cd634938bd4..3c4e4a5c6c20b 100644 --- a/llvm/test/CodeGen/AMDGPU/commute-op-sel.mir +++ b/llvm/test/CodeGen/AMDGPU/commute-op-sel.mir @@ -59,7 +59,7 @@ body: | ... --- -name: test_machine_cse_op_sel_v_MAD_u16 +name: test_machine_cse_op_sel_v_mad_u16 tracksRegLiveness: true body: | ; GCN-LABEL: name: test_machine_cse_op_sel_v_MAD_u16 @@ -76,7 +76,7 @@ body: | ... 
--- -name: test_machine_cse_op_sel_const_v_MAD_u16 +name: test_machine_cse_op_sel_const_v_mad_u16 tracksRegLiveness: true body: | ; GCN-LABEL: name: test_machine_cse_op_sel_const_v_MAD_u16 @@ -91,7 +91,7 @@ body: | ... --- -name: test_machine_cse_op_sel_v_MAD_i16 +name: test_machine_cse_op_sel_v_mad_i16 tracksRegLiveness: true body: | ; GCN-LABEL: name: test_machine_cse_op_sel_v_MAD_i16 @@ -108,7 +108,7 @@ body: | ... --- -name: test_machine_cse_op_sel_const_v_MAD_i16 +name: test_machine_cse_op_sel_const_v_mad_i16 tracksRegLiveness: true body: | ; GCN-LABEL: name: test_machine_cse_op_sel_const_v_MAD_i16 From 19b8ad4c6a305b9f491f64cba3137cc878250b38 Mon Sep 17 00:00:00 2001 From: shore <372660931@qq.com> Date: Mon, 13 Jan 2025 15:44:40 +0800 Subject: [PATCH 15/18] fix test case --- llvm/test/CodeGen/AMDGPU/commute-op-sel.mir | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/commute-op-sel.mir b/llvm/test/CodeGen/AMDGPU/commute-op-sel.mir index 3c4e4a5c6c20b..9274c995dde92 100644 --- a/llvm/test/CodeGen/AMDGPU/commute-op-sel.mir +++ b/llvm/test/CodeGen/AMDGPU/commute-op-sel.mir @@ -62,7 +62,7 @@ body: | name: test_machine_cse_op_sel_v_mad_u16 tracksRegLiveness: true body: | - ; GCN-LABEL: name: test_machine_cse_op_sel_v_MAD_u16 + ; GCN-LABEL: name: test_machine_cse_op_sel_v_mad_u16 ; GCN: %3:vgpr_32 = V_MAD_U32_U16_e64 0, %0, 0, %1, 0, %2, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: DS_WRITE2_B32_gfx9 undef %5:vgpr_32, %3, %3, 0, 1, 0, implicit $exec bb.0: @@ -79,7 +79,7 @@ body: | name: test_machine_cse_op_sel_const_v_mad_u16 tracksRegLiveness: true body: | - ; GCN-LABEL: name: test_machine_cse_op_sel_const_v_MAD_u16 + ; GCN-LABEL: name: test_machine_cse_op_sel_const_v_mad_u16 ; GCN: %1:vgpr_32 = V_MAD_U32_U16_e64 0, 1, 0, 64, 0, %0, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: DS_WRITE2_B32_gfx9 undef %3:vgpr_32, %1, %1, 0, 1, 0, implicit $exec bb.0: @@ -94,7 +94,7 @@ body: | name: 
test_machine_cse_op_sel_v_mad_i16 tracksRegLiveness: true body: | - ; GCN-LABEL: name: test_machine_cse_op_sel_v_MAD_i16 + ; GCN-LABEL: name: test_machine_cse_op_sel_v_mad_i16 ; GCN: %3:vgpr_32 = V_MAD_I32_I16_e64 0, %0, 0, %1, 0, %2, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: DS_WRITE2_B32_gfx9 undef %5:vgpr_32, %3, %3, 0, 1, 0, implicit $exec bb.0: @@ -111,7 +111,7 @@ body: | name: test_machine_cse_op_sel_const_v_mad_i16 tracksRegLiveness: true body: | - ; GCN-LABEL: name: test_machine_cse_op_sel_const_v_MAD_i16 + ; GCN-LABEL: name: test_machine_cse_op_sel_const_v_mad_i16 ; GCN: %1:vgpr_32 = V_MAD_I32_I16_e64 0, 1, 0, 64, 0, %0, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: DS_WRITE2_B32_gfx9 undef %3:vgpr_32, %1, %1, 0, 1, 0, implicit $exec bb.0: From 0a89dc9afbd3e3f5bbcc1f001616e2e228ce1bcf Mon Sep 17 00:00:00 2001 From: shore <372660931@qq.com> Date: Mon, 20 Jan 2025 09:18:43 +0800 Subject: [PATCH 16/18] merge main --- .../CodeGen/AMDGPU/shrink-add-sub-constant.ll | 186 +++++++++++++----- 1 file changed, 141 insertions(+), 45 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll index 3c4f7049715e9..01528cdf7c125 100644 --- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll @@ -1346,18 +1346,44 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; GFX11-SDAG-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] ; GFX11-SDAG-TRUE16-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_test_i16_x_sub_64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: global_load_u16 v1, v0, s[2:3] -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_add_nc_u16 v1, 0xffc0, 
v1 -; GFX11-GISEL-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-GISEL-NEXT: s_endpgm +; GFX11-SDAG-FAKE16-LABEL: v_test_i16_x_sub_64: +; GFX11-SDAG-FAKE16: ; %bb.0: +; GFX11-SDAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_sub_nc_u16 v1, v1, 64 +; GFX11-SDAG-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-SDAG-FAKE16-NEXT: s_endpgm +; +; GFX11-GISEL-TRUE16-LABEL: v_test_i16_x_sub_64: +; GFX11-GISEL-TRUE16: ; %bb.0: +; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v1.l, 0xffc0, v1.l +; GFX11-GISEL-TRUE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-GISEL-TRUE16-NEXT: s_endpgm +; +; GFX11-GISEL-FAKE16-LABEL: v_test_i16_x_sub_64: +; GFX11-GISEL-FAKE16: ; %bb.0: +; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_add_nc_u16 v1, 0xffc0, v1 +; GFX11-GISEL-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-GISEL-FAKE16-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() 
%tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i16, ptr addrspace(1) %in, i64 %tid.ext @@ -1509,21 +1535,53 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out ; GFX11-SDAG-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-SDAG-TRUE16-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_test_i16_x_sub_64_zext_to_i32: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v1, 1, v0 -; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: global_load_u16 v1, v1, s[2:3] -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_add_nc_u16 v1, 0xffc0, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-GISEL-NEXT: s_endpgm +; GFX11-SDAG-FAKE16-LABEL: v_test_i16_x_sub_64_zext_to_i32: +; GFX11-SDAG-FAKE16: ; %bb.0: +; GFX11-SDAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0 +; GFX11-SDAG-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: global_load_u16 v1, v1, s[2:3] +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_sub_nc_u16 v1, v1, 64 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-SDAG-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-FAKE16-NEXT: s_endpgm +; +; GFX11-GISEL-TRUE16-LABEL: v_test_i16_x_sub_64_zext_to_i32: +; GFX11-GISEL-TRUE16: ; %bb.0: +; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v1, 
0x3ff, v0 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v1 +; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: global_load_u16 v0, v0, s[2:3] +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.l, 0xffc0, v0.l +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-GISEL-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-TRUE16-NEXT: s_endpgm +; +; GFX11-GISEL-FAKE16-LABEL: v_test_i16_x_sub_64_zext_to_i32: +; GFX11-GISEL-FAKE16: ; %bb.0: +; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0 +; GFX11-GISEL-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: global_load_u16 v1, v1, s[2:3] +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_add_nc_u16 v1, 0xffc0, v1 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-GISEL-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-FAKE16-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i16, ptr addrspace(1) %in, i64 %tid.ext @@ -1720,24 +1778,62 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out, ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-TRUE16-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_test_i16_x_sub_64_multi_use: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; 
GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_add_nc_u16 v1, 0xffc0, v1 -; GFX11-GISEL-NEXT: v_add_nc_u16 v2, 0xffc0, v2 -; GFX11-GISEL-NEXT: global_store_b16 v0, v1, s[0:1] dlc -; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-GISEL-NEXT: global_store_b16 v0, v2, s[0:1] dlc -; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-GISEL-NEXT: s_endpgm +; GFX11-SDAG-FAKE16-LABEL: v_test_i16_x_sub_64_multi_use: +; GFX11-SDAG-FAKE16: ; %bb.0: +; GFX11-SDAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_sub_nc_u16 v1, v1, 64 +; GFX11-SDAG-FAKE16-NEXT: v_sub_nc_u16 v2, v2, 64 +; GFX11-SDAG-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] dlc +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-FAKE16-NEXT: global_store_b16 v0, v2, s[0:1] dlc +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-FAKE16-NEXT: s_endpgm +; +; GFX11-GISEL-TRUE16-LABEL: v_test_i16_x_sub_64_multi_use: +; GFX11-GISEL-TRUE16: ; %bb.0: +; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: 
global_load_u16 v1, v0, s[2:3] glc dlc +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v1.l, 0xffc0, v1.l +; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v2.l, 0xffc0, v2.l +; GFX11-GISEL-TRUE16-NEXT: global_store_b16 v0, v1, s[0:1] dlc +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-TRUE16-NEXT: global_store_b16 v0, v2, s[0:1] dlc +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-TRUE16-NEXT: s_endpgm +; +; GFX11-GISEL-FAKE16-LABEL: v_test_i16_x_sub_64_multi_use: +; GFX11-GISEL-FAKE16: ; %bb.0: +; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_add_nc_u16 v1, 0xffc0, v1 +; GFX11-GISEL-FAKE16-NEXT: v_add_nc_u16 v2, 0xffc0, v2 +; GFX11-GISEL-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] dlc +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-FAKE16-NEXT: global_store_b16 v0, v2, s[0:1] dlc +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-FAKE16-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i16, ptr addrspace(1) %in, i64 %tid.ext From d8e6cb788d88d2e1d2f14755126b2ab1487b960f Mon Sep 17 00:00:00 2001 From: shore <372660931@qq.com> Date: Mon, 20 Jan 2025 10:15:13 +0800 Subject: [PATCH 17/18] fix format --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 63 ++++++++++++-------------- 
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 9 ++-- 2 files changed, 34 insertions(+), 38 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index c8c3cf573c56d..3b85cb4f7a859 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2750,8 +2750,8 @@ static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI, } static MachineInstr *swapImmOperands(MachineInstr &MI, - MachineOperand &NonRegOp1, - MachineOperand &NonRegOp2) { + MachineOperand &NonRegOp1, + MachineOperand &NonRegOp2) { unsigned TargetFlags = NonRegOp1.getTargetFlags(); int64_t NonRegVal = NonRegOp1.getImm(); @@ -2762,9 +2762,9 @@ static MachineInstr *swapImmOperands(MachineInstr &MI, return &MI; } -bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, - unsigned OpIdx0, const MachineOperand *MO0, - unsigned OpIdx1, const MachineOperand *MO1) const { +bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0, + const MachineOperand *MO0, unsigned OpIdx1, + const MachineOperand *MO1) const { const MCInstrDesc &InstDesc = MI.getDesc(); const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0]; const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1]; @@ -2772,7 +2772,7 @@ bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, OpInfo1.RegClass != -1 ? RI.getRegClass(OpInfo1.RegClass) : nullptr; const TargetRegisterClass *DefinedRC0 = OpInfo1.RegClass != -1 ? 
RI.getRegClass(OpInfo0.RegClass) : nullptr; - + unsigned Opc = MI.getOpcode(); int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); if (Src0Idx == -1) { @@ -2781,15 +2781,15 @@ bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, } // Swap doesn't breach constant bus or literal limits - // It may move literal to position other than src0, this is not allowed pre-gfx10 - // However, most test cases need literals in Src0 for VOP + // It may move literal to position other than src0, this is not allowed + // pre-gfx10 However, most test cases need literals in Src0 for VOP // FIXME: After gfx9, literal can be in place other than Src0 - if (isVALU(MI)){ - if ((int)OpIdx0 == Src0Idx && - !MO0->isReg() && !isInlineConstant(*MO0, OpInfo1)) + if (isVALU(MI)) { + if ((int)OpIdx0 == Src0Idx && !MO0->isReg() && + !isInlineConstant(*MO0, OpInfo1)) return false; - if ((int)OpIdx1 == Src0Idx && - !MO1->isReg() && !isInlineConstant(*MO1, OpInfo0)) + if ((int)OpIdx1 == Src0Idx && !MO1->isReg() && + !isInlineConstant(*MO1, OpInfo0)) return false; } @@ -2797,14 +2797,14 @@ bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, if (!DefinedRC1) return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN; return isLegalRegOperand(MI, OpIdx1, *MO0); - } + } if (OpIdx0 != Src0Idx && MO1->isReg()) { if (!DefinedRC0) return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN; return isLegalRegOperand(MI, OpIdx0, *MO1); } - - // No need to check 64-bit literals since swapping does not bring new + + // No need to check 64-bit literals since swapping does not bring new // 64-bit literals into current instruction to fold to 32-bit return isImmOperandLegal(MI, OpIdx1, *MO0); @@ -2837,8 +2837,8 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, MachineInstr *CommutedMI = nullptr; if (Src0.isReg() && Src1.isReg()) { // Be sure to copy the source modifiers to the right place. 
- CommutedMI - = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx); + CommutedMI = + TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx); } else if (Src0.isReg() && !Src1.isReg()) { CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1); } else if (!Src0.isReg() && Src1.isReg()) { @@ -5877,8 +5877,7 @@ bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI, return RC->hasSuperClassEq(DRC); } -bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, - unsigned OpIdx, +bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx, const MachineOperand &MO) const { const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx]; @@ -5891,34 +5890,32 @@ bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, bool IsAGPR = RI.isAGPR(MRI, MO.getReg()); if (IsAGPR && !ST.hasMAIInsts()) return false; - if (IsAGPR && - (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) && - (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc))) + if (IsAGPR && (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) && + (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc))) return false; // Atomics should have both vdst and vdata either vgpr or agpr. const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); - const int DataIdx = AMDGPU::getNamedOperandIdx(Opc, - isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata); + const int DataIdx = AMDGPU::getNamedOperandIdx( + Opc, isDS(Opc) ? 
AMDGPU::OpName::data0 : AMDGPU::OpName::vdata); if ((int)OpIdx == VDstIdx && DataIdx != -1 && - MI.getOperand(DataIdx).isReg() && - RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR) + MI.getOperand(DataIdx).isReg() && + RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR) return false; if ((int)OpIdx == DataIdx) { if (VDstIdx != -1 && RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR) - return false; + return false; // DS instructions with 2 src operands also must have tied RC. - const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc, - AMDGPU::OpName::data1); + const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1); if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() && RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR) - return false; + return false; } // Check V_ACCVGPR_WRITE_B32_e64 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() && - (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) && - RI.isSGPRReg(MRI, MO.getReg())) + (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) && + RI.isSGPRReg(MRI, MO.getReg())) return false; return true; } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index a1d2cd94732de..a609c9abbad01 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -193,9 +193,9 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { bool swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, unsigned Src0OpName, MachineOperand &Src1, unsigned Src1OpName) const; - bool isLegalToSwap(const MachineInstr &MI, - unsigned fromIdx, const MachineOperand *fromMO, - unsigned toIdx, const MachineOperand *toMO) const; + bool isLegalToSwap(const MachineInstr &MI, unsigned fromIdx, + const MachineOperand *fromMO, unsigned toIdx, + const MachineOperand *toMO) const; MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const 
override; @@ -1225,8 +1225,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { bool isLegalRegOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const; - bool isLegalRegOperand(const MachineInstr &MI, - unsigned OpIdx, + bool isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx, const MachineOperand &MO) const; /// Legalize operands in \p MI by either commuting it or inserting a /// copy of src1. From bf1da570759f2a1b6ee99b7cf4a356b03548fdf0 Mon Sep 17 00:00:00 2001 From: shore <372660931@qq.com> Date: Tue, 21 Jan 2025 21:29:31 +0800 Subject: [PATCH 18/18] remove special handling for VOPD, since no commutable VOPD instruction --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 3b85cb4f7a859..5c20f28b3d9de 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2775,10 +2775,6 @@ bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0, unsigned Opc = MI.getOpcode(); int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); - if (Src0Idx == -1) { - // VOPD V_DUAL_* instructions use different operand names. - Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0X); - } // Swap doesn't breach constant bus or literal limits // It may move literal to position other than src0, this is not allowed