Skip to content

[AMDGPU][SDAG] Support source modifiers on select integer operands #147325

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 29 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
d4ac937
Add new test for source modifiers on select
chrisjbris Jul 2, 2025
c89274e
Populate check-lines before patching
chrisjbris Jul 2, 2025
b6b3726
[AMDGPU][SDAG] Support source modifiers as integer on select
chrisjbris Jul 7, 2025
dea39d1
Simplify switch in BitwiseToSrcModifierOp()
chrisjbris Jul 7, 2025
004dc9f
[NFC] Correct typo in BitwiseToSrcModifierOp()
chrisjbris Jul 7, 2025
b27ce62
Fix bitcast type in performSelectCombine()
chrisjbris Jul 7, 2025
f503034
Respond to first review comments
chrisjbris Jul 7, 2025
e073552
Respond to second review comments - rename function and correct test
chrisjbris Jul 7, 2025
000ddc8
[NFC] Remove incomplete dag-style comment
chrisjbris Jul 7, 2025
2604329
Make test for bitwise src mods more stringent and correct fneg-fabs o…
chrisjbris Jul 7, 2025
97d93d6
Reviewer corrections
chrisjbris Jul 8, 2025
f255ddc
Refactor to support the source modifiers on either or both operands.
chrisjbris Jul 8, 2025
2e2249a
Fix Typo.
chrisjbris Jul 8, 2025
a505a72
Respond to reviewer - Add i16 tests, simplify obtaining type
chrisjbris Jul 8, 2025
a8bd726
Inline bitcast node creation.
chrisjbris Jul 8, 2025
e5f1e67
Add functional implementation for i64
chrisjbris Jul 11, 2025
767cc11
Fix formatting
chrisjbris Jul 11, 2025
686919f
[DAGCombine] Move the AMDGPU combine to Target Independent DAGCombine
chrisjbris Jul 12, 2025
4c79caf
Remove dead code that was moved to the TI DAGCombiner
chrisjbris Jul 13, 2025
c265ed4
Canonicalise TI select operand variable names and update tests
chrisjbris Jul 13, 2025
d658adb
Fix missed clang-format
chrisjbris Jul 13, 2025
9fb7344
Suppress overzealous clang-format
chrisjbris Jul 13, 2025
29d9b3d
Suppress overzealous clang-format
chrisjbris Jul 13, 2025
cd5c732
Remove unnecessary lambda and refactor foldSelectOfSourceMods() to fi…
chrisjbris Jul 13, 2025
ec42e07
[NFC] Minor corrections to whitespace and test name
chrisjbris Jul 13, 2025
4bd51d0
Add tighter constraints to apply combine and update tests.
chrisjbris Jul 14, 2025
b0140bd
Further constrain shouldFoldSelectWithIdentityConstant(), preventing …
chrisjbris Jul 14, 2025
ce7cd98
Fix broken insert-delay-alu-bug.ll test
chrisjbris Jul 14, 2025
fe28221
Replace target-specific function names with target-independent names
chrisjbris Jul 15, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 73 additions & 0 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -602,6 +602,7 @@ namespace {
SDValue foldSelectCCToShiftAnd(const SDLoc &DL, SDValue N0, SDValue N1,
SDValue N2, SDValue N3, ISD::CondCode CC);
SDValue foldSelectOfBinops(SDNode *N);
SDValue bitmaskOperandsToSignInstructions(SDNode *N);
SDValue foldSextSetcc(SDNode *N);
SDValue foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1,
const SDLoc &DL);
Expand Down Expand Up @@ -12175,6 +12176,73 @@ SDValue DAGCombiner::foldSelectToABD(SDValue LHS, SDValue RHS, SDValue True,
return SDValue();
}

// Replace a bitwise operation that only modifies the sign bit of an integer
// with the equivalent FNEG/FABS applied to the value bitcast to floating
// point:
//   xor x, SignMask        --> fneg (bitcast x)
//   or  x, SignMask        --> fneg (fabs (bitcast x))
//   and x, MaxSignedValue  --> fabs (bitcast x)
// The constant must be the second operand, either a scalar constant or a
// constant splat. The result has the floating-point type of the same width
// (vector element count preserved). Returns an empty SDValue when N is not one
// of the patterns above, when the target's
// shouldFoldSelectWithIdentityConstant() hook declines, or when no
// floating-point type of the same scalar width exists.
static SDValue getBitMaskToInstruction(SDValue N, SelectionDAG &DAG) {
  unsigned Opc = N.getOpcode();
  if (Opc != ISD::AND && Opc != ISD::XOR && Opc != ISD::OR)
    return SDValue();

  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!TLI.shouldFoldSelectWithIdentityConstant(Opc, N->getValueType(0),
                                                ISD::SELECT, LHS, RHS))
    return SDValue();

  ConstantSDNode *CRHS = isConstOrConstSplat(RHS);
  if (!CRHS)
    return SDValue();

  EVT VT = RHS.getValueType();
  const unsigned ScalarBits = VT.getScalarSizeInBits();

  // MVT::getFloatingPointVT() is only defined for these widths and is
  // unreachable for anything else. The target hook is free to accept other
  // integer widths, so do not rely on it to filter them out.
  switch (ScalarBits) {
  case 16:
  case 32:
  case 64:
  case 80:
  case 128:
    break;
  default:
    return SDValue();
  }

  EVT FT = MVT::getFloatingPointVT(ScalarBits);
  EVT FVT = VT.isVector() ? VT.changeVectorElementType(FT) : FT;
  SDLoc SL(N);
  const APInt &Mask = CRHS->getAPIntValue();

  // Nodes are only created once a pattern is known to match, so a failed
  // match leaves no dead BITCAST behind.
  switch (Opc) {
  case ISD::XOR:
    // Flipping the sign bit is a negation.
    if (Mask.isSignMask())
      return DAG.getNode(ISD::FNEG, SL, FVT,
                         DAG.getNode(ISD::BITCAST, SL, FVT, LHS));
    break;
  case ISD::OR:
    // Forcing the sign bit on is -|x|.
    if (Mask.isSignMask()) {
      SDValue Abs = DAG.getNode(ISD::FABS, SL, FVT,
                                DAG.getNode(ISD::BITCAST, SL, FVT, LHS));
      return DAG.getNode(ISD::FNEG, SL, FVT, Abs);
    }
    break;
  case ISD::AND:
    // Clearing the sign bit is |x|.
    if (Mask.isMaxSignedValue())
      return DAG.getNode(ISD::FABS, SL, FVT,
                         DAG.getNode(ISD::BITCAST, SL, FVT, LHS));
    break;
  default:
    break;
  }
  return SDValue();
}

// Try to rewrite a select whose true and/or false operand is a sign-bit
// bitmask operation into a floating-point select fed by FNEG/FABS, then
// bitcast the result back to the original type. An operand that is not
// rewritten is bitcast to the floating-point type so both sides agree.
// Returns an empty SDValue when neither operand can be rewritten.
SDValue DAGCombiner::bitmaskOperandsToSignInstructions(SDNode *N) {
  SDValue Cond = N->getOperand(0);
  SDValue TrueVal = N->getOperand(1);
  SDValue FalseVal = N->getOperand(2);

  SDValue TrueMod = getBitMaskToInstruction(TrueVal, DAG);
  SDValue FalseMod = getBitMaskToInstruction(FalseVal, DAG);
  if (!TrueMod && !FalseMod)
    return SDValue();

  SDLoc DL(N);
  // Take the floating-point type from whichever operand was rewritten.
  EVT FloatVT = (TrueMod ? TrueMod : FalseMod).getValueType();

  if (!TrueMod)
    TrueMod = DAG.getNode(ISD::BITCAST, DL, FloatVT, TrueVal);
  if (!FalseMod)
    FalseMod = DAG.getNode(ISD::BITCAST, DL, FloatVT, FalseVal);

  SDValue FloatSelect =
      DAG.getNode(ISD::SELECT, DL, FloatVT, Cond, TrueMod, FalseMod);
  return DAG.getNode(ISD::BITCAST, DL, N->getValueType(0), FloatSelect);
}

SDValue DAGCombiner::visitSELECT(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
Expand Down Expand Up @@ -12390,6 +12458,11 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
if (SDValue R = combineSelectAsExtAnd(N0, N1, N2, DL, DAG))
return R;

// Identify bitmask operations that modify only the sign bit
// and replace with FNEG or FABS as appropriate.
if (SDValue F = bitmaskOperandsToSignInstructions(N))
return F;

return SDValue();
}

Expand Down
21 changes: 21 additions & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15493,6 +15493,27 @@ SDValue SITargetLowering::performFMACombine(SDNode *N,
return SDValue();
}

bool SITargetLowering::shouldFoldSelectWithIdentityConstant(
unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
SDValue Y) const {

if (BinOpcode != ISD::AND && BinOpcode != ISD::OR && BinOpcode != ISD::XOR)
return false;

ConstantSDNode *CY = isConstOrConstSplat(Y);
if (!CY)
return false;

if (!CY->getAPIntValue().isSignMask() &&
!CY->getAPIntValue().isMaxSignedValue())
return false;

if (VT.getScalarType() != MVT::i32)
return false;

return true;
}

SDValue SITargetLowering::performSetCCCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,10 @@ class SITargetLowering final : public AMDGPUTargetLowering {

bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override;

/// Returns true only when \p BinOpcode is AND, OR or XOR, \p Y is a constant
/// (or constant splat) equal to the sign mask or the maximum signed value,
/// and the scalar type of \p VT is i32. \p SelectOpcode and \p X are not
/// consulted by this override.
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT,
unsigned SelectOpcode, SDValue X,
SDValue Y) const override;

private:
// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
// the three offsets (voffset, soffset and instoffset) into the SDValue[3]
Expand Down
51 changes: 19 additions & 32 deletions llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
Original file line number Diff line number Diff line change
Expand Up @@ -349,29 +349,24 @@ define i32 @select_fneg_xor_select_i32(i1 %cond0, i1 %cond1, i32 %arg0, i32 %arg
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
; GCN-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: v_and_b32_e32 v1, 1, v1
; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GCN-NEXT: v_xor_b32_e32 v2, 0x80000000, v0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: v_cndmask_b32_e64 v0, -v2, v3, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, -v0, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: select_fneg_xor_select_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
; GFX11-NEXT: v_and_b32_e32 v1, 1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e64 v0, -v2, v3, vcc_lo
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
; GFX11-NEXT: v_xor_b32_e32 v2, 0x80000000, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, -v0, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fneg0 = xor i32 %arg0, -2147483648
%select0 = select i1 %cond0, i32 %arg1, i32 %fneg0
Expand Down Expand Up @@ -550,31 +545,25 @@ define i64 @select_fneg_xor_select_i64(i1 %cond0, i1 %cond1, i64 %arg0, i64 %arg
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
; GCN-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: v_and_b32_e32 v1, 1, v1
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
; GCN-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc
; GCN-NEXT: v_xor_b32_e32 v3, 0x80000000, v2
; GCN-NEXT: v_cndmask_b32_e64 v2, -v3, v5, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; GCN-NEXT: v_cndmask_b32_e64 v1, v2, -v2, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: select_fneg_xor_select_i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
; GFX11-NEXT: v_and_b32_e32 v1, 1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_and_b32 v1, 1, v1
; GFX11-NEXT: v_cndmask_b32_e64 v2, -v3, v5, vcc_lo
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
; GFX11-NEXT: v_xor_b32_e32 v3, 0x80000000, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, -v2, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fneg0 = xor i64 %arg0, 9223372036854775808
%select0 = select i1 %cond0, i64 %arg1, i64 %fneg0
Expand Down Expand Up @@ -936,25 +925,23 @@ define double @cospiD_pattern0(i32 %arg, double %arg1, double %arg2) {
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
; GCN-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc
; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
; GCN-NEXT: v_bfrev_b32_e32 v2, 1
; GCN-NEXT: v_cmp_lt_i32_e32 vcc, 1, v0
; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
; GCN-NEXT: v_xor_b32_e32 v1, v1, v0
; GCN-NEXT: v_cndmask_b32_e64 v1, v1, -v1, vcc
; GCN-NEXT: v_mov_b32_e32 v0, v3
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: cospiD_pattern0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v5, 1, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v5
; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 1, v0
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 0x80000000, vcc_lo
; GFX11-NEXT: v_xor_b32_e32 v1, v1, v0
; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, -v1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%i = and i32 %arg, 1
%i3 = icmp eq i32 %i, 0
Expand Down
Loading