Skip to content

[AMDGPU][SDAG] Support source modifiers on select integer operands #147325

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 29 commits into from
Closed
Show file tree
Hide file tree
Changes from 25 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
d4ac937
Add new test for source modifiers on select
chrisjbris Jul 2, 2025
c89274e
Populate check-lines before patching
chrisjbris Jul 2, 2025
b6b3726
[AMDGPU][SDAG] Support source modifiers as integer on select
chrisjbris Jul 7, 2025
dea39d1
Simplify switch in BitwiseToSrcModifierOp()
chrisjbris Jul 7, 2025
004dc9f
[NFC] Correct typo in BitwiseToSrcModifierOp()
chrisjbris Jul 7, 2025
b27ce62
Fix bitcast type in performSelectCombine()
chrisjbris Jul 7, 2025
f503034
Respond to first review comments
chrisjbris Jul 7, 2025
e073552
Respond to secon review comments - rename function and correct test
chrisjbris Jul 7, 2025
000ddc8
[NFC] Remove incomplete dag-style comment
chrisjbris Jul 7, 2025
2604329
Make test for bitwise src mods more stringent and correct fneg-fabs o…
chrisjbris Jul 7, 2025
97d93d6
Reviewer corrections
chrisjbris Jul 8, 2025
f255ddc
Refactor to support the source modifiers on either or both operands.
chrisjbris Jul 8, 2025
2e2249a
Fix Typo.
chrisjbris Jul 8, 2025
a505a72
Respond to reviewer - Add i16 tests, simplify obtaining type
chrisjbris Jul 8, 2025
a8bd726
Inline bitcast node creation.
chrisjbris Jul 8, 2025
e5f1e67
Add functional implementation for i64
chrisjbris Jul 11, 2025
767cc11
Fix formatting
chrisjbris Jul 11, 2025
686919f
[DAGCombine] Move the AMDGPU combine to Target Indepenent DAGCombine
chrisjbris Jul 12, 2025
4c79caf
Remove dead code that was moved to the TI DAGCombiner
chrisjbris Jul 13, 2025
c265ed4
Canonicalise TI select operand variable names and update tests
chrisjbris Jul 13, 2025
d658adb
Fix missed clang-format
chrisjbris Jul 13, 2025
9fb7344
Suppress overzealous clang-format
chrisjbris Jul 13, 2025
29d9b3d
Suppress overzealous clang-format
chrisjbris Jul 13, 2025
cd5c732
Remove unnecessary lambda and refactor foldSelectOfSourceMods() to fi…
chrisjbris Jul 13, 2025
ec42e07
[NFC] Minor corrections to whitespace and test name
chrisjbris Jul 13, 2025
4bd51d0
Add tighter constraints to apply combine and update tests.
chrisjbris Jul 14, 2025
b0140bd
Further constrain shouldFoldSelectWithIdentityConstant(), preventing …
chrisjbris Jul 14, 2025
ce7cd98
Fix broken insert-delay-alu-bug.ll test
chrisjbris Jul 14, 2025
fe28221
Replace target-specific function names with target-independent names
chrisjbris Jul 15, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 71 additions & 0 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -602,6 +602,7 @@ namespace {
SDValue foldSelectCCToShiftAnd(const SDLoc &DL, SDValue N0, SDValue N1,
SDValue N2, SDValue N3, ISD::CondCode CC);
SDValue foldSelectOfBinops(SDNode *N);
SDValue foldSelectOfSourceMods(SDNode *N);
SDValue foldSextSetcc(SDNode *N);
SDValue foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1,
const SDLoc &DL);
Expand Down Expand Up @@ -12175,6 +12176,71 @@ SDValue DAGCombiner::foldSelectToABD(SDValue LHS, SDValue RHS, SDValue True,
return SDValue();
}

static SDValue getBitwiseToSrcModifierOp(SDValue N, SelectionDAG &DAG) {

unsigned Opc = N.getNode()->getOpcode();
if (Opc != ISD::AND && Opc != ISD::XOR && Opc != ISD::OR)
return SDValue();

SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);

const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!TLI.shouldFoldSelectWithIdentityConstant(
N.getOpcode(), N->getValueType(0), ISD::SELECT, LHS, RHS))
return SDValue();

ConstantSDNode *CRHS = isConstOrConstSplat(RHS);
if (!CRHS)
return SDValue();

EVT VT = RHS.getValueType();
EVT FT = MVT::getFloatingPointVT(VT.getScalarSizeInBits());
EVT FVT = VT.isVector() ? VT.changeVectorElementType(FT) : FT;
SDLoc SL = SDLoc(N);

switch (Opc) {
case ISD::XOR:
if (CRHS->getAPIntValue().isSignMask())
return DAG.getNode(ISD::FNEG, SL, FVT,
DAG.getNode(ISD::BITCAST, SL, FVT, LHS));
break;
case ISD::OR:
if (CRHS->getAPIntValue().isSignMask()) {
SDValue Abs = DAG.getNode(ISD::FABS, SL, FVT,
DAG.getNode(ISD::BITCAST, SL, FVT, LHS));
return DAG.getNode(ISD::FNEG, SL, FVT, Abs);
}
break;
case ISD::AND:
if (CRHS->getAPIntValue().isMaxSignedValue())
return DAG.getNode(ISD::FABS, SL, FVT,
DAG.getNode(ISD::BITCAST, SL, FVT, LHS));
break;
default:
return SDValue();
}
return SDValue();
}

SDValue DAGCombiner::foldSelectOfSourceMods(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDValue N2 = N->getOperand(2);
EVT VT = N->getValueType(0);
SDValue SrcModN1 = getBitwiseToSrcModifierOp(N1, DAG);
SDValue SrcModN2 = getBitwiseToSrcModifierOp(N2, DAG);
if (SrcModN1 || SrcModN2) {
SDLoc SL(N);
EVT FVT = SrcModN1 ? SrcModN1.getValueType() : SrcModN2.getValueType();
SDValue FN1 = SrcModN1 ? SrcModN1 : DAG.getNode(ISD::BITCAST, SL, FVT, N1);
SDValue FN2 = SrcModN2 ? SrcModN2 : DAG.getNode(ISD::BITCAST, SL, FVT, N2);
SDValue FSelect = DAG.getNode(ISD::SELECT, SL, FVT, N0, FN1, FN2);
return DAG.getNode(ISD::BITCAST, SL, VT, FSelect);
}
return SDValue();
}

SDValue DAGCombiner::visitSELECT(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
Expand Down Expand Up @@ -12390,6 +12456,11 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
if (SDValue R = combineSelectAsExtAnd(N0, N1, N2, DL, DAG))
return R;

// Identify bitmask operations that are source mods and create
// the relevant fneg, fabs or fneg+fabs.
if (SDValue F = foldSelectOfSourceMods(N))
return F;

return SDValue();
}

Expand Down
8 changes: 8 additions & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15493,6 +15493,14 @@ SDValue SITargetLowering::performFMACombine(SDNode *N,
return SDValue();
}

bool SITargetLowering::shouldFoldSelectWithIdentityConstant(
unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
SDValue Y) const {
return (BinOpcode == ISD::AND || BinOpcode == ISD::OR ||
BinOpcode == ISD::XOR) &&
(VT.getScalarType() == MVT::i32);
}

SDValue SITargetLowering::performSetCCCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,10 @@ class SITargetLowering final : public AMDGPUTargetLowering {

bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override;

bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT,
unsigned SelectOpcode, SDValue X,
SDValue Y) const override;

private:
// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
// the three offsets (voffset, soffset and instoffset) into the SDValue[3]
Expand Down
18 changes: 10 additions & 8 deletions llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
Original file line number Diff line number Diff line change
Expand Up @@ -7145,12 +7145,13 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
; GFX7LESS-NEXT: v_or_b32_e32 v0, s4, v0
; GFX7LESS-NEXT: s_or_b32 s5, s4, s6
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4
; GFX7LESS-NEXT: v_mov_b32_e32 v1, s5
; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX7LESS-NEXT: buffer_store_byte v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
Expand Down Expand Up @@ -8838,12 +8839,13 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
; GFX7LESS-NEXT: v_or_b32_e32 v0, s4, v0
; GFX7LESS-NEXT: s_or_b32 s5, s4, s6
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4
; GFX7LESS-NEXT: v_mov_b32_e32 v1, s5
; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX7LESS-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
Expand Down
18 changes: 9 additions & 9 deletions llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
Original file line number Diff line number Diff line change
Expand Up @@ -913,15 +913,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr25, implicit $exec
; GFX90A-NEXT: renamable $vgpr3 = V_OR_B32_e32 killed $vgpr11, killed $vgpr19, implicit $exec
; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr3, killed $vgpr2, implicit $exec
; GFX90A-NEXT: renamable $vgpr3 = V_MOV_B32_e32 0, implicit $exec
; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U32_sdwa 0, killed $vgpr53, 0, $vgpr3, 0, 0, 6, implicit $exec
; GFX90A-NEXT: renamable $vgpr2 = V_CNDMASK_B32_e64 0, 0, 0, killed $vgpr2, killed $vcc, implicit $exec
; GFX90A-NEXT: renamable $vgpr10 = V_OR_B32_e32 killed $vgpr52, killed $vgpr13, implicit $exec
; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr10, killed $vgpr2, implicit $exec
; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U32_sdwa 0, killed $vgpr17, 0, $vgpr3, 0, 0, 6, implicit $exec
; GFX90A-NEXT: renamable $vgpr2 = V_CNDMASK_B32_e64 0, 0, 0, killed $vgpr2, killed $vcc, implicit $exec
; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr15, implicit $exec
; GFX90A-NEXT: DS_WRITE2_B32_gfx9 killed renamable $vgpr3, killed renamable $vgpr2, renamable $vgpr3, 0, 1, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, align 4, addrspace 3)
; GFX90A-NEXT: renamable $vgpr3 = V_OR_B32_e32 killed $vgpr52, killed $vgpr13, implicit $exec
; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 $vgpr3, killed $vgpr2, implicit $exec
; GFX90A-NEXT: renamable $vgpr10 = V_MOV_B32_e32 0, implicit $exec
; GFX90A-NEXT: renamable $vcc = V_CMP_NE_U32_sdwa 0, killed $vgpr53, 0, $vgpr10, 0, 0, 6, implicit $exec
; GFX90A-NEXT: renamable $vgpr2 = V_CNDMASK_B32_e64 0, killed $vgpr2, 0, killed $vgpr3, killed $vcc, implicit $exec
; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 $vgpr15, killed $vgpr2, implicit $exec
; GFX90A-NEXT: renamable $vcc = V_CMP_NE_U32_sdwa 0, killed $vgpr17, 0, $vgpr10, 0, 0, 6, implicit $exec
; GFX90A-NEXT: renamable $vgpr2 = V_CNDMASK_B32_e64 0, killed $vgpr2, 0, killed $vgpr15, killed $vcc, implicit $exec
; GFX90A-NEXT: DS_WRITE2_B32_gfx9 killed renamable $vgpr10, killed renamable $vgpr2, renamable $vgpr10, 0, 1, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, align 4, addrspace 3)
; GFX90A-NEXT: S_BRANCH %bb.65
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.68.bb174:
Expand Down
Loading