Skip to content

Commit 686919f

Browse files
committed
[DAGCombine] Move the AMDGPU combine to Target Independent DAGCombine
- Allows removal of i64-specific code: the target-independent (TI) combine splits to i32 ops. - Updates quite a few AMDGPU tests; these all appear to be improvements in codegen, but this still needs to be double-checked.
1 parent 767cc11 commit 686919f

17 files changed

+2016
-1932
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 74 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -684,7 +684,7 @@ namespace {
684684
SDValue VecIn2, unsigned LeftIdx,
685685
bool DidSplitVec);
686686
SDValue matchVSelectOpSizesWithSetCC(SDNode *Cast);
687-
687+
SDValue getBitwiseToSrcModifierOp(SDValue N);
688688
/// Walk up chain skipping non-aliasing memory nodes,
689689
/// looking for aliasing nodes and adding them to the Aliases vector.
690690
void GatherAllAliases(SDNode *N, SDValue OriginalChain,
@@ -12175,6 +12175,56 @@ SDValue DAGCombiner::foldSelectToABD(SDValue LHS, SDValue RHS, SDValue True,
1217512175
return SDValue();
1217612176
}
1217712177

12178+
/// Map \p VT to a floating-point type of the same scalar bit width.
/// For a scalar \p VT this is the scalar float type itself; for a vector
/// \p VT it is \p VT with its element type replaced by that float type.
static EVT getFloatVT(EVT VT) {
12179+
EVT FT = MVT::getFloatingPointVT(VT.getScalarSizeInBits());
12180+
return VT.isVector() ? VT.changeVectorElementType(FT) : FT;
12181+
}
12182+
12183+
/// Try to express the integer bitwise operation \p N as a floating-point
/// sign-bit operation on its first operand.
///
/// Recognised forms (second operand must be a constant or constant splat):
///   xor x, sign-mask          -> fneg (bitcast x)
///   or  x, sign-mask          -> fneg (fabs (bitcast x))
///   and x, max-signed-value   -> fabs (bitcast x)
///
/// The result has the floating-point type produced by getFloatVT for the
/// second operand's type; no bitcast back to the integer type is added here.
/// \returns the new node, or an empty SDValue when \p N is not AND/OR/XOR,
/// the target hook declines, the second operand is not a constant, or the
/// constant is not one of the masks listed above.
SDValue DAGCombiner::getBitwiseToSrcModifierOp(SDValue N) {
12184+
12185+
unsigned Opc = N.getNode()->getOpcode();
12186+
// Only AND/XOR/OR are candidates; anything else yields an empty SDValue.
if (Opc != ISD::AND && Opc != ISD::XOR && Opc != ISD::OR)
12187+
return SDValue();
12188+
12189+
SDValue LHS = N->getOperand(0);
12190+
SDValue RHS = N->getOperand(1);
12191+
12192+
// Ask the target whether this opcode/type combination should be folded;
// the query is always made with ISD::SELECT as the select opcode.
if(!TLI.shouldFoldSelectWithIdentityConstant(N.getOpcode(), N->getValueType(0), ISD::SELECT, LHS, RHS))
12193+
return SDValue();
12194+
12195+
// The mask must be a constant, or a splat of one constant for vectors.
ConstantSDNode *CRHS = isConstOrConstSplat(RHS);
12196+
12197+
if (!CRHS)
12198+
return SDValue();
12199+
12200+
EVT VT = RHS.getValueType();
12201+
// Same-width floating-point type in which the fneg/fabs nodes are built.
EVT FVT = getFloatVT(VT);
12202+
SDLoc SL = SDLoc(N);
12203+
12204+
switch (Opc) {
12205+
case ISD::XOR:
12206+
// Flipping only the sign bit is a floating-point negate.
if (CRHS->getAPIntValue().isSignMask())
12207+
return DAG.getNode(ISD::FNEG, SL, FVT,
12208+
DAG.getNode(ISD::BITCAST, SL, FVT, LHS));
12209+
break;
12210+
case ISD::OR:
12211+
// Forcing the sign bit on is emitted as fneg(fabs(x)).
if (CRHS->getAPIntValue().isSignMask()) {
12212+
SDValue Abs = DAG.getNode(ISD::FABS, SL, FVT,
12213+
DAG.getNode(ISD::BITCAST, SL, FVT, LHS));
12214+
return DAG.getNode(ISD::FNEG, SL, FVT, Abs);
12215+
}
12216+
break;
12217+
case ISD::AND:
12218+
// Clearing only the sign bit is a floating-point absolute value.
if (CRHS->getAPIntValue().isMaxSignedValue())
12219+
return DAG.getNode(ISD::FABS, SL, FVT,
12220+
DAG.getNode(ISD::BITCAST, SL, FVT, LHS));
12221+
break;
12222+
// Opc was already restricted to AND/XOR/OR by the check at the top of the
// function, so this arm is not expected to be taken.
default:
12223+
return SDValue();
12224+
}
12225+
// Matching opcode, but the constant is not the required mask.
return SDValue();
12226+
}
12227+
1217812228
SDValue DAGCombiner::visitSELECT(SDNode *N) {
1217912229
SDValue N0 = N->getOperand(0);
1218012230
SDValue N1 = N->getOperand(1);
@@ -12390,6 +12440,29 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
1239012440
if (SDValue R = combineSelectAsExtAnd(N0, N1, N2, DL, DAG))
1239112441
return R;
1239212442

12443+
auto FoldSrcMods = [&](SDValue LHS, SDValue RHS, EVT VT) -> SDValue {
12444+
SDValue SrcModTrue = getBitwiseToSrcModifierOp(LHS);
12445+
SDValue SrcModFalse = getBitwiseToSrcModifierOp(RHS);
12446+
if (SrcModTrue || SrcModFalse) {
12447+
SDLoc SL(N);
12448+
EVT FVT =
12449+
SrcModTrue ? SrcModTrue.getValueType() : SrcModFalse.getValueType();
12450+
SDValue FLHS =
12451+
SrcModTrue ? SrcModTrue : DAG.getNode(ISD::BITCAST, SL, FVT, LHS);
12452+
SDValue FRHS =
12453+
SrcModFalse ? SrcModFalse : DAG.getNode(ISD::BITCAST, SL, FVT, RHS);
12454+
SDValue FSelect = DAG.getNode(ISD::SELECT, SL, FVT, N0, FLHS, FRHS);
12455+
return DAG.getNode(ISD::BITCAST, SL, VT, FSelect);
12456+
}
12457+
return SDValue();
12458+
};
12459+
12460+
// Identify bitmask operations that are source mods and create
12461+
// the relevant fneg, fabs or fneg+fabs.
12462+
if (VT == MVT::i32 || VT == MVT::v2i32)
12463+
if (SDValue F = FoldSrcMods(N1, N2, VT))
12464+
return F;
12465+
1239312466
return SDValue();
1239412467
}
1239512468

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 0 additions & 89 deletions
Original file line numberDiff line numberDiff line change
@@ -4948,95 +4948,6 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
49484948
return SDValue();
49494949
};
49504950

4951-
// Support source modifiers on integer operands.
4952-
if (VT == MVT::i32 || VT == MVT::v2i32)
4953-
if (SDValue F = FoldSrcMods(True, False, VT))
4954-
return F;
4955-
4956-
// auto SplitSelect = [&]() -> std::pair(
4957-
// For i64 if a source modifier is to be folded in we split into two i32
4958-
// select of high and low values. The Operator need only be applied to the
4959-
// high values in order to change the sign bit.
4960-
if (VT == MVT::i64) {
4961-
bool TrueHasModifierOp =
4962-
(True.getOpcode() == ISD::AND || True.getOpcode() == ISD::OR ||
4963-
True.getOpcode() == ISD::XOR);
4964-
4965-
bool FalseHasModifierOp =
4966-
(False.getOpcode() == ISD::AND || False.getOpcode() == ISD::OR ||
4967-
False.getOpcode() == ISD::XOR);
4968-
4969-
ConstantSDNode *CTrueRHS = nullptr;
4970-
if (TrueHasModifierOp) {
4971-
SDValue TrueRHS = True->getOperand(1);
4972-
CTrueRHS = dyn_cast<ConstantSDNode>(TrueRHS);
4973-
}
4974-
4975-
ConstantSDNode *CFalseRHS = nullptr;
4976-
if (FalseHasModifierOp) {
4977-
SDValue FalseRHS = False->getOperand(1);
4978-
CFalseRHS = dyn_cast<ConstantSDNode>(FalseRHS);
4979-
}
4980-
4981-
// If True or False is a candidate for source modifier folding, extract
4982-
// the high value using APInt and reconstruct a ConstantSDNode.
4983-
SDValue TrueHiOp;
4984-
SDValue BCTrue = DAG.getNode(ISD::BITCAST, SDLoc(N), MVT::i64, True);
4985-
SDValue TrueLo;
4986-
SDValue TrueHi;
4987-
if (CTrueRHS) {
4988-
SDValue TrueLHS = True->getOperand(0);
4989-
SDValue TrueLHSHiVal = getHiHalf64(BCTrue, DAG);
4990-
TrueLo = getLoHalf64(TrueLHS, DAG);
4991-
APInt CTrueRHSHiBits =
4992-
CTrueRHS->getAPIntValue().getHiBits(32).trunc(32);
4993-
SDValue CTrueRHSHiVal =
4994-
DAG.getConstant(CTrueRHSHiBits, SDLoc(N), MVT::i32);
4995-
unsigned OpcTrue = True.getOpcode();
4996-
TrueHiOp = DAG.getNode(OpcTrue, SDLoc(N), MVT::i32, TrueLHSHiVal,
4997-
CTrueRHSHiVal);
4998-
} else {
4999-
TrueLo = getLoHalf64(BCTrue, DAG);
5000-
TrueHi = getHiHalf64(BCTrue, DAG);
5001-
}
5002-
5003-
SDValue FalseHiOp;
5004-
SDValue BCFalse = DAG.getNode(ISD::BITCAST, SDLoc(N), MVT::i64, False);
5005-
SDValue FalseLo;
5006-
SDValue FalseHi;
5007-
if (CFalseRHS) {
5008-
SDValue FalseLHS = False->getOperand(0);
5009-
FalseLo = getLoHalf64(FalseLHS, DAG);
5010-
SDValue FalseLHSHiVal = getHiHalf64(BCFalse, DAG);
5011-
APInt CFalseRHSHiBits =
5012-
CFalseRHS->getAPIntValue().getHiBits(32).trunc(32);
5013-
SDValue CFalseRHSHiVal =
5014-
DAG.getConstant(CFalseRHSHiBits, SDLoc(N), MVT::i32);
5015-
unsigned OpcFalse = False.getOpcode();
5016-
FalseHiOp = DAG.getNode(OpcFalse, SDLoc(N), MVT::i32, FalseLHSHiVal,
5017-
CFalseRHSHiVal);
5018-
} else {
5019-
FalseLo = getLoHalf64(BCFalse, DAG);
5020-
FalseHi = getHiHalf64(BCFalse, DAG);
5021-
}
5022-
5023-
if (CTrueRHS || CFalseRHS) {
5024-
// Place the low bits directly into the select. The operator is unneeded
5025-
// for these.
5026-
SDValue LoSelect =
5027-
DAG.getNode(ISD::SELECT, SDLoc(N), MVT::i32, Cond, TrueLo, FalseLo);
5028-
// If a source modifier may be folded use the bitwise-op of the high
5029-
// values, otherwise just pass the high part of the value.
5030-
SDValue FoldedHi =
5031-
FoldSrcMods(CTrueRHS ? TrueHiOp : TrueHi,
5032-
CFalseRHS ? FalseHiOp : FalseHi, MVT::i32);
5033-
5034-
SDValue ResV =
5035-
DAG.getBuildVector(MVT::v2i32, SDLoc(N), {FoldedHi, LoSelect});
5036-
SDValue Res = DAG.getNode(ISD::BITCAST, SDLoc(N), MVT::i64, ResV);
5037-
return Res;
5038-
}
5039-
}
50404951
}
50414952

50424953
// There's no reason to not do this if the condition has other uses.

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15493,6 +15493,14 @@ SDValue SITargetLowering::performFMACombine(SDNode *N,
1549315493
return SDValue();
1549415494
}
1549515495

15496+
/// Target hook: returns true only when \p BinOpcode is ISD::AND, ISD::OR or
/// ISD::XOR and the scalar type of \p VT is i32 (so both i32 and vectors of
/// i32 qualify). \p SelectOpcode, \p X and \p Y are not inspected by this
/// implementation.
bool SITargetLowering::shouldFoldSelectWithIdentityConstant(
15497+
unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
15498+
SDValue Y) const {
15499+
return (BinOpcode == ISD::AND || BinOpcode == ISD::OR ||
15500+
BinOpcode == ISD::XOR) &&
15501+
(VT.getScalarType() == MVT::i32);
15502+
}
15503+
1549615504
SDValue SITargetLowering::performSetCCCombine(SDNode *N,
1549715505
DAGCombinerInfo &DCI) const {
1549815506
SelectionDAG &DAG = DCI.DAG;

llvm/lib/Target/AMDGPU/SIISelLowering.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -264,6 +264,10 @@ class SITargetLowering final : public AMDGPUTargetLowering {
264264

265265
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override;
266266

267+
/// Override of the TargetLowering hook deciding whether a select should be
/// folded with binary operation \p BinOpcode on type \p VT.
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT,
268+
unsigned SelectOpcode, SDValue X,
269+
SDValue Y) const override;
270+
267271
private:
268272
// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
269273
// the three offsets (voffset, soffset and instoffset) into the SDValue[3]

llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7145,12 +7145,13 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace
71457145
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5]
71467146
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
71477147
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
7148-
; GFX7LESS-NEXT: s_mov_b32 s2, -1
71497148
; GFX7LESS-NEXT: v_and_b32_e32 v0, 0xff, v0
7150-
; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6
71517149
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
7152-
; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
7153-
; GFX7LESS-NEXT: v_or_b32_e32 v0, s4, v0
7150+
; GFX7LESS-NEXT: s_or_b32 s5, s4, s6
7151+
; GFX7LESS-NEXT: s_mov_b32 s2, -1
7152+
; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4
7153+
; GFX7LESS-NEXT: v_mov_b32_e32 v1, s5
7154+
; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
71547155
; GFX7LESS-NEXT: buffer_store_byte v0, off, s[0:3], 0
71557156
; GFX7LESS-NEXT: s_endpgm
71567157
;
@@ -8838,12 +8839,13 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac
88388839
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5]
88398840
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
88408841
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
8841-
; GFX7LESS-NEXT: s_mov_b32 s2, -1
88428842
; GFX7LESS-NEXT: v_and_b32_e32 v0, 0xffff, v0
8843-
; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6
88448843
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
8845-
; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
8846-
; GFX7LESS-NEXT: v_or_b32_e32 v0, s4, v0
8844+
; GFX7LESS-NEXT: s_or_b32 s5, s4, s6
8845+
; GFX7LESS-NEXT: s_mov_b32 s2, -1
8846+
; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4
8847+
; GFX7LESS-NEXT: v_mov_b32_e32 v1, s5
8848+
; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
88478849
; GFX7LESS-NEXT: buffer_store_short v0, off, s[0:3], 0
88488850
; GFX7LESS-NEXT: s_endpgm
88498851
;

llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -913,15 +913,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
913913
; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr25, implicit $exec
914914
; GFX90A-NEXT: renamable $vgpr3 = V_OR_B32_e32 killed $vgpr11, killed $vgpr19, implicit $exec
915915
; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr3, killed $vgpr2, implicit $exec
916-
; GFX90A-NEXT: renamable $vgpr3 = V_MOV_B32_e32 0, implicit $exec
917-
; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U32_sdwa 0, killed $vgpr53, 0, $vgpr3, 0, 0, 6, implicit $exec
918-
; GFX90A-NEXT: renamable $vgpr2 = V_CNDMASK_B32_e64 0, 0, 0, killed $vgpr2, killed $vcc, implicit $exec
919-
; GFX90A-NEXT: renamable $vgpr10 = V_OR_B32_e32 killed $vgpr52, killed $vgpr13, implicit $exec
920-
; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr10, killed $vgpr2, implicit $exec
921-
; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U32_sdwa 0, killed $vgpr17, 0, $vgpr3, 0, 0, 6, implicit $exec
922-
; GFX90A-NEXT: renamable $vgpr2 = V_CNDMASK_B32_e64 0, 0, 0, killed $vgpr2, killed $vcc, implicit $exec
923-
; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr15, implicit $exec
924-
; GFX90A-NEXT: DS_WRITE2_B32_gfx9 killed renamable $vgpr3, killed renamable $vgpr2, renamable $vgpr3, 0, 1, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, align 4, addrspace 3)
916+
; GFX90A-NEXT: renamable $vgpr3 = V_OR_B32_e32 killed $vgpr52, killed $vgpr13, implicit $exec
917+
; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 $vgpr3, killed $vgpr2, implicit $exec
918+
; GFX90A-NEXT: renamable $vgpr10 = V_MOV_B32_e32 0, implicit $exec
919+
; GFX90A-NEXT: renamable $vcc = V_CMP_NE_U32_sdwa 0, killed $vgpr53, 0, $vgpr10, 0, 0, 6, implicit $exec
920+
; GFX90A-NEXT: renamable $vgpr2 = V_CNDMASK_B32_e64 0, killed $vgpr2, 0, killed $vgpr3, killed $vcc, implicit $exec
921+
; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 $vgpr15, killed $vgpr2, implicit $exec
922+
; GFX90A-NEXT: renamable $vcc = V_CMP_NE_U32_sdwa 0, killed $vgpr17, 0, $vgpr10, 0, 0, 6, implicit $exec
923+
; GFX90A-NEXT: renamable $vgpr2 = V_CNDMASK_B32_e64 0, killed $vgpr2, 0, killed $vgpr15, killed $vcc, implicit $exec
924+
; GFX90A-NEXT: DS_WRITE2_B32_gfx9 killed renamable $vgpr10, killed renamable $vgpr2, renamable $vgpr10, 0, 1, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, align 4, addrspace 3)
925925
; GFX90A-NEXT: S_BRANCH %bb.65
926926
; GFX90A-NEXT: {{ $}}
927927
; GFX90A-NEXT: bb.68.bb174:

0 commit comments

Comments
 (0)