Skip to content

Commit 6153f46

Browse files
committed
Move the combine to the target-independent (TI) DAGCombiner.
- Allows removal of the i64-specific code: the target-independent combine splits i64 operations into i32 ops. - Updates quite a few AMDGPU tests; these all appear to be improvements in codegen.
1 parent bdb35f1 commit 6153f46

17 files changed

+1584
-1730
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 74 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -684,7 +684,7 @@ namespace {
684684
SDValue VecIn2, unsigned LeftIdx,
685685
bool DidSplitVec);
686686
SDValue matchVSelectOpSizesWithSetCC(SDNode *Cast);
687-
687+
SDValue getBitwiseToSrcModifierOp(SDValue N);
688688
/// Walk up chain skipping non-aliasing memory nodes,
689689
/// looking for aliasing nodes and adding them to the Aliases vector.
690690
void GatherAllAliases(SDNode *N, SDValue OriginalChain,
@@ -12172,6 +12172,56 @@ SDValue DAGCombiner::foldSelectToABD(SDValue LHS, SDValue RHS, SDValue True,
1217212172
return SDValue();
1217312173
}
1217412174

12175+
/// Return the floating-point type whose width matches the scalar size in bits
/// of \p VT. For a vector \p VT the result is \p VT with its element type
/// changed to that floating-point type; for a scalar it is the floating-point
/// type itself.
static EVT getFloatVT(EVT VT) {
12176+
EVT FT = MVT::getFloatingPointVT(VT.getScalarSizeInBits());
12177+
return VT.isVector() ? VT.changeVectorElementType(FT) : FT;
12178+
}
12179+
12180+
/// Try to express the integer bitwise operation \p N as a floating-point
/// sign-bit operation on its bitcast left operand:
///   xor x, sign-mask         -> fneg (bitcast x)
///   or  x, sign-mask         -> fneg (fabs (bitcast x))
///   and x, max-signed-value  -> fabs (bitcast x)
/// The returned node has the floating-point type produced by getFloatVT for
/// the right operand's type, not the integer type of \p N. An empty SDValue is
/// returned when \p N is not AND/OR/XOR, when the target hook declines, when
/// the right operand is not a constant (or constant splat), or when the
/// constant does not match one of the patterns above.
SDValue DAGCombiner::getBitwiseToSrcModifierOp(SDValue N) {
12181+
12182+
unsigned Opc = N.getNode()->getOpcode();
12183+
if (Opc != ISD::AND && Opc != ISD::XOR && Opc != ISD::OR)
12184+
return SDValue();
12185+
12186+
SDValue LHS = N->getOperand(0);
12187+
SDValue RHS = N->getOperand(1);
12188+
12189+
// The target decides, via its select-with-identity-constant hook, whether
// this opcode/type combination may be rewritten.
if(!TLI.shouldFoldSelectWithIdentityConstant(N.getOpcode(), N->getValueType(0), ISD::SELECT, LHS, RHS))
12190+
return SDValue();
12191+
12192+
// Only a constant (or splat-constant) mask on the right-hand side can match.
ConstantSDNode *CRHS = isConstOrConstSplat(RHS);
12193+
12194+
if (!CRHS)
12195+
return SDValue();
12196+
12197+
EVT VT = RHS.getValueType();
12198+
EVT FVT = getFloatVT(VT);
12199+
SDLoc SL = SDLoc(N);
12200+
12201+
switch (Opc) {
12202+
case ISD::XOR:
12203+
// Flipping only the sign bit is a floating-point negate.
if (CRHS->getAPIntValue().isSignMask())
12204+
return DAG.getNode(ISD::FNEG, SL, FVT,
12205+
DAG.getNode(ISD::BITCAST, SL, FVT, LHS));
12206+
break;
12207+
case ISD::OR:
12208+
// Forcing the sign bit on is the negation of the absolute value.
if (CRHS->getAPIntValue().isSignMask()) {
12209+
SDValue Abs = DAG.getNode(ISD::FABS, SL, FVT,
12210+
DAG.getNode(ISD::BITCAST, SL, FVT, LHS));
12211+
return DAG.getNode(ISD::FNEG, SL, FVT, Abs);
12212+
}
12213+
break;
12214+
case ISD::AND:
12215+
// Clearing only the sign bit is a floating-point absolute value.
if (CRHS->getAPIntValue().isMaxSignedValue())
12216+
return DAG.getNode(ISD::FABS, SL, FVT,
12217+
DAG.getNode(ISD::BITCAST, SL, FVT, LHS));
12218+
break;
12219+
default:
12220+
return SDValue();
12221+
}
12222+
// The opcode matched but the constant was not the required mask.
return SDValue();
12223+
}
12224+
1217512225
SDValue DAGCombiner::visitSELECT(SDNode *N) {
1217612226
SDValue N0 = N->getOperand(0);
1217712227
SDValue N1 = N->getOperand(1);
@@ -12387,6 +12437,29 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
1238712437
if (SDValue R = combineSelectAsExtAnd(N0, N1, N2, DL, DAG))
1238812438
return R;
1238912439

12440+
auto FoldSrcMods = [&](SDValue LHS, SDValue RHS, EVT VT) -> SDValue {
12441+
SDValue SrcModTrue = getBitwiseToSrcModifierOp(LHS);
12442+
SDValue SrcModFalse = getBitwiseToSrcModifierOp(RHS);
12443+
if (SrcModTrue || SrcModFalse) {
12444+
SDLoc SL(N);
12445+
EVT FVT =
12446+
SrcModTrue ? SrcModTrue.getValueType() : SrcModFalse.getValueType();
12447+
SDValue FLHS =
12448+
SrcModTrue ? SrcModTrue : DAG.getNode(ISD::BITCAST, SL, FVT, LHS);
12449+
SDValue FRHS =
12450+
SrcModFalse ? SrcModFalse : DAG.getNode(ISD::BITCAST, SL, FVT, RHS);
12451+
SDValue FSelect = DAG.getNode(ISD::SELECT, SL, FVT, N0, FLHS, FRHS);
12452+
return DAG.getNode(ISD::BITCAST, SL, VT, FSelect);
12453+
}
12454+
return SDValue();
12455+
};
12456+
12457+
// Identify bitmask operations that are source mods and create
12458+
// the relevant fneg, fabs or fneg+fabs.
12459+
if (VT == MVT::i32 || VT == MVT::v2i32)
12460+
if (SDValue F = FoldSrcMods(N1, N2, VT))
12461+
return F;
12462+
1239012463
return SDValue();
1239112464
}
1239212465

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 0 additions & 89 deletions
Original file line numberDiff line numberDiff line change
@@ -4936,95 +4936,6 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
49364936
return SDValue();
49374937
};
49384938

4939-
// Support source modifiers on integer operands.
4940-
if (VT == MVT::i32 || VT == MVT::v2i32)
4941-
if (SDValue F = FoldSrcMods(True, False, VT))
4942-
return F;
4943-
4944-
// auto SplitSelect = [&]() -> std::pair(
4945-
// For i64 if a source modifier is to be folded in we split into two i32
4946-
// select of high and low values. The Operator need only be applied to the
4947-
// high values in order to change the sign bit.
4948-
if (VT == MVT::i64) {
4949-
bool TrueHasModifierOp =
4950-
(True.getOpcode() == ISD::AND || True.getOpcode() == ISD::OR ||
4951-
True.getOpcode() == ISD::XOR);
4952-
4953-
bool FalseHasModifierOp =
4954-
(False.getOpcode() == ISD::AND || False.getOpcode() == ISD::OR ||
4955-
False.getOpcode() == ISD::XOR);
4956-
4957-
ConstantSDNode *CTrueRHS = nullptr;
4958-
if (TrueHasModifierOp) {
4959-
SDValue TrueRHS = True->getOperand(1);
4960-
CTrueRHS = dyn_cast<ConstantSDNode>(TrueRHS);
4961-
}
4962-
4963-
ConstantSDNode *CFalseRHS = nullptr;
4964-
if (FalseHasModifierOp) {
4965-
SDValue FalseRHS = False->getOperand(1);
4966-
CFalseRHS = dyn_cast<ConstantSDNode>(FalseRHS);
4967-
}
4968-
4969-
// If True or False is a candidate for source modifier folding, extract
4970-
// the high value using APInt and reconstruct a ConstantSDNode.
4971-
SDValue TrueHiOp;
4972-
SDValue BCTrue = DAG.getNode(ISD::BITCAST, SDLoc(N), MVT::i64, True);
4973-
SDValue TrueLo;
4974-
SDValue TrueHi;
4975-
if (CTrueRHS) {
4976-
SDValue TrueLHS = True->getOperand(0);
4977-
SDValue TrueLHSHiVal = getHiHalf64(BCTrue, DAG);
4978-
TrueLo = getLoHalf64(TrueLHS, DAG);
4979-
APInt CTrueRHSHiBits =
4980-
CTrueRHS->getAPIntValue().getHiBits(32).trunc(32);
4981-
SDValue CTrueRHSHiVal =
4982-
DAG.getConstant(CTrueRHSHiBits, SDLoc(N), MVT::i32);
4983-
unsigned OpcTrue = True.getOpcode();
4984-
TrueHiOp = DAG.getNode(OpcTrue, SDLoc(N), MVT::i32, TrueLHSHiVal,
4985-
CTrueRHSHiVal);
4986-
} else {
4987-
TrueLo = getLoHalf64(BCTrue, DAG);
4988-
TrueHi = getHiHalf64(BCTrue, DAG);
4989-
}
4990-
4991-
SDValue FalseHiOp;
4992-
SDValue BCFalse = DAG.getNode(ISD::BITCAST, SDLoc(N), MVT::i64, False);
4993-
SDValue FalseLo;
4994-
SDValue FalseHi;
4995-
if (CFalseRHS) {
4996-
SDValue FalseLHS = False->getOperand(0);
4997-
FalseLo = getLoHalf64(FalseLHS, DAG);
4998-
SDValue FalseLHSHiVal = getHiHalf64(BCFalse, DAG);
4999-
APInt CFalseRHSHiBits =
5000-
CFalseRHS->getAPIntValue().getHiBits(32).trunc(32);
5001-
SDValue CFalseRHSHiVal =
5002-
DAG.getConstant(CFalseRHSHiBits, SDLoc(N), MVT::i32);
5003-
unsigned OpcFalse = False.getOpcode();
5004-
FalseHiOp = DAG.getNode(OpcFalse, SDLoc(N), MVT::i32, FalseLHSHiVal,
5005-
CFalseRHSHiVal);
5006-
} else {
5007-
FalseLo = getLoHalf64(BCFalse, DAG);
5008-
FalseHi = getHiHalf64(BCFalse, DAG);
5009-
}
5010-
5011-
if (CTrueRHS || CFalseRHS) {
5012-
// Place the low bits directly into the select. The operator is unneeded
5013-
// for these.
5014-
SDValue LoSelect =
5015-
DAG.getNode(ISD::SELECT, SDLoc(N), MVT::i32, Cond, TrueLo, FalseLo);
5016-
// If a source modifier may be folded use the bitwise-op of the high
5017-
// values, otherwise just pass the high part of the value.
5018-
SDValue FoldedHi =
5019-
FoldSrcMods(CTrueRHS ? TrueHiOp : TrueHi,
5020-
CFalseRHS ? FalseHiOp : FalseHi, MVT::i32);
5021-
5022-
SDValue ResV =
5023-
DAG.getBuildVector(MVT::v2i32, SDLoc(N), {FoldedHi, LoSelect});
5024-
SDValue Res = DAG.getNode(ISD::BITCAST, SDLoc(N), MVT::i64, ResV);
5025-
return Res;
5026-
}
5027-
}
50284939
}
50294940

50304941
// There's no reason to not do this if the condition has other uses.

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15491,6 +15491,14 @@ SDValue SITargetLowering::performFMACombine(SDNode *N,
1549115491
return SDValue();
1549215492
}
1549315493

15494+
/// Return true only when \p BinOpcode is ISD::AND, ISD::OR or ISD::XOR and the
/// scalar type of \p VT is i32. \p SelectOpcode, \p X and \p Y are not
/// consulted by this implementation.
bool SITargetLowering::shouldFoldSelectWithIdentityConstant(
15495+
unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
15496+
SDValue Y) const {
15497+
return (BinOpcode == ISD::AND || BinOpcode == ISD::OR ||
15498+
BinOpcode == ISD::XOR) &&
15499+
(VT.getScalarType() == MVT::i32);
15500+
}
15501+
1549415502
SDValue SITargetLowering::performSetCCCombine(SDNode *N,
1549515503
DAGCombinerInfo &DCI) const {
1549615504
SelectionDAG &DAG = DCI.DAG;

llvm/lib/Target/AMDGPU/SIISelLowering.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -264,6 +264,10 @@ class SITargetLowering final : public AMDGPUTargetLowering {
264264

265265
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override;
266266

267+
/// Override of the base-class hook of the same name; takes the binary
/// opcode, its value type, the select opcode and the two operands.
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT,
268+
unsigned SelectOpcode, SDValue X,
269+
SDValue Y) const override;
270+
267271
private:
268272
// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
269273
// the three offsets (voffset, soffset and instoffset) into the SDValue[3]

llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7145,12 +7145,13 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace
71457145
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5]
71467146
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
71477147
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
7148-
; GFX7LESS-NEXT: s_mov_b32 s2, -1
71497148
; GFX7LESS-NEXT: v_and_b32_e32 v0, 0xff, v0
7150-
; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6
71517149
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
7152-
; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
7153-
; GFX7LESS-NEXT: v_or_b32_e32 v0, s4, v0
7150+
; GFX7LESS-NEXT: s_or_b32 s5, s4, s6
7151+
; GFX7LESS-NEXT: s_mov_b32 s2, -1
7152+
; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4
7153+
; GFX7LESS-NEXT: v_mov_b32_e32 v1, s5
7154+
; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
71547155
; GFX7LESS-NEXT: buffer_store_byte v0, off, s[0:3], 0
71557156
; GFX7LESS-NEXT: s_endpgm
71567157
;
@@ -8847,12 +8848,13 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac
88478848
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5]
88488849
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
88498850
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
8850-
; GFX7LESS-NEXT: s_mov_b32 s2, -1
88518851
; GFX7LESS-NEXT: v_and_b32_e32 v0, 0xffff, v0
8852-
; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6
88538852
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
8854-
; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
8855-
; GFX7LESS-NEXT: v_or_b32_e32 v0, s4, v0
8853+
; GFX7LESS-NEXT: s_or_b32 s5, s4, s6
8854+
; GFX7LESS-NEXT: s_mov_b32 s2, -1
8855+
; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4
8856+
; GFX7LESS-NEXT: v_mov_b32_e32 v1, s5
8857+
; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
88568858
; GFX7LESS-NEXT: buffer_store_short v0, off, s[0:3], 0
88578859
; GFX7LESS-NEXT: s_endpgm
88588860
;

llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -913,15 +913,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
913913
; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr25, implicit $exec
914914
; GFX90A-NEXT: renamable $vgpr3 = V_OR_B32_e32 killed $vgpr11, killed $vgpr19, implicit $exec
915915
; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr3, killed $vgpr2, implicit $exec
916-
; GFX90A-NEXT: renamable $vgpr3 = V_MOV_B32_e32 0, implicit $exec
917-
; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U32_sdwa 0, killed $vgpr53, 0, $vgpr3, 0, 0, 6, implicit $exec
918-
; GFX90A-NEXT: renamable $vgpr2 = V_CNDMASK_B32_e64 0, 0, 0, killed $vgpr2, killed $vcc, implicit $exec
919-
; GFX90A-NEXT: renamable $vgpr10 = V_OR_B32_e32 killed $vgpr52, killed $vgpr13, implicit $exec
920-
; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr10, killed $vgpr2, implicit $exec
921-
; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U32_sdwa 0, killed $vgpr17, 0, $vgpr3, 0, 0, 6, implicit $exec
922-
; GFX90A-NEXT: renamable $vgpr2 = V_CNDMASK_B32_e64 0, 0, 0, killed $vgpr2, killed $vcc, implicit $exec
923-
; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr15, implicit $exec
924-
; GFX90A-NEXT: DS_WRITE2_B32_gfx9 killed renamable $vgpr3, killed renamable $vgpr2, renamable $vgpr3, 0, 1, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, align 4, addrspace 3)
916+
; GFX90A-NEXT: renamable $vgpr3 = V_OR_B32_e32 killed $vgpr52, killed $vgpr13, implicit $exec
917+
; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 $vgpr3, killed $vgpr2, implicit $exec
918+
; GFX90A-NEXT: renamable $vgpr10 = V_MOV_B32_e32 0, implicit $exec
919+
; GFX90A-NEXT: renamable $vcc = V_CMP_NE_U32_sdwa 0, killed $vgpr53, 0, $vgpr10, 0, 0, 6, implicit $exec
920+
; GFX90A-NEXT: renamable $vgpr2 = V_CNDMASK_B32_e64 0, killed $vgpr2, 0, killed $vgpr3, killed $vcc, implicit $exec
921+
; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 $vgpr15, killed $vgpr2, implicit $exec
922+
; GFX90A-NEXT: renamable $vcc = V_CMP_NE_U32_sdwa 0, killed $vgpr17, 0, $vgpr10, 0, 0, 6, implicit $exec
923+
; GFX90A-NEXT: renamable $vgpr2 = V_CNDMASK_B32_e64 0, killed $vgpr2, 0, killed $vgpr15, killed $vcc, implicit $exec
924+
; GFX90A-NEXT: DS_WRITE2_B32_gfx9 killed renamable $vgpr10, killed renamable $vgpr2, renamable $vgpr10, 0, 1, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, align 4, addrspace 3)
925925
; GFX90A-NEXT: S_BRANCH %bb.65
926926
; GFX90A-NEXT: {{ $}}
927927
; GFX90A-NEXT: bb.68.bb174:

0 commit comments

Comments
 (0)