Skip to content

Commit b6b3726

Browse files
committed
[AMDGPU][SDAG] Support source modifiers as integer on select
Extend the DAGCombine() for select to directly support fneg and fabs for i32, v2i32 and i64.
1 parent c89274e commit b6b3726

File tree

2 files changed

+86
-31
lines changed

2 files changed

+86
-31
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 74 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4842,6 +4842,64 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
48424842
return SDValue();
48434843
}
48444844

4845+
/// Return the floating-point value type that has the same bit layout as the
/// given value type.
///
/// For a vector type the result is a vector with the same number of elements
/// whose element type is the floating-point type of the original scalar
/// width; for a scalar type it is the floating-point type of the same fixed
/// width.
static EVT IntToFloatVT(EVT VT) {
  if (!VT.isVector())
    return MVT::getFloatingPointVT(VT.getFixedSizeInBits());

  MVT FloatEltVT = MVT::getFloatingPointVT(VT.getScalarSizeInBits());
  return MVT::getVectorVT(FloatEltVT, VT.getVectorNumElements());
}
4851+
4852+
/// Try to re-express a bitwise sign-bit manipulation on an integer value as
/// the equivalent floating-point operation, so that it can later be folded
/// into a source modifier.
///
/// The right-hand operand must be a constant (or constant splat). With
/// SignMask being only the top bit of the scalar element set, the recognised
/// patterns are:
///   xor x, SignMask   -> fneg (bitcast x)
///   or  x, SignMask   -> fneg (fabs (bitcast x))
///   and x, ~SignMask  -> fabs (bitcast x)
///
/// \param N   Candidate node; its constant operand is expected to have type
///            i32, v2i32 or i64 (asserted).
/// \param DCI Combiner state; supplies the SelectionDAG used to build nodes.
/// \returns The replacement value, typed as the floating-point counterpart of
///          the integer type, or an empty SDValue if \p N does not match.
static SDValue BitwiseToSrcModifierOp(SDValue N,
                                      TargetLowering::DAGCombinerInfo &DCI) {
  const unsigned Opc = N.getOpcode();
  // All three bitwise opcodes handled by the switch below must pass this
  // guard; ISD::OR was previously missing, which made its case unreachable.
  if (Opc != ISD::AND && Opc != ISD::XOR && Opc != ISD::OR)
    return SDValue();

  SDValue LHS = N.getOperand(0);
  SDValue RHS = N.getOperand(1);

  // For vectors this yields the splatted element, so no separate splat
  // lookup is required afterwards.
  ConstantSDNode *CRHS = isConstOrConstSplat(RHS);
  if (!CRHS)
    return SDValue();

  EVT VT = RHS.getValueType();
  assert((VT == MVT::i32 || VT == MVT::v2i32 || VT == MVT::i64) &&
         "Expected i32, v2i32 or i64 value type.");

  // Derive the masks from the scalar width so that, for example, a 32-bit
  // sign mask applied to an i64 value is not mistaken for an f64 negation.
  const unsigned ScalarBits = VT.getScalarSizeInBits();
  const uint64_t SignMask = uint64_t{1} << (ScalarBits - 1);
  const uint64_t AbsMask = SignMask - 1;
  const uint64_t Mask = CRHS->getZExtValue();

  // Decide whether the pattern matches before creating any node, so that a
  // failed match does not leave dead BITCAST nodes in the DAG.
  const bool ClearsSign = Opc == ISD::AND && Mask == AbsMask;
  const bool SetsSign = Opc == ISD::OR && Mask == SignMask;
  const bool FlipsSign = Opc == ISD::XOR && Mask == SignMask;
  if (!ClearsSign && !SetsSign && !FlipsSign)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);
  EVT FVT = IntToFloatVT(VT);
  SDValue BC = DAG.getNode(ISD::BITCAST, SL, FVT, LHS);

  if (FlipsSign)
    return DAG.getNode(ISD::FNEG, SL, FVT, BC);

  SDValue Abs = DAG.getNode(ISD::FABS, SL, FVT, BC);
  if (ClearsSign)
    return Abs;

  // Setting the sign bit gives -|x|: take the absolute value first and then
  // negate. The reverse order would clear the sign bit instead.
  return DAG.getNode(ISD::FNEG, SL, FVT, Abs);
}
4902+
48454903
SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
48464904
DAGCombinerInfo &DCI) const {
48474905
if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
@@ -4876,12 +4934,25 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
48764934
}
48774935

48784936
if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
4879-
SDValue MinMax
4880-
= combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
4937+
SDValue MinMax =
4938+
combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
48814939
// Revisit this node so we can catch min3/max3/med3 patterns.
4882-
//DCI.AddToWorklist(MinMax.getNode());
4940+
// DCI.AddToWorklist(MinMax.getNode());
48834941
return MinMax;
48844942
}
4943+
4944+
// Support source modifiers as integer.
4945+
if (VT == MVT::i32 || VT == MVT::v2i32 || VT == MVT::i64) {
4946+
SDLoc SL(N);
4947+
SDValue LHS = N->getOperand(1);
4948+
SDValue RHS = N->getOperand(2);
4949+
if (SDValue SrcMod = BitwiseToSrcModifierOp(LHS, DCI)) {
4950+
SDValue FRHS = DAG.getNode(ISD::BITCAST, SL, VT, RHS);
4951+
SDValue FSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond, SrcMod, FRHS);
4952+
SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, FSelect);
4953+
return BC;
4954+
}
4955+
}
48854956
}
48864957

48874958
// There's no reason to not do this if the condition has other uses.

llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll

Lines changed: 12 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -8,18 +8,15 @@ define i32 @fneg_select_i32(i32 %cond, i32 %a, i32 %b) {
88
; GCN-LABEL: fneg_select_i32:
99
; GCN: ; %bb.0:
1010
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11-
; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
1211
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
13-
; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
12+
; GCN-NEXT: v_cndmask_b32_e64 v0, v2, -v1, vcc
1413
; GCN-NEXT: s_setpc_b64 s[30:31]
1514
;
1615
; GFX11-LABEL: fneg_select_i32:
1716
; GFX11: ; %bb.0:
1817
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19-
; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
2018
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
21-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
22-
; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
19+
; GFX11-NEXT: v_cndmask_b32_e64 v0, v2, -v1, vcc_lo
2320
; GFX11-NEXT: s_setpc_b64 s[30:31]
2421
%neg.a = xor i32 %a, u0x80000000
2522
%cmp = icmp eq i32 %cond, zeroinitializer
@@ -31,24 +28,19 @@ define <2 x i32> @fneg_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b)
3128
; GCN-LABEL: fneg_select_v2i32:
3229
; GCN: ; %bb.0:
3330
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34-
; GCN-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
3531
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
36-
; GCN-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
37-
; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
32+
; GCN-NEXT: v_cndmask_b32_e64 v0, v4, -v2, vcc
3833
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
39-
; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
34+
; GCN-NEXT: v_cndmask_b32_e64 v1, v5, -v3, vcc
4035
; GCN-NEXT: s_setpc_b64 s[30:31]
4136
;
4237
; GFX11-LABEL: fneg_select_v2i32:
4338
; GFX11: ; %bb.0:
4439
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
45-
; GFX11-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
4640
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
47-
; GFX11-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
48-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
49-
; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo
41+
; GFX11-NEXT: v_cndmask_b32_e64 v0, v4, -v2, vcc_lo
5042
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
51-
; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc_lo
43+
; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, -v3, vcc_lo
5244
; GFX11-NEXT: s_setpc_b64 s[30:31]
5345
%neg.a = xor <2 x i32> %a, splat (i32 u0x80000000)
5446
%cmp = icmp eq <2 x i32> %cond, zeroinitializer
@@ -60,18 +52,15 @@ define i32 @fabs_select_i32(i32 %cond, i32 %a, i32 %b) {
6052
; GCN-LABEL: fabs_select_i32:
6153
; GCN: ; %bb.0:
6254
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
63-
; GCN-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
6455
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
65-
; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
56+
; GCN-NEXT: v_cndmask_b32_e64 v0, v2, |v1|, vcc
6657
; GCN-NEXT: s_setpc_b64 s[30:31]
6758
;
6859
; GFX11-LABEL: fabs_select_i32:
6960
; GFX11: ; %bb.0:
7061
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
71-
; GFX11-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
7262
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
73-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
74-
; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
63+
; GFX11-NEXT: v_cndmask_b32_e64 v0, v2, |v1|, vcc_lo
7564
; GFX11-NEXT: s_setpc_b64 s[30:31]
7665
%neg.a = and i32 %a, u0x7fffffff
7766
%cmp = icmp eq i32 %cond, zeroinitializer
@@ -83,24 +72,19 @@ define <2 x i32> @fabs_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b)
8372
; GCN-LABEL: fabs_select_v2i32:
8473
; GCN: ; %bb.0:
8574
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
86-
; GCN-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2
8775
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
88-
; GCN-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3
89-
; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
76+
; GCN-NEXT: v_cndmask_b32_e64 v0, v4, |v2|, vcc
9077
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
91-
; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
78+
; GCN-NEXT: v_cndmask_b32_e64 v1, v5, |v3|, vcc
9279
; GCN-NEXT: s_setpc_b64 s[30:31]
9380
;
9481
; GFX11-LABEL: fabs_select_v2i32:
9582
; GFX11: ; %bb.0:
9683
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9784
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
98-
; GFX11-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3
99-
; GFX11-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2
100-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
101-
; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo
85+
; GFX11-NEXT: v_cndmask_b32_e64 v0, v4, |v2|, vcc_lo
10286
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
103-
; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc_lo
87+
; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, |v3|, vcc_lo
10488
; GFX11-NEXT: s_setpc_b64 s[30:31]
10589
%neg.a = and <2 x i32> %a, splat (i32 u0x7fffffff)
10690
%cmp = icmp eq <2 x i32> %cond, zeroinitializer

0 commit comments

Comments
 (0)