Skip to content

Commit 0da6f7b

Browse files
committed
[DAG] Fold (setcc ((x | x >> c0 | ...) & mask)) sequences
Fold sequences where we extract a bunch of contiguous bits from a value, merge them into the low bits, and then check whether those low bits are zero or not. It seems like a strange sequence at first, but it's an idiom used by the device libs to check workitem IDs for AMDGPU. I put this in DAGCombiner instead of the target combiner because this is a generic, valid transform that's also fairly niche, so I think there isn't much risk of a combine loop. See #136727
1 parent 267fb0b commit 0da6f7b

File tree

2 files changed

+91
-29
lines changed

2 files changed

+91
-29
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 85 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28978,13 +28978,97 @@ SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1,
2897828978
return SDValue();
2897928979
}
2898028980

28981+
/// Try to simplify a "merged bitfield extract" whose result is compared
/// against zero.
///
/// Matches a pattern such as:
///   (X | (X >> C0) | (X >> C1) | ...) & Mask
/// where Mask is a low-bit mask. This extracts contiguous parts of X and ORs
/// them together. When the caller only tests the result for (in)equality with
/// zero, the same answer is obtained by checking (X & PartsMask) directly,
/// where PartsMask is the union of Mask shifted left by every matched shift
/// amount (plus Mask itself when X is ORed in unshifted). This eliminates the
/// shifts and ORs.
///
/// NOTE: the returned node is NOT value-equivalent to \p Root; it is only
/// interchangeable with it under a comparison against zero.
///
/// \param Root Candidate ISD::AND node.
/// \param DAG  DAG used to create the replacement nodes.
/// \param TLI  Target lowering info (currently unused by the match).
/// \returns (X & PartsMask) on a successful match, an empty SDValue otherwise.
static SDValue matchMergedBFX(SDValue Root, SelectionDAG &DAG,
                              const TargetLowering &TLI) {
  if (Root.getOpcode() != ISD::AND)
    return SDValue();

  EVT VT = Root.getValueType();
  SDValue N0 = Root.getOperand(0);
  SDValue N1 = Root.getOperand(1);

  if (N0.getOpcode() != ISD::OR || !isa<ConstantSDNode>(N1))
    return SDValue();

  // The rewrite relies on the mask selecting a contiguous run of low bits.
  const APInt RootMask = cast<ConstantSDNode>(N1)->getAsAPIntVal();
  if (!RootMask.isMask())
    return SDValue();

  const unsigned BitWidth = RootMask.getBitWidth();

  // The common source value X. It is latched by the first candidate seen;
  // every later candidate must be the very same value.
  SDValue Src;
  const auto IsSrc = [&](SDValue V) {
    if (!Src) {
      Src = V;
      return true;
    }

    return Src == V;
  };

  SmallVector<SDValue> Worklist = {N0};
  APInt PartsMask(BitWidth, 0);
  while (!Worklist.empty()) {
    SDValue V = Worklist.pop_back_val();
    const unsigned Opc = V.getOpcode();

    // Interior OR/SRL nodes are made dead by the fold, so they must have no
    // other users. The source value itself is expected to have several uses
    // and is deliberately not subject to this check, which keeps the match
    // independent of the order in which the OR operands are visited.
    if ((Opc == ISD::OR || Opc == ISD::SRL) && !V.hasOneUse())
      return SDValue();

    if (Opc == ISD::OR) {
      Worklist.push_back(V.getOperand(0));
      Worklist.push_back(V.getOperand(1));
      continue;
    }

    if (Opc == ISD::SRL) {
      SDValue ShiftSrc = V.getOperand(0);
      SDValue ShiftAmt = V.getOperand(1);

      if (!IsSrc(ShiftSrc) || !isa<ConstantSDNode>(ShiftAmt))
        return SDValue();

      // A shift by the bit width or more has no defined result, and shifting
      // the APInt by such an amount would assert, so give up on the match.
      const APInt &ShAmt = cast<ConstantSDNode>(ShiftAmt)->getAPIntValue();
      if (ShAmt.uge(BitWidth))
        return SDValue();

      PartsMask |= RootMask << static_cast<unsigned>(ShAmt.getZExtValue());
      continue;
    }

    // Anything else must be the unshifted source itself.
    if (IsSrc(V)) {
      PartsMask |= RootMask;
      continue;
    }

    return SDValue();
  }

  if (!Src)
    return SDValue();

  SDLoc DL(Root);
  return DAG.getNode(ISD::AND, DL, VT,
                     {Src, DAG.getConstant(PartsMask, DL, VT)});
}
29053+
2898129054
/// This is a stub for TargetLowering::SimplifySetCC.
2898229055
SDValue DAGCombiner::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
2898329056
ISD::CondCode Cond, const SDLoc &DL,
2898429057
bool foldBooleans) {
2898529058
TargetLowering::DAGCombinerInfo
2898629059
DagCombineInfo(DAG, Level, false, this);
28987-
return TLI.SimplifySetCC(VT, N0, N1, Cond, foldBooleans, DagCombineInfo, DL);
29060+
if (SDValue C =
29061+
TLI.SimplifySetCC(VT, N0, N1, Cond, foldBooleans, DagCombineInfo, DL))
29062+
return C;
29063+
29064+
if ((Cond == ISD::SETNE || Cond == ISD::SETEQ) &&
29065+
N0.getOpcode() == ISD::AND && isNullConstant(N1)) {
29066+
29067+
if (SDValue Res = matchMergedBFX(N0, DAG, TLI))
29068+
return DAG.getSetCC(DL, VT, Res, N1, Cond);
29069+
}
29070+
29071+
return SDValue();
2898829072
}
2898929073

2899029074
/// Given an ISD::SDIV node expressing a divide by constant, return

llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll

Lines changed: 6 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -12,22 +12,15 @@ define i1 @workitem_zero() {
1212
; DAGISEL-GFX8-LABEL: workitem_zero:
1313
; DAGISEL-GFX8: ; %bb.0: ; %entry
1414
; DAGISEL-GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15-
; DAGISEL-GFX8-NEXT: v_lshrrev_b32_e32 v1, 10, v31
16-
; DAGISEL-GFX8-NEXT: v_lshrrev_b32_e32 v0, 20, v31
17-
; DAGISEL-GFX8-NEXT: v_or_b32_e32 v1, v31, v1
18-
; DAGISEL-GFX8-NEXT: v_or_b32_e32 v0, v1, v0
19-
; DAGISEL-GFX8-NEXT: v_and_b32_e32 v0, 0x3ff, v0
15+
; DAGISEL-GFX8-NEXT: v_and_b32_e32 v0, 0x3fffffff, v31
2016
; DAGISEL-GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
2117
; DAGISEL-GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
2218
; DAGISEL-GFX8-NEXT: s_setpc_b64 s[30:31]
2319
;
2420
; DAGISEL-GFX942-LABEL: workitem_zero:
2521
; DAGISEL-GFX942: ; %bb.0: ; %entry
2622
; DAGISEL-GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27-
; DAGISEL-GFX942-NEXT: v_lshrrev_b32_e32 v0, 20, v31
28-
; DAGISEL-GFX942-NEXT: v_lshrrev_b32_e32 v1, 10, v31
29-
; DAGISEL-GFX942-NEXT: v_or3_b32 v0, v31, v1, v0
30-
; DAGISEL-GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
23+
; DAGISEL-GFX942-NEXT: v_and_b32_e32 v0, 0x3fffffff, v31
3124
; DAGISEL-GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
3225
; DAGISEL-GFX942-NEXT: s_nop 1
3326
; DAGISEL-GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -40,11 +33,7 @@ define i1 @workitem_zero() {
4033
; DAGISEL-GFX12-NEXT: s_wait_samplecnt 0x0
4134
; DAGISEL-GFX12-NEXT: s_wait_bvhcnt 0x0
4235
; DAGISEL-GFX12-NEXT: s_wait_kmcnt 0x0
43-
; DAGISEL-GFX12-NEXT: v_lshrrev_b32_e32 v0, 20, v31
44-
; DAGISEL-GFX12-NEXT: v_lshrrev_b32_e32 v1, 10, v31
45-
; DAGISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
46-
; DAGISEL-GFX12-NEXT: v_or3_b32 v0, v31, v1, v0
47-
; DAGISEL-GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
36+
; DAGISEL-GFX12-NEXT: v_and_b32_e32 v0, 0x3fffffff, v31
4837
; DAGISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
4938
; DAGISEL-GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
5039
; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffd
@@ -106,22 +95,15 @@ define i1 @workitem_nonzero() {
10695
; DAGISEL-GFX8-LABEL: workitem_nonzero:
10796
; DAGISEL-GFX8: ; %bb.0: ; %entry
10897
; DAGISEL-GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
109-
; DAGISEL-GFX8-NEXT: v_lshrrev_b32_e32 v1, 10, v31
110-
; DAGISEL-GFX8-NEXT: v_lshrrev_b32_e32 v0, 20, v31
111-
; DAGISEL-GFX8-NEXT: v_or_b32_e32 v1, v31, v1
112-
; DAGISEL-GFX8-NEXT: v_or_b32_e32 v0, v1, v0
113-
; DAGISEL-GFX8-NEXT: v_and_b32_e32 v0, 0x3ff, v0
98+
; DAGISEL-GFX8-NEXT: v_and_b32_e32 v0, 0x3fffffff, v31
11499
; DAGISEL-GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
115100
; DAGISEL-GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
116101
; DAGISEL-GFX8-NEXT: s_setpc_b64 s[30:31]
117102
;
118103
; DAGISEL-GFX942-LABEL: workitem_nonzero:
119104
; DAGISEL-GFX942: ; %bb.0: ; %entry
120105
; DAGISEL-GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
121-
; DAGISEL-GFX942-NEXT: v_lshrrev_b32_e32 v0, 20, v31
122-
; DAGISEL-GFX942-NEXT: v_lshrrev_b32_e32 v1, 10, v31
123-
; DAGISEL-GFX942-NEXT: v_or3_b32 v0, v31, v1, v0
124-
; DAGISEL-GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
106+
; DAGISEL-GFX942-NEXT: v_and_b32_e32 v0, 0x3fffffff, v31
125107
; DAGISEL-GFX942-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
126108
; DAGISEL-GFX942-NEXT: s_nop 1
127109
; DAGISEL-GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -134,11 +116,7 @@ define i1 @workitem_nonzero() {
134116
; DAGISEL-GFX12-NEXT: s_wait_samplecnt 0x0
135117
; DAGISEL-GFX12-NEXT: s_wait_bvhcnt 0x0
136118
; DAGISEL-GFX12-NEXT: s_wait_kmcnt 0x0
137-
; DAGISEL-GFX12-NEXT: v_lshrrev_b32_e32 v0, 20, v31
138-
; DAGISEL-GFX12-NEXT: v_lshrrev_b32_e32 v1, 10, v31
139-
; DAGISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
140-
; DAGISEL-GFX12-NEXT: v_or3_b32 v0, v31, v1, v0
141-
; DAGISEL-GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
119+
; DAGISEL-GFX12-NEXT: v_and_b32_e32 v0, 0x3fffffff, v31
142120
; DAGISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
143121
; DAGISEL-GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
144122
; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffd

0 commit comments

Comments
 (0)