Skip to content

Commit 48d4700

Browse files
Pierre-vhkrishna2803
authored and committed
[DAG] Fold (setcc ((x | x >> c0 | ...) & mask)) sequences (llvm#146054)
Fold sequences where we extract several groups of contiguous bits from a value, merge them into the low bits, and then check whether those low bits are zero or not. Usually the `and` would be on the outside (the leaves) of the expression, but the DAG canonicalizes it to a single `and` at the root of the expression. I put this in DAGCombiner instead of the target combiner because it is a generic, valid transform that is also fairly niche, so I think there is little risk of a combine loop. See llvm#136727
1 parent 6c2ec13 commit 48d4700

File tree

3 files changed

+217
-29
lines changed

3 files changed

+217
-29
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 88 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28982,13 +28982,100 @@ SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1,
2898228982
return SDValue();
2898328983
}
2898428984

28985+
/// Try to rewrite a "merged bitfield extract" rooted at \p Root into a single
/// mask of the underlying source value.
///
/// Recognized shape:
///   (and (or X, (srl X, C0), (srl X, C1), ...), Mask)
/// Every leaf of the OR tree must be X itself or a logical right shift of X by
/// a constant. On a match the result is
///   (and X, Mask | (Mask << C0) | (Mask << C1) | ...)
/// which inspects the same bits of X without any shifts. The two forms agree
/// only on whether the value is zero, so this is intended for callers that
/// compare the result against zero.
///
/// \param Root candidate AND node.
/// \param DAG  used to build the replacement AND and its constant operand.
/// \param TLI  not used by the current implementation.
/// \returns the replacement node, or an empty SDValue when \p Root does not
///          match the pattern.
static SDValue matchMergedBFX(SDValue Root, SelectionDAG &DAG,
                              const TargetLowering &TLI) {
  const EVT VT = Root.getValueType();

  // TODO: Support vectors?
  if (Root.getOpcode() != ISD::AND || !VT.isScalarInteger())
    return SDValue();

  SDValue OrTree = Root.getOperand(0);
  SDValue MaskOp = Root.getOperand(1);
  if (!isa<ConstantSDNode>(MaskOp) || OrTree.getOpcode() != ISD::OR)
    return SDValue();

  const APInt RootMask = cast<ConstantSDNode>(MaskOp)->getAsAPIntVal();
  const unsigned MaskBits = RootMask.getBitWidth();

  // The single value every leaf must derive from. The first leaf visited
  // binds it; every later leaf has to refer to the same value.
  SDValue Src;
  auto BindOrMatchSrc = [&Src](SDValue Candidate) -> bool {
    if (Src)
      return Src == Candidate;
    Src = Candidate;
    return true;
  };

  // Accumulates, in terms of Src's own bit positions, every bit that the
  // original expression ends up testing.
  APInt CombinedMask(VT.getSizeInBits(), 0);

  SmallVector<SDValue> Pending = {OrTree};
  while (!Pending.empty()) {
    SDValue Cur = Pending.pop_back_val();

    // Once the source is known, any other node with more than one use makes
    // us give up; the source itself may be used multiple times.
    if (Src && Src != Cur && !Cur.hasOneUse())
      return SDValue();

    switch (Cur.getOpcode()) {
    case ISD::OR:
      // Interior node: visit both operands.
      Pending.push_back(Cur.getOperand(0));
      Pending.push_back(Cur.getOperand(1));
      break;

    case ISD::SRL: {
      // Shifted leaf: (srl Src, C) contributes Mask << C.
      SDValue Shifted = Cur.getOperand(0);
      SDValue Amt = Cur.getOperand(1);

      if (!BindOrMatchSrc(Shifted))
        return SDValue();
      if (!isa<ConstantSDNode>(Amt))
        return SDValue();

      const auto AmtVal = cast<ConstantSDNode>(Amt)->getAsZExtVal();
      if (AmtVal > MaskBits)
        return SDValue();

      CombinedMask |= (RootMask << AmtVal);
      break;
    }

    default:
      // Unshifted leaf: must be the source itself and contributes Mask as is.
      if (!BindOrMatchSrc(Cur))
        return SDValue();
      CombinedMask |= RootMask;
      break;
    }
  }

  if (!Src)
    return SDValue();

  SDLoc DL(Root);
  SDValue MergedMask = DAG.getConstant(CombinedMask, DL, VT);
  return DAG.getNode(ISD::AND, DL, VT, {Src, MergedMask});
}
29060+
2898529061
/// Simplify a setcc of \p N0 and \p N1 under condition \p Cond.
/// First forwards to TargetLowering::SimplifySetCC; if that produces nothing,
/// tries the merged bitfield-extract fold (matchMergedBFX) when an AND is
/// compared for integer (in)equality against zero. Returns an empty SDValue
/// when neither simplification applies.
2898629062
SDValue DAGCombiner::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
2898729063
ISD::CondCode Cond, const SDLoc &DL,
2898829064
bool foldBooleans) {
2898929065
TargetLowering::DAGCombinerInfo
2899029066
DagCombineInfo(DAG, Level, false, this);
28991-
return TLI.SimplifySetCC(VT, N0, N1, Cond, foldBooleans, DagCombineInfo, DL);
29067+
// Let TLI.SimplifySetCC try first and use its result if it produced one.
if (SDValue C =
29068+
TLI.SimplifySetCC(VT, N0, N1, Cond, foldBooleans, DagCombineInfo, DL))
29069+
return C;
29070+
29071+
// Otherwise, only for (setcc (and ...), 0, eq/ne): see whether the AND is a
// merged bitfield extract that can be replaced by a single mask of its source.
if (ISD::isIntEqualitySetCC(Cond) && N0.getOpcode() == ISD::AND &&
29072+
isNullConstant(N1)) {
29073+
29074+
// On a match, rebuild the compare with the same zero operand and condition,
// but on the simplified AND.
if (SDValue Res = matchMergedBFX(N0, DAG, TLI))
29075+
return DAG.getSetCC(DL, VT, Res, N1, Cond);
29076+
}
29077+
29078+
// Nothing applied.
return SDValue();
2899229079
}
2899329080

2899429081
/// Given an ISD::SDIV node expressing a divide by constant, return
Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -O3 -mtriple=amdgcn -mcpu=fiji %s -o - | FileCheck %s
3+
4+
; Three 5-bit fields of an i16 (bit offsets 0, 5 and 10) are ORed together and
; compared for equality with zero. The checks expect a single AND with 0x7fff
; (31 | 31 << 5 | 31 << 10) feeding the compare, with no shifts or ORs left.
define i1 @basic_eq_i16_3x5(i16 %arg) {
5+
; CHECK-LABEL: basic_eq_i16_3x5:
6+
; CHECK: ; %bb.0: ; %entry
7+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8+
; CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
9+
; CHECK-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
10+
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
11+
; CHECK-NEXT: s_setpc_b64 s[30:31]
12+
entry:
13+
%a = and i16 %arg, 31
14+
%sh5 = lshr i16 %arg, 5
15+
%b = and i16 %sh5, 31
16+
%or = or i16 %a, %b
17+
%sh10 = lshr i16 %arg, 10
18+
%c = and i16 %sh10, 31
19+
%or1 = or i16 %or, %c
20+
%cmp = icmp eq i16 %or1, 0
21+
ret i1 %cmp
22+
}
23+
24+
; i32 variant: three 5-bit fields at bit offsets 0, 5 and 10 are ORed together
; and compared for equality with zero. The checks expect a single AND with
; 0x7fff followed by a 32-bit equality compare.
define i1 @basic_eq_i32_3x5(i32 %arg) {
25+
; CHECK-LABEL: basic_eq_i32_3x5:
26+
; CHECK: ; %bb.0: ; %entry
27+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28+
; CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
29+
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
30+
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
31+
; CHECK-NEXT: s_setpc_b64 s[30:31]
32+
entry:
33+
%a = and i32 %arg, 31
34+
%sh5 = lshr i32 %arg, 5
35+
%b = and i32 %sh5, 31
36+
%or = or i32 %a, %b
37+
%sh10 = lshr i32 %arg, 10
38+
%c = and i32 %sh10, 31
39+
%or1 = or i32 %or, %c
40+
%cmp = icmp eq i32 %or1, 0
41+
ret i1 %cmp
42+
}
43+
44+
; i64 variant: three 5-bit fields at bit offsets 0, 5 and 10 are ORed together
; and compared for equality with zero. The checks expect a single AND with
; 0x7fff and a 32-bit compare on v0 only (presumably the low half of the i64
; argument, since the merged mask fits in 32 bits -- confirm against the ABI).
define i1 @basic_eq_i64_3x5(i64 %arg) {
45+
; CHECK-LABEL: basic_eq_i64_3x5:
46+
; CHECK: ; %bb.0: ; %entry
47+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
48+
; CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
49+
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
50+
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
51+
; CHECK-NEXT: s_setpc_b64 s[30:31]
52+
entry:
53+
%a = and i64 %arg, 31
54+
%sh5 = lshr i64 %arg, 5
55+
%b = and i64 %sh5, 31
56+
%or = or i64 %a, %b
57+
%sh10 = lshr i64 %arg, 10
58+
%c = and i64 %sh10, 31
59+
%or1 = or i64 %or, %c
60+
%cmp = icmp eq i64 %or1, 0
61+
ret i1 %cmp
62+
}
63+
64+
; Same pattern as the i32 equality case (5-bit fields at bit offsets 0, 5 and
; 10) but compared with `icmp ne`. The checks expect a single AND with 0x7fff
; followed by a 32-bit not-equal compare.
define i1 @basic_ne_i32_3x5(i32 %arg) {
65+
; CHECK-LABEL: basic_ne_i32_3x5:
66+
; CHECK: ; %bb.0: ; %entry
67+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
68+
; CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
69+
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
70+
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
71+
; CHECK-NEXT: s_setpc_b64 s[30:31]
72+
entry:
73+
%a = and i32 %arg, 31
74+
%sh5 = lshr i32 %arg, 5
75+
%b = and i32 %sh5, 31
76+
%or = or i32 %a, %b
77+
%sh10 = lshr i32 %arg, 10
78+
%c = and i32 %sh10, 31
79+
%or1 = or i32 %or, %c
80+
%cmp = icmp ne i32 %or1, 0
81+
ret i1 %cmp
82+
}
83+
84+
; 5-bit fields at bit offsets 0, 7 and 10: bits 5-6 of %arg are never tested,
; so the merged mask has a hole. The checks expect a single AND with 0x7f9f
; (31 | 31 << 7 | 31 << 10).
; NOTE(review): despite the `eq_` prefix in the name the compare is `icmp ne`,
; and %sh5 actually shifts by 7 -- confirm whether the naming is intentional.
define i1 @eq_i32_3x5_holes_in_mask(i32 %arg) {
85+
; CHECK-LABEL: eq_i32_3x5_holes_in_mask:
86+
; CHECK: ; %bb.0: ; %entry
87+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
88+
; CHECK-NEXT: v_and_b32_e32 v0, 0x7f9f, v0
89+
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
90+
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
91+
; CHECK-NEXT: s_setpc_b64 s[30:31]
92+
entry:
93+
%a = and i32 %arg, 31
94+
%sh5 = lshr i32 %arg, 7
95+
%b = and i32 %sh5, 31
96+
%or = or i32 %a, %b
97+
%sh10 = lshr i32 %arg, 10
98+
%c = and i32 %sh10, 31
99+
%or1 = or i32 %or, %c
100+
%cmp = icmp ne i32 %or1, 0
101+
ret i1 %cmp
102+
}
103+
104+
; Every field is taken from a shifted copy of %arg (shift amounts 2, 7 and 10);
; there is no unshifted use of %arg in the OR tree. The checks expect a single
; AND with 0x7ffc (31 << 2 | 31 << 7 | 31 << 10).
; NOTE(review): despite the `eq_` prefix in the name the compare is `icmp ne`,
; and %sh5 actually shifts by 7 -- confirm whether the naming is intentional.
define i1 @eq_i32_3x5_all_shifted(i32 %arg) {
105+
; CHECK-LABEL: eq_i32_3x5_all_shifted:
106+
; CHECK: ; %bb.0: ; %entry
107+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
108+
; CHECK-NEXT: v_and_b32_e32 v0, 0x7ffc, v0
109+
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
110+
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
111+
; CHECK-NEXT: s_setpc_b64 s[30:31]
112+
entry:
113+
%sh2 = lshr i32 %arg, 2
114+
%a = and i32 %sh2, 31
115+
%sh5 = lshr i32 %arg, 7
116+
%b = and i32 %sh5, 31
117+
%or = or i32 %a, %b
118+
%sh10 = lshr i32 %arg, 10
119+
%c = and i32 %sh10, 31
120+
%or1 = or i32 %or, %c
121+
%cmp = icmp ne i32 %or1, 0
122+
ret i1 %cmp
123+
}

llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll

Lines changed: 6 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -12,22 +12,15 @@ define i1 @workitem_zero() {
1212
; DAGISEL-GFX8-LABEL: workitem_zero:
1313
; DAGISEL-GFX8: ; %bb.0: ; %entry
1414
; DAGISEL-GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15-
; DAGISEL-GFX8-NEXT: v_lshrrev_b32_e32 v1, 10, v31
16-
; DAGISEL-GFX8-NEXT: v_lshrrev_b32_e32 v0, 20, v31
17-
; DAGISEL-GFX8-NEXT: v_or_b32_e32 v1, v31, v1
18-
; DAGISEL-GFX8-NEXT: v_or_b32_e32 v0, v1, v0
19-
; DAGISEL-GFX8-NEXT: v_and_b32_e32 v0, 0x3ff, v0
15+
; DAGISEL-GFX8-NEXT: v_and_b32_e32 v0, 0x3fffffff, v31
2016
; DAGISEL-GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
2117
; DAGISEL-GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
2218
; DAGISEL-GFX8-NEXT: s_setpc_b64 s[30:31]
2319
;
2420
; DAGISEL-GFX942-LABEL: workitem_zero:
2521
; DAGISEL-GFX942: ; %bb.0: ; %entry
2622
; DAGISEL-GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27-
; DAGISEL-GFX942-NEXT: v_lshrrev_b32_e32 v0, 20, v31
28-
; DAGISEL-GFX942-NEXT: v_lshrrev_b32_e32 v1, 10, v31
29-
; DAGISEL-GFX942-NEXT: v_or3_b32 v0, v31, v1, v0
30-
; DAGISEL-GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
23+
; DAGISEL-GFX942-NEXT: v_and_b32_e32 v0, 0x3fffffff, v31
3124
; DAGISEL-GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
3225
; DAGISEL-GFX942-NEXT: s_nop 1
3326
; DAGISEL-GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -40,11 +33,7 @@ define i1 @workitem_zero() {
4033
; DAGISEL-GFX12-NEXT: s_wait_samplecnt 0x0
4134
; DAGISEL-GFX12-NEXT: s_wait_bvhcnt 0x0
4235
; DAGISEL-GFX12-NEXT: s_wait_kmcnt 0x0
43-
; DAGISEL-GFX12-NEXT: v_lshrrev_b32_e32 v0, 20, v31
44-
; DAGISEL-GFX12-NEXT: v_lshrrev_b32_e32 v1, 10, v31
45-
; DAGISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
46-
; DAGISEL-GFX12-NEXT: v_or3_b32 v0, v31, v1, v0
47-
; DAGISEL-GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
36+
; DAGISEL-GFX12-NEXT: v_and_b32_e32 v0, 0x3fffffff, v31
4837
; DAGISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
4938
; DAGISEL-GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
5039
; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffd
@@ -106,22 +95,15 @@ define i1 @workitem_nonzero() {
10695
; DAGISEL-GFX8-LABEL: workitem_nonzero:
10796
; DAGISEL-GFX8: ; %bb.0: ; %entry
10897
; DAGISEL-GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
109-
; DAGISEL-GFX8-NEXT: v_lshrrev_b32_e32 v1, 10, v31
110-
; DAGISEL-GFX8-NEXT: v_lshrrev_b32_e32 v0, 20, v31
111-
; DAGISEL-GFX8-NEXT: v_or_b32_e32 v1, v31, v1
112-
; DAGISEL-GFX8-NEXT: v_or_b32_e32 v0, v1, v0
113-
; DAGISEL-GFX8-NEXT: v_and_b32_e32 v0, 0x3ff, v0
98+
; DAGISEL-GFX8-NEXT: v_and_b32_e32 v0, 0x3fffffff, v31
11499
; DAGISEL-GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
115100
; DAGISEL-GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
116101
; DAGISEL-GFX8-NEXT: s_setpc_b64 s[30:31]
117102
;
118103
; DAGISEL-GFX942-LABEL: workitem_nonzero:
119104
; DAGISEL-GFX942: ; %bb.0: ; %entry
120105
; DAGISEL-GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
121-
; DAGISEL-GFX942-NEXT: v_lshrrev_b32_e32 v0, 20, v31
122-
; DAGISEL-GFX942-NEXT: v_lshrrev_b32_e32 v1, 10, v31
123-
; DAGISEL-GFX942-NEXT: v_or3_b32 v0, v31, v1, v0
124-
; DAGISEL-GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
106+
; DAGISEL-GFX942-NEXT: v_and_b32_e32 v0, 0x3fffffff, v31
125107
; DAGISEL-GFX942-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
126108
; DAGISEL-GFX942-NEXT: s_nop 1
127109
; DAGISEL-GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -134,11 +116,7 @@ define i1 @workitem_nonzero() {
134116
; DAGISEL-GFX12-NEXT: s_wait_samplecnt 0x0
135117
; DAGISEL-GFX12-NEXT: s_wait_bvhcnt 0x0
136118
; DAGISEL-GFX12-NEXT: s_wait_kmcnt 0x0
137-
; DAGISEL-GFX12-NEXT: v_lshrrev_b32_e32 v0, 20, v31
138-
; DAGISEL-GFX12-NEXT: v_lshrrev_b32_e32 v1, 10, v31
139-
; DAGISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
140-
; DAGISEL-GFX12-NEXT: v_or3_b32 v0, v31, v1, v0
141-
; DAGISEL-GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
119+
; DAGISEL-GFX12-NEXT: v_and_b32_e32 v0, 0x3fffffff, v31
142120
; DAGISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
143121
; DAGISEL-GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
144122
; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffd

0 commit comments

Comments
 (0)