Skip to content

Commit 1f58cbe

Browse files
authored
[DAG] Fold (umin (sub a b) a) -> (usubo a b); (select usubo.1 a usubo.0) (#161651)
Fixes #161036.
1 parent 448146d commit 1f58cbe

File tree

3 files changed

+326
-0
lines changed

3 files changed

+326
-0
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6219,6 +6219,25 @@ SDValue DAGCombiner::visitIMINMAX(SDNode *N) {
62196219
SDLoc(N), VT, N0, N1))
62206220
return SD;
62216221

6222+
if (TLI.isOperationLegalOrCustom(ISD::USUBO, VT) &&
6223+
!TLI.isOperationLegalOrCustom(ISD::UMIN, VT)) {
6224+
SDValue B;
6225+
6226+
// (umin (sub a, b), a) -> (usubo a, b); (select usubo.1, a, usubo.0)
6227+
if (sd_match(N0, m_Sub(m_Specific(N1), m_Value(B)))) {
6228+
SDVTList VTs = DAG.getVTList(VT, getSetCCResultType(VT));
6229+
SDValue USO = DAG.getNode(ISD::USUBO, DL, VTs, N1, B);
6230+
return DAG.getSelect(DL, VT, USO.getValue(1), N1, USO.getValue(0));
6231+
}
6232+
6233+
// (umin a, (sub a, b)) -> (usubo a, b); (select usubo.1, a, usubo.0)
6234+
if (sd_match(N1, m_Sub(m_Specific(N0), m_Value(B)))) {
6235+
SDVTList VTs = DAG.getVTList(VT, getSetCCResultType(VT));
6236+
SDValue USO = DAG.getNode(ISD::USUBO, DL, VTs, N0, B);
6237+
return DAG.getSelect(DL, VT, USO.getValue(1), N0, USO.getValue(0));
6238+
}
6239+
}
6240+
62226241
// Simplify the operands using demanded-bits information.
62236242
if (SimplifyDemandedBits(SDValue(N, 0)))
62246243
return SDValue(N, 0);
Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
; RUN: llc < %s -mtriple=aarch64 | FileCheck %s

; GitHub issue #161036
; DAGCombiner folds umin(sub(a,b),a) -> usubo(a,b) + select when USUBO is
; legal and UMIN is not, so scalar i32/i64 lower to subs+csel on AArch64.

; Positive test : umin(sub(a,b),a) with scalar types should be folded
define i64 @underflow_compare_fold_i64(i64 %a, i64 %b) {
; CHECK-LABEL: underflow_compare_fold_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    subs x8, x0, x1
; CHECK-NEXT:    csel x0, x0, x8, lo
; CHECK-NEXT:    ret
  %sub = sub i64 %a, %b
  %cond = tail call i64 @llvm.umin.i64(i64 %sub, i64 %a)
  ret i64 %cond
}

; Positive test : umin(a,sub(a,b)) with scalar types should be folded
define i64 @underflow_compare_fold_i64_commute(i64 %a, i64 %b) {
; CHECK-LABEL: underflow_compare_fold_i64_commute:
; CHECK:       // %bb.0:
; CHECK-NEXT:    subs x8, x0, x1
; CHECK-NEXT:    csel x0, x0, x8, lo
; CHECK-NEXT:    ret
  %sub = sub i64 %a, %b
  %cond = tail call i64 @llvm.umin.i64(i64 %a, i64 %sub)
  ret i64 %cond
}

; Positive test : multi-use is OK since the sub instruction still runs once
define i64 @underflow_compare_fold_i64_multi_use(i64 %a, i64 %b, ptr addrspace(1) %ptr) {
; CHECK-LABEL: underflow_compare_fold_i64_multi_use:
; CHECK:       // %bb.0:
; CHECK-NEXT:    subs x8, x0, x1
; CHECK-NEXT:    csel x0, x0, x8, lo
; CHECK-NEXT:    str x8, [x2]
; CHECK-NEXT:    ret
  %sub = sub i64 %a, %b
  store i64 %sub, ptr addrspace(1) %ptr
  %cond = call i64 @llvm.umin.i64(i64 %sub, i64 %a)
  ret i64 %cond
}

; Positive test : i32
define i32 @underflow_compare_fold_i32(i32 %a, i32 %b) {
; CHECK-LABEL: underflow_compare_fold_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    subs w8, w0, w1
; CHECK-NEXT:    csel w0, w0, w8, lo
; CHECK-NEXT:    ret
  %sub = sub i32 %a, %b
  %cond = tail call i32 @llvm.umin.i32(i32 %sub, i32 %a)
  ret i32 %cond
}

; Positive test : i32, commuted operands
define i32 @underflow_compare_fold_i32_commute(i32 %a, i32 %b) {
; CHECK-LABEL: underflow_compare_fold_i32_commute:
; CHECK:       // %bb.0:
; CHECK-NEXT:    subs w8, w0, w1
; CHECK-NEXT:    csel w0, w0, w8, lo
; CHECK-NEXT:    ret
  %sub = sub i32 %a, %b
  %cond = tail call i32 @llvm.umin.i32(i32 %a, i32 %sub)
  ret i32 %cond
}

; Positive test : i32, multi-use of the sub
define i32 @underflow_compare_fold_i32_multi_use(i32 %a, i32 %b, ptr addrspace(1) %ptr) {
; CHECK-LABEL: underflow_compare_fold_i32_multi_use:
; CHECK:       // %bb.0:
; CHECK-NEXT:    subs w8, w0, w1
; CHECK-NEXT:    csel w0, w0, w8, lo
; CHECK-NEXT:    str w8, [x2]
; CHECK-NEXT:    ret
  %sub = sub i32 %a, %b
  store i32 %sub, ptr addrspace(1) %ptr
  %cond = call i32 @llvm.umin.i32(i32 %sub, i32 %a)
  ret i32 %cond
}

; Negative test : i16 is promoted before the combine runs, so no fold
define i16 @underflow_compare_fold_i16(i16 %a, i16 %b) {
; CHECK-LABEL: underflow_compare_fold_i16:
; CHECK:       // %bb.0:
; CHECK:    sub w8, w0, w1
; CHECK:    and w9, w0, #0xffff
; CHECK:    and w8, w8, #0xffff
; CHECK:    cmp w8, w9
; CHECK:    csel w0, w8, w9, lo
; CHECK:    ret
  %sub = sub i16 %a, %b
  %cond = tail call i16 @llvm.umin.i16(i16 %sub, i16 %a)
  ret i16 %cond
}

; Negative test : i16, commuted operands
define i16 @underflow_compare_fold_i16_commute(i16 %a, i16 %b) {
; CHECK-LABEL: underflow_compare_fold_i16_commute:
; CHECK:       // %bb.0:
; CHECK:    sub w8, w0, w1
; CHECK:    and w9, w0, #0xffff
; CHECK:    and w8, w8, #0xffff
; CHECK:    cmp w9, w8
; CHECK:    csel w0, w9, w8, lo
; CHECK:    ret
  %sub = sub i16 %a, %b
  %cond = tail call i16 @llvm.umin.i16(i16 %a, i16 %sub)
  ret i16 %cond
}

; Negative test : i16, multi-use of the sub
define i16 @underflow_compare_fold_i16_multi_use(i16 %a, i16 %b, ptr addrspace(1) %ptr) {
; CHECK-LABEL: underflow_compare_fold_i16_multi_use:
; CHECK:       // %bb.0:
; CHECK:    sub w8, w0, w1
; CHECK:    and w9, w0, #0xffff
; CHECK:    and w10, w8, #0xffff
; CHECK:    strh w8, [x2]
; CHECK:    cmp w10, w9
; CHECK:    csel w0, w10, w9, lo
; CHECK:    ret
  %sub = sub i16 %a, %b
  store i16 %sub, ptr addrspace(1) %ptr
  %cond = call i16 @llvm.umin.i16(i16 %sub, i16 %a)
  ret i16 %cond
}

; Negative test, vector types : umin(sub(a,b),a) but with vectors
; (vector UMIN is legal on AArch64, so the combine bails out)
define <16 x i8> @underflow_compare_dontfold_vectors(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: underflow_compare_dontfold_vectors:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sub v1.16b, v0.16b, v1.16b
; CHECK-NEXT:    umin v0.16b, v1.16b, v0.16b
; CHECK-NEXT:    ret
  %sub = sub <16 x i8> %a, %b
  %cond = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %sub, <16 x i8> %a)
  ret <16 x i8> %cond
}

; Negative test, pattern mismatch : umin(add(a,b),a)
define i64 @umin_add(i64 %a, i64 %b) {
; CHECK-LABEL: umin_add:
; CHECK:       // %bb.0:
; CHECK-NEXT:    add x8, x0, x1
; CHECK-NEXT:    cmp x8, x0
; CHECK-NEXT:    csel x0, x8, x0, lo
; CHECK-NEXT:    ret
  %add = add i64 %a, %b
  %cond = tail call i64 @llvm.umin.i64(i64 %add, i64 %a)
  ret i64 %cond
}
Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
; RUN: llc < %s -mtriple=x86_64 | FileCheck %s

; GitHub issue #161036
; DAGCombiner folds umin(sub(a,b),a) -> usubo(a,b) + select when USUBO is
; legal and UMIN is not, so scalar i16/i32/i64 lower to sub+cmovb on x86.

; Positive test : umin(sub(a,b),a) with scalar types should be folded
define i64 @underflow_compare_fold_i64(i64 %a, i64 %b) {
; CHECK-LABEL: underflow_compare_fold_i64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movq %rdi, %rax
; CHECK-NEXT:    subq %rsi, %rax
; CHECK-NEXT:    cmovbq %rdi, %rax
; CHECK-NEXT:    retq
  %sub = sub i64 %a, %b
  %cond = tail call i64 @llvm.umin.i64(i64 %sub, i64 %a)
  ret i64 %cond
}

; Positive test : umin(a,sub(a,b)) with scalar types should be folded
define i64 @underflow_compare_fold_i64_commute(i64 %a, i64 %b) {
; CHECK-LABEL: underflow_compare_fold_i64_commute:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movq %rdi, %rax
; CHECK-NEXT:    subq %rsi, %rax
; CHECK-NEXT:    cmovbq %rdi, %rax
; CHECK-NEXT:    retq
  %sub = sub i64 %a, %b
  %cond = tail call i64 @llvm.umin.i64(i64 %a, i64 %sub)
  ret i64 %cond
}

; Positive test : multi-use is OK since the sub instruction still runs once
define i64 @underflow_compare_fold_i64_multi_use(i64 %a, i64 %b, ptr addrspace(1) %ptr) {
; CHECK-LABEL: underflow_compare_fold_i64_multi_use:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movq %rdi, %rax
; CHECK-NEXT:    subq %rsi, %rax
; CHECK-NEXT:    movq %rax, (%rdx)
; CHECK-NEXT:    cmovbq %rdi, %rax
; CHECK-NEXT:    retq
  %sub = sub i64 %a, %b
  store i64 %sub, ptr addrspace(1) %ptr
  %cond = call i64 @llvm.umin.i64(i64 %sub, i64 %a)
  ret i64 %cond
}

; Positive test : i32
define i32 @underflow_compare_fold_i32(i32 %a, i32 %b) {
; CHECK-LABEL: underflow_compare_fold_i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    subl %esi, %eax
; CHECK-NEXT:    cmovbl %edi, %eax
; CHECK-NEXT:    retq
  %sub = sub i32 %a, %b
  %cond = tail call i32 @llvm.umin.i32(i32 %sub, i32 %a)
  ret i32 %cond
}

; Positive test : i32, commuted operands
define i32 @underflow_compare_fold_i32_commute(i32 %a, i32 %b) {
; CHECK-LABEL: underflow_compare_fold_i32_commute:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    subl %esi, %eax
; CHECK-NEXT:    cmovbl %edi, %eax
; CHECK-NEXT:    retq
  %sub = sub i32 %a, %b
  %cond = tail call i32 @llvm.umin.i32(i32 %a, i32 %sub)
  ret i32 %cond
}

; Positive test : i32, multi-use of the sub
define i32 @underflow_compare_fold_i32_multi_use(i32 %a, i32 %b, ptr addrspace(1) %ptr) {
; CHECK-LABEL: underflow_compare_fold_i32_multi_use:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    subl %esi, %eax
; CHECK-NEXT:    movl %eax, (%rdx)
; CHECK-NEXT:    cmovbl %edi, %eax
; CHECK-NEXT:    retq
  %sub = sub i32 %a, %b
  store i32 %sub, ptr addrspace(1) %ptr
  %cond = call i32 @llvm.umin.i32(i32 %sub, i32 %a)
  ret i32 %cond
}

; Positive test : i16 (unlike AArch64, x86 folds i16 too)
define i16 @underflow_compare_fold_i16(i16 %a, i16 %b) {
; CHECK-LABEL: underflow_compare_fold_i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    subw %si, %ax
; CHECK-NEXT:    cmovbl %edi, %eax
; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
; CHECK-NEXT:    retq
  %sub = sub i16 %a, %b
  %cond = tail call i16 @llvm.umin.i16(i16 %sub, i16 %a)
  ret i16 %cond
}

; Positive test : i16, commuted operands
define i16 @underflow_compare_fold_i16_commute(i16 %a, i16 %b) {
; CHECK-LABEL: underflow_compare_fold_i16_commute:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    subw %si, %ax
; CHECK-NEXT:    cmovbl %edi, %eax
; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
; CHECK-NEXT:    retq
  %sub = sub i16 %a, %b
  %cond = tail call i16 @llvm.umin.i16(i16 %a, i16 %sub)
  ret i16 %cond
}

; Positive test : i16, multi-use of the sub
define i16 @underflow_compare_fold_i16_multi_use(i16 %a, i16 %b, ptr addrspace(1) %ptr) {
; CHECK-LABEL: underflow_compare_fold_i16_multi_use:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    subw %si, %ax
; CHECK-NEXT:    movw %ax, (%rdx)
; CHECK-NEXT:    cmovbl %edi, %eax
; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
; CHECK-NEXT:    retq
  %sub = sub i16 %a, %b
  store i16 %sub, ptr addrspace(1) %ptr
  %cond = call i16 @llvm.umin.i16(i16 %sub, i16 %a)
  ret i16 %cond
}

; Negative test, vector types : umin(sub(a,b),a) but with vectors
; (vector UMIN is legal with SSE4.1-era pminub, so the combine bails out)
define <16 x i8> @underflow_compare_dontfold_vectors(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: underflow_compare_dontfold_vectors:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movdqa %xmm0, %xmm2
; CHECK-NEXT:    psubb %xmm1, %xmm2
; CHECK-NEXT:    pminub %xmm2, %xmm0
; CHECK-NEXT:    retq
  %sub = sub <16 x i8> %a, %b
  %cond = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %sub, <16 x i8> %a)
  ret <16 x i8> %cond
}

; Negative test, pattern mismatch : umin(add(a,b),a)
define i64 @umin_add(i64 %a, i64 %b) {
; CHECK-LABEL: umin_add:
; CHECK:       # %bb.0:
; CHECK-NEXT:    leaq (%rsi,%rdi), %rax
; CHECK-NEXT:    cmpq %rdi, %rax
; CHECK-NEXT:    cmovaeq %rdi, %rax
; CHECK-NEXT:    retq
  %add = add i64 %a, %b
  %cond = tail call i64 @llvm.umin.i64(i64 %add, i64 %a)
  ret i64 %cond
}

0 commit comments

Comments
 (0)