Skip to content

Commit 3a66e88

Browse files
committed
[InstCombine] Canonicalize abs(sub(ext(X),ext(Y))) -> ext(sub(max(X,Y),min(X,Y)))
This fold moves the extension to after the abs, so the absolute difference is computed in the narrower source type and only the result is extended. This form generates identical scalar code, but is more profitable for vectorization due to the smaller element type: it allows higher VFs to be selected and avoids expensive vector extends. Proofs: https://alive2.llvm.org/ce/z/rChrWe, https://alive2.llvm.org/ce/z/D5E4bJ
1 parent a9ca220 commit 3a66e88

File tree

4 files changed

+310
-221
lines changed

4 files changed

+310
-221
lines changed

llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1920,6 +1920,23 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
19201920
if (match(IIOperand, m_SRem(m_Value(X), m_APInt(C))) && *C == 2)
19211921
return BinaryOperator::CreateAnd(X, ConstantInt::get(II->getType(), 1));
19221922

1923+
// abs (sub (sext X, sext Y)) -> zext (sub (smax (X, Y), smin (X, Y)))
1924+
bool AbsSExtDiff = match(
1925+
IIOperand, m_OneUse(m_Sub(m_SExt(m_Value(X)), m_SExt(m_Value(Y)))));
1926+
// abs (sub (zext X, zext Y)) -> zext (sub (umax (X, Y), umin (X, Y)))
1927+
bool AbsZExtDiff =
1928+
!AbsSExtDiff && match(IIOperand, m_OneUse(m_Sub(m_ZExt(m_Value(X)),
1929+
m_ZExt(m_Value(Y)))));
1930+
if ((AbsSExtDiff || AbsZExtDiff) && X->getType() == Y->getType()) {
1931+
bool IsSigned = AbsSExtDiff;
1932+
Value *Max = Builder.CreateBinaryIntrinsic(
1933+
IsSigned ? Intrinsic::smax : Intrinsic::umax, X, Y);
1934+
Value *Min = Builder.CreateBinaryIntrinsic(
1935+
IsSigned ? Intrinsic::smin : Intrinsic::umin, X, Y);
1936+
Value *Sub = Builder.CreateSub(Max, Min);
1937+
return CastInst::Create(Instruction::ZExt, Sub, II->getType());
1938+
}
1939+
19231940
break;
19241941
}
19251942
case Intrinsic::umin: {
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
2+
; RUN: opt < %s -passes=instcombine -S | FileCheck %s
3+
4+
; abs (sub (sext X, sext Y)) -> zext (sub (smax (X, Y), smin (X, Y)))
5+
; Proof: https://alive2.llvm.org/ce/z/D5E4bJ
6+
7+
; abs (sub (zext X, zext Y)) -> zext (sub (umax (X, Y), umin (X, Y)))
8+
; Proof: https://alive2.llvm.org/ce/z/rChrWe
9+
10+
define i32 @sext_i8(i8 %a, i8 %b) {
11+
; CHECK-LABEL: define i32 @sext_i8(
12+
; CHECK-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) {
13+
; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.smax.i8(i8 [[A]], i8 [[B]])
14+
; CHECK-NEXT: [[TMP2:%.*]] = call i8 @llvm.smin.i8(i8 [[A]], i8 [[B]])
15+
; CHECK-NEXT: [[TMP3:%.*]] = sub i8 [[TMP1]], [[TMP2]]
16+
; CHECK-NEXT: [[ABS:%.*]] = zext i8 [[TMP3]] to i32
17+
; CHECK-NEXT: ret i32 [[ABS]]
18+
;
19+
%ext.a = sext i8 %a to i32
20+
%ext.b = sext i8 %b to i32
21+
%sub = sub nsw i32 %ext.a, %ext.b
22+
%abs = call i32 @llvm.abs(i32 %sub, i1 true)
23+
ret i32 %abs
24+
}
25+
26+
define i32 @zext_i8(i8 %a, i8 %b) {
27+
; CHECK-LABEL: define i32 @zext_i8(
28+
; CHECK-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) {
29+
; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.umax.i8(i8 [[A]], i8 [[B]])
30+
; CHECK-NEXT: [[TMP2:%.*]] = call i8 @llvm.umin.i8(i8 [[A]], i8 [[B]])
31+
; CHECK-NEXT: [[TMP3:%.*]] = sub i8 [[TMP1]], [[TMP2]]
32+
; CHECK-NEXT: [[ABS:%.*]] = zext i8 [[TMP3]] to i32
33+
; CHECK-NEXT: ret i32 [[ABS]]
34+
;
35+
%ext.a = zext i8 %a to i32
36+
%ext.b = zext i8 %b to i32
37+
%sub = sub nsw i32 %ext.a, %ext.b
38+
%abs = call i32 @llvm.abs(i32 %sub, i1 true)
39+
ret i32 %abs
40+
}
41+
42+
define i64 @zext_i32(i32 %a, i32 %b) {
43+
; CHECK-LABEL: define i64 @zext_i32(
44+
; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
45+
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.umax.i32(i32 [[A]], i32 [[B]])
46+
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.umin.i32(i32 [[A]], i32 [[B]])
47+
; CHECK-NEXT: [[TMP3:%.*]] = sub i32 [[TMP1]], [[TMP2]]
48+
; CHECK-NEXT: [[ABS:%.*]] = zext i32 [[TMP3]] to i64
49+
; CHECK-NEXT: ret i64 [[ABS]]
50+
;
51+
%ext.a = zext i32 %a to i64
52+
%ext.b = zext i32 %b to i64
53+
%sub = sub nsw i64 %ext.a, %ext.b
54+
%abs = call i64 @llvm.abs(i64 %sub, i1 true)
55+
ret i64 %abs
56+
}
57+
58+
define <16 x i32> @vec_source(<16 x i8> %a, <16 x i8> %b) {
59+
; CHECK-LABEL: define <16 x i32> @vec_source(
60+
; CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) {
61+
; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
62+
; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
63+
; CHECK-NEXT: [[TMP3:%.*]] = sub <16 x i8> [[TMP1]], [[TMP2]]
64+
; CHECK-NEXT: [[ABS:%.*]] = zext <16 x i8> [[TMP3]] to <16 x i32>
65+
; CHECK-NEXT: ret <16 x i32> [[ABS]]
66+
;
67+
%ext.a = sext <16 x i8> %a to <16 x i32>
68+
%ext.b = sext <16 x i8> %b to <16 x i32>
69+
%sub = sub nsw <16 x i32> %ext.a, %ext.b
70+
%abs = call <16 x i32> @llvm.abs(<16 x i32> %sub, i1 true)
71+
ret <16 x i32> %abs
72+
}
73+
74+
define i32 @mixed_extend(i8 %a, i8 %b) {
75+
; CHECK-LABEL: define i32 @mixed_extend(
76+
; CHECK-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) {
77+
; CHECK-NEXT: [[EXT_A:%.*]] = sext i8 [[A]] to i32
78+
; CHECK-NEXT: [[EXT_B:%.*]] = zext i8 [[B]] to i32
79+
; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[EXT_A]], [[EXT_B]]
80+
; CHECK-NEXT: [[ABS:%.*]] = call i32 @llvm.abs.i32(i32 [[SUB]], i1 true)
81+
; CHECK-NEXT: ret i32 [[ABS]]
82+
;
83+
%ext.a = sext i8 %a to i32
84+
%ext.b = zext i8 %b to i32
85+
%sub = sub nsw i32 %ext.a, %ext.b
86+
%abs = call i32 @llvm.abs(i32 %sub, i1 true)
87+
ret i32 %abs
88+
}
89+
90+
define i32 @mixed_source_types(i16 %a, i8 %b) {
91+
; CHECK-LABEL: define i32 @mixed_source_types(
92+
; CHECK-SAME: i16 [[A:%.*]], i8 [[B:%.*]]) {
93+
; CHECK-NEXT: [[EXT_A:%.*]] = zext i16 [[A]] to i32
94+
; CHECK-NEXT: [[EXT_B:%.*]] = zext i8 [[B]] to i32
95+
; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[EXT_A]], [[EXT_B]]
96+
; CHECK-NEXT: [[ABS:%.*]] = call i32 @llvm.abs.i32(i32 [[SUB]], i1 true)
97+
; CHECK-NEXT: ret i32 [[ABS]]
98+
;
99+
%ext.a = zext i16 %a to i32
100+
%ext.b = zext i8 %b to i32
101+
%sub = sub nsw i32 %ext.a, %ext.b
102+
%abs = call i32 @llvm.abs(i32 %sub, i1 true)
103+
ret i32 %abs
104+
}

llvm/test/Transforms/InstCombine/icmp.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4065,10 +4065,10 @@ define <2 x i1> @f4_vec(<2 x i64> %a, <2 x i64> %b) {
40654065
define i32 @f5(i8 %a, i8 %b) {
40664066
; CHECK-LABEL: define i32 @f5(
40674067
; CHECK-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) {
4068-
; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[A]] to i32
4069-
; CHECK-NEXT: [[CONV3:%.*]] = zext i8 [[B]] to i32
4070-
; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[CONV]], [[CONV3]]
4071-
; CHECK-NEXT: [[SUB7_SUB:%.*]] = call i32 @llvm.abs.i32(i32 [[SUB]], i1 true)
4068+
; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.umax.i8(i8 [[A]], i8 [[B]])
4069+
; CHECK-NEXT: [[TMP2:%.*]] = call i8 @llvm.umin.i8(i8 [[A]], i8 [[B]])
4070+
; CHECK-NEXT: [[TMP3:%.*]] = sub i8 [[TMP1]], [[TMP2]]
4071+
; CHECK-NEXT: [[SUB7_SUB:%.*]] = zext i8 [[TMP3]] to i32
40724072
; CHECK-NEXT: ret i32 [[SUB7_SUB]]
40734073
;
40744074
%conv = zext i8 %a to i32

0 commit comments

Comments
 (0)