[InstCombine] Fold min(X+1, Y) - min(X, Y) --> zext X < Y
#157782
Conversation
This fold is invalid for @llvm.smin.i1, since smin(-1, 0) == -1 (take X = Y = 0). Otherwise, if X+1 carries the appropriate nsw or nuw flag, this transform replaces a sub and at least one min with an icmp and a zext. It is also invalid for i1 in general, but it seems that other folds take care of i1. In llvm#157524, this expression was folded to a select, but it seems that select X < Y, 1, 0 can be canonicalized to zext X < Y.
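For illustration, here is a minimal IR sketch of the rewrite, with hypothetical value names rather than anything taken from the patch: when X < Y the two mins select X+1 and X, so the sub yields 1; when X >= Y both mins select Y, so the sub yields 0, which is exactly zext(X < Y).

define i32 @smin_fold_sketch(i32 %x, i32 %y) {
  ; X < Y:  min(X+1, Y) = X+1 and min(X, Y) = X, so the sub is 1
  ; X >= Y: both mins are Y, so the sub is 0
  %m0 = call i32 @llvm.smin.i32(i32 %x, i32 %y)
  %x1 = add nsw i32 %x, 1
  %m1 = call i32 @llvm.smin.i32(i32 %x1, i32 %y)
  %d = sub i32 %m1, %m0
  ret i32 %d
}

; expected to become, after the fold:
;   %c = icmp slt i32 %x, %y
;   %d = zext i1 %c to i32

The nuw/umin form is analogous, using icmp ult instead of icmp slt.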
@llvm/pr-subscribers-llvm-transforms

Author: None (benwu25)

Changes

This PR addresses #157524.

alive2: https://alive2.llvm.org/ce/z/xe_vb2

This fold is invalid for @llvm.smin.i1, since smin(-1, 0) == -1. I also avoided i1 in general since this uses zext, but it seems like those checks for width might not be necessary, since other folds get to it first.

The alive2 proof in #157524 used a select for the fold, but it seems like select X < Y, 1, 0 should be canonicalized to zext X < Y if the bit width is correct.

Could someone help review and commit? Thanks!

Full diff: https://github.com/llvm/llvm-project/pull/157782.diff

2 Files Affected:

- llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp (+18)
- llvm/test/Transforms/InstCombine/min-zext.ll (+116)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index d934638c15e75..63c6fc52322d6 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -2719,6 +2719,24 @@ Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) {
return BinaryOperator::CreateSub(X, Not);
}
+ // min(X+1, Y) - min(X, Y) --> zext X < Y
+ // Replacing a sub and at least one min with an icmp
+ // and a zext is a potential improvement.
+ if (match(Op0, m_c_SMin(m_c_NSWAdd(m_Value(X), m_One()), m_Value(Y))) &&
+ match(Op1, m_c_SMin(m_Value(X), m_Value(Y))) &&
+ I.getType()->getScalarSizeInBits() != 1 &&
+ (Op0->hasOneUse() || Op1->hasOneUse())) {
+ Value *Cond = Builder.CreateICmpSLT(X, Y);
+ return new ZExtInst(Cond, I.getType());
+ }
+ if (match(Op0, m_c_UMin(m_c_NUWAdd(m_Value(X), m_One()), m_Value(Y))) &&
+ match(Op1, m_c_UMin(m_Value(X), m_Value(Y))) &&
+ I.getType()->getScalarSizeInBits() != 1 &&
+ (Op0->hasOneUse() || Op1->hasOneUse())) {
+ Value *Cond = Builder.CreateICmpULT(X, Y);
+ return new ZExtInst(Cond, I.getType());
+ }
+
// Optimize pointer differences into the same array into a size. Consider:
// &A[10] - &A[0]: we should compile this to "10".
Value *LHSOp, *RHSOp;
diff --git a/llvm/test/Transforms/InstCombine/min-zext.ll b/llvm/test/Transforms/InstCombine/min-zext.ll
new file mode 100644
index 0000000000000..43af1f48bcfed
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/min-zext.ll
@@ -0,0 +1,116 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+define i32 @test_smin(i32 %arg0, i32 %arg1) {
+; CHECK-LABEL: define i32 @test_smin(
+; CHECK-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = icmp slt i32 [[ARG0]], [[ARG1]]
+; CHECK-NEXT: [[V3:%.*]] = zext i1 [[TMP1]] to i32
+; CHECK-NEXT: ret i32 [[V3]]
+;
+ %v0 = tail call i32 @llvm.smin.i32(i32 %arg0, i32 %arg1)
+ %v1 = add nsw i32 %arg0, 1
+ %v2 = tail call i32 @llvm.smin.i32(i32 %v1, i32 %arg1)
+ %v3 = sub i32 %v2, %v0
+ ret i32 %v3
+}
+
+define i32 @test_umin(i32 %arg0, i32 %arg1) {
+; CHECK-LABEL: define i32 @test_umin(
+; CHECK-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i32 [[ARG0]], [[ARG1]]
+; CHECK-NEXT: [[V3:%.*]] = zext i1 [[TMP1]] to i32
+; CHECK-NEXT: ret i32 [[V3]]
+;
+ %v0 = tail call i32 @llvm.umin.i32(i32 %arg0, i32 %arg1)
+ %v1 = add nuw i32 %arg0, 1
+ %v2 = tail call i32 @llvm.umin.i32(i32 %v1, i32 %arg1)
+ %v3 = sub i32 %v2, %v0
+ ret i32 %v3
+}
+
+define i1 @test_smin_i1(i1 %arg0, i1 %arg1) {
+; CHECK-LABEL: define i1 @test_smin_i1(
+; CHECK-SAME: i1 [[ARG0:%.*]], i1 [[ARG1:%.*]]) {
+; CHECK-NEXT: [[V0:%.*]] = or i1 [[ARG0]], [[ARG1]]
+; CHECK-NEXT: [[V3:%.*]] = xor i1 [[V0]], true
+; CHECK-NEXT: ret i1 [[V3]]
+;
+ %v0 = tail call i1 @llvm.smin.i1(i1 %arg0, i1 %arg1)
+ %v1 = add nsw i1 %arg0, 1
+ %v2 = tail call i1 @llvm.smin.i1(i1 %v1, i1 %arg1)
+ %v3 = sub i1 %v2, %v0
+ ret i1 %v3
+}
+
+declare void @use(i2)
+
+define i2 @test_smin_use_operands(i2 %arg0, i2 %arg1) {
+; CHECK-LABEL: define i2 @test_smin_use_operands(
+; CHECK-SAME: i2 [[ARG0:%.*]], i2 [[ARG1:%.*]]) {
+; CHECK-NEXT: [[V0:%.*]] = tail call i2 @llvm.smin.i2(i2 [[ARG0]], i2 [[ARG1]])
+; CHECK-NEXT: [[V1:%.*]] = add nsw i2 [[ARG0]], 1
+; CHECK-NEXT: [[V2:%.*]] = tail call i2 @llvm.smin.i2(i2 [[V1]], i2 [[ARG1]])
+; CHECK-NEXT: [[V3:%.*]] = sub i2 [[V2]], [[V0]]
+; CHECK-NEXT: call void @use(i2 [[V2]])
+; CHECK-NEXT: call void @use(i2 [[V0]])
+; CHECK-NEXT: ret i2 [[V3]]
+;
+ %v0 = tail call i2 @llvm.smin.i2(i2 %arg0, i2 %arg1)
+ %v1 = add nsw i2 %arg0, 1
+ %v2 = tail call i2 @llvm.smin.i2(i2 %v1, i2 %arg1)
+ %v3 = sub i2 %v2, %v0
+ call void @use(i2 %v2)
+ call void @use(i2 %v0)
+ ret i2 %v3
+}
+
+define i2 @test_smin_use_operand(i2 %arg0, i2 %arg1) {
+; CHECK-LABEL: define i2 @test_smin_use_operand(
+; CHECK-SAME: i2 [[ARG0:%.*]], i2 [[ARG1:%.*]]) {
+; CHECK-NEXT: [[V1:%.*]] = add nsw i2 [[ARG0]], 1
+; CHECK-NEXT: [[V2:%.*]] = tail call i2 @llvm.smin.i2(i2 [[V1]], i2 [[ARG1]])
+; CHECK-NEXT: [[TMP1:%.*]] = icmp slt i2 [[ARG0]], [[ARG1]]
+; CHECK-NEXT: [[V3:%.*]] = zext i1 [[TMP1]] to i2
+; CHECK-NEXT: call void @use(i2 [[V2]])
+; CHECK-NEXT: ret i2 [[V3]]
+;
+ %v0 = tail call i2 @llvm.smin.i2(i2 %arg0, i2 %arg1)
+ %v1 = add nsw i2 %arg0, 1
+ %v2 = tail call i2 @llvm.smin.i2(i2 %v1, i2 %arg1)
+ %v3 = sub i2 %v2, %v0
+ call void @use(i2 %v2)
+ ret i2 %v3
+}
+
+define i32 @test_smin_missing_nsw(i32 %arg0, i32 %arg1) {
+; CHECK-LABEL: define i32 @test_smin_missing_nsw(
+; CHECK-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) {
+; CHECK-NEXT: [[V0:%.*]] = tail call i32 @llvm.smin.i32(i32 [[ARG0]], i32 [[ARG1]])
+; CHECK-NEXT: [[V1:%.*]] = add i32 [[ARG0]], 1
+; CHECK-NEXT: [[V2:%.*]] = tail call i32 @llvm.smin.i32(i32 [[V1]], i32 [[ARG1]])
+; CHECK-NEXT: [[V3:%.*]] = sub i32 [[V2]], [[V0]]
+; CHECK-NEXT: ret i32 [[V3]]
+;
+ %v0 = tail call i32 @llvm.smin.i32(i32 %arg0, i32 %arg1)
+ %v1 = add i32 %arg0, 1
+ %v2 = tail call i32 @llvm.smin.i32(i32 %v1, i32 %arg1)
+ %v3 = sub i32 %v2, %v0
+ ret i32 %v3
+}
+
+define i32 @test_umin_missing_nuw(i32 %arg0, i32 %arg1) {
+; CHECK-LABEL: define i32 @test_umin_missing_nuw(
+; CHECK-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) {
+; CHECK-NEXT: [[V0:%.*]] = tail call i32 @llvm.umin.i32(i32 [[ARG0]], i32 [[ARG1]])
+; CHECK-NEXT: [[V1:%.*]] = add i32 [[ARG0]], 1
+; CHECK-NEXT: [[V2:%.*]] = tail call i32 @llvm.umin.i32(i32 [[V1]], i32 [[ARG1]])
+; CHECK-NEXT: [[V3:%.*]] = sub i32 [[V2]], [[V0]]
+; CHECK-NEXT: ret i32 [[V3]]
+;
+ %v0 = tail call i32 @llvm.umin.i32(i32 %arg0, i32 %arg1)
+ %v1 = add i32 %arg0, 1
+ %v2 = tail call i32 @llvm.umin.i32(i32 %v1, i32 %arg1)
+ %v3 = sub i32 %v2, %v0
+ ret i32 %v3
+}
min(X+1, Y) - min(X, Y) --> zext X < Y (#157524)
In case I misinterpreted, this can be changed to mismatch the arguments of one of the min calls instead.
@zyw-bot mfuzz
dtcxzyw
left a comment
LGTM. Thanks!
nikic
left a comment
It's also possible to fold this if the sub operands are swapped, just using sext instead of zext: https://alive2.llvm.org/ce/z/UtxZm_
No idea whether that variant occurs in the wild though.
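For reference, a minimal IR sketch of that swapped variant (hypothetical value names; the alive2 link above is the actual proof): with the sub operands reversed, the result is -1 when X < Y and 0 otherwise, i.e. sext(X < Y).

define i32 @smin_fold_swapped_sketch(i32 %x, i32 %y) {
  ; X < Y:  min(X, Y) - min(X+1, Y) = X - (X+1) = -1
  ; X >= Y: Y - Y = 0
  %m0 = call i32 @llvm.smin.i32(i32 %x, i32 %y)
  %x1 = add nsw i32 %x, 1
  %m1 = call i32 @llvm.smin.i32(i32 %x1, i32 %y)
  %d = sub i32 %m0, %m1
  ret i32 %d
}

; would fold to:
;   %c = icmp slt i32 %x, %y
;   %d = sext i1 %c to i32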
This PR addresses #157524.
alive2: https://alive2.llvm.org/ce/z/xe_vb2
godbolt: https://alive2.llvm.org/ce/z/7A8PxK
This fold is invalid for @llvm.smin.i1, since smin(-1, 0) == -1. I also avoided i1 in general since this uses zext, but it seems like those checks for width might not be necessary, since other folds get to it first.

The alive2 proof in #157524 used a select for the fold, but it seems like select X < Y, 1, 0 should be canonicalized to zext X < Y if the bit width is correct.

Could someone help review and merge?
Thanks!