[LLVM][AArch64] Optimize sign bit tests with TST instruction for SIGN_EXTEND patterns #158061
Conversation
@llvm/pr-subscribers-backend-aarch64

Author: guan jian (rez5427)

Changes

Hi, I recently found out that in some cases LLVM doesn't generate optimal code for sign-bit tests. For example, it emits:

```
sxtb w8, w0
cmp w8, #0
csel w0, w1, w2, lt
```

when it could emit:

```
tst w0, #0x80
csel w0, w1, w2, mi
```

This optimization is only applied when the following conditions are met (a minimal reproducer follows the list):

1. The comparison is setlt (signed less than)
2. The right-hand side is zero
3. The left-hand side is a sign extension operation (SIGN_EXTEND or SIGN_EXTEND_INREG)
4. The sign-extended value has only one use (hasOneUse())
5. The original type is an integer type
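For illustration, here is a minimal sketch of the kind of source that hits this pattern; the function name is hypothetical, but the compare-and-select shape corresponds to the IR in the new icmp.ll tests below:

```cpp
// Hypothetical reproducer: a signed less-than-zero test on a narrow type
// feeding a select. Before this patch the compare lowered to sxtb + cmp;
// with it, the sign bit is tested directly with tst.
int select_on_sign(signed char v, int a, int b) {
  // Lowers to roughly: icmp slt i8 %v, 0 ; select i1 %cmp, i32 %a, i32 %b
  return v < 0 ? a : b;
}
```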
Full diff: https://github.com/llvm/llvm-project/pull/158061.diff (4 files affected)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index e788bee6be322..2510f97d7d846 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -11630,6 +11630,48 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(
return DAG.getNode(ISD::AND, DL, VT, LHS, Shift);
}
+ // Check for sign bit test patterns that can use TST optimization.
+ // (SELECT_CC setlt, sign_extend_inreg, 0, tval, fval)
+ // -> TST %operand, sign_bit; CSEL
+ // (SELECT_CC setlt, sign_extend, 0, tval, fval)
+ // -> TST %operand, sign_bit; CSEL
+ if (CC == ISD::SETLT && RHSC && RHSC->isZero() && LHS.hasOneUse() &&
+ (LHS.getOpcode() == ISD::SIGN_EXTEND_INREG ||
+ LHS.getOpcode() == ISD::SIGN_EXTEND)) {
+
+ SDValue OriginalVal = LHS.getOperand(0);
+ EVT OriginalVT = LHS.getOpcode() == ISD::SIGN_EXTEND_INREG
+ ? cast<VTSDNode>(LHS.getOperand(1))->getVT()
+ : OriginalVal.getValueType();
+
+ // Apply TST optimization for integer types
+ if (OriginalVT.isInteger()) {
+ // Calculate the sign bit for the original type
+ unsigned BitWidth = OriginalVT.getSizeInBits();
+ APInt SignBit = APInt::getSignedMinValue(BitWidth);
+ EVT TestVT = (BitWidth <= 32) ? MVT::i32 : MVT::i64;
+ unsigned TestBitWidth = TestVT.getSizeInBits();
+ if (BitWidth < TestBitWidth) {
+ SignBit = SignBit.zext(TestBitWidth);
+ }
+
+ SDValue SignBitConst = DAG.getConstant(SignBit, DL, TestVT);
+ SDValue TestOperand = OriginalVal;
+ if (OriginalVal.getValueType() != TestVT) {
+ TestOperand = DAG.getNode(ISD::ZERO_EXTEND, DL, TestVT, OriginalVal);
+ }
+
+ SDValue TST =
+ DAG.getNode(AArch64ISD::ANDS, DL, DAG.getVTList(TestVT, MVT::i32),
+ TestOperand, SignBitConst);
+
+ SDValue Flags = TST.getValue(1);
+ return DAG.getNode(AArch64ISD::CSEL, DL, TVal.getValueType(), TVal,
+ FVal, DAG.getConstant(AArch64CC::MI, DL, MVT::i32),
+ Flags);
+ }
+ }
+
// Canonicalise absolute difference patterns:
// select_cc lhs, rhs, sub(lhs, rhs), sub(rhs, lhs), cc ->
// select_cc lhs, rhs, sub(lhs, rhs), neg(sub(lhs, rhs)), cc
diff --git a/llvm/test/CodeGen/AArch64/check-sign-bit-before-extension.ll b/llvm/test/CodeGen/AArch64/check-sign-bit-before-extension.ll
index 0960c4c2a3342..b81a141b63c3a 100644
--- a/llvm/test/CodeGen/AArch64/check-sign-bit-before-extension.ll
+++ b/llvm/test/CodeGen/AArch64/check-sign-bit-before-extension.ll
@@ -78,8 +78,7 @@ B:
define i32 @g_i8_sign_extend_inreg(i8 %in, i32 %a, i32 %b) nounwind {
; CHECK-LABEL: g_i8_sign_extend_inreg:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sxtb w8, w0
-; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: tst w0, #0x80
; CHECK-NEXT: csel w8, w1, w2, mi
; CHECK-NEXT: add w0, w8, w0, uxtb
; CHECK-NEXT: ret
@@ -100,8 +99,7 @@ B:
define i32 @g_i16_sign_extend_inreg(i16 %in, i32 %a, i32 %b) nounwind {
; CHECK-LABEL: g_i16_sign_extend_inreg:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sxth w8, w0
-; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: tst w0, #0x8000
; CHECK-NEXT: csel w8, w1, w2, mi
; CHECK-NEXT: add w0, w8, w0, uxth
; CHECK-NEXT: ret
@@ -167,9 +165,7 @@ B:
define i64 @g_i32_sign_extend_i64(i32 %in, i64 %a, i64 %b) nounwind {
; CHECK-LABEL: g_i32_sign_extend_i64:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT: sxtw x8, w0
-; CHECK-NEXT: cmp x8, #0
+; CHECK-NEXT: tst w0, #0x80000000
; CHECK-NEXT: csel x8, x1, x2, mi
; CHECK-NEXT: add x0, x8, w0, uxtw
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/icmp.ll b/llvm/test/CodeGen/AArch64/icmp.ll
index 18665bcbeae83..6e9d13135410c 100644
--- a/llvm/test/CodeGen/AArch64/icmp.ll
+++ b/llvm/test/CodeGen/AArch64/icmp.ll
@@ -2093,3 +2093,54 @@ define <2 x i1> @icmp_slt_v2i64_Zero_LHS(<2 x i64> %a) {
%c = icmp slt <2 x i64> <i64 0, i64 0>, %a
ret <2 x i1> %c
}
+
+; Test TST optimization for i8 sign bit testing with cross-type select
+; This tests the pattern: icmp slt i8 %val, 0; select i1 %cmp, i32 %a, i32 %b
+; The optimization should convert sxtb+cmp to tst for sign bit testing.
+
+define i32 @i8_signbit_tst_constants(i8 %x, i8 %y) {
+; CHECK-SD-LABEL: i8_signbit_tst_constants:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: add w9, w0, w1
+; CHECK-SD-NEXT: mov w8, #42 // =0x2a
+; CHECK-SD-NEXT: tst w9, #0x80
+; CHECK-SD-NEXT: mov w9, #20894 // =0x519e
+; CHECK-SD-NEXT: csel w0, w9, w8, mi
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: i8_signbit_tst_constants:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: add w8, w0, w1
+; CHECK-GI-NEXT: mov w9, #42 // =0x2a
+; CHECK-GI-NEXT: mov w10, #20894 // =0x519e
+; CHECK-GI-NEXT: sxtb w8, w8
+; CHECK-GI-NEXT: cmp w8, #0
+; CHECK-GI-NEXT: csel w0, w10, w9, mi
+; CHECK-GI-NEXT: ret
+ %add = add i8 %x, %y
+ %cmp = icmp slt i8 %add, 0
+ %sel = select i1 %cmp, i32 20894, i32 42
+ ret i32 %sel
+}
+
+; Test i8 sign bit testing with variable select values (problematic case)
+define i32 @i8_signbit_variables(i8 %x, i8 %y, i32 %a, i32 %b) {
+; CHECK-SD-LABEL: i8_signbit_variables:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: add w8, w0, w1
+; CHECK-SD-NEXT: tst w8, #0x80
+; CHECK-SD-NEXT: csel w0, w2, w3, mi
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: i8_signbit_variables:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: add w8, w0, w1
+; CHECK-GI-NEXT: sxtb w8, w8
+; CHECK-GI-NEXT: cmp w8, #0
+; CHECK-GI-NEXT: csel w0, w2, w3, mi
+; CHECK-GI-NEXT: ret
+ %add = add i8 %x, %y
+ %cmp = icmp slt i8 %add, 0
+ %sel = select i1 %cmp, i32 %a, i32 %b
+ ret i32 %sel
+}
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-bool.ll b/llvm/test/CodeGen/AArch64/vecreduce-bool.ll
index 62d41fca10db3..198428e26825f 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-bool.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-bool.ll
@@ -26,8 +26,8 @@ define i32 @reduce_and_v1i8(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind {
; CHECK-LABEL: reduce_and_v1i8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: smov w8, v0.b[0]
-; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: umov w8, v0.b[0]
+; CHECK-NEXT: tst w8, #0x80
; CHECK-NEXT: csel w0, w0, w1, mi
; CHECK-NEXT: ret
%x = icmp slt <1 x i8> %a0, zeroinitializer
@@ -120,8 +120,8 @@ define i32 @reduce_and_v1i16(<1 x i16> %a0, i32 %a1, i32 %a2) nounwind {
; CHECK-LABEL: reduce_and_v1i16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: smov w8, v0.h[0]
-; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: umov w8, v0.h[0]
+; CHECK-NEXT: tst w8, #0x8000
; CHECK-NEXT: csel w0, w0, w1, mi
; CHECK-NEXT: ret
%x = icmp slt <1 x i16> %a0, zeroinitializer
@@ -305,8 +305,8 @@ define i32 @reduce_or_v1i8(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind {
; CHECK-LABEL: reduce_or_v1i8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: smov w8, v0.b[0]
-; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: umov w8, v0.b[0]
+; CHECK-NEXT: tst w8, #0x80
; CHECK-NEXT: csel w0, w0, w1, mi
; CHECK-NEXT: ret
%x = icmp slt <1 x i8> %a0, zeroinitializer
@@ -399,8 +399,8 @@ define i32 @reduce_or_v1i16(<1 x i16> %a0, i32 %a1, i32 %a2) nounwind {
; CHECK-LABEL: reduce_or_v1i16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: smov w8, v0.h[0]
-; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: umov w8, v0.h[0]
+; CHECK-NEXT: tst w8, #0x8000
; CHECK-NEXT: csel w0, w0, w1, mi
; CHECK-NEXT: ret
%x = icmp slt <1 x i16> %a0, zeroinitializer
@@ -584,8 +584,8 @@ define i32 @reduce_xor_v1i8(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind {
; CHECK-LABEL: reduce_xor_v1i8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: smov w8, v0.b[0]
-; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: umov w8, v0.b[0]
+; CHECK-NEXT: tst w8, #0x80
; CHECK-NEXT: csel w0, w0, w1, mi
; CHECK-NEXT: ret
%x = icmp slt <1 x i8> %a0, zeroinitializer
@@ -679,8 +679,8 @@ define i32 @reduce_xor_v1i16(<1 x i16> %a0, i32 %a1, i32 %a2) nounwind {
; CHECK-LABEL: reduce_xor_v1i16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: smov w8, v0.h[0]
-; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: umov w8, v0.h[0]
+; CHECK-NEXT: tst w8, #0x8000
; CHECK-NEXT: csel w0, w0, w1, mi
; CHECK-NEXT: ret
%x = icmp slt <1 x i16> %a0, zeroinitializer
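To make the mask computation in LowerSELECT_CC concrete, here is a small standalone sketch (my own illustration, not part of the patch) of how the sign-bit immediate is derived for an i8 source, using the same llvm::APInt calls as the patch:

```cpp
#include "llvm/ADT/APInt.h"
#include <cassert>

// Illustrative only: mirrors the patch's mask computation for an i8 source.
// getSignedMinValue(8) yields 0x80; zero-extending it to the 32-bit test
// width keeps the value 0x80, which is what `tst w0, #0x80` encodes.
int main() {
  unsigned BitWidth = 8;                         // OriginalVT = i8
  llvm::APInt SignBit = llvm::APInt::getSignedMinValue(BitWidth);
  unsigned TestBitWidth = 32;                    // TestVT = i32 (BitWidth <= 32)
  if (BitWidth < TestBitWidth)
    SignBit = SignBit.zext(TestBitWidth);        // still 0x80 after zext
  assert(SignBit == 0x80 && "immediate used by the emitted TST");
  return 0;
}
```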
cc @llvm/aarch64
Thanks for the patch. Looks like a nice optimization. It might be worth giving it a test to make sure it is producing the expected results; a bootstrap or the llvm-test-suite can be a decent way to test it.
Force-pushed 3b398cb to d52b40c.
I tried both bootstrap and llvm-test-suite, and they show nothing wrong.
Force-pushed e6056d4 to 3762d7b.
I think this looks OK. LGTM.
ping
I'll merge this now! (We don't always know who has access, so please do ask if needed.) Thanks
[LLVM][AArch64] Optimize sign bit tests with TST instruction for SIGN_EXTEND patterns (llvm#158061)

Hi, I recently found out in some cases LLVM doesn't generate optimal code like:

```
sxtb w8, w0
cmp w8, #0
csel w0, w1, w2, lt
```

```
tst w0, #0x80
csel w0, w1, w2, mi
```

This optimization is only applied when the following conditions are met:

1. The comparison is setlt (signed less than)
2. The right-hand side is zero
3. The left-hand side is a sign extension operation (SIGN_EXTEND or SIGN_EXTEND_INREG)
4. The sign-extended value has only one use (hasOneUse())
5. The original type is an integer type