[X86] Distribute Certain Bitwise Operations over SELECT #136555
Conversation
Stacked on top of #136554.
@llvm/pr-subscribers-backend-x86

Author: Marius Kamp (mskamp)

Changes

InstCombine canonicalizes `(select P (and X (- X)) X)` to `(and (select P (- X) umax) X)`. This is counterproductive for the X86 backend when BMI is available because we can encode `(and X (- X))` using the `BLSI` instruction. A similar situation arises if we have `(select P (and X (sub X 1)) X)` (prevents use of the `BLSR` instruction) or `(select P (xor X (sub X 1)) X)` (prevents use of the `BLSMSK` instruction).

Trigger the inverse transformation in the X86 backend if BMI is available and we can use the mentioned BMI instructions. This is done by adjusting the `shouldFoldSelectWithIdentityConstant()` implementation for the X86 backend. In this way, we get `(select P (and X (- X)) X)` again, which enables the use of `BLSI` (similar for the other cases described above).

Alive proofs: https://alive2.llvm.org/ce/z/MT_pKi

Fixes #131587, fixes #133848.

Patch is 24.48 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/136555.diff

2 Files Affected:

- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp
- (added) llvm/test/CodeGen/X86/bmi-select-distrib.ll
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6b71f49165c60..e7dcf4a91e8fc 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -28,6 +28,7 @@
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -35552,8 +35553,24 @@ bool X86TargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
bool X86TargetLowering::shouldFoldSelectWithIdentityConstant(
unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
SDValue Y) const {
- if (SelectOpcode != ISD::VSELECT)
+ if (SelectOpcode == ISD::SELECT) {
+ if (VT.isVector())
+ return false;
+ if (!Subtarget.hasBMI() || (VT != MVT::i32 && VT != MVT::i64))
+ return false;
+ using namespace llvm::SDPatternMatch;
+ // BLSI
+ if (BinOpcode == ISD::AND && sd_match(Y, m_Neg(m_Specific(X))))
+ return true;
+ // BLSR
+ if (BinOpcode == ISD::AND && sd_match(Y, m_Add(m_Specific(X), m_AllOnes())))
+ return true;
+ // BLSMSK
+ if (BinOpcode == ISD::XOR && sd_match(Y, m_Add(m_Specific(X), m_AllOnes())))
+ return true;
+
return false;
+ }
// TODO: This is too general. There are cases where pre-AVX512 codegen would
// benefit. The transform may also be profitable for scalar code.
if (!Subtarget.hasAVX512())
diff --git a/llvm/test/CodeGen/X86/bmi-select-distrib.ll b/llvm/test/CodeGen/X86/bmi-select-distrib.ll
new file mode 100644
index 0000000000000..466f877f57600
--- /dev/null
+++ b/llvm/test/CodeGen/X86/bmi-select-distrib.ll
@@ -0,0 +1,778 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+cmov,+sse2,+bmi,+bmi2 | FileCheck %s --check-prefixes=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi,+bmi2 | FileCheck %s --check-prefixes=X64
+
+define i32 @and_select_neg_to_blsi1(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: and_select_neg_to_blsi1:
+; X86: # %bb.0:
+; X86-NEXT: blsil %eax, %ecx
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: cmovnel %ecx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_neg_to_blsi1:
+; X64: # %bb.0:
+; X64-NEXT: blsil %esi, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovel %esi, %eax
+; X64-NEXT: retq
+ %sub = sub i32 0, %a1
+ %bls = select i1 %a0, i32 %sub, i32 -1
+ %ret = and i32 %a1, %bls
+ ret i32 %ret
+}
+
+define i32 @and_select_neg_to_blsi2(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: and_select_neg_to_blsi2:
+; X86: # %bb.0:
+; X86-NEXT: blsil %eax, %ecx
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: cmovnel %ecx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_neg_to_blsi2:
+; X64: # %bb.0:
+; X64-NEXT: blsil %esi, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovel %esi, %eax
+; X64-NEXT: retq
+ %sub = sub i32 0, %a1
+ %bls = select i1 %a0, i32 %sub, i32 -1
+ %ret = and i32 %bls, %a1
+ ret i32 %ret
+}
+
+define i32 @and_select_neg_to_blsi3(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: and_select_neg_to_blsi3:
+; X86: # %bb.0:
+; X86-NEXT: blsil %eax, %ecx
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: cmovel %ecx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_neg_to_blsi3:
+; X64: # %bb.0:
+; X64-NEXT: blsil %esi, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel %esi, %eax
+; X64-NEXT: retq
+ %sub = sub i32 0, %a1
+ %bls = select i1 %a0, i32 -1, i32 %sub
+ %ret = and i32 %a1, %bls
+ ret i32 %ret
+}
+
+define i64 @and_select_neg_to_blsi_i64(i1 %a0, i64 %a1) nounwind {
+; X86-LABEL: and_select_neg_to_blsi_i64:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: negl %eax
+; X86-NEXT: sbbl %esi, %edx
+; X86-NEXT: andl %esi, %edx
+; X86-NEXT: andl %ecx, %eax
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: cmovel %esi, %edx
+; X86-NEXT: cmovel %ecx, %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_neg_to_blsi_i64:
+; X64: # %bb.0:
+; X64-NEXT: blsiq %rsi, %rax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmoveq %rsi, %rax
+; X64-NEXT: retq
+ %sub = sub i64 0, %a1
+ %bls = select i1 %a0, i64 %sub, i64 -1
+ %ret = and i64 %a1, %bls
+ ret i64 %ret
+}
+
+; Negative test
+define i16 @and_select_neg_i16(i1 %a0, i16 %a1) nounwind {
+; X86-LABEL: and_select_neg_i16:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: andb $1, %cl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: negl %esi
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: cmpb $1, %cl
+; X86-NEXT: sbbl %eax, %eax
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: andl %edx, %eax
+; X86-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_neg_i16:
+; X64: # %bb.0:
+; X64-NEXT: andb $1, %dil
+; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: negl %ecx
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: cmpb $1, %dil
+; X64-NEXT: sbbl %eax, %eax
+; X64-NEXT: orl %ecx, %eax
+; X64-NEXT: andl %esi, %eax
+; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: retq
+ %sub = sub i16 0, %a1
+ %bls = select i1 %a0, i16 %sub, i16 -1
+ %ret = and i16 %a1, %bls
+ ret i16 %ret
+}
+
+; Negative test
+define <4 x i32> @and_select_neg_v4xi32(i1 %a0, <4 x i32> %a1) nounwind {
+; X86-LABEL: and_select_neg_v4xi32:
+; X86: # %bb.0:
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: jne .LBB5_1
+; X86-NEXT: # %bb.2:
+; X86-NEXT: pcmpeqd %xmm1, %xmm1
+; X86-NEXT: pand %xmm1, %xmm0
+; X86-NEXT: retl
+; X86-NEXT: .LBB5_1:
+; X86-NEXT: pxor %xmm1, %xmm1
+; X86-NEXT: psubd %xmm0, %xmm1
+; X86-NEXT: pand %xmm1, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_neg_v4xi32:
+; X64: # %bb.0:
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: jne .LBB5_1
+; X64-NEXT: # %bb.2:
+; X64-NEXT: pcmpeqd %xmm1, %xmm1
+; X64-NEXT: pand %xmm1, %xmm0
+; X64-NEXT: retq
+; X64-NEXT: .LBB5_1:
+; X64-NEXT: pxor %xmm1, %xmm1
+; X64-NEXT: psubd %xmm0, %xmm1
+; X64-NEXT: pand %xmm1, %xmm0
+; X64-NEXT: retq
+ %sub = sub <4 x i32> zeroinitializer, %a1
+ %bls = select i1 %a0, <4 x i32> %sub, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
+ %ret = and <4 x i32> %a1, %bls
+ ret <4 x i32> %ret
+}
+
+; Negative test
+define i32 @and_select_no_neg(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: and_select_no_neg:
+; X86: # %bb.0:
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: andb $1, %cl
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: cmpb $1, %cl
+; X86-NEXT: sbbl %edx, %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: andl %edx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_no_neg:
+; X64: # %bb.0:
+; X64-NEXT: andb $1, %dil
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: cmpb $1, %dil
+; X64-NEXT: sbbl %eax, %eax
+; X64-NEXT: orl %esi, %eax
+; X64-NEXT: andl %esi, %eax
+; X64-NEXT: retq
+ %sub = sub i32 %a1, 0
+ %bls = select i1 %a0, i32 %sub, i32 -1
+ %ret = and i32 %a1, %bls
+ ret i32 %ret
+}
+
+; Negative test
+define i32 @and_select_neg_wrong_const(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: and_select_neg_wrong_const:
+; X86: # %bb.0:
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: negl %ecx
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $1, %edx
+; X86-NEXT: cmovnel %ecx, %edx
+; X86-NEXT: andl %edx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_neg_wrong_const:
+; X64: # %bb.0:
+; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: negl %ecx
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: cmovnel %ecx, %eax
+; X64-NEXT: andl %esi, %eax
+; X64-NEXT: retq
+ %sub = sub i32 0, %a1
+ %bls = select i1 %a0, i32 %sub, i32 1
+ %ret = and i32 %a1, %bls
+ ret i32 %ret
+}
+
+; Negative test
+define i32 @and_select_neg_different_op(i1 %a0, i32 inreg %a1, i32 inreg %a2) nounwind {
+; X86-LABEL: and_select_neg_different_op:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: andb $1, %cl
+; X86-NEXT: negl %edx
+; X86-NEXT: xorl %esi, %esi
+; X86-NEXT: cmpb $1, %cl
+; X86-NEXT: sbbl %esi, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: andl %esi, %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_neg_different_op:
+; X64: # %bb.0:
+; X64-NEXT: andb $1, %dil
+; X64-NEXT: negl %edx
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: cmpb $1, %dil
+; X64-NEXT: sbbl %eax, %eax
+; X64-NEXT: orl %edx, %eax
+; X64-NEXT: andl %esi, %eax
+; X64-NEXT: retq
+ %sub = sub i32 0, %a2
+ %bls = select i1 %a0, i32 %sub, i32 -1
+ %ret = and i32 %a1, %bls
+ ret i32 %ret
+}
+
+define i32 @and_select_sub_1_to_blsr1(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: and_select_sub_1_to_blsr1:
+; X86: # %bb.0:
+; X86-NEXT: blsrl %eax, %ecx
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: cmovnel %ecx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_sub_1_to_blsr1:
+; X64: # %bb.0:
+; X64-NEXT: blsrl %esi, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovel %esi, %eax
+; X64-NEXT: retq
+ %sub = add i32 %a1, -1
+ %bls = select i1 %a0, i32 %sub, i32 -1
+ %ret = and i32 %a1, %bls
+ ret i32 %ret
+}
+
+define i32 @and_select_sub_1_to_blsr2(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: and_select_sub_1_to_blsr2:
+; X86: # %bb.0:
+; X86-NEXT: blsrl %eax, %ecx
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: cmovnel %ecx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_sub_1_to_blsr2:
+; X64: # %bb.0:
+; X64-NEXT: blsrl %esi, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovel %esi, %eax
+; X64-NEXT: retq
+ %sub = add i32 %a1, -1
+ %bls = select i1 %a0, i32 %sub, i32 -1
+ %ret = and i32 %bls, %a1
+ ret i32 %ret
+}
+
+define i32 @and_select_sub_1_to_blsr3(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: and_select_sub_1_to_blsr3:
+; X86: # %bb.0:
+; X86-NEXT: blsrl %eax, %ecx
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: cmovel %ecx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_sub_1_to_blsr3:
+; X64: # %bb.0:
+; X64-NEXT: blsrl %esi, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel %esi, %eax
+; X64-NEXT: retq
+ %sub = add i32 %a1, -1
+ %bls = select i1 %a0, i32 -1, i32 %sub
+ %ret = and i32 %a1, %bls
+ ret i32 %ret
+}
+
+define i32 @and_select_sub_1_to_blsr4(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: and_select_sub_1_to_blsr4:
+; X86: # %bb.0:
+; X86-NEXT: blsrl %eax, %ecx
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: cmovnel %ecx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_sub_1_to_blsr4:
+; X64: # %bb.0:
+; X64-NEXT: blsrl %esi, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovel %esi, %eax
+; X64-NEXT: retq
+ %sub = sub i32 %a1, 1
+ %bls = select i1 %a0, i32 %sub, i32 -1
+ %ret = and i32 %a1, %bls
+ ret i32 %ret
+}
+
+define i64 @and_select_sub_1_to_blsr_i64(i1 %a0, i64 %a1) nounwind {
+; X86-LABEL: and_select_sub_1_to_blsr_i64:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: addl $-1, %eax
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: adcl $-1, %edx
+; X86-NEXT: andl %esi, %edx
+; X86-NEXT: andl %ecx, %eax
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: cmovel %ecx, %eax
+; X86-NEXT: cmovel %esi, %edx
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_sub_1_to_blsr_i64:
+; X64: # %bb.0:
+; X64-NEXT: blsrq %rsi, %rax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmoveq %rsi, %rax
+; X64-NEXT: retq
+ %sub = add i64 %a1, -1
+ %bls = select i1 %a0, i64 %sub, i64 -1
+ %ret = and i64 %a1, %bls
+ ret i64 %ret
+}
+
+; Negative test
+define i16 @and_select_sub_1_i16(i1 %a0, i16 %a1) nounwind {
+; X86-LABEL: and_select_sub_1_i16:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: andb $1, %cl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: leal -1(%edx), %esi
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: cmpb $1, %cl
+; X86-NEXT: sbbl %eax, %eax
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: andl %edx, %eax
+; X86-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_sub_1_i16:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $esi killed $esi def $rsi
+; X64-NEXT: andb $1, %dil
+; X64-NEXT: leal -1(%rsi), %ecx
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: cmpb $1, %dil
+; X64-NEXT: sbbl %eax, %eax
+; X64-NEXT: orl %ecx, %eax
+; X64-NEXT: andl %esi, %eax
+; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: retq
+ %sub = add i16 %a1, -1
+ %bls = select i1 %a0, i16 %sub, i16 -1
+ %ret = and i16 %a1, %bls
+ ret i16 %ret
+}
+
+; Negative test
+define <4 x i32> @and_select_sub_1_v4xi32(i1 %a0, <4 x i32> %a1) nounwind {
+; X86-LABEL: and_select_sub_1_v4xi32:
+; X86: # %bb.0:
+; X86-NEXT: pcmpeqd %xmm1, %xmm1
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: je .LBB15_2
+; X86-NEXT: # %bb.1:
+; X86-NEXT: paddd %xmm0, %xmm1
+; X86-NEXT: .LBB15_2:
+; X86-NEXT: pand %xmm1, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_sub_1_v4xi32:
+; X64: # %bb.0:
+; X64-NEXT: pcmpeqd %xmm1, %xmm1
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: je .LBB15_2
+; X64-NEXT: # %bb.1:
+; X64-NEXT: paddd %xmm0, %xmm1
+; X64-NEXT: .LBB15_2:
+; X64-NEXT: pand %xmm1, %xmm0
+; X64-NEXT: retq
+ %sub = add <4 x i32> %a1, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %bls = select i1 %a0, <4 x i32> %sub, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
+ %ret = and <4 x i32> %a1, %bls
+ ret <4 x i32> %ret
+}
+
+; Negative test
+define i32 @and_select_no_sub_1(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: and_select_no_sub_1:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: andb $1, %cl
+; X86-NEXT: leal -2(%eax), %edx
+; X86-NEXT: xorl %esi, %esi
+; X86-NEXT: cmpb $1, %cl
+; X86-NEXT: sbbl %esi, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: andl %esi, %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_no_sub_1:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $esi killed $esi def $rsi
+; X64-NEXT: andb $1, %dil
+; X64-NEXT: leal -2(%rsi), %ecx
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: cmpb $1, %dil
+; X64-NEXT: sbbl %eax, %eax
+; X64-NEXT: orl %ecx, %eax
+; X64-NEXT: andl %esi, %eax
+; X64-NEXT: retq
+ %sub = add i32 %a1, -2
+ %bls = select i1 %a0, i32 %sub, i32 -1
+ %ret = and i32 %a1, %bls
+ ret i32 %ret
+}
+
+; Negative test
+define i32 @and_select_sub_1_wrong_const(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: and_select_sub_1_wrong_const:
+; X86: # %bb.0:
+; X86-NEXT: leal -1(%eax), %ecx
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $1, %edx
+; X86-NEXT: cmovnel %ecx, %edx
+; X86-NEXT: andl %edx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_sub_1_wrong_const:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $esi killed $esi def $rsi
+; X64-NEXT: leal -1(%rsi), %ecx
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: cmovnel %ecx, %eax
+; X64-NEXT: andl %esi, %eax
+; X64-NEXT: retq
+ %sub = add i32 %a1, -1
+ %bls = select i1 %a0, i32 %sub, i32 1
+ %ret = and i32 %a1, %bls
+ ret i32 %ret
+}
+
+; Negative test
+define i32 @and_select_sub_1_different_op(i1 %a0, i32 inreg %a1, i32 inreg %a2) nounwind {
+; X86-LABEL: and_select_sub_1_different_op:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: andb $1, %cl
+; X86-NEXT: decl %edx
+; X86-NEXT: xorl %esi, %esi
+; X86-NEXT: cmpb $1, %cl
+; X86-NEXT: sbbl %esi, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: andl %esi, %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_sub_1_different_op:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $edx killed $edx def $rdx
+; X64-NEXT: andb $1, %dil
+; X64-NEXT: leal -1(%rdx), %ecx
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: cmpb $1, %dil
+; X64-NEXT: sbbl %eax, %eax
+; X64-NEXT: orl %ecx, %eax
+; X64-NEXT: andl %esi, %eax
+; X64-NEXT: retq
+ %sub = add i32 %a2, -1
+ %bls = select i1 %a0, i32 %sub, i32 -1
+ %ret = and i32 %a1, %bls
+ ret i32 %ret
+}
+
+define i32 @xor_select_sub_1_to_blsmsk1(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: xor_select_sub_1_to_blsmsk1:
+; X86: # %bb.0:
+; X86-NEXT: blsmskl %eax, %ecx
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: cmovnel %ecx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: xor_select_sub_1_to_blsmsk1:
+; X64: # %bb.0:
+; X64-NEXT: blsmskl %esi, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovel %esi, %eax
+; X64-NEXT: retq
+ %sub = add i32 %a1, -1
+ %bls = select i1 %a0, i32 %sub, i32 0
+ %ret = xor i32 %a1, %bls
+ ret i32 %ret
+}
+
+define i32 @xor_select_sub_1_to_blsmsk2(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: xor_select_sub_1_to_blsmsk2:
+; X86: # %bb.0:
+; X86-NEXT: blsmskl %eax, %ecx
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: cmovnel %ecx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: xor_select_sub_1_to_blsmsk2:
+; X64: # %bb.0:
+; X64-NEXT: blsmskl %esi, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovel %esi, %eax
+; X64-NEXT: retq
+ %sub = add i32 %a1, -1
+ %bls = select i1 %a0, i32 %sub, i32 0
+ %ret = xor i32 %bls, %a1
+ ret i32 %ret
+}
+
+define i32 @xor_select_sub_1_to_blsmsk3(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: xor_select_sub_1_to_blsmsk3:
+; X86: # %bb.0:
+; X86-NEXT: blsmskl %eax, %ecx
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: cmovel %ecx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: xor_select_sub_1_to_blsmsk3:
+; X64: # %bb.0:
+; X64-NEXT: blsmskl %esi, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel %esi, %eax
+; X64-NEXT: retq
+ %sub = add i32 %a1, -1
+ %bls = select i1 %a0, i32 0, i32 %sub
+ %ret = xor i32 %a1, %bls
+ ret i32 %ret
+}
+
+define i32 @xor_select_sub_1_to_blsmsk4(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: xor_select_sub_1_to_blsmsk4:
+; X86: # %bb.0:
+; X86-NEXT: blsmskl %eax, %ecx
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: cmovnel %ecx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: xor_select_sub_1_to_blsmsk4:
+; X64: # %bb.0:
+; X64-NEXT: blsmskl %esi, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovel %esi, %eax
+; X64-NEXT: retq
+ %sub = sub i32 %a1, 1
+ %bls = select i1 %a0, i32 %sub, i32 0
+ %ret = xor i32 %a1, %bls
+ ret i32 %ret
+}
+
+define i64 @xor_select_sub_1_to_blsmsk_i64(i1 %a0, i64 %a1) nounwind {
+; X86-LABEL: xor_select_sub_1_to_blsmsk_i64:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: addl $-1, %eax
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: adcl $-1, %edx
+; X86-NEXT: xorl %esi, %edx
+; X86-NEXT: xorl %ecx, %eax
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: cmovel %ecx, %eax
+; X86-NEXT: cmovel %esi, %edx
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: xor_select_sub_1_to_blsmsk_i64:
+; X64: # %bb.0:
+; X64-NEXT: blsmskq %rsi, %rax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmoveq %rsi, %rax
+; X64-NEXT: retq
+ %sub = add i64 %a1, -1
+ %bls = select i1 %a0, i64 %sub,...
[truncated]
Can't `sd_match(X, m_Neg(m_Specific(Y)))` occur as well?
Yes, and I've added this condition now. In this case, however, we still do not emit a BLSI instruction because the negation is used more than once after the transformation (in the other operand of the select). Codegen seems to improve slightly, though. I'm not sure if it's worth it in general. So I'm quite indifferent whether we keep this additional condition or remove it again.
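For reference, a minimal sketch of what such a commuted check could look like, reusing the `SDPatternMatch` helpers already used in the diff above (illustrative only; the committed code may differ):

```cpp
// Illustrative only: accept the negation on either select operand.
// BLSI: X & (0 - X)
if (BinOpcode == ISD::AND &&
    (sd_match(Y, m_Neg(m_Specific(X))) ||
     sd_match(X, m_Neg(m_Specific(Y)))))
  return true;
```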
RKSimon left a comment:
LGTM with one minor
InstCombine canonicalizes `(select P (and X (- X)) X)` to `(and (select P (- X) umax) X)`. This is counterproductive for the X86 backend when BMI is available because we can encode `(and X (- X))` using the `BLSI` instruction. A similar situation arises if we have `(select P (and X (sub X 1)) X)` (prevents use of `BLSR` instruction) or `(select P (xor X (sub X 1)) X)` (prevents use of `BLSMSK` instruction).

Trigger the inverse transformation in the X86 backend if BMI is available and we can use the mentioned BMI instructions. This is done by adjusting the `shouldFoldSelectWithIdentityConstant()` implementation for the X86 backend. In this way, we get `(select P (and X (- X)) X)` again, which enables the use of `BLSI` (similar for the other cases described above).

Alive proofs: https://alive2.llvm.org/ce/z/MT_pKi

Fixes llvm#131587, fixes llvm#133848.
@mskamp do you need me to commit this?
Yes, please. Thank you! |
InstCombine canonicalizes `(select P (and X (- X)) X)` to
`(and (select P (- X) umax) X)`. This is counterproductive for the X86
backend when BMI is available because we can encode `(and X (- X))`
using the `BLSI` instruction. A similar situation arises if we have
`(select P (and X (sub X 1)) X)` (prevents use of `BLSR` instruction) or
`(select P (xor X (sub X 1)) X)` (prevents use of `BLSMSK` instruction).
Trigger the inverse transformation in the X86 backend if BMI is
available and we can use the mentioned BMI instructions. This is done by
overriding the appropriate `shouldFoldSelectWithIdentityConstant()`
overload. In this way, we get `(select P (and X (- X)) X)` again, which
enables the use of `BLSI` (similar for the other cases described above).
Alive proofs: https://alive2.llvm.org/ce/z/MT_pKi
Fixes llvm#131587, fixes llvm#133848.
---------
Co-authored-by: Simon Pilgrim <[email protected]>
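As a closing illustration (a hand-written sketch, not part of the PR; the function name is made up), source code along these lines gives rise to the pattern in question: InstCombine canonicalizes the select as described above, and with this change the X86 backend can still select `BLSI` plus a conditional move when BMI is available.

```cpp
// Hypothetical example: conditionally isolate the lowest set bit.
// The `p ? ... : x` select is what InstCombine rewrites into the
// (and (select P (- X) umax) X) form discussed above.
unsigned lowestSetBitIf(bool p, unsigned x) {
  unsigned isolated = x & (0u - x); // BLSI candidate when BMI is available
  return p ? isolated : x;
}
```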