[X86] SimplifyDemandedBitsForTargetNode - PCMPGT(0,X) only demands the signbit of X #163981
Conversation
@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes

PCMPGT(0,X) is a sign-splat pattern - we only need the signbit of X.

The vector-compress.ll simplification is a side effect of the inner SimplifyDemandedBits call passing through the DemandedElts mask, only demanding the lowest 4 elements of a legalised v16i8 type.

Full diff: https://github.com/llvm/llvm-project/pull/163981.diff

5 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 2feee053ea813..b05d7c7fd7da3 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -44813,10 +44813,16 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
}
case X86ISD::PCMPGT:
// icmp sgt(0, R) == ashr(R, BitWidth-1).
- // iff we only need the sign bit then we can use R directly.
- if (OriginalDemandedBits.isSignMask() &&
- ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
- return TLO.CombineTo(Op, Op.getOperand(1));
+ if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode())) {
+ // iff we only need the signbit then we can use R directly.
+ if (OriginalDemandedBits.isSignMask())
+ return TLO.CombineTo(Op, Op.getOperand(1));
+ // otherwise we just need R's signbit for the comparison.
+ APInt SignMask = APInt::getSignMask(BitWidth);
+ if (SimplifyDemandedBits(Op.getOperand(1), SignMask, OriginalDemandedElts,
+ Known, TLO, Depth + 1))
+ return true;
+ }
break;
case X86ISD::MOVMSK: {
SDValue Src = Op.getOperand(0);
diff --git a/llvm/test/CodeGen/X86/combine-umax.ll b/llvm/test/CodeGen/X86/combine-umax.ll
index 25f8ec891a247..482b4fcd744ed 100644
--- a/llvm/test/CodeGen/X86/combine-umax.ll
+++ b/llvm/test/CodeGen/X86/combine-umax.ll
@@ -60,7 +60,7 @@ define <16 x i8> @test_v16i8_reassociation(<16 x i8> %a) {
define <16 x i8> @test_v16i8_demandedbits(<16 x i8> %x, <16 x i8> %y, <16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: test_v16i8_demandedbits:
; SSE2: # %bb.0:
-; SSE2-NEXT: pmaxub %xmm1, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pcmpgtb %xmm0, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm3
diff --git a/llvm/test/CodeGen/X86/combine-umin.ll b/llvm/test/CodeGen/X86/combine-umin.ll
index 76dbcb50bf8c7..e2757d00d1f61 100644
--- a/llvm/test/CodeGen/X86/combine-umin.ll
+++ b/llvm/test/CodeGen/X86/combine-umin.ll
@@ -77,7 +77,7 @@ define <16 x i8> @test_v16i8_reassociation(<16 x i8> %a) {
define <16 x i8> @test_v16i8_demandedbits(<16 x i8> %x, <16 x i8> %y, <16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: test_v16i8_demandedbits:
; SSE2: # %bb.0:
-; SSE2-NEXT: pminub %xmm1, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pcmpgtb %xmm0, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm3
diff --git a/llvm/test/CodeGen/X86/vector-compress.ll b/llvm/test/CodeGen/X86/vector-compress.ll
index ac932d51017ae..1a6351524ffbd 100644
--- a/llvm/test/CodeGen/X86/vector-compress.ll
+++ b/llvm/test/CodeGen/X86/vector-compress.ll
@@ -1090,7 +1090,6 @@ define <16 x i8> @test_compress_v16i8(<16 x i8> %vec, <16 x i1> %mask, <16 x i8>
; AVX2-NEXT: pushq %r12
; AVX2-NEXT: pushq %rbx
; AVX2-NEXT: vpsllw $7, %xmm1, %xmm1
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vpcmpgtb %xmm1, %xmm3, %xmm1
; AVX2-NEXT: vmovaps %xmm2, -{{[0-9]+}}(%rsp)
@@ -1335,7 +1334,6 @@ define <32 x i8> @test_compress_v32i8(<32 x i8> %vec, <32 x i1> %mask, <32 x i8>
; AVX2-NEXT: andq $-32, %rsp
; AVX2-NEXT: subq $64, %rsp
; AVX2-NEXT: vpsllw $7, %ymm1, %ymm1
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vpcmpgtb %ymm1, %ymm3, %ymm3
; AVX2-NEXT: vmovaps %ymm2, (%rsp)
@@ -4733,7 +4731,6 @@ define <4 x i8> @test_compress_small(<4 x i8> %vec, <4 x i1> %mask) nounwind {
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpsllw $7, %xmm1, %xmm1
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm1
; AVX2-NEXT: vpextrb $0, %xmm0, -{{[0-9]+}}(%rsp)
@@ -4751,72 +4748,7 @@ define <4 x i8> @test_compress_small(<4 x i8> %vec, <4 x i1> %mask) nounwind {
; AVX2-NEXT: vpextrb $3, %xmm1, %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: vpextrb $4, %xmm0, -24(%rsp,%rcx)
-; AVX2-NEXT: vpextrb $4, %xmm1, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: vpextrb $5, %xmm1, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
-; AVX2-NEXT: andl $15, %eax
-; AVX2-NEXT: vpextrb $5, %xmm0, -24(%rsp,%rax)
-; AVX2-NEXT: vpextrb $6, %xmm1, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $15, %ecx
-; AVX2-NEXT: vpextrb $6, %xmm0, -24(%rsp,%rcx)
-; AVX2-NEXT: vpextrb $7, %xmm1, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
-; AVX2-NEXT: andl $15, %eax
-; AVX2-NEXT: vpextrb $7, %xmm0, -24(%rsp,%rax)
-; AVX2-NEXT: vpextrb $8, %xmm1, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $15, %ecx
-; AVX2-NEXT: vpextrb $8, %xmm0, -24(%rsp,%rcx)
-; AVX2-NEXT: vpextrb $9, %xmm1, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
-; AVX2-NEXT: andl $15, %eax
-; AVX2-NEXT: vpextrb $9, %xmm0, -24(%rsp,%rax)
-; AVX2-NEXT: vpextrb $10, %xmm1, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $15, %ecx
-; AVX2-NEXT: vpextrb $10, %xmm0, -24(%rsp,%rcx)
-; AVX2-NEXT: vpextrb $11, %xmm1, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
-; AVX2-NEXT: andl $15, %eax
-; AVX2-NEXT: vpextrb $11, %xmm0, -24(%rsp,%rax)
-; AVX2-NEXT: vpextrb $12, %xmm1, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $15, %ecx
-; AVX2-NEXT: vpextrb $12, %xmm0, -24(%rsp,%rcx)
-; AVX2-NEXT: vpextrb $13, %xmm1, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
-; AVX2-NEXT: andl $15, %eax
-; AVX2-NEXT: vpextrb $13, %xmm0, -24(%rsp,%rax)
-; AVX2-NEXT: vpextrb $14, %xmm1, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addl %ecx, %eax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $15, %ecx
-; AVX2-NEXT: vpextrb $14, %xmm0, -24(%rsp,%rcx)
-; AVX2-NEXT: andl $15, %eax
-; AVX2-NEXT: vpextrb $15, %xmm0, -24(%rsp,%rax)
+; AVX2-NEXT: vpextrb $15, %xmm0, -24(%rsp,%rcx)
; AVX2-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
; AVX2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vselect-avx.ll b/llvm/test/CodeGen/X86/vselect-avx.ll
index 1c5be0330b04e..ac330a7e60396 100644
--- a/llvm/test/CodeGen/X86/vselect-avx.ll
+++ b/llvm/test/CodeGen/X86/vselect-avx.ll
@@ -151,23 +151,19 @@ define <32 x i8> @PR22706(<32 x i1> %x) {
; AVX1: ## %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsllw $7, %xmm1, %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpgtb %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
-; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtb %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: PR22706:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
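For intuition, here is a minimal standalone sketch (my own illustration, not part of the patch) of the identity the fold relies on: per lane, the signed comparison 0 > X holds exactly when X's sign bit is set, so PCMPGT(0,X) only ever reads that one bit - the same bit that ashr(X, BitWidth-1) splats. Plain C++ with an 8-bit lane is assumed for brevity:

#include <cassert>
#include <cstdint>

int main() {
  // Exhaustively check one 8-bit lane: the signed compare 0 > x is
  // equivalent to testing x's sign bit, which is also what
  // ashr(x, BitWidth-1) broadcasts across the lane.
  for (int v = -128; v < 128; ++v) {
    int8_t x = static_cast<int8_t>(v);
    bool cmp = 0 > x;                          // what PCMPGT(0,X) computes per lane
    bool signbit = (uint8_t)x & 0x80;          // the only bit the compare depends on
    assert(cmp == signbit);
  }
  return 0;
}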
@boomanaiden154 Any idea why the CI has started to always fail here?
That was #163937. It should be fixed if you update the branch.
LGTM.
; SSE2-LABEL: test_v16i8_demandedbits:
; SSE2: # %bb.0:
-; SSE2-NEXT: pmaxub %xmm1, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
Seems AVX512BW doesn't utilize the info. I assume vpor + vpmovb2m is a little better.
Yes, we don't have an equivalent for generic ICMP yet.
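As a side note on why the pmaxub -> por change above (and the matching pminub -> pand change in combine-umin.ll) is sound: once only the sign bit is demanded, unsigned max/min degenerate to OR/AND of the operands' top bits. A quick exhaustive check (my own sketch, not from the patch):

#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  // With only bit 7 demanded, umax(x,y) behaves like x|y and umin(x,y)
  // like x&y: the max's top bit is set iff either operand's is, and the
  // min's top bit is set iff both operands' are.
  for (unsigned x = 0; x < 256; ++x) {
    for (unsigned y = 0; y < 256; ++y) {
      uint8_t mx = std::max<uint8_t>(x, y);
      uint8_t mn = std::min<uint8_t>(x, y);
      assert((mx & 0x80u) == ((x | y) & 0x80u));
      assert((mn & 0x80u) == ((x & y) & 0x80u));
    }
  }
  return 0;
}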
LLVM Buildbot has detected a new failure on builder.

Full details are available at: https://lab.llvm.org/buildbot/#/builders/18/builds/22048

Here is the relevant piece of the build log for the reference: