Merged
34 changes: 34 additions & 0 deletions llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -58,6 +58,7 @@
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Type.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/Compiler.h"
@@ -3480,6 +3481,26 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
break;
}
break;
case ISD::VECTOR_COMPRESS: {
assert(!Op.getValueType().isScalableVector());
[Collaborator] Is this always the case, or do you mean SVE isn't supported?

SDValue Vec = Op.getOperand(0);
SDValue PassThru = Op.getOperand(2);
// If PassThru is undefined, early out.
if (PassThru.isUndef())
break;
[Contributor] I don't really see the need for this, since if PassThru is undef then the first recursive call to computeKnownBits below should return quickly with all bits unknown.

[Collaborator] +1
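A minimal standalone sketch of the point above (ours, not part of the patch; assumes only llvm/Support/KnownBits.h): an undef operand reports an all-unknown KnownBits, and intersecting anything with all-unknown stays all-unknown, so the generic path already handles an undef PassThru and both the isUndef() check and the setAllBits seeding can go.

#include "llvm/Support/KnownBits.h"
using namespace llvm;

int main() {
  KnownBits Undef(64);               // what computeKnownBits reports for undef: nothing known
  KnownBits Vec(64);
  Vec.Zero.setBitsFrom(16);          // pretend the source has its top 48 bits known zero
  KnownBits Known = Undef.intersectWith(Vec);
  return Known.isUnknown() ? 0 : 1;  // always 0: the isUnknown() early-out fires anyway
}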


Known.Zero.setAllBits();
Known.One.setAllBits();
[Collaborator] The setAllBits calls can now be removed.

Known2 = computeKnownBits(PassThru, Depth + 1);
Known = Known.intersectWith(Known2);
[Collaborator] You could just use Known = computeKnownBits(PassThru, Depth + 1); here.

[Collaborator] Also, can we use DemandedElts for the PassThru known bits? AFAICT those elements stay in the same lanes if used?

[Contributor Author] My understanding is that, given a Mask like {0, 0, 1, 1}, VECTOR_COMPRESS will take elements from the third and fourth indices of PassThru. If so, I believe DemandedElts would work here.
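To make that concrete, here is a scalar model of the compress semantics (a sketch of ours following the LangRef description of llvm.experimental.vector.compress; compressModel is a hypothetical name): selected lanes are packed to the front, and each remaining tail lane keeps the PassThru value at that same index, which is why DemandedElts can be forwarded to the PassThru operand.

#include <array>
#include <cstddef>

// Scalar model: each selected lane of Vec is packed to the front;
// the remaining lanes keep PassThru at their original positions.
template <typename T, std::size_t N>
std::array<T, N> compressModel(const std::array<T, N> &Vec,
                               const std::array<bool, N> &Mask,
                               const std::array<T, N> &PassThru) {
  std::array<T, N> Out = PassThru; // tail lanes default to PassThru in place
  std::size_t J = 0;
  for (std::size_t I = 0; I < N; ++I)
    if (Mask[I])
      Out[J++] = Vec[I];
  return Out;
}
// Mask {0,0,1,1}: Out = {Vec[2], Vec[3], PassThru[2], PassThru[3]}, so the
// surviving PassThru lanes are exactly the demanded tail indices.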

// If we don't know any bits, early out.
if (Known.isUnknown())
break;
Known2 = computeKnownBits(Vec, Depth + 1);
Known = Known.intersectWith(Known2);
break;
}
case ISD::VECTOR_SHUFFLE: {
assert(!Op.getValueType().isScalableVector());
// Collect the known bits that are shared by every vector element referenced
@@ -4792,6 +4813,19 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
}
return Tmp;

case ISD::VECTOR_COMPRESS: {
SDValue Vec = Op.getOperand(0);
SDValue Mask = Op.getOperand(1);
SDValue PassThru = Op.getOperand(2);
// If PassThru is undefined, early out.
if (PassThru.isUndef())
return 1;
Tmp = ComputeNumSignBits(Vec, Depth + 1);
Tmp2 = ComputeNumSignBits(PassThru, Depth + 1);
[Contributor] You can make the early-out more generic:

Suggested change:
-    // If PassThru is undefined, early out.
-    if (PassThru.isUndef())
-      return 1;
-    Tmp = ComputeNumSignBits(Vec, Depth + 1);
-    Tmp2 = ComputeNumSignBits(PassThru, Depth + 1);
+    Tmp = ComputeNumSignBits(PassThru, Depth + 1);
+    if (Tmp == 1)
+      return 1;
+    Tmp2 = ComputeNumSignBits(Vec, Depth + 1);

[Collaborator] Again, we might be able to use DemandedElts for the PassThru sign-bits calc.
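A back-of-envelope check of the std::min logic against the sext tests added below (our arithmetic, not from the patch):

#include <algorithm>
#include <cassert>

int main() {
  unsigned VecBits = 64 - 16 + 1;  // sext i16 -> i64: at least 49 sign bits
  unsigned PassBits = 64 - 2;      // and with splat(3): bits 2..63 zero, so 62
  unsigned Known = std::min(VecBits, PassBits);  // Tmp = std::min(Tmp, Tmp2)
  assert(Known >= 49 && "enough to fold ashr(shl(x, 48), 48) back to x");
  return 0;
}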

Tmp = std::min(Tmp, Tmp2);
return Tmp;
}

case ISD::VECTOR_SHUFFLE: {
// Collect the minimum number of sign bits that are shared by every vector
// element referenced by the shuffle.
113 changes: 113 additions & 0 deletions llvm/test/CodeGen/AArch64/vector-compress.ll
@@ -471,3 +471,116 @@ define <3 x i3> @test_compress_narrow_illegal_element_type(<3 x i3> %vec, <3 x i
%out = call <3 x i3> @llvm.experimental.vector.compress(<3 x i3> %vec, <3 x i1> %mask, <3 x i3> undef)
ret <3 x i3> %out
}

define <4 x i32> @test_compress_knownbits_zext_v4i16_4i32(<4 x i16> %vec, <4 x i1> %mask, <4 x i32> %passthru) nounwind {
; CHECK-LABEL: test_compress_knownbits_zext_v4i16_4i32:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: ushll.4s v1, v1, #0
; CHECK-NEXT: movi.4s v3, #1
; CHECK-NEXT: mov x14, sp
; CHECK-NEXT: movi.4s v4, #3
; CHECK-NEXT: ushll.4s v0, v0, #0
; CHECK-NEXT: mov x13, sp
; CHECK-NEXT: mov x12, sp
; CHECK-NEXT: mov x15, sp
; CHECK-NEXT: shl.4s v1, v1, #31
; CHECK-NEXT: and.16b v2, v2, v4
; CHECK-NEXT: cmlt.4s v1, v1, #0
; CHECK-NEXT: str q2, [sp]
; CHECK-NEXT: and.16b v3, v1, v3
; CHECK-NEXT: mov.s w8, v1[1]
; CHECK-NEXT: mov.s w9, v1[2]
; CHECK-NEXT: mov.s w10, v1[3]
; CHECK-NEXT: fmov w11, s1
; CHECK-NEXT: addv.4s s1, v3
; CHECK-NEXT: and x16, x11, #0x1
; CHECK-NEXT: and x8, x8, #0x1
; CHECK-NEXT: bfi x14, x11, #2, #1
; CHECK-NEXT: add x8, x16, x8
; CHECK-NEXT: and x9, x9, #0x1
; CHECK-NEXT: and x10, x10, #0x1
; CHECK-NEXT: fmov w11, s1
; CHECK-NEXT: add x9, x8, x9
; CHECK-NEXT: mov w16, #3 ; =0x3
; CHECK-NEXT: add x10, x9, x10
; CHECK-NEXT: orr x8, x12, x8, lsl #2
; CHECK-NEXT: bfi x15, x9, #2, #2
; CHECK-NEXT: cmp x10, #3
; CHECK-NEXT: bfi x13, x11, #2, #2
; CHECK-NEXT: mov.s w11, v0[3]
; CHECK-NEXT: csel x9, x10, x16, lo
; CHECK-NEXT: ldr w13, [x13]
; CHECK-NEXT: str s0, [sp]
; CHECK-NEXT: st1.s { v0 }[1], [x14]
; CHECK-NEXT: st1.s { v0 }[2], [x8]
; CHECK-NEXT: orr x8, x12, x9, lsl #2
; CHECK-NEXT: csel w9, w11, w13, hi
; CHECK-NEXT: st1.s { v0 }[3], [x15]
; CHECK-NEXT: str w9, [x8]
; CHECK-NEXT: ldr q0, [sp], #16
; CHECK-NEXT: ret
entry:
%xvec = zext <4 x i16> %vec to <4 x i32>
%xpassthru = and <4 x i32> %passthru, splat (i32 3)
%out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %xvec, <4 x i1> %mask, <4 x i32> %xpassthru)
%res = and <4 x i32> %out, splat (i32 65535)
ret <4 x i32> %res
}

define <4 x i32> @test_compress_numsignbits_sext_v4i16_4i32(<4 x i16> %vec, <4 x i1> %mask, <4 x i32> %passthru) nounwind {
; CHECK-LABEL: test_compress_numsignbits_sext_v4i16_4i32:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: ushll.4s v1, v1, #0
; CHECK-NEXT: movi.4s v3, #1
; CHECK-NEXT: mov x14, sp
; CHECK-NEXT: movi.4s v4, #3
; CHECK-NEXT: sshll.4s v0, v0, #0
; CHECK-NEXT: mov x13, sp
; CHECK-NEXT: mov x12, sp
; CHECK-NEXT: mov x15, sp
; CHECK-NEXT: shl.4s v1, v1, #31
; CHECK-NEXT: and.16b v2, v2, v4
; CHECK-NEXT: cmlt.4s v1, v1, #0
; CHECK-NEXT: str q2, [sp]
; CHECK-NEXT: and.16b v3, v1, v3
; CHECK-NEXT: mov.s w8, v1[1]
; CHECK-NEXT: mov.s w9, v1[2]
; CHECK-NEXT: mov.s w10, v1[3]
; CHECK-NEXT: fmov w11, s1
; CHECK-NEXT: addv.4s s1, v3
; CHECK-NEXT: and x16, x11, #0x1
; CHECK-NEXT: and x8, x8, #0x1
; CHECK-NEXT: bfi x14, x11, #2, #1
; CHECK-NEXT: add x8, x16, x8
; CHECK-NEXT: and x9, x9, #0x1
; CHECK-NEXT: and x10, x10, #0x1
; CHECK-NEXT: fmov w11, s1
; CHECK-NEXT: add x9, x8, x9
; CHECK-NEXT: mov w16, #3 ; =0x3
; CHECK-NEXT: add x10, x9, x10
; CHECK-NEXT: orr x8, x12, x8, lsl #2
; CHECK-NEXT: bfi x15, x9, #2, #2
; CHECK-NEXT: cmp x10, #3
; CHECK-NEXT: bfi x13, x11, #2, #2
; CHECK-NEXT: mov.s w11, v0[3]
; CHECK-NEXT: csel x9, x10, x16, lo
; CHECK-NEXT: ldr w13, [x13]
; CHECK-NEXT: str s0, [sp]
; CHECK-NEXT: st1.s { v0 }[1], [x14]
; CHECK-NEXT: st1.s { v0 }[2], [x8]
; CHECK-NEXT: orr x8, x12, x9, lsl #2
; CHECK-NEXT: csel w9, w11, w13, hi
; CHECK-NEXT: st1.s { v0 }[3], [x15]
; CHECK-NEXT: str w9, [x8]
; CHECK-NEXT: ldr q0, [sp], #16
; CHECK-NEXT: ret
entry:
%xvec = sext <4 x i16> %vec to <4 x i32>
%xpassthru = and <4 x i32> %passthru, splat(i32 3)
%out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %xvec, <4 x i1> %mask, <4 x i32> %xpassthru)
%shl = shl <4 x i32> %out, splat(i32 16)
%res = ashr <4 x i32> %shl, splat(i32 16)
ret <4 x i32> %res
}
209 changes: 209 additions & 0 deletions llvm/test/CodeGen/X86/vector-compress.ll
@@ -4427,6 +4427,215 @@ define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i
ret <64 x i32> %out
}

define <8 x i64> @test_compress_knownbits_zext_v8i16_8i64(<8 x i16> %vec, <8 x i1> %mask, <8 x i64> %passthru) nounwind {
; AVX2-LABEL: test_compress_knownbits_zext_v8i16_8i64:
; AVX2: # %bb.0:
; AVX2-NEXT: pushq %rbp
; AVX2-NEXT: movq %rsp, %rbp
; AVX2-NEXT: pushq %rbx
; AVX2-NEXT: andq $-32, %rsp
; AVX2-NEXT: subq $96, %rsp
; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm5 = [3,3,3,3]
; AVX2-NEXT: vandps %ymm5, %ymm2, %ymm2
; AVX2-NEXT: vandps %ymm5, %ymm3, %ymm3
; AVX2-NEXT: vmovaps %ymm3, {{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovaps %ymm2, (%rsp)
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX2-NEXT: vpaddq %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpextrq $1, %xmm2, %rcx
; AVX2-NEXT: vmovq %xmm2, %rax
; AVX2-NEXT: addl %ecx, %eax
; AVX2-NEXT: andl $7, %eax
; AVX2-NEXT: vpextrw $1, %xmm1, %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: vmovd %xmm1, %edx
; AVX2-NEXT: andl $1, %edx
; AVX2-NEXT: addq %rdx, %rcx
; AVX2-NEXT: vpextrw $2, %xmm1, %esi
; AVX2-NEXT: andl $1, %esi
; AVX2-NEXT: addq %rcx, %rsi
; AVX2-NEXT: vpextrw $3, %xmm1, %edi
; AVX2-NEXT: andl $1, %edi
; AVX2-NEXT: addq %rsi, %rdi
; AVX2-NEXT: vpextrw $4, %xmm1, %r8d
; AVX2-NEXT: andl $1, %r8d
; AVX2-NEXT: addq %rdi, %r8
; AVX2-NEXT: vpextrw $5, %xmm1, %r9d
; AVX2-NEXT: andl $1, %r9d
; AVX2-NEXT: addq %r8, %r9
; AVX2-NEXT: vpextrw $6, %xmm1, %r10d
; AVX2-NEXT: andl $1, %r10d
; AVX2-NEXT: addq %r9, %r10
; AVX2-NEXT: vpextrw $7, %xmm1, %r11d
; AVX2-NEXT: andl $1, %r11d
; AVX2-NEXT: addq %r10, %r11
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpextrq $1, %xmm1, %rbx
; AVX2-NEXT: cmpq $8, %r11
; AVX2-NEXT: cmovbq (%rsp,%rax,8), %rbx
; AVX2-NEXT: vmovq %xmm4, (%rsp)
; AVX2-NEXT: vpextrq $1, %xmm4, (%rsp,%rdx,8)
; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm2
; AVX2-NEXT: vmovq %xmm2, (%rsp,%rcx,8)
; AVX2-NEXT: vpextrq $1, %xmm2, (%rsp,%rsi,8)
; AVX2-NEXT: andl $7, %edi
; AVX2-NEXT: vmovq %xmm0, (%rsp,%rdi,8)
; AVX2-NEXT: andl $7, %r8d
; AVX2-NEXT: vpextrq $1, %xmm0, (%rsp,%r8,8)
; AVX2-NEXT: andl $7, %r9d
; AVX2-NEXT: vmovq %xmm1, (%rsp,%r9,8)
; AVX2-NEXT: andl $7, %r10d
; AVX2-NEXT: vpextrq $1, %xmm1, (%rsp,%r10,8)
; AVX2-NEXT: cmpq $7, %r11
; AVX2-NEXT: movl $7, %eax
; AVX2-NEXT: cmovbq %r11, %rax
; AVX2-NEXT: movl %eax, %eax
; AVX2-NEXT: movq %rbx, (%rsp,%rax,8)
; AVX2-NEXT: vmovaps (%rsp), %ymm0
; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX2-NEXT: leaq -8(%rbp), %rsp
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test_compress_knownbits_zext_v8i16_8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1
; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1
; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
; AVX512F-NEXT: vpmovzxwq {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0
; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: test_compress_knownbits_zext_v8i16_8i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $15, %xmm1, %xmm1
; AVX512VL-NEXT: vpmovw2m %xmm1, %k1
; AVX512VL-NEXT: vpmovzxwq {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512VL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0
; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
; AVX512VL-NEXT: retq
%xvec = zext <8 x i16> %vec to <8 x i64> ; 0 -> 65535
%xpassthru = and <8 x i64> %passthru, splat (i64 3) ; 0 -> 3
%out = call <8 x i64> @llvm.experimental.vector.compress(<8 x i64> %xvec, <8 x i1> %mask, <8 x i64> %xpassthru)
%res = and <8 x i64> %out, splat (i64 65535) ; unnecessary - %out guaranteed to be 0 -> 65535
ret <8 x i64> %res
}
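For the zext test above, a small harness (ours; assumes llvm/Support/KnownBits.h, not part of the test suite) reproduces the known-bits intersection that lets the trailing and with 65535 fold:

#include "llvm/Support/KnownBits.h"
#include <cassert>
using namespace llvm;

int main() {
  KnownBits XVec(64);
  XVec.Zero.setBitsFrom(16);                 // zext i16 -> i64: bits 16..63 known zero
  KnownBits XPass(64);
  XPass.Zero.setBitsFrom(2);                 // and with splat(3): bits 2..63 known zero
  KnownBits Out = XVec.intersectWith(XPass); // what the VECTOR_COMPRESS case computes
  assert(Out.countMinLeadingZeros() == 48);  // result is 0..65535, so the 'and' is a no-op
  return 0;
}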

define <8 x i64> @test_compress_knownbits_sext_v8i16_8i64(<8 x i16> %vec, <8 x i1> %mask, <8 x i64> %passthru) nounwind {
; AVX2-LABEL: test_compress_knownbits_sext_v8i16_8i64:
; AVX2: # %bb.0:
; AVX2-NEXT: pushq %rbp
; AVX2-NEXT: movq %rsp, %rbp
; AVX2-NEXT: pushq %rbx
; AVX2-NEXT: andq $-32, %rsp
; AVX2-NEXT: subq $96, %rsp
; AVX2-NEXT: vpmovsxwq %xmm0, %ymm4
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0
; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm5 = [3,3,3,3]
; AVX2-NEXT: vandps %ymm5, %ymm2, %ymm2
; AVX2-NEXT: vandps %ymm5, %ymm3, %ymm3
; AVX2-NEXT: vmovaps %ymm3, {{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovaps %ymm2, (%rsp)
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX2-NEXT: vpaddq %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpextrq $1, %xmm2, %rcx
; AVX2-NEXT: vmovq %xmm2, %rax
; AVX2-NEXT: addl %ecx, %eax
; AVX2-NEXT: andl $7, %eax
; AVX2-NEXT: vpextrw $1, %xmm1, %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: vmovd %xmm1, %edx
; AVX2-NEXT: andl $1, %edx
; AVX2-NEXT: addq %rdx, %rcx
; AVX2-NEXT: vpextrw $2, %xmm1, %esi
; AVX2-NEXT: andl $1, %esi
; AVX2-NEXT: addq %rcx, %rsi
; AVX2-NEXT: vpextrw $3, %xmm1, %edi
; AVX2-NEXT: andl $1, %edi
; AVX2-NEXT: addq %rsi, %rdi
; AVX2-NEXT: vpextrw $4, %xmm1, %r8d
; AVX2-NEXT: andl $1, %r8d
; AVX2-NEXT: addq %rdi, %r8
; AVX2-NEXT: vpextrw $5, %xmm1, %r9d
; AVX2-NEXT: andl $1, %r9d
; AVX2-NEXT: addq %r8, %r9
; AVX2-NEXT: vpextrw $6, %xmm1, %r10d
; AVX2-NEXT: andl $1, %r10d
; AVX2-NEXT: addq %r9, %r10
; AVX2-NEXT: vpextrw $7, %xmm1, %r11d
; AVX2-NEXT: andl $1, %r11d
; AVX2-NEXT: addq %r10, %r11
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpextrq $1, %xmm1, %rbx
; AVX2-NEXT: cmpq $8, %r11
; AVX2-NEXT: cmovbq (%rsp,%rax,8), %rbx
; AVX2-NEXT: vmovq %xmm4, (%rsp)
; AVX2-NEXT: vpextrq $1, %xmm4, (%rsp,%rdx,8)
; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm2
; AVX2-NEXT: vmovq %xmm2, (%rsp,%rcx,8)
; AVX2-NEXT: vpextrq $1, %xmm2, (%rsp,%rsi,8)
; AVX2-NEXT: andl $7, %edi
; AVX2-NEXT: vmovq %xmm0, (%rsp,%rdi,8)
; AVX2-NEXT: andl $7, %r8d
; AVX2-NEXT: vpextrq $1, %xmm0, (%rsp,%r8,8)
; AVX2-NEXT: andl $7, %r9d
; AVX2-NEXT: vmovq %xmm1, (%rsp,%r9,8)
; AVX2-NEXT: andl $7, %r10d
; AVX2-NEXT: vpextrq $1, %xmm1, (%rsp,%r10,8)
; AVX2-NEXT: cmpq $7, %r11
; AVX2-NEXT: movl $7, %eax
; AVX2-NEXT: cmovbq %r11, %rax
; AVX2-NEXT: movl %eax, %eax
; AVX2-NEXT: movq %rbx, (%rsp,%rax,8)
; AVX2-NEXT: vmovaps (%rsp), %ymm0
; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX2-NEXT: leaq -8(%rbp), %rsp
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test_compress_knownbits_sext_v8i16_8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1
; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1
; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm1
; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0
; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: test_compress_knownbits_sext_v8i16_8i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $15, %xmm1, %xmm1
; AVX512VL-NEXT: vpmovw2m %xmm1, %k1
; AVX512VL-NEXT: vpmovsxwq %xmm0, %zmm1
; AVX512VL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0
; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
; AVX512VL-NEXT: retq
%xvec = sext <8 x i16> %vec to <8 x i64> ; sign extend vec
%xpassthru = and <8 x i64> %passthru, splat(i64 3)
%out = call <8 x i64> @llvm.experimental.vector.compress(<8 x i64> %xvec, <8 x i1> %mask, <8 x i64> %xpassthru)
%shl = shl <8 x i64> %out, splat(i64 48)
%res = ashr <8 x i64> %shl, splat(i64 48)
ret <8 x i64> %res
}

define <4 x i32> @test_compress_all_const() nounwind {
; AVX2-LABEL: test_compress_all_const:
; AVX2: # %bb.0: