Skip to content

Conversation

@topperc
Copy link
Collaborator

@topperc topperc commented Oct 28, 2024

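When the passthru operand of vector_compress is all zeros, we can select the zero-masking (kz, i.e. `{z}`) form of the compress instruction, which automatically zeroes the remaining destination elements instead of requiring a separate vector-zeroing instruction.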

Fixes #113263.

We can use the kz form to automatically zero the extra elements.

Fixes llvm#113263.
@llvmbot
Copy link
Member

llvmbot commented Oct 28, 2024

@llvm/pr-subscribers-backend-x86

Author: Craig Topper (topperc)

Changes

We can use the kz form to automatically zero the extra elements.

Fixes #113263.


Full diff: https://github.com/llvm/llvm-project/pull/113970.diff

2 Files Affected:

  • (modified) llvm/lib/Target/X86/X86InstrAVX512.td (+3)
  • (modified) llvm/test/CodeGen/X86/vector-compress.ll (+56)
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 98c31867e6b22b..32c4ebc331f1d7 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -10549,6 +10549,9 @@ multiclass compress_by_vec_width_lowering<X86VectorVTInfo _, string Name> {
   def : Pat<(_.VT (vector_compress _.RC:$src, _.KRCWM:$mask, undef)),
             (!cast<Instruction>(Name#_.ZSuffix#rrkz)
                             _.KRCWM:$mask, _.RC:$src)>;
+  def : Pat<(_.VT (vector_compress _.RC:$src, _.KRCWM:$mask, _.ImmAllZerosV)),
+            (!cast<Instruction>(Name#_.ZSuffix#rrkz)
+                            _.KRCWM:$mask, _.RC:$src)>;
   def : Pat<(_.VT (vector_compress _.RC:$src, _.KRCWM:$mask, _.RC:$passthru)),
               (!cast<Instruction>(Name#_.ZSuffix#rrk)
                             _.RC:$passthru, _.KRCWM:$mask, _.RC:$src)>;
diff --git a/llvm/test/CodeGen/X86/vector-compress.ll b/llvm/test/CodeGen/X86/vector-compress.ll
index 2b963ab896cc9e..f8c076db65de94 100644
--- a/llvm/test/CodeGen/X86/vector-compress.ll
+++ b/llvm/test/CodeGen/X86/vector-compress.ll
@@ -1211,3 +1211,59 @@ define <3 x i3> @test_compress_narrow_illegal_element_type(<3 x i3> %vec, <3 x i
     %out = call <3 x i3> @llvm.experimental.vector.compress(<3 x i3> %vec, <3 x i1> %mask, <3 x i3> undef)
     ret <3 x i3> %out
 }
+
+define <4 x i32> @test_compress_v4i32_zero_passthru(<4 x i32> %vec, <4 x i1> %mask) {
+; AVX2-LABEL: test_compress_v4i32_zero_passthru:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm1
+; AVX2-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vmovaps %xmm2, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovss %xmm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovd %xmm1, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    vextractps $1, %xmm0, -24(%rsp,%rax,4)
+; AVX2-NEXT:    vpextrd $1, %xmm1, %ecx
+; AVX2-NEXT:    andl $1, %ecx
+; AVX2-NEXT:    addq %rax, %rcx
+; AVX2-NEXT:    vextractps $2, %xmm0, -24(%rsp,%rcx,4)
+; AVX2-NEXT:    vpextrd $2, %xmm1, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    addq %rcx, %rax
+; AVX2-NEXT:    vpextrd $3, %xmm1, %ecx
+; AVX2-NEXT:    andl $1, %ecx
+; AVX2-NEXT:    addq %rax, %rcx
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    andl $3, %eax
+; AVX2-NEXT:    vextractps $3, %xmm0, -24(%rsp,%rax,4)
+; AVX2-NEXT:    xorl %eax, %eax
+; AVX2-NEXT:    cmpq $3, %rcx
+; AVX2-NEXT:    movl $3, %edx
+; AVX2-NEXT:    cmovbq %rcx, %rdx
+; AVX2-NEXT:    vextractps $3, %xmm0, %ecx
+; AVX2-NEXT:    cmovbel %eax, %ecx
+; AVX2-NEXT:    movl %ecx, -24(%rsp,%rdx,4)
+; AVX2-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: test_compress_v4i32_zero_passthru:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
+; AVX512F-NEXT:    vpcompressd %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: test_compress_v4i32_zero_passthru:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX512VL-NEXT:    vptestmd %xmm1, %xmm1, %k1
+; AVX512VL-NEXT:    vpcompressd %xmm0, %xmm0 {%k1} {z}
+; AVX512VL-NEXT:    retq
+    %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> %mask, <4 x i32> zeroinitializer)
+    ret <4 x i32> %out
+}

@topperc topperc merged commit 635c344 into llvm:main Oct 29, 2024
10 checks passed
@topperc topperc deleted the pr/x86-compress branch October 29, 2024 02:59
NoumanAmir657 pushed a commit to NoumanAmir657/llvm-project that referenced this pull request Nov 4, 2024
…#113970)

We can use the kz form to automatically zero the extra elements.

Fixes llvm#113263.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

[AVX-512] llvm.experimental.vector.compress emits a vector-zeroing instruction instead of using {z}

3 participants