
Commit ebdb903

[X86] Handle X86ISD::EXPAND/COMPRESS nodes as target shuffles (#171119)
Allows for shuffle simplification. Required a minor fix to the overly reduced compress-undef-float-passthrough.ll regression test.
1 parent b08c72b commit ebdb903

3 files changed (+59, -58 lines)

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 46 additions & 0 deletions
@@ -2888,6 +2888,8 @@ static bool isTargetShuffle(unsigned Opcode) {
   case X86ISD::VPERMV:
   case X86ISD::VPERMV3:
   case X86ISD::VZEXT_MOVL:
+  case X86ISD::COMPRESS:
+  case X86ISD::EXPAND:
     return true;
   }
 }
@@ -5839,6 +5841,48 @@ static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
     }
     return false;
   }
+  case X86ISD::COMPRESS: {
+    SDValue CmpVec = N.getOperand(0);
+    SDValue PassThru = N.getOperand(1);
+    SDValue CmpMask = N.getOperand(2);
+    APInt UndefElts;
+    SmallVector<APInt> EltBits;
+    if (!getTargetConstantBitsFromNode(CmpMask, 1, UndefElts, EltBits))
+      return false;
+    assert(UndefElts.getBitWidth() == NumElems && EltBits.size() == NumElems &&
+           "Illegal compression mask");
+    for (unsigned I = 0; I != NumElems; ++I) {
+      if (!EltBits[I].isZero())
+        Mask.push_back(I);
+    }
+    while (Mask.size() != NumElems) {
+      Mask.push_back(NumElems + Mask.size());
+    }
+    Ops.push_back(CmpVec);
+    Ops.push_back(PassThru);
+    return true;
+  }
+  case X86ISD::EXPAND: {
+    SDValue ExpVec = N.getOperand(0);
+    SDValue PassThru = N.getOperand(1);
+    SDValue ExpMask = N.getOperand(2);
+    APInt UndefElts;
+    SmallVector<APInt> EltBits;
+    if (!getTargetConstantBitsFromNode(ExpMask, 1, UndefElts, EltBits))
+      return false;
+    assert(UndefElts.getBitWidth() == NumElems && EltBits.size() == NumElems &&
+           "Illegal expansion mask");
+    unsigned ExpIndex = 0;
+    for (unsigned I = 0; I != NumElems; ++I) {
+      if (EltBits[I].isZero())
+        Mask.push_back(I + NumElems);
+      else
+        Mask.push_back(ExpIndex++);
+    }
+    Ops.push_back(ExpVec);
+    Ops.push_back(PassThru);
+    return true;
+  }
   default:
     llvm_unreachable("unknown target shuffle node");
   }
@@ -61325,6 +61369,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case X86ISD::VPERM2X128:
   case X86ISD::SHUF128:
   case X86ISD::VZEXT_MOVL:
+  case X86ISD::COMPRESS:
+  case X86ISD::EXPAND:
   case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
   case X86ISD::FMADD_RND:
   case X86ISD::FMSUB:
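
As a quick illustration of the mapping the new getTargetShuffleMask cases perform, here is a minimal standalone sketch (plain std:: containers instead of LLVM's SDValue/APInt/SmallVector; the helper names compressShuffleMask/expandShuffleMask are made up for this example): COMPRESS packs the selected source elements at the bottom of the vector and fills the tail from the passthru operand, while EXPAND scatters consecutive source elements into the set mask positions and takes the passthru element everywhere else. Shuffle indices below NumElems refer to the source vector; indices at or above NumElems refer to the passthru.

// Minimal standalone sketch (not the LLVM code itself): how a constant
// compress/expand mask becomes a two-input shuffle mask. Indices < NumElems
// select from the source vector, indices >= NumElems from the passthru.
#include <cstdio>
#include <vector>

static std::vector<int> compressShuffleMask(const std::vector<bool> &MaskBits) {
  unsigned NumElems = MaskBits.size();
  std::vector<int> Mask;
  for (unsigned I = 0; I != NumElems; ++I)
    if (MaskBits[I])
      Mask.push_back(I); // selected source elements are packed at the bottom
  while (Mask.size() != NumElems)
    Mask.push_back(NumElems + Mask.size()); // tail is taken from the passthru
  return Mask;
}

static std::vector<int> expandShuffleMask(const std::vector<bool> &MaskBits) {
  unsigned NumElems = MaskBits.size();
  std::vector<int> Mask;
  unsigned ExpIndex = 0;
  for (unsigned I = 0; I != NumElems; ++I)
    Mask.push_back(MaskBits[I] ? int(ExpIndex++)      // next packed source element
                               : int(I + NumElems));  // keep the passthru element
  return Mask;
}

int main() {
  // Mask 0b00100101: elements 0, 2 and 5 are active.
  std::vector<bool> Bits = {true, false, true, false, false, true, false, false};
  for (int M : compressShuffleMask(Bits))
    std::printf("%d ", M); // prints: 0 2 5 11 12 13 14 15
  std::printf("\n");
  for (int M : expandShuffleMask(Bits))
    std::printf("%d ", M); // prints: 0 9 1 11 12 2 14 15
  std::printf("\n");
  return 0;
}

With a zero passthru, the combine_vexpandd_as_valignd test below decomposes to exactly this kind of mask ([16,0,1,...,14] for a v16i32 expand with only bit 0 clear), which the generic shuffle combiner can then lower as the valignd shown in the updated checks.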

llvm/test/CodeGen/X86/compress-undef-float-passthrough.ll

Lines changed: 2 additions & 2 deletions
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-- -mattr=avx512f,avx512vl | FileCheck %s --check-prefix=CHECK
 
-define void @test_compress_undef_float_passthrough() {
+define void @test_compress_undef_float_passthrough(<4 x double> %a0) {
 ; CHECK-LABEL: test_compress_undef_float_passthrough:
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: movb $5, %al
@@ -12,7 +12,7 @@ define void @test_compress_undef_float_passthrough() {
 ; CHECK-NEXT: retq
 entry: ; preds = %loop.50
   %0 = bitcast i4 undef to <4 x i1>
-  %1 = call <4 x double> @llvm.x86.avx512.mask.compress.v4f64(<4 x double> undef, <4 x double> undef, <4 x i1> <i1 1, i1 0, i1 1, i1 0>)
+  %1 = call <4 x double> @llvm.x86.avx512.mask.compress.v4f64(<4 x double> %a0, <4 x double> undef, <4 x i1> <i1 1, i1 0, i1 1, i1 0>)
   call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> %1, <4 x ptr> undef, i32 0, <4 x i1> %0)
   ret void
 }

llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll

Lines changed: 11 additions & 56 deletions
@@ -1035,68 +1035,23 @@ define <8 x double> @concat_vpermilvar_v8f64_v4f64(<4 x double> %a0, <4 x double
   ret <8 x double> %res
 }
 
-; TODO - shift elements up by one
+; shift elements up by one
 define <16 x i32> @combine_vexpandd_as_valignd(<16 x i32> %x) {
-; X86-AVX512F-LABEL: combine_vexpandd_as_valignd:
-; X86-AVX512F: # %bb.0:
-; X86-AVX512F-NEXT: movw $-2, %ax
-; X86-AVX512F-NEXT: kmovw %eax, %k1
-; X86-AVX512F-NEXT: vpexpandd %zmm0, %zmm0 {%k1} {z}
-; X86-AVX512F-NEXT: retl
-;
-; X86-AVX512BW-LABEL: combine_vexpandd_as_valignd:
-; X86-AVX512BW: # %bb.0:
-; X86-AVX512BW-NEXT: movw $-2, %ax
-; X86-AVX512BW-NEXT: kmovd %eax, %k1
-; X86-AVX512BW-NEXT: vpexpandd %zmm0, %zmm0 {%k1} {z}
-; X86-AVX512BW-NEXT: retl
-;
-; X64-AVX512F-LABEL: combine_vexpandd_as_valignd:
-; X64-AVX512F: # %bb.0:
-; X64-AVX512F-NEXT: movw $-2, %ax
-; X64-AVX512F-NEXT: kmovw %eax, %k1
-; X64-AVX512F-NEXT: vpexpandd %zmm0, %zmm0 {%k1} {z}
-; X64-AVX512F-NEXT: retq
-;
-; X64-AVX512BW-LABEL: combine_vexpandd_as_valignd:
-; X64-AVX512BW: # %bb.0:
-; X64-AVX512BW-NEXT: movw $-2, %ax
-; X64-AVX512BW-NEXT: kmovd %eax, %k1
-; X64-AVX512BW-NEXT: vpexpandd %zmm0, %zmm0 {%k1} {z}
-; X64-AVX512BW-NEXT: retq
+; CHECK-LABEL: combine_vexpandd_as_valignd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: valignd {{.*#+}} zmm0 = zmm1[15],zmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; CHECK-NEXT: ret{{[l|q]}}
   %res = call <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32> %x, <16 x i32> zeroinitializer, <16 x i1> <i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
   ret <16 x i32> %res
 }
 
-; TODO - zero upper half of vector
+; zero upper half of vector
 define <16 x i32> @combine_vcompressd_as_vmov(<16 x i32> %x) {
-; X86-AVX512F-LABEL: combine_vcompressd_as_vmov:
-; X86-AVX512F: # %bb.0:
-; X86-AVX512F-NEXT: movw $255, %ax
-; X86-AVX512F-NEXT: kmovw %eax, %k1
-; X86-AVX512F-NEXT: vpcompressd %zmm0, %zmm0 {%k1} {z}
-; X86-AVX512F-NEXT: retl
-;
-; X86-AVX512BW-LABEL: combine_vcompressd_as_vmov:
-; X86-AVX512BW: # %bb.0:
-; X86-AVX512BW-NEXT: movw $255, %ax
-; X86-AVX512BW-NEXT: kmovd %eax, %k1
-; X86-AVX512BW-NEXT: vpcompressd %zmm0, %zmm0 {%k1} {z}
-; X86-AVX512BW-NEXT: retl
-;
-; X64-AVX512F-LABEL: combine_vcompressd_as_vmov:
-; X64-AVX512F: # %bb.0:
-; X64-AVX512F-NEXT: movw $255, %ax
-; X64-AVX512F-NEXT: kmovw %eax, %k1
-; X64-AVX512F-NEXT: vpcompressd %zmm0, %zmm0 {%k1} {z}
-; X64-AVX512F-NEXT: retq
-;
-; X64-AVX512BW-LABEL: combine_vcompressd_as_vmov:
-; X64-AVX512BW: # %bb.0:
-; X64-AVX512BW-NEXT: movw $255, %ax
-; X64-AVX512BW-NEXT: kmovd %eax, %k1
-; X64-AVX512BW-NEXT: vpcompressd %zmm0, %zmm0 {%k1} {z}
-; X64-AVX512BW-NEXT: retq
+; CHECK-LABEL: combine_vcompressd_as_vmov:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
   %res = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> %x, <16 x i32> zeroinitializer, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>)
   ret <16 x i32> %res
 }

0 commit comments