From 125c6cb9de6ac7851f0453f46ae53aa86b28f61f Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Fri, 23 May 2025 09:59:42 +0100
Subject: [PATCH] [X86] lowerVECTOR_SHUFFLE - canonicalize zeros/ones/fp splat
 constants to ensure no undefs

Make it easier for splat/element-equivalent detection by ensuring constant splats contain no undefs.

Integer constants are limited to rematerializable zeros/ones values to avoid unnecessary scalar_to_vector(int) -> load conversions - we can relax this later if useful
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       | 19 +++++++++++++++++++
 llvm/test/CodeGen/X86/pr34592.ll              |  4 ++--
 llvm/test/CodeGen/X86/pr38639.ll              |  7 ++-----
 .../test/CodeGen/X86/vector-shuffle-128-v4.ll |  4 ++--
 .../test/CodeGen/X86/vector-shuffle-avx512.ll |  3 +--
 5 files changed, 26 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 2ce4fa51692b3..92e980574a187 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -18322,6 +18322,25 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
          "canonicalizeShuffleMaskWithHorizOp "
          "shouldn't alter the shuffle mask size");
 
+  // Canonicalize zeros/ones/fp splat constants to ensure no undefs.
+  // These will be materialized uniformly anyway, so make splat matching easier.
+  // TODO: Allow all int constants?
+  auto CanonicalizeConstant = [VT, &DL, &DAG](SDValue V) {
+    if (auto *BV = dyn_cast<BuildVectorSDNode>(V)) {
+      BitVector Undefs;
+      if (SDValue Splat = BV->getSplatValue(&Undefs)) {
+        if (Undefs.any() &&
+            (isNullConstant(Splat) || isAllOnesConstant(Splat) ||
+             isa<ConstantFPSDNode>(Splat))) {
+          V = DAG.getBitcast(VT, DAG.getSplat(BV->getValueType(0), DL, Splat));
+        }
+      }
+    }
+    return V;
+  };
+  V1 = CanonicalizeConstant(V1);
+  V2 = CanonicalizeConstant(V2);
+
   // Commute the shuffle if it will improve canonicalization.
   if (canonicalizeShuffleMaskWithCommute(Mask)) {
     ShuffleVectorSDNode::commuteMask(Mask);
diff --git a/llvm/test/CodeGen/X86/pr34592.ll b/llvm/test/CodeGen/X86/pr34592.ll
index aed5ea3ed217b..7cbdb39ddf860 100644
--- a/llvm/test/CodeGen/X86/pr34592.ll
+++ b/llvm/test/CodeGen/X86/pr34592.ll
@@ -24,12 +24,12 @@ define <16 x i64> @pluto(<16 x i64> %arg, <16 x i64> %arg1, <16 x i64> %arg2, <1
 ; CHECK-O0-NEXT:    vmovaps 48(%rbp), %ymm11
 ; CHECK-O0-NEXT:    vmovaps 16(%rbp), %ymm11
 ; CHECK-O0-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7]
+; CHECK-O0-NEXT:    vxorps %xmm3, %xmm3, %xmm3
 ; CHECK-O0-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
 ; CHECK-O0-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
 ; CHECK-O0-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,1,2,1]
 ; CHECK-O0-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5],ymm0[6,7]
 ; CHECK-O0-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm6[0,1]
-; CHECK-O0-NEXT:    vxorps %xmm3, %xmm3, %xmm3
 ; CHECK-O0-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7]
 ; CHECK-O0-NEXT:    vmovaps %xmm1, %xmm3
 ; CHECK-O0-NEXT:    vmovaps %xmm7, %xmm1
@@ -55,12 +55,12 @@ define <16 x i64> @pluto(<16 x i64> %arg, <16 x i64> %arg1, <16 x i64> %arg2, <1
 ; CHECK-O3-NEXT:    vmovdqa 208(%rbp), %ymm3
 ; CHECK-O3-NEXT:    vmovdqa 144(%rbp), %ymm0
 ; CHECK-O3-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm2[6,7]
+; CHECK-O3-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; CHECK-O3-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
 ; CHECK-O3-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; CHECK-O3-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[3,1,2,1]
 ; CHECK-O3-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
 ; CHECK-O3-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm7[2,3],ymm6[0,1]
-; CHECK-O3-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; CHECK-O3-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7]
 ; CHECK-O3-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm7[0],ymm5[0],ymm7[2],ymm5[2]
 ; CHECK-O3-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
diff --git a/llvm/test/CodeGen/X86/pr38639.ll b/llvm/test/CodeGen/X86/pr38639.ll
index 15cc7581454aa..8eb3da1190285 100644
--- a/llvm/test/CodeGen/X86/pr38639.ll
+++ b/llvm/test/CodeGen/X86/pr38639.ll
@@ -6,11 +6,8 @@ define <8 x double> @test(<4 x double> %a, <4 x double> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [8.2071743224100002E-1,8.2071743224100002E-1,8.2071743224100002E-1,8.2071743224100002E-1]
 ; CHECK-NEXT:    vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; CHECK-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
-; CHECK-NEXT:    vmovddup {{.*#+}} xmm2 = [8.2071743224100002E-1,8.2071743224100002E-1]
-; CHECK-NEXT:    # xmm2 = mem[0,0]
-; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
 ; CHECK-NEXT:    retq
   %1 = shufflevector <4 x double> %a, <4 x double> <double undef, double 0x3FEA435134576E1C, double 0x3FEA435134576E1C, double 0x3FEA435134576E1C>, <8 x i32> <i32 6, i32 5, i32 2, i32 3, i32 5, i32 1, i32 3, i32 7>
   ret <8 x double> %1
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
index 62f59e918f00c..0eb72c8bc0be4 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -2402,7 +2402,7 @@ define <4 x float> @shuffle_mem_pmovzx_v4f32(ptr %p0, ptr %p1) {
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
 ; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; AVX1-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
 ; AVX1-NEXT:    vmovaps %xmm1, (%rsi)
 ; AVX1-NEXT:    retq
@@ -2411,7 +2411,7 @@ define <4 x float> @shuffle_mem_pmovzx_v4f32(ptr %p0, ptr %p1) {
 ; AVX2OR512VL:       # %bb.0:
 ; AVX2OR512VL-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
 ; AVX2OR512VL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX2OR512VL-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2OR512VL-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; AVX2OR512VL-NEXT:    vbroadcastss %xmm0, %xmm0
 ; AVX2OR512VL-NEXT:    vmovaps %xmm1, (%rsi)
 ; AVX2OR512VL-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll
index 545a9d3e314a2..07498c1233b5d 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll
@@ -640,8 +640,7 @@ define <32 x float> @PR47534(<8 x float> %tmp) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [7,25,26,27,7,29,30,31,7,25,26,27,7,29,30,31]
-; CHECK-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [7,17,18,19,7,21,22,23,0,25,26,27,0,29,30,31]
 ; CHECK-NEXT:    vpermi2ps %zmm2, %zmm0, %zmm1
 ; CHECK-NEXT:    ret{{[l|q]}}
   %tmp1 = shufflevector <8 x float> %tmp, <8 x float> undef, <32 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>