From 125c6cb9de6ac7851f0453f46ae53aa86b28f61f Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 23 May 2025 09:59:42 +0100 Subject: [PATCH] [X86] lowerVECTOR_SHUFFLE - canonicalize zeros/ones/fp splat constants to ensure no undefs Make it easier for splat/element-equivalent detection by ensuring constant splats contain no undefs. Integer constants are limited to rematerializable zeros/ones values to avoid unnecessary scalar_to_vector(int) -> load conversions - we can relax this later if useful --- llvm/lib/Target/X86/X86ISelLowering.cpp | 19 +++++++++++++++++++ llvm/test/CodeGen/X86/pr34592.ll | 4 ++-- llvm/test/CodeGen/X86/pr38639.ll | 7 ++----- .../test/CodeGen/X86/vector-shuffle-128-v4.ll | 4 ++-- .../test/CodeGen/X86/vector-shuffle-avx512.ll | 3 +-- 5 files changed, 26 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 2ce4fa51692b3..92e980574a187 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -18322,6 +18322,25 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget, "canonicalizeShuffleMaskWithHorizOp " "shouldn't alter the shuffle mask size"); + // Canonicalize zeros/ones/fp splat constants to ensure no undefs. + // These will be materialized uniformly anyway, so make splat matching easier. + // TODO: Allow all int constants? + auto CanonicalizeConstant = [VT, &DL, &DAG](SDValue V) { + if (auto *BV = dyn_cast<BuildVectorSDNode>(V)) { + BitVector Undefs; + if (SDValue Splat = BV->getSplatValue(&Undefs)) { + if (Undefs.any() && + (isNullConstant(Splat) || isAllOnesConstant(Splat) || + isa<ConstantFPSDNode>(Splat))) { + V = DAG.getBitcast(VT, DAG.getSplat(BV->getValueType(0), DL, Splat)); + } + } + } + return V; + }; + V1 = CanonicalizeConstant(V1); + V2 = CanonicalizeConstant(V2); + // Commute the shuffle if it will improve canonicalization. 
if (canonicalizeShuffleMaskWithCommute(Mask)) { ShuffleVectorSDNode::commuteMask(Mask); diff --git a/llvm/test/CodeGen/X86/pr34592.ll b/llvm/test/CodeGen/X86/pr34592.ll index aed5ea3ed217b..7cbdb39ddf860 100644 --- a/llvm/test/CodeGen/X86/pr34592.ll +++ b/llvm/test/CodeGen/X86/pr34592.ll @@ -24,12 +24,12 @@ define <16 x i64> @pluto(<16 x i64> %arg, <16 x i64> %arg1, <16 x i64> %arg2, <1 ; CHECK-O0-NEXT: vmovaps 48(%rbp), %ymm11 ; CHECK-O0-NEXT: vmovaps 16(%rbp), %ymm11 ; CHECK-O0-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; CHECK-O0-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-O0-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; CHECK-O0-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] ; CHECK-O0-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,1,2,1] ; CHECK-O0-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5],ymm0[6,7] ; CHECK-O0-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm6[0,1] -; CHECK-O0-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-O0-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] ; CHECK-O0-NEXT: vmovaps %xmm1, %xmm3 ; CHECK-O0-NEXT: vmovaps %xmm7, %xmm1 @@ -55,12 +55,12 @@ define <16 x i64> @pluto(<16 x i64> %arg, <16 x i64> %arg1, <16 x i64> %arg2, <1 ; CHECK-O3-NEXT: vmovdqa 208(%rbp), %ymm3 ; CHECK-O3-NEXT: vmovdqa 144(%rbp), %ymm0 ; CHECK-O3-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm2[6,7] +; CHECK-O3-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-O3-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] ; CHECK-O3-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; CHECK-O3-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,1,2,1] ; CHECK-O3-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; CHECK-O3-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm7[2,3],ymm6[0,1] -; CHECK-O3-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-O3-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; CHECK-O3-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm7[0],ymm5[0],ymm7[2],ymm5[2] ; CHECK-O3-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] diff --git 
a/llvm/test/CodeGen/X86/pr38639.ll b/llvm/test/CodeGen/X86/pr38639.ll index 15cc7581454aa..8eb3da1190285 100644 --- a/llvm/test/CodeGen/X86/pr38639.ll +++ b/llvm/test/CodeGen/X86/pr38639.ll @@ -6,11 +6,8 @@ define <8 x double> @test(<4 x double> %a, <4 x double> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm1 = [8.2071743224100002E-1,8.2071743224100002E-1,8.2071743224100002E-1,8.2071743224100002E-1] ; CHECK-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; CHECK-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; CHECK-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; CHECK-NEXT: vmovddup {{.*#+}} xmm2 = [8.2071743224100002E-1,8.2071743224100002E-1] -; CHECK-NEXT: # xmm2 = mem[0,0] -; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; CHECK-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] ; CHECK-NEXT: retq %1 = shufflevector <4 x double> %a, <4 x double> , <8 x i32> ret <8 x double> %1 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll index 62f59e918f00c..0eb72c8bc0be4 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll @@ -2402,7 +2402,7 @@ define <4 x float> @shuffle_mem_pmovzx_v4f32(ptr %p0, ptr %p1) { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX1-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] ; AVX1-NEXT: vmovaps %xmm1, (%rsi) ; AVX1-NEXT: retq @@ -2411,7 +2411,7 @@ define <4 x float> @shuffle_mem_pmovzx_v4f32(ptr %p0, ptr %p1) { ; AVX2OR512VL: # %bb.0: ; AVX2OR512VL-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] ; AVX2OR512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX2OR512VL-NEXT: vunpckhps 
{{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2OR512VL-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX2OR512VL-NEXT: vbroadcastss %xmm0, %xmm0 ; AVX2OR512VL-NEXT: vmovaps %xmm1, (%rsi) ; AVX2OR512VL-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll index 545a9d3e314a2..07498c1233b5d 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll @@ -640,8 +640,7 @@ define <32 x float> @PR47534(<8 x float> %tmp) { ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [7,25,26,27,7,29,30,31,7,25,26,27,7,29,30,31] -; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm1 = [7,17,18,19,7,21,22,23,0,25,26,27,0,29,30,31] ; CHECK-NEXT: vpermi2ps %zmm2, %zmm0, %zmm1 ; CHECK-NEXT: ret{{[l|q]}} %tmp1 = shufflevector <8 x float> %tmp, <8 x float> undef, <32 x i32>