From 259d7115a6b7efe14153d1f05c378421b6d9e1f7 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Tue, 13 May 2025 14:28:20 +0100
Subject: [PATCH] [X86] IsElementEquivalent - add basic handling for X86ISD::SUBV_BROADCAST_LOAD

If we're broadcasting a subvector, then the elements at equal positions
within each subvector are the same.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       |   8 ++
 ...d_vector_inreg_of_broadcast_from_memory.ll |  19 ++-
 ...d_vector_inreg_of_broadcast_from_memory.ll | 108 +++++++-----------
 3 files changed, 56 insertions(+), 79 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index e3bb5db07ac40..fdf5c339c7169 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -10010,6 +10010,14 @@ static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
   case X86ISD::VBROADCAST_LOAD:
     // TODO: Handle MaskSize != VT.getVectorNumElements()?
     return (Op == ExpectedOp && (int)VT.getVectorNumElements() == MaskSize);
+  case X86ISD::SUBV_BROADCAST_LOAD:
+    // TODO: Handle MaskSize != VT.getVectorNumElements()?
+    if (Op == ExpectedOp && (int)VT.getVectorNumElements() == MaskSize) {
+      auto *MemOp = cast<MemSDNode>(Op);
+      unsigned NumMemElts = MemOp->getMemoryVT().getVectorNumElements();
+      return (Idx % NumMemElts) == (ExpectedIdx % NumMemElts);
+    }
+    break;
   case X86ISD::HADD:
   case X86ISD::HSUB:
   case X86ISD::FHADD:
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
index 61e122b1aba36..d6e6ad184e03f 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -1560,8 +1560,7 @@ define void @vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4(ptr %in.
 ;
 ; AVX2-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
+; AVX2-NEXT: vpbroadcastd (%rdi), %ymm0
 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1],ymm0[2],mem[3],ymm0[4],mem[5],ymm0[6],mem[7]
 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
 ; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
@@ -3381,15 +3380,13 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in.
 ;
 ; AVX2-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd (%rdi), %xmm0
-; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
-; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,1,4,5,4,5]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5,6,7]
-; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1
-; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
-; AVX2-NEXT: vmovdqa %ymm1, (%rdx)
+; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0
+; AVX2-NEXT: vpbroadcastd (%rdi), %ymm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5,6,7]
+; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
+; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
index a3e2fb5321f32..f42b03a077220 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -1560,8 +1560,7 @@ define void @vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4(ptr %in.
 ;
 ; AVX2-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
+; AVX2-NEXT: vpbroadcastd (%rdi), %ymm0
 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1],ymm0[2],mem[3],ymm0[4],mem[5],ymm0[6],mem[7]
 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
 ; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
@@ -3723,19 +3722,17 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in.
 ;
 ; AVX2-SLOW-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6:
 ; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpbroadcastd (%rdi), %xmm0
-; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1
-; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,1,4,5,4,5]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5,6,7]
+; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm0
+; AVX2-SLOW-NEXT: vpbroadcastd (%rdi), %ymm1
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5,6,7]
 ; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7]
-; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rdx)
-; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rdx)
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6],ymm2[7]
+; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rdx)
+; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rdx)
 ; AVX2-SLOW-NEXT: vzeroupper
 ; AVX2-SLOW-NEXT: retq
 ;
@@ -3743,16 +3740,15 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in.
 ; AVX2-FAST-PERLANE: # %bb.0:
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm1
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero
-; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,1,4,5,4,5]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7]
-; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm1, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rdx)
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rdx)
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm0, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rsi), %ymm2, %ymm1
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rdx)
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rdx)
 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
 ; AVX2-FAST-PERLANE-NEXT: retq
 ;
@@ -3760,16 +3756,15 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in.
 ; AVX2-FAST: # %bb.0:
 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm1
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero
-; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,1,4,5,4,5]
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5,6,7]
-; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7]
-; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm1
-; AVX2-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rdx)
-; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rdx)
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero
+; AVX2-FAST-NEXT: vpbroadcastd %xmm0, %ymm0
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4,5,6,7]
+; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpaddb 32(%rsi), %ymm2, %ymm1
+; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rdx)
+; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rdx)
 ; AVX2-FAST-NEXT: vzeroupper
 ; AVX2-FAST-NEXT: retq
 ;
@@ -5317,40 +5312,17 @@ define void @vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8(ptr %in.
 ; AVX-NEXT: vzeroupper
 ; AVX-NEXT: retq
 ;
-; AVX2-SLOW-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
-; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
-; AVX2-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1
-; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rdx)
-; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rdx)
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-PERLANE-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8:
-; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3],zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,ymm0[16,17,18,19],zero,zero,zero,zero,ymm0[16,17,18,19],zero,zero,zero,zero
-; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rdx)
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rdx)
-; AVX2-FAST-PERLANE-NEXT: vzeroupper
-; AVX2-FAST-PERLANE-NEXT: retq
-;
-; AVX2-FAST-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3],zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,ymm0[16,17,18,19],zero,zero,zero,zero,ymm0[16,17,18,19],zero,zero,zero,zero
-; AVX2-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1
-; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rdx)
-; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rdx)
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
+; AVX2-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastd (%rdi), %ymm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1
+; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
+; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
 ;
 ; AVX512F-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8:
 ; AVX512F: # %bb.0: