From c0a419dfac3eaee79b877d24c44730bc5a490460 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Thu, 6 Mar 2025 16:28:56 +0000
Subject: [PATCH] [X86] combineConcatVectorOps -
 concat(shuffle(x,y,m1),shuffle(x,y,m2)) ->
 shuffle(concat(x,x),concat(y,y),m3) on VBMI targets

With VBMI we are guaranteed to support cross-lane 256-bit shuffles, so
subvector splats should always be cheap.

Fixes #116931
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  9 +++-
 llvm/test/CodeGen/X86/vector-fshr-128.ll      | 11 ++---
 llvm/test/CodeGen/X86/vector-fshr-rot-128.ll  |  6 +--
 .../CodeGen/X86/vector-shuffle-256-v32.ll     | 45 ++++++++++++++-----
 4 files changed, 49 insertions(+), 22 deletions(-)
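A quick illustration of the pattern this targets (the function name and masks
below are made up for this note; it is not one of the updated tests): two
shufflevectors of the same pair of v16i8 inputs whose results are concatenated
into a v32i8. Previously this combine only fired when one of the
concatenations was free, so codegen kept two 128-bit shuffles plus a
vinserti128 (see the removed CHECK lines); with VBMI the whole thing can
instead become a single cross-lane byte shuffle of the widened inputs, e.g.
vpermi2b/vpermb as in the updated CHECK lines.

; Illustrative sketch only: models concat(shuffle(x,y,m1),shuffle(x,y,m2)) at
; the IR level for the 8-bit element case.
define <32 x i8> @concat_two_shuffles(<16 x i8> %x, <16 x i8> %y) {
  ; Both halves shuffle the same two inputs, just with different masks.
  %lo = shufflevector <16 x i8> %x, <16 x i8> %y, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
  %hi = shufflevector <16 x i8> %x, <16 x i8> %y, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
  ; Concatenate the two 128-bit halves into one 256-bit result.
  %r = shufflevector <16 x i8> %lo, <16 x i8> %hi, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <32 x i8> %r
}
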
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index deab638b7e546..68dac675e0c20 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -57948,9 +57948,14 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
 
     switch (Op0.getOpcode()) {
     case ISD::VECTOR_SHUFFLE: {
-      if (NumOps == 2 && VT.is256BitVector() &&
+      // TODO: Relax VBMI requirement for repeated shuffle ops - currently
+      // limited to targets that should always have good cross lane shuffles.
+      if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
           (EltSizeInBits >= 32 || Subtarget.hasInt256()) &&
-          (IsConcatFree(VT, Ops, 0) || IsConcatFree(VT, Ops, 1))) {
+          (IsConcatFree(VT, Ops, 0) || IsConcatFree(VT, Ops, 1) ||
+           (Ops[0].getOperand(0) == Ops[1].getOperand(0) &&
+            Ops[0].getOperand(1) == Ops[1].getOperand(1) &&
+            Subtarget.hasVBMI()))) {
         int NumSubElts = Op0.getValueType().getVectorNumElements();
         SmallVector<int> NewMask;
         for (int M : cast<ShuffleVectorSDNode>(Ops[0])->getMask()) {
diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll
index d830834a83ab4..a56b0a6351a3b 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll
@@ -1566,11 +1566,12 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
 ;
 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i8:
 ; AVX512VLVBMI2:       # %bb.0:
-; AVX512VLVBMI2-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX512VLVBMI2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX512VLVBMI2-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
-; AVX512VLVBMI2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1
-; AVX512VLVBMI2-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
+; AVX512VLVBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512VLVBMI2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47]
+; AVX512VLVBMI2-NEXT:    vpermi2b %ymm0, %ymm1, %ymm3
+; AVX512VLVBMI2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0
+; AVX512VLVBMI2-NEXT:    vpsrlw %xmm0, %ymm3, %ymm0
 ; AVX512VLVBMI2-NEXT:    vpmovwb %ymm0, %xmm0
 ; AVX512VLVBMI2-NEXT:    vzeroupper
 ; AVX512VLVBMI2-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
index 399bdb7a248b8..f45405d885377 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
@@ -1218,9 +1218,9 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind
 ;
 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i8:
 ; AVX512VLVBMI2:       # %bb.0:
-; AVX512VLVBMI2-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512VLVBMI2-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512VLVBMI2-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm0
+; AVX512VLVBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512VLVBMI2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512VLVBMI2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512VLVBMI2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX512VLVBMI2-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
 ; AVX512VLVBMI2-NEXT:    vpmovwb %ymm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
index d08bfc6e2d7ea..28d637a50109d 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -5059,18 +5059,39 @@ define void @shuffle_v32i8_store_00_08_16_24_01_09_17_25_02_10_18_26_03_11_19_27
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
-; AVX2OR512VL-LABEL: shuffle_v32i8_store_00_08_16_24_01_09_17_25_02_10_18_26_03_11_19_27_04_12_20_28_05_13_21_29_06_14_22_30_07_15_23_31:
-; AVX2OR512VL:       # %bb.0:
-; AVX2OR512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2OR512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15]
-; AVX2OR512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2OR512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2OR512VL-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2OR512VL-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX2OR512VL-NEXT:    vmovdqa %xmm0, 16(%rdi)
-; AVX2OR512VL-NEXT:    vmovdqa %xmm2, (%rdi)
-; AVX2OR512VL-NEXT:    vzeroupper
-; AVX2OR512VL-NEXT:    retq
+; AVX2-LABEL: shuffle_v32i8_store_00_08_16_24_01_09_17_25_02_10_18_26_03_11_19_27_04_12_20_28_05_13_21_29_06_14_22_30_07_15_23_31:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15]
+; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX2-NEXT:    vmovdqa %xmm0, 16(%rdi)
+; AVX2-NEXT:    vmovdqa %xmm2, (%rdi)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512VLBW-LABEL: shuffle_v32i8_store_00_08_16_24_01_09_17_25_02_10_18_26_03_11_19_27_04_12_20_28_05_13_21_29_06_14_22_30_07_15_23_31:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15]
+; AVX512VLBW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX512VLBW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX512VLBW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VLBW-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX512VLBW-NEXT:    vmovdqa %xmm0, 16(%rdi)
+; AVX512VLBW-NEXT:    vmovdqa %xmm2, (%rdi)
+; AVX512VLBW-NEXT:    vzeroupper
+; AVX512VLBW-NEXT:    retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_store_00_08_16_24_01_09_17_25_02_10_18_26_03_11_19_27_04_12_20_28_05_13_21_29_06_14_22_30_07_15_23_31:
+; AVX512VLVBMI:       # %bb.0:
+; AVX512VLVBMI-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27,4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31]
+; AVX512VLVBMI-NEXT:    vpermb %ymm0, %ymm1, %ymm0
+; AVX512VLVBMI-NEXT:    vmovdqa %ymm0, (%rdi)
+; AVX512VLVBMI-NEXT:    vzeroupper
+; AVX512VLVBMI-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: shuffle_v32i8_store_00_08_16_24_01_09_17_25_02_10_18_26_03_11_19_27_04_12_20_28_05_13_21_29_06_14_22_30_07_15_23_31:
 ; XOPAVX1:       # %bb.0: