Skip to content

Commit c2dc2f8

Browse files
authored
[X86] lowerShuffleAsDecomposedShuffleMerge - prefer permute+unpck patterns vs blend+permute on pre-SSE41 targets (#160301)
Pre-SSE41 we don't have BLENDI, so blend patterns tend to get expanded to more complex shuffles. Fixes 128-bit case from #159670.
1 parent bcc1e75 commit c2dc2f8

File tree

3 files changed

+22
-29
lines changed

3 files changed

+22
-29
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11721,10 +11721,19 @@ static SDValue lowerShuffleAsDecomposedShuffleMerge(
1172111721
// we'll have to do 2x as many shuffles in order to achieve this, a 2-input
1172211722
// pre-shuffle first is a better strategy.
1172311723
if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
11724+
// If we don't have blends, see if we can create a cheap unpack.
11725+
if (!Subtarget.hasSSE41() && VT.is128BitVector() &&
11726+
(is128BitUnpackShuffleMask(V1Mask, DAG) ||
11727+
is128BitUnpackShuffleMask(V2Mask, DAG)))
11728+
if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
11729+
DL, VT, V1, V2, Mask, Subtarget, DAG))
11730+
return PermUnpack;
11731+
1172411732
// Only prefer immediate blends to unpack/rotate.
11725-
if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
11726-
DAG, true))
11733+
if (SDValue BlendPerm =
11734+
lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG, true))
1172711735
return BlendPerm;
11736+
1172811737
// If either input vector provides only a single element which is repeated
1172911738
// multiple times, unpacking from both input vectors would generate worse
1173011739
// code. e.g. for
@@ -11736,13 +11745,16 @@ static SDValue lowerShuffleAsDecomposedShuffleMerge(
1173611745
if (SDValue UnpackPerm =
1173711746
lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG))
1173811747
return UnpackPerm;
11748+
1173911749
if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
1174011750
DL, VT, V1, V2, Mask, Subtarget, DAG))
1174111751
return RotatePerm;
11752+
1174211753
// Unpack/rotate failed - try again with variable blends.
1174311754
if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
1174411755
DAG))
1174511756
return BlendPerm;
11757+
1174611758
if (VT.getScalarSizeInBits() >= 32)
1174711759
if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
1174811760
DL, VT, V1, V2, Mask, Subtarget, DAG))

llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll

Lines changed: 5 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1051,28 +1051,11 @@ define <16 x i8> @shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31(
10511051

10521052
; PR159670
10531053
define <16 x i8> @shuffle_v16i8_00_24_01_25_02_26_03_27_04_28_05_29_06_30_07_31(<16 x i8> %a, <16 x i8> %b) {
1054-
; SSE2-LABEL: shuffle_v16i8_00_24_01_25_02_26_03_27_04_28_05_29_06_30_07_31:
1055-
; SSE2: # %bb.0:
1056-
; SSE2-NEXT: pxor %xmm2, %xmm2
1057-
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
1058-
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
1059-
; SSE2-NEXT: movdqa %xmm0, %xmm2
1060-
; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
1061-
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1062-
; SSE2-NEXT: packuswb %xmm2, %xmm0
1063-
; SSE2-NEXT: retq
1064-
;
1065-
; SSSE3-LABEL: shuffle_v16i8_00_24_01_25_02_26_03_27_04_28_05_29_06_30_07_31:
1066-
; SSSE3: # %bb.0:
1067-
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
1068-
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1069-
; SSSE3-NEXT: retq
1070-
;
1071-
; SSE41-LABEL: shuffle_v16i8_00_24_01_25_02_26_03_27_04_28_05_29_06_30_07_31:
1072-
; SSE41: # %bb.0:
1073-
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
1074-
; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1075-
; SSE41-NEXT: retq
1054+
; SSE-LABEL: shuffle_v16i8_00_24_01_25_02_26_03_27_04_28_05_29_06_30_07_31:
1055+
; SSE: # %bb.0:
1056+
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
1057+
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1058+
; SSE-NEXT: retq
10761059
;
10771060
; AVX-LABEL: shuffle_v16i8_00_24_01_25_02_26_03_27_04_28_05_29_06_30_07_31:
10781061
; AVX: # %bb.0:

llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -362,11 +362,9 @@ define <8 x i16> @shuf_089uuuuu(<8 x i16> %a0, <8 x i16> %a1) {
362362
define <16 x i8> @shuffle_8_18_uuuuuuuuuuuuuu(<16 x i8> %a, <16 x i8> %b) {
363363
; AMD10H-LABEL: shuffle_8_18_uuuuuuuuuuuuuu:
364364
; AMD10H: # %bb.0:
365-
; AMD10H-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
366-
; AMD10H-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
367-
; AMD10H-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
368-
; AMD10H-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
369-
; AMD10H-NEXT: packuswb %xmm0, %xmm0
365+
; AMD10H-NEXT: psrld $16, %xmm1
366+
; AMD10H-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
367+
; AMD10H-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
370368
; AMD10H-NEXT: retq
371369
;
372370
; BTVER1-LABEL: shuffle_8_18_uuuuuuuuuuuuuu:

0 commit comments

Comments
 (0)