
Commit 70a0713

RKSimon authored and memfrob committed
[X86][SSE] combineMulToPMADDWD - improve recognition of sign/zero extended upper bits
PMADDWD(v8i16 x, v8i16 y) == (v4i32) { (int)x[0]*y[0] + (int)x[1]*y[1], ..., (int)x[6]*y[6] + (int)x[7]*y[7] }

Currently combineMulToPMADDWD only folds cases where the upper 17 bits of both vXi32 inputs are known zero (i.e. the first half of each 2xi16 pair is positive and the second half is zero); this can be relaxed to require only one zero-extended input as long as the other input has at least 17 sign bits. That way the sign of the result is still preserved and the second half of each pair is still zero.

Noticed while investigating PR47437.

Differential Revision: https://reviews.llvm.org/D108522
1 parent 4f88f6b commit 70a0713
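Illustrative only (not part of the commit): a minimal C++ model of a single 32-bit PMADDWD lane, sketching why one zero-extended operand (upper 17 bits zero) paired with an operand that has at least 17 sign bits still reproduces the full 32-bit product. The helper name pmaddwd_lane is made up for this sketch.

#include <cassert>
#include <cstdint>

// One 32-bit lane of PMADDWD: treat x and y as pairs of signed 16-bit
// halves and return lo*lo + hi*hi.
static int32_t pmaddwd_lane(uint32_t x, uint32_t y) {
  int32_t xlo = (int16_t)(x & 0xFFFF), xhi = (int16_t)(x >> 16);
  int32_t ylo = (int16_t)(y & 0xFFFF), yhi = (int16_t)(y >> 16);
  return xlo * ylo + xhi * yhi;
}

int main() {
  // x spans the sign-extended range (>= 17 sign bits), y the zero-extended
  // range (upper 17 bits zero). y's high half is 0, so its product term
  // vanishes and x's high half (the sign extension) never matters.
  for (int32_t x = -32768; x < 32768; x += 257)
    for (int32_t y = 0; y < 32768; y += 129)
      assert(pmaddwd_lane((uint32_t)x, (uint32_t)y) == x * y);
  return 0;
}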

4 files changed: +55, -61 lines changed


llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 9 additions & 4 deletions
@@ -44005,8 +44005,9 @@ static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
-// If the upper 17 bits of each element are zero then we can use PMADDWD,
-// which is always at least as quick as PMULLD, except on KNL.
+// If the upper 17 bits of either element are zero and the other element are
+// zero/sign bits then we can use PMADDWD, which is always at least as quick as
+// PMULLD, except on KNL.
 static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget) {
   if (!Subtarget.hasSSE2())
@@ -44043,9 +44044,13 @@ static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
        N1.getOperand(0).getScalarValueSizeInBits() <= 8))
     return SDValue();
 
+  // Sign bits must extend through the upper 17 bits.
+  if (DAG.ComputeNumSignBits(N1) < 17 || DAG.ComputeNumSignBits(N0) < 17)
+    return SDValue();
+
+  // At least one of the elements must be zero in the upper 17 bits.
   APInt Mask17 = APInt::getHighBitsSet(32, 17);
-  if (!DAG.MaskedValueIsZero(N1, Mask17) ||
-      !DAG.MaskedValueIsZero(N0, Mask17))
+  if (!DAG.MaskedValueIsZero(N1, Mask17) && !DAG.MaskedValueIsZero(N0, Mask17))
     return SDValue();
 
   // Use SplitOpsAndApply to handle AVX splitting.
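For readability, here is a hedged, standalone restatement of the relaxed guard from the hunk above, modelled on plain 32-bit lane values rather than SDValue known-bits queries; the helpers numSignBits, upper17Zero and canFoldToPMADDWD are illustrative names, not part of the patch.

#include <cassert>
#include <cstdint>

// Leading bits equal to the sign bit, including the sign bit itself
// (the quantity that ComputeNumSignBits reports for a constant lane).
static int numSignBits(int32_t v) {
  int n = 1;
  while (n < 32 && (((uint32_t)v >> (31 - n)) & 1) == ((uint32_t)v >> 31))
    ++n;
  return n;
}

// Upper 17 bits known zero (the MaskedValueIsZero(Mask17) case).
static bool upper17Zero(int32_t v) { return ((uint32_t)v >> 15) == 0; }

// The relaxed legality check: both lanes must sign-extend through the
// upper 17 bits, and at least one lane must additionally be zero there.
static bool canFoldToPMADDWD(int32_t a, int32_t b) {
  if (numSignBits(a) < 17 || numSignBits(b) < 17)
    return false;
  return upper17Zero(a) || upper17Zero(b);
}

int main() {
  assert(canFoldToPMADDWD(100, 200));    // both zero-extended: the old fold
  assert(canFoldToPMADDWD(-5, 200));     // sext * zext: newly allowed
  assert(!canFoldToPMADDWD(-5, -7));     // neither operand zero-extended
  assert(!canFoldToPMADDWD(0x10000, 3)); // fewer than 17 sign bits
  return 0;
}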

llvm/test/CodeGen/X86/madd.ll

Lines changed: 13 additions & 17 deletions
@@ -2049,27 +2049,23 @@ define <4 x i32> @pmaddwd_negative1(<8 x i16> %A, <8 x i16> %B) {
 }
 
 ; Do not select if constant is too large
+; Lower half is too large, upper half is in range.
 define <4 x i32> @pmaddwd_negative2(<8 x i16> %A) {
 ; SSE2-LABEL: pmaddwd_negative2:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,7,42,32]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm2, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [32768,4294934528,0,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm2, %xmm6
-; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm4[0,2]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT: psrad $16, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,4294934528,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
 ; SSE2-NEXT: pmuludq %xmm3, %xmm0
-; SSE2-NEXT: pmuludq %xmm5, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2]
-; SSE2-NEXT: paddd %xmm6, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: pmuludq %xmm4, %xmm2
+; SSE2-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
+; SSE2-NEXT: paddd %xmm2, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; AVX1-LABEL: pmaddwd_negative2:
@@ -2078,7 +2074,7 @@ define <4 x i32> @pmaddwd_negative2(<8 x i16> %A) {
 ; AVX1-NEXT: vpmovsxwd %xmm1, %xmm1
 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
 ; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: retq
 ;

llvm/test/CodeGen/X86/pmaddubsw.ll

Lines changed: 13 additions & 12 deletions
@@ -291,6 +291,7 @@ define <8 x i16> @pmaddubsw_commuted_mul(<16 x i8>* %Aptr, <16 x i8>* %Bptr) {
   ret <8 x i16> %trunc
 }
 
+; If the extensions don't match see if we can use PMADDWD instead.
 define <8 x i16> @pmaddubsw_bad_extend(<16 x i8>* %Aptr, <16 x i8>* %Bptr) {
 ; SSE-LABEL: pmaddubsw_bad_extend:
 ; SSE: # %bb.0:
@@ -334,19 +335,19 @@ define <8 x i16> @pmaddubsw_bad_extend(<16 x i8>* %Aptr, <16 x i8>* %Bptr) {
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,1,1]
 ; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; AVX1-NEXT: vpmulld %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpmaddwd %xmm5, %xmm4, %xmm4
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; AVX1-NEXT: vpmulld %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpmaddwd %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm5
-; AVX1-NEXT: vpmulld %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpmaddwd %xmm5, %xmm3, %xmm3
 ; AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm3
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
-; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0
 ; AVX1-NEXT: vpackssdw %xmm0, %xmm3, %xmm0
 ; AVX1-NEXT: retq
@@ -363,10 +364,10 @@ define <8 x i16> @pmaddubsw_bad_extend(<16 x i8>* %Aptr, <16 x i8>* %Bptr) {
 ; AVX256-NEXT: vpshufb %xmm4, %xmm1, %xmm1
 ; AVX256-NEXT: vpmovsxbd %xmm3, %ymm3
 ; AVX256-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
-; AVX256-NEXT: vpmulld %ymm2, %ymm3, %ymm2
+; AVX256-NEXT: vpmaddwd %ymm2, %ymm3, %ymm2
 ; AVX256-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
 ; AVX256-NEXT: vpmovsxbd %xmm1, %ymm1
-; AVX256-NEXT: vpmulld %ymm1, %ymm0, %ymm0
+; AVX256-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0
 ; AVX256-NEXT: vpaddd %ymm0, %ymm2, %ymm0
 ; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX256-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
@@ -435,19 +436,19 @@ define <8 x i16> @pmaddubsw_bad_indices(<16 x i8>* %Aptr, <16 x i8>* %Bptr) {
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
 ; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
-; AVX1-NEXT: vpmulld %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpmaddwd %xmm5, %xmm4, %xmm4
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,1,1]
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
-; AVX1-NEXT: vpmulld %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpmaddwd %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm3
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; AVX1-NEXT: vpmulld %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpmaddwd %xmm5, %xmm3, %xmm3
 ; AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm3
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0
 ; AVX1-NEXT: vpackssdw %xmm0, %xmm3, %xmm0
 ; AVX1-NEXT: retq
@@ -462,10 +463,10 @@ define <8 x i16> @pmaddubsw_bad_indices(<16 x i8>* %Aptr, <16 x i8>* %Bptr) {
 ; AVX256-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
 ; AVX256-NEXT: vpmovsxbd %xmm2, %ymm2
 ; AVX256-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
-; AVX256-NEXT: vpmulld %ymm3, %ymm2, %ymm2
+; AVX256-NEXT: vpmaddwd %ymm3, %ymm2, %ymm2
 ; AVX256-NEXT: vpmovsxbd %xmm0, %ymm0
 ; AVX256-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
-; AVX256-NEXT: vpmulld %ymm1, %ymm0, %ymm0
+; AVX256-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0
 ; AVX256-NEXT: vpaddd %ymm0, %ymm2, %ymm0
 ; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX256-NEXT: vpackssdw %xmm1, %xmm0, %xmm0

llvm/test/CodeGen/X86/shrink_vmul.ll

Lines changed: 20 additions & 28 deletions
@@ -985,17 +985,16 @@ define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonl
 ; X86-SSE-NEXT: movl c, %esi
 ; X86-SSE-NEXT: movzwl (%edx,%ecx), %edx
 ; X86-SSE-NEXT: movd %edx, %xmm0
+; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X86-SSE-NEXT: psrad $24, %xmm0
 ; X86-SSE-NEXT: movzwl (%eax,%ecx), %eax
 ; X86-SSE-NEXT: movd %eax, %xmm1
 ; X86-SSE-NEXT: pxor %xmm2, %xmm2
 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-SSE-NEXT: psraw $8, %xmm0
-; X86-SSE-NEXT: movdqa %xmm1, %xmm2
-; X86-SSE-NEXT: pmulhw %xmm0, %xmm2
-; X86-SSE-NEXT: pmullw %xmm1, %xmm0
-; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4)
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X86-SSE-NEXT: pmaddwd %xmm0, %xmm1
+; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4)
 ; X86-SSE-NEXT: popl %esi
 ; X86-SSE-NEXT: retl
 ;
@@ -1012,7 +1011,7 @@ define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonl
 ; X86-AVX-NEXT: movzwl (%eax,%ecx), %eax
 ; X86-AVX-NEXT: vmovd %eax, %xmm1
 ; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
 ; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
 ; X86-AVX-NEXT: popl %esi
 ; X86-AVX-NEXT: retl
@@ -1022,17 +1021,16 @@ define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonl
 ; X64-SSE-NEXT: movq c(%rip), %rax
 ; X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx
 ; X64-SSE-NEXT: movd %ecx, %xmm0
+; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X64-SSE-NEXT: psrad $24, %xmm0
 ; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx
 ; X64-SSE-NEXT: movd %ecx, %xmm1
 ; X64-SSE-NEXT: pxor %xmm2, %xmm2
 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X64-SSE-NEXT: psraw $8, %xmm0
-; X64-SSE-NEXT: movdqa %xmm1, %xmm2
-; X64-SSE-NEXT: pmulhw %xmm0, %xmm2
-; X64-SSE-NEXT: pmullw %xmm1, %xmm0
-; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4)
+; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X64-SSE-NEXT: pmaddwd %xmm0, %xmm1
+; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4)
 ; X64-SSE-NEXT: retq
 ;
 ; X64-AVX-LABEL: mul_2xi8_sext_zext:
@@ -1044,7 +1042,7 @@ define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonl
 ; X64-AVX-NEXT: movzwl (%rsi,%rdx), %ecx
 ; X64-AVX-NEXT: vmovd %ecx, %xmm1
 ; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
 ; X64-AVX-NEXT: retq
 entry:
@@ -1605,11 +1603,8 @@ define void @mul_2xi8_varconst4(i8* nocapture readonly %a, i64 %index) {
 ; X86-SSE-NEXT: movd %ecx, %xmm0
 ; X86-SSE-NEXT: pxor %xmm1, %xmm1
 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u>
-; X86-SSE-NEXT: movdqa %xmm0, %xmm2
-; X86-SSE-NEXT: pmulhw %xmm1, %xmm2
-; X86-SSE-NEXT: pmullw %xmm1, %xmm0
-; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
 ; X86-SSE-NEXT: retl
 ;
@@ -1621,7 +1616,7 @@ define void @mul_2xi8_varconst4(i8* nocapture readonly %a, i64 %index) {
 ; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx
 ; X86-AVX-NEXT: vmovd %ecx, %xmm0
 ; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X86-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
 ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
 ; X86-AVX-NEXT: retl
 ;
@@ -1632,11 +1627,8 @@ define void @mul_2xi8_varconst4(i8* nocapture readonly %a, i64 %index) {
 ; X64-SSE-NEXT: movd %ecx, %xmm0
 ; X64-SSE-NEXT: pxor %xmm1, %xmm1
 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u>
-; X64-SSE-NEXT: movdqa %xmm0, %xmm2
-; X64-SSE-NEXT: pmulhw %xmm1, %xmm2
-; X64-SSE-NEXT: pmullw %xmm1, %xmm0
-; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
 ; X64-SSE-NEXT: retq
 ;
@@ -1646,7 +1638,7 @@ define void @mul_2xi8_varconst4(i8* nocapture readonly %a, i64 %index) {
 ; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx
 ; X64-AVX-NEXT: vmovd %ecx, %xmm0
 ; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X64-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
 ; X64-AVX-NEXT: retq
 entry:
