@@ -985,17 +985,16 @@ define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonl
985
985
; X86-SSE-NEXT: movl c, %esi
986
986
; X86-SSE-NEXT: movzwl (%edx,%ecx), %edx
987
987
; X86-SSE-NEXT: movd %edx, %xmm0
988
+ ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
989
+ ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
990
+ ; X86-SSE-NEXT: psrad $24, %xmm0
988
991
; X86-SSE-NEXT: movzwl (%eax,%ecx), %eax
989
992
; X86-SSE-NEXT: movd %eax, %xmm1
990
993
; X86-SSE-NEXT: pxor %xmm2, %xmm2
991
994
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
992
- ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
993
- ; X86-SSE-NEXT: psraw $8, %xmm0
994
- ; X86-SSE-NEXT: movdqa %xmm1, %xmm2
995
- ; X86-SSE-NEXT: pmulhw %xmm0, %xmm2
996
- ; X86-SSE-NEXT: pmullw %xmm1, %xmm0
997
- ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
998
- ; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4)
995
+ ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
996
+ ; X86-SSE-NEXT: pmaddwd %xmm0, %xmm1
997
+ ; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4)
999
998
; X86-SSE-NEXT: popl %esi
1000
999
; X86-SSE-NEXT: retl
1001
1000
;
@@ -1012,7 +1011,7 @@ define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonl
1012
1011
; X86-AVX-NEXT: movzwl (%eax,%ecx), %eax
1013
1012
; X86-AVX-NEXT: vmovd %eax, %xmm1
1014
1013
; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
1015
- ; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
1014
+ ; X86-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
1016
1015
; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
1017
1016
; X86-AVX-NEXT: popl %esi
1018
1017
; X86-AVX-NEXT: retl
@@ -1022,17 +1021,16 @@ define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonl
1022
1021
; X64-SSE-NEXT: movq c(%rip), %rax
1023
1022
; X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx
1024
1023
; X64-SSE-NEXT: movd %ecx, %xmm0
1024
+ ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1025
+ ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1026
+ ; X64-SSE-NEXT: psrad $24, %xmm0
1025
1027
; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx
1026
1028
; X64-SSE-NEXT: movd %ecx, %xmm1
1027
1029
; X64-SSE-NEXT: pxor %xmm2, %xmm2
1028
1030
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
1029
- ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1030
- ; X64-SSE-NEXT: psraw $8, %xmm0
1031
- ; X64-SSE-NEXT: movdqa %xmm1, %xmm2
1032
- ; X64-SSE-NEXT: pmulhw %xmm0, %xmm2
1033
- ; X64-SSE-NEXT: pmullw %xmm1, %xmm0
1034
- ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1035
- ; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4)
1031
+ ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1032
+ ; X64-SSE-NEXT: pmaddwd %xmm0, %xmm1
1033
+ ; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4)
1036
1034
; X64-SSE-NEXT: retq
1037
1035
;
1038
1036
; X64-AVX-LABEL: mul_2xi8_sext_zext:
@@ -1044,7 +1042,7 @@ define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonl
1044
1042
; X64-AVX-NEXT: movzwl (%rsi,%rdx), %ecx
1045
1043
; X64-AVX-NEXT: vmovd %ecx, %xmm1
1046
1044
; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
1047
- ; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
1045
+ ; X64-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
1048
1046
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
1049
1047
; X64-AVX-NEXT: retq
1050
1048
entry:
@@ -1605,11 +1603,8 @@ define void @mul_2xi8_varconst4(i8* nocapture readonly %a, i64 %index) {
1605
1603
; X86-SSE-NEXT: movd %ecx, %xmm0
1606
1604
; X86-SSE-NEXT: pxor %xmm1, %xmm1
1607
1605
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1608
- ; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u>
1609
- ; X86-SSE-NEXT: movdqa %xmm0, %xmm2
1610
- ; X86-SSE-NEXT: pmulhw %xmm1, %xmm2
1611
- ; X86-SSE-NEXT: pmullw %xmm1, %xmm0
1612
- ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1606
+ ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1607
+ ; X86-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
1613
1608
; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
1614
1609
; X86-SSE-NEXT: retl
1615
1610
;
@@ -1621,7 +1616,7 @@ define void @mul_2xi8_varconst4(i8* nocapture readonly %a, i64 %index) {
1621
1616
; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx
1622
1617
; X86-AVX-NEXT: vmovd %ecx, %xmm0
1623
1618
; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1624
- ; X86-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
1619
+ ; X86-AVX-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
1625
1620
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
1626
1621
; X86-AVX-NEXT: retl
1627
1622
;
@@ -1632,11 +1627,8 @@ define void @mul_2xi8_varconst4(i8* nocapture readonly %a, i64 %index) {
1632
1627
; X64-SSE-NEXT: movd %ecx, %xmm0
1633
1628
; X64-SSE-NEXT: pxor %xmm1, %xmm1
1634
1629
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1635
- ; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u>
1636
- ; X64-SSE-NEXT: movdqa %xmm0, %xmm2
1637
- ; X64-SSE-NEXT: pmulhw %xmm1, %xmm2
1638
- ; X64-SSE-NEXT: pmullw %xmm1, %xmm0
1639
- ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1630
+ ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1631
+ ; X64-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1640
1632
; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
1641
1633
; X64-SSE-NEXT: retq
1642
1634
;
@@ -1646,7 +1638,7 @@ define void @mul_2xi8_varconst4(i8* nocapture readonly %a, i64 %index) {
1646
1638
; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx
1647
1639
; X64-AVX-NEXT: vmovd %ecx, %xmm0
1648
1640
; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1649
- ; X64-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1641
+ ; X64-AVX-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1650
1642
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
1651
1643
; X64-AVX-NEXT: retq
1652
1644
entry:
0 commit comments