@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver4 | FileCheck %s --check-prefixes=CHECK,ZNVER
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver5 | FileCheck %s --check-prefixes=CHECK,ZNVER
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver4 | FileCheck %s --check-prefixes=CHECK,ZNVER,AVX512BW-VNNI
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver5 | FileCheck %s --check-prefixes=CHECK,ZNVER,AVX-VNNI
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni,+fast-dpwssd | FileCheck %s --check-prefixes=CHECK,AVX512-VNNI
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni,+avx512vl,+fast-dpwssd | FileCheck %s --check-prefixes=CHECK,AVX512VL-VNNI
 
@@ -16,56 +16,28 @@ define <16 x i32> @vpdpwssd_test(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2) {
 define <16 x i32> @vpdpwssd_v16i32_accumulate(<32 x i16> %a0, <32 x i16> %a1, <16 x i32> %a2) {
 ; ZNVER-LABEL: vpdpwssd_v16i32_accumulate:
 ; ZNVER: # %bb.0:
-; ZNVER-NEXT: vpmovsxwd %ymm0, %zmm3
-; ZNVER-NEXT: vpmovsxwd %ymm1, %zmm4
-; ZNVER-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; ZNVER-NEXT: vextracti64x4 $1, %zmm1, %ymm1
-; ZNVER-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
-; ZNVER-NEXT: vpmovsxwd %ymm0, %zmm0
-; ZNVER-NEXT: vpmovsxwd %ymm1, %zmm1
-; ZNVER-NEXT: vpmulld %zmm4, %zmm3, %zmm3
-; ZNVER-NEXT: vpmovsxbd {{.*#+}} zmm4 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
-; ZNVER-NEXT: vpmulld %zmm1, %zmm0, %zmm0
-; ZNVER-NEXT: vpermi2d %zmm0, %zmm3, %zmm5
-; ZNVER-NEXT: vpermi2d %zmm0, %zmm3, %zmm4
-; ZNVER-NEXT: vpaddd %zmm2, %zmm5, %zmm0
-; ZNVER-NEXT: vpaddd %zmm4, %zmm0, %zmm0
+; ZNVER-NEXT: vpdpwssd %zmm1, %zmm0, %zmm2
+; ZNVER-NEXT: vmovdqa64 %zmm2, %zmm0
 ; ZNVER-NEXT: retq
 ;
 ; AVX512-VNNI-LABEL: vpdpwssd_v16i32_accumulate:
 ; AVX512-VNNI: # %bb.0:
-; AVX512-VNNI-NEXT: vpmovsxwd %ymm0, %zmm3
-; AVX512-VNNI-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; AVX512-VNNI-NEXT: vpmovsxwd %ymm0, %zmm0
-; AVX512-VNNI-NEXT: vpmovsxwd %ymm1, %zmm4
-; AVX512-VNNI-NEXT: vpmulld %zmm4, %zmm3, %zmm3
-; AVX512-VNNI-NEXT: vextracti64x4 $1, %zmm1, %ymm1
-; AVX512-VNNI-NEXT: vpmovsxwd %ymm1, %zmm1
-; AVX512-VNNI-NEXT: vpmulld %zmm1, %zmm0, %zmm0
-; AVX512-VNNI-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
-; AVX512-VNNI-NEXT: vpermi2d %zmm0, %zmm3, %zmm1
-; AVX512-VNNI-NEXT: vpmovsxbd {{.*#+}} zmm4 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
-; AVX512-VNNI-NEXT: vpermi2d %zmm0, %zmm3, %zmm4
-; AVX512-VNNI-NEXT: vpaddd %zmm2, %zmm1, %zmm0
-; AVX512-VNNI-NEXT: vpaddd %zmm4, %zmm0, %zmm0
+; AVX512-VNNI-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512-VNNI-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; AVX512-VNNI-NEXT: vpmaddwd %ymm3, %ymm4, %ymm3
+; AVX512-VNNI-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0
+; AVX512-VNNI-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512-VNNI-NEXT: vpaddd %zmm2, %zmm0, %zmm0
 ; AVX512-VNNI-NEXT: retq
 ;
 ; AVX512VL-VNNI-LABEL: vpdpwssd_v16i32_accumulate:
 ; AVX512VL-VNNI: # %bb.0:
-; AVX512VL-VNNI-NEXT: vpmovsxwd %ymm0, %zmm3
-; AVX512VL-VNNI-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; AVX512VL-VNNI-NEXT: vpmovsxwd %ymm0, %zmm0
-; AVX512VL-VNNI-NEXT: vpmovsxwd %ymm1, %zmm4
-; AVX512VL-VNNI-NEXT: vpmulld %zmm4, %zmm3, %zmm3
-; AVX512VL-VNNI-NEXT: vextracti64x4 $1, %zmm1, %ymm1
-; AVX512VL-VNNI-NEXT: vpmovsxwd %ymm1, %zmm1
-; AVX512VL-VNNI-NEXT: vpmulld %zmm1, %zmm0, %zmm0
-; AVX512VL-VNNI-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
-; AVX512VL-VNNI-NEXT: vpermi2d %zmm0, %zmm3, %zmm1
-; AVX512VL-VNNI-NEXT: vpmovsxbd {{.*#+}} zmm4 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
-; AVX512VL-VNNI-NEXT: vpermi2d %zmm0, %zmm3, %zmm4
-; AVX512VL-VNNI-NEXT: vpaddd %zmm2, %zmm1, %zmm0
-; AVX512VL-VNNI-NEXT: vpaddd %zmm4, %zmm0, %zmm0
+; AVX512VL-VNNI-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512VL-VNNI-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; AVX512VL-VNNI-NEXT: vpmaddwd %ymm3, %ymm4, %ymm3
+; AVX512VL-VNNI-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0
+; AVX512VL-VNNI-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512VL-VNNI-NEXT: vpaddd %zmm2, %zmm0, %zmm0
 ; AVX512VL-VNNI-NEXT: retq
   %x0 = sext <32 x i16> %a0 to <32 x i32>
   %x1 = sext <32 x i16> %a1 to <32 x i32>
@@ -78,43 +50,28 @@ define <16 x i32> @vpdpwssd_v16i32_accumulate(<32 x i16> %a0, <32 x i16> %a1, <1
 }
 
 define <8 x i32> @vpdpwssd_v8i32_accumulate(<16 x i16> %a0, <16 x i16> %a1, <8 x i32> %a2) {
-; ZNVER-LABEL: vpdpwssd_v8i32_accumulate:
-; ZNVER: # %bb.0:
-; ZNVER-NEXT: vpmovsxwd %ymm0, %zmm0
-; ZNVER-NEXT: vpmovsxwd %ymm1, %zmm1
-; ZNVER-NEXT: vpmulld %zmm1, %zmm0, %zmm0
-; ZNVER-NEXT: vpmovqd %zmm0, %ymm1
-; ZNVER-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; ZNVER-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm3[1,3],ymm0[5,7],ymm3[5,7]
-; ZNVER-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; ZNVER-NEXT: vpaddd %ymm2, %ymm0, %ymm0
-; ZNVER-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; ZNVER-NEXT: retq
+; AVX512BW-VNNI-LABEL: vpdpwssd_v8i32_accumulate:
+; AVX512BW-VNNI: # %bb.0:
+; AVX512BW-VNNI-NEXT: vpdpwssd %ymm1, %ymm0, %ymm2
+; AVX512BW-VNNI-NEXT: vmovdqa %ymm2, %ymm0
+; AVX512BW-VNNI-NEXT: retq
+;
+; AVX-VNNI-LABEL: vpdpwssd_v8i32_accumulate:
+; AVX-VNNI: # %bb.0:
+; AVX-VNNI-NEXT: {vex} vpdpwssd %ymm1, %ymm0, %ymm2
+; AVX-VNNI-NEXT: vmovdqa %ymm2, %ymm0
+; AVX-VNNI-NEXT: retq
 ;
 ; AVX512-VNNI-LABEL: vpdpwssd_v8i32_accumulate:
 ; AVX512-VNNI: # %bb.0:
-; AVX512-VNNI-NEXT: vpmovsxwd %ymm0, %zmm0
-; AVX512-VNNI-NEXT: vpmovsxwd %ymm1, %zmm1
-; AVX512-VNNI-NEXT: vpmulld %zmm1, %zmm0, %zmm0
-; AVX512-VNNI-NEXT: vpmovqd %zmm0, %ymm1
-; AVX512-VNNI-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512-VNNI-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm3[1,3],ymm0[5,7],ymm3[5,7]
-; AVX512-VNNI-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512-VNNI-NEXT: vpaddd %ymm2, %ymm1, %ymm1
-; AVX512-VNNI-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; AVX512-VNNI-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0
+; AVX512-VNNI-NEXT: vpaddd %ymm2, %ymm0, %ymm0
 ; AVX512-VNNI-NEXT: retq
 ;
 ; AVX512VL-VNNI-LABEL: vpdpwssd_v8i32_accumulate:
 ; AVX512VL-VNNI: # %bb.0:
-; AVX512VL-VNNI-NEXT: vpmovsxwd %ymm0, %zmm0
-; AVX512VL-VNNI-NEXT: vpmovsxwd %ymm1, %zmm1
-; AVX512VL-VNNI-NEXT: vpmulld %zmm1, %zmm0, %zmm0
-; AVX512VL-VNNI-NEXT: vpmovqd %zmm0, %ymm1
-; AVX512VL-VNNI-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512VL-VNNI-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm3[1,3],ymm0[5,7],ymm3[5,7]
-; AVX512VL-VNNI-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512VL-VNNI-NEXT: vpaddd %ymm2, %ymm1, %ymm1
-; AVX512VL-VNNI-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; AVX512VL-VNNI-NEXT: vpdpwssd %ymm1, %ymm0, %ymm2
+; AVX512VL-VNNI-NEXT: vmovdqa %ymm2, %ymm0
 ; AVX512VL-VNNI-NEXT: retq
   %x0 = sext <16 x i16> %a0 to <16 x i32>
   %x1 = sext <16 x i16> %a1 to <16 x i32>
@@ -127,43 +84,28 @@ define <8 x i32> @vpdpwssd_v8i32_accumulate(<16 x i16> %a0, <16 x i16> %a1, <8 x
 }
 
 define <4 x i32> @vpdpwssd_v4i32_accumulate(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2) {
-; ZNVER-LABEL: vpdpwssd_v4i32_accumulate:
-; ZNVER: # %bb.0:
-; ZNVER-NEXT: vpmovsxwd %xmm0, %ymm0
-; ZNVER-NEXT: vpmovsxwd %xmm1, %ymm1
-; ZNVER-NEXT: vpmulld %ymm1, %ymm0, %ymm0
-; ZNVER-NEXT: vpmovqd %ymm0, %xmm1
-; ZNVER-NEXT: vextracti128 $1, %ymm0, %xmm3
-; ZNVER-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[1,3]
-; ZNVER-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; ZNVER-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; ZNVER-NEXT: vzeroupper
-; ZNVER-NEXT: retq
+; AVX512BW-VNNI-LABEL: vpdpwssd_v4i32_accumulate:
+; AVX512BW-VNNI: # %bb.0:
+; AVX512BW-VNNI-NEXT: vpdpwssd %xmm1, %xmm0, %xmm2
+; AVX512BW-VNNI-NEXT: vmovdqa %xmm2, %xmm0
+; AVX512BW-VNNI-NEXT: retq
+;
+; AVX-VNNI-LABEL: vpdpwssd_v4i32_accumulate:
+; AVX-VNNI: # %bb.0:
+; AVX-VNNI-NEXT: {vex} vpdpwssd %xmm1, %xmm0, %xmm2
+; AVX-VNNI-NEXT: vmovdqa %xmm2, %xmm0
+; AVX-VNNI-NEXT: retq
 ;
 ; AVX512-VNNI-LABEL: vpdpwssd_v4i32_accumulate:
 ; AVX512-VNNI: # %bb.0:
-; AVX512-VNNI-NEXT: vpmovsxwd %xmm0, %ymm0
-; AVX512-VNNI-NEXT: vpmovsxwd %xmm1, %ymm1
-; AVX512-VNNI-NEXT: vpmulld %ymm1, %ymm0, %ymm0
-; AVX512-VNNI-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-VNNI-NEXT: vshufps {{.*#+}} xmm3 = xmm0[0,2],xmm1[0,2]
-; AVX512-VNNI-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; AVX512-VNNI-NEXT: vpaddd %xmm2, %xmm3, %xmm1
-; AVX512-VNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-VNNI-NEXT: vzeroupper
+; AVX512-VNNI-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
+; AVX512-VNNI-NEXT: vpaddd %xmm2, %xmm0, %xmm0
 ; AVX512-VNNI-NEXT: retq
 ;
 ; AVX512VL-VNNI-LABEL: vpdpwssd_v4i32_accumulate:
 ; AVX512VL-VNNI: # %bb.0:
-; AVX512VL-VNNI-NEXT: vpmovsxwd %xmm0, %ymm0
-; AVX512VL-VNNI-NEXT: vpmovsxwd %xmm1, %ymm1
-; AVX512VL-VNNI-NEXT: vpmulld %ymm1, %ymm0, %ymm0
-; AVX512VL-VNNI-NEXT: vpmovqd %ymm0, %xmm1
-; AVX512VL-VNNI-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX512VL-VNNI-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[1,3]
-; AVX512VL-VNNI-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; AVX512VL-VNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512VL-VNNI-NEXT: vzeroupper
+; AVX512VL-VNNI-NEXT: vpdpwssd %xmm1, %xmm0, %xmm2
+; AVX512VL-VNNI-NEXT: vmovdqa %xmm2, %xmm0
 ; AVX512VL-VNNI-NEXT: retq
   %x0 = sext <8 x i16> %a0 to <8 x i32>
   %x1 = sext <8 x i16> %a1 to <8 x i32>