; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
- ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver4 | FileCheck %s
- ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver5 | FileCheck %s
- ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni,+fast-dpwssd | FileCheck %s
+ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver4 | FileCheck %s --check-prefixes=CHECK,ZNVER
+ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver5 | FileCheck %s --check-prefixes=CHECK,ZNVER
+ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni,+fast-dpwssd | FileCheck %s --check-prefixes=CHECK,AVX512-VNNI
+ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni,+avx512vl,+fast-dpwssd | FileCheck %s --check-prefixes=CHECK,AVX512VL-VNNI

define <16 x i32> @vpdpwssd_test(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2) {
; CHECK-LABEL: vpdpwssd_test:
@@ -11,3 +12,165 @@ define <16 x i32> @vpdpwssd_test(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2) {
  %4 = tail call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
  ret <16 x i32> %4
}
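For reference (a note on semantics, not part of the test file): vpdpwssd computes a signed i16 dot product into i32 accumulators, without saturation (the saturating form is vpdpwssds). Per dword lane i, with src the accumulator and a, b the multiplicands viewed as word lanes:

; Per-lane semantics of VPDPWSSD:
;   dst.dword[i] = src.dword[i]
;                + sext(a.word[2*i+0]) * sext(b.word[2*i+0])
;                + sext(a.word[2*i+1]) * sext(b.word[2*i+1])

The accumulate tests added below spell out exactly this pattern in generic IR (sext, mul, de-interleaving shufflevectors, adds), which a backend combine could fold back into the instruction.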
+
+ define <16 x i32> @vpdpwssd_v16i32_accumulate(<32 x i16> %a0, <32 x i16> %a1, <16 x i32> %a2) {
+ ; ZNVER-LABEL: vpdpwssd_v16i32_accumulate:
+ ; ZNVER: # %bb.0:
+ ; ZNVER-NEXT: vpmovsxwd %ymm0, %zmm3
+ ; ZNVER-NEXT: vpmovsxwd %ymm1, %zmm4
+ ; ZNVER-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+ ; ZNVER-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+ ; ZNVER-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
+ ; ZNVER-NEXT: vpmovsxwd %ymm0, %zmm0
+ ; ZNVER-NEXT: vpmovsxwd %ymm1, %zmm1
+ ; ZNVER-NEXT: vpmulld %zmm4, %zmm3, %zmm3
+ ; ZNVER-NEXT: vpmovsxbd {{.*#+}} zmm4 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
+ ; ZNVER-NEXT: vpmulld %zmm1, %zmm0, %zmm0
+ ; ZNVER-NEXT: vpermi2d %zmm0, %zmm3, %zmm5
+ ; ZNVER-NEXT: vpermi2d %zmm0, %zmm3, %zmm4
+ ; ZNVER-NEXT: vpaddd %zmm2, %zmm5, %zmm0
+ ; ZNVER-NEXT: vpaddd %zmm4, %zmm0, %zmm0
+ ; ZNVER-NEXT: retq
+ ;
+ ; AVX512-VNNI-LABEL: vpdpwssd_v16i32_accumulate:
+ ; AVX512-VNNI: # %bb.0:
+ ; AVX512-VNNI-NEXT: vpmovsxwd %ymm0, %zmm3
+ ; AVX512-VNNI-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+ ; AVX512-VNNI-NEXT: vpmovsxwd %ymm0, %zmm0
+ ; AVX512-VNNI-NEXT: vpmovsxwd %ymm1, %zmm4
+ ; AVX512-VNNI-NEXT: vpmulld %zmm4, %zmm3, %zmm3
+ ; AVX512-VNNI-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+ ; AVX512-VNNI-NEXT: vpmovsxwd %ymm1, %zmm1
+ ; AVX512-VNNI-NEXT: vpmulld %zmm1, %zmm0, %zmm0
+ ; AVX512-VNNI-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
+ ; AVX512-VNNI-NEXT: vpermi2d %zmm0, %zmm3, %zmm1
+ ; AVX512-VNNI-NEXT: vpmovsxbd {{.*#+}} zmm4 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
+ ; AVX512-VNNI-NEXT: vpermi2d %zmm0, %zmm3, %zmm4
+ ; AVX512-VNNI-NEXT: vpaddd %zmm2, %zmm1, %zmm0
+ ; AVX512-VNNI-NEXT: vpaddd %zmm4, %zmm0, %zmm0
+ ; AVX512-VNNI-NEXT: retq
+ ;
+ ; AVX512VL-VNNI-LABEL: vpdpwssd_v16i32_accumulate:
+ ; AVX512VL-VNNI: # %bb.0:
+ ; AVX512VL-VNNI-NEXT: vpmovsxwd %ymm0, %zmm3
+ ; AVX512VL-VNNI-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+ ; AVX512VL-VNNI-NEXT: vpmovsxwd %ymm0, %zmm0
+ ; AVX512VL-VNNI-NEXT: vpmovsxwd %ymm1, %zmm4
+ ; AVX512VL-VNNI-NEXT: vpmulld %zmm4, %zmm3, %zmm3
+ ; AVX512VL-VNNI-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+ ; AVX512VL-VNNI-NEXT: vpmovsxwd %ymm1, %zmm1
+ ; AVX512VL-VNNI-NEXT: vpmulld %zmm1, %zmm0, %zmm0
+ ; AVX512VL-VNNI-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
+ ; AVX512VL-VNNI-NEXT: vpermi2d %zmm0, %zmm3, %zmm1
+ ; AVX512VL-VNNI-NEXT: vpmovsxbd {{.*#+}} zmm4 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
+ ; AVX512VL-VNNI-NEXT: vpermi2d %zmm0, %zmm3, %zmm4
+ ; AVX512VL-VNNI-NEXT: vpaddd %zmm2, %zmm1, %zmm0
+ ; AVX512VL-VNNI-NEXT: vpaddd %zmm4, %zmm0, %zmm0
+ ; AVX512VL-VNNI-NEXT: retq
+   %x0 = sext <32 x i16> %a0 to <32 x i32>
+   %x1 = sext <32 x i16> %a1 to <32 x i32>
+   %m = mul nsw <32 x i32> %x0, %x1
+   %lo = shufflevector <32 x i32> %m, <32 x i32> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+   %hi = shufflevector <32 x i32> %m, <32 x i32> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+   %r0 = add <16 x i32> %lo, %a2
+   %r1 = add <16 x i32> %r0, %hi
+   ret <16 x i32> %r1
+ }
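vpdpwssd_v16i32_accumulate should compute the same value as the @llvm.x86.avx512.vpdpwssd.512 intrinsic from the first test: the even/odd de-interleave selects the two products of each word pair, and wrapping adds are associative. For comparison, a hand-written direct-intrinsic form (an illustrative sketch, not part of the diff; the i16 operands are bitcast to the intrinsic's i32 operand type, with the accumulator as the first argument as in vpdpwssd_test):

declare <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>)

define <16 x i32> @vpdpwssd_v16i32_direct(<32 x i16> %a0, <32 x i16> %a1, <16 x i32> %a2) {
  ; Reinterpret the word vectors as dword lanes; each i32 lane holds one pair of i16s.
  %b0 = bitcast <32 x i16> %a0 to <16 x i32>
  %b1 = bitcast <32 x i16> %a1 to <16 x i32>
  ; acc + even products + odd products, matching (%lo + %a2) + %hi above.
  %r = tail call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %a2, <16 x i32> %b0, <16 x i32> %b1)
  ret <16 x i32> %r
}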
+
+ define <8 x i32> @vpdpwssd_v8i32_accumulate(<16 x i16> %a0, <16 x i16> %a1, <8 x i32> %a2) {
+ ; ZNVER-LABEL: vpdpwssd_v8i32_accumulate:
+ ; ZNVER: # %bb.0:
+ ; ZNVER-NEXT: vpmovsxwd %ymm0, %zmm0
+ ; ZNVER-NEXT: vpmovsxwd %ymm1, %zmm1
+ ; ZNVER-NEXT: vpmulld %zmm1, %zmm0, %zmm0
+ ; ZNVER-NEXT: vpmovqd %zmm0, %ymm1
+ ; ZNVER-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+ ; ZNVER-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm3[1,3],ymm0[5,7],ymm3[5,7]
+ ; ZNVER-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+ ; ZNVER-NEXT: vpaddd %ymm2, %ymm0, %ymm0
+ ; ZNVER-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+ ; ZNVER-NEXT: retq
+ ;
+ ; AVX512-VNNI-LABEL: vpdpwssd_v8i32_accumulate:
+ ; AVX512-VNNI: # %bb.0:
+ ; AVX512-VNNI-NEXT: vpmovsxwd %ymm0, %zmm0
+ ; AVX512-VNNI-NEXT: vpmovsxwd %ymm1, %zmm1
+ ; AVX512-VNNI-NEXT: vpmulld %zmm1, %zmm0, %zmm0
+ ; AVX512-VNNI-NEXT: vpmovqd %zmm0, %ymm1
+ ; AVX512-VNNI-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+ ; AVX512-VNNI-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm3[1,3],ymm0[5,7],ymm3[5,7]
+ ; AVX512-VNNI-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+ ; AVX512-VNNI-NEXT: vpaddd %ymm2, %ymm1, %ymm1
+ ; AVX512-VNNI-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+ ; AVX512-VNNI-NEXT: retq
+ ;
+ ; AVX512VL-VNNI-LABEL: vpdpwssd_v8i32_accumulate:
+ ; AVX512VL-VNNI: # %bb.0:
+ ; AVX512VL-VNNI-NEXT: vpmovsxwd %ymm0, %zmm0
+ ; AVX512VL-VNNI-NEXT: vpmovsxwd %ymm1, %zmm1
+ ; AVX512VL-VNNI-NEXT: vpmulld %zmm1, %zmm0, %zmm0
+ ; AVX512VL-VNNI-NEXT: vpmovqd %zmm0, %ymm1
+ ; AVX512VL-VNNI-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+ ; AVX512VL-VNNI-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm3[1,3],ymm0[5,7],ymm3[5,7]
+ ; AVX512VL-VNNI-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+ ; AVX512VL-VNNI-NEXT: vpaddd %ymm2, %ymm1, %ymm1
+ ; AVX512VL-VNNI-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+ ; AVX512VL-VNNI-NEXT: retq
+   %x0 = sext <16 x i16> %a0 to <16 x i32>
+   %x1 = sext <16 x i16> %a1 to <16 x i32>
+   %m = mul nsw <16 x i32> %x0, %x1
+   %lo = shufflevector <16 x i32> %m, <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+   %hi = shufflevector <16 x i32> %m, <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+   %r0 = add <8 x i32> %hi, %a2
+   %r1 = add <8 x i32> %lo, %r0
+   ret <8 x i32> %r1
+ }
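The 256-bit variant is what the new fourth RUN line exercises: without AVX512VL the 256-bit vpdpwssd is unavailable, so only the AVX512VL-VNNI configuration could use it directly. A direct-intrinsic equivalent under that assumption (illustrative sketch; the 256-bit intrinsic mirrors the 512-bit one):

declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>)

define <8 x i32> @vpdpwssd_v8i32_direct(<16 x i16> %a0, <16 x i16> %a1, <8 x i32> %a2) {
  ; Same shape as the 512-bit sketch, at YMM width (needs AVX512VL + AVX512VNNI).
  %b0 = bitcast <16 x i16> %a0 to <8 x i32>
  %b1 = bitcast <16 x i16> %a1 to <8 x i32>
  %r = tail call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %a2, <8 x i32> %b0, <8 x i32> %b1)
  ret <8 x i32> %r
}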
+
+ define <4 x i32> @vpdpwssd_v4i32_accumulate(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2) {
+ ; ZNVER-LABEL: vpdpwssd_v4i32_accumulate:
+ ; ZNVER: # %bb.0:
+ ; ZNVER-NEXT: vpmovsxwd %xmm0, %ymm0
+ ; ZNVER-NEXT: vpmovsxwd %xmm1, %ymm1
+ ; ZNVER-NEXT: vpmulld %ymm1, %ymm0, %ymm0
+ ; ZNVER-NEXT: vpmovqd %ymm0, %xmm1
+ ; ZNVER-NEXT: vextracti128 $1, %ymm0, %xmm3
+ ; ZNVER-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[1,3]
+ ; ZNVER-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+ ; ZNVER-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+ ; ZNVER-NEXT: vzeroupper
+ ; ZNVER-NEXT: retq
+ ;
+ ; AVX512-VNNI-LABEL: vpdpwssd_v4i32_accumulate:
+ ; AVX512-VNNI: # %bb.0:
+ ; AVX512-VNNI-NEXT: vpmovsxwd %xmm0, %ymm0
+ ; AVX512-VNNI-NEXT: vpmovsxwd %xmm1, %ymm1
+ ; AVX512-VNNI-NEXT: vpmulld %ymm1, %ymm0, %ymm0
+ ; AVX512-VNNI-NEXT: vextracti128 $1, %ymm0, %xmm1
+ ; AVX512-VNNI-NEXT: vshufps {{.*#+}} xmm3 = xmm0[0,2],xmm1[0,2]
+ ; AVX512-VNNI-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+ ; AVX512-VNNI-NEXT: vpaddd %xmm2, %xmm3, %xmm1
+ ; AVX512-VNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+ ; AVX512-VNNI-NEXT: vzeroupper
+ ; AVX512-VNNI-NEXT: retq
+ ;
+ ; AVX512VL-VNNI-LABEL: vpdpwssd_v4i32_accumulate:
+ ; AVX512VL-VNNI: # %bb.0:
+ ; AVX512VL-VNNI-NEXT: vpmovsxwd %xmm0, %ymm0
+ ; AVX512VL-VNNI-NEXT: vpmovsxwd %xmm1, %ymm1
+ ; AVX512VL-VNNI-NEXT: vpmulld %ymm1, %ymm0, %ymm0
+ ; AVX512VL-VNNI-NEXT: vpmovqd %ymm0, %xmm1
+ ; AVX512VL-VNNI-NEXT: vextracti128 $1, %ymm0, %xmm3
+ ; AVX512VL-VNNI-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[1,3]
+ ; AVX512VL-VNNI-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+ ; AVX512VL-VNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+ ; AVX512VL-VNNI-NEXT: vzeroupper
+ ; AVX512VL-VNNI-NEXT: retq
+   %x0 = sext <8 x i16> %a0 to <8 x i32>
+   %x1 = sext <8 x i16> %a1 to <8 x i32>
+   %m = mul nsw <8 x i32> %x0, %x1
+   %lo = shufflevector <8 x i32> %m, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+   %hi = shufflevector <8 x i32> %m, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+   %r0 = add <4 x i32> %lo, %a2
+   %r1 = add <4 x i32> %hi, %r0
+   ret <4 x i32> %r1
+ }
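Likewise at XMM width, assuming the 128-bit AVX512VL form of the intrinsic (again illustrative, not part of the diff):

declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>)

define <4 x i32> @vpdpwssd_v4i32_direct(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2) {
  ; acc + even products + odd products over four word pairs.
  %b0 = bitcast <8 x i16> %a0 to <4 x i32>
  %b1 = bitcast <8 x i16> %a1 to <4 x i32>
  %r = tail call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %a2, <4 x i32> %b0, <4 x i32> %b1)
  ret <4 x i32> %r
}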