11; RUN: opt -mattr=+simd128 -passes=loop-vectorize %s | llc -mtriple=wasm32 -mattr=+simd128 -verify-machineinstrs -o - | FileCheck %s
2- ; RUN: opt -mattr=+simd128 -passes=loop-vectorize -vectorizer-maximize-bandwidth %s | llc -mtriple=wasm32 -mattr=+simd128 -verify-machineinstrs -o - | FileCheck %s --check-prefix=MAX-BANDWIDTH
2+ ; RUN: opt -mattr=+simd128 -passes=loop-vectorize -vectorizer-maximize-bandwidth %s | llc -mtriple=wasm32 -mattr=+simd128 -verify-machineinstrs -o - | FileCheck %s
33
44target triple = "wasm32"
55
66define hidden i32 @i32_mac_s8 (ptr nocapture noundef readonly %a , ptr nocapture noundef readonly %b , i32 noundef %N ) {
77; CHECK-LABEL: i32_mac_s8:
8- ; CHECK: v128.load32_zero 0:p2align=0
9- ; CHECK: i16x8.extend_low_i8x16_s
10- ; CHECK: v128.load32_zero 0:p2align=0
11- ; CHECK: i16x8.extend_low_i8x16_s
12- ; CHECK: i32x4.extmul_low_i16x8_s
13- ; CHECK: i32x4.add
14-
15- ; MAX-BANDWIDTH: v128.load
16- ; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
17- ; MAX-BANDWIDTH: v128.load
18- ; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
19- ; MAX-BANDWIDTH: i32x4.dot_i16x8_s
20- ; MAX-BANDWIDTH: i16x8.extend_high_i8x16_s
21- ; MAX-BANDWIDTH: i16x8.extend_high_i8x16_s
22- ; MAX-BANDWIDTH: i32x4.dot_i16x8_s
23- ; MAX-BANDWIDTH: i32x4.add
24- ; MAX-BANDWIDTH: i32x4.add
8+ ; CHECK: v128.load
9+ ; CHECK: i16x8.extend_low_i8x16_s
10+ ; CHECK: v128.load
11+ ; CHECK: i16x8.extend_low_i8x16_s
12+ ; CHECK: i32x4.dot_i16x8_s
13+ ; CHECK: i16x8.extend_high_i8x16_s
14+ ; CHECK: i16x8.extend_high_i8x16_s
15+ ; CHECK: i32x4.dot_i16x8_s
16+ ; CHECK: i32x4.add
17+ ; CHECK: i32x4.add
2518
2619entry:
2720 %cmp7.not = icmp eq i32 %N , 0
@@ -49,14 +42,9 @@ for.body: ; preds = %entry, %for.body
4942
5043define hidden i32 @i32_mac_s16 (ptr nocapture noundef readonly %a , ptr nocapture noundef readonly %b , i32 noundef %N ) {
5144; CHECK-LABEL: i32_mac_s16:
52- ; CHECK: i32x4.load16x4_s 0:p2align=1
53- ; CHECK: i32x4.load16x4_s 0:p2align=1
54- ; CHECK: i32x4.mul
55- ; CHECK: i32x4.add
56-
57- ; MAX-BANDWIDTH: v128.load
58- ; MAX-BANDWIDTH: v128.load
59- ; MAX-BANDWIDTH: i32x4.dot_i16x8_s
45+ ; CHECK: v128.load
46+ ; CHECK: v128.load
47+ ; CHECK: i32x4.dot_i16x8_s
6048
6149entry:
6250 %cmp7.not = icmp eq i32 %N , 0
@@ -84,37 +72,30 @@ for.body: ; preds = %entry, %for.body
8472
8573define hidden i64 @i64_mac_s16 (ptr nocapture noundef readonly %a , ptr nocapture noundef readonly %b , i32 noundef %N ) {
8674; CHECK-LABEL: i64_mac_s16:
87- ; CHECK: v128.load32_zero 0:p2align=1
88- ; CHECK: i32x4.extend_low_i16x8_s
89- ; CHECK: v128.load32_zero 0:p2align=1
90- ; CHECK: i32x4.extend_low_i16x8_s
91- ; CHECK: i64x2.extmul_low_i32x4_s
92- ; CHECK: i64x2.add
93-
94- ; MAX-BANDWIDTH: v128.load
95- ; MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
96- ; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
97- ; MAX-BANDWIDTH: v128.load
98- ; MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
99- ; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
100- ; MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s
101- ; MAX-BANDWIDTH: i64x2.add
102- ; MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
103- ; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
104- ; MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
105- ; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
106- ; MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s
107- ; MAX-BANDWIDTH: i64x2.add
108- ; MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
109- ; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
110- ; MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
111- ; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
112- ; MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s
113- ; MAX-BANDWIDTH: i64x2.add
114- ; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
115- ; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
116- ; MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s
117- ; MAX-BANDWIDTH: i64x2.add
75+ ; CHECK: v128.load
76+ ; CHECK: i8x16.shuffle 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
77+ ; CHECK: i32x4.extend_low_i16x8_s
78+ ; CHECK: v128.load
79+ ; CHECK: i8x16.shuffle 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
80+ ; CHECK: i32x4.extend_low_i16x8_s
81+ ; CHECK: i64x2.extmul_low_i32x4_s
82+ ; CHECK: i64x2.add
83+ ; CHECK: i8x16.shuffle 8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
84+ ; CHECK: i32x4.extend_low_i16x8_s
85+ ; CHECK: i8x16.shuffle 8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
86+ ; CHECK: i32x4.extend_low_i16x8_s
87+ ; CHECK: i64x2.extmul_low_i32x4_s
88+ ; CHECK: i64x2.add
89+ ; CHECK: i8x16.shuffle 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
90+ ; CHECK: i32x4.extend_low_i16x8_s
91+ ; CHECK: i8x16.shuffle 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
92+ ; CHECK: i32x4.extend_low_i16x8_s
93+ ; CHECK: i64x2.extmul_low_i32x4_s
94+ ; CHECK: i64x2.add
95+ ; CHECK: i32x4.extend_low_i16x8_s
96+ ; CHECK: i32x4.extend_low_i16x8_s
97+ ; CHECK: i64x2.extmul_low_i32x4_s
98+ ; CHECK: i64x2.add
11899
119100entry:
120101 %cmp7.not = icmp eq i32 %N , 0
@@ -142,19 +123,13 @@ for.body: ; preds = %entry, %for.body
142123
143124define hidden i64 @i64_mac_s32 (ptr nocapture noundef readonly %a , ptr nocapture noundef readonly %b , i32 noundef %N ) {
144125; CHECK-LABEL: i64_mac_s32:
145- ; CHECK: v128.load64_zero 0:p2align=2
146- ; CHECK: v128.load64_zero 0:p2align=2
147- ; CHECK: i32x4.mul
148- ; CHECK: i64x2.extend_low_i32x4_s
149- ; CHECK: i64x2.add
150-
151- ; MAX-BANDWIDTH: v128.load
152- ; MAX-BANDWIDTH: v128.load
153- ; MAX-BANDWIDTH: i32x4.mul
154- ; MAX-BANDWIDTH: i64x2.extend_high_i32x4_s
155- ; MAX-BANDWIDTH: i64x2.add
156- ; MAX-BANDWIDTH: i64x2.extend_low_i32x4_s
157- ; MAX-BANDWIDTH: i64x2.add
126+ ; CHECK: v128.load
127+ ; CHECK: v128.load
128+ ; CHECK: i32x4.mul
129+ ; CHECK: i64x2.extend_high_i32x4_s
130+ ; CHECK: i64x2.add
131+ ; CHECK: i64x2.extend_low_i32x4_s
132+ ; CHECK: i64x2.add
158133
159134entry:
160135 %cmp6.not = icmp eq i32 %N , 0
@@ -181,25 +156,18 @@ for.body: ; preds = %entry, %for.body
181156
182157define hidden i32 @i32_mac_u8 (ptr nocapture noundef readonly %a , ptr nocapture noundef readonly %b , i32 noundef %N ) {
183158; CHECK-LABEL: i32_mac_u8:
184- ; CHECK: v128.load32_zero 0:p2align=0
185- ; CHECK: i16x8.extend_low_i8x16_u
186- ; CHECK: v128.load32_zero 0:p2align=0
187- ; CHECK: i16x8.extend_low_i8x16_u
188- ; CHECK: i32x4.extmul_low_i16x8_u
189- ; CHECK: i32x4.add
190-
191- ; MAX-BANDWIDTH: v128.load
192- ; MAX-BANDWIDTH: v128.load
193- ; MAX-BANDWIDTH: i16x8.extmul_low_i8x16_u
194- ; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
195- ; MAX-BANDWIDTH: i32x4.extend_high_i16x8_u
196- ; MAX-BANDWIDTH: i32x4.add
197- ; MAX-BANDWIDTH: i16x8.extmul_high_i8x16_u
198- ; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
199- ; MAX-BANDWIDTH: i32x4.extend_high_i16x8_u
200- ; MAX-BANDWIDTH: i32x4.add
201- ; MAX-BANDWIDTH: i32x4.add
202- ; MAX-BANDWIDTH: i32x4.add
159+ ; CHECK: v128.load
160+ ; CHECK: v128.load
161+ ; CHECK: i16x8.extmul_low_i8x16_u
162+ ; CHECK: i32x4.extend_low_i16x8_u
163+ ; CHECK: i32x4.extend_high_i16x8_u
164+ ; CHECK: i32x4.add
165+ ; CHECK: i16x8.extmul_high_i8x16_u
166+ ; CHECK: i32x4.extend_low_i16x8_u
167+ ; CHECK: i32x4.extend_high_i16x8_u
168+ ; CHECK: i32x4.add
169+ ; CHECK: i32x4.add
170+ ; CHECK: i32x4.add
203171
204172entry:
205173 %cmp7.not = icmp eq i32 %N , 0
@@ -227,17 +195,12 @@ for.body: ; preds = %entry, %for.body
227195
228196define hidden i32 @i32_mac_u16 (ptr nocapture noundef readonly %a , ptr nocapture noundef readonly %b , i32 noundef %N ) {
229197; CHECK-LABEL: i32_mac_u16:
230- ; CHECK: i32x4.load16x4_u 0:p2align=1
231- ; CHECK: i32x4.load16x4_u 0:p2align=1
232- ; CHECK: i32x4.mul
233- ; CHECK: i32x4.add
234-
235- ; MAX-BANDWIDTH: v128.load
236- ; MAX-BANDWIDTH: v128.load
237- ; MAX-BANDWIDTH: i32x4.extmul_low_i16x8_u
238- ; MAX-BANDWIDTH: i32x4.extmul_high_i16x8_u
239- ; MAX-BANDWIDTH: i32x4.add
240- ; MAX-BANDWIDTH: i32x4.add
198+ ; CHECK: v128.load
199+ ; CHECK: v128.load
200+ ; CHECK: i32x4.extmul_low_i16x8_u
201+ ; CHECK: i32x4.extmul_high_i16x8_u
202+ ; CHECK: i32x4.add
203+ ; CHECK: i32x4.add
241204
242205entry:
243206 %cmp7.not = icmp eq i32 %N , 0
@@ -265,21 +228,16 @@ for.body: ; preds = %entry, %for.body
265228
266229define hidden i32 @i32_mac_u16_s16 (ptr nocapture noundef readonly %a , ptr nocapture noundef readonly %b , i32 noundef %N ) {
267230; CHECK-LABEL: i32_mac_u16_s16:
268- ; CHECK: i32x4.load16x4_s 0:p2align=1
269- ; CHECK: i32x4.load16x4_u 0:p2align=1
270- ; CHECK: i32x4.mul
271- ; CHECK: i32x4.add
272-
273- ; MAX-BANDWIDTH: v128.load
274- ; MAX-BANDWIDTH: i32x4.extend_high_i16x8_s
275- ; MAX-BANDWIDTH: v128.load
276- ; MAX-BANDWIDTH: i32x4.extend_high_i16x8_u
277- ; MAX-BANDWIDTH: i32x4.mul
278- ; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
279- ; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
280- ; MAX-BANDWIDTH: i32x4.mul
281- ; MAX-BANDWIDTH: i32x4.add
282- ; MAX-BANDWIDTH: i32x4.add
231+ ; CHECK: v128.load
232+ ; CHECK: i32x4.extend_high_i16x8_s
233+ ; CHECK: v128.load
234+ ; CHECK: i32x4.extend_high_i16x8_u
235+ ; CHECK: i32x4.mul
236+ ; CHECK: i32x4.extend_low_i16x8_s
237+ ; CHECK: i32x4.extend_low_i16x8_u
238+ ; CHECK: i32x4.mul
239+ ; CHECK: i32x4.add
240+ ; CHECK: i32x4.add
283241
284242entry:
285243 %cmp7.not = icmp eq i32 %N , 0
@@ -307,37 +265,30 @@ for.body: ; preds = %entry, %for.body
307265
308266define hidden i64 @i64_mac_u16 (ptr nocapture noundef readonly %a , ptr nocapture noundef readonly %b , i32 noundef %N ) {
309267; CHECK-LABEL: i64_mac_u16:
310- ; CHECK: v128.load32_zero 0:p2align=1
311- ; CHECK: i32x4.extend_low_i16x8_u
312- ; CHECK: v128.load32_zero 0:p2align=1
313- ; CHECK: i32x4.extend_low_i16x8_u
314- ; CHECK: i64x2.extmul_low_i32x4_u
315- ; CHECK: i64x2.add
316-
317- ; MAX-BANDWIDTH: v128.load
318- ; MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
319- ; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
320- ; MAX-BANDWIDTH: v128.load
321- ; MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
322- ; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
323- ; MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u
324- ; MAX-BANDWIDTH: i64x2.add
325- ; MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
326- ; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
327- ; MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
328- ; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
329- ; MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u
330- ; MAX-BANDWIDTH: i64x2.add
331- ; MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
332- ; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
333- ; MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
334- ; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
335- ; MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u
336- ; MAX-BANDWIDTH: i64x2.add
337- ; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
338- ; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
339- ; MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u
340- ; MAX-BANDWIDTH: i64x2.add
268+ ; CHECK: v128.load
269+ ; CHECK: i8x16.shuffle 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
270+ ; CHECK: i32x4.extend_low_i16x8_u
271+ ; CHECK: v128.load
272+ ; CHECK: i8x16.shuffle 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
273+ ; CHECK: i32x4.extend_low_i16x8_u
274+ ; CHECK: i64x2.extmul_low_i32x4_u
275+ ; CHECK: i64x2.add
276+ ; CHECK: i8x16.shuffle 8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
277+ ; CHECK: i32x4.extend_low_i16x8_u
278+ ; CHECK: i8x16.shuffle 8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
279+ ; CHECK: i32x4.extend_low_i16x8_u
280+ ; CHECK: i64x2.extmul_low_i32x4_u
281+ ; CHECK: i64x2.add
282+ ; CHECK: i8x16.shuffle 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
283+ ; CHECK: i32x4.extend_low_i16x8_u
284+ ; CHECK: i8x16.shuffle 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
285+ ; CHECK: i32x4.extend_low_i16x8_u
286+ ; CHECK: i64x2.extmul_low_i32x4_u
287+ ; CHECK: i64x2.add
288+ ; CHECK: i32x4.extend_low_i16x8_u
289+ ; CHECK: i32x4.extend_low_i16x8_u
290+ ; CHECK: i64x2.extmul_low_i32x4_u
291+ ; CHECK: i64x2.add
341292
342293entry:
343294 %cmp8.not = icmp eq i32 %N , 0
@@ -365,19 +316,13 @@ for.body: ; preds = %entry, %for.body
365316
366317define hidden i64 @i64_mac_u32 (ptr nocapture noundef readonly %a , ptr nocapture noundef readonly %b , i32 noundef %N ) {
367318; CHECK-LABEL: i64_mac_u32:
368- ; CHECK: v128.load64_zero 0:p2align=2
369- ; CHECK: v128.load64_zero 0:p2align=2
370- ; CHECK: i32x4.mul
371- ; CHECK: i64x2.extend_low_i32x4_u
372- ; CHECK: i64x2.add
373-
374- ; MAX-BANDWIDTH: v128.load
375- ; MAX-BANDWIDTH: v128.load
376- ; MAX-BANDWIDTH: i32x4.mul
377- ; MAX-BANDWIDTH: i64x2.extend_high_i32x4_u
378- ; MAX-BANDWIDTH: i64x2.add
379- ; MAX-BANDWIDTH: i64x2.extend_low_i32x4_u
380- ; MAX-BANDWIDTH: i64x2.add
319+ ; CHECK: v128.load
320+ ; CHECK: v128.load
321+ ; CHECK: i32x4.mul
322+ ; CHECK: i64x2.extend_high_i32x4_u
323+ ; CHECK: i64x2.add
324+ ; CHECK: i64x2.extend_low_i32x4_u
325+ ; CHECK: i64x2.add
381326
382327entry:
383328 %cmp6.not = icmp eq i32 %N , 0
0 commit comments