11; RUN: opt -mattr=+simd128 -passes=loop-vectorize %s | llc -mtriple=wasm32 -mattr=+simd128 -verify-machineinstrs -o - | FileCheck %s
22; RUN: opt -mattr=+simd128 -passes=loop-vectorize -vectorizer-maximize-bandwidth %s | llc -mtriple=wasm32 -mattr=+simd128 -verify-machineinstrs -o - | FileCheck %s --check-prefix=MAX-BANDWIDTH
3+ ; RUN: opt -mattr=+simd128,+relaxed-simd -passes=loop-vectorize -vectorizer-maximize-bandwidth %s | llc -mtriple=wasm32 -mattr=+simd128,+relaxed-simd -verify-machineinstrs -o - | FileCheck %s --check-prefix=RELAXED-MAX-BANDWIDTH
34
45target triple = "wasm32"
56
@@ -23,6 +24,10 @@ define hidden i32 @i32_mac_s8(ptr nocapture noundef readonly %a, ptr nocapture n
2324; MAX-BANDWIDTH: i32x4.add
2425; MAX-BANDWIDTH: i32x4.add
2526
27+ ; RELAXED-MAX-BANDWIDTH: v128.load
28+ ; RELAXED-MAX-BANDWIDTH: v128.load
29+ ; RELAXED-MAX-BANDWIDTH: i32x4.relaxed_dot_i8x16_i7x16_add_s
30+
2631entry:
2732 %cmp7.not = icmp eq i32 %N , 0
2833 br i1 %cmp7.not , label %for.cond.cleanup , label %for.body
@@ -47,6 +52,109 @@ for.body: ; preds = %entry, %for.body
4752 br i1 %exitcond.not , label %for.cond.cleanup , label %for.body
4853}
4954
55+ define hidden i32 @i32_mac_u8_s8 (ptr nocapture noundef readonly %a , ptr nocapture noundef readonly %b , i32 noundef %N ) { ; i32 multiply-accumulate loop: returns the sum over i in [0, N) of sext(a[i]) * zext(b[i]) on i8 elements, or 0 when N == 0
56+ ; CHECK-LABEL: i32_mac_u8_s8:
57+ ; CHECK: loop
58+ ; CHECK: v128.load32_zero
59+ ; CHECK: i16x8.extend_low_i8x16_u
60+ ; CHECK: i32x4.extend_low_i16x8_u
61+ ; CHECK: v128.load32_zero
62+ ; CHECK: i16x8.extend_low_i8x16_s
63+ ; CHECK: i32x4.extend_low_i16x8_s
64+ ; CHECK: i32x4.mul
65+ ; CHECK: i32x4.add
66+
67+ ; MAX-BANDWIDTH: loop
68+ ; MAX-BANDWIDTH: v128.load
69+ ; MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
70+ ; MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
71+ ; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
72+ ; MAX-BANDWIDTH: v128.load
73+ ; MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
74+ ; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
75+ ; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
76+ ; MAX-BANDWIDTH: i32x4.mul
77+ ; MAX-BANDWIDTH: i32x4.add
78+ ; MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
79+ ; MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
80+ ; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
81+ ; MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
82+ ; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
83+ ; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
84+ ; MAX-BANDWIDTH: i32x4.mul
85+ ; MAX-BANDWIDTH: i32x4.add
86+ ; MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
87+ ; MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
88+ ; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
89+ ; MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
90+ ; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
91+ ; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
92+ ; MAX-BANDWIDTH: i32x4.mul
93+ ; MAX-BANDWIDTH: i32x4.add
94+ ; MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
95+ ; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
96+ ; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
97+ ; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
98+ ; MAX-BANDWIDTH: i32x4.mul
99+ ; MAX-BANDWIDTH: i32x4.add
100+
101+ ; RELAXED-MAX-BANDWIDTH: loop
102+ ; RELAXED-MAX-BANDWIDTH: v128.load
103+ ; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
104+ ; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
105+ ; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
106+ ; RELAXED-MAX-BANDWIDTH: v128.load
107+ ; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
108+ ; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
109+ ; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
110+ ; RELAXED-MAX-BANDWIDTH: i32x4.mul
111+ ; RELAXED-MAX-BANDWIDTH: i32x4.add
112+ ; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
113+ ; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
114+ ; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
115+ ; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
116+ ; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
117+ ; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
118+ ; RELAXED-MAX-BANDWIDTH: i32x4.mul
119+ ; RELAXED-MAX-BANDWIDTH: i32x4.add
120+ ; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
121+ ; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
122+ ; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
123+ ; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
124+ ; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
125+ ; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
126+ ; RELAXED-MAX-BANDWIDTH: i32x4.mul
127+ ; RELAXED-MAX-BANDWIDTH: i32x4.add
128+ ; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
129+ ; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
130+ ; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
131+ ; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
132+ ; RELAXED-MAX-BANDWIDTH: i32x4.mul
133+ ; RELAXED-MAX-BANDWIDTH: i32x4.add
134+ entry:
135+ %cmp7.not = icmp eq i32 %N , 0 ; true when there are no elements to process
136+ br i1 %cmp7.not , label %for.cond.cleanup , label %for.body ; N == 0 bypasses the loop and goes straight to the return block
137+
138+ for.cond.cleanup: ; preds = %for.body, %entry
139+ %res.0.lcssa = phi i32 [ 0 , %entry ], [ %add , %for.body ] ; 0 on the bypass path, otherwise the last accumulated sum
140+ ret i32 %res.0.lcssa
141+
142+ for.body: ; preds = %entry, %for.body
143+ %i.09 = phi i32 [ %inc , %for.body ], [ 0 , %entry ] ; element index i, starting at 0
144+ %res.08 = phi i32 [ %add , %for.body ], [ 0 , %entry ] ; running sum, starting at 0
145+ %arrayidx = getelementptr inbounds i8 , ptr %a , i32 %i.09 ; address of a[i]
146+ %0 = load i8 , ptr %arrayidx , align 1
147+ %conv = sext i8 %0 to i32 ; a[i] is widened as a signed byte. NOTE(review): the function name lists u8 first, yet %a is the sign-extended operand - confirm the name/operand order is intended
148+ %arrayidx1 = getelementptr inbounds i8 , ptr %b , i32 %i.09 ; address of b[i]
149+ %1 = load i8 , ptr %arrayidx1 , align 1
150+ %conv2 = zext i8 %1 to i32 ; b[i] is widened as an unsigned byte
151+ %mul = mul nsw i32 %conv2 , %conv ; mixed-signedness product zext(b[i]) * sext(a[i])
152+ %add = add nsw i32 %mul , %res.08 ; fold the product into the running sum
153+ %inc = add nuw i32 %i.09 , 1
154+ %exitcond.not = icmp eq i32 %inc , %N ; loop ends once the next index equals N
155+ br i1 %exitcond.not , label %for.cond.cleanup , label %for.body
156+ }
157+
50158define hidden i32 @i32_mac_s16 (ptr nocapture noundef readonly %a , ptr nocapture noundef readonly %b , i32 noundef %N ) {
51159; CHECK-LABEL: i32_mac_s16:
52160; CHECK: i32x4.load16x4_s 0:p2align=1
@@ -57,6 +165,12 @@ define hidden i32 @i32_mac_s16(ptr nocapture noundef readonly %a, ptr nocapture
57165; MAX-BANDWIDTH: v128.load
58166; MAX-BANDWIDTH: v128.load
59167; MAX-BANDWIDTH: i32x4.dot_i16x8_s
168+ ; MAX-BANDWIDTH: i32x4.add
169+
170+ ; RELAXED-MAX-BANDWIDTH: v128.load
171+ ; RELAXED-MAX-BANDWIDTH: v128.load
172+ ; RELAXED-MAX-BANDWIDTH: i32x4.dot_i16x8_s
173+ ; RELAXED-MAX-BANDWIDTH: i32x4.add
60174
61175entry:
62176 %cmp7.not = icmp eq i32 %N , 0
@@ -116,6 +230,31 @@ define hidden i64 @i64_mac_s16(ptr nocapture noundef readonly %a, ptr nocapture
116230; MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s
117231; MAX-BANDWIDTH: i64x2.add
118232
233+ ; RELAXED-MAX-BANDWIDTH: v128.load
234+ ; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
235+ ; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
236+ ; RELAXED-MAX-BANDWIDTH: v128.load
237+ ; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
238+ ; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
239+ ; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s
240+ ; RELAXED-MAX-BANDWIDTH: i64x2.add
241+ ; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
242+ ; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
243+ ; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
244+ ; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
245+ ; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s
246+ ; RELAXED-MAX-BANDWIDTH: i64x2.add
247+ ; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
248+ ; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
249+ ; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
250+ ; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
251+ ; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s
252+ ; RELAXED-MAX-BANDWIDTH: i64x2.add
253+ ; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
254+ ; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
255+ ; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s
256+ ; RELAXED-MAX-BANDWIDTH: i64x2.add
257+
119258entry:
120259 %cmp7.not = icmp eq i32 %N , 0
121260 br i1 %cmp7.not , label %for.cond.cleanup , label %for.body
@@ -156,6 +295,14 @@ define hidden i64 @i64_mac_s32(ptr nocapture noundef readonly %a, ptr nocapture
156295; MAX-BANDWIDTH: i64x2.extend_low_i32x4_s
157296; MAX-BANDWIDTH: i64x2.add
158297
298+ ; RELAXED-MAX-BANDWIDTH: v128.load
299+ ; RELAXED-MAX-BANDWIDTH: v128.load
300+ ; RELAXED-MAX-BANDWIDTH: i32x4.mul
301+ ; RELAXED-MAX-BANDWIDTH: i64x2.extend_high_i32x4_s
302+ ; RELAXED-MAX-BANDWIDTH: i64x2.add
303+ ; RELAXED-MAX-BANDWIDTH: i64x2.extend_low_i32x4_s
304+ ; RELAXED-MAX-BANDWIDTH: i64x2.add
305+
159306entry:
160307 %cmp6.not = icmp eq i32 %N , 0
161308 br i1 %cmp6.not , label %for.cond.cleanup , label %for.body
@@ -197,6 +344,15 @@ define hidden i32 @i32_mac_u8(ptr nocapture noundef readonly %a, ptr nocapture n
197344; MAX-BANDWIDTH: i32x4.add
198345; MAX-BANDWIDTH: i32x4.add
199346
347+ ; RELAXED-MAX-BANDWIDTH: v128.load
348+ ; RELAXED-MAX-BANDWIDTH: v128.load
349+ ; RELAXED-MAX-BANDWIDTH: i16x8.extmul_low_i8x16_u
350+ ; RELAXED-MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_u
351+ ; RELAXED-MAX-BANDWIDTH: i16x8.extmul_high_i8x16_u
352+ ; RELAXED-MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_u
353+ ; RELAXED-MAX-BANDWIDTH: i32x4.add
354+ ; RELAXED-MAX-BANDWIDTH: i32x4.add
355+
200356entry:
201357 %cmp7.not = icmp eq i32 %N , 0
202358 br i1 %cmp7.not , label %for.cond.cleanup , label %for.body
@@ -235,6 +391,13 @@ define hidden i32 @i32_mac_u16(ptr nocapture noundef readonly %a, ptr nocapture
235391; MAX-BANDWIDTH: i32x4.add
236392; MAX-BANDWIDTH: i32x4.add
237393
394+ ; RELAXED-MAX-BANDWIDTH: v128.load
395+ ; RELAXED-MAX-BANDWIDTH: v128.load
396+ ; RELAXED-MAX-BANDWIDTH: i32x4.extmul_low_i16x8_u
397+ ; RELAXED-MAX-BANDWIDTH: i32x4.extmul_high_i16x8_u
398+ ; RELAXED-MAX-BANDWIDTH: i32x4.add
399+ ; RELAXED-MAX-BANDWIDTH: i32x4.add
400+
238401entry:
239402 %cmp7.not = icmp eq i32 %N , 0
240403 br i1 %cmp7.not , label %for.cond.cleanup , label %for.body
@@ -277,6 +440,17 @@ define hidden i32 @i32_mac_u16_s16(ptr nocapture noundef readonly %a, ptr nocapt
277440; MAX-BANDWIDTH: i32x4.add
278441; MAX-BANDWIDTH: i32x4.add
279442
443+ ; RELAXED-MAX-BANDWIDTH: v128.load
444+ ; RELAXED-MAX-BANDWIDTH: i32x4.extend_high_i16x8_s
445+ ; RELAXED-MAX-BANDWIDTH: v128.load
446+ ; RELAXED-MAX-BANDWIDTH: i32x4.extend_high_i16x8_u
447+ ; RELAXED-MAX-BANDWIDTH: i32x4.mul
448+ ; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
449+ ; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
450+ ; RELAXED-MAX-BANDWIDTH: i32x4.mul
451+ ; RELAXED-MAX-BANDWIDTH: i32x4.add
452+ ; RELAXED-MAX-BANDWIDTH: i32x4.add
453+
280454entry:
281455 %cmp7.not = icmp eq i32 %N , 0
282456 br i1 %cmp7.not , label %for.cond.cleanup , label %for.body
@@ -335,6 +509,32 @@ define hidden i64 @i64_mac_u16(ptr nocapture noundef readonly %a, ptr nocapture
335509; MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u
336510; MAX-BANDWIDTH: i64x2.add
337511
512+ ; RELAXED-MAX-BANDWIDTH: v128.load
513+ ; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
514+ ; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
515+ ; RELAXED-MAX-BANDWIDTH: v128.load
516+ ; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
517+ ; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
518+ ; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u
519+ ; RELAXED-MAX-BANDWIDTH: i64x2.add
520+ ; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
521+ ; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
522+ ; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
523+ ; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
524+ ; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u
525+ ; RELAXED-MAX-BANDWIDTH: i64x2.add
526+ ; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
527+ ; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
528+ ; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
529+ ; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
530+ ; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u
531+ ; RELAXED-MAX-BANDWIDTH: i64x2.add
532+ ; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
533+ ; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
534+ ; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u
535+ ; RELAXED-MAX-BANDWIDTH: i64x2.add
536+
537+
338538entry:
339539 %cmp8.not = icmp eq i32 %N , 0
340540 br i1 %cmp8.not , label %for.cond.cleanup , label %for.body
@@ -375,6 +575,14 @@ define hidden i64 @i64_mac_u32(ptr nocapture noundef readonly %a, ptr nocapture
375575; MAX-BANDWIDTH: i64x2.extend_low_i32x4_u
376576; MAX-BANDWIDTH: i64x2.add
377577
578+ ; RELAXED-MAX-BANDWIDTH: v128.load
579+ ; RELAXED-MAX-BANDWIDTH: v128.load
580+ ; RELAXED-MAX-BANDWIDTH: i32x4.mul
581+ ; RELAXED-MAX-BANDWIDTH: i64x2.extend_high_i32x4_u
582+ ; RELAXED-MAX-BANDWIDTH: i64x2.add
583+ ; RELAXED-MAX-BANDWIDTH: i64x2.extend_low_i32x4_u
584+ ; RELAXED-MAX-BANDWIDTH: i64x2.add
585+
378586entry:
379587 %cmp6.not = icmp eq i32 %N , 0
380588 br i1 %cmp6.not , label %for.cond.cleanup , label %for.body
0 commit comments