Skip to content

Commit 71c2091

Browse files
committed
[LV] Enable considering higher VFs when data extend ops are present in the loop
The loop vectorizer (LV) currently limits the maximum VF based on the widest type in the loop. This might not be beneficial for loops that contain data extend operations (e.g. zext, sext). In some cases, this strategy has been found to prevent higher VFs from being considered even though a higher VF might be profitable. This patch relaxes the constraint so that higher VFs are considered, and leaves it to the cost model to decide whether a particular VF is beneficial.
1 parent 3cd6b86 commit 71c2091

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

50 files changed

+2909
-2284
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4123,6 +4123,15 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
41234123
auto MaxVectorElementCount = ElementCount::get(
41244124
llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType),
41254125
ComputeScalableMaxVF);
4126+
4127+
// For loops with extend operations e.g. zext, sext etc., limiting the max VF
4128+
// based on widest type inhibits considering higher VFs even though
4129+
// vectorizing with higher VF might be profitable. In such cases, we should
4130+
// limit the max VF based on smallest type and the decision whether a
4131+
// particular VF is beneficial or not be left to cost model.
4132+
if (WidestType != SmallestType)
4133+
MaximizeBandwidth = true;
4134+
41264135
MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
41274136
LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
41284137
<< (MaxVectorElementCount * WidestType) << " bits.\n");

llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll

Lines changed: 104 additions & 159 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,20 @@
11
; RUN: opt -mattr=+simd128 -passes=loop-vectorize %s | llc -mtriple=wasm32 -mattr=+simd128 -verify-machineinstrs -o - | FileCheck %s
2-
; RUN: opt -mattr=+simd128 -passes=loop-vectorize -vectorizer-maximize-bandwidth %s | llc -mtriple=wasm32 -mattr=+simd128 -verify-machineinstrs -o - | FileCheck %s --check-prefix=MAX-BANDWIDTH
2+
; RUN: opt -mattr=+simd128 -passes=loop-vectorize -vectorizer-maximize-bandwidth %s | llc -mtriple=wasm32 -mattr=+simd128 -verify-machineinstrs -o - | FileCheck %s
33

44
target triple = "wasm32"
55

66
define hidden i32 @i32_mac_s8(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {
77
; CHECK-LABEL: i32_mac_s8:
8-
; CHECK: v128.load32_zero 0:p2align=0
9-
; CHECK: i16x8.extend_low_i8x16_s
10-
; CHECK: v128.load32_zero 0:p2align=0
11-
; CHECK: i16x8.extend_low_i8x16_s
12-
; CHECK: i32x4.extmul_low_i16x8_s
13-
; CHECK: i32x4.add
14-
15-
; MAX-BANDWIDTH: v128.load
16-
; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
17-
; MAX-BANDWIDTH: v128.load
18-
; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
19-
; MAX-BANDWIDTH: i32x4.dot_i16x8_s
20-
; MAX-BANDWIDTH: i16x8.extend_high_i8x16_s
21-
; MAX-BANDWIDTH: i16x8.extend_high_i8x16_s
22-
; MAX-BANDWIDTH: i32x4.dot_i16x8_s
23-
; MAX-BANDWIDTH: i32x4.add
24-
; MAX-BANDWIDTH: i32x4.add
8+
; CHECK: v128.load
9+
; CHECK: i16x8.extend_low_i8x16_s
10+
; CHECK: v128.load
11+
; CHECK: i16x8.extend_low_i8x16_s
12+
; CHECK: i32x4.dot_i16x8_s
13+
; CHECK: i16x8.extend_high_i8x16_s
14+
; CHECK: i16x8.extend_high_i8x16_s
15+
; CHECK: i32x4.dot_i16x8_s
16+
; CHECK: i32x4.add
17+
; CHECK: i32x4.add
2518

2619
entry:
2720
%cmp7.not = icmp eq i32 %N, 0
@@ -49,14 +42,9 @@ for.body: ; preds = %entry, %for.body
4942

5043
define hidden i32 @i32_mac_s16(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {
5144
; CHECK-LABEL: i32_mac_s16:
52-
; CHECK: i32x4.load16x4_s 0:p2align=1
53-
; CHECK: i32x4.load16x4_s 0:p2align=1
54-
; CHECK: i32x4.mul
55-
; CHECK: i32x4.add
56-
57-
; MAX-BANDWIDTH: v128.load
58-
; MAX-BANDWIDTH: v128.load
59-
; MAX-BANDWIDTH: i32x4.dot_i16x8_s
45+
; CHECK: v128.load
46+
; CHECK: v128.load
47+
; CHECK: i32x4.dot_i16x8_s
6048

6149
entry:
6250
%cmp7.not = icmp eq i32 %N, 0
@@ -84,37 +72,30 @@ for.body: ; preds = %entry, %for.body
8472

8573
define hidden i64 @i64_mac_s16(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {
8674
; CHECK-LABEL: i64_mac_s16:
87-
; CHECK: v128.load32_zero 0:p2align=1
88-
; CHECK: i32x4.extend_low_i16x8_s
89-
; CHECK: v128.load32_zero 0:p2align=1
90-
; CHECK: i32x4.extend_low_i16x8_s
91-
; CHECK: i64x2.extmul_low_i32x4_s
92-
; CHECK: i64x2.add
93-
94-
; MAX-BANDWIDTH: v128.load
95-
; MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
96-
; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
97-
; MAX-BANDWIDTH: v128.load
98-
; MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
99-
; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
100-
; MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s
101-
; MAX-BANDWIDTH: i64x2.add
102-
; MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
103-
; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
104-
; MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
105-
; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
106-
; MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s
107-
; MAX-BANDWIDTH: i64x2.add
108-
; MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
109-
; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
110-
; MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
111-
; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
112-
; MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s
113-
; MAX-BANDWIDTH: i64x2.add
114-
; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
115-
; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
116-
; MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s
117-
; MAX-BANDWIDTH: i64x2.add
75+
; CHECK: v128.load
76+
; CHECK: i8x16.shuffle 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
77+
; CHECK: i32x4.extend_low_i16x8_s
78+
; CHECK: v128.load
79+
; CHECK: i8x16.shuffle 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
80+
; CHECK: i32x4.extend_low_i16x8_s
81+
; CHECK: i64x2.extmul_low_i32x4_s
82+
; CHECK: i64x2.add
83+
; CHECK: i8x16.shuffle 8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
84+
; CHECK: i32x4.extend_low_i16x8_s
85+
; CHECK: i8x16.shuffle 8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
86+
; CHECK: i32x4.extend_low_i16x8_s
87+
; CHECK: i64x2.extmul_low_i32x4_s
88+
; CHECK: i64x2.add
89+
; CHECK: i8x16.shuffle 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
90+
; CHECK: i32x4.extend_low_i16x8_s
91+
; CHECK: i8x16.shuffle 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
92+
; CHECK: i32x4.extend_low_i16x8_s
93+
; CHECK: i64x2.extmul_low_i32x4_s
94+
; CHECK: i64x2.add
95+
; CHECK: i32x4.extend_low_i16x8_s
96+
; CHECK: i32x4.extend_low_i16x8_s
97+
; CHECK: i64x2.extmul_low_i32x4_s
98+
; CHECK: i64x2.add
11899

119100
entry:
120101
%cmp7.not = icmp eq i32 %N, 0
@@ -142,19 +123,13 @@ for.body: ; preds = %entry, %for.body
142123

143124
define hidden i64 @i64_mac_s32(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {
144125
; CHECK-LABEL: i64_mac_s32:
145-
; CHECK: v128.load64_zero 0:p2align=2
146-
; CHECK: v128.load64_zero 0:p2align=2
147-
; CHECK: i32x4.mul
148-
; CHECK: i64x2.extend_low_i32x4_s
149-
; CHECK: i64x2.add
150-
151-
; MAX-BANDWIDTH: v128.load
152-
; MAX-BANDWIDTH: v128.load
153-
; MAX-BANDWIDTH: i32x4.mul
154-
; MAX-BANDWIDTH: i64x2.extend_high_i32x4_s
155-
; MAX-BANDWIDTH: i64x2.add
156-
; MAX-BANDWIDTH: i64x2.extend_low_i32x4_s
157-
; MAX-BANDWIDTH: i64x2.add
126+
; CHECK: v128.load
127+
; CHECK: v128.load
128+
; CHECK: i32x4.mul
129+
; CHECK: i64x2.extend_high_i32x4_s
130+
; CHECK: i64x2.add
131+
; CHECK: i64x2.extend_low_i32x4_s
132+
; CHECK: i64x2.add
158133

159134
entry:
160135
%cmp6.not = icmp eq i32 %N, 0
@@ -181,25 +156,18 @@ for.body: ; preds = %entry, %for.body
181156

182157
define hidden i32 @i32_mac_u8(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {
183158
; CHECK-LABEL: i32_mac_u8:
184-
; CHECK: v128.load32_zero 0:p2align=0
185-
; CHECK: i16x8.extend_low_i8x16_u
186-
; CHECK: v128.load32_zero 0:p2align=0
187-
; CHECK: i16x8.extend_low_i8x16_u
188-
; CHECK: i32x4.extmul_low_i16x8_u
189-
; CHECK: i32x4.add
190-
191-
; MAX-BANDWIDTH: v128.load
192-
; MAX-BANDWIDTH: v128.load
193-
; MAX-BANDWIDTH: i16x8.extmul_low_i8x16_u
194-
; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
195-
; MAX-BANDWIDTH: i32x4.extend_high_i16x8_u
196-
; MAX-BANDWIDTH: i32x4.add
197-
; MAX-BANDWIDTH: i16x8.extmul_high_i8x16_u
198-
; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
199-
; MAX-BANDWIDTH: i32x4.extend_high_i16x8_u
200-
; MAX-BANDWIDTH: i32x4.add
201-
; MAX-BANDWIDTH: i32x4.add
202-
; MAX-BANDWIDTH: i32x4.add
159+
; CHECK: v128.load
160+
; CHECK: v128.load
161+
; CHECK: i16x8.extmul_low_i8x16_u
162+
; CHECK: i32x4.extend_low_i16x8_u
163+
; CHECK: i32x4.extend_high_i16x8_u
164+
; CHECK: i32x4.add
165+
; CHECK: i16x8.extmul_high_i8x16_u
166+
; CHECK: i32x4.extend_low_i16x8_u
167+
; CHECK: i32x4.extend_high_i16x8_u
168+
; CHECK: i32x4.add
169+
; CHECK: i32x4.add
170+
; CHECK: i32x4.add
203171

204172
entry:
205173
%cmp7.not = icmp eq i32 %N, 0
@@ -227,17 +195,12 @@ for.body: ; preds = %entry, %for.body
227195

228196
define hidden i32 @i32_mac_u16(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {
229197
; CHECK-LABEL: i32_mac_u16:
230-
; CHECK: i32x4.load16x4_u 0:p2align=1
231-
; CHECK: i32x4.load16x4_u 0:p2align=1
232-
; CHECK: i32x4.mul
233-
; CHECK: i32x4.add
234-
235-
; MAX-BANDWIDTH: v128.load
236-
; MAX-BANDWIDTH: v128.load
237-
; MAX-BANDWIDTH: i32x4.extmul_low_i16x8_u
238-
; MAX-BANDWIDTH: i32x4.extmul_high_i16x8_u
239-
; MAX-BANDWIDTH: i32x4.add
240-
; MAX-BANDWIDTH: i32x4.add
198+
; CHECK: v128.load
199+
; CHECK: v128.load
200+
; CHECK: i32x4.extmul_low_i16x8_u
201+
; CHECK: i32x4.extmul_high_i16x8_u
202+
; CHECK: i32x4.add
203+
; CHECK: i32x4.add
241204

242205
entry:
243206
%cmp7.not = icmp eq i32 %N, 0
@@ -265,21 +228,16 @@ for.body: ; preds = %entry, %for.body
265228

266229
define hidden i32 @i32_mac_u16_s16(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {
267230
; CHECK-LABEL: i32_mac_u16_s16:
268-
; CHECK: i32x4.load16x4_s 0:p2align=1
269-
; CHECK: i32x4.load16x4_u 0:p2align=1
270-
; CHECK: i32x4.mul
271-
; CHECK: i32x4.add
272-
273-
; MAX-BANDWIDTH: v128.load
274-
; MAX-BANDWIDTH: i32x4.extend_high_i16x8_s
275-
; MAX-BANDWIDTH: v128.load
276-
; MAX-BANDWIDTH: i32x4.extend_high_i16x8_u
277-
; MAX-BANDWIDTH: i32x4.mul
278-
; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
279-
; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
280-
; MAX-BANDWIDTH: i32x4.mul
281-
; MAX-BANDWIDTH: i32x4.add
282-
; MAX-BANDWIDTH: i32x4.add
231+
; CHECK: v128.load
232+
; CHECK: i32x4.extend_high_i16x8_s
233+
; CHECK: v128.load
234+
; CHECK: i32x4.extend_high_i16x8_u
235+
; CHECK: i32x4.mul
236+
; CHECK: i32x4.extend_low_i16x8_s
237+
; CHECK: i32x4.extend_low_i16x8_u
238+
; CHECK: i32x4.mul
239+
; CHECK: i32x4.add
240+
; CHECK: i32x4.add
283241

284242
entry:
285243
%cmp7.not = icmp eq i32 %N, 0
@@ -307,37 +265,30 @@ for.body: ; preds = %entry, %for.body
307265

308266
define hidden i64 @i64_mac_u16(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {
309267
; CHECK-LABEL: i64_mac_u16:
310-
; CHECK: v128.load32_zero 0:p2align=1
311-
; CHECK: i32x4.extend_low_i16x8_u
312-
; CHECK: v128.load32_zero 0:p2align=1
313-
; CHECK: i32x4.extend_low_i16x8_u
314-
; CHECK: i64x2.extmul_low_i32x4_u
315-
; CHECK: i64x2.add
316-
317-
; MAX-BANDWIDTH: v128.load
318-
; MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
319-
; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
320-
; MAX-BANDWIDTH: v128.load
321-
; MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
322-
; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
323-
; MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u
324-
; MAX-BANDWIDTH: i64x2.add
325-
; MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
326-
; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
327-
; MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
328-
; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
329-
; MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u
330-
; MAX-BANDWIDTH: i64x2.add
331-
; MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
332-
; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
333-
; MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
334-
; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
335-
; MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u
336-
; MAX-BANDWIDTH: i64x2.add
337-
; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
338-
; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
339-
; MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u
340-
; MAX-BANDWIDTH: i64x2.add
268+
; CHECK: v128.load
269+
; CHECK: i8x16.shuffle 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
270+
; CHECK: i32x4.extend_low_i16x8_u
271+
; CHECK: v128.load
272+
; CHECK: i8x16.shuffle 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
273+
; CHECK: i32x4.extend_low_i16x8_u
274+
; CHECK: i64x2.extmul_low_i32x4_u
275+
; CHECK: i64x2.add
276+
; CHECK: i8x16.shuffle 8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
277+
; CHECK: i32x4.extend_low_i16x8_u
278+
; CHECK: i8x16.shuffle 8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
279+
; CHECK: i32x4.extend_low_i16x8_u
280+
; CHECK: i64x2.extmul_low_i32x4_u
281+
; CHECK: i64x2.add
282+
; CHECK: i8x16.shuffle 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
283+
; CHECK: i32x4.extend_low_i16x8_u
284+
; CHECK: i8x16.shuffle 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
285+
; CHECK: i32x4.extend_low_i16x8_u
286+
; CHECK: i64x2.extmul_low_i32x4_u
287+
; CHECK: i64x2.add
288+
; CHECK: i32x4.extend_low_i16x8_u
289+
; CHECK: i32x4.extend_low_i16x8_u
290+
; CHECK: i64x2.extmul_low_i32x4_u
291+
; CHECK: i64x2.add
341292

342293
entry:
343294
%cmp8.not = icmp eq i32 %N, 0
@@ -365,19 +316,13 @@ for.body: ; preds = %entry, %for.body
365316

366317
define hidden i64 @i64_mac_u32(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {
367318
; CHECK-LABEL: i64_mac_u32:
368-
; CHECK: v128.load64_zero 0:p2align=2
369-
; CHECK: v128.load64_zero 0:p2align=2
370-
; CHECK: i32x4.mul
371-
; CHECK: i64x2.extend_low_i32x4_u
372-
; CHECK: i64x2.add
373-
374-
; MAX-BANDWIDTH: v128.load
375-
; MAX-BANDWIDTH: v128.load
376-
; MAX-BANDWIDTH: i32x4.mul
377-
; MAX-BANDWIDTH: i64x2.extend_high_i32x4_u
378-
; MAX-BANDWIDTH: i64x2.add
379-
; MAX-BANDWIDTH: i64x2.extend_low_i32x4_u
380-
; MAX-BANDWIDTH: i64x2.add
319+
; CHECK: v128.load
320+
; CHECK: v128.load
321+
; CHECK: i32x4.mul
322+
; CHECK: i64x2.extend_high_i32x4_u
323+
; CHECK: i64x2.add
324+
; CHECK: i64x2.extend_low_i32x4_u
325+
; CHECK: i64x2.add
381326

382327
entry:
383328
%cmp6.not = icmp eq i32 %N, 0

llvm/test/CodeGen/WebAssembly/interleave.ll

Lines changed: 29 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,37 @@ target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-i128:128-n32:64-S128-n
1515
; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite)
1616
define hidden void @accumulate8x2(ptr dead_on_unwind noalias writable sret(%struct.Output32x2) align 4 captures(none) %0, ptr noundef readonly captures(none) %1, i32 noundef %2) local_unnamed_addr #0 {
1717
; CHECK-LABEL: accumulate8x2:
18-
; CHECK: loop
19-
; CHECK: v128.load64_zero
20-
; CHECK: i8x16.shuffle 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
18+
; CHECK: v128.load 16:p2align=0
19+
; CHECK: i8x16.shuffle 9, 11, 13, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
2120
; CHECK: i16x8.extend_low_i8x16_u
2221
; CHECK: i32x4.extend_low_i16x8_u
23-
; CHECK: i32x4.add
24-
; CHECK: i8x16.shuffle 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
22+
; CHECK: i32x4.add
23+
; CHECK: i8x16.shuffle 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
24+
; CHECK: i16x8.extend_low_i8x16_u
25+
; CHECK: i32x4.extend_low_i16x8_u
26+
; CHECK: i32x4.add
27+
; CHECK: v128.load 0:p2align=0
28+
; CHECK: i8x16.shuffle 9, 11, 13, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
29+
; CHECK: i16x8.extend_low_i8x16_u
30+
; CHECK: i32x4.extend_low_i16x8_u
31+
; CHECK: i32x4.add
32+
; CHECK: i8x16.shuffle 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
33+
; CHECK: i16x8.extend_low_i8x16_u
34+
; CHECK: i32x4.extend_low_i16x8_u
35+
; CHECK: i32x4.add
36+
; CHECK: i8x16.shuffle 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
37+
; CHECK: i16x8.extend_low_i8x16_u
38+
; CHECK: i32x4.extend_low_i16x8_u
39+
; CHECK: i32x4.add
40+
; CHECK: i8x16.shuffle 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
41+
; CHECK: i16x8.extend_low_i8x16_u
42+
; CHECK: i32x4.extend_low_i16x8_u
43+
; CHECK: i32x4.add
44+
; CHECK: i8x16.shuffle 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
45+
; CHECK: i16x8.extend_low_i8x16_u
46+
; CHECK: i32x4.extend_low_i16x8_u
47+
; CHECK: i32x4.add
48+
; CHECK: i8x16.shuffle 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
2549
; CHECK: i16x8.extend_low_i8x16_u
2650
; CHECK: i32x4.extend_low_i16x8_u
2751
; CHECK: i32x4.add

0 commit comments

Comments
 (0)