Skip to content

Commit 65363e6

Browse files
authored
[WebAssembly] Partial SMLA with relaxed dot (#163529)
Lower the v16i8 to v4i32 partial_reduce_smla operation to relaxed_dot_add. I'm still unsure whether we could/should take advantage of the unknown signedness of the rhs and lower the partial_reduce_sumla operation as well.
1 parent db2d8fc commit 65363e6

File tree

2 files changed

+212
-0
lines changed

2 files changed

+212
-0
lines changed

llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1771,6 +1771,10 @@ defm RELAXED_DOT_ADD :
17711771
"i32x4.relaxed_dot_i8x16_i7x16_add_s\t$dst, $lhs, $rhs, $acc",
17721772
"i32x4.relaxed_dot_i8x16_i7x16_add_s", 0x113>;
17731773

1774+
// Select the signed partial multiply-accumulate reduction of two v16i8
// operands into a v4i32 accumulator to RELAXED_DOT_ADD. The pattern is only
// enabled when HasRelaxedSIMD holds. Note the operand order: the accumulator
// is the first operand of the matched node but the last operand of the
// instruction.
// NOTE(review): the instruction's mnemonic names its second operand i7x16,
// while this pattern matches any v16i8 $rhs - confirm the result is acceptable
// when $rhs bytes have their top bit set.
def : Pat<(v4i32 (partial_reduce_smla (v4i32 V128:$acc), (v16i8 V128:$lhs),
1775+
(v16i8 V128:$rhs))),
1776+
(RELAXED_DOT_ADD $lhs, $rhs, $acc)>, Requires<[HasRelaxedSIMD]>;
1777+
17741778
//===----------------------------------------------------------------------===//
17751779
// Relaxed BFloat16 dot product
17761780
//===----------------------------------------------------------------------===//

llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll

Lines changed: 208 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
; RUN: opt -mattr=+simd128 -passes=loop-vectorize %s | llc -mtriple=wasm32 -mattr=+simd128 -verify-machineinstrs -o - | FileCheck %s
22
; RUN: opt -mattr=+simd128 -passes=loop-vectorize -vectorizer-maximize-bandwidth %s | llc -mtriple=wasm32 -mattr=+simd128 -verify-machineinstrs -o - | FileCheck %s --check-prefix=MAX-BANDWIDTH
3+
; RUN: opt -mattr=+simd128,+relaxed-simd -passes=loop-vectorize -vectorizer-maximize-bandwidth %s | llc -mtriple=wasm32 -mattr=+simd128,+relaxed-simd -verify-machineinstrs -o - | FileCheck %s --check-prefix=RELAXED-MAX-BANDWIDTH
34

45
target triple = "wasm32"
56

@@ -23,6 +24,10 @@ define hidden i32 @i32_mac_s8(ptr nocapture noundef readonly %a, ptr nocapture n
2324
; MAX-BANDWIDTH: i32x4.add
2425
; MAX-BANDWIDTH: i32x4.add
2526

27+
; RELAXED-MAX-BANDWIDTH: v128.load
28+
; RELAXED-MAX-BANDWIDTH: v128.load
29+
; RELAXED-MAX-BANDWIDTH: i32x4.relaxed_dot_i8x16_i7x16_add_s
30+
2631
entry:
2732
%cmp7.not = icmp eq i32 %N, 0
2833
br i1 %cmp7.not, label %for.cond.cleanup, label %for.body
@@ -47,6 +52,109 @@ for.body: ; preds = %entry, %for.body
4752
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
4853
}
4954

55+
; Multiply-accumulate loop over two i8 arrays with mixed signedness: the byte
; loaded from %a is sign-extended and the byte loaded from %b is zero-extended
; to i32 before the multiply and the add into the running i32 sum.
; For the relaxed-simd run, the expected sequence below is the same
; shuffle/extend/mul/add sequence as for the plain maximize-bandwidth run; no
; relaxed dot instruction is expected for this mixed-signedness case.
define hidden i32 @i32_mac_u8_s8(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {
56+
; CHECK-LABEL: i32_mac_u8_s8:
57+
; CHECK: loop
58+
; CHECK: v128.load32_zero
59+
; CHECK: i16x8.extend_low_i8x16_u
60+
; CHECK: i32x4.extend_low_i16x8_u
61+
; CHECK: v128.load32_zero
62+
; CHECK: i16x8.extend_low_i8x16_s
63+
; CHECK: i32x4.extend_low_i16x8_s
64+
; CHECK: i32x4.mul
65+
; CHECK: i32x4.add
66+
67+
; MAX-BANDWIDTH: loop
68+
; MAX-BANDWIDTH: v128.load
69+
; MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
70+
; MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
71+
; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
72+
; MAX-BANDWIDTH: v128.load
73+
; MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
74+
; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
75+
; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
76+
; MAX-BANDWIDTH: i32x4.mul
77+
; MAX-BANDWIDTH: i32x4.add
78+
; MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
79+
; MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
80+
; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
81+
; MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
82+
; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
83+
; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
84+
; MAX-BANDWIDTH: i32x4.mul
85+
; MAX-BANDWIDTH: i32x4.add
86+
; MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
87+
; MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
88+
; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
89+
; MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
90+
; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
91+
; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
92+
; MAX-BANDWIDTH: i32x4.mul
93+
; MAX-BANDWIDTH: i32x4.add
94+
; MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
95+
; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
96+
; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
97+
; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
98+
; MAX-BANDWIDTH: i32x4.mul
99+
; MAX-BANDWIDTH: i32x4.add
100+
101+
; RELAXED-MAX-BANDWIDTH: loop
102+
; RELAXED-MAX-BANDWIDTH: v128.load
103+
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
104+
; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
105+
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
106+
; RELAXED-MAX-BANDWIDTH: v128.load
107+
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
108+
; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
109+
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
110+
; RELAXED-MAX-BANDWIDTH: i32x4.mul
111+
; RELAXED-MAX-BANDWIDTH: i32x4.add
112+
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
113+
; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
114+
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
115+
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
116+
; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
117+
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
118+
; RELAXED-MAX-BANDWIDTH: i32x4.mul
119+
; RELAXED-MAX-BANDWIDTH: i32x4.add
120+
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
121+
; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
122+
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
123+
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
124+
; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
125+
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
126+
; RELAXED-MAX-BANDWIDTH: i32x4.mul
127+
; RELAXED-MAX-BANDWIDTH: i32x4.add
128+
; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
129+
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
130+
; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
131+
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
132+
; RELAXED-MAX-BANDWIDTH: i32x4.mul
133+
; RELAXED-MAX-BANDWIDTH: i32x4.add
134+
entry:
135+
%cmp7.not = icmp eq i32 %N, 0
136+
br i1 %cmp7.not, label %for.cond.cleanup, label %for.body
137+
138+
for.cond.cleanup: ; preds = %for.body, %entry
139+
%res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
140+
ret i32 %res.0.lcssa
141+
142+
for.body: ; preds = %entry, %for.body
143+
%i.09 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
144+
%res.08 = phi i32 [ %add, %for.body ], [ 0, %entry ]
145+
%arrayidx = getelementptr inbounds i8, ptr %a, i32 %i.09
146+
%0 = load i8, ptr %arrayidx, align 1
147+
%conv = sext i8 %0 to i32
148+
%arrayidx1 = getelementptr inbounds i8, ptr %b, i32 %i.09
149+
%1 = load i8, ptr %arrayidx1, align 1
150+
%conv2 = zext i8 %1 to i32
151+
%mul = mul nsw i32 %conv2, %conv
152+
%add = add nsw i32 %mul, %res.08
153+
%inc = add nuw i32 %i.09, 1
154+
%exitcond.not = icmp eq i32 %inc, %N
155+
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
156+
}
157+
50158
define hidden i32 @i32_mac_s16(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {
51159
; CHECK-LABEL: i32_mac_s16:
52160
; CHECK: i32x4.load16x4_s 0:p2align=1
@@ -57,6 +165,12 @@ define hidden i32 @i32_mac_s16(ptr nocapture noundef readonly %a, ptr nocapture
57165
; MAX-BANDWIDTH: v128.load
58166
; MAX-BANDWIDTH: v128.load
59167
; MAX-BANDWIDTH: i32x4.dot_i16x8_s
168+
; MAX-BANDWIDTH: i32x4.add
169+
170+
; RELAXED-MAX-BANDWIDTH: v128.load
171+
; RELAXED-MAX-BANDWIDTH: v128.load
172+
; RELAXED-MAX-BANDWIDTH: i32x4.dot_i16x8_s
173+
; RELAXED-MAX-BANDWIDTH: i32x4.add
60174

61175
entry:
62176
%cmp7.not = icmp eq i32 %N, 0
@@ -116,6 +230,31 @@ define hidden i64 @i64_mac_s16(ptr nocapture noundef readonly %a, ptr nocapture
116230
; MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s
117231
; MAX-BANDWIDTH: i64x2.add
118232

233+
; RELAXED-MAX-BANDWIDTH: v128.load
234+
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
235+
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
236+
; RELAXED-MAX-BANDWIDTH: v128.load
237+
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
238+
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
239+
; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s
240+
; RELAXED-MAX-BANDWIDTH: i64x2.add
241+
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
242+
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
243+
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
244+
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
245+
; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s
246+
; RELAXED-MAX-BANDWIDTH: i64x2.add
247+
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
248+
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
249+
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
250+
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
251+
; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s
252+
; RELAXED-MAX-BANDWIDTH: i64x2.add
253+
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
254+
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
255+
; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s
256+
; RELAXED-MAX-BANDWIDTH: i64x2.add
257+
119258
entry:
120259
%cmp7.not = icmp eq i32 %N, 0
121260
br i1 %cmp7.not, label %for.cond.cleanup, label %for.body
@@ -156,6 +295,14 @@ define hidden i64 @i64_mac_s32(ptr nocapture noundef readonly %a, ptr nocapture
156295
; MAX-BANDWIDTH: i64x2.extend_low_i32x4_s
157296
; MAX-BANDWIDTH: i64x2.add
158297

298+
; RELAXED-MAX-BANDWIDTH: v128.load
299+
; RELAXED-MAX-BANDWIDTH: v128.load
300+
; RELAXED-MAX-BANDWIDTH: i32x4.mul
301+
; RELAXED-MAX-BANDWIDTH: i64x2.extend_high_i32x4_s
302+
; RELAXED-MAX-BANDWIDTH: i64x2.add
303+
; RELAXED-MAX-BANDWIDTH: i64x2.extend_low_i32x4_s
304+
; RELAXED-MAX-BANDWIDTH: i64x2.add
305+
159306
entry:
160307
%cmp6.not = icmp eq i32 %N, 0
161308
br i1 %cmp6.not, label %for.cond.cleanup, label %for.body
@@ -197,6 +344,15 @@ define hidden i32 @i32_mac_u8(ptr nocapture noundef readonly %a, ptr nocapture n
197344
; MAX-BANDWIDTH: i32x4.add
198345
; MAX-BANDWIDTH: i32x4.add
199346

347+
; RELAXED-MAX-BANDWIDTH: v128.load
348+
; RELAXED-MAX-BANDWIDTH: v128.load
349+
; RELAXED-MAX-BANDWIDTH: i16x8.extmul_low_i8x16_u
350+
; RELAXED-MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_u
351+
; RELAXED-MAX-BANDWIDTH: i16x8.extmul_high_i8x16_u
352+
; RELAXED-MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_u
353+
; RELAXED-MAX-BANDWIDTH: i32x4.add
354+
; RELAXED-MAX-BANDWIDTH: i32x4.add
355+
200356
entry:
201357
%cmp7.not = icmp eq i32 %N, 0
202358
br i1 %cmp7.not, label %for.cond.cleanup, label %for.body
@@ -235,6 +391,13 @@ define hidden i32 @i32_mac_u16(ptr nocapture noundef readonly %a, ptr nocapture
235391
; MAX-BANDWIDTH: i32x4.add
236392
; MAX-BANDWIDTH: i32x4.add
237393

394+
; RELAXED-MAX-BANDWIDTH: v128.load
395+
; RELAXED-MAX-BANDWIDTH: v128.load
396+
; RELAXED-MAX-BANDWIDTH: i32x4.extmul_low_i16x8_u
397+
; RELAXED-MAX-BANDWIDTH: i32x4.extmul_high_i16x8_u
398+
; RELAXED-MAX-BANDWIDTH: i32x4.add
399+
; RELAXED-MAX-BANDWIDTH: i32x4.add
400+
238401
entry:
239402
%cmp7.not = icmp eq i32 %N, 0
240403
br i1 %cmp7.not, label %for.cond.cleanup, label %for.body
@@ -277,6 +440,17 @@ define hidden i32 @i32_mac_u16_s16(ptr nocapture noundef readonly %a, ptr nocapt
277440
; MAX-BANDWIDTH: i32x4.add
278441
; MAX-BANDWIDTH: i32x4.add
279442

443+
; RELAXED-MAX-BANDWIDTH: v128.load
444+
; RELAXED-MAX-BANDWIDTH: i32x4.extend_high_i16x8_s
445+
; RELAXED-MAX-BANDWIDTH: v128.load
446+
; RELAXED-MAX-BANDWIDTH: i32x4.extend_high_i16x8_u
447+
; RELAXED-MAX-BANDWIDTH: i32x4.mul
448+
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
449+
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
450+
; RELAXED-MAX-BANDWIDTH: i32x4.mul
451+
; RELAXED-MAX-BANDWIDTH: i32x4.add
452+
; RELAXED-MAX-BANDWIDTH: i32x4.add
453+
280454
entry:
281455
%cmp7.not = icmp eq i32 %N, 0
282456
br i1 %cmp7.not, label %for.cond.cleanup, label %for.body
@@ -335,6 +509,32 @@ define hidden i64 @i64_mac_u16(ptr nocapture noundef readonly %a, ptr nocapture
335509
; MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u
336510
; MAX-BANDWIDTH: i64x2.add
337511

512+
; RELAXED-MAX-BANDWIDTH: v128.load
513+
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
514+
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
515+
; RELAXED-MAX-BANDWIDTH: v128.load
516+
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
517+
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
518+
; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u
519+
; RELAXED-MAX-BANDWIDTH: i64x2.add
520+
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
521+
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
522+
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
523+
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
524+
; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u
525+
; RELAXED-MAX-BANDWIDTH: i64x2.add
526+
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
527+
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
528+
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
529+
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
530+
; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u
531+
; RELAXED-MAX-BANDWIDTH: i64x2.add
532+
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
533+
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
534+
; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u
535+
; RELAXED-MAX-BANDWIDTH: i64x2.add
536+
537+
338538
entry:
339539
%cmp8.not = icmp eq i32 %N, 0
340540
br i1 %cmp8.not, label %for.cond.cleanup, label %for.body
@@ -375,6 +575,14 @@ define hidden i64 @i64_mac_u32(ptr nocapture noundef readonly %a, ptr nocapture
375575
; MAX-BANDWIDTH: i64x2.extend_low_i32x4_u
376576
; MAX-BANDWIDTH: i64x2.add
377577

578+
; RELAXED-MAX-BANDWIDTH: v128.load
579+
; RELAXED-MAX-BANDWIDTH: v128.load
580+
; RELAXED-MAX-BANDWIDTH: i32x4.mul
581+
; RELAXED-MAX-BANDWIDTH: i64x2.extend_high_i32x4_u
582+
; RELAXED-MAX-BANDWIDTH: i64x2.add
583+
; RELAXED-MAX-BANDWIDTH: i64x2.extend_low_i32x4_u
584+
; RELAXED-MAX-BANDWIDTH: i64x2.add
585+
378586
entry:
379587
%cmp6.not = icmp eq i32 %N, 0
380588
br i1 %cmp6.not, label %for.cond.cleanup, label %for.body

0 commit comments

Comments
 (0)