Skip to content

Commit 60a99c1

Browse files
sparker-arm authored and Lukacma committed
[WebAssembly] Avoid dot for v16i8 partial_smla (llvm#163796)
The sequence is shorter by two extend operations if we just use extmul and extadd_pairwise instead of extending both operands and using dot.
1 parent d924248 commit 60a99c1

File tree

2 files changed

+7 -11 lines changed

2 files changed

+7 -11 lines changed

llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td

Lines changed: 3 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1583,11 +1583,9 @@ def : Pat<(v4i32 (partial_reduce_umla (v4i32 V128:$acc), (v8i16 V128:$lhs),
15831583
// MLA: v16i8 -> v4i32
15841584
def : Pat<(v4i32 (partial_reduce_smla (v4i32 V128:$acc), (v16i8 V128:$lhs),
15851585
(v16i8 V128:$rhs))),
1586-
(ADD_I32x4 (ADD_I32x4 (DOT (extend_low_s_I16x8 $lhs),
1587-
(extend_low_s_I16x8 $rhs)),
1588-
(DOT (extend_high_s_I16x8 $lhs),
1589-
(extend_high_s_I16x8 $rhs))),
1590-
$acc)>;
1586+
(ADD_I32x4 (ADD_I32x4 (extadd_pairwise_s_I32x4 (EXTMUL_LOW_S_I16x8 $lhs, $rhs)),
1587+
(extadd_pairwise_s_I32x4 (EXTMUL_HIGH_S_I16x8 $lhs, $rhs))),
1588+
$acc)>;
15911589
def : Pat<(v4i32 (partial_reduce_umla (v4i32 V128:$acc), (v16i8 V128:$lhs),
15921590
(v16i8 V128:$rhs))),
15931591
(ADD_I32x4 (ADD_I32x4 (extadd_pairwise_u_I32x4 (EXTMUL_LOW_U_I16x8 $lhs, $rhs)),

llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll

Lines changed: 4 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -14,13 +14,11 @@ define hidden i32 @i32_mac_s8(ptr nocapture noundef readonly %a, ptr nocapture n
1414
; CHECK: i32x4.add
1515

1616
; MAX-BANDWIDTH: v128.load
17-
; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
1817
; MAX-BANDWIDTH: v128.load
19-
; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
20-
; MAX-BANDWIDTH: i32x4.dot_i16x8_s
21-
; MAX-BANDWIDTH: i16x8.extend_high_i8x16_s
22-
; MAX-BANDWIDTH: i16x8.extend_high_i8x16_s
23-
; MAX-BANDWIDTH: i32x4.dot_i16x8_s
18+
; MAX-BANDWIDTH: i16x8.extmul_low_i8x16_s
19+
; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
20+
; MAX-BANDWIDTH: i16x8.extmul_high_i8x16_s
21+
; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
2422
; MAX-BANDWIDTH: i32x4.add
2523
; MAX-BANDWIDTH: i32x4.add
2624

0 commit comments

Comments (0)