Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
Original file line number Diff line number Diff line change
Expand Up @@ -1719,6 +1719,10 @@ defm RELAXED_DOT_ADD :
"i32x4.relaxed_dot_i8x16_i7x16_add_s\t$dst, $lhs, $rhs, $acc",
"i32x4.relaxed_dot_i8x16_i7x16_add_s", 0x113>;

// Select RELAXED_DOT_ADD for a signed partial-reduction multiply-accumulate
// whose two multiplicands are v16i8 and whose accumulator/result is v4i32.
// The matched node lists the accumulator first, while the instruction takes
// it last ($lhs, $rhs, $acc), hence the operand reordering in the output.
// The pattern is only available when the relaxed SIMD feature is enabled.
// NOTE(review): the selected mnemonic is "relaxed_dot_i8x16_i7x16", which
// suggests a narrower value range for the second multiplicand - confirm that
// matching an unconstrained v16i8 $rhs here is intended.
def : Pat<(v4i32 (partial_reduce_smla (v4i32 V128:$acc), (v16i8 V128:$lhs),
(v16i8 V128:$rhs))),
(RELAXED_DOT_ADD $lhs, $rhs, $acc)>, Requires<[HasRelaxedSIMD]>;

//===----------------------------------------------------------------------===//
// Relaxed BFloat16 dot product
//===----------------------------------------------------------------------===//
Expand Down
208 changes: 208 additions & 0 deletions llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
; RUN: opt -mattr=+simd128 -passes=loop-vectorize %s | llc -mtriple=wasm32 -mattr=+simd128 -verify-machineinstrs -o - | FileCheck %s
; RUN: opt -mattr=+simd128 -passes=loop-vectorize -vectorizer-maximize-bandwidth %s | llc -mtriple=wasm32 -mattr=+simd128 -verify-machineinstrs -o - | FileCheck %s --check-prefix=MAX-BANDWIDTH
; RUN: opt -mattr=+simd128,+relaxed-simd -passes=loop-vectorize -vectorizer-maximize-bandwidth %s | llc -mtriple=wasm32 -mattr=+simd128,+relaxed-simd -verify-machineinstrs -o - | FileCheck %s --check-prefix=RELAXED-MAX-BANDWIDTH

target triple = "wasm32"

Expand All @@ -23,6 +24,10 @@ define hidden i32 @i32_mac_s8(ptr nocapture noundef readonly %a, ptr nocapture n
; MAX-BANDWIDTH: i32x4.add
; MAX-BANDWIDTH: i32x4.add

; RELAXED-MAX-BANDWIDTH: v128.load
; RELAXED-MAX-BANDWIDTH: v128.load
; RELAXED-MAX-BANDWIDTH: i32x4.relaxed_dot_i8x16_i7x16_add_s

entry:
%cmp7.not = icmp eq i32 %N, 0
br i1 %cmp7.not, label %for.cond.cleanup, label %for.body
Expand All @@ -47,6 +52,109 @@ for.body: ; preds = %entry, %for.body
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}

; Mixed-signedness 8-bit multiply-accumulate into an i32 result.
; The scalar loop computes, for i in [0, N): res += zext(b[i]) * sext(a[i]),
; and the function returns 0 when N is 0 (the entry edge of the exit phi).
; The expectations for the relaxed-simd run prefix below list the same
; instruction sequence as the max-bandwidth prefix, i.e. no relaxed dot
; instruction is expected for this sign-extend/zero-extend combination.
define hidden i32 @i32_mac_u8_s8(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {
; CHECK-LABEL: i32_mac_u8_s8:
; CHECK: loop
; CHECK: v128.load32_zero
; CHECK: i16x8.extend_low_i8x16_u
; CHECK: i32x4.extend_low_i16x8_u
; CHECK: v128.load32_zero
; CHECK: i16x8.extend_low_i8x16_s
; CHECK: i32x4.extend_low_i16x8_s
; CHECK: i32x4.mul
; CHECK: i32x4.add

; MAX-BANDWIDTH: loop
; MAX-BANDWIDTH: v128.load
; MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
; MAX-BANDWIDTH: v128.load
; MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
; MAX-BANDWIDTH: i32x4.mul
; MAX-BANDWIDTH: i32x4.add
; MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
; MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
; MAX-BANDWIDTH: i32x4.mul
; MAX-BANDWIDTH: i32x4.add
; MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
; MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
; MAX-BANDWIDTH: i32x4.mul
; MAX-BANDWIDTH: i32x4.add
; MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
; MAX-BANDWIDTH: i32x4.mul
; MAX-BANDWIDTH: i32x4.add

; RELAXED-MAX-BANDWIDTH: loop
; RELAXED-MAX-BANDWIDTH: v128.load
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
; RELAXED-MAX-BANDWIDTH: v128.load
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
; RELAXED-MAX-BANDWIDTH: i32x4.mul
; RELAXED-MAX-BANDWIDTH: i32x4.add
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
; RELAXED-MAX-BANDWIDTH: i32x4.mul
; RELAXED-MAX-BANDWIDTH: i32x4.add
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
; RELAXED-MAX-BANDWIDTH: i32x4.mul
; RELAXED-MAX-BANDWIDTH: i32x4.add
; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
; RELAXED-MAX-BANDWIDTH: i32x4.mul
; RELAXED-MAX-BANDWIDTH: i32x4.add
entry:
; Skip the loop entirely when the trip count N is zero.
%cmp7.not = icmp eq i32 %N, 0
br i1 %cmp7.not, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body, %entry
; Result is 0 when arriving from entry, otherwise the final accumulated sum.
%res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
ret i32 %res.0.lcssa

for.body: ; preds = %entry, %for.body
; %i.09 is the induction variable; %res.08 is the running i32 accumulator.
%i.09 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
%res.08 = phi i32 [ %add, %for.body ], [ 0, %entry ]
; a[i] is widened with sign extension.
%arrayidx = getelementptr inbounds i8, ptr %a, i32 %i.09
%0 = load i8, ptr %arrayidx, align 1
%conv = sext i8 %0 to i32
; b[i] is widened with zero extension.
%arrayidx1 = getelementptr inbounds i8, ptr %b, i32 %i.09
%1 = load i8, ptr %arrayidx1, align 1
%conv2 = zext i8 %1 to i32
; Multiply the widened elements and fold the product into the accumulator.
%mul = mul nsw i32 %conv2, %conv
%add = add nsw i32 %mul, %res.08
; Advance the index and leave the loop once it reaches N.
%inc = add nuw i32 %i.09, 1
%exitcond.not = icmp eq i32 %inc, %N
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}

define hidden i32 @i32_mac_s16(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {
; CHECK-LABEL: i32_mac_s16:
; CHECK: i32x4.load16x4_s 0:p2align=1
Expand All @@ -57,6 +165,12 @@ define hidden i32 @i32_mac_s16(ptr nocapture noundef readonly %a, ptr nocapture
; MAX-BANDWIDTH: v128.load
; MAX-BANDWIDTH: v128.load
; MAX-BANDWIDTH: i32x4.dot_i16x8_s
; MAX-BANDWIDTH: i32x4.add

; RELAXED-MAX-BANDWIDTH: v128.load
; RELAXED-MAX-BANDWIDTH: v128.load
; RELAXED-MAX-BANDWIDTH: i32x4.dot_i16x8_s
; RELAXED-MAX-BANDWIDTH: i32x4.add

entry:
%cmp7.not = icmp eq i32 %N, 0
Expand Down Expand Up @@ -116,6 +230,31 @@ define hidden i64 @i64_mac_s16(ptr nocapture noundef readonly %a, ptr nocapture
; MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s
; MAX-BANDWIDTH: i64x2.add

; RELAXED-MAX-BANDWIDTH: v128.load
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
; RELAXED-MAX-BANDWIDTH: v128.load
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s
; RELAXED-MAX-BANDWIDTH: i64x2.add
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s
; RELAXED-MAX-BANDWIDTH: i64x2.add
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s
; RELAXED-MAX-BANDWIDTH: i64x2.add
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s
; RELAXED-MAX-BANDWIDTH: i64x2.add

entry:
%cmp7.not = icmp eq i32 %N, 0
br i1 %cmp7.not, label %for.cond.cleanup, label %for.body
Expand Down Expand Up @@ -156,6 +295,14 @@ define hidden i64 @i64_mac_s32(ptr nocapture noundef readonly %a, ptr nocapture
; MAX-BANDWIDTH: i64x2.extend_low_i32x4_s
; MAX-BANDWIDTH: i64x2.add

; RELAXED-MAX-BANDWIDTH: v128.load
; RELAXED-MAX-BANDWIDTH: v128.load
; RELAXED-MAX-BANDWIDTH: i32x4.mul
; RELAXED-MAX-BANDWIDTH: i64x2.extend_high_i32x4_s
; RELAXED-MAX-BANDWIDTH: i64x2.add
; RELAXED-MAX-BANDWIDTH: i64x2.extend_low_i32x4_s
; RELAXED-MAX-BANDWIDTH: i64x2.add

entry:
%cmp6.not = icmp eq i32 %N, 0
br i1 %cmp6.not, label %for.cond.cleanup, label %for.body
Expand Down Expand Up @@ -197,6 +344,15 @@ define hidden i32 @i32_mac_u8(ptr nocapture noundef readonly %a, ptr nocapture n
; MAX-BANDWIDTH: i32x4.add
; MAX-BANDWIDTH: i32x4.add

; RELAXED-MAX-BANDWIDTH: v128.load
; RELAXED-MAX-BANDWIDTH: v128.load
; RELAXED-MAX-BANDWIDTH: i16x8.extmul_low_i8x16_u
; RELAXED-MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_u
; RELAXED-MAX-BANDWIDTH: i16x8.extmul_high_i8x16_u
; RELAXED-MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_u
; RELAXED-MAX-BANDWIDTH: i32x4.add
; RELAXED-MAX-BANDWIDTH: i32x4.add

entry:
%cmp7.not = icmp eq i32 %N, 0
br i1 %cmp7.not, label %for.cond.cleanup, label %for.body
Expand Down Expand Up @@ -235,6 +391,13 @@ define hidden i32 @i32_mac_u16(ptr nocapture noundef readonly %a, ptr nocapture
; MAX-BANDWIDTH: i32x4.add
; MAX-BANDWIDTH: i32x4.add

; RELAXED-MAX-BANDWIDTH: v128.load
; RELAXED-MAX-BANDWIDTH: v128.load
; RELAXED-MAX-BANDWIDTH: i32x4.extmul_low_i16x8_u
; RELAXED-MAX-BANDWIDTH: i32x4.extmul_high_i16x8_u
; RELAXED-MAX-BANDWIDTH: i32x4.add
; RELAXED-MAX-BANDWIDTH: i32x4.add

entry:
%cmp7.not = icmp eq i32 %N, 0
br i1 %cmp7.not, label %for.cond.cleanup, label %for.body
Expand Down Expand Up @@ -277,6 +440,17 @@ define hidden i32 @i32_mac_u16_s16(ptr nocapture noundef readonly %a, ptr nocapt
; MAX-BANDWIDTH: i32x4.add
; MAX-BANDWIDTH: i32x4.add

; RELAXED-MAX-BANDWIDTH: v128.load
; RELAXED-MAX-BANDWIDTH: i32x4.extend_high_i16x8_s
; RELAXED-MAX-BANDWIDTH: v128.load
; RELAXED-MAX-BANDWIDTH: i32x4.extend_high_i16x8_u
; RELAXED-MAX-BANDWIDTH: i32x4.mul
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
; RELAXED-MAX-BANDWIDTH: i32x4.mul
; RELAXED-MAX-BANDWIDTH: i32x4.add
; RELAXED-MAX-BANDWIDTH: i32x4.add

entry:
%cmp7.not = icmp eq i32 %N, 0
br i1 %cmp7.not, label %for.cond.cleanup, label %for.body
Expand Down Expand Up @@ -335,6 +509,32 @@ define hidden i64 @i64_mac_u16(ptr nocapture noundef readonly %a, ptr nocapture
; MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u
; MAX-BANDWIDTH: i64x2.add

; RELAXED-MAX-BANDWIDTH: v128.load
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
; RELAXED-MAX-BANDWIDTH: v128.load
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u
; RELAXED-MAX-BANDWIDTH: i64x2.add
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u
; RELAXED-MAX-BANDWIDTH: i64x2.add
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u
; RELAXED-MAX-BANDWIDTH: i64x2.add
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u
; RELAXED-MAX-BANDWIDTH: i64x2.add


entry:
%cmp8.not = icmp eq i32 %N, 0
br i1 %cmp8.not, label %for.cond.cleanup, label %for.body
Expand Down Expand Up @@ -375,6 +575,14 @@ define hidden i64 @i64_mac_u32(ptr nocapture noundef readonly %a, ptr nocapture
; MAX-BANDWIDTH: i64x2.extend_low_i32x4_u
; MAX-BANDWIDTH: i64x2.add

; RELAXED-MAX-BANDWIDTH: v128.load
; RELAXED-MAX-BANDWIDTH: v128.load
; RELAXED-MAX-BANDWIDTH: i32x4.mul
; RELAXED-MAX-BANDWIDTH: i64x2.extend_high_i32x4_u
; RELAXED-MAX-BANDWIDTH: i64x2.add
; RELAXED-MAX-BANDWIDTH: i64x2.extend_low_i32x4_u
; RELAXED-MAX-BANDWIDTH: i64x2.add

entry:
%cmp6.not = icmp eq i32 %N, 0
br i1 %cmp6.not, label %for.cond.cleanup, label %for.body
Expand Down