Commit 9004ff2 (parent: 1f65ab1)

Implement widening FMMLA intrinsics:

- F16 to F32
- MF8 to F32
- MF8 to F16
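At the source level, this adds three new ACLE entry points. The sketch below shows how they should be called, assembled from the codegen tests in this commit; the wrapper function names here are illustrative, and the _fpm suffix on the FP8 forms follows the test macros. Each form needs the target features named in the test RUN lines.

#include <arm_sve.h>

// F16 -> F32 widening matrix multiply-accumulate; needs +sve-f16f32mm.
svfloat32_t mmla_f32_from_f16(svfloat32_t acc, svfloat16_t a, svfloat16_t b) {
  return svmmla_f32_f16(acc, a, b);
}

// MF8 -> F32 widening form; needs +sve2 and +f8f32mm. The trailing fpm_t
// value is written to FPMR before the multiply (see the set.fpmr calls in
// the tests below).
svfloat32_t mmla_f32_from_mf8(svfloat32_t acc, svmfloat8_t a, svmfloat8_t b, fpm_t fpm) {
  return svmmla_f32_mf8_fpm(acc, a, b, fpm);
}

// MF8 -> F16 widening form; needs +sve2 and +f8f16mm.
svfloat16_t mmla_f16_from_mf8(svfloat16_t acc, svmfloat8_t a, svmfloat8_t b, fpm_t fpm) {
  return svmmla_f16_mf8_fpm(acc, a, b, fpm);
}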

File tree: 9 files changed (+247 / −1 lines)


clang/include/clang/Basic/arm_sve.td

Lines changed: 12 additions & 0 deletions

@@ -1196,6 +1196,18 @@ def SVMLLA_F32 : SInst<"svmmla[_f32]", "dddd","f", MergeNone, "aarch64_sve_fmmla
 let SVETargetGuard = "f64mm", SMETargetGuard = InvalidMode in {
 def SVMLLA_F64 : SInst<"svmmla[_f64]", "dddd", "d", MergeNone, "aarch64_sve_fmmla">;
 
+let SVETargetGuard = "sve-f16f32mm", SMETargetGuard = InvalidMode in {
+def SVMLLA_F32_F16 : SInst<"svmmla[_f32_f16]", "MMdd", "h", MergeNone, "aarch64_sve_fmmla_f16f32", [IsOverloadNone]>;
+}
+
+let SVETargetGuard = "sve2,f8f32mm", SMETargetGuard = InvalidMode in {
+def SVMLLA_F32_MF8 : SInst<"svmmla[_f32_mf8]", "MM~~>", "m", MergeNone, "aarch64_sve_fmmla_mf8f32", [IsOverloadNone]>;
+}
+
+let SVETargetGuard = "sve2,f8f16mm", SMETargetGuard = InvalidMode in {
+def SVMLLA_F16_MF8 : SInst<"svmmla[_f16_mf8]", "OO~~>", "m", MergeNone, "aarch64_sve_fmmla_mf8f16", [IsOverloadNone]>;
+}
+
 def SVTRN1Q : SInst<"svtrn1q[_{d}]", "ddd", "csilUcUsUiUlhfdb", MergeNone, "aarch64_sve_trn1q">;
 def SVTRN2Q : SInst<"svtrn2q[_{d}]", "ddd", "csilUcUsUiUlhfdb", MergeNone, "aarch64_sve_trn2q">;
 def SVUZP1Q : SInst<"svuzp1q[_{d}]", "ddd", "csilUcUsUiUlhfdb", MergeNone, "aarch64_sve_uzp1q">;
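Reading the prototype strings against the tests in this commit (an inference from the calls below, not taken from the SveEmitter documentation): 'd' is the default element type, 'M' appears to map to svfloat32_t, 'O' to svfloat16_t, '~' to the svmfloat8_t vector, and '>' to a trailing fpm_t argument. The resulting C declarations should therefore be roughly:

// Inferred declarations; derived from the prototype strings above and the
// codegen tests below, not from the emitter itself.
svfloat32_t svmmla_f32_f16(svfloat32_t acc, svfloat16_t a, svfloat16_t b);
svfloat32_t svmmla_f32_mf8_fpm(svfloat32_t acc, svmfloat8_t a, svmfloat8_t b, fpm_t fpm);
svfloat16_t svmmla_f16_mf8_fpm(svfloat16_t acc, svmfloat8_t a, svmfloat8_t b, fpm_t fpm);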
Lines changed: 33 additions & 0 deletions

@@ -0,0 +1,33 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve-f16f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve-f16f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve-f16f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve-f16f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve-f16f32mm -S -disable-O0-optnone -Werror -Wall -o /dev/null %s

// REQUIRES: aarch64-registered-target

#include <arm_sve.h>

#ifdef SVE_OVERLOADED_FORMS
// A simple used,unused... macro, long enough to represent any SVE builtin.
#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3
#else
#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4
#endif

// CHECK-LABEL: define dso_local <vscale x 4 x float> @test_f32f16(
// CHECK-SAME: <vscale x 4 x float> [[ACC:%.*]], <vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.f16f32(<vscale x 4 x float> [[ACC]], <vscale x 8 x half> [[A]], <vscale x 8 x half> [[B]])
// CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
//
// CPP-CHECK-LABEL: define dso_local <vscale x 4 x float> @_Z11test_f32f16u13__SVFloat32_tu13__SVFloat16_tS0_(
// CPP-CHECK-SAME: <vscale x 4 x float> [[ACC:%.*]], <vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) #[[ATTR0:[0-9]+]] {
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.f16f32(<vscale x 4 x float> [[ACC]], <vscale x 8 x half> [[A]], <vscale x 8 x half> [[B]])
// CPP-CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
//
svfloat32_t test_f32f16(svfloat32_t acc, svfloat16_t a, svfloat16_t b) {
  return SVE_ACLE_FUNC(svmmla, _f32_f16, , )(acc, a, b);
}
Lines changed: 35 additions & 0 deletions

@@ -0,0 +1,35 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f16mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f16mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f16mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f16mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f16mm -S -disable-O0-optnone -Werror -Wall -o /dev/null %s

// REQUIRES: aarch64-registered-target

#include <arm_sve.h>

#ifdef SVE_OVERLOADED_FORMS
// A simple used,unused... macro, long enough to represent any SVE builtin.
#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3
#else
#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4
#endif

// CHECK-LABEL: define dso_local <vscale x 8 x half> @test_f16mf8(
// CHECK-SAME: <vscale x 8 x half> [[ACC:%.*]], <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fmmla.mf8f16(<vscale x 8 x half> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
// CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
//
// CPP-CHECK-LABEL: define dso_local <vscale x 8 x half> @_Z11test_f16mf8u13__SVFloat16_tu13__SVMfloat8_tS0_m(
// CPP-CHECK-SAME: <vscale x 8 x half> [[ACC:%.*]], <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fmmla.mf8f16(<vscale x 8 x half> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
// CPP-CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
//
svfloat16_t test_f16mf8(svfloat16_t acc, svmfloat8_t a, svmfloat8_t b, fpm_t fpmr) {
  return SVE_ACLE_FUNC(svmmla, _f16_mf8, _fpm, )(acc, a, b, fpmr);
}
Lines changed: 36 additions & 0 deletions

@@ -0,0 +1,36 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6

// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f32mm -S -disable-O0-optnone -Werror -Wall -o /dev/null %s

// REQUIRES: aarch64-registered-target

#include <arm_sve.h>

#ifdef SVE_OVERLOADED_FORMS
// A simple used,unused... macro, long enough to represent any SVE builtin.
#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3
#else
#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4
#endif

// CHECK-LABEL: define dso_local <vscale x 4 x float> @test_f32mf8(
// CHECK-SAME: <vscale x 4 x float> [[ACC:%.*]], <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.mf8f32(<vscale x 4 x float> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
// CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
//
// CPP-CHECK-LABEL: define dso_local <vscale x 4 x float> @_Z11test_f32mf8u13__SVFloat32_tu13__SVMfloat8_tS0_m(
// CPP-CHECK-SAME: <vscale x 4 x float> [[ACC:%.*]], <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.mf8f32(<vscale x 4 x float> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
// CPP-CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
//
svfloat32_t test_f32mf8(svfloat32_t acc, svmfloat8_t a, svmfloat8_t b, fpm_t fpmr) {
  return SVE_ACLE_FUNC(svmmla, _f32_mf8, _fpm, )(acc, a, b, fpmr);
}

llvm/include/llvm/IR/IntrinsicsAArch64.td

Lines changed: 14 additions & 0 deletions

@@ -2807,6 +2807,20 @@ def int_aarch64_sve_sudot_lane : AdvSIMD_SVE_DOT_Indexed_Intrinsic;
 //
 def int_aarch64_sve_fmmla : AdvSIMD_3VectorArg_Intrinsic;
 
+def int_aarch64_sve_fmmla_f16f32
+    : DefaultAttrsIntrinsic<[llvm_nxv4f32_ty],
+                            [ llvm_nxv4f32_ty, llvm_nxv8f16_ty, llvm_nxv8f16_ty ],
+                            [IntrNoMem]>;
+
+def int_aarch64_sve_fmmla_mf8f32
+    : DefaultAttrsIntrinsic<[llvm_nxv4f32_ty],
+                            [ llvm_nxv4f32_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty ],
+                            [IntrNoMem]>;
+
+def int_aarch64_sve_fmmla_mf8f16
+    : DefaultAttrsIntrinsic<[llvm_nxv8f16_ty],
+                            [ llvm_nxv8f16_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty ],
+                            [IntrNoMem]>;
 //
 // SVE ACLE: 7.2. BFloat16 extensions
 //
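At the IR level these definitions yield the following intrinsic signatures, copied from the declare lines in the llc tests later in this commit. Note that the FP8 variants do not carry the FPMR value as an operand; Clang materializes it with a separate call to llvm.aarch64.set.fpmr, as the codegen tests above show.

declare <vscale x 4 x float> @llvm.aarch64.sve.fmmla.f16f32(<vscale x 4 x float>, <vscale x 8 x half>, <vscale x 8 x half>)
declare <vscale x 4 x float> @llvm.aarch64.sve.fmmla.mf8f32(<vscale x 4 x float>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare <vscale x 8 x half> @llvm.aarch64.sve.fmmla.mf8f16(<vscale x 8 x half>, <vscale x 16 x i8>, <vscale x 16 x i8>)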

llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td

Lines changed: 5 additions & 1 deletion

@@ -3684,7 +3684,7 @@ let Predicates = [HasSVE, HasMatMulFP32] in {
 } // End HasSVE, HasMatMulFP32
 
 let Predicates = [HasSVE_F16F32MM] in {
-def FMLLA_ZZZ_HtoS : sve_fp_matrix_mla<0b001, "fmmla", ZPR32, ZPR16>;
+defm FMLLA_ZZZ_HtoS : sve_fp_matrix_mla<0b001, "fmmla", ZPR32, ZPR16, int_aarch64_sve_fmmla_f16f32, nxv4f32, nxv8f16>;
 } // End HasSVE_F16F32MM
 
 let Predicates = [HasSVE, HasMatMulFP64] in {

@@ -4745,10 +4745,14 @@ defm FMLALLTT_ZZZ : sve2_fp8_mla<0b011, ZPR32, "fmlalltt", nxv4f32, int_aarch64_
 
 let Predicates = [HasSVE2, HasF8F32MM] in {
 def FMMLA_ZZZ_BtoS : sve2_fp8_mmla<0b0, ZPR32, "fmmla">;
+def : Pat<(nxv4f32 (int_aarch64_sve_fmmla_mf8f32 nxv4f32:$acc, nxv16i8:$zn, nxv16i8:$zm)),
+          (FMMLA_ZZZ_BtoS $acc, $zn, $zm)>;
 }
 
 let Predicates = [HasSVE2, HasF8F16MM] in {
 def FMMLA_ZZZ_BtoH : sve2_fp8_mmla<0b1, ZPR16, "fmmla">;
+def : Pat<(nxv8f16 (int_aarch64_sve_fmmla_mf8f16 nxv8f16:$acc, nxv16i8:$zn, nxv16i8:$zm)),
+          (FMMLA_ZZZ_BtoH $acc, $zn, $zm)>;
 }
 
 let Predicates = [HasSSVE_FP8DOT2] in {
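These patterns select the three intrinsics to the widening forms of the FMMLA instruction. Per the CHECK lines in the llc tests below, the expected selections are (register assignments illustrative):

fmmla z0.s, z1.h, z2.h   // F16 -> F32, guarded by HasSVE_F16F32MM
fmmla z0.s, z1.b, z2.b   // MF8 -> F32, guarded by HasSVE2, HasF8F32MM
fmmla z0.h, z1.b, z2.b   // MF8 -> F16, guarded by HasSVE2, HasF8F16MM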
Lines changed: 32 additions & 0 deletions

@@ -0,0 +1,32 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve-f16f32mm < %s | FileCheck %s --check-prefixes=CHECK

define <vscale x 4 x float> @_Z1tu13__SVFloat32_tu13__SVFloat16_tS0_(<vscale x 4 x float> %acc, <vscale x 8 x half> %a, <vscale x 8 x half> %b) {
; CHECK-LABEL: _Z1tu13__SVFloat32_tu13__SVFloat16_tS0_:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-3
; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: str z0, [sp, #2, mul vl]
; CHECK-NEXT: fmmla z0.s, z1.h, z2.h
; CHECK-NEXT: str z1, [sp, #1, mul vl]
; CHECK-NEXT: str z2, [sp]
; CHECK-NEXT: addvl sp, sp, #3
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
  %acc.addr = alloca <vscale x 4 x float>, align 16
  %a.addr = alloca <vscale x 8 x half>, align 16
  %b.addr = alloca <vscale x 8 x half>, align 16
  store <vscale x 4 x float> %acc, ptr %acc.addr, align 16
  store <vscale x 8 x half> %a, ptr %a.addr, align 16
  store <vscale x 8 x half> %b, ptr %b.addr, align 16
  %0 = load <vscale x 4 x float>, ptr %acc.addr, align 16
  %1 = load <vscale x 8 x half>, ptr %a.addr, align 16
  %2 = load <vscale x 8 x half>, ptr %b.addr, align 16
  %3 = call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.f16f32(<vscale x 4 x float> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2)
  ret <vscale x 4 x float> %3
}

declare <vscale x 4 x float> @llvm.aarch64.sve.fmmla.f16f32(<vscale x 4 x float>, <vscale x 8 x half>, <vscale x 8 x half>)
Lines changed: 39 additions & 0 deletions

@@ -0,0 +1,39 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sve2,+f8f16mm < %s | FileCheck %s --check-prefixes=CHECK

define <vscale x 8 x half> @_Z5test2u13__SVFloat16_tu13__SVMfloat8_tS0_m(<vscale x 8 x half> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i64 %fpmr) {
; CHECK-LABEL: _Z5test2u13__SVFloat16_tu13__SVMfloat8_tS0_m:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-3
; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: addvl x8, sp, #3
; CHECK-NEXT: str z1, [sp, #1, mul vl]
; CHECK-NEXT: str z0, [sp, #2, mul vl]
; CHECK-NEXT: str z2, [sp]
; CHECK-NEXT: str x0, [x8, #8]
; CHECK-NEXT: msr FPMR, x0
; CHECK-NEXT: fmmla z0.h, z1.b, z2.b
; CHECK-NEXT: addvl sp, sp, #3
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
  %acc.addr = alloca <vscale x 8 x half>, align 16
  %a.addr = alloca <vscale x 16 x i8>, align 16
  %b.addr = alloca <vscale x 16 x i8>, align 16
  %fpmr.addr = alloca i64, align 8
  store <vscale x 8 x half> %acc, ptr %acc.addr, align 16
  store <vscale x 16 x i8> %a, ptr %a.addr, align 16
  store <vscale x 16 x i8> %b, ptr %b.addr, align 16
  store i64 %fpmr, ptr %fpmr.addr, align 8
  %0 = load <vscale x 8 x half>, ptr %acc.addr, align 16
  %1 = load <vscale x 16 x i8>, ptr %a.addr, align 16
  %2 = load <vscale x 16 x i8>, ptr %b.addr, align 16
  %3 = load i64, ptr %fpmr.addr, align 8
  call void @llvm.aarch64.set.fpmr(i64 %3)
  %4 = call <vscale x 8 x half> @llvm.aarch64.sve.fmmla.mf8f16(<vscale x 8 x half> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2)
  ret <vscale x 8 x half> %4
}

declare <vscale x 8 x half> @llvm.aarch64.sve.fmmla.mf8f16(<vscale x 8 x half>, <vscale x 16 x i8>, <vscale x 16 x i8>)
Lines changed: 41 additions & 0 deletions

@@ -0,0 +1,41 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sve2,+f8f32mm < %s | FileCheck %s --check-prefixes=CHECK

define dso_local <vscale x 4 x float> @_Z5t_varu13__SVFloat32_tu13__SVMfloat8_tS0_m(<vscale x 4 x float> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i64 noundef %fpmr) #0 {
; CHECK-LABEL: _Z5t_varu13__SVFloat32_tu13__SVMfloat8_tS0_m:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-3
; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: addvl x8, sp, #3
; CHECK-NEXT: str z1, [sp, #1, mul vl]
; CHECK-NEXT: str z0, [sp, #2, mul vl]
; CHECK-NEXT: str z2, [sp]
; CHECK-NEXT: str x0, [x8, #8]
; CHECK-NEXT: msr FPMR, x0
; CHECK-NEXT: fmmla z0.s, z1.b, z2.b
; CHECK-NEXT: addvl sp, sp, #3
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
  %acc.addr = alloca <vscale x 4 x float>, align 16
  %a.addr = alloca <vscale x 16 x i8>, align 16
  %b.addr = alloca <vscale x 16 x i8>, align 16
  %fpmr.addr = alloca i64, align 8
  store <vscale x 4 x float> %acc, ptr %acc.addr, align 16
  store <vscale x 16 x i8> %a, ptr %a.addr, align 16
  store <vscale x 16 x i8> %b, ptr %b.addr, align 16
  store i64 %fpmr, ptr %fpmr.addr, align 8
  %0 = load <vscale x 4 x float>, ptr %acc.addr, align 16
  %1 = load <vscale x 16 x i8>, ptr %a.addr, align 16
  %2 = load <vscale x 16 x i8>, ptr %b.addr, align 16
  %3 = load i64, ptr %fpmr.addr, align 8
  call void @llvm.aarch64.set.fpmr(i64 %3)
  %4 = call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.mf8f32(<vscale x 4 x float> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2)
  ret <vscale x 4 x float> %4
}

declare void @llvm.aarch64.set.fpmr(i64)

declare <vscale x 4 x float> @llvm.aarch64.sve.fmmla.mf8f32(<vscale x 4 x float>, <vscale x 16 x i8>, <vscale x 16 x i8>)
