Skip to content

Commit 7f2e6f1

Browse files
authored
[Clang][AArch64] Implement widening FMMLA intrinsics (#165282)
Proposed in [this ACLE proposal](ARM-software/acle#409), this PR implements widening FMMLA intrinsics. - F16 to F32 - MF8 to F32 - MF8 to F16 Additional changes: - IsOverloadCvt flag renamed to IsOverloadFirstandLast for clarity, as the name implies conversion. Implementation remains unchanged.
1 parent d984125 commit 7f2e6f1

File tree

15 files changed

+194
-25
lines changed

15 files changed

+194
-25
lines changed

clang/include/clang/Basic/TargetBuiltins.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -397,7 +397,9 @@ namespace clang {
397397
}
398398
bool isOverloadDefault() const { return !(Flags & OverloadKindMask); }
399399
bool isOverloadWhileRW() const { return Flags & IsOverloadWhileRW; }
400-
bool isOverloadCvt() const { return Flags & IsOverloadCvt; }
400+
bool isOverloadFirstandLast() const {
401+
return Flags & IsOverloadFirstandLast;
402+
}
401403
bool isPrefetch() const { return Flags & IsPrefetch; }
402404
bool isReverseCompare() const { return Flags & ReverseCompare; }
403405
bool isAppendSVALL() const { return Flags & IsAppendSVALL; }

clang/include/clang/Basic/arm_sve.td

Lines changed: 26 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -883,56 +883,56 @@ multiclass SInstCvtMX<string name, string m_types, string xz_types,
883883
}
884884

885885
// svcvt_s##_f16
886-
defm SVFCVTZS_S16_F16 : SInstCvtMXZ<"svcvt_s16[_f16]", "ddPO", "dPO", "s", "aarch64_sve_fcvtzs", [IsOverloadCvt]>;
886+
defm SVFCVTZS_S16_F16 : SInstCvtMXZ<"svcvt_s16[_f16]", "ddPO", "dPO", "s", "aarch64_sve_fcvtzs", [IsOverloadFirstandLast]>;
887887
defm SVFCVTZS_S32_F16 : SInstCvtMXZ<"svcvt_s32[_f16]", "ddPO", "dPO", "i", "aarch64_sve_fcvtzs_i32f16">;
888888
defm SVFCVTZS_S64_F16 : SInstCvtMXZ<"svcvt_s64[_f16]", "ddPO", "dPO", "l", "aarch64_sve_fcvtzs_i64f16">;
889889

890890
// svcvt_s##_f32
891-
defm SVFCVTZS_S32_F32 : SInstCvtMXZ<"svcvt_s32[_f32]", "ddPM", "dPM", "i", "aarch64_sve_fcvtzs", [IsOverloadCvt]>;
891+
defm SVFCVTZS_S32_F32 : SInstCvtMXZ<"svcvt_s32[_f32]", "ddPM", "dPM", "i", "aarch64_sve_fcvtzs", [IsOverloadFirstandLast]>;
892892
defm SVFCVTZS_S64_F32 : SInstCvtMXZ<"svcvt_s64[_f32]", "ddPM", "dPM", "l", "aarch64_sve_fcvtzs_i64f32">;
893893

894894
// svcvt_s##_f64
895895
defm SVFCVTZS_S32_F64 : SInstCvtMXZ<"svcvt_s32[_f64]", "ttPd", "tPd", "d", "aarch64_sve_fcvtzs_i32f64">;
896-
defm SVFCVTZS_S64_F64 : SInstCvtMXZ<"svcvt_s64[_f64]", "ddPN", "dPN", "l", "aarch64_sve_fcvtzs", [IsOverloadCvt]>;
896+
defm SVFCVTZS_S64_F64 : SInstCvtMXZ<"svcvt_s64[_f64]", "ddPN", "dPN", "l", "aarch64_sve_fcvtzs", [IsOverloadFirstandLast]>;
897897

898898
// svcvt_u##_f16
899-
defm SVFCVTZU_U16_F16 : SInstCvtMXZ<"svcvt_u16[_f16]", "ddPO", "dPO", "Us", "aarch64_sve_fcvtzu", [IsOverloadCvt]>;
899+
defm SVFCVTZU_U16_F16 : SInstCvtMXZ<"svcvt_u16[_f16]", "ddPO", "dPO", "Us", "aarch64_sve_fcvtzu", [IsOverloadFirstandLast]>;
900900
defm SVFCVTZU_U32_F16 : SInstCvtMXZ<"svcvt_u32[_f16]", "ddPO", "dPO", "Ui", "aarch64_sve_fcvtzu_i32f16">;
901901
defm SVFCVTZU_U64_F16 : SInstCvtMXZ<"svcvt_u64[_f16]", "ddPO", "dPO", "Ul", "aarch64_sve_fcvtzu_i64f16">;
902902

903903
// svcvt_u##_f32
904-
defm SVFCVTZU_U32_F32 : SInstCvtMXZ<"svcvt_u32[_f32]", "ddPM", "dPM", "Ui", "aarch64_sve_fcvtzu", [IsOverloadCvt]>;
904+
defm SVFCVTZU_U32_F32 : SInstCvtMXZ<"svcvt_u32[_f32]", "ddPM", "dPM", "Ui", "aarch64_sve_fcvtzu", [IsOverloadFirstandLast]>;
905905
defm SVFCVTZU_U64_F32 : SInstCvtMXZ<"svcvt_u64[_f32]", "ddPM", "dPM", "Ul", "aarch64_sve_fcvtzu_i64f32">;
906906

907907
// svcvt_u##_f64
908908
defm SVFCVTZU_U32_F64 : SInstCvtMXZ<"svcvt_u32[_f64]", "zzPd", "zPd", "d", "aarch64_sve_fcvtzu_i32f64">;
909-
defm SVFCVTZU_U64_F64 : SInstCvtMXZ<"svcvt_u64[_f64]", "ddPN", "dPN", "Ul", "aarch64_sve_fcvtzu", [IsOverloadCvt]>;
909+
defm SVFCVTZU_U64_F64 : SInstCvtMXZ<"svcvt_u64[_f64]", "ddPN", "dPN", "Ul", "aarch64_sve_fcvtzu", [IsOverloadFirstandLast]>;
910910

911911
// svcvt_f16_s##
912-
defm SVFCVTZS_F16_S16 : SInstCvtMXZ<"svcvt_f16[_s16]", "OOPd", "OPd", "s", "aarch64_sve_scvtf", [IsOverloadCvt]>;
912+
defm SVFCVTZS_F16_S16 : SInstCvtMXZ<"svcvt_f16[_s16]", "OOPd", "OPd", "s", "aarch64_sve_scvtf", [IsOverloadFirstandLast]>;
913913
defm SVFCVTZS_F16_S32 : SInstCvtMXZ<"svcvt_f16[_s32]", "OOPd", "OPd", "i", "aarch64_sve_scvtf_f16i32">;
914914
defm SVFCVTZS_F16_S64 : SInstCvtMXZ<"svcvt_f16[_s64]", "OOPd", "OPd", "l", "aarch64_sve_scvtf_f16i64">;
915915

916916
// svcvt_f32_s##
917-
defm SVFCVTZS_F32_S32 : SInstCvtMXZ<"svcvt_f32[_s32]", "MMPd", "MPd", "i", "aarch64_sve_scvtf", [IsOverloadCvt]>;
917+
defm SVFCVTZS_F32_S32 : SInstCvtMXZ<"svcvt_f32[_s32]", "MMPd", "MPd", "i", "aarch64_sve_scvtf", [IsOverloadFirstandLast]>;
918918
defm SVFCVTZS_F32_S64 : SInstCvtMXZ<"svcvt_f32[_s64]", "MMPd", "MPd", "l", "aarch64_sve_scvtf_f32i64">;
919919

920920
// svcvt_f64_s##
921921
defm SVFCVTZS_F64_S32 : SInstCvtMXZ<"svcvt_f64[_s32]", "ddPt", "dPt", "d", "aarch64_sve_scvtf_f64i32">;
922-
defm SVFCVTZS_F64_S64 : SInstCvtMXZ<"svcvt_f64[_s64]", "NNPd", "NPd", "l", "aarch64_sve_scvtf", [IsOverloadCvt]>;
922+
defm SVFCVTZS_F64_S64 : SInstCvtMXZ<"svcvt_f64[_s64]", "NNPd", "NPd", "l", "aarch64_sve_scvtf", [IsOverloadFirstandLast]>;
923923

924924
// svcvt_f16_u##
925-
defm SVFCVTZU_F16_U16 : SInstCvtMXZ<"svcvt_f16[_u16]", "OOPd", "OPd", "Us", "aarch64_sve_ucvtf", [IsOverloadCvt]>;
925+
defm SVFCVTZU_F16_U16 : SInstCvtMXZ<"svcvt_f16[_u16]", "OOPd", "OPd", "Us", "aarch64_sve_ucvtf", [IsOverloadFirstandLast]>;
926926
defm SVFCVTZU_F16_U32 : SInstCvtMXZ<"svcvt_f16[_u32]", "OOPd", "OPd", "Ui", "aarch64_sve_ucvtf_f16i32">;
927927
defm SVFCVTZU_F16_U64 : SInstCvtMXZ<"svcvt_f16[_u64]", "OOPd", "OPd", "Ul", "aarch64_sve_ucvtf_f16i64">;
928928

929929
// svcvt_f32_u##
930-
defm SVFCVTZU_F32_U32 : SInstCvtMXZ<"svcvt_f32[_u32]", "MMPd", "MPd", "Ui", "aarch64_sve_ucvtf", [IsOverloadCvt]>;
930+
defm SVFCVTZU_F32_U32 : SInstCvtMXZ<"svcvt_f32[_u32]", "MMPd", "MPd", "Ui", "aarch64_sve_ucvtf", [IsOverloadFirstandLast]>;
931931
defm SVFCVTZU_F32_U64 : SInstCvtMXZ<"svcvt_f32[_u64]", "MMPd", "MPd", "Ul", "aarch64_sve_ucvtf_f32i64">;
932932

933933
// svcvt_f64_u##
934934
defm SVFCVTZU_F64_U32 : SInstCvtMXZ<"svcvt_f64[_u32]", "ddPz", "dPz", "d", "aarch64_sve_ucvtf_f64i32">;
935-
defm SVFCVTZU_F64_U64 : SInstCvtMXZ<"svcvt_f64[_u64]", "NNPd", "NPd", "Ul", "aarch64_sve_ucvtf", [IsOverloadCvt]>;
935+
defm SVFCVTZU_F64_U64 : SInstCvtMXZ<"svcvt_f64[_u64]", "NNPd", "NPd", "Ul", "aarch64_sve_ucvtf", [IsOverloadFirstandLast]>;
936936

937937
// svcvt_f16_f##
938938
defm SVFCVT_F16_F32 : SInstCvtMXZ<"svcvt_f16[_f32]", "OOPd", "OPd", "f", "aarch64_sve_fcvt_f16f32">;
@@ -1190,11 +1190,23 @@ def SVSUDOT_LANE_S : SInst<"svsudot_lane[_s32]", "ddqbi", "i", MergeNone, "aarc
11901190
}
11911191

11921192
let SVETargetGuard = "f32mm", SMETargetGuard = InvalidMode in {
1193-
def SVMLLA_F32 : SInst<"svmmla[_f32]", "dddd","f", MergeNone, "aarch64_sve_fmmla">;
1193+
def SVMLLA_F32 : SInst<"svmmla[_f32]", "dddd","f", MergeNone, "aarch64_sve_fmmla", [IsOverloadFirstandLast]>;
11941194
}
11951195

11961196
let SVETargetGuard = "f64mm", SMETargetGuard = InvalidMode in {
1197-
def SVMLLA_F64 : SInst<"svmmla[_f64]", "dddd", "d", MergeNone, "aarch64_sve_fmmla">;
1197+
def SVMLLA_F64 : SInst<"svmmla[_f64]", "dddd", "d", MergeNone, "aarch64_sve_fmmla", [IsOverloadFirstandLast]>;
1198+
1199+
let SVETargetGuard = "sve-f16f32mm", SMETargetGuard = InvalidMode in {
1200+
def SVMLLA_F32_F16 : SInst<"svmmla[_f32_f16]", "ddhh", "f", MergeNone, "aarch64_sve_fmmla", [IsOverloadFirstandLast]>;
1201+
}
1202+
1203+
let SVETargetGuard = "sve2,f8f32mm", SMETargetGuard = InvalidMode in {
1204+
def SVMLLA_F32_MF8 : SInst<"svmmla[_f32_mf8]", "dd~~>", "f", MergeNone, "aarch64_sve_fp8_fmmla">;
1205+
}
1206+
1207+
let SVETargetGuard = "sve2,f8f16mm", SMETargetGuard = InvalidMode in {
1208+
def SVMLLA_F16_MF8 : SInst<"svmmla[_f16_mf8]", "dd~~>", "h", MergeNone, "aarch64_sve_fp8_fmmla">;
1209+
}
11981210

11991211
def SVTRN1Q : SInst<"svtrn1q[_{d}]", "ddd", "csilUcUsUiUlhfdb", MergeNone, "aarch64_sve_trn1q">;
12001212
def SVTRN2Q : SInst<"svtrn2q[_{d}]", "ddd", "csilUcUsUiUlhfdb", MergeNone, "aarch64_sve_trn2q">;

clang/include/clang/Basic/arm_sve_sme_incl.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -214,7 +214,7 @@ def IsZExtReturn : FlagType<0x00080000>; // Return value is s
214214
def IsOverloadNone : FlagType<0x00100000>; // Intrinsic does not take any overloaded types.
215215
def IsOverloadWhileOrMultiVecCvt : FlagType<0x00200000>; // Use {default type, typeof(operand1)} as overloaded types.
216216
def IsOverloadWhileRW : FlagType<0x00400000>; // Use {pred(default type), typeof(operand0)} as overloaded types.
217-
def IsOverloadCvt : FlagType<0x00800000>; // Use {typeof(operand0), typeof(last operand)} as overloaded types.
217+
def IsOverloadFirstandLast : FlagType<0x00800000>; // Use {typeof(operand0), typeof(last operand)} as overloaded types.
218218
def OverloadKindMask : FlagType<0x00E00000>; // When the masked values are all '0', the default type is used as overload type.
219219
def IsByteIndexed : FlagType<0x01000000>;
220220
def IsAppendSVALL : FlagType<0x02000000>; // Appends SV_ALL as the last operand.

clang/lib/CodeGen/TargetBuiltins/ARM.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4473,7 +4473,7 @@ CodeGenFunction::getSVEOverloadTypes(const SVETypeFlags &TypeFlags,
44734473
if (TypeFlags.isOverloadWhileRW())
44744474
return {getSVEPredType(TypeFlags), Ops[0]->getType()};
44754475

4476-
if (TypeFlags.isOverloadCvt())
4476+
if (TypeFlags.isOverloadFirstandLast())
44774477
return {Ops[0]->getType(), Ops.back()->getType()};
44784478

44794479
if (TypeFlags.isReductionQV() && !ResultType->isScalableTy() &&
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
2+
// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve-f16f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
3+
// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve-f16f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
4+
// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve-f16f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
5+
// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve-f16f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
6+
// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve-f16f32mm -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
7+
8+
// REQUIRES: aarch64-registered-target
9+
10+
#include <arm_sve.h>
11+
12+
#ifdef SVE_OVERLOADED_FORMS
13+
#define SVE_ACLE_FUNC(A1, A3) A1##A3
14+
#else
15+
#define SVE_ACLE_FUNC(A1, A2) A1##A2
16+
#endif
17+
18+
19+
// CHECK-LABEL: define dso_local <vscale x 4 x float> @test_f32f16(
20+
// CHECK-SAME: <vscale x 4 x float> [[ACC:%.*]], <vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) #[[ATTR0:[0-9]+]] {
21+
// CHECK-NEXT: [[ENTRY:.*:]]
22+
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.nxv4f32.nxv8f16(<vscale x 4 x float> [[ACC]], <vscale x 8 x half> [[A]], <vscale x 8 x half> [[B]])
23+
// CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
24+
//
25+
// CPP-CHECK-LABEL: define dso_local <vscale x 4 x float> @_Z11test_f32f16u13__SVFloat32_tu13__SVFloat16_tS0_(
26+
// CPP-CHECK-SAME: <vscale x 4 x float> [[ACC:%.*]], <vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) #[[ATTR0:[0-9]+]] {
27+
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
28+
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.nxv4f32.nxv8f16(<vscale x 4 x float> [[ACC]], <vscale x 8 x half> [[A]], <vscale x 8 x half> [[B]])
29+
// CPP-CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
30+
//
31+
svfloat32_t test_f32f16(svfloat32_t acc, svfloat16_t a, svfloat16_t b) {
32+
return SVE_ACLE_FUNC(svmmla, _f32_f16)(acc, a, b);
33+
}

clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_matmul_fp32.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,12 @@
1717

1818
// CHECK-LABEL: @test_svmmla_f32(
1919
// CHECK-NEXT: entry:
20-
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.nxv4f32(<vscale x 4 x float> [[X:%.*]], <vscale x 4 x float> [[Y:%.*]], <vscale x 4 x float> [[Z:%.*]])
20+
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.nxv4f32.nxv4f32(<vscale x 4 x float> [[X:%.*]], <vscale x 4 x float> [[Y:%.*]], <vscale x 4 x float> [[Z:%.*]])
2121
// CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
2222
//
2323
// CPP-CHECK-LABEL: @_Z15test_svmmla_f32u13__SVFloat32_tS_S_(
2424
// CPP-CHECK-NEXT: entry:
25-
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.nxv4f32(<vscale x 4 x float> [[X:%.*]], <vscale x 4 x float> [[Y:%.*]], <vscale x 4 x float> [[Z:%.*]])
25+
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.nxv4f32.nxv4f32(<vscale x 4 x float> [[X:%.*]], <vscale x 4 x float> [[Y:%.*]], <vscale x 4 x float> [[Z:%.*]])
2626
// CPP-CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
2727
//
2828
svfloat32_t test_svmmla_f32(svfloat32_t x, svfloat32_t y, svfloat32_t z) {

clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_matmul_fp64.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,12 @@
1717

1818
// CHECK-LABEL: @test_svmmla_f64(
1919
// CHECK-NEXT: entry:
20-
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.fmmla.nxv2f64(<vscale x 2 x double> [[X:%.*]], <vscale x 2 x double> [[Y:%.*]], <vscale x 2 x double> [[Z:%.*]])
20+
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.fmmla.nxv2f64.nxv2f64(<vscale x 2 x double> [[X:%.*]], <vscale x 2 x double> [[Y:%.*]], <vscale x 2 x double> [[Z:%.*]])
2121
// CHECK-NEXT: ret <vscale x 2 x double> [[TMP0]]
2222
//
2323
// CPP-CHECK-LABEL: @_Z15test_svmmla_f64u13__SVFloat64_tS_S_(
2424
// CPP-CHECK-NEXT: entry:
25-
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.fmmla.nxv2f64(<vscale x 2 x double> [[X:%.*]], <vscale x 2 x double> [[Y:%.*]], <vscale x 2 x double> [[Z:%.*]])
25+
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.fmmla.nxv2f64.nxv2f64(<vscale x 2 x double> [[X:%.*]], <vscale x 2 x double> [[Y:%.*]], <vscale x 2 x double> [[Z:%.*]])
2626
// CPP-CHECK-NEXT: ret <vscale x 2 x double> [[TMP0]]
2727
//
2828
svfloat64_t test_svmmla_f64(svfloat64_t x, svfloat64_t y, svfloat64_t z) {
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
2+
// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f16mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
3+
// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f16mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
4+
// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f16mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
5+
// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f16mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
6+
// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f16mm -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
7+
8+
// REQUIRES: aarch64-registered-target
9+
10+
#include <arm_sve.h>
11+
12+
#ifdef SVE_OVERLOADED_FORMS
13+
// A simple used,unused... macro, long enough to represent any SVE builtin.
14+
#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3) A1##A3
15+
#else
16+
#define SVE_ACLE_FUNC(A1, A2, A3) A1##A2##A3
17+
#endif
18+
19+
// CHECK-LABEL: define dso_local <vscale x 8 x half> @test_f16mf8(
20+
// CHECK-SAME: <vscale x 8 x half> [[ACC:%.*]], <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
21+
// CHECK-NEXT: [[ENTRY:.*:]]
22+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
23+
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fmmla.nxv8f16(<vscale x 8 x half> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
24+
// CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
25+
//
26+
// CPP-CHECK-LABEL: define dso_local <vscale x 8 x half> @_Z11test_f16mf8u13__SVFloat16_tu13__SVMfloat8_tS0_m(
27+
// CPP-CHECK-SAME: <vscale x 8 x half> [[ACC:%.*]], <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
28+
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
29+
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
30+
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fmmla.nxv8f16(<vscale x 8 x half> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
31+
// CPP-CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
32+
//
33+
svfloat16_t test_f16mf8(svfloat16_t acc, svmfloat8_t a, svmfloat8_t b, fpm_t fpmr) {
34+
return SVE_ACLE_FUNC(svmmla, _f16_mf8, _fpm)(acc, a, b, fpmr);
35+
}
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
2+
3+
// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
4+
// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
5+
// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
6+
// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
7+
// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f32mm -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
8+
9+
// REQUIRES: aarch64-registered-target
10+
11+
#include <arm_sve.h>
12+
13+
#ifdef SVE_OVERLOADED_FORMS
14+
// A simple used,unused... macro, long enough to represent any SVE builtin.
15+
#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3) A1##A3
16+
#else
17+
#define SVE_ACLE_FUNC(A1, A2, A3) A1##A2##A3
18+
#endif
19+
20+
// CHECK-LABEL: define dso_local <vscale x 4 x float> @test_f32mf8(
21+
// CHECK-SAME: <vscale x 4 x float> [[ACC:%.*]], <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
22+
// CHECK-NEXT: [[ENTRY:.*:]]
23+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
24+
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmmla.nxv4f32(<vscale x 4 x float> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
25+
// CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
26+
//
27+
// CPP-CHECK-LABEL: define dso_local <vscale x 4 x float> @_Z11test_f32mf8u13__SVFloat32_tu13__SVMfloat8_tS0_m(
28+
// CPP-CHECK-SAME: <vscale x 4 x float> [[ACC:%.*]], <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
29+
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
30+
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
31+
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmmla.nxv4f32(<vscale x 4 x float> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
32+
// CPP-CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
33+
//
34+
svfloat32_t test_f32mf8(svfloat32_t acc, svmfloat8_t a, svmfloat8_t b, fpm_t fpmr) {
35+
return SVE_ACLE_FUNC(svmmla, _f32_mf8, _fpm)(acc, a, b, fpmr);
36+
}

llvm/include/llvm/IR/IntrinsicsAArch64.td

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2797,7 +2797,16 @@ def int_aarch64_sve_sudot_lane : AdvSIMD_SVE_DOT_Indexed_Intrinsic<[IntrSpeculat
27972797
//
27982798
// SVE ACLE: 7.4/5. FP64/FP32 matrix multiply extensions
27992799
//
2800-
def int_aarch64_sve_fmmla : AdvSIMD_3VectorArg_Intrinsic;
2800+
2801+
def int_aarch64_sve_fmmla
2802+
: DefaultAttrsIntrinsic<[llvm_anyvector_ty],
2803+
[LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>],
2804+
[IntrNoMem]>;
2805+
2806+
def int_aarch64_sve_fp8_fmmla
2807+
: DefaultAttrsIntrinsic<[llvm_anyvector_ty],
2808+
[LLVMMatchType<0>, llvm_nxv16i8_ty, llvm_nxv16i8_ty],
2809+
[IntrReadMem, IntrInaccessibleMemOnly]>;
28012810

28022811
//
28032812
// SVE ACLE: 7.2. BFloat16 extensions

0 commit comments

Comments
 (0)