diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index ef196103035e8..8e2174c880ed8 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -1896,6 +1896,14 @@ let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "f
   def VFMLSL_LANEQ_HIGH : SOpInst<"vfmlsl_laneq_high", "(F>)(F>)F(FQ)I", "hQh", OP_FMLSL_LN_Hi>;
 }
 
+let ArchGuard = "defined(__aarch64__)", TargetGuard = "f8f16mm,neon" in {
+  def VMMLA_F16_MF8 : VInst<"vmmla_f16_mf8_fpm", "(>F)(>F)..V", "Qm">;
+}
+
+let ArchGuard = "defined(__aarch64__)", TargetGuard = "f8f32mm,neon" in {
+  def VMMLA_F32_MF8 : VInst<"vmmla_f32_mf8_fpm", "(>>F)(>>F)..V", "Qm">;
+}
+
 let TargetGuard = "i8mm,neon" in {
   def VMMLA   : SInst<"vmmla", "..(<<)(<<)", "QUiQi">;
   def VUSMMLA : SInst<"vusmmla", "..(<<U)(<<)", "Qi">;
diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
index 15fa78ddba715..5421d46377e66 100644
--- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
@@ -7624,6 +7624,16 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     Int = Intrinsic::aarch64_neon_vluti4q_laneq_x2;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vluti4q_laneq_x2");
   }
+  case NEON::BI__builtin_neon_vmmlaq_f16_mf8_fpm:
+    return EmitFP8NeonCall(Intrinsic::aarch64_neon_fmmla,
+                           {llvm::FixedVectorType::get(HalfTy, 8),
+                            llvm::FixedVectorType::get(Int8Ty, 16)},
+                           Ops, E, "fmmla");
+  case NEON::BI__builtin_neon_vmmlaq_f32_mf8_fpm:
+    return EmitFP8NeonCall(Intrinsic::aarch64_neon_fmmla,
+                           {llvm::FixedVectorType::get(FloatTy, 4),
+                            llvm::FixedVectorType::get(Int8Ty, 16)},
+                           Ops, E, "fmmla");
   case NEON::BI__builtin_neon_vcvt1_low_bf16_mf8_fpm:
     ExtractLow = true;
     [[fallthrough]];
diff --git a/clang/test/CodeGen/AArch64/v9.6a-neon-f16-intrinsics.c b/clang/test/CodeGen/AArch64/v9.6a-neon-f16-intrinsics.c
new file mode 100644
index 0000000000000..89ee9e38bb3fb
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/v9.6a-neon-f16-intrinsics.c
@@ -0,0 +1,23 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +v9.6a -target-feature +f8f16mm -target-feature +fp8 \
+// RUN:   -disable-O0-optnone -emit-llvm -o - %s \
+// RUN: | opt -S -passes=mem2reg,sroa \
+// RUN: | FileCheck %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_neon.h>
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vmmlaq_f16_mf8(
+// CHECK-SAME: <8 x half> noundef [[P0:%.*]], <16 x i8> [[P1:%.*]], <16 x i8> [[P2:%.*]], i64 noundef [[P3:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[P0]] to <8 x i16>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[P3]])
+// CHECK-NEXT:    [[FMMLA_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT:    [[FMMLA1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmmla.v8f16.v16i8(<8 x half> [[FMMLA_I]], <16 x i8> [[P1]], <16 x i8> [[P2]])
+// CHECK-NEXT:    ret <8 x half> [[FMMLA1_I]]
+//
+float16x8_t test_vmmlaq_f16_mf8(float16x8_t p0, mfloat8x16_t p1, mfloat8x16_t p2, fpm_t p3) {
+  return vmmlaq_f16_mf8_fpm(p0, p1, p2, p3);
+}
diff --git a/clang/test/CodeGen/AArch64/v9.6a-neon-f32-intrinsics.c b/clang/test/CodeGen/AArch64/v9.6a-neon-f32-intrinsics.c
new file mode 100644
index 0000000000000..13db72c2cbdd1
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/v9.6a-neon-f32-intrinsics.c
@@ -0,0 +1,21 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +v9.6a -target-feature +f8f32mm -target-feature +fp8 \
+// RUN:   -disable-O0-optnone -emit-llvm -o - %s \
+// RUN: | opt -S -passes=mem2reg,sroa \
+// RUN: | FileCheck %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_neon.h>
+
+// CHECK-LABEL: define dso_local <4 x float> @test_vmmlaq_f32_mf8(
+// CHECK-SAME: <4 x float> noundef [[P0:%.*]], <16 x i8> [[P1:%.*]], <16 x i8> [[P2:%.*]], i64 noundef [[P3:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[P3]])
+// CHECK-NEXT:    [[FMMLA_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmmla.v4f32.v16i8(<4 x float> [[P0]], <16 x i8> [[P1]], <16 x i8> [[P2]])
+// CHECK-NEXT:    ret <4 x float> [[FMMLA_I]]
+//
+float32x4_t test_vmmlaq_f32_mf8(float32x4_t p0, mfloat8x16_t p1, mfloat8x16_t p2, fpm_t p3) {
+  return vmmlaq_f32_mf8_fpm(p0, p1, p2, p3);
+}
+
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index b81edc385cd43..4cab6e05ba79f 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -499,6 +499,7 @@ let TargetPrefix = "aarch64" in {
   def int_aarch64_neon_ummla : AdvSIMD_MatMul_Intrinsic;
   def int_aarch64_neon_smmla : AdvSIMD_MatMul_Intrinsic;
   def int_aarch64_neon_usmmla : AdvSIMD_MatMul_Intrinsic;
+  def int_aarch64_neon_fmmla : AdvSIMD_MatMul_Intrinsic;
   def int_aarch64_neon_usdot : AdvSIMD_Dot_Intrinsic;
   def int_aarch64_neon_bfdot : AdvSIMD_Dot_Intrinsic;
   def int_aarch64_neon_bfmmla
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 58a53af76e1b5..bb2f083db19ef 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -13292,18 +13292,24 @@ multiclass AtomicFPStore<bit R, bits<3> op0, string asm> {
   def H : BaseAtomicFPStore<FPR16, 0b01, R, op0, asm>;
 }
 
-class BaseSIMDThreeSameVectorFP8MatrixMul<string asm, bits<2> size, string kind>
+class BaseSIMDThreeSameVectorFP8MatrixMul<string asm, bits<2> size, string kind, list<dag> pattern>
   : BaseSIMDThreeSameVectorTied<1, 1, {size, 0}, 0b11101,
-                                V128, asm, ".16b", []> {
+                                V128, asm, ".16b", pattern> {
   let AsmString = !strconcat(asm, "{\t$Rd", kind, ", $Rn.16b, $Rm.16b",
                              "|", kind, "\t$Rd, $Rn, $Rm}");
 }
 
-multiclass SIMDThreeSameVectorFP8MatrixMul<string asm>{
-  def v8f16: BaseSIMDThreeSameVectorFP8MatrixMul<asm, 0b00, ".8h">{
+multiclass SIMDThreeSameVectorFP8MatrixMul<string asm, SDPatternOperator OpNode>{
+  def v8f16: BaseSIMDThreeSameVectorFP8MatrixMul<asm, 0b00, ".8h",
+               [(set (v8f16 V128:$dst),
+                     (OpNode (v8f16 V128:$Rd), (v16i8 V128:$Rn),
+                             (v16i8 V128:$Rm)))]> {
     let Predicates = [HasNEON, HasF8F16MM];
   }
-  def v4f32: BaseSIMDThreeSameVectorFP8MatrixMul<asm, 0b01, ".4s">{
+  def v4f32: BaseSIMDThreeSameVectorFP8MatrixMul<asm, 0b01, ".4s",
+               [(set (v4f32 V128:$dst),
+                     (OpNode (v4f32 V128:$Rd), (v16i8 V128:$Rn),
+                             (v16i8 V128:$Rm)))]> {
    let Predicates = [HasNEON, HasF8F32MM];
   }
 }
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index e6954f75b1a6a..76f076a60765f 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -11417,7 +11417,7 @@ let Predicates = [HasF16F32MM] in
   defm FMMLA : SIMDThreeSameVectorFMLAWiden<"fmmla">;
 
 let Uses = [FPMR, FPCR] in
-  defm FMMLA : SIMDThreeSameVectorFP8MatrixMul<"fmmla">;
+  defm FMMLA : SIMDThreeSameVectorFP8MatrixMul<"fmmla", int_aarch64_neon_fmmla>;
 //===----------------------------------------------------------------------===//
 // Contention Management Hints (FEAT_CMH)
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matmul-fp16.ll b/llvm/test/CodeGen/AArch64/aarch64-matmul-fp16.ll
new file mode 100644
index 0000000000000..8d1abdd5380db
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-matmul-fp16.ll
@@ -0,0 +1,14 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+f8f16mm < %s | FileCheck %s
+; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+f8f16mm -global-isel < %s | FileCheck %s
+
+define <8 x half> @fmmla.v8f16.v16i8(<8 x half> %r, <16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: fmmla.v8f16.v16i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmmla v0.8h, v1.16b, v2.16b
+; CHECK-NEXT:    ret
+entry:
+  %vfmmla1.i = tail call <8 x half> @llvm.aarch64.neon.fmmla.v8f16.v16i8(<8 x half> %r, <16 x i8> %a, <16 x i8> %b) #3
+  ret <8 x half> %vfmmla1.i
+}
+
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matmul-fp32.ll b/llvm/test/CodeGen/AArch64/aarch64-matmul-fp32.ll
new file mode 100644
index 0000000000000..4c33567732687
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-matmul-fp32.ll
@@ -0,0 +1,13 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+f8f32mm < %s | FileCheck %s
+; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+f8f32mm -global-isel < %s | FileCheck %s
+
+define <4 x float> @fmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: fmmla.v4f32.v16i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmmla v0.4s, v1.16b, v2.16b
+; CHECK-NEXT:    ret
+entry:
+  %vfmmla1.i = tail call <4 x float> @llvm.aarch64.neon.fmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %a, <16 x i8> %b) #3
+  ret <4 x float> %vfmmla1.i
+}
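
Reviewer note (not part of the patch): a minimal usage sketch of the two new ACLE intrinsics, assuming a compiler that includes this change and a target with +fp8 plus +f8f16mm/+f8f32mm. The helper names (f8f16_block_dot, f8f32_step) and the way the fpm_t value is produced are illustrative only; the intrinsic names and signatures are exactly the ones added in arm_neon.td above.

#include <arm_neon.h>
#include <stddef.h>

// Hypothetical micro-kernel: accumulate a row of FP8 blocks into an f16 tile.
// The fpm argument carries the FPMR configuration (FP8 formats/scaling); how
// it is constructed is outside the scope of this patch.
static float16x8_t f8f16_block_dot(const mfloat8x16_t *a, const mfloat8x16_t *b,
                                   size_t nblocks, fpm_t fpm) {
  float16x8_t acc = vdupq_n_f16(0);                    // zeroed accumulator tile
  for (size_t i = 0; i < nblocks; ++i)
    acc = vmmlaq_f16_mf8_fpm(acc, a[i], b[i], fpm);    // FMMLA Vd.8H, Vn.16B, Vm.16B
  return acc;
}

// Single-step f32 variant, mirroring the f32 CodeGen test above.
static float32x4_t f8f32_step(float32x4_t acc, mfloat8x16_t a, mfloat8x16_t b,
                              fpm_t fpm) {
  return vmmlaq_f32_mf8_fpm(acc, a, b, fpm);           // FMMLA Vd.4S, Vn.16B, Vm.16B
}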