[AArch64][llvm] Add support for Neon vmmlaq_{f16,f32}_mf8_fpm intrinsics (#165431)

jthackray · web-flow · commit 7377ac037dc5 · 2025-11-07T15:24:13.000Z
Add support for the following new AArch64 Neon intrinsics:
```
float16x8_t vmmlaq_f16_mf8_fpm(float16x8_t, mfloat8x16_t, mfloat8x16_t, fpm_t);
float32x4_t vmmlaq_f32_mf8_fpm(float32x4_t, mfloat8x16_t, mfloat8x16_t, fpm_t);
```
diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
@@ -1921,6 +1921,14 @@ let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "f
   def VFMLSL_LANEQ_HIGH : SOpInst<"vfmlsl_laneq_high", "(F>)(F>)F(FQ)I", "hQh", OP_FMLSL_LN_Hi>;
 }
 
+let ArchGuard = "defined(__aarch64__)", TargetGuard = "f8f16mm,neon" in {
+  def VMMLA_F16_MF8 : VInst<"vmmla_f16_mf8_fpm", "(>F)(>F)..V", "Qm">;
+}
+
+let ArchGuard = "defined(__aarch64__)", TargetGuard = "f8f32mm,neon" in {
+  def VMMLA_F32_MF8 : VInst<"vmmla_f32_mf8_fpm", "(>>F)(>>F)..V", "Qm">;
+}
+
 let TargetGuard = "i8mm,neon" in {
   def VMMLA   : SInst<"vmmla", "..(<<)(<<)", "QUiQi">;
   def VUSMMLA : SInst<"vusmmla", "..(<<U)(<<)", "Qi">;
diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
@@ -7646,6 +7646,16 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     Int = Intrinsic::aarch64_neon_vluti4q_laneq_x2;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vluti4q_laneq_x2");
   }
+  case NEON::BI__builtin_neon_vmmlaq_f16_mf8_fpm:
+    return EmitFP8NeonCall(Intrinsic::aarch64_neon_fmmla,
+                           {llvm::FixedVectorType::get(HalfTy, 8),
+                            llvm::FixedVectorType::get(Int8Ty, 16)},
+                           Ops, E, "fmmla");
+  case NEON::BI__builtin_neon_vmmlaq_f32_mf8_fpm:
+    return EmitFP8NeonCall(Intrinsic::aarch64_neon_fmmla,
+                           {llvm::FixedVectorType::get(FloatTy, 4),
+                            llvm::FixedVectorType::get(Int8Ty, 16)},
+                           Ops, E, "fmmla");
   case NEON::BI__builtin_neon_vcvt1_low_bf16_mf8_fpm:
     ExtractLow = true;
     [[fallthrough]];
diff --git a/clang/test/CodeGen/AArch64/v9.6a-neon-f16-intrinsics.c b/clang/test/CodeGen/AArch64/v9.6a-neon-f16-intrinsics.c
@@ -0,0 +1,23 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +v9.6a -target-feature +f8f16mm -target-feature +fp8 \
+// RUN: -disable-O0-optnone -emit-llvm -o - %s \
+// RUN: | opt -S -passes=mem2reg,sroa \
+// RUN: | FileCheck %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_neon.h>
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vmmlaq_f16_mf8(
+// CHECK-SAME: <8 x half> noundef [[P0:%.*]], <16 x i8> [[P1:%.*]], <16 x i8> [[P2:%.*]], i64 noundef [[P3:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[P0]] to <8 x i16>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[P3]])
+// CHECK-NEXT:    [[FMMLA_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT:    [[FMMLA1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmmla.v8f16.v16i8(<8 x half> [[FMMLA_I]], <16 x i8> [[P1]], <16 x i8> [[P2]])
+// CHECK-NEXT:    ret <8 x half> [[FMMLA1_I]]
+//
+float16x8_t test_vmmlaq_f16_mf8(float16x8_t p0, mfloat8x16_t p1, mfloat8x16_t p2, fpm_t p3) {
+  return vmmlaq_f16_mf8_fpm(p0, p1, p2, p3);
+}
diff --git a/clang/test/CodeGen/AArch64/v9.6a-neon-f32-intrinsics.c b/clang/test/CodeGen/AArch64/v9.6a-neon-f32-intrinsics.c
@@ -0,0 +1,21 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +v9.6a -target-feature +f8f32mm -target-feature +fp8 \
+// RUN: -disable-O0-optnone -emit-llvm -o - %s \
+// RUN: | opt -S -passes=mem2reg,sroa \
+// RUN: | FileCheck %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_neon.h>
+
+// CHECK-LABEL: define dso_local <4 x float> @test_vmmlaq_f32_mf8(
+// CHECK-SAME: <4 x float> noundef [[P0:%.*]], <16 x i8> [[P1:%.*]], <16 x i8> [[P2:%.*]], i64 noundef [[P3:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[P3]])
+// CHECK-NEXT:    [[FMMLA_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmmla.v4f32.v16i8(<4 x float> [[P0]], <16 x i8> [[P1]], <16 x i8> [[P2]])
+// CHECK-NEXT:    ret <4 x float> [[FMMLA_I]]
+//
+float32x4_t test_vmmlaq_f32_mf8(float32x4_t p0, mfloat8x16_t p1, mfloat8x16_t p2, fpm_t p3) {
+  return vmmlaq_f32_mf8_fpm(p0, p1, p2, p3);
+}
+
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -499,6 +499,7 @@ let TargetPrefix = "aarch64" in {
   def int_aarch64_neon_ummla : AdvSIMD_MatMul_Intrinsic;
   def int_aarch64_neon_smmla : AdvSIMD_MatMul_Intrinsic;
   def int_aarch64_neon_usmmla : AdvSIMD_MatMul_Intrinsic;
+  def int_aarch64_neon_fmmla : AdvSIMD_MatMul_Intrinsic;
   def int_aarch64_neon_usdot : AdvSIMD_Dot_Intrinsic;
   def int_aarch64_neon_bfdot : AdvSIMD_Dot_Intrinsic;
   def int_aarch64_neon_bfmmla
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -13292,18 +13292,24 @@ multiclass AtomicFPStore<bit R, bits<3> op0, string asm> {
   def H : BaseAtomicFPStore<FPR16, 0b01, R, op0, asm>;
 }
 
-class BaseSIMDThreeSameVectorFP8MatrixMul<string asm, bits<2> size, string kind>
+class BaseSIMDThreeSameVectorFP8MatrixMul<string asm, bits<2> size, string kind, list<dag> pattern>
   : BaseSIMDThreeSameVectorTied<1, 1, {size, 0}, 0b11101,
-                                V128, asm, ".16b", []> {
+                                V128, asm, ".16b", pattern> {
   let AsmString = !strconcat(asm, "{\t$Rd", kind, ", $Rn.16b, $Rm.16b",
                                    "|", kind, "\t$Rd, $Rn, $Rm}");
 }
 
-multiclass SIMDThreeSameVectorFP8MatrixMul<string asm>{
-    def v8f16: BaseSIMDThreeSameVectorFP8MatrixMul<asm, 0b00, ".8h">{
+multiclass SIMDThreeSameVectorFP8MatrixMul<string asm, SDPatternOperator OpNode>{
+    def v8f16: BaseSIMDThreeSameVectorFP8MatrixMul<asm, 0b00, ".8h",
+              [(set (v8f16 V128:$dst), (OpNode (v8f16 V128:$Rd),
+                                               (v16i8 V128:$Rn),
+                                               (v16i8 V128:$Rm)))]> {
       let Predicates = [HasNEON, HasF8F16MM];
     }
-    def v4f32: BaseSIMDThreeSameVectorFP8MatrixMul<asm, 0b10, ".4s">{
+    def v4f32: BaseSIMDThreeSameVectorFP8MatrixMul<asm, 0b10, ".4s",
+              [(set (v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd),
+                                               (v16i8 V128:$Rn),
+                                               (v16i8 V128:$Rm)))]> {
       let Predicates = [HasNEON, HasF8F32MM];
     }
 }
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -11417,7 +11417,7 @@ let Predicates = [HasF16F32MM] in
   defm FMMLA : SIMDThreeSameVectorFMLAWiden<"fmmla">;
 
 let Uses = [FPMR, FPCR] in
-  defm FMMLA : SIMDThreeSameVectorFP8MatrixMul<"fmmla">;
+  defm FMMLA : SIMDThreeSameVectorFP8MatrixMul<"fmmla", int_aarch64_neon_fmmla>;
 
 //===----------------------------------------------------------------------===//
 // Contention Management Hints (FEAT_CMH)
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matmul-fp16.ll b/llvm/test/CodeGen/AArch64/aarch64-matmul-fp16.ll
@@ -0,0 +1,14 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+f8f16mm              < %s | FileCheck %s
+; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+f8f16mm -global-isel < %s | FileCheck %s
+
+define <8 x half> @fmmla.v8f16.v16i8(<8 x half> %r, <16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: fmmla.v8f16.v16i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmmla v0.8h, v1.16b, v2.16b
+; CHECK-NEXT:    ret
+entry:
+  %vfmmla1.i = tail call <8 x half> @llvm.aarch64.neon.fmmla.v8f16.v16i8(<8 x half> %r, <16 x i8> %a, <16 x i8> %b) #3
+  ret <8 x half> %vfmmla1.i
+}
+
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matmul-fp32.ll b/llvm/test/CodeGen/AArch64/aarch64-matmul-fp32.ll
@@ -0,0 +1,13 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+f8f32mm              < %s | FileCheck %s
+; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+f8f32mm -global-isel < %s | FileCheck %s
+
+define <4 x float> @fmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: fmmla.v4f32.v16i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmmla v0.4s, v1.16b, v2.16b
+; CHECK-NEXT:    ret
+entry:
+  %vfmmla1.i = tail call <4 x float> @llvm.aarch64.neon.fmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %a, <16 x i8> %b) #3
+  ret <4 x float> %vfmmla1.i
+}

Original file line number	Diff line number	Diff line change
`@@ -13292,18 +13292,24 @@ multiclass AtomicFPStore<bit R, bits<3> op0, string asm> {`
`13292`	`13292`	`def H : BaseAtomicFPStore<FPR16, 0b01, R, op0, asm>;`
`13293`	`13293`	`}`
`13294`	`13294`
`13295`		`-class BaseSIMDThreeSameVectorFP8MatrixMul<string asm, bits<2> size, string kind>`
	`13295`	`+class BaseSIMDThreeSameVectorFP8MatrixMul<string asm, bits<2> size, string kind, list<dag> pattern>`
`13296`	`13296`	`: BaseSIMDThreeSameVectorTied<1, 1, {size, 0}, 0b11101,`
`13297`		`- V128, asm, ".16b", []> {`
	`13297`	`+ V128, asm, ".16b", pattern> {`
`13298`	`13298`	`let AsmString = !strconcat(asm, "{\t$Rd", kind, ", $Rn.16b, $Rm.16b",`
`13299`	`13299`	`"\|", kind, "\t$Rd, $Rn, $Rm}");`
`13300`	`13300`	`}`
`13301`	`13301`
`13302`		`-multiclass SIMDThreeSameVectorFP8MatrixMul<string asm>{`
`13303`		`- def v8f16: BaseSIMDThreeSameVectorFP8MatrixMul<asm, 0b00, ".8h">{`
	`13302`	`+multiclass SIMDThreeSameVectorFP8MatrixMul<string asm, SDPatternOperator OpNode>{`
	`13303`	`+ def v8f16: BaseSIMDThreeSameVectorFP8MatrixMul<asm, 0b00, ".8h",`
	`13304`	`+ [(set (v8f16 V128:$dst), (OpNode (v8f16 V128:$Rd),`
	`13305`	`+ (v16i8 V128:$Rn),`
	`13306`	`+ (v16i8 V128:$Rm)))]> {`
`13304`	`13307`	`let Predicates = [HasNEON, HasF8F16MM];`
`13305`	`13308`	`}`
`13306`		`- def v4f32: BaseSIMDThreeSameVectorFP8MatrixMul<asm, 0b10, ".4s">{`
	`13309`	`+ def v4f32: BaseSIMDThreeSameVectorFP8MatrixMul<asm, 0b10, ".4s",`
	`13310`	`+ [(set (v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd),`
	`13311`	`+ (v16i8 V128:$Rn),`
	`13312`	`+ (v16i8 V128:$Rm)))]> {`
`13307`	`13313`	`let Predicates = [HasNEON, HasF8F32MM];`
`13308`	`13314`	`}`
`13309`	`13315`	`}`