Commit 4b37033

- Implemented overloading for fmmla intrinsics, replaced fixed-type intrinsics
- Prototype cleanups
- Updated ll tests to remove unnecessary IR
- Removed unused arguments in clang test macros
- Removed redundant check lines in ll tests
1 parent 9004ff2 commit 4b37033
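
At the IR level, the change collapses the fixed-name fmmla intrinsics into a single overloaded intrinsic whose name is mangled on the accumulator and source vector types. A minimal before/after sketch, reconstructed from the updated tests in this commit:

; Before: one fixed-name intrinsic per type combination, e.g. for f16 inputs:
declare <vscale x 4 x float> @llvm.aarch64.sve.fmmla.f16f32(<vscale x 4 x float>, <vscale x 8 x half>, <vscale x 8 x half>)

; After: one overloaded intrinsic; the suffixes are derived from the overloaded types.
declare <vscale x 4 x float> @llvm.aarch64.sve.fmmla.nxv4f32.nxv8f16(<vscale x 4 x float>, <vscale x 8 x half>, <vscale x 8 x half>)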

File tree

12 files changed: +53 −137 lines


clang/include/clang/Basic/arm_sve.td

Lines changed: 5 additions & 5 deletions
@@ -1190,22 +1190,22 @@ def SVSUDOT_LANE_S : SInst<"svsudot_lane[_s32]", "ddqbi", "i", MergeNone, "aarc
 }
 
 let SVETargetGuard = "f32mm", SMETargetGuard = InvalidMode in {
-def SVMLLA_F32 : SInst<"svmmla[_f32]", "dddd","f", MergeNone, "aarch64_sve_fmmla">;
+def SVMLLA_F32 : SInst<"svmmla[_f32]", "dddd","f", MergeNone, "aarch64_sve_fmmla", [IsOverloadCvt]>;
 }
 
 let SVETargetGuard = "f64mm", SMETargetGuard = InvalidMode in {
-def SVMLLA_F64 : SInst<"svmmla[_f64]", "dddd", "d", MergeNone, "aarch64_sve_fmmla">;
+def SVMLLA_F64 : SInst<"svmmla[_f64]", "dddd", "d", MergeNone, "aarch64_sve_fmmla", [IsOverloadCvt]>;
 }
 
 let SVETargetGuard = "sve-f16f32mm", SMETargetGuard = InvalidMode in {
-def SVMLLA_F32_F16 : SInst<"svmmla[_f32_f16]", "MMdd", "h", MergeNone, "aarch64_sve_fmmla_f16f32", [IsOverloadNone]>;
+def SVMLLA_F32_F16 : SInst<"svmmla[_f32_f16]", "ddhh", "f", MergeNone, "aarch64_sve_fmmla", [IsOverloadCvt]>;
 }
 
 let SVETargetGuard = "sve2,f8f32mm", SMETargetGuard = InvalidMode in {
-def SVMLLA_F32_MF8 : SInst<"svmmla[_f32_mf8]", "MM~~>", "m", MergeNone, "aarch64_sve_fmmla_mf8f32", [IsOverloadNone]>;
+def SVMLLA_F32_MF8 : SInst<"svmmla[_f32_mf8]", "dd~~>", "f", MergeNone, "aarch64_sve_fmmla", [IsOverloadCvt]>;
 }
 
 let SVETargetGuard = "sve2,f8f16mm", SMETargetGuard = InvalidMode in {
-def SVMLLA_F16_MF8 : SInst<"svmmla[_f16_mf8]", "OO~~>", "m", MergeNone, "aarch64_sve_fmmla_mf8f16", [IsOverloadNone]>;
+def SVMLLA_F16_MF8 : SInst<"svmmla[_f16_mf8]", "dd~~>", "h", MergeNone, "aarch64_sve_fmmla", [IsOverloadCvt]>;
 }
 
 def SVTRN1Q : SInst<"svtrn1q[_{d}]", "ddd", "csilUcUsUiUlhfdb", MergeNone, "aarch64_sve_trn1q">;

clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_fmmla-f32f16.c

Lines changed: 6 additions & 6 deletions
@@ -10,24 +10,24 @@
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
-// A simple used,unused... macro, long enough to represent any SVE builtin.
-#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3
+#define SVE_ACLE_FUNC(A1, A3) A1##A3
 #else
-#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4
+#define SVE_ACLE_FUNC(A1, A2) A1##A2
 #endif
 
+
 // CHECK-LABEL: define dso_local <vscale x 4 x float> @test_f32f16(
 // CHECK-SAME: <vscale x 4 x float> [[ACC:%.*]], <vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) #[[ATTR0:[0-9]+]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.f16f32(<vscale x 4 x float> [[ACC]], <vscale x 8 x half> [[A]], <vscale x 8 x half> [[B]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.nxv4f32.nxv8f16(<vscale x 4 x float> [[ACC]], <vscale x 8 x half> [[A]], <vscale x 8 x half> [[B]])
 // CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
 //
 // CPP-CHECK-LABEL: define dso_local <vscale x 4 x float> @_Z11test_f32f16u13__SVFloat32_tu13__SVFloat16_tS0_(
 // CPP-CHECK-SAME: <vscale x 4 x float> [[ACC:%.*]], <vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) #[[ATTR0:[0-9]+]] {
 // CPP-CHECK-NEXT: [[ENTRY:.*:]]
-// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.f16f32(<vscale x 4 x float> [[ACC]], <vscale x 8 x half> [[A]], <vscale x 8 x half> [[B]])
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.nxv4f32.nxv8f16(<vscale x 4 x float> [[ACC]], <vscale x 8 x half> [[A]], <vscale x 8 x half> [[B]])
 // CPP-CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
 //
 svfloat32_t test_f32f16(svfloat32_t acc, svfloat16_t a, svfloat16_t b) {
-  return SVE_ACLE_FUNC(svmmla, _f32_f16, , )(acc, a, b);
+  return SVE_ACLE_FUNC(svmmla, _f32_f16)(acc, a, b);
 }

clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_matmul_fp32.c

Lines changed: 2 additions & 2 deletions
@@ -17,12 +17,12 @@
 
 // CHECK-LABEL: @test_svmmla_f32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.nxv4f32(<vscale x 4 x float> [[X:%.*]], <vscale x 4 x float> [[Y:%.*]], <vscale x 4 x float> [[Z:%.*]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.nxv4f32.nxv4f32(<vscale x 4 x float> [[X:%.*]], <vscale x 4 x float> [[Y:%.*]], <vscale x 4 x float> [[Z:%.*]])
 // CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z15test_svmmla_f32u13__SVFloat32_tS_S_(
 // CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.nxv4f32(<vscale x 4 x float> [[X:%.*]], <vscale x 4 x float> [[Y:%.*]], <vscale x 4 x float> [[Z:%.*]])
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.nxv4f32.nxv4f32(<vscale x 4 x float> [[X:%.*]], <vscale x 4 x float> [[Y:%.*]], <vscale x 4 x float> [[Z:%.*]])
 // CPP-CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
 //
 svfloat32_t test_svmmla_f32(svfloat32_t x, svfloat32_t y, svfloat32_t z) {

clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_matmul_fp64.c

Lines changed: 2 additions & 2 deletions
@@ -17,12 +17,12 @@
 
 // CHECK-LABEL: @test_svmmla_f64(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.fmmla.nxv2f64(<vscale x 2 x double> [[X:%.*]], <vscale x 2 x double> [[Y:%.*]], <vscale x 2 x double> [[Z:%.*]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.fmmla.nxv2f64.nxv2f64(<vscale x 2 x double> [[X:%.*]], <vscale x 2 x double> [[Y:%.*]], <vscale x 2 x double> [[Z:%.*]])
 // CHECK-NEXT: ret <vscale x 2 x double> [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z15test_svmmla_f64u13__SVFloat64_tS_S_(
 // CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.fmmla.nxv2f64(<vscale x 2 x double> [[X:%.*]], <vscale x 2 x double> [[Y:%.*]], <vscale x 2 x double> [[Z:%.*]])
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.fmmla.nxv2f64.nxv2f64(<vscale x 2 x double> [[X:%.*]], <vscale x 2 x double> [[Y:%.*]], <vscale x 2 x double> [[Z:%.*]])
 // CPP-CHECK-NEXT: ret <vscale x 2 x double> [[TMP0]]
 //
 svfloat64_t test_svmmla_f64(svfloat64_t x, svfloat64_t y, svfloat64_t z) {

clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f16mf8.c

Lines changed: 5 additions & 5 deletions
@@ -11,25 +11,25 @@
 
 #ifdef SVE_OVERLOADED_FORMS
 // A simple used,unused... macro, long enough to represent any SVE builtin.
-#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3
+#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3) A1##A3
 #else
-#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4
+#define SVE_ACLE_FUNC(A1, A2, A3) A1##A2##A3
 #endif
 
 // CHECK-LABEL: define dso_local <vscale x 8 x half> @test_f16mf8(
 // CHECK-SAME: <vscale x 8 x half> [[ACC:%.*]], <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
 // CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fmmla.mf8f16(<vscale x 8 x half> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fmmla.nxv8f16.nxv16i8(<vscale x 8 x half> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
 // CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
 //
 // CPP-CHECK-LABEL: define dso_local <vscale x 8 x half> @_Z11test_f16mf8u13__SVFloat16_tu13__SVMfloat8_tS0_m(
 // CPP-CHECK-SAME: <vscale x 8 x half> [[ACC:%.*]], <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
 // CPP-CHECK-NEXT: [[ENTRY:.*:]]
 // CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
-// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fmmla.mf8f16(<vscale x 8 x half> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fmmla.nxv8f16.nxv16i8(<vscale x 8 x half> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
 // CPP-CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
 //
 svfloat16_t test_f16mf8(svfloat16_t acc, svmfloat8_t a, svmfloat8_t b, fpm_t fpmr) {
-  return SVE_ACLE_FUNC(svmmla, _f16_mf8, _fpm, )(acc, a, b, fpmr);
+  return SVE_ACLE_FUNC(svmmla, _f16_mf8, _fpm)(acc, a, b, fpmr);
 }

clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f32mf8.c

Lines changed: 5 additions & 5 deletions
@@ -12,25 +12,25 @@
 
 #ifdef SVE_OVERLOADED_FORMS
 // A simple used,unused... macro, long enough to represent any SVE builtin.
-#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3
+#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3) A1##A3
 #else
-#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4
+#define SVE_ACLE_FUNC(A1, A2, A3) A1##A2##A3
 #endif
 
 // CHECK-LABEL: define dso_local <vscale x 4 x float> @test_f32mf8(
 // CHECK-SAME: <vscale x 4 x float> [[ACC:%.*]], <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
 // CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.mf8f32(<vscale x 4 x float> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.nxv4f32.nxv16i8(<vscale x 4 x float> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
 // CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
 //
 // CPP-CHECK-LABEL: define dso_local <vscale x 4 x float> @_Z11test_f32mf8u13__SVFloat32_tu13__SVMfloat8_tS0_m(
 // CPP-CHECK-SAME: <vscale x 4 x float> [[ACC:%.*]], <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
 // CPP-CHECK-NEXT: [[ENTRY:.*:]]
 // CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
-// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.mf8f32(<vscale x 4 x float> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.nxv4f32.nxv16i8(<vscale x 4 x float> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
 // CPP-CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
 //
 svfloat32_t test_f32mf8(svfloat32_t acc, svmfloat8_t a, svmfloat8_t b, fpm_t fpmr) {
-  return SVE_ACLE_FUNC(svmmla, _f32_mf8, _fpm, )(acc, a, b, fpmr);
+  return SVE_ACLE_FUNC(svmmla, _f32_mf8, _fpm)(acc, a, b, fpmr);
 }

llvm/include/llvm/IR/IntrinsicsAArch64.td

Lines changed: 4 additions & 14 deletions
@@ -2805,22 +2805,12 @@ def int_aarch64_sve_sudot_lane : AdvSIMD_SVE_DOT_Indexed_Intrinsic;
 //
 // SVE ACLE: 7.4/5. FP64/FP32 matrix multiply extensions
 //
-def int_aarch64_sve_fmmla : AdvSIMD_3VectorArg_Intrinsic;
 
-def int_aarch64_sve_fmmla_f16f32
-    : DefaultAttrsIntrinsic<[llvm_nxv4f32_ty],
-                            [ llvm_nxv4f32_ty, llvm_nxv8f16_ty, llvm_nxv8f16_ty ],
-                            [IntrNoMem]>;
-
-def int_aarch64_sve_fmmla_mf8f32
-    : DefaultAttrsIntrinsic<[llvm_nxv4f32_ty],
-                            [ llvm_nxv4f32_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty ],
-                            [IntrNoMem]>;
+def int_aarch64_sve_fmmla
+    : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ],
+                            [ LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1> ],
+                            [ IntrNoMem ]>;
 
-def int_aarch64_sve_fmmla_mf8f16
-    : DefaultAttrsIntrinsic<[llvm_nxv8f16_ty],
-                            [ llvm_nxv8f16_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty ],
-                            [IntrNoMem]>;
 //
 // SVE ACLE: 7.2. BFloat16 extensions
 //
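
For reference, the concrete instantiations of the single overloaded definition exercised by this commit are listed below; the declaration signatures are inferred from the updated test check lines:

declare <vscale x 4 x float> @llvm.aarch64.sve.fmmla.nxv4f32.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
declare <vscale x 2 x double> @llvm.aarch64.sve.fmmla.nxv2f64.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)
declare <vscale x 4 x float> @llvm.aarch64.sve.fmmla.nxv4f32.nxv8f16(<vscale x 4 x float>, <vscale x 8 x half>, <vscale x 8 x half>)
declare <vscale x 4 x float> @llvm.aarch64.sve.fmmla.nxv4f32.nxv16i8(<vscale x 4 x float>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare <vscale x 8 x half> @llvm.aarch64.sve.fmmla.nxv8f16.nxv16i8(<vscale x 8 x half>, <vscale x 16 x i8>, <vscale x 16 x i8>)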

llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td

Lines changed: 3 additions & 7 deletions
@@ -3684,7 +3684,7 @@ let Predicates = [HasSVE, HasMatMulFP32] in {
 } // End HasSVE, HasMatMulFP32
 
 let Predicates = [HasSVE_F16F32MM] in {
-  defm FMLLA_ZZZ_HtoS : sve_fp_matrix_mla<0b001, "fmmla", ZPR32, ZPR16, int_aarch64_sve_fmmla_f16f32, nxv4f32, nxv8f16>;
+  defm FMLLA_ZZZ_HtoS : sve_fp_matrix_mla<0b001, "fmmla", ZPR32, ZPR16, int_aarch64_sve_fmmla, nxv4f32, nxv8f16>;
 } // End HasSVE_F16F32MM
 
 let Predicates = [HasSVE, HasMatMulFP64] in {

@@ -4744,15 +4744,11 @@ defm FMLALLTT_ZZZ : sve2_fp8_mla<0b011, ZPR32, "fmlalltt", nxv4f32, int_aarch64_
 } // End HasSSVE_FP8FMA
 
 let Predicates = [HasSVE2, HasF8F32MM] in {
-  def FMMLA_ZZZ_BtoS : sve2_fp8_mmla<0b0, ZPR32, "fmmla">;
-  def : Pat<(nxv4f32 (int_aarch64_sve_fmmla_mf8f32 nxv4f32:$acc, nxv16i8:$zn, nxv16i8:$zm)),
-            (FMMLA_ZZZ_BtoS $acc, $zn, $zm)>;
+  defm FMMLA_ZZZ_BtoS : sve2_fp8_fmmla<0b0, ZPR32, "fmmla", nxv4f32>;
 }
 
 let Predicates = [HasSVE2, HasF8F16MM] in {
-  def FMMLA_ZZZ_BtoH : sve2_fp8_mmla<0b1, ZPR16, "fmmla">;
-  def : Pat<(nxv8f16 (int_aarch64_sve_fmmla_mf8f16 nxv8f16:$acc, nxv16i8:$zn, nxv16i8:$zm)),
-            (FMMLA_ZZZ_BtoH $acc, $zn, $zm)>;
+  defm FMMLA_ZZZ_BtoH : sve2_fp8_fmmla<0b1, ZPR16, "fmmla", nxv8f16>;
 }
 
 let Predicates = [HasSSVE_FP8DOT2] in {
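
A minimal IR sketch of what the new sve2_fp8_fmmla patterns (defined in SVEInstrFormats.td below) should select. The function name is hypothetical, and the operand suffixes in the expected assembly are an assumption based on the ZPR32 destination and FP8 byte sources:

; With -mattr=+sve2,+f8f32mm this call should select FMMLA_ZZZ_BtoS,
; i.e. lower to something like: fmmla z0.s, z1.b, z2.b
define <vscale x 4 x float> @fmmla_f32mf8_example(<vscale x 4 x float> %acc, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
  %r = call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.nxv4f32.nxv16i8(<vscale x 4 x float> %acc, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
  ret <vscale x 4 x float> %r
}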

llvm/lib/Target/AArch64/SVEInstrFormats.td

Lines changed: 6 additions & 0 deletions
@@ -11143,6 +11143,12 @@ class sve2_fp8_mmla<bit opc, ZPRRegOp dst_ty, string mnemonic>
   let Uses = [FPMR, FPCR];
 }
 
+multiclass sve2_fp8_fmmla<bits<1> opc, ZPRRegOp zprty, string mnemonic, ValueType ResVT> {
+  def NAME : sve2_fp8_mmla<opc, zprty, mnemonic>;
+  def : Pat<(ResVT (int_aarch64_sve_fmmla ResVT:$acc, nxv16i8:$zn, nxv16i8:$zm)),
+            (!cast<Instruction>(NAME) $acc, $zn, $zm)>;
+}
+
 class sve_fp8_dot_indexed<bits<4> opc, ZPRRegOp dst_ty, Operand iop_ty, string mnemonic>
   : I<(outs dst_ty:$Zda), (ins dst_ty:$_Zda, ZPR8:$Zn, ZPR3b8:$Zm, iop_ty:$iop),
       mnemonic, "\t$Zda, $Zn, $Zm$iop", "", []>, Sched<[]> {
Lines changed: 5 additions & 25 deletions
@@ -1,32 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve-f16f32mm < %s | FileCheck %s --check-prefixes=CHECK
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve-f16f32mm < %s | FileCheck %s
 
-define <vscale x 4 x float> @_Z1tu13__SVFloat32_tu13__SVFloat16_tS0_(<vscale x 4 x float> %acc, <vscale x 8 x half> %a, <vscale x 8 x half> %b) {
-; CHECK-LABEL: _Z1tu13__SVFloat32_tu13__SVFloat16_tS0_:
+define <vscale x 4 x float> @fmmla_f32f16(<vscale x 4 x float> %acc, <vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
+; CHECK-LABEL: fmmla_f32f16:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-3
-; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
-; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: str z0, [sp, #2, mul vl]
 ; CHECK-NEXT: fmmla z0.s, z1.h, z2.h
-; CHECK-NEXT: str z1, [sp, #1, mul vl]
-; CHECK-NEXT: str z2, [sp]
-; CHECK-NEXT: addvl sp, sp, #3
-; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
 entry:
-  %acc.addr = alloca <vscale x 4 x float>, align 16
-  %a.addr = alloca <vscale x 8 x half>, align 16
-  %b.addr = alloca <vscale x 8 x half>, align 16
-  store <vscale x 4 x float> %acc, ptr %acc.addr, align 16
-  store <vscale x 8 x half> %a, ptr %a.addr, align 16
-  store <vscale x 8 x half> %b, ptr %b.addr, align 16
-  %0 = load <vscale x 4 x float>, ptr %acc.addr, align 16
-  %1 = load <vscale x 8 x half>, ptr %a.addr, align 16
-  %2 = load <vscale x 8 x half>, ptr %b.addr, align 16
-  %3 = call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.f16f32(<vscale x 4 x float> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2)
-  ret <vscale x 4 x float> %3
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.nxv4f32.nxv8f16(<vscale x 4 x float> %acc, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
+  ret <vscale x 4 x float> %out
 }
-
-declare <vscale x 4 x float> @llvm.aarch64.sve.fmmla.f16f32(<vscale x 4 x float>, <vscale x 8 x half>, <vscale x 8 x half>)
