Merge branch 'main' into vmla_vmls_floats

amilendra · web-flow · commit 96f57d8315a4 · 2025-12-10T17:15:31.000Z
diff --git a/cmse/cmse.md b/cmse/cmse.md
@@ -5,7 +5,7 @@ date-of-issue: 21 June 2024
 set-quote-highlight: true
 # LaTeX specific variables
 copyright-text: Copyright 2019, 2021-2024 Arm Limited and/or its affiliates <open-source-office@arm.com>.
-draftversion: true
+draftversion: false
 # Jekyll specific variables
 header_counter: true
 toc: true
diff --git a/main/acle.md b/main/acle.md
@@ -1,10 +1,10 @@
 ---
 title: Arm C Language Extensions
-version: 2025Q2
-date-of-issue: 06 June 2025
+version: 2025Q3
+date-of-issue: 14 November 2025
 # LaTeX specific variables
 copyright-text: "Copyright: see section \\texorpdfstring{\\nameref{copyright}}{Copyright}."
-draftversion: true
+draftversion: false
 # Jekyll specific variables
 header_counter: true
 toc: true
@@ -181,6 +181,7 @@ unless a different support level is specified in the text.
 | 2024Q3       | 30 September 2024 | Arm    | See [Changes between ACLE Q2 2024 and ACLE Q3 2024](#changes-between-acle-q2-2024-and-acle-q3-2024)                  |
 | 2024Q4       | 21 February 2025  | Arm    | See [Changes between ACLE Q3 2024 and ACLE Q4 2024](#changes-between-acle-q3-2024-and-acle-q4-2024)                  |
 | 2025Q2       | 06 June 2025      | Arm    | See [Changes between ACLE Q4 2024 and ACLE Q2 2025](#changes-between-acle-q4-2024-and-acle-q2-2025)                  |
+| 2025Q3       | 14 November 2025  | Arm    | See [Changes between ACLE Q2 2025 and ACLE Q3 2025](#changes-between-acle-q2-2025-and-acle-q3-2025)                  |
 
 #### Changes between ACLE Q2 2017 and ACLE Q2 2018
 
@@ -446,7 +447,6 @@ Armv8.4-A [[ARMARMv84]](#ARMARMv84). Support is added for the Dot Product intrin
 * Added `svdot[_n_f16_mf8]_fpm` and `svdot[_n_f32_mf8]_fpm`.
 * Added Guarded Control Stack (GCS) at
   [**Beta**](#current-status-and-anticipated-changes) quality level.
-* Add Function Multi Versioning feature priority syntax.
 
 #### Changes between ACLE Q4 2024 and ACLE Q2 2025
 
@@ -461,10 +461,16 @@ Armv8.4-A [[ARMARMv84]](#ARMARMv84). Support is added for the Dot Product intrin
 * Upgrade to [**Beta**](#current-status-and-anticipated-changes)
   support for modal 8-bit floating point intrinsics.
 
-#### Changes for next release
+#### Changes between ACLE Q2 2025 and ACLE Q3 2025
 
 * Added feature test macro for FEAT_SSVE_FEXPA.
 * Added feature test macro for FEAT_CSSC.
+* Added Function Multi Versioning feature priority syntax.
+
+#### Changes for next release
+
+* Added support for modal 8-bit floating point matrix multiply-accumulate widening intrinsics.
+* Added support for 16-bit floating point matrix multiply-accumulate widening intrinsics.
 * Improve documentation for VMLA/VMLS intrinsics for floats.
 
 ### References
@@ -2347,6 +2353,26 @@ is hardware support for the SVE forms of these instructions and if the
 associated ACLE intrinsics are available. This implies that
 `__ARM_FEATURE_MATMUL_INT8` and `__ARM_FEATURE_SVE` are both nonzero.
 
+##### Multiplication of modal 8-bit floating-point matrices
+
+This section is in
+[**Alpha** state](#current-status-and-anticipated-changes) and might change or be
+extended in the future.
+
+`__ARM_FEATURE_F8F16MM` is defined to `1` if there is hardware support
+for the NEON and SVE modal 8-bit floating-point matrix multiply-accumulate to half-precision (FEAT_F8F16MM)
+instructions and if the associated ACLE intrinsics are available.
+
+`__ARM_FEATURE_F8F32MM` is defined to `1` if there is hardware support
+for the NEON and SVE modal 8-bit floating-point matrix multiply-accumulate to single-precision (FEAT_F8F32MM)
+instructions and if the associated ACLE intrinsics are available.
+
+##### Multiplication of 16-bit floating-point matrices
+
+`__ARM_FEATURE_SVE_F16F32MM` is defined to `1` if there is hardware support
+for the SVE 16-bit floating-point to 32-bit floating-point matrix multiply and add
+(FEAT_SVE_F16F32MM) instructions and if the associated ACLE intrinsics are available.
+
 ##### Multiplication of 32-bit floating-point matrices
 
 `__ARM_FEATURE_SVE_MATMUL_FP32` is defined to `1` if there is hardware support
@@ -2638,6 +2664,9 @@ be found in [[BA]](#BA).
 | [`__ARM_FEATURE_SVE_BITS`](#scalable-vector-extension-sve)                                                                                              | The number of bits in an SVE vector, when known in advance                                         | 256         |
 | [`__ARM_FEATURE_SVE_MATMUL_FP32`](#multiplication-of-32-bit-floating-point-matrices)                                                                    | 32-bit floating-point matrix multiply extension (FEAT_F32MM)                                       | 1           |
 | [`__ARM_FEATURE_SVE_MATMUL_FP64`](#multiplication-of-64-bit-floating-point-matrices)                                                                    | 64-bit floating-point matrix multiply extension (FEAT_F64MM)                                       | 1           |
+| [`__ARM_FEATURE_F8F16MM`](#multiplication-of-modal-8-bit-floating-point-matrices)                                                                       | Modal 8-bit floating-point matrix multiply-accumulate to half-precision extension (FEAT_F8F16MM)   | 1           |
+| [`__ARM_FEATURE_F8F32MM`](#multiplication-of-modal-8-bit-floating-point-matrices)                                                                       | Modal 8-bit floating-point matrix multiply-accumulate to single-precision extension (FEAT_F8F32MM) | 1           |
+| [`__ARM_FEATURE_SVE_F16F32MM`](#multiplication-of-16-bit-floating-point-matrices)                                                                       | 16-bit floating-point matrix multiply-accumulate to single-precision extension (FEAT_SVE_F16F32MM) | 1           |
 | [`__ARM_FEATURE_SVE_MATMUL_INT8`](#multiplication-of-8-bit-integer-matrices)                                                                            | SVE support for the integer matrix multiply extension (FEAT_I8MM)                                  | 1           |
 | [`__ARM_FEATURE_SVE_PREDICATE_OPERATORS`](#scalable-vector-extension-sve)                                                                               | Level of support for C and C++ operators on SVE vector types                                        | 1           |
 | [`__ARM_FEATURE_SVE_VECTOR_OPERATORS`](#scalable-vector-extension-sve)                                                                                  | Level of support for C and C++ operators on SVE predicate types                                     | 1           |
@@ -9375,6 +9404,31 @@ BFloat16 floating-point multiply vectors.
                                   uint64_t imm_idx);
    ```
 
+### SVE2 floating-point matrix multiply-accumulate instructions.
+
+#### FMMLA (widening, FP8 to FP16)
+
+Modal 8-bit floating-point matrix multiply-accumulate to half-precision.
+```c
+  // Only if (__ARM_FEATURE_SVE2 && __ARM_FEATURE_F8F16MM)
+  svfloat16_t svmmla[_f16_mf8]_fpm(svfloat16_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm);
+```
+
+#### FMMLA (widening, FP8 to FP32)
+
+Modal 8-bit floating-point matrix multiply-accumulate to single-precision.
+```c
+  // Only if (__ARM_FEATURE_SVE2 && __ARM_FEATURE_F8F32MM)
+  svfloat32_t svmmla[_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm);
+```
+#### FMMLA (widening, FP16 to FP32)
+
+16-bit floating-point matrix multiply-accumulate to single-precision.
+```c
+  // Only if __ARM_FEATURE_SVE_F16F32MM
+  svfloat32_t svmmla[_f32_f16](svfloat32_t zda, svfloat16_t zn, svfloat16_t zm);
+```
+
 ### SVE2.1 instruction intrinsics
 
 The specification for SVE2.1 is in
diff --git a/morello/morello.md b/morello/morello.md
@@ -4,7 +4,7 @@ version: 02alpha
 date-of-issue: 11 January 2022
 # LaTeX specific variables
 copyright-text: Copyright 2020-2022 Arm Limited and/or its affiliates <open-source-office@arm.com>.
-draftversion: true
+draftversion: false
 # Jekyll specific variables
 header_counter: true
 toc: true
diff --git a/mve_intrinsics/mve.md b/mve_intrinsics/mve.md
@@ -5,7 +5,7 @@ date-of-issue: 11 January 2022
 # LaTeX specific variables
 landscape: true
 copyright-text: Copyright 2019-2022 Arm Limited and/or its affiliates <open-source-office@arm.com>.
-draftversion: true
+draftversion: false
 # Jekyll specific variables
 header_counter: true
 toc: true
diff --git a/mve_intrinsics/mve.template.md b/mve_intrinsics/mve.template.md
@@ -5,7 +5,7 @@ date-of-issue: 11 January 2022
 # LaTeX specific variables
 landscape: true
 copyright-text: Copyright 2019-2022 Arm Limited and/or its affiliates <open-source-office@arm.com>.
-draftversion: true
+draftversion: false
 # Jekyll specific variables
 header_counter: true
 toc: true
diff --git a/neon_intrinsics/advsimd.md b/neon_intrinsics/advsimd.md
@@ -5,7 +5,7 @@ date-of-issue: 06 June 2025
 # LaTeX specific variables
 landscape: true
 copyright-text: "Copyright: see section \\texorpdfstring{\\nameref{copyright}}{Copyright}."
-draftversion: true
+draftversion: false
 # Jekyll specific variables
 header_counter: true
 toc: true
@@ -6175,3 +6175,14 @@ The intrinsics in this section are guarded by the macro ``__ARM_NEON``.
 | <code>float32x4_t <a href="https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlalltbq_laneq_f32_mf8_fpm" target="_blank">vmlalltbq_laneq_f32_mf8_fpm</a>(<br>&nbsp;&nbsp;&nbsp;&nbsp; float32x4_t vd,<br>&nbsp;&nbsp;&nbsp;&nbsp; mfloat8x16_t vn,<br>&nbsp;&nbsp;&nbsp;&nbsp; mfloat8x16_t vm,<br>&nbsp;&nbsp;&nbsp;&nbsp; const int lane,<br>&nbsp;&nbsp;&nbsp;&nbsp; fpm_t fpm)</code> | `vd -> Vd.4S`<br>`vm -> Vn.16B`<br>`vm -> Vm.B`<br>`0 <= lane <= 15` | `FMLALLBB Vd.4S, Vn.16B, Vm.B[lane]` | `Vd.4S -> result` | `A64`                     |
 | <code>float32x4_t <a href="https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlallttq_lane_f32_mf8_fpm" target="_blank">vmlallttq_lane_f32_mf8_fpm</a>(<br>&nbsp;&nbsp;&nbsp;&nbsp; float32x4_t vd,<br>&nbsp;&nbsp;&nbsp;&nbsp; mfloat8x16_t vn,<br>&nbsp;&nbsp;&nbsp;&nbsp; mfloat8x8_t vm,<br>&nbsp;&nbsp;&nbsp;&nbsp; const int lane,<br>&nbsp;&nbsp;&nbsp;&nbsp; fpm_t fpm)</code>    | `vd -> Vd.4S`<br>`vm -> Vn.16B`<br>`vm -> Vm.B`<br>`0 <= lane <= 7`  | `FMLALLBB Vd.4S, Vn.16B, Vm.B[lane]` | `Vd.4S -> result` | `A64`                     |
 | <code>float32x4_t <a href="https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlallttq_laneq_f32_mf8_fpm" target="_blank">vmlallttq_laneq_f32_mf8_fpm</a>(<br>&nbsp;&nbsp;&nbsp;&nbsp; float32x4_t vd,<br>&nbsp;&nbsp;&nbsp;&nbsp; mfloat8x16_t vn,<br>&nbsp;&nbsp;&nbsp;&nbsp; mfloat8x16_t vm,<br>&nbsp;&nbsp;&nbsp;&nbsp; const int lane,<br>&nbsp;&nbsp;&nbsp;&nbsp; fpm_t fpm)</code> | `vd -> Vd.4S`<br>`vm -> Vn.16B`<br>`vm -> Vm.B`<br>`0 <= lane <= 15` | `FMLALLBB Vd.4S, Vn.16B, Vm.B[lane]` | `Vd.4S -> result` | `A64`                     |
+
+## Matrix multiplication intrinsics from Armv9.6-A
+
+### Vector arithmetic
+
+#### Matrix multiply
+
+| Intrinsic                                                                                                                                                                                                                                                                                                                           | Argument preparation                           | AArch64 Instruction           | Result            | Supported architectures   |
+|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------|-------------------------------|-------------------|---------------------------|
+| <code>float16x8_t <a href="https://developer.arm.com/architectures/instruction-sets/intrinsics/vmmlaq_f16_mf8" target="_blank">vmmlaq_f16_mf8</a>(<br>&nbsp;&nbsp;&nbsp;&nbsp; float16x8_t r,<br>&nbsp;&nbsp;&nbsp;&nbsp; mfloat8x16_t a,<br>&nbsp;&nbsp;&nbsp;&nbsp; mfloat8x16_t b,<br>&nbsp;&nbsp;&nbsp;&nbsp; fpm_t fpm)</code> | `r -> Vd.4H`<br>`a -> Vn.16B`<br>`b -> Vm.16B` | `FMMLA Vd.4H, Vn.16B, Vm.16B` | `Vd.4H -> result` | `A64`                     |
+| <code>float32x4_t <a href="https://developer.arm.com/architectures/instruction-sets/intrinsics/vmmlaq_f32_mf8" target="_blank">vmmlaq_f32_mf8</a>(<br>&nbsp;&nbsp;&nbsp;&nbsp; float32x4_t r,<br>&nbsp;&nbsp;&nbsp;&nbsp; mfloat8x16_t a,<br>&nbsp;&nbsp;&nbsp;&nbsp; mfloat8x16_t b,<br>&nbsp;&nbsp;&nbsp;&nbsp; fpm_t fpm)</code> | `r -> Vd.4S`<br>`a -> Vn.16B`<br>`b -> Vm.16B` | `FMMLA Vd.4S, Vn.16B, Vm.16B` | `Vd.4S -> result` | `A64`                     |
diff --git a/neon_intrinsics/advsimd.template.md b/neon_intrinsics/advsimd.template.md
@@ -5,7 +5,7 @@ date-of-issue: 06 June 2025
 # LaTeX specific variables
 landscape: true
 copyright-text: "Copyright: see section \\texorpdfstring{{\\nameref{{copyright}}}}{{Copyright}}."
-draftversion: true
+draftversion: false
 # Jekyll specific variables
 header_counter: true
 toc: true
diff --git a/tools/intrinsic_db/advsimd.csv b/tools/intrinsic_db/advsimd.csv
@@ -4810,3 +4810,7 @@ float32x4_t vmlalltbq_lane_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t vn, mfloat8x
 float32x4_t vmlalltbq_laneq_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)	vd -> Vd.4S;vm -> Vn.16B; vm -> Vm.B; 0 <= lane <= 15	FMLALLBB Vd.4S, Vn.16B, Vm.B[lane]	Vd.4S -> result	A64
 float32x4_t vmlallttq_lane_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t vn, mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm)	vd -> Vd.4S;vm -> Vn.16B; vm -> Vm.B; 0 <= lane <= 7	FMLALLBB Vd.4S, Vn.16B, Vm.B[lane]	Vd.4S -> result	A64
 float32x4_t vmlallttq_laneq_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)	vd -> Vd.4S;vm -> Vn.16B; vm -> Vm.B; 0 <= lane <= 15	FMLALLBB Vd.4S, Vn.16B, Vm.B[lane]	Vd.4S -> result	A64
+
+<SECTION>	Matrix multiplication intrinsics from Armv9.6-A
+float16x8_t vmmlaq_f16_mf8(float16x8_t r, mfloat8x16_t a, mfloat8x16_t b, fpm_t fpm)	r -> Vd.4H;a -> Vn.16B;b -> Vm.16B	FMMLA Vd.4H, Vn.16B, Vm.16B	Vd.4H -> result	A64
+float32x4_t vmmlaq_f32_mf8(float32x4_t r, mfloat8x16_t a, mfloat8x16_t b, fpm_t fpm)	r -> Vd.4S;a -> Vn.16B;b -> Vm.16B	FMMLA Vd.4S, Vn.16B, Vm.16B	Vd.4S -> result	A64
diff --git a/tools/intrinsic_db/advsimd_classification.csv b/tools/intrinsic_db/advsimd_classification.csv
@@ -4697,3 +4697,5 @@ vmlalltbq_lane_f32_mf8_fpm	Vector arithmetic|Multiply|Multiply-accumulate and wi
 vmlalltbq_laneq_f32_mf8_fpm	Vector arithmetic|Multiply|Multiply-accumulate and widen
 vmlallttq_lane_f32_mf8_fpm	Vector arithmetic|Multiply|Multiply-accumulate and widen
 vmlallttq_laneq_f32_mf8_fpm	Vector arithmetic|Multiply|Multiply-accumulate and widen
+vmmlaq_f16_mf8	Vector arithmetic|Matrix multiply
+vmmlaq_f32_mf8	Vector arithmetic|Matrix multiply