diff --git a/neon_intrinsics/advsimd.md b/neon_intrinsics/advsimd.md index dad00fa3..b5172f0e 100644 --- a/neon_intrinsics/advsimd.md +++ b/neon_intrinsics/advsimd.md @@ -6182,7 +6182,7 @@ The intrinsics in this section are guarded by the macro ``__ARM_NEON``. #### Matrix multiply -| Intrinsic | Argument preparation | AArch64 Instruction | Result | Supported architectures | -|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------|-------------------------------|-------------------|---------------------------| -| float16x8_t vmmlaq_f16_mf8(
     float16x8_t r,
     mfloat8x16_t a,
     mfloat8x16_t b,
     fpm_t fpm)
| `r -> Vd.4H`
`a -> Vn.16B`
`b -> Vm.16B` | `FMMLA Vd.4H, Vn.16B, Vm.16B` | `Vd.4H -> result` | `A64` | -| float32x4_t vmmlaq_f32_mf8(
     float32x4_t r,
     mfloat8x16_t a,
     mfloat8x16_t b,
     fpm_t fpm)
| `r -> Vd.4S`
`a -> Vn.16B`
`b -> Vm.16B` | `FMMLA Vd.4S, Vn.16B, Vm.16B` | `Vd.4S -> result` | `A64` | +| Intrinsic | Argument preparation | AArch64 Instruction | Result | Supported architectures | +|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------|-------------------------------|-------------------|---------------------------| +| float16x8_t vmmlaq_f16_mf8_fpm(
     float16x8_t r,
     mfloat8x16_t a,
     mfloat8x16_t b,
     fpm_t fpm)
| `r -> Vd.8H`
`a -> Vn.16B`
`b -> Vm.16B` | `FMMLA Vd.8H, Vn.16B, Vm.16B` | `Vd.8H -> result` | `A64` | +| float32x4_t vmmlaq_f32_mf8_fpm(
     float32x4_t r,
     mfloat8x16_t a,
     mfloat8x16_t b,
     fpm_t fpm)
| `r -> Vd.4S`
`a -> Vn.16B`
`b -> Vm.16B` | `FMMLA Vd.4S, Vn.16B, Vm.16B` | `Vd.4S -> result` | `A64` | diff --git a/tools/intrinsic_db/advsimd.csv b/tools/intrinsic_db/advsimd.csv index 7fd96e6e..6b82264f 100644 --- a/tools/intrinsic_db/advsimd.csv +++ b/tools/intrinsic_db/advsimd.csv @@ -4812,5 +4812,5 @@ float32x4_t vmlallttq_lane_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t vn, mfloat8x float32x4_t vmlallttq_laneq_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm) vd -> Vd.4S;vn -> Vn.16B; vm -> Vm.B; 0 <= lane <= 15 FMLALLTT Vd.4S, Vn.16B, Vm.B[lane] Vd.4S -> result A64
Matrix multiplication intrinsics from Armv9.6-A -float16x8_t vmmlaq_f16_mf8(float16x8_t r, mfloat8x16_t a, mfloat8x16_t b, fpm_t fpm) r -> Vd.4H;a -> Vn.16B;b -> Vm.16B FMMLA Vd.4H, Vn.16B, Vm.16B Vd.4H -> result A64 -float32x4_t vmmlaq_f32_mf8(float32x4_t r, mfloat8x16_t a, mfloat8x16_t b, fpm_t fpm) r -> Vd.4S;a -> Vn.16B;b -> Vm.16B FMMLA Vd.4S, Vn.16B, Vm.16B Vd.4S -> result A64 +float16x8_t vmmlaq_f16_mf8_fpm(float16x8_t r, mfloat8x16_t a, mfloat8x16_t b, fpm_t fpm) r -> Vd.8H;a -> Vn.16B;b -> Vm.16B FMMLA Vd.8H, Vn.16B, Vm.16B Vd.8H -> result A64 +float32x4_t vmmlaq_f32_mf8_fpm(float32x4_t r, mfloat8x16_t a, mfloat8x16_t b, fpm_t fpm) r -> Vd.4S;a -> Vn.16B;b -> Vm.16B FMMLA Vd.4S, Vn.16B, Vm.16B Vd.4S -> result A64 diff --git a/tools/intrinsic_db/advsimd_classification.csv b/tools/intrinsic_db/advsimd_classification.csv index b8a8a39f..51c413c0 100644 --- a/tools/intrinsic_db/advsimd_classification.csv +++ b/tools/intrinsic_db/advsimd_classification.csv @@ -4697,5 +4697,5 @@ vmlalltbq_lane_f32_mf8_fpm Vector arithmetic|Multiply|Multiply-accumulate and wi vmlalltbq_laneq_f32_mf8_fpm Vector arithmetic|Multiply|Multiply-accumulate and widen vmlallttq_lane_f32_mf8_fpm Vector arithmetic|Multiply|Multiply-accumulate and widen vmlallttq_laneq_f32_mf8_fpm Vector arithmetic|Multiply|Multiply-accumulate and widen -vmmlaq_f16_mf8 Vector arithmetic|Matrix multiply -vmmlaq_f32_mf8 Vector arithmetic|Matrix multiply +vmmlaq_f16_mf8_fpm Vector arithmetic|Matrix multiply +vmmlaq_f32_mf8_fpm Vector arithmetic|Matrix multiply