From 35c76149f512e5352b79068b7f31f1123047c48f Mon Sep 17 00:00:00 2001 From: Amilendra Kodithuwakku Date: Tue, 14 Oct 2025 15:55:42 +0100 Subject: [PATCH 1/2] Improve documentation for VMLA/VMLS intrinsics for floats The clang and GCC implementations for the VMLA/VMLS intrinsics for floats are the same except for the -ffp-contract=on case where GCC: uses VMUL+VADD/VSUB Clang: uses FMLA/FMLS (Fused MLA/MLS) Non-float VMLA/VMLS intrinsics reduce to MLA/MLS in both implementations. Previous ACLE releases (e.g. 2.1) made it clear that the VMLA/VMLS intrinsics for floats are implementation defined, but this information seems to have got lost in later releases. So re-document the implementation-defined behaviour. --- main/acle.md | 1 + neon_intrinsics/advsimd.md | 296 ++++++++++++++++----------------- tools/intrinsic_db/advsimd.csv | 40 ++--- 3 files changed, 169 insertions(+), 168 deletions(-) diff --git a/main/acle.md b/main/acle.md index 3b066e93..a9efea5a 100644 --- a/main/acle.md +++ b/main/acle.md @@ -465,6 +465,7 @@ Armv8.4-A [[ARMARMv84]](#ARMARMv84). Support is added for the Dot Product intrin * Added feature test macro for FEAT_SSVE_FEXPA. * Added feature test macro for FEAT_CSSC. +* Re-document the implementation-defined aspect of the VMLA/VMLS intrinsics for floats. ### References diff --git a/neon_intrinsics/advsimd.md b/neon_intrinsics/advsimd.md index a87ad725..79218669 100644 --- a/neon_intrinsics/advsimd.md +++ b/neon_intrinsics/advsimd.md @@ -393,40 +393,40 @@ The intrinsics in this section are guarded by the macro ``__ARM_NEON``. 
##### Multiply-accumulate -| Intrinsic | Argument preparation | AArch64 Instruction | Result | Supported architectures | -|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------|---------------------------------------------------|--------------------|---------------------------| -| int8x8_t vmla_s8(
     int8x8_t a,
     int8x8_t b,
     int8x8_t c)
| `a -> Vd.8B`
`b -> Vn.8B`
`c -> Vm.8B` | `MLA Vd.8B,Vn.8B,Vm.8B` | `Vd.8B -> result` | `v7/A32/A64` | -| int8x16_t vmlaq_s8(
     int8x16_t a,
     int8x16_t b,
     int8x16_t c)
| `a -> Vd.16B`
`b -> Vn.16B`
`c -> Vm.16B` | `MLA Vd.16B,Vn.16B,Vm.16B` | `Vd.16B -> result` | `v7/A32/A64` | -| int16x4_t vmla_s16(
     int16x4_t a,
     int16x4_t b,
     int16x4_t c)
| `a -> Vd.4H`
`b -> Vn.4H`
`c -> Vm.4H` | `MLA Vd.4H,Vn.4H,Vm.4H` | `Vd.4H -> result` | `v7/A32/A64` | -| int16x8_t vmlaq_s16(
     int16x8_t a,
     int16x8_t b,
     int16x8_t c)
| `a -> Vd.8H`
`b -> Vn.8H`
`c -> Vm.8H` | `MLA Vd.8H,Vn.8H,Vm.8H` | `Vd.8H -> result` | `v7/A32/A64` | -| int32x2_t vmla_s32(
     int32x2_t a,
     int32x2_t b,
     int32x2_t c)
| `a -> Vd.2S`
`b -> Vn.2S`
`c -> Vm.2S` | `MLA Vd.2S,Vn.2S,Vm.2S` | `Vd.2S -> result` | `v7/A32/A64` | -| int32x4_t vmlaq_s32(
     int32x4_t a,
     int32x4_t b,
     int32x4_t c)
| `a -> Vd.4S`
`b -> Vn.4S`
`c -> Vm.4S` | `MLA Vd.4S,Vn.4S,Vm.4S` | `Vd.4S -> result` | `v7/A32/A64` | -| uint8x8_t vmla_u8(
     uint8x8_t a,
     uint8x8_t b,
     uint8x8_t c)
| `a -> Vd.8B`
`b -> Vn.8B`
`c -> Vm.8B` | `MLA Vd.8B,Vn.8B,Vm.8B` | `Vd.8B -> result` | `v7/A32/A64` | -| uint8x16_t vmlaq_u8(
     uint8x16_t a,
     uint8x16_t b,
     uint8x16_t c)
| `a -> Vd.16B`
`b -> Vn.16B`
`c -> Vm.16B` | `MLA Vd.16B,Vn.16B,Vm.16B` | `Vd.16B -> result` | `v7/A32/A64` | -| uint16x4_t vmla_u16(
     uint16x4_t a,
     uint16x4_t b,
     uint16x4_t c)
| `a -> Vd.4H`
`b -> Vn.4H`
`c -> Vm.4H` | `MLA Vd.4H,Vn.4H,Vm.4H` | `Vd.4H -> result` | `v7/A32/A64` | -| uint16x8_t vmlaq_u16(
     uint16x8_t a,
     uint16x8_t b,
     uint16x8_t c)
| `a -> Vd.8H`
`b -> Vn.8H`
`c -> Vm.8H` | `MLA Vd.8H,Vn.8H,Vm.8H` | `Vd.8H -> result` | `v7/A32/A64` | -| uint32x2_t vmla_u32(
     uint32x2_t a,
     uint32x2_t b,
     uint32x2_t c)
| `a -> Vd.2S`
`b -> Vn.2S`
`c -> Vm.2S` | `MLA Vd.2S,Vn.2S,Vm.2S` | `Vd.2S -> result` | `v7/A32/A64` | -| uint32x4_t vmlaq_u32(
     uint32x4_t a,
     uint32x4_t b,
     uint32x4_t c)
| `a -> Vd.4S`
`b -> Vn.4S`
`c -> Vm.4S` | `MLA Vd.4S,Vn.4S,Vm.4S` | `Vd.4S -> result` | `v7/A32/A64` | -| float32x2_t vmla_f32(
     float32x2_t a,
     float32x2_t b,
     float32x2_t c)
| `N/A` | `RESULT[I] = a[i] + (b[i] * c[i]) for i = 0 to 1` | `N/A` | `v7/A32/A64` | -| float32x4_t vmlaq_f32(
     float32x4_t a,
     float32x4_t b,
     float32x4_t c)
| `N/A` | `RESULT[I] = a[i] + (b[i] * c[i]) for i = 0 to 3` | `N/A` | `v7/A32/A64` | -| float64x1_t vmla_f64(
     float64x1_t a,
     float64x1_t b,
     float64x1_t c)
| `N/A` | `RESULT[I] = a[i] + (b[i] * c[i]) for i = 0` | `N/A` | `A64` | -| float64x2_t vmlaq_f64(
     float64x2_t a,
     float64x2_t b,
     float64x2_t c)
| `N/A` | `RESULT[I] = a[i] + (b[i] * c[i]) for i = 0 to 1` | `N/A` | `A64` | -| int8x8_t vmls_s8(
     int8x8_t a,
     int8x8_t b,
     int8x8_t c)
| `a -> Vd.8B`
`b -> Vn.8B`
`c -> Vm.8B` | `MLS Vd.8B,Vn.8B,Vm.8B` | `Vd.8B -> result` | `v7/A32/A64` | -| int8x16_t vmlsq_s8(
     int8x16_t a,
     int8x16_t b,
     int8x16_t c)
| `a -> Vd.16B`
`b -> Vn.16B`
`c -> Vm.16B` | `MLS Vd.16B,Vn.16B,Vm.16B` | `Vd.16B -> result` | `v7/A32/A64` | -| int16x4_t vmls_s16(
     int16x4_t a,
     int16x4_t b,
     int16x4_t c)
| `a -> Vd.4H`
`b -> Vn.4H`
`c -> Vm.4H` | `MLS Vd.4H,Vn.4H,Vm.4H` | `Vd.4H -> result` | `v7/A32/A64` | -| int16x8_t vmlsq_s16(
     int16x8_t a,
     int16x8_t b,
     int16x8_t c)
| `a -> Vd.8H`
`b -> Vn.8H`
`c -> Vm.8H` | `MLS Vd.8H,Vn.8H,Vm.8H` | `Vd.8H -> result` | `v7/A32/A64` | -| int32x2_t vmls_s32(
     int32x2_t a,
     int32x2_t b,
     int32x2_t c)
| `a -> Vd.2S`
`b -> Vn.2S`
`c -> Vm.2S` | `MLS Vd.2S,Vn.2S,Vm.2S` | `Vd.2S -> result` | `v7/A32/A64` | -| int32x4_t vmlsq_s32(
     int32x4_t a,
     int32x4_t b,
     int32x4_t c)
| `a -> Vd.4S`
`b -> Vn.4S`
`c -> Vm.4S` | `MLS Vd.4S,Vn.4S,Vm.4S` | `Vd.4S -> result` | `v7/A32/A64` | -| uint8x8_t vmls_u8(
     uint8x8_t a,
     uint8x8_t b,
     uint8x8_t c)
| `a -> Vd.8B`
`b -> Vn.8B`
`c -> Vm.8B` | `MLS Vd.8B,Vn.8B,Vm.8B` | `Vd.8B -> result` | `v7/A32/A64` | -| uint8x16_t vmlsq_u8(
     uint8x16_t a,
     uint8x16_t b,
     uint8x16_t c)
| `a -> Vd.16B`
`b -> Vn.16B`
`c -> Vm.16B` | `MLS Vd.16B,Vn.16B,Vm.16B` | `Vd.16B -> result` | `v7/A32/A64` | -| uint16x4_t vmls_u16(
     uint16x4_t a,
     uint16x4_t b,
     uint16x4_t c)
| `a -> Vd.4H`
`b -> Vn.4H`
`c -> Vm.4H` | `MLS Vd.4H,Vn.4H,Vm.4H` | `Vd.4H -> result` | `v7/A32/A64` | -| uint16x8_t vmlsq_u16(
     uint16x8_t a,
     uint16x8_t b,
     uint16x8_t c)
| `a -> Vd.8H`
`b -> Vn.8H`
`c -> Vm.8H` | `MLS Vd.8H,Vn.8H,Vm.8H` | `Vd.8H -> result` | `v7/A32/A64` | -| uint32x2_t vmls_u32(
     uint32x2_t a,
     uint32x2_t b,
     uint32x2_t c)
| `a -> Vd.2S`
`b -> Vn.2S`
`c -> Vm.2S` | `MLS Vd.2S,Vn.2S,Vm.2S` | `Vd.2S -> result` | `v7/A32/A64` | -| uint32x4_t vmlsq_u32(
     uint32x4_t a,
     uint32x4_t b,
     uint32x4_t c)
| `a -> Vd.4S`
`b -> Vn.4S`
`c -> Vm.4S` | `MLS Vd.4S,Vn.4S,Vm.4S` | `Vd.4S -> result` | `v7/A32/A64` | -| float32x2_t vmls_f32(
     float32x2_t a,
     float32x2_t b,
     float32x2_t c)
| `N/A` | `RESULT[I] = a[i] - (b[i] * c[i]) for i = 0 to 1` | `N/A` | `v7/A32/A64` | -| float32x4_t vmlsq_f32(
     float32x4_t a,
     float32x4_t b,
     float32x4_t c)
| `N/A` | `RESULT[I] = a[i] - (b[i] * c[i]) for i = 0 to 3` | `N/A` | `v7/A32/A64` | -| float64x1_t vmls_f64(
     float64x1_t a,
     float64x1_t b,
     float64x1_t c)
| `N/A` | `RESULT[I] = a[i] - (b[i] * c[i]) for i = 0` | `N/A` | `A64` | -| float64x2_t vmlsq_f64(
     float64x2_t a,
     float64x2_t b,
     float64x2_t c)
| `N/A` | `RESULT[I] = a[i] - (b[i] * c[i]) for i = 0 to 1` | `N/A` | `A64` | +| Intrinsic | Argument preparation | AArch64 Instruction | Result | Supported architectures | +|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------|---------------------------------------------------------------------------------------------------------------------|--------------------|---------------------------| +| int8x8_t vmla_s8(
     int8x8_t a,
     int8x8_t b,
     int8x8_t c)
| `a -> Vd.8B`
`b -> Vn.8B`
`c -> Vm.8B` | `MLA Vd.8B,Vn.8B,Vm.8B` | `Vd.8B -> result` | `v7/A32/A64` | +| int8x16_t vmlaq_s8(
     int8x16_t a,
     int8x16_t b,
     int8x16_t c)
| `a -> Vd.16B`
`b -> Vn.16B`
`c -> Vm.16B` | `MLA Vd.16B,Vn.16B,Vm.16B` | `Vd.16B -> result` | `v7/A32/A64` | +| int16x4_t vmla_s16(
     int16x4_t a,
     int16x4_t b,
     int16x4_t c)
| `a -> Vd.4H`
`b -> Vn.4H`
`c -> Vm.4H` | `MLA Vd.4H,Vn.4H,Vm.4H` | `Vd.4H -> result` | `v7/A32/A64` | +| int16x8_t vmlaq_s16(
     int16x8_t a,
     int16x8_t b,
     int16x8_t c)
| `a -> Vd.8H`
`b -> Vn.8H`
`c -> Vm.8H` | `MLA Vd.8H,Vn.8H,Vm.8H` | `Vd.8H -> result` | `v7/A32/A64` | +| int32x2_t vmla_s32(
     int32x2_t a,
     int32x2_t b,
     int32x2_t c)
| `a -> Vd.2S`
`b -> Vn.2S`
`c -> Vm.2S` | `MLA Vd.2S,Vn.2S,Vm.2S` | `Vd.2S -> result` | `v7/A32/A64` | +| int32x4_t vmlaq_s32(
     int32x4_t a,
     int32x4_t b,
     int32x4_t c)
| `a -> Vd.4S`
`b -> Vn.4S`
`c -> Vm.4S` | `MLA Vd.4S,Vn.4S,Vm.4S` | `Vd.4S -> result` | `v7/A32/A64` | +| uint8x8_t vmla_u8(
     uint8x8_t a,
     uint8x8_t b,
     uint8x8_t c)
| `a -> Vd.8B`
`b -> Vn.8B`
`c -> Vm.8B` | `MLA Vd.8B,Vn.8B,Vm.8B` | `Vd.8B -> result` | `v7/A32/A64` | +| uint8x16_t vmlaq_u8(
     uint8x16_t a,
     uint8x16_t b,
     uint8x16_t c)
| `a -> Vd.16B`
`b -> Vn.16B`
`c -> Vm.16B` | `MLA Vd.16B,Vn.16B,Vm.16B` | `Vd.16B -> result` | `v7/A32/A64` | +| uint16x4_t vmla_u16(
     uint16x4_t a,
     uint16x4_t b,
     uint16x4_t c)
| `a -> Vd.4H`
`b -> Vn.4H`
`c -> Vm.4H` | `MLA Vd.4H,Vn.4H,Vm.4H` | `Vd.4H -> result` | `v7/A32/A64` | +| uint16x8_t vmlaq_u16(
     uint16x8_t a,
     uint16x8_t b,
     uint16x8_t c)
| `a -> Vd.8H`
`b -> Vn.8H`
`c -> Vm.8H` | `MLA Vd.8H,Vn.8H,Vm.8H` | `Vd.8H -> result` | `v7/A32/A64` | +| uint32x2_t vmla_u32(
     uint32x2_t a,
     uint32x2_t b,
     uint32x2_t c)
| `a -> Vd.2S`
`b -> Vn.2S`
`c -> Vm.2S` | `MLA Vd.2S,Vn.2S,Vm.2S` | `Vd.2S -> result` | `v7/A32/A64` | +| uint32x4_t vmlaq_u32(
     uint32x4_t a,
     uint32x4_t b,
     uint32x4_t c)
| `a -> Vd.4S`
`b -> Vn.4S`
`c -> Vm.4S` | `MLA Vd.4S,Vn.4S,Vm.4S` | `Vd.4S -> result` | `v7/A32/A64` | +| float32x2_t vmla_f32(
     float32x2_t a,
     float32x2_t b,
     float32x2_t c)
| `N/A` | `RESULT[I] = vadd(a[i], vmul(b[i], c[i])) for i = 0 to 1`
`Final instruction sequence is implementation defined` | `N/A` | `v7/A32/A64` | +| float32x4_t vmlaq_f32(
     float32x4_t a,
     float32x4_t b,
     float32x4_t c)
| `N/A` | `RESULT[I] = vadd(a[i], vmul(b[i], c[i])) for i = 0 to 3`
`Final instruction sequence is implementation defined` | `N/A` | `v7/A32/A64` | +| float64x1_t vmla_f64(
     float64x1_t a,
     float64x1_t b,
     float64x1_t c)
| `N/A` | `RESULT[I] = vadd(a[i], vmul(b[i], c[i])) for i = 0`
`Final instruction sequence is implementation defined` | `N/A` | `A64` | +| float64x2_t vmlaq_f64(
     float64x2_t a,
     float64x2_t b,
     float64x2_t c)
| `N/A` | `RESULT[I] = vadd(a[i], vmul(b[i], c[i])) for i = 0 to 1`
`Final instruction sequence is implementation defined` | `N/A` | `A64` | +| int8x8_t vmls_s8(
     int8x8_t a,
     int8x8_t b,
     int8x8_t c)
| `a -> Vd.8B`
`b -> Vn.8B`
`c -> Vm.8B` | `MLS Vd.8B,Vn.8B,Vm.8B` | `Vd.8B -> result` | `v7/A32/A64` | +| int8x16_t vmlsq_s8(
     int8x16_t a,
     int8x16_t b,
     int8x16_t c)
| `a -> Vd.16B`
`b -> Vn.16B`
`c -> Vm.16B` | `MLS Vd.16B,Vn.16B,Vm.16B` | `Vd.16B -> result` | `v7/A32/A64` | +| int16x4_t vmls_s16(
     int16x4_t a,
     int16x4_t b,
     int16x4_t c)
| `a -> Vd.4H`
`b -> Vn.4H`
`c -> Vm.4H` | `MLS Vd.4H,Vn.4H,Vm.4H` | `Vd.4H -> result` | `v7/A32/A64` | +| int16x8_t vmlsq_s16(
     int16x8_t a,
     int16x8_t b,
     int16x8_t c)
| `a -> Vd.8H`
`b -> Vn.8H`
`c -> Vm.8H` | `MLS Vd.8H,Vn.8H,Vm.8H` | `Vd.8H -> result` | `v7/A32/A64` | +| int32x2_t vmls_s32(
     int32x2_t a,
     int32x2_t b,
     int32x2_t c)
| `a -> Vd.2S`
`b -> Vn.2S`
`c -> Vm.2S` | `MLS Vd.2S,Vn.2S,Vm.2S` | `Vd.2S -> result` | `v7/A32/A64` | +| int32x4_t vmlsq_s32(
     int32x4_t a,
     int32x4_t b,
     int32x4_t c)
| `a -> Vd.4S`
`b -> Vn.4S`
`c -> Vm.4S` | `MLS Vd.4S,Vn.4S,Vm.4S` | `Vd.4S -> result` | `v7/A32/A64` | +| uint8x8_t vmls_u8(
     uint8x8_t a,
     uint8x8_t b,
     uint8x8_t c)
| `a -> Vd.8B`
`b -> Vn.8B`
`c -> Vm.8B` | `MLS Vd.8B,Vn.8B,Vm.8B` | `Vd.8B -> result` | `v7/A32/A64` | +| uint8x16_t vmlsq_u8(
     uint8x16_t a,
     uint8x16_t b,
     uint8x16_t c)
| `a -> Vd.16B`
`b -> Vn.16B`
`c -> Vm.16B` | `MLS Vd.16B,Vn.16B,Vm.16B` | `Vd.16B -> result` | `v7/A32/A64` | +| uint16x4_t vmls_u16(
     uint16x4_t a,
     uint16x4_t b,
     uint16x4_t c)
| `a -> Vd.4H`
`b -> Vn.4H`
`c -> Vm.4H` | `MLS Vd.4H,Vn.4H,Vm.4H` | `Vd.4H -> result` | `v7/A32/A64` | +| uint16x8_t vmlsq_u16(
     uint16x8_t a,
     uint16x8_t b,
     uint16x8_t c)
| `a -> Vd.8H`
`b -> Vn.8H`
`c -> Vm.8H` | `MLS Vd.8H,Vn.8H,Vm.8H` | `Vd.8H -> result` | `v7/A32/A64` | +| uint32x2_t vmls_u32(
     uint32x2_t a,
     uint32x2_t b,
     uint32x2_t c)
| `a -> Vd.2S`
`b -> Vn.2S`
`c -> Vm.2S` | `MLS Vd.2S,Vn.2S,Vm.2S` | `Vd.2S -> result` | `v7/A32/A64` | +| uint32x4_t vmlsq_u32(
     uint32x4_t a,
     uint32x4_t b,
     uint32x4_t c)
| `a -> Vd.4S`
`b -> Vn.4S`
`c -> Vm.4S` | `MLS Vd.4S,Vn.4S,Vm.4S` | `Vd.4S -> result` | `v7/A32/A64` | +| float32x2_t vmls_f32(
     float32x2_t a,
     float32x2_t b,
     float32x2_t c)
| `N/A` | `RESULT[I] = vsub(a[i], vmul(b[i], c[i])) for i = 0 to 1`
`Final instruction sequence is implementation defined` | `N/A` | `v7/A32/A64` | +| float32x4_t vmlsq_f32(
     float32x4_t a,
     float32x4_t b,
     float32x4_t c)
| `N/A` | `RESULT[I] = vsub(a[i], vmul(b[i], c[i])) for i = 0 to 3`
`Final instruction sequence is implementation defined` | `N/A` | `v7/A32/A64` | +| float64x1_t vmls_f64(
     float64x1_t a,
     float64x1_t b,
     float64x1_t c)
| `N/A` | `RESULT[I] = vsub(a[i], vmul(b[i], c[i])) for i = 0`
`Final instruction sequence is implementation defined` | `N/A` | `A64` | +| float64x2_t vmlsq_f64(
     float64x2_t a,
     float64x2_t b,
     float64x2_t c)
| `N/A` | `RESULT[I] = vsub(a[i], vmul(b[i], c[i])) for i = 0 to 1`
`Final instruction sequence is implementation defined` | `N/A` | `A64` | ##### Multiply-accumulate and widen @@ -2663,95 +2663,95 @@ The intrinsics in this section are guarded by the macro ``__ARM_NEON``. #### Vector multiply-accumulate by scalar -| Intrinsic | Argument preparation | AArch64 Instruction | Result | Supported architectures | -|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------|------------------------------------------------------|-------------------|---------------------------| -| int16x4_t vmla_lane_s16(
     int16x4_t a,
     int16x4_t b,
     int16x4_t v,
     const int lane)
| `a -> Vd.4H`
`b -> Vn.4H`
`v -> Vm.4H`
`0 <= lane <= 3` | `MLA Vd.4H,Vn.4H,Vm.H[lane]` | `Vd.4H -> result` | `v7/A32/A64` | -| int16x8_t vmlaq_lane_s16(
     int16x8_t a,
     int16x8_t b,
     int16x4_t v,
     const int lane)
| `a -> Vd.8H`
`b -> Vn.8H`
`v -> Vm.4H`
`0 <= lane <= 3` | `MLA Vd.8H,Vn.8H,Vm.H[lane]` | `Vd.8H -> result` | `v7/A32/A64` | -| int32x2_t vmla_lane_s32(
     int32x2_t a,
     int32x2_t b,
     int32x2_t v,
     const int lane)
| `a -> Vd.2S`
`b -> Vn.2S`
`v -> Vm.2S`
`0 <= lane <= 1` | `MLA Vd.2S,Vn.2S,Vm.S[lane]` | `Vd.2S -> result` | `v7/A32/A64` | -| int32x4_t vmlaq_lane_s32(
     int32x4_t a,
     int32x4_t b,
     int32x2_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4S`
`v -> Vm.2S`
`0 <= lane <= 1` | `MLA Vd.4S,Vn.4S,Vm.S[lane]` | `Vd.4S -> result` | `v7/A32/A64` | -| uint16x4_t vmla_lane_u16(
     uint16x4_t a,
     uint16x4_t b,
     uint16x4_t v,
     const int lane)
| `a -> Vd.4H`
`b -> Vn.4H`
`v -> Vm.4H`
`0 <= lane <= 3` | `MLA Vd.4H,Vn.4H,Vm.H[lane]` | `Vd.4H -> result` | `v7/A32/A64` | -| uint16x8_t vmlaq_lane_u16(
     uint16x8_t a,
     uint16x8_t b,
     uint16x4_t v,
     const int lane)
| `a -> Vd.8H`
`b -> Vn.8H`
`v -> Vm.4H`
`0 <= lane <= 3` | `MLA Vd.8H,Vn.8H,Vm.H[lane]` | `Vd.8H -> result` | `v7/A32/A64` | -| uint32x2_t vmla_lane_u32(
     uint32x2_t a,
     uint32x2_t b,
     uint32x2_t v,
     const int lane)
| `a -> Vd.2S`
`b -> Vn.2S`
`v -> Vm.2S`
`0 <= lane <= 1` | `MLA Vd.2S,Vn.2S,Vm.S[lane]` | `Vd.2S -> result` | `v7/A32/A64` | -| uint32x4_t vmlaq_lane_u32(
     uint32x4_t a,
     uint32x4_t b,
     uint32x2_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4S`
`v -> Vm.2S`
`0 <= lane <= 1` | `MLA Vd.4S,Vn.4S,Vm.S[lane]` | `Vd.4S -> result` | `v7/A32/A64` | -| float32x2_t vmla_lane_f32(
     float32x2_t a,
     float32x2_t b,
     float32x2_t v,
     const int lane)
| `0 <= lane <= 1` | `RESULT[I] = a[i] + (b[i] * v[lane]) for i = 0 to 1` | `N/A` | `v7/A32/A64` | -| float32x4_t vmlaq_lane_f32(
     float32x4_t a,
     float32x4_t b,
     float32x2_t v,
     const int lane)
| `0 <= lane <= 1` | `RESULT[I] = a[i] + (b[i] * v[lane]) for i = 0 to 3` | `N/A` | `v7/A32/A64` | -| int16x4_t vmla_laneq_s16(
     int16x4_t a,
     int16x4_t b,
     int16x8_t v,
     const int lane)
| `a -> Vd.4H`
`b -> Vn.4H`
`v -> Vm.8H`
`0 <= lane <= 7` | `MLA Vd.4H,Vn.4H,Vm.H[lane]` | `Vd.4H -> result` | `A64` | -| int16x8_t vmlaq_laneq_s16(
     int16x8_t a,
     int16x8_t b,
     int16x8_t v,
     const int lane)
| `a -> Vd.8H`
`b -> Vn.8H`
`v -> Vm.8H`
`0 <= lane <= 7` | `MLA Vd.8H,Vn.8H,Vm.H[lane]` | `Vd.8H -> result` | `A64` | -| int32x2_t vmla_laneq_s32(
     int32x2_t a,
     int32x2_t b,
     int32x4_t v,
     const int lane)
| `a -> Vd.2S`
`b -> Vn.2S`
`v -> Vm.4S`
`0 <= lane <= 3` | `MLA Vd.2S,Vn.2S,Vm.S[lane]` | `Vd.2S -> result` | `A64` | -| int32x4_t vmlaq_laneq_s32(
     int32x4_t a,
     int32x4_t b,
     int32x4_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4S`
`v -> Vm.4S`
`0 <= lane <= 3` | `MLA Vd.4S,Vn.4S,Vm.S[lane]` | `Vd.4S -> result` | `A64` | -| uint16x4_t vmla_laneq_u16(
     uint16x4_t a,
     uint16x4_t b,
     uint16x8_t v,
     const int lane)
| `a -> Vd.4H`
`b -> Vn.4H`
`v -> Vm.8H`
`0 <= lane <= 7` | `MLA Vd.4H,Vn.4H,Vm.H[lane]` | `Vd.4H -> result` | `A64` | -| uint16x8_t vmlaq_laneq_u16(
     uint16x8_t a,
     uint16x8_t b,
     uint16x8_t v,
     const int lane)
| `a -> Vd.8H`
`b -> Vn.8H`
`v -> Vm.8H`
`0 <= lane <= 7` | `MLA Vd.8H,Vn.8H,Vm.H[lane]` | `Vd.8H -> result` | `A64` | -| uint32x2_t vmla_laneq_u32(
     uint32x2_t a,
     uint32x2_t b,
     uint32x4_t v,
     const int lane)
| `a -> Vd.2S`
`b -> Vn.2S`
`v -> Vm.4S`
`0 <= lane <= 3` | `MLA Vd.2S,Vn.2S,Vm.S[lane]` | `Vd.2S -> result` | `A64` | -| uint32x4_t vmlaq_laneq_u32(
     uint32x4_t a,
     uint32x4_t b,
     uint32x4_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4S`
`v -> Vm.4S`
`0 <= lane <= 3` | `MLA Vd.4S,Vn.4S,Vm.S[lane]` | `Vd.4S -> result` | `A64` | -| float32x2_t vmla_laneq_f32(
     float32x2_t a,
     float32x2_t b,
     float32x4_t v,
     const int lane)
| `0 <= lane <= 3` | `RESULT[I] = a[i] + (b[i] * v[lane]) for i = 0 to 1` | `N/A` | `A64` | -| float32x4_t vmlaq_laneq_f32(
     float32x4_t a,
     float32x4_t b,
     float32x4_t v,
     const int lane)
| `0 <= lane <= 3` | `RESULT[I] = a[i] + (b[i] * v[lane]) for i = 0 to 3` | `N/A` | `A64` | -| int32x4_t vmlal_lane_s16(
     int32x4_t a,
     int16x4_t b,
     int16x4_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4H`
`v -> Vm.4H`
`0 <= lane <= 3` | `SMLAL Vd.4S,Vn.4H,Vm.H[lane]` | `Vd.4S -> result` | `v7/A32/A64` | -| int64x2_t vmlal_lane_s32(
     int64x2_t a,
     int32x2_t b,
     int32x2_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.2S`
`v -> Vm.2S`
`0 <= lane <= 1` | `SMLAL Vd.2D,Vn.2S,Vm.S[lane]` | `Vd.2D -> result` | `v7/A32/A64` | -| uint32x4_t vmlal_lane_u16(
     uint32x4_t a,
     uint16x4_t b,
     uint16x4_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4H`
`v -> Vm.4H`
`0 <= lane <= 3` | `UMLAL Vd.4S,Vn.4H,Vm.H[lane]` | `Vd.4S -> result` | `v7/A32/A64` | -| uint64x2_t vmlal_lane_u32(
     uint64x2_t a,
     uint32x2_t b,
     uint32x2_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.2S`
`v -> Vm.2S`
`0 <= lane <= 1` | `UMLAL Vd.2D,Vn.2S,Vm.S[lane]` | `Vd.2D -> result` | `v7/A32/A64` | -| int32x4_t vmlal_high_lane_s16(
     int32x4_t a,
     int16x8_t b,
     int16x4_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.8H`
`v -> Vm.4H`
`0 <= lane <= 3` | `SMLAL2 Vd.4S,Vn.8H,Vm.H[lane]` | `Vd.4S -> result` | `A64` | -| int64x2_t vmlal_high_lane_s32(
     int64x2_t a,
     int32x4_t b,
     int32x2_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.4S`
`v -> Vm.2S`
`0 <= lane <= 1` | `SMLAL2 Vd.2D,Vn.4S,Vm.S[lane]` | `Vd.2D -> result` | `A64` | -| uint32x4_t vmlal_high_lane_u16(
     uint32x4_t a,
     uint16x8_t b,
     uint16x4_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.8H`
`v -> Vm.4H`
`0 <= lane <= 3` | `UMLAL2 Vd.4S,Vn.8H,Vm.H[lane]` | `Vd.4S -> result` | `A64` | -| uint64x2_t vmlal_high_lane_u32(
     uint64x2_t a,
     uint32x4_t b,
     uint32x2_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.4S`
`v -> Vm.2S`
`0 <= lane <= 1` | `UMLAL2 Vd.2D,Vn.4S,Vm.S[lane]` | `Vd.2D -> result` | `A64` | -| int32x4_t vmlal_laneq_s16(
     int32x4_t a,
     int16x4_t b,
     int16x8_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4H`
`v -> Vm.8H`
`0 <= lane <= 7` | `SMLAL Vd.4S,Vn.4H,Vm.H[lane]` | `Vd.4S -> result` | `A64` | -| int64x2_t vmlal_laneq_s32(
     int64x2_t a,
     int32x2_t b,
     int32x4_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.2S`
`v -> Vm.4S`
`0 <= lane <= 3` | `SMLAL Vd.2D,Vn.2S,Vm.S[lane]` | `Vd.2D -> result` | `A64` | -| uint32x4_t vmlal_laneq_u16(
     uint32x4_t a,
     uint16x4_t b,
     uint16x8_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4H`
`v -> Vm.8H`
`0 <= lane <= 7` | `UMLAL Vd.4S,Vn.4H,Vm.H[lane]` | `Vd.4S -> result` | `A64` | -| uint64x2_t vmlal_laneq_u32(
     uint64x2_t a,
     uint32x2_t b,
     uint32x4_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.2S`
`v -> Vm.4S`
`0 <= lane <= 3` | `UMLAL Vd.2D,Vn.2S,Vm.S[lane]` | `Vd.2D -> result` | `A64` | -| int32x4_t vmlal_high_laneq_s16(
     int32x4_t a,
     int16x8_t b,
     int16x8_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.8H`
`v -> Vm.8H`
`0 <= lane <= 7` | `SMLAL2 Vd.4S,Vn.8H,Vm.H[lane]` | `Vd.4S -> result` | `A64` | -| int64x2_t vmlal_high_laneq_s32(
     int64x2_t a,
     int32x4_t b,
     int32x4_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.4S`
`v -> Vm.4S`
`0 <= lane <= 3` | `SMLAL2 Vd.2D,Vn.4S,Vm.S[lane]` | `Vd.2D -> result` | `A64` | -| uint32x4_t vmlal_high_laneq_u16(
     uint32x4_t a,
     uint16x8_t b,
     uint16x8_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.8H`
`v -> Vm.8H`
`0 <= lane <= 7` | `UMLAL2 Vd.4S,Vn.8H,Vm.H[lane]` | `Vd.4S -> result` | `A64` | -| uint64x2_t vmlal_high_laneq_u32(
     uint64x2_t a,
     uint32x4_t b,
     uint32x4_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.4S`
`v -> Vm.4S`
`0 <= lane <= 3` | `UMLAL2 Vd.2D,Vn.4S,Vm.S[lane]` | `Vd.2D -> result` | `A64` | -| int16x4_t vmla_n_s16(
     int16x4_t a,
     int16x4_t b,
     int16_t c)
| `a -> Vd.4H`
`b -> Vn.4H`
`c -> Vm.H[0]` | `MLA Vd.4H,Vn.4H,Vm.H[0]` | `Vd.4H -> result` | `v7/A32/A64` | -| int16x8_t vmlaq_n_s16(
     int16x8_t a,
     int16x8_t b,
     int16_t c)
| `a -> Vd.8H`
`b -> Vn.8H`
`c -> Vm.H[0]` | `MLA Vd.8H,Vn.8H,Vm.H[0]` | `Vd.8H -> result` | `v7/A32/A64` | -| int32x2_t vmla_n_s32(
     int32x2_t a,
     int32x2_t b,
     int32_t c)
| `a -> Vd.2S`
`b -> Vn.2S`
`c -> Vm.S[0]` | `MLA Vd.2S,Vn.2S,Vm.S[0]` | `Vd.2S -> result` | `v7/A32/A64` | -| int32x4_t vmlaq_n_s32(
     int32x4_t a,
     int32x4_t b,
     int32_t c)
| `a -> Vd.4S`
`b -> Vn.4S`
`c -> Vm.S[0]` | `MLA Vd.4S,Vn.4S,Vm.S[0]` | `Vd.4S -> result` | `v7/A32/A64` | -| uint16x4_t vmla_n_u16(
     uint16x4_t a,
     uint16x4_t b,
     uint16_t c)
| `a -> Vd.4H`
`b -> Vn.4H`
`c -> Vm.H[0]` | `MLA Vd.4H,Vn.4H,Vm.H[0]` | `Vd.4H -> result` | `v7/A32/A64` | -| uint16x8_t vmlaq_n_u16(
     uint16x8_t a,
     uint16x8_t b,
     uint16_t c)
| `a -> Vd.8H`
`b -> Vn.8H`
`c -> Vm.H[0]` | `MLA Vd.8H,Vn.8H,Vm.H[0]` | `Vd.8H -> result` | `v7/A32/A64` | -| uint32x2_t vmla_n_u32(
     uint32x2_t a,
     uint32x2_t b,
     uint32_t c)
| `a -> Vd.2S`
`b -> Vn.2S`
`c -> Vm.S[0]` | `MLA Vd.2S,Vn.2S,Vm.S[0]` | `Vd.2S -> result` | `v7/A32/A64` | -| uint32x4_t vmlaq_n_u32(
     uint32x4_t a,
     uint32x4_t b,
     uint32_t c)
| `a -> Vd.4S`
`b -> Vn.4S`
`c -> Vm.S[0]` | `MLA Vd.4S,Vn.4S,Vm.S[0]` | `Vd.4S -> result` | `v7/A32/A64` | -| float32x2_t vmla_n_f32(
     float32x2_t a,
     float32x2_t b,
     float32_t c)
| `N/A` | `RESULT[I] = a[i] + (b[i] * c) for i = 0 to 1` | `N/A` | `v7/A32/A64` | -| float32x4_t vmlaq_n_f32(
     float32x4_t a,
     float32x4_t b,
     float32_t c)
| `N/A` | `RESULT[I] = a[i] + (b[i] * c) for i = 0 to 3` | `N/A` | `v7/A32/A64` | +| Intrinsic | Argument preparation | AArch64 Instruction | Result | Supported architectures | +|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------|-------------------|---------------------------| +| int16x4_t vmla_lane_s16(
     int16x4_t a,
     int16x4_t b,
     int16x4_t v,
     const int lane)
| `a -> Vd.4H`
`b -> Vn.4H`
`v -> Vm.4H`
`0 <= lane <= 3` | `MLA Vd.4H,Vn.4H,Vm.H[lane]` | `Vd.4H -> result` | `v7/A32/A64` | +| int16x8_t vmlaq_lane_s16(
     int16x8_t a,
     int16x8_t b,
     int16x4_t v,
     const int lane)
| `a -> Vd.8H`
`b -> Vn.8H`
`v -> Vm.4H`
`0 <= lane <= 3` | `MLA Vd.8H,Vn.8H,Vm.H[lane]` | `Vd.8H -> result` | `v7/A32/A64` | +| int32x2_t vmla_lane_s32(
     int32x2_t a,
     int32x2_t b,
     int32x2_t v,
     const int lane)
| `a -> Vd.2S`
`b -> Vn.2S`
`v -> Vm.2S`
`0 <= lane <= 1` | `MLA Vd.2S,Vn.2S,Vm.S[lane]` | `Vd.2S -> result` | `v7/A32/A64` | +| int32x4_t vmlaq_lane_s32(
     int32x4_t a,
     int32x4_t b,
     int32x2_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4S`
`v -> Vm.2S`
`0 <= lane <= 1` | `MLA Vd.4S,Vn.4S,Vm.S[lane]` | `Vd.4S -> result` | `v7/A32/A64` | +| uint16x4_t vmla_lane_u16(
     uint16x4_t a,
     uint16x4_t b,
     uint16x4_t v,
     const int lane)
| `a -> Vd.4H`
`b -> Vn.4H`
`v -> Vm.4H`
`0 <= lane <= 3` | `MLA Vd.4H,Vn.4H,Vm.H[lane]` | `Vd.4H -> result` | `v7/A32/A64` | +| uint16x8_t vmlaq_lane_u16(
     uint16x8_t a,
     uint16x8_t b,
     uint16x4_t v,
     const int lane)
| `a -> Vd.8H`
`b -> Vn.8H`
`v -> Vm.4H`
`0 <= lane <= 3` | `MLA Vd.8H,Vn.8H,Vm.H[lane]` | `Vd.8H -> result` | `v7/A32/A64` | +| uint32x2_t vmla_lane_u32(
     uint32x2_t a,
     uint32x2_t b,
     uint32x2_t v,
     const int lane)
| `a -> Vd.2S`
`b -> Vn.2S`
`v -> Vm.2S`
`0 <= lane <= 1` | `MLA Vd.2S,Vn.2S,Vm.S[lane]` | `Vd.2S -> result` | `v7/A32/A64` | +| uint32x4_t vmlaq_lane_u32(
     uint32x4_t a,
     uint32x4_t b,
     uint32x2_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4S`
`v -> Vm.2S`
`0 <= lane <= 1` | `MLA Vd.4S,Vn.4S,Vm.S[lane]` | `Vd.4S -> result` | `v7/A32/A64` | +| float32x2_t vmla_lane_f32(
     float32x2_t a,
     float32x2_t b,
     float32x2_t v,
     const int lane)
| `0 <= lane <= 1` | `RESULT[I] = vadd(a[i], vmul(b[i], v[lane])) for i = 0 to 1`
`Final instruction sequence is implementation defined` | `N/A` | `v7/A32/A64` | +| float32x4_t vmlaq_lane_f32(
     float32x4_t a,
     float32x4_t b,
     float32x2_t v,
     const int lane)
| `0 <= lane <= 1` | `RESULT[I] = vadd(a[i], vmul(b[i], v[lane])) for i = 0 to 3`
`Final instruction sequence is implementation defined` | `N/A` | `v7/A32/A64` | +| int16x4_t vmla_laneq_s16(
     int16x4_t a,
     int16x4_t b,
     int16x8_t v,
     const int lane)
| `a -> Vd.4H`
`b -> Vn.4H`
`v -> Vm.8H`
`0 <= lane <= 7` | `MLA Vd.4H,Vn.4H,Vm.H[lane]` | `Vd.4H -> result` | `A64` | +| int16x8_t vmlaq_laneq_s16(
     int16x8_t a,
     int16x8_t b,
     int16x8_t v,
     const int lane)
| `a -> Vd.8H`
`b -> Vn.8H`
`v -> Vm.8H`
`0 <= lane <= 7` | `MLA Vd.8H,Vn.8H,Vm.H[lane]` | `Vd.8H -> result` | `A64` | +| int32x2_t vmla_laneq_s32(
     int32x2_t a,
     int32x2_t b,
     int32x4_t v,
     const int lane)
| `a -> Vd.2S`
`b -> Vn.2S`
`v -> Vm.4S`
`0 <= lane <= 3` | `MLA Vd.2S,Vn.2S,Vm.S[lane]` | `Vd.2S -> result` | `A64` | +| int32x4_t vmlaq_laneq_s32(
     int32x4_t a,
     int32x4_t b,
     int32x4_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4S`
`v -> Vm.4S`
`0 <= lane <= 3` | `MLA Vd.4S,Vn.4S,Vm.S[lane]` | `Vd.4S -> result` | `A64` | +| uint16x4_t vmla_laneq_u16(
     uint16x4_t a,
     uint16x4_t b,
     uint16x8_t v,
     const int lane)
| `a -> Vd.4H`
`b -> Vn.4H`
`v -> Vm.8H`
`0 <= lane <= 7` | `MLA Vd.4H,Vn.4H,Vm.H[lane]` | `Vd.4H -> result` | `A64` | +| uint16x8_t vmlaq_laneq_u16(
     uint16x8_t a,
     uint16x8_t b,
     uint16x8_t v,
     const int lane)
| `a -> Vd.8H`
`b -> Vn.8H`
`v -> Vm.8H`
`0 <= lane <= 7` | `MLA Vd.8H,Vn.8H,Vm.H[lane]` | `Vd.8H -> result` | `A64` | +| uint32x2_t vmla_laneq_u32(
     uint32x2_t a,
     uint32x2_t b,
     uint32x4_t v,
     const int lane)
| `a -> Vd.2S`
`b -> Vn.2S`
`v -> Vm.4S`
`0 <= lane <= 3` | `MLA Vd.2S,Vn.2S,Vm.S[lane]` | `Vd.2S -> result` | `A64` | +| uint32x4_t vmlaq_laneq_u32(
     uint32x4_t a,
     uint32x4_t b,
     uint32x4_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4S`
`v -> Vm.4S`
`0 <= lane <= 3` | `MLA Vd.4S,Vn.4S,Vm.S[lane]` | `Vd.4S -> result` | `A64` | +| float32x2_t vmla_laneq_f32(
     float32x2_t a,
     float32x2_t b,
     float32x4_t v,
     const int lane)
| `0 <= lane <= 3` | `RESULT[I] = vadd(a[i], vmul(b[i], v[lane])) for i = 0 to 1`
`Final instruction sequence is implementation defined` | `N/A` | `A64` | +| float32x4_t vmlaq_laneq_f32(
     float32x4_t a,
     float32x4_t b,
     float32x4_t v,
     const int lane)
| `0 <= lane <= 3` | `RESULT[I] = vadd(a[i], vmul(b[i], v[lane])) for i = 0 to 3`
`Final instruction sequence is implementation defined` | `N/A` | `A64` | +| int32x4_t vmlal_lane_s16(
     int32x4_t a,
     int16x4_t b,
     int16x4_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4H`
`v -> Vm.4H`
`0 <= lane <= 3` | `SMLAL Vd.4S,Vn.4H,Vm.H[lane]` | `Vd.4S -> result` | `v7/A32/A64` | +| int64x2_t vmlal_lane_s32(
     int64x2_t a,
     int32x2_t b,
     int32x2_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.2S`
`v -> Vm.2S`
`0 <= lane <= 1` | `SMLAL Vd.2D,Vn.2S,Vm.S[lane]` | `Vd.2D -> result` | `v7/A32/A64` | +| uint32x4_t vmlal_lane_u16(
     uint32x4_t a,
     uint16x4_t b,
     uint16x4_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4H`
`v -> Vm.4H`
`0 <= lane <= 3` | `UMLAL Vd.4S,Vn.4H,Vm.H[lane]` | `Vd.4S -> result` | `v7/A32/A64` | +| uint64x2_t vmlal_lane_u32(
     uint64x2_t a,
     uint32x2_t b,
     uint32x2_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.2S`
`v -> Vm.2S`
`0 <= lane <= 1` | `UMLAL Vd.2D,Vn.2S,Vm.S[lane]` | `Vd.2D -> result` | `v7/A32/A64` | +| int32x4_t vmlal_high_lane_s16(
     int32x4_t a,
     int16x8_t b,
     int16x4_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.8H`
`v -> Vm.4H`
`0 <= lane <= 3` | `SMLAL2 Vd.4S,Vn.8H,Vm.H[lane]` | `Vd.4S -> result` | `A64` | +| int64x2_t vmlal_high_lane_s32(
     int64x2_t a,
     int32x4_t b,
     int32x2_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.4S`
`v -> Vm.2S`
`0 <= lane <= 1` | `SMLAL2 Vd.2D,Vn.4S,Vm.S[lane]` | `Vd.2D -> result` | `A64` | +| uint32x4_t vmlal_high_lane_u16(
     uint32x4_t a,
     uint16x8_t b,
     uint16x4_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.8H`
`v -> Vm.4H`
`0 <= lane <= 3` | `UMLAL2 Vd.4S,Vn.8H,Vm.H[lane]` | `Vd.4S -> result` | `A64` | +| uint64x2_t vmlal_high_lane_u32(
     uint64x2_t a,
     uint32x4_t b,
     uint32x2_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.4S`
`v -> Vm.2S`
`0 <= lane <= 1` | `UMLAL2 Vd.2D,Vn.4S,Vm.S[lane]` | `Vd.2D -> result` | `A64` | +| int32x4_t vmlal_laneq_s16(
     int32x4_t a,
     int16x4_t b,
     int16x8_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4H`
`v -> Vm.8H`
`0 <= lane <= 7` | `SMLAL Vd.4S,Vn.4H,Vm.H[lane]` | `Vd.4S -> result` | `A64` | +| int64x2_t vmlal_laneq_s32(
     int64x2_t a,
     int32x2_t b,
     int32x4_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.2S`
`v -> Vm.4S`
`0 <= lane <= 3` | `SMLAL Vd.2D,Vn.2S,Vm.S[lane]` | `Vd.2D -> result` | `A64` | +| uint32x4_t vmlal_laneq_u16(
     uint32x4_t a,
     uint16x4_t b,
     uint16x8_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4H`
`v -> Vm.8H`
`0 <= lane <= 7` | `UMLAL Vd.4S,Vn.4H,Vm.H[lane]` | `Vd.4S -> result` | `A64` | +| uint64x2_t vmlal_laneq_u32(
     uint64x2_t a,
     uint32x2_t b,
     uint32x4_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.2S`
`v -> Vm.4S`
`0 <= lane <= 3` | `UMLAL Vd.2D,Vn.2S,Vm.S[lane]` | `Vd.2D -> result` | `A64` | +| int32x4_t vmlal_high_laneq_s16(
     int32x4_t a,
     int16x8_t b,
     int16x8_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.8H`
`v -> Vm.8H`
`0 <= lane <= 7` | `SMLAL2 Vd.4S,Vn.8H,Vm.H[lane]` | `Vd.4S -> result` | `A64` | +| int64x2_t vmlal_high_laneq_s32(
     int64x2_t a,
     int32x4_t b,
     int32x4_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.4S`
`v -> Vm.4S`
`0 <= lane <= 3` | `SMLAL2 Vd.2D,Vn.4S,Vm.S[lane]` | `Vd.2D -> result` | `A64` | +| uint32x4_t vmlal_high_laneq_u16(
     uint32x4_t a,
     uint16x8_t b,
     uint16x8_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.8H`
`v -> Vm.8H`
`0 <= lane <= 7` | `UMLAL2 Vd.4S,Vn.8H,Vm.H[lane]` | `Vd.4S -> result` | `A64` | +| uint64x2_t vmlal_high_laneq_u32(
     uint64x2_t a,
     uint32x4_t b,
     uint32x4_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.4S`
`v -> Vm.4S`
`0 <= lane <= 3` | `UMLAL2 Vd.2D,Vn.4S,Vm.S[lane]` | `Vd.2D -> result` | `A64` | +| int16x4_t vmla_n_s16(
     int16x4_t a,
     int16x4_t b,
     int16_t c)
| `a -> Vd.4H`
`b -> Vn.4H`
`c -> Vm.H[0]` | `MLA Vd.4H,Vn.4H,Vm.H[0]` | `Vd.4H -> result` | `v7/A32/A64` | +| int16x8_t vmlaq_n_s16(
     int16x8_t a,
     int16x8_t b,
     int16_t c)
| `a -> Vd.8H`
`b -> Vn.8H`
`c -> Vm.H[0]` | `MLA Vd.8H,Vn.8H,Vm.H[0]` | `Vd.8H -> result` | `v7/A32/A64` | +| int32x2_t vmla_n_s32(
     int32x2_t a,
     int32x2_t b,
     int32_t c)
| `a -> Vd.2S`
`b -> Vn.2S`
`c -> Vm.S[0]` | `MLA Vd.2S,Vn.2S,Vm.S[0]` | `Vd.2S -> result` | `v7/A32/A64` | +| int32x4_t vmlaq_n_s32(
     int32x4_t a,
     int32x4_t b,
     int32_t c)
| `a -> Vd.4S`
`b -> Vn.4S`
`c -> Vm.S[0]` | `MLA Vd.4S,Vn.4S,Vm.S[0]` | `Vd.4S -> result` | `v7/A32/A64` | +| uint16x4_t vmla_n_u16(
     uint16x4_t a,
     uint16x4_t b,
     uint16_t c)
| `a -> Vd.4H`
`b -> Vn.4H`
`c -> Vm.H[0]` | `MLA Vd.4H,Vn.4H,Vm.H[0]` | `Vd.4H -> result` | `v7/A32/A64` | +| uint16x8_t vmlaq_n_u16(
     uint16x8_t a,
     uint16x8_t b,
     uint16_t c)
| `a -> Vd.8H`
`b -> Vn.8H`
`c -> Vm.H[0]` | `MLA Vd.8H,Vn.8H,Vm.H[0]` | `Vd.8H -> result` | `v7/A32/A64` | +| uint32x2_t vmla_n_u32(
     uint32x2_t a,
     uint32x2_t b,
     uint32_t c)
| `a -> Vd.2S`
`b -> Vn.2S`
`c -> Vm.S[0]` | `MLA Vd.2S,Vn.2S,Vm.S[0]` | `Vd.2S -> result` | `v7/A32/A64` | +| uint32x4_t vmlaq_n_u32(
     uint32x4_t a,
     uint32x4_t b,
     uint32_t c)
| `a -> Vd.4S`
`b -> Vn.4S`
`c -> Vm.S[0]` | `MLA Vd.4S,Vn.4S,Vm.S[0]` | `Vd.4S -> result` | `v7/A32/A64` | +| float32x2_t vmla_n_f32(
     float32x2_t a,
     float32x2_t b,
     float32_t c)
| `N/A` | `RESULT[I] = vadd(a[i], vmul(b[i], c)) for i = 0 to 1`
`Final instruction sequence is implementation defined` | `N/A` | `v7/A32/A64` | +| float32x4_t vmlaq_n_f32(
     float32x4_t a,
     float32x4_t b,
     float32_t c)
| `N/A` | `RESULT[I] = vadd(a[i], vmul(b[i], c)) for i = 0 to 3`
`Final instruction sequence is implementation defined` | `N/A` | `v7/A32/A64` | #### Vector multiply-subtract by scalar -| Intrinsic | Argument preparation | AArch64 Instruction | Result | Supported architectures | -|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------|------------------------------------------------------|-------------------|---------------------------| -| int16x4_t vmls_lane_s16(
     int16x4_t a,
     int16x4_t b,
     int16x4_t v,
     const int lane)
| `a -> Vd.4H`
`b -> Vn.4H`
`v -> Vm.4H`
`0 <= lane <= 3` | `MLS Vd.4H,Vn.4H,Vm.H[lane]` | `Vd.4H -> result` | `v7/A32/A64` | -| int16x8_t vmlsq_lane_s16(
     int16x8_t a,
     int16x8_t b,
     int16x4_t v,
     const int lane)
| `a -> Vd.8H`
`b -> Vn.8H`
`v -> Vm.4H`
`0 <= lane <= 3` | `MLS Vd.8H,Vn.8H,Vm.H[lane]` | `Vd.8H -> result` | `v7/A32/A64` | -| int32x2_t vmls_lane_s32(
     int32x2_t a,
     int32x2_t b,
     int32x2_t v,
     const int lane)
| `a -> Vd.2S`
`b -> Vn.2S`
`v -> Vm.2S`
`0 <= lane <= 1` | `MLS Vd.2S,Vn.2S,Vm.S[lane]` | `Vd.2S -> result` | `v7/A32/A64` | -| int32x4_t vmlsq_lane_s32(
     int32x4_t a,
     int32x4_t b,
     int32x2_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4S`
`v -> Vm.2S`
`0 <= lane <= 1` | `MLS Vd.4S,Vn.4S,Vm.S[lane]` | `Vd.4S -> result` | `v7/A32/A64` | -| uint16x4_t vmls_lane_u16(
     uint16x4_t a,
     uint16x4_t b,
     uint16x4_t v,
     const int lane)
| `a -> Vd.4H`
`b -> Vn.4H`
`v -> Vm.4H`
`0 <= lane <= 3` | `MLS Vd.4H,Vn.4H,Vm.H[lane]` | `Vd.4H -> result` | `v7/A32/A64` | -| uint16x8_t vmlsq_lane_u16(
     uint16x8_t a,
     uint16x8_t b,
     uint16x4_t v,
     const int lane)
| `a -> Vd.8H`
`b -> Vn.8H`
`v -> Vm.4H`
`0 <= lane <= 3` | `MLS Vd.8H,Vn.8H,Vm.H[lane]` | `Vd.8H -> result` | `v7/A32/A64` | -| uint32x2_t vmls_lane_u32(
     uint32x2_t a,
     uint32x2_t b,
     uint32x2_t v,
     const int lane)
| `a -> Vd.2S`
`b -> Vn.2S`
`v -> Vm.2S`
`0 <= lane <= 1` | `MLS Vd.2S,Vn.2S,Vm.S[lane]` | `Vd.2S -> result` | `v7/A32/A64` | -| uint32x4_t vmlsq_lane_u32(
     uint32x4_t a,
     uint32x4_t b,
     uint32x2_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4S`
`v -> Vm.2S`
`0 <= lane <= 1` | `MLS Vd.4S,Vn.4S,Vm.S[lane]` | `Vd.4S -> result` | `v7/A32/A64` | -| float32x2_t vmls_lane_f32(
     float32x2_t a,
     float32x2_t b,
     float32x2_t v,
     const int lane)
| `0 <= lane <= 1` | `RESULT[I] = a[i] - (b[i] * v[lane]) for i = 0 to 1` | `N/A` | `v7/A32/A64` | -| float32x4_t vmlsq_lane_f32(
     float32x4_t a,
     float32x4_t b,
     float32x2_t v,
     const int lane)
| `0 <= lane <= 1` | `RESULT[I] = a[i] - (b[i] * v[lane]) for i = 0 to 3` | `N/A` | `v7/A32/A64` | -| int16x4_t vmls_laneq_s16(
     int16x4_t a,
     int16x4_t b,
     int16x8_t v,
     const int lane)
| `a -> Vd.4H`
`b -> Vn.4H`
`v -> Vm.8H`
`0 <= lane <= 7` | `MLS Vd.4H,Vn.4H,Vm.H[lane]` | `Vd.4H -> result` | `A64` | -| int16x8_t vmlsq_laneq_s16(
     int16x8_t a,
     int16x8_t b,
     int16x8_t v,
     const int lane)
| `a -> Vd.8H`
`b -> Vn.8H`
`v -> Vm.8H`
`0 <= lane <= 7` | `MLS Vd.8H,Vn.8H,Vm.H[lane]` | `Vd.8H -> result` | `A64` | -| int32x2_t vmls_laneq_s32(
     int32x2_t a,
     int32x2_t b,
     int32x4_t v,
     const int lane)
| `a -> Vd.2S`
`b -> Vn.2S`
`v -> Vm.4S`
`0 <= lane <= 3` | `MLS Vd.2S,Vn.2S,Vm.S[lane]` | `Vd.2S -> result` | `A64` | -| int32x4_t vmlsq_laneq_s32(
     int32x4_t a,
     int32x4_t b,
     int32x4_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4S`
`v -> Vm.4S`
`0 <= lane <= 3` | `MLS Vd.4S,Vn.4S,Vm.S[lane]` | `Vd.4S -> result` | `A64` | -| uint16x4_t vmls_laneq_u16(
     uint16x4_t a,
     uint16x4_t b,
     uint16x8_t v,
     const int lane)
| `a -> Vd.4H`
`b -> Vn.4H`
`v -> Vm.8H`
`0 <= lane <= 7` | `MLS Vd.4H,Vn.4H,Vm.H[lane]` | `Vd.4H -> result` | `A64` | -| uint16x8_t vmlsq_laneq_u16(
     uint16x8_t a,
     uint16x8_t b,
     uint16x8_t v,
     const int lane)
| `a -> Vd.8H`
`b -> Vn.8H`
`v -> Vm.8H`
`0 <= lane <= 7` | `MLS Vd.8H,Vn.8H,Vm.H[lane]` | `Vd.8H -> result` | `A64` | -| uint32x2_t vmls_laneq_u32(
     uint32x2_t a,
     uint32x2_t b,
     uint32x4_t v,
     const int lane)
| `a -> Vd.2S`
`b -> Vn.2S`
`v -> Vm.4S`
`0 <= lane <= 3` | `MLS Vd.2S,Vn.2S,Vm.S[lane]` | `Vd.2S -> result` | `A64` | -| uint32x4_t vmlsq_laneq_u32(
     uint32x4_t a,
     uint32x4_t b,
     uint32x4_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4S`
`v -> Vm.4S`
`0 <= lane <= 3` | `MLS Vd.4S,Vn.4S,Vm.S[lane]` | `Vd.4S -> result` | `A64` | -| float32x2_t vmls_laneq_f32(
     float32x2_t a,
     float32x2_t b,
     float32x4_t v,
     const int lane)
| `0 <= lane <= 3` | `RESULT[I] = a[i] - (b[i] * v[lane]) for i = 0 to 1` | `N/A` | `A64` | -| float32x4_t vmlsq_laneq_f32(
     float32x4_t a,
     float32x4_t b,
     float32x4_t v,
     const int lane)
| `0 <= lane <= 3` | `RESULT[I] = a[i] - (b[i] * v[lane]) for i = 0 to 3` | `N/A` | `A64` | -| int32x4_t vmlsl_lane_s16(
     int32x4_t a,
     int16x4_t b,
     int16x4_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4H`
`v -> Vm.4H`
`0 <= lane <= 3` | `SMLSL Vd.4S,Vn.4H,Vm.H[lane]` | `Vd.4S -> result` | `v7/A32/A64` | -| int64x2_t vmlsl_lane_s32(
     int64x2_t a,
     int32x2_t b,
     int32x2_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.2S`
`v -> Vm.2S`
`0 <= lane <= 1` | `SMLSL Vd.2D,Vn.2S,Vm.S[lane]` | `Vd.2D -> result` | `v7/A32/A64` | -| uint32x4_t vmlsl_lane_u16(
     uint32x4_t a,
     uint16x4_t b,
     uint16x4_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4H`
`v -> Vm.4H`
`0 <= lane <= 3` | `UMLSL Vd.4S,Vn.4H,Vm.H[lane]` | `Vd.4S -> result` | `v7/A32/A64` | -| uint64x2_t vmlsl_lane_u32(
     uint64x2_t a,
     uint32x2_t b,
     uint32x2_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.2S`
`v -> Vm.2S`
`0 <= lane <= 1` | `UMLSL Vd.2D,Vn.2S,Vm.S[lane]` | `Vd.2D -> result` | `v7/A32/A64` | -| int32x4_t vmlsl_high_lane_s16(
     int32x4_t a,
     int16x8_t b,
     int16x4_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.8H`
`v -> Vm.4H`
`0 <= lane <= 3` | `SMLSL2 Vd.4S,Vn.8H,Vm.H[lane]` | `Vd.4S -> result` | `A64` | -| int64x2_t vmlsl_high_lane_s32(
     int64x2_t a,
     int32x4_t b,
     int32x2_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.4S`
`v -> Vm.2S`
`0 <= lane <= 1` | `SMLSL2 Vd.2D,Vn.4S,Vm.S[lane]` | `Vd.2D -> result` | `A64` | -| uint32x4_t vmlsl_high_lane_u16(
     uint32x4_t a,
     uint16x8_t b,
     uint16x4_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.8H`
`v -> Vm.4H`
`0 <= lane <= 3` | `UMLSL2 Vd.4S,Vn.8H,Vm.H[lane]` | `Vd.4S -> result` | `A64` | -| uint64x2_t vmlsl_high_lane_u32(
     uint64x2_t a,
     uint32x4_t b,
     uint32x2_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.4S`
`v -> Vm.2S`
`0 <= lane <= 1` | `UMLSL2 Vd.2D,Vn.4S,Vm.S[lane]` | `Vd.2D -> result` | `A64` | -| int32x4_t vmlsl_laneq_s16(
     int32x4_t a,
     int16x4_t b,
     int16x8_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4H`
`v -> Vm.8H`
`0 <= lane <= 7` | `SMLSL Vd.4S,Vn.4H,Vm.H[lane]` | `Vd.4S -> result` | `A64` | -| int64x2_t vmlsl_laneq_s32(
     int64x2_t a,
     int32x2_t b,
     int32x4_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.2S`
`v -> Vm.4S`
`0 <= lane <= 3` | `SMLSL Vd.2D,Vn.2S,Vm.S[lane]` | `Vd.2D -> result` | `A64` | -| uint32x4_t vmlsl_laneq_u16(
     uint32x4_t a,
     uint16x4_t b,
     uint16x8_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4H`
`v -> Vm.8H`
`0 <= lane <= 7` | `UMLSL Vd.4S,Vn.4H,Vm.H[lane]` | `Vd.4S -> result` | `A64` | -| uint64x2_t vmlsl_laneq_u32(
     uint64x2_t a,
     uint32x2_t b,
     uint32x4_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.2S`
`v -> Vm.4S`
`0 <= lane <= 3` | `UMLSL Vd.2D,Vn.2S,Vm.S[lane]` | `Vd.2D -> result` | `A64` | -| int32x4_t vmlsl_high_laneq_s16(
     int32x4_t a,
     int16x8_t b,
     int16x8_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.8H`
`v -> Vm.8H`
`0 <= lane <= 7` | `SMLSL2 Vd.4S,Vn.8H,Vm.H[lane]` | `Vd.4S -> result` | `A64` | -| int64x2_t vmlsl_high_laneq_s32(
     int64x2_t a,
     int32x4_t b,
     int32x4_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.4S`
`v -> Vm.4S`
`0 <= lane <= 3` | `SMLSL2 Vd.2D,Vn.4S,Vm.S[lane]` | `Vd.2D -> result` | `A64` | -| uint32x4_t vmlsl_high_laneq_u16(
     uint32x4_t a,
     uint16x8_t b,
     uint16x8_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.8H`
`v -> Vm.8H`
`0 <= lane <= 7` | `UMLSL2 Vd.4S,Vn.8H,Vm.H[lane]` | `Vd.4S -> result` | `A64` | -| uint64x2_t vmlsl_high_laneq_u32(
     uint64x2_t a,
     uint32x4_t b,
     uint32x4_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.4S`
`v -> Vm.4S`
`0 <= lane <= 3` | `UMLSL2 Vd.2D,Vn.4S,Vm.S[lane]` | `Vd.2D -> result` | `A64` | +| Intrinsic | Argument preparation | AArch64 Instruction | Result | Supported architectures | +|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------|-------------------|---------------------------| +| int16x4_t vmls_lane_s16(
     int16x4_t a,
     int16x4_t b,
     int16x4_t v,
     const int lane)
| `a -> Vd.4H`
`b -> Vn.4H`
`v -> Vm.4H`
`0 <= lane <= 3` | `MLS Vd.4H,Vn.4H,Vm.H[lane]` | `Vd.4H -> result` | `v7/A32/A64` | +| int16x8_t vmlsq_lane_s16(
     int16x8_t a,
     int16x8_t b,
     int16x4_t v,
     const int lane)
| `a -> Vd.8H`
`b -> Vn.8H`
`v -> Vm.4H`
`0 <= lane <= 3` | `MLS Vd.8H,Vn.8H,Vm.H[lane]` | `Vd.8H -> result` | `v7/A32/A64` | +| int32x2_t vmls_lane_s32(
     int32x2_t a,
     int32x2_t b,
     int32x2_t v,
     const int lane)
| `a -> Vd.2S`
`b -> Vn.2S`
`v -> Vm.2S`
`0 <= lane <= 1` | `MLS Vd.2S,Vn.2S,Vm.S[lane]` | `Vd.2S -> result` | `v7/A32/A64` | +| int32x4_t vmlsq_lane_s32(
     int32x4_t a,
     int32x4_t b,
     int32x2_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4S`
`v -> Vm.2S`
`0 <= lane <= 1` | `MLS Vd.4S,Vn.4S,Vm.S[lane]` | `Vd.4S -> result` | `v7/A32/A64` | +| uint16x4_t vmls_lane_u16(
     uint16x4_t a,
     uint16x4_t b,
     uint16x4_t v,
     const int lane)
| `a -> Vd.4H`
`b -> Vn.4H`
`v -> Vm.4H`
`0 <= lane <= 3` | `MLS Vd.4H,Vn.4H,Vm.H[lane]` | `Vd.4H -> result` | `v7/A32/A64` | +| uint16x8_t vmlsq_lane_u16(
     uint16x8_t a,
     uint16x8_t b,
     uint16x4_t v,
     const int lane)
| `a -> Vd.8H`
`b -> Vn.8H`
`v -> Vm.4H`
`0 <= lane <= 3` | `MLS Vd.8H,Vn.8H,Vm.H[lane]` | `Vd.8H -> result` | `v7/A32/A64` | +| uint32x2_t vmls_lane_u32(
     uint32x2_t a,
     uint32x2_t b,
     uint32x2_t v,
     const int lane)
| `a -> Vd.2S`
`b -> Vn.2S`
`v -> Vm.2S`
`0 <= lane <= 1` | `MLS Vd.2S,Vn.2S,Vm.S[lane]` | `Vd.2S -> result` | `v7/A32/A64` | +| uint32x4_t vmlsq_lane_u32(
     uint32x4_t a,
     uint32x4_t b,
     uint32x2_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4S`
`v -> Vm.2S`
`0 <= lane <= 1` | `MLS Vd.4S,Vn.4S,Vm.S[lane]` | `Vd.4S -> result` | `v7/A32/A64` | +| float32x2_t vmls_lane_f32(
     float32x2_t a,
     float32x2_t b,
     float32x2_t v,
     const int lane)
| `0 <= lane <= 1` | `RESULT[I] = vsub(a[i], vmul(b[i], v[lane])) for i = 0 to 1`
`Final instruction sequence is implementation defined` | `N/A` | `v7/A32/A64` | +| float32x4_t vmlsq_lane_f32(
     float32x4_t a,
     float32x4_t b,
     float32x2_t v,
     const int lane)
| `0 <= lane <= 1` | `RESULT[I] = vsub(a[i], vmul(b[i], v[lane])) for i = 0 to 3`
`Final instruction sequence is implementation defined` | `N/A` | `v7/A32/A64` | +| int16x4_t vmls_laneq_s16(
     int16x4_t a,
     int16x4_t b,
     int16x8_t v,
     const int lane)
| `a -> Vd.4H`
`b -> Vn.4H`
`v -> Vm.8H`
`0 <= lane <= 7` | `MLS Vd.4H,Vn.4H,Vm.H[lane]` | `Vd.4H -> result` | `A64` | +| int16x8_t vmlsq_laneq_s16(
     int16x8_t a,
     int16x8_t b,
     int16x8_t v,
     const int lane)
| `a -> Vd.8H`
`b -> Vn.8H`
`v -> Vm.8H`
`0 <= lane <= 7` | `MLS Vd.8H,Vn.8H,Vm.H[lane]` | `Vd.8H -> result` | `A64` | +| int32x2_t vmls_laneq_s32(
     int32x2_t a,
     int32x2_t b,
     int32x4_t v,
     const int lane)
| `a -> Vd.2S`
`b -> Vn.2S`
`v -> Vm.4S`
`0 <= lane <= 3` | `MLS Vd.2S,Vn.2S,Vm.S[lane]` | `Vd.2S -> result` | `A64` | +| int32x4_t vmlsq_laneq_s32(
     int32x4_t a,
     int32x4_t b,
     int32x4_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4S`
`v -> Vm.4S`
`0 <= lane <= 3` | `MLS Vd.4S,Vn.4S,Vm.S[lane]` | `Vd.4S -> result` | `A64` | +| uint16x4_t vmls_laneq_u16(
     uint16x4_t a,
     uint16x4_t b,
     uint16x8_t v,
     const int lane)
| `a -> Vd.4H`
`b -> Vn.4H`
`v -> Vm.8H`
`0 <= lane <= 7` | `MLS Vd.4H,Vn.4H,Vm.H[lane]` | `Vd.4H -> result` | `A64` | +| uint16x8_t vmlsq_laneq_u16(
     uint16x8_t a,
     uint16x8_t b,
     uint16x8_t v,
     const int lane)
| `a -> Vd.8H`
`b -> Vn.8H`
`v -> Vm.8H`
`0 <= lane <= 7` | `MLS Vd.8H,Vn.8H,Vm.H[lane]` | `Vd.8H -> result` | `A64` | +| uint32x2_t vmls_laneq_u32(
     uint32x2_t a,
     uint32x2_t b,
     uint32x4_t v,
     const int lane)
| `a -> Vd.2S`
`b -> Vn.2S`
`v -> Vm.4S`
`0 <= lane <= 3` | `MLS Vd.2S,Vn.2S,Vm.S[lane]` | `Vd.2S -> result` | `A64` | +| uint32x4_t vmlsq_laneq_u32(
     uint32x4_t a,
     uint32x4_t b,
     uint32x4_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4S`
`v -> Vm.4S`
`0 <= lane <= 3` | `MLS Vd.4S,Vn.4S,Vm.S[lane]` | `Vd.4S -> result` | `A64` | +| float32x2_t vmls_laneq_f32(
     float32x2_t a,
     float32x2_t b,
     float32x4_t v,
     const int lane)
| `0 <= lane <= 3` | `RESULT[I] = vsub(a[i], vmul(b[i], v[lane])) for i = 0 to 1`
`Final instruction sequence is implementation defined` | `N/A` | `A64` | +| float32x4_t vmlsq_laneq_f32(
     float32x4_t a,
     float32x4_t b,
     float32x4_t v,
     const int lane)
| `0 <= lane <= 3` | `RESULT[I] = vsub(a[i], vmul(b[i], v[lane])) for i = 0 to 3`
`Final instruction sequence is implementation defined` | `N/A` | `A64` | +| int32x4_t vmlsl_lane_s16(
     int32x4_t a,
     int16x4_t b,
     int16x4_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4H`
`v -> Vm.4H`
`0 <= lane <= 3` | `SMLSL Vd.4S,Vn.4H,Vm.H[lane]` | `Vd.4S -> result` | `v7/A32/A64` | +| int64x2_t vmlsl_lane_s32(
     int64x2_t a,
     int32x2_t b,
     int32x2_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.2S`
`v -> Vm.2S`
`0 <= lane <= 1` | `SMLSL Vd.2D,Vn.2S,Vm.S[lane]` | `Vd.2D -> result` | `v7/A32/A64` | +| uint32x4_t vmlsl_lane_u16(
     uint32x4_t a,
     uint16x4_t b,
     uint16x4_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4H`
`v -> Vm.4H`
`0 <= lane <= 3` | `UMLSL Vd.4S,Vn.4H,Vm.H[lane]` | `Vd.4S -> result` | `v7/A32/A64` | +| uint64x2_t vmlsl_lane_u32(
     uint64x2_t a,
     uint32x2_t b,
     uint32x2_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.2S`
`v -> Vm.2S`
`0 <= lane <= 1` | `UMLSL Vd.2D,Vn.2S,Vm.S[lane]` | `Vd.2D -> result` | `v7/A32/A64` | +| int32x4_t vmlsl_high_lane_s16(
     int32x4_t a,
     int16x8_t b,
     int16x4_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.8H`
`v -> Vm.4H`
`0 <= lane <= 3` | `SMLSL2 Vd.4S,Vn.8H,Vm.H[lane]` | `Vd.4S -> result` | `A64` | +| int64x2_t vmlsl_high_lane_s32(
     int64x2_t a,
     int32x4_t b,
     int32x2_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.4S`
`v -> Vm.2S`
`0 <= lane <= 1` | `SMLSL2 Vd.2D,Vn.4S,Vm.S[lane]` | `Vd.2D -> result` | `A64` | +| uint32x4_t vmlsl_high_lane_u16(
     uint32x4_t a,
     uint16x8_t b,
     uint16x4_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.8H`
`v -> Vm.4H`
`0 <= lane <= 3` | `UMLSL2 Vd.4S,Vn.8H,Vm.H[lane]` | `Vd.4S -> result` | `A64` | +| uint64x2_t vmlsl_high_lane_u32(
     uint64x2_t a,
     uint32x4_t b,
     uint32x2_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.4S`
`v -> Vm.2S`
`0 <= lane <= 1` | `UMLSL2 Vd.2D,Vn.4S,Vm.S[lane]` | `Vd.2D -> result` | `A64` | +| int32x4_t vmlsl_laneq_s16(
     int32x4_t a,
     int16x4_t b,
     int16x8_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4H`
`v -> Vm.8H`
`0 <= lane <= 7` | `SMLSL Vd.4S,Vn.4H,Vm.H[lane]` | `Vd.4S -> result` | `A64` | +| int64x2_t vmlsl_laneq_s32(
     int64x2_t a,
     int32x2_t b,
     int32x4_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.2S`
`v -> Vm.4S`
`0 <= lane <= 3` | `SMLSL Vd.2D,Vn.2S,Vm.S[lane]` | `Vd.2D -> result` | `A64` | +| uint32x4_t vmlsl_laneq_u16(
     uint32x4_t a,
     uint16x4_t b,
     uint16x8_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4H`
`v -> Vm.8H`
`0 <= lane <= 7` | `UMLSL Vd.4S,Vn.4H,Vm.H[lane]` | `Vd.4S -> result` | `A64` | +| uint64x2_t vmlsl_laneq_u32(
     uint64x2_t a,
     uint32x2_t b,
     uint32x4_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.2S`
`v -> Vm.4S`
`0 <= lane <= 3` | `UMLSL Vd.2D,Vn.2S,Vm.S[lane]` | `Vd.2D -> result` | `A64` | +| int32x4_t vmlsl_high_laneq_s16(
     int32x4_t a,
     int16x8_t b,
     int16x8_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.8H`
`v -> Vm.8H`
`0 <= lane <= 7` | `SMLSL2 Vd.4S,Vn.8H,Vm.H[lane]` | `Vd.4S -> result` | `A64` | +| int64x2_t vmlsl_high_laneq_s32(
     int64x2_t a,
     int32x4_t b,
     int32x4_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.4S`
`v -> Vm.4S`
`0 <= lane <= 3` | `SMLSL2 Vd.2D,Vn.4S,Vm.S[lane]` | `Vd.2D -> result` | `A64` | +| uint32x4_t vmlsl_high_laneq_u16(
     uint32x4_t a,
     uint16x8_t b,
     uint16x8_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.8H`
`v -> Vm.8H`
`0 <= lane <= 7` | `UMLSL2 Vd.4S,Vn.8H,Vm.H[lane]` | `Vd.4S -> result` | `A64` | +| uint64x2_t vmlsl_high_laneq_u32(
     uint64x2_t a,
     uint32x4_t b,
     uint32x4_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.4S`
`v -> Vm.4S`
`0 <= lane <= 3` | `UMLSL2 Vd.2D,Vn.4S,Vm.S[lane]` | `Vd.2D -> result` | `A64` | #### Vector multiply by scalar @@ -2829,34 +2829,34 @@ The intrinsics in this section are guarded by the macro ``__ARM_NEON``. #### Vector multiply-accumulate by scalar and widen -| Intrinsic | Argument preparation | AArch64 Instruction | Result | Supported architectures | -|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------|------------------------------------------------|-------------------|---------------------------| -| int32x4_t vmlal_n_s16(
     int32x4_t a,
     int16x4_t b,
     int16_t c)
| `a -> Vd.4S`
`b -> Vn.4H`
`c -> Vm.H[0]` | `SMLAL Vd.4S,Vn.4H,Vm.H[0]` | `Vd.4S -> result` | `v7/A32/A64` | -| int64x2_t vmlal_n_s32(
     int64x2_t a,
     int32x2_t b,
     int32_t c)
| `a -> Vd.2D`
`b -> Vn.2S`
`c -> Vm.S[0]` | `SMLAL Vd.2D,Vn.2S,Vm.S[0]` | `Vd.2D -> result` | `v7/A32/A64` | -| uint32x4_t vmlal_n_u16(
     uint32x4_t a,
     uint16x4_t b,
     uint16_t c)
| `a -> Vd.4S`
`b -> Vn.4H`
`c -> Vm.H[0]` | `UMLAL Vd.4S,Vn.4H,Vm.H[0]` | `Vd.4S -> result` | `v7/A32/A64` | -| uint64x2_t vmlal_n_u32(
     uint64x2_t a,
     uint32x2_t b,
     uint32_t c)
| `a -> Vd.2D`
`b -> Vn.2S`
`c -> Vm.S[0]` | `UMLAL Vd.2D,Vn.2S,Vm.S[0]` | `Vd.2D -> result` | `v7/A32/A64` | -| int32x4_t vmlal_high_n_s16(
     int32x4_t a,
     int16x8_t b,
     int16_t c)
| `a -> Vd.4S`
`b -> Vn.8H`
`c -> Vm.H[0]` | `SMLAL2 Vd.4S,Vn.8H,Vm.H[0]` | `Vd.4S -> result` | `A64` | -| int64x2_t vmlal_high_n_s32(
     int64x2_t a,
     int32x4_t b,
     int32_t c)
| `a -> Vd.2D`
`b -> Vn.4S`
`c -> Vm.S[0]` | `SMLAL2 Vd.2D,Vn.4S,Vm.S[0]` | `Vd.2D -> result` | `A64` | -| uint32x4_t vmlal_high_n_u16(
     uint32x4_t a,
     uint16x8_t b,
     uint16_t c)
| `a -> Vd.4S`
`b -> Vn.8H`
`c -> Vm.H[0]` | `UMLAL2 Vd.4S,Vn.8H,Vm.H[0]` | `Vd.4S -> result` | `A64` | -| uint64x2_t vmlal_high_n_u32(
     uint64x2_t a,
     uint32x4_t b,
     uint32_t c)
| `a -> Vd.2D`
`b -> Vn.4S`
`c -> Vm.S[0]` | `UMLAL2 Vd.2D,Vn.4S,Vm.S[0]` | `Vd.2D -> result` | `A64` | -| int16x4_t vmls_n_s16(
     int16x4_t a,
     int16x4_t b,
     int16_t c)
| `a -> Vd.4H`
`b -> Vn.4H`
`c -> Vm.H[0]` | `MLS Vd.4H,Vn.4H,Vm.H[0]` | `Vd.4H -> result` | `v7/A32/A64` | -| int16x8_t vmlsq_n_s16(
     int16x8_t a,
     int16x8_t b,
     int16_t c)
| `a -> Vd.8H`
`b -> Vn.8H`
`c -> Vm.H[0]` | `MLS Vd.8H,Vn.8H,Vm.H[0]` | `Vd.8H -> result` | `v7/A32/A64` | -| int32x2_t vmls_n_s32(
     int32x2_t a,
     int32x2_t b,
     int32_t c)
| `a -> Vd.2S`
`b -> Vn.2S`
`c -> Vm.S[0]` | `MLS Vd.2S,Vn.2S,Vm.S[0]` | `Vd.2S -> result` | `v7/A32/A64` | -| int32x4_t vmlsq_n_s32(
     int32x4_t a,
     int32x4_t b,
     int32_t c)
| `a -> Vd.4S`
`b -> Vn.4S`
`c -> Vm.S[0]` | `MLS Vd.4S,Vn.4S,Vm.S[0]` | `Vd.4S -> result` | `v7/A32/A64` | -| uint16x4_t vmls_n_u16(
     uint16x4_t a,
     uint16x4_t b,
     uint16_t c)
| `a -> Vd.4H`
`b -> Vn.4H`
`c -> Vm.H[0]` | `MLS Vd.4H,Vn.4H,Vm.H[0]` | `Vd.4H -> result` | `v7/A32/A64` | -| uint16x8_t vmlsq_n_u16(
     uint16x8_t a,
     uint16x8_t b,
     uint16_t c)
| `a -> Vd.8H`
`b -> Vn.8H`
`c -> Vm.H[0]` | `MLS Vd.8H,Vn.8H,Vm.H[0]` | `Vd.8H -> result` | `v7/A32/A64` | -| uint32x2_t vmls_n_u32(
     uint32x2_t a,
     uint32x2_t b,
     uint32_t c)
| `a -> Vd.2S`
`b -> Vn.2S`
`c -> Vm.S[0]` | `MLS Vd.2S,Vn.2S,Vm.S[0]` | `Vd.2S -> result` | `v7/A32/A64` | -| uint32x4_t vmlsq_n_u32(
     uint32x4_t a,
     uint32x4_t b,
     uint32_t c)
| `a -> Vd.4S`
`b -> Vn.4S`
`c -> Vm.S[0]` | `MLS Vd.4S,Vn.4S,Vm.S[0]` | `Vd.4S -> result` | `v7/A32/A64` | -| float32x2_t vmls_n_f32(
     float32x2_t a,
     float32x2_t b,
     float32_t c)
| `N/A` | `RESULT[I] = a[i] - (b[i] * c) for i = 0 to 1` | `N/A` | `v7/A32/A64` | -| float32x4_t vmlsq_n_f32(
     float32x4_t a,
     float32x4_t b,
     float32_t c)
| `N/A` | `RESULT[I] = a[i] - (b[i] * c) for i = 0 to 3` | `N/A` | `v7/A32/A64` | -| int32x4_t vmlsl_n_s16(
     int32x4_t a,
     int16x4_t b,
     int16_t c)
| `a -> Vd.4S`
`b -> Vn.4H`
`c -> Vm.H[0]` | `SMLSL Vd.4S,Vn.4H,Vm.H[0]` | `Vd.4S -> result` | `v7/A32/A64` | -| int64x2_t vmlsl_n_s32(
     int64x2_t a,
     int32x2_t b,
     int32_t c)
| `a -> Vd.2D`
`b -> Vn.2S`
`c -> Vm.S[0]` | `SMLSL Vd.2D,Vn.2S,Vm.S[0]` | `Vd.2D -> result` | `v7/A32/A64` | -| uint32x4_t vmlsl_n_u16(
     uint32x4_t a,
     uint16x4_t b,
     uint16_t c)
| `a -> Vd.4S`
`b -> Vn.4H`
`c -> Vm.H[0]` | `UMLSL Vd.4S,Vn.4H,Vm.H[0]` | `Vd.4S -> result` | `v7/A32/A64` | -| uint64x2_t vmlsl_n_u32(
     uint64x2_t a,
     uint32x2_t b,
     uint32_t c)
| `a -> Vd.2D`
`b -> Vn.2S`
`c -> Vm.S[0]` | `UMLSL Vd.2D,Vn.2S,Vm.S[0]` | `Vd.2D -> result` | `v7/A32/A64` | -| int32x4_t vmlsl_high_n_s16(
     int32x4_t a,
     int16x8_t b,
     int16_t c)
| `a -> Vd.4S`
`b -> Vn.8H`
`c -> Vm.H[0]` | `SMLSL2 Vd.4S,Vn.8H,Vm.H[0]` | `Vd.4S -> result` | `A64` | -| int64x2_t vmlsl_high_n_s32(
     int64x2_t a,
     int32x4_t b,
     int32_t c)
| `a -> Vd.2D`
`b -> Vn.4S`
`c -> Vm.S[0]` | `SMLSL2 Vd.2D,Vn.4S,Vm.S[0]` | `Vd.2D -> result` | `A64` | -| uint32x4_t vmlsl_high_n_u16(
     uint32x4_t a,
     uint16x8_t b,
     uint16_t c)
| `a -> Vd.4S`
`b -> Vn.8H`
`c -> Vm.H[0]` | `UMLSL2 Vd.4S,Vn.8H,Vm.H[0]` | `Vd.4S -> result` | `A64` | -| uint64x2_t vmlsl_high_n_u32(
     uint64x2_t a,
     uint32x4_t b,
     uint32_t c)
| `a -> Vd.2D`
`b -> Vn.4S`
`c -> Vm.S[0]` | `UMLSL2 Vd.2D,Vn.4S,Vm.S[0]` | `Vd.2D -> result` | `A64` | +| Intrinsic | Argument preparation | AArch64 Instruction | Result | Supported architectures | +|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------|------------------------------------------------------------------------------------------------------------------|-------------------|---------------------------| +| int32x4_t vmlal_n_s16(
     int32x4_t a,
     int16x4_t b,
     int16_t c)
| `a -> Vd.4S`
`b -> Vn.4H`
`c -> Vm.H[0]` | `SMLAL Vd.4S,Vn.4H,Vm.H[0]` | `Vd.4S -> result` | `v7/A32/A64` | +| int64x2_t vmlal_n_s32(
     int64x2_t a,
     int32x2_t b,
     int32_t c)
| `a -> Vd.2D`
`b -> Vn.2S`
`c -> Vm.S[0]` | `SMLAL Vd.2D,Vn.2S,Vm.S[0]` | `Vd.2D -> result` | `v7/A32/A64` | +| uint32x4_t vmlal_n_u16(
     uint32x4_t a,
     uint16x4_t b,
     uint16_t c)
| `a -> Vd.4S`
`b -> Vn.4H`
`c -> Vm.H[0]` | `UMLAL Vd.4S,Vn.4H,Vm.H[0]` | `Vd.4S -> result` | `v7/A32/A64` | +| uint64x2_t vmlal_n_u32(
     uint64x2_t a,
     uint32x2_t b,
     uint32_t c)
| `a -> Vd.2D`
`b -> Vn.2S`
`c -> Vm.S[0]` | `UMLAL Vd.2D,Vn.2S,Vm.S[0]` | `Vd.2D -> result` | `v7/A32/A64` | +| int32x4_t vmlal_high_n_s16(
     int32x4_t a,
     int16x8_t b,
     int16_t c)
| `a -> Vd.4S`
`b -> Vn.8H`
`c -> Vm.H[0]` | `SMLAL2 Vd.4S,Vn.8H,Vm.H[0]` | `Vd.4S -> result` | `A64` | +| int64x2_t vmlal_high_n_s32(
     int64x2_t a,
     int32x4_t b,
     int32_t c)
| `a -> Vd.2D`
`b -> Vn.4S`
`c -> Vm.S[0]` | `SMLAL2 Vd.2D,Vn.4S,Vm.S[0]` | `Vd.2D -> result` | `A64` | +| uint32x4_t vmlal_high_n_u16(
     uint32x4_t a,
     uint16x8_t b,
     uint16_t c)
| `a -> Vd.4S`
`b -> Vn.8H`
`c -> Vm.H[0]` | `UMLAL2 Vd.4S,Vn.8H,Vm.H[0]` | `Vd.4S -> result` | `A64` | +| uint64x2_t vmlal_high_n_u32(
     uint64x2_t a,
     uint32x4_t b,
     uint32_t c)
| `a -> Vd.2D`
`b -> Vn.4S`
`c -> Vm.S[0]` | `UMLAL2 Vd.2D,Vn.4S,Vm.S[0]` | `Vd.2D -> result` | `A64` | +| int16x4_t vmls_n_s16(
     int16x4_t a,
     int16x4_t b,
     int16_t c)
| `a -> Vd.4H`
`b -> Vn.4H`
`c -> Vm.H[0]` | `MLS Vd.4H,Vn.4H,Vm.H[0]` | `Vd.4H -> result` | `v7/A32/A64` | +| int16x8_t vmlsq_n_s16(
     int16x8_t a,
     int16x8_t b,
     int16_t c)
| `a -> Vd.8H`
`b -> Vn.8H`
`c -> Vm.H[0]` | `MLS Vd.8H,Vn.8H,Vm.H[0]` | `Vd.8H -> result` | `v7/A32/A64` | +| int32x2_t vmls_n_s32(
     int32x2_t a,
     int32x2_t b,
     int32_t c)
| `a -> Vd.2S`
`b -> Vn.2S`
`c -> Vm.S[0]` | `MLS Vd.2S,Vn.2S,Vm.S[0]` | `Vd.2S -> result` | `v7/A32/A64` | +| int32x4_t vmlsq_n_s32(
     int32x4_t a,
     int32x4_t b,
     int32_t c)
| `a -> Vd.4S`
`b -> Vn.4S`
`c -> Vm.S[0]` | `MLS Vd.4S,Vn.4S,Vm.S[0]` | `Vd.4S -> result` | `v7/A32/A64` | +| uint16x4_t vmls_n_u16(
     uint16x4_t a,
     uint16x4_t b,
     uint16_t c)
| `a -> Vd.4H`
`b -> Vn.4H`
`c -> Vm.H[0]` | `MLS Vd.4H,Vn.4H,Vm.H[0]` | `Vd.4H -> result` | `v7/A32/A64` | +| uint16x8_t vmlsq_n_u16(
     uint16x8_t a,
     uint16x8_t b,
     uint16_t c)
| `a -> Vd.8H`
`b -> Vn.8H`
`c -> Vm.H[0]` | `MLS Vd.8H,Vn.8H,Vm.H[0]` | `Vd.8H -> result` | `v7/A32/A64` | +| uint32x2_t vmls_n_u32(
     uint32x2_t a,
     uint32x2_t b,
     uint32_t c)
| `a -> Vd.2S`
`b -> Vn.2S`
`c -> Vm.S[0]` | `MLS Vd.2S,Vn.2S,Vm.S[0]` | `Vd.2S -> result` | `v7/A32/A64` | +| uint32x4_t vmlsq_n_u32(
     uint32x4_t a,
     uint32x4_t b,
     uint32_t c)
| `a -> Vd.4S`
`b -> Vn.4S`
`c -> Vm.S[0]` | `MLS Vd.4S,Vn.4S,Vm.S[0]` | `Vd.4S -> result` | `v7/A32/A64` | +| float32x2_t vmls_n_f32(
     float32x2_t a,
     float32x2_t b,
     float32_t c)
| `N/A` | `RESULT[I] = vsub(a[i], vmul(b[i], c)) for i = 0 to 1`
`Final instruction sequence is implementation defined` | `N/A` | `v7/A32/A64` | +| float32x4_t vmlsq_n_f32(
     float32x4_t a,
     float32x4_t b,
     float32_t c)
| `N/A` | `RESULT[I] = vsub(a[i], vmul(b[i], c)) for i = 0 to 3`
`Final instruction sequence is implementation defined` | `N/A` | `v7/A32/A64` | +| int32x4_t vmlsl_n_s16(
     int32x4_t a,
     int16x4_t b,
     int16_t c)
| `a -> Vd.4S`
`b -> Vn.4H`
`c -> Vm.H[0]` | `SMLSL Vd.4S,Vn.4H,Vm.H[0]` | `Vd.4S -> result` | `v7/A32/A64` | +| int64x2_t vmlsl_n_s32(
     int64x2_t a,
     int32x2_t b,
     int32_t c)
| `a -> Vd.2D`
`b -> Vn.2S`
`c -> Vm.S[0]` | `SMLSL Vd.2D,Vn.2S,Vm.S[0]` | `Vd.2D -> result` | `v7/A32/A64` | +| uint32x4_t vmlsl_n_u16(
     uint32x4_t a,
     uint16x4_t b,
     uint16_t c)
| `a -> Vd.4S`
`b -> Vn.4H`
`c -> Vm.H[0]` | `UMLSL Vd.4S,Vn.4H,Vm.H[0]` | `Vd.4S -> result` | `v7/A32/A64` | +| uint64x2_t vmlsl_n_u32(
     uint64x2_t a,
     uint32x2_t b,
     uint32_t c)
| `a -> Vd.2D`
`b -> Vn.2S`
`c -> Vm.S[0]` | `UMLSL Vd.2D,Vn.2S,Vm.S[0]` | `Vd.2D -> result` | `v7/A32/A64` | +| int32x4_t vmlsl_high_n_s16(
     int32x4_t a,
     int16x8_t b,
     int16_t c)
| `a -> Vd.4S`
`b -> Vn.8H`
`c -> Vm.H[0]` | `SMLSL2 Vd.4S,Vn.8H,Vm.H[0]` | `Vd.4S -> result` | `A64` | +| int64x2_t vmlsl_high_n_s32(
     int64x2_t a,
     int32x4_t b,
     int32_t c)
| `a -> Vd.2D`
`b -> Vn.4S`
`c -> Vm.S[0]` | `SMLSL2 Vd.2D,Vn.4S,Vm.S[0]` | `Vd.2D -> result` | `A64` | +| uint32x4_t vmlsl_high_n_u16(
     uint32x4_t a,
     uint16x8_t b,
     uint16_t c)
| `a -> Vd.4S`
`b -> Vn.8H`
`c -> Vm.H[0]` | `UMLSL2 Vd.4S,Vn.8H,Vm.H[0]` | `Vd.4S -> result` | `A64` | +| uint64x2_t vmlsl_high_n_u32(
     uint64x2_t a,
     uint32x4_t b,
     uint32_t c)
| `a -> Vd.2D`
`b -> Vn.4S`
`c -> Vm.S[0]` | `UMLSL2 Vd.2D,Vn.4S,Vm.S[0]` | `Vd.2D -> result` | `A64` | #### Fused multiply-accumulate by scalar diff --git a/tools/intrinsic_db/advsimd.csv b/tools/intrinsic_db/advsimd.csv index 7b51c965..53681581 100644 --- a/tools/intrinsic_db/advsimd.csv +++ b/tools/intrinsic_db/advsimd.csv @@ -209,10 +209,10 @@ uint16x4_t vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) a -> Vd.4H;b -> Vn uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) a -> Vd.8H;b -> Vn.8H;c -> Vm.8H MLA Vd.8H,Vn.8H,Vm.8H Vd.8H -> result v7/A32/A64 uint32x2_t vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) a -> Vd.2S;b -> Vn.2S;c -> Vm.2S MLA Vd.2S,Vn.2S,Vm.2S Vd.2S -> result v7/A32/A64 uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) a -> Vd.4S;b -> Vn.4S;c -> Vm.4S MLA Vd.4S,Vn.4S,Vm.4S Vd.4S -> result v7/A32/A64 -float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c) N/A RESULT[I] = a[i] + (b[i] * c[i]) for i = 0 to 1 N/A v7/A32/A64 -float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) N/A RESULT[I] = a[i] + (b[i] * c[i]) for i = 0 to 3 N/A v7/A32/A64 -float64x1_t vmla_f64(float64x1_t a, float64x1_t b, float64x1_t c) N/A RESULT[I] = a[i] + (b[i] * c[i]) for i = 0 N/A A64 -float64x2_t vmlaq_f64(float64x2_t a, float64x2_t b, float64x2_t c) N/A RESULT[I] = a[i] + (b[i] * c[i]) for i = 0 to 1 N/A A64 +float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c) N/A RESULT[I] = vadd(a[i], vmul(b[i], c[i])) for i = 0 to 1;Final instruction sequence is implementation defined N/A v7/A32/A64 +float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) N/A RESULT[I] = vadd(a[i], vmul(b[i], c[i])) for i = 0 to 3;Final instruction sequence is implementation defined N/A v7/A32/A64 +float64x1_t vmla_f64(float64x1_t a, float64x1_t b, float64x1_t c) N/A RESULT[I] = vadd(a[i], vmul(b[i], c[i])) for i = 0;Final instruction sequence is implementation defined N/A A64 +float64x2_t vmlaq_f64(float64x2_t a, float64x2_t b, float64x2_t c) 
N/A RESULT[I] = vadd(a[i], vmul(b[i], c[i])) for i = 0 to 1;Final instruction sequence is implementation defined N/A A64 int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) a -> Vd.8H;b -> Vn.8B;c -> Vm.8B SMLAL Vd.8H,Vn.8B,Vm.8B Vd.8H -> result v7/A32/A64 int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) a -> Vd.4S;b -> Vn.4H;c -> Vm.4H SMLAL Vd.4S,Vn.4H,Vm.4H Vd.4S -> result v7/A32/A64 int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) a -> Vd.2D;b -> Vn.2S;c -> Vm.2S SMLAL Vd.2D,Vn.2S,Vm.2S Vd.2D -> result v7/A32/A64 @@ -237,10 +237,10 @@ uint16x4_t vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) a -> Vd.4H;b -> Vn uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) a -> Vd.8H;b -> Vn.8H;c -> Vm.8H MLS Vd.8H,Vn.8H,Vm.8H Vd.8H -> result v7/A32/A64 uint32x2_t vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) a -> Vd.2S;b -> Vn.2S;c -> Vm.2S MLS Vd.2S,Vn.2S,Vm.2S Vd.2S -> result v7/A32/A64 uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) a -> Vd.4S;b -> Vn.4S;c -> Vm.4S MLS Vd.4S,Vn.4S,Vm.4S Vd.4S -> result v7/A32/A64 -float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c) N/A RESULT[I] = a[i] - (b[i] * c[i]) for i = 0 to 1 N/A v7/A32/A64 -float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) N/A RESULT[I] = a[i] - (b[i] * c[i]) for i = 0 to 3 N/A v7/A32/A64 -float64x1_t vmls_f64(float64x1_t a, float64x1_t b, float64x1_t c) N/A RESULT[I] = a[i] - (b[i] * c[i]) for i = 0 N/A A64 -float64x2_t vmlsq_f64(float64x2_t a, float64x2_t b, float64x2_t c) N/A RESULT[I] = a[i] - (b[i] * c[i]) for i = 0 to 1 N/A A64 +float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c) N/A RESULT[I] = vsub(a[i], vmul(b[i], c[i])) for i = 0 to 1;Final instruction sequence is implementation defined N/A v7/A32/A64 +float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) N/A RESULT[I] = vsub(a[i], vmul(b[i], c[i])) for i = 0 to 3;Final instruction sequence is implementation defined N/A v7/A32/A64 
+float64x1_t vmls_f64(float64x1_t a, float64x1_t b, float64x1_t c) N/A RESULT[I] = vsub(a[i], vmul(b[i], c[i])) for i = 0;Final instruction sequence is implementation defined N/A A64 +float64x2_t vmlsq_f64(float64x2_t a, float64x2_t b, float64x2_t c) N/A RESULT[I] = vsub(a[i], vmul(b[i], c[i])) for i = 0 to 1;Final instruction sequence is implementation defined N/A A64 int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) a -> Vd.8H;b -> Vn.8B;c -> Vm.8B SMLSL Vd.8H,Vn.8B,Vm.8B Vd.8H -> result v7/A32/A64 int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) a -> Vd.4S;b -> Vn.4H;c -> Vm.4H SMLSL Vd.4S,Vn.4H,Vm.4H Vd.4S -> result v7/A32/A64 int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) a -> Vd.2D;b -> Vn.2S;c -> Vm.2S SMLSL Vd.2D,Vn.2S,Vm.2S Vd.2D -> result v7/A32/A64 @@ -1342,8 +1342,8 @@ uint16x4_t vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __builtin_con uint16x8_t vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __builtin_constant_p(lane)) a -> Vd.8H;b -> Vn.8H;v -> Vm.4H;0 <= lane <= 3 MLA Vd.8H,Vn.8H,Vm.H[lane] Vd.8H -> result v7/A32/A64 uint32x2_t vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __builtin_constant_p(lane)) a -> Vd.2S;b -> Vn.2S;v -> Vm.2S;0 <= lane <= 1 MLA Vd.2S,Vn.2S,Vm.S[lane] Vd.2S -> result v7/A32/A64 uint32x4_t vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __builtin_constant_p(lane)) a -> Vd.4S;b -> Vn.4S;v -> Vm.2S;0 <= lane <= 1 MLA Vd.4S,Vn.4S,Vm.S[lane] Vd.4S -> result v7/A32/A64 -float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __builtin_constant_p(lane)) 0 <= lane <= 1 RESULT[I] = a[i] + (b[i] * v[lane]) for i = 0 to 1 N/A v7/A32/A64 -float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __builtin_constant_p(lane)) 0 <= lane <= 1 RESULT[I] = a[i] + (b[i] * v[lane]) for i = 0 to 3 N/A v7/A32/A64 +float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __builtin_constant_p(lane)) 0 <= lane <= 1 RESULT[I] = vadd(a[i], 
vmul(b[i], v[lane])) for i = 0 to 1;Final instruction sequence is implementation defined N/A v7/A32/A64 +float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __builtin_constant_p(lane)) 0 <= lane <= 1 RESULT[I] = vadd(a[i], vmul(b[i], v[lane])) for i = 0 to 3;Final instruction sequence is implementation defined N/A v7/A32/A64 int16x4_t vmla_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v, __builtin_constant_p(lane)) a -> Vd.4H;b -> Vn.4H;v -> Vm.8H;0 <= lane <= 7 MLA Vd.4H,Vn.4H,Vm.H[lane] Vd.4H -> result A64 int16x8_t vmlaq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v, __builtin_constant_p(lane)) a -> Vd.8H;b -> Vn.8H;v -> Vm.8H;0 <= lane <= 7 MLA Vd.8H,Vn.8H,Vm.H[lane] Vd.8H -> result A64 int32x2_t vmla_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v, __builtin_constant_p(lane)) a -> Vd.2S;b -> Vn.2S;v -> Vm.4S;0 <= lane <= 3 MLA Vd.2S,Vn.2S,Vm.S[lane] Vd.2S -> result A64 @@ -1352,8 +1352,8 @@ uint16x4_t vmla_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v, __builtin_co uint16x8_t vmlaq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v, __builtin_constant_p(lane)) a -> Vd.8H;b -> Vn.8H;v -> Vm.8H;0 <= lane <= 7 MLA Vd.8H,Vn.8H,Vm.H[lane] Vd.8H -> result A64 uint32x2_t vmla_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v, __builtin_constant_p(lane)) a -> Vd.2S;b -> Vn.2S;v -> Vm.4S;0 <= lane <= 3 MLA Vd.2S,Vn.2S,Vm.S[lane] Vd.2S -> result A64 uint32x4_t vmlaq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v, __builtin_constant_p(lane)) a -> Vd.4S;b -> Vn.4S;v -> Vm.4S;0 <= lane <= 3 MLA Vd.4S,Vn.4S,Vm.S[lane] Vd.4S -> result A64 -float32x2_t vmla_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v, __builtin_constant_p(lane)) 0 <= lane <= 3 RESULT[I] = a[i] + (b[i] * v[lane]) for i = 0 to 1 N/A A64 -float32x4_t vmlaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v, __builtin_constant_p(lane)) 0 <= lane <= 3 RESULT[I] = a[i] + (b[i] * v[lane]) for i = 0 to 3 N/A A64 +float32x2_t vmla_laneq_f32(float32x2_t a, float32x2_t b, 
float32x4_t v, __builtin_constant_p(lane)) 0 <= lane <= 3 RESULT[I] = vadd(a[i], vmul(b[i], v[lane])) for i = 0 to 1;Final instruction sequence is implementation defined N/A A64 +float32x4_t vmlaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v, __builtin_constant_p(lane)) 0 <= lane <= 3 RESULT[I] = vadd(a[i], vmul(b[i], v[lane])) for i = 0 to 3;Final instruction sequence is implementation defined N/A A64 int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __builtin_constant_p(lane)) a -> Vd.4S;b -> Vn.4H;v -> Vm.4H;0 <= lane <= 3 SMLAL Vd.4S,Vn.4H,Vm.H[lane] Vd.4S -> result v7/A32/A64 int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __builtin_constant_p(lane)) a -> Vd.2D;b -> Vn.2S;v -> Vm.2S;0 <= lane <= 1 SMLAL Vd.2D,Vn.2S,Vm.S[lane] Vd.2D -> result v7/A32/A64 uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __builtin_constant_p(lane)) a -> Vd.4S;b -> Vn.4H;v -> Vm.4H;0 <= lane <= 3 UMLAL Vd.4S,Vn.4H,Vm.H[lane] Vd.4S -> result v7/A32/A64 @@ -1390,8 +1390,8 @@ uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __builtin_con uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __builtin_constant_p(lane)) a -> Vd.8H;b -> Vn.8H;v -> Vm.4H;0 <= lane <= 3 MLS Vd.8H,Vn.8H,Vm.H[lane] Vd.8H -> result v7/A32/A64 uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __builtin_constant_p(lane)) a -> Vd.2S;b -> Vn.2S;v -> Vm.2S;0 <= lane <= 1 MLS Vd.2S,Vn.2S,Vm.S[lane] Vd.2S -> result v7/A32/A64 uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __builtin_constant_p(lane)) a -> Vd.4S;b -> Vn.4S;v -> Vm.2S;0 <= lane <= 1 MLS Vd.4S,Vn.4S,Vm.S[lane] Vd.4S -> result v7/A32/A64 -float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __builtin_constant_p(lane)) 0 <= lane <= 1 RESULT[I] = a[i] - (b[i] * v[lane]) for i = 0 to 1 N/A v7/A32/A64 -float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __builtin_constant_p(lane)) 0 <= lane <= 1 
RESULT[I] = a[i] - (b[i] * v[lane]) for i = 0 to 3 N/A v7/A32/A64 +float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __builtin_constant_p(lane)) 0 <= lane <= 1 RESULT[I] = vsub(a[i], vmul(b[i], v[lane])) for i = 0 to 1;Final instruction sequence is implementation defined N/A v7/A32/A64 +float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __builtin_constant_p(lane)) 0 <= lane <= 1 RESULT[I] = vsub(a[i], vmul(b[i], v[lane])) for i = 0 to 3;Final instruction sequence is implementation defined N/A v7/A32/A64 int16x4_t vmls_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v, __builtin_constant_p(lane)) a -> Vd.4H;b -> Vn.4H;v -> Vm.8H;0 <= lane <= 7 MLS Vd.4H,Vn.4H,Vm.H[lane] Vd.4H -> result A64 int16x8_t vmlsq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v, __builtin_constant_p(lane)) a -> Vd.8H;b -> Vn.8H;v -> Vm.8H;0 <= lane <= 7 MLS Vd.8H,Vn.8H,Vm.H[lane] Vd.8H -> result A64 int32x2_t vmls_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v, __builtin_constant_p(lane)) a -> Vd.2S;b -> Vn.2S;v -> Vm.4S;0 <= lane <= 3 MLS Vd.2S,Vn.2S,Vm.S[lane] Vd.2S -> result A64 @@ -1400,8 +1400,8 @@ uint16x4_t vmls_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v, __builtin_co uint16x8_t vmlsq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v, __builtin_constant_p(lane)) a -> Vd.8H;b -> Vn.8H;v -> Vm.8H;0 <= lane <= 7 MLS Vd.8H,Vn.8H,Vm.H[lane] Vd.8H -> result A64 uint32x2_t vmls_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v, __builtin_constant_p(lane)) a -> Vd.2S;b -> Vn.2S;v -> Vm.4S;0 <= lane <= 3 MLS Vd.2S,Vn.2S,Vm.S[lane] Vd.2S -> result A64 uint32x4_t vmlsq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v, __builtin_constant_p(lane)) a -> Vd.4S;b -> Vn.4S;v -> Vm.4S;0 <= lane <= 3 MLS Vd.4S,Vn.4S,Vm.S[lane] Vd.4S -> result A64 -float32x2_t vmls_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v, __builtin_constant_p(lane)) 0 <= lane <= 3 RESULT[I] = a[i] - (b[i] * v[lane]) for i = 0 to 1 N/A A64 -float32x4_t 
vmlsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v, __builtin_constant_p(lane)) 0 <= lane <= 3 RESULT[I] = a[i] - (b[i] * v[lane]) for i = 0 to 3 N/A A64 +float32x2_t vmls_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v, __builtin_constant_p(lane)) 0 <= lane <= 3 RESULT[I] = vsub(a[i], vmul(b[i], v[lane])) for i = 0 to 1;Final instruction sequence is implementation defined N/A A64 +float32x4_t vmlsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v, __builtin_constant_p(lane)) 0 <= lane <= 3 RESULT[I] = vsub(a[i], vmul(b[i], v[lane])) for i = 0 to 3;Final instruction sequence is implementation defined N/A A64 int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __builtin_constant_p(lane)) a -> Vd.4S;b -> Vn.4H;v -> Vm.4H;0 <= lane <= 3 SMLSL Vd.4S,Vn.4H,Vm.H[lane] Vd.4S -> result v7/A32/A64 int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __builtin_constant_p(lane)) a -> Vd.2D;b -> Vn.2S;v -> Vm.2S;0 <= lane <= 1 SMLSL Vd.2D,Vn.2S,Vm.S[lane] Vd.2D -> result v7/A32/A64 uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __builtin_constant_p(lane)) a -> Vd.4S;b -> Vn.4H;v -> Vm.4H;0 <= lane <= 3 UMLSL Vd.4S,Vn.4H,Vm.H[lane] Vd.4S -> result v7/A32/A64 @@ -1550,8 +1550,8 @@ uint16x4_t vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) a -> Vd.4H;b -> Vn uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) a -> Vd.8H;b -> Vn.8H;c -> Vm.H[0] MLA Vd.8H,Vn.8H,Vm.H[0] Vd.8H -> result v7/A32/A64 uint32x2_t vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) a -> Vd.2S;b -> Vn.2S;c -> Vm.S[0] MLA Vd.2S,Vn.2S,Vm.S[0] Vd.2S -> result v7/A32/A64 uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) a -> Vd.4S;b -> Vn.4S;c -> Vm.S[0] MLA Vd.4S,Vn.4S,Vm.S[0] Vd.4S -> result v7/A32/A64 -float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c) N/A RESULT[I] = a[i] + (b[i] * c) for i = 0 to 1 N/A v7/A32/A64 -float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c) N/A RESULT[I] = a[i] + 
(b[i] * c) for i = 0 to 3 N/A v7/A32/A64 +float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c) N/A RESULT[I] = vadd(a[i], vmul(b[i], c)) for i = 0 to 1;Final instruction sequence is implementation defined N/A v7/A32/A64 +float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c) N/A RESULT[I] = vadd(a[i], vmul(b[i], c)) for i = 0 to 3;Final instruction sequence is implementation defined N/A v7/A32/A64 int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) a -> Vd.4S;b -> Vn.4H;c -> Vm.H[0] SMLAL Vd.4S,Vn.4H,Vm.H[0] Vd.4S -> result v7/A32/A64 int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) a -> Vd.2D;b -> Vn.2S;c -> Vm.S[0] SMLAL Vd.2D,Vn.2S,Vm.S[0] Vd.2D -> result v7/A32/A64 uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) a -> Vd.4S;b -> Vn.4H;c -> Vm.H[0] UMLAL Vd.4S,Vn.4H,Vm.H[0] Vd.4S -> result v7/A32/A64 @@ -1572,8 +1572,8 @@ uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) a -> Vd.4H;b -> Vn uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) a -> Vd.8H;b -> Vn.8H;c -> Vm.H[0] MLS Vd.8H,Vn.8H,Vm.H[0] Vd.8H -> result v7/A32/A64 uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) a -> Vd.2S;b -> Vn.2S;c -> Vm.S[0] MLS Vd.2S,Vn.2S,Vm.S[0] Vd.2S -> result v7/A32/A64 uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) a -> Vd.4S;b -> Vn.4S;c -> Vm.S[0] MLS Vd.4S,Vn.4S,Vm.S[0] Vd.4S -> result v7/A32/A64 -float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c) N/A RESULT[I] = a[i] - (b[i] * c) for i = 0 to 1 N/A v7/A32/A64 -float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c) N/A RESULT[I] = a[i] - (b[i] * c) for i = 0 to 3 N/A v7/A32/A64 +float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c) N/A RESULT[I] = vsub(a[i], vmul(b[i], c)) for i = 0 to 1;Final instruction sequence is implementation defined N/A v7/A32/A64 +float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c) N/A RESULT[I] = vsub(a[i], vmul(b[i], c)) for i = 0 to 
3;Final instruction sequence is implementation defined N/A v7/A32/A64 int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) a -> Vd.4S;b -> Vn.4H;c -> Vm.H[0] SMLSL Vd.4S,Vn.4H,Vm.H[0] Vd.4S -> result v7/A32/A64 int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) a -> Vd.2D;b -> Vn.2S;c -> Vm.S[0] SMLSL Vd.2D,Vn.2S,Vm.S[0] Vd.2D -> result v7/A32/A64 uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) a -> Vd.4S;b -> Vn.4H;c -> Vm.H[0] UMLSL Vd.4S,Vn.4H,Vm.H[0] Vd.4S -> result v7/A32/A64 From 1ab80d82172be2cb4fd1ad7f6df55700afd3ee16 Mon Sep 17 00:00:00 2001 From: Amilendra Kodithuwakku Date: Fri, 14 Nov 2025 14:04:01 +0000 Subject: [PATCH 2/2] Remove 'Final instruction sequence is implementation defined' phrase --- main/acle.md | 2 +- neon_intrinsics/advsimd.md | 296 ++++++++++++++++----------------- tools/intrinsic_db/advsimd.csv | 40 ++--- 3 files changed, 169 insertions(+), 169 deletions(-) diff --git a/main/acle.md b/main/acle.md index a9efea5a..7e0aae37 100644 --- a/main/acle.md +++ b/main/acle.md @@ -465,7 +465,7 @@ Armv8.4-A [[ARMARMv84]](#ARMARMv84). Support is added for the Dot Product intrin * Added feature test macro for FEAT_SSVE_FEXPA. * Added feature test macro for FEAT_CSSC. -* Re-document implementation defined aspect of the VMLA/VMLS intrinsics for floats. +* Improve documentation for VMLA/VMLS intrinsics for floats. ### References diff --git a/neon_intrinsics/advsimd.md b/neon_intrinsics/advsimd.md index 79218669..31d716a2 100644 --- a/neon_intrinsics/advsimd.md +++ b/neon_intrinsics/advsimd.md @@ -393,40 +393,40 @@ The intrinsics in this section are guarded by the macro ``__ARM_NEON``. 
##### Multiply-accumulate -| Intrinsic | Argument preparation | AArch64 Instruction | Result | Supported architectures | -|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------|---------------------------------------------------------------------------------------------------------------------|--------------------|---------------------------| -| int8x8_t vmla_s8(
     int8x8_t a,
     int8x8_t b,
     int8x8_t c)
| `a -> Vd.8B`
`b -> Vn.8B`
`c -> Vm.8B` | `MLA Vd.8B,Vn.8B,Vm.8B` | `Vd.8B -> result` | `v7/A32/A64` | -| int8x16_t vmlaq_s8(
     int8x16_t a,
     int8x16_t b,
     int8x16_t c)
| `a -> Vd.16B`
`b -> Vn.16B`
`c -> Vm.16B` | `MLA Vd.16B,Vn.16B,Vm.16B` | `Vd.16B -> result` | `v7/A32/A64` | -| int16x4_t vmla_s16(
     int16x4_t a,
     int16x4_t b,
     int16x4_t c)
| `a -> Vd.4H`
`b -> Vn.4H`
`c -> Vm.4H` | `MLA Vd.4H,Vn.4H,Vm.4H` | `Vd.4H -> result` | `v7/A32/A64` | -| int16x8_t vmlaq_s16(
     int16x8_t a,
     int16x8_t b,
     int16x8_t c)
| `a -> Vd.8H`
`b -> Vn.8H`
`c -> Vm.8H` | `MLA Vd.8H,Vn.8H,Vm.8H` | `Vd.8H -> result` | `v7/A32/A64` | -| int32x2_t vmla_s32(
     int32x2_t a,
     int32x2_t b,
     int32x2_t c)
| `a -> Vd.2S`
`b -> Vn.2S`
`c -> Vm.2S` | `MLA Vd.2S,Vn.2S,Vm.2S` | `Vd.2S -> result` | `v7/A32/A64` | -| int32x4_t vmlaq_s32(
     int32x4_t a,
     int32x4_t b,
     int32x4_t c)
| `a -> Vd.4S`
`b -> Vn.4S`
`c -> Vm.4S` | `MLA Vd.4S,Vn.4S,Vm.4S` | `Vd.4S -> result` | `v7/A32/A64` | -| uint8x8_t vmla_u8(
     uint8x8_t a,
     uint8x8_t b,
     uint8x8_t c)
| `a -> Vd.8B`
`b -> Vn.8B`
`c -> Vm.8B` | `MLA Vd.8B,Vn.8B,Vm.8B` | `Vd.8B -> result` | `v7/A32/A64` | -| uint8x16_t vmlaq_u8(
     uint8x16_t a,
     uint8x16_t b,
     uint8x16_t c)
| `a -> Vd.16B`
`b -> Vn.16B`
`c -> Vm.16B` | `MLA Vd.16B,Vn.16B,Vm.16B` | `Vd.16B -> result` | `v7/A32/A64` | -| uint16x4_t vmla_u16(
     uint16x4_t a,
     uint16x4_t b,
     uint16x4_t c)
| `a -> Vd.4H`
`b -> Vn.4H`
`c -> Vm.4H` | `MLA Vd.4H,Vn.4H,Vm.4H` | `Vd.4H -> result` | `v7/A32/A64` | -| uint16x8_t vmlaq_u16(
     uint16x8_t a,
     uint16x8_t b,
     uint16x8_t c)
| `a -> Vd.8H`
`b -> Vn.8H`
`c -> Vm.8H` | `MLA Vd.8H,Vn.8H,Vm.8H` | `Vd.8H -> result` | `v7/A32/A64` | -| uint32x2_t vmla_u32(
     uint32x2_t a,
     uint32x2_t b,
     uint32x2_t c)
| `a -> Vd.2S`
`b -> Vn.2S`
`c -> Vm.2S` | `MLA Vd.2S,Vn.2S,Vm.2S` | `Vd.2S -> result` | `v7/A32/A64` | -| uint32x4_t vmlaq_u32(
     uint32x4_t a,
     uint32x4_t b,
     uint32x4_t c)
| `a -> Vd.4S`
`b -> Vn.4S`
`c -> Vm.4S` | `MLA Vd.4S,Vn.4S,Vm.4S` | `Vd.4S -> result` | `v7/A32/A64` | -| float32x2_t vmla_f32(
     float32x2_t a,
     float32x2_t b,
     float32x2_t c)
| `N/A` | `RESULT[I] = vadd(a[i], vmul(b[i], c[i])) for i = 0 to 1`
`Final instruction sequence is implementation defined` | `N/A` | `v7/A32/A64` | -| float32x4_t vmlaq_f32(
     float32x4_t a,
     float32x4_t b,
     float32x4_t c)
| `N/A` | `RESULT[I] = vadd(a[i], vmul(b[i], c[i])) for i = 0 to 3`
`Final instruction sequence is implementation defined` | `N/A` | `v7/A32/A64` | -| float64x1_t vmla_f64(
     float64x1_t a,
     float64x1_t b,
     float64x1_t c)
| `N/A` | `RESULT[I] = vadd(a[i], vmul(b[i], c[i])) for i = 0`
`Final instruction sequence is implementation defined` | `N/A` | `A64` | -| float64x2_t vmlaq_f64(
     float64x2_t a,
     float64x2_t b,
     float64x2_t c)
| `N/A` | `RESULT[I] = vadd(a[i], vmul(b[i], c[i])) for i = 0 to 1`
`Final instruction sequence is implementation defined` | `N/A` | `A64` | -| int8x8_t vmls_s8(
     int8x8_t a,
     int8x8_t b,
     int8x8_t c)
| `a -> Vd.8B`
`b -> Vn.8B`
`c -> Vm.8B` | `MLS Vd.8B,Vn.8B,Vm.8B` | `Vd.8B -> result` | `v7/A32/A64` | -| int8x16_t vmlsq_s8(
     int8x16_t a,
     int8x16_t b,
     int8x16_t c)
| `a -> Vd.16B`
`b -> Vn.16B`
`c -> Vm.16B` | `MLS Vd.16B,Vn.16B,Vm.16B` | `Vd.16B -> result` | `v7/A32/A64` | -| int16x4_t vmls_s16(
     int16x4_t a,
     int16x4_t b,
     int16x4_t c)
| `a -> Vd.4H`
`b -> Vn.4H`
`c -> Vm.4H` | `MLS Vd.4H,Vn.4H,Vm.4H` | `Vd.4H -> result` | `v7/A32/A64` | -| int16x8_t vmlsq_s16(
     int16x8_t a,
     int16x8_t b,
     int16x8_t c)
| `a -> Vd.8H`
`b -> Vn.8H`
`c -> Vm.8H` | `MLS Vd.8H,Vn.8H,Vm.8H` | `Vd.8H -> result` | `v7/A32/A64` | -| int32x2_t vmls_s32(
     int32x2_t a,
     int32x2_t b,
     int32x2_t c)
| `a -> Vd.2S`
`b -> Vn.2S`
`c -> Vm.2S` | `MLS Vd.2S,Vn.2S,Vm.2S` | `Vd.2S -> result` | `v7/A32/A64` | -| int32x4_t vmlsq_s32(
     int32x4_t a,
     int32x4_t b,
     int32x4_t c)
| `a -> Vd.4S`
`b -> Vn.4S`
`c -> Vm.4S` | `MLS Vd.4S,Vn.4S,Vm.4S` | `Vd.4S -> result` | `v7/A32/A64` | -| uint8x8_t vmls_u8(
     uint8x8_t a,
     uint8x8_t b,
     uint8x8_t c)
| `a -> Vd.8B`
`b -> Vn.8B`
`c -> Vm.8B` | `MLS Vd.8B,Vn.8B,Vm.8B` | `Vd.8B -> result` | `v7/A32/A64` | -| uint8x16_t vmlsq_u8(
     uint8x16_t a,
     uint8x16_t b,
     uint8x16_t c)
| `a -> Vd.16B`
`b -> Vn.16B`
`c -> Vm.16B` | `MLS Vd.16B,Vn.16B,Vm.16B` | `Vd.16B -> result` | `v7/A32/A64` | -| uint16x4_t vmls_u16(
     uint16x4_t a,
     uint16x4_t b,
     uint16x4_t c)
| `a -> Vd.4H`
`b -> Vn.4H`
`c -> Vm.4H` | `MLS Vd.4H,Vn.4H,Vm.4H` | `Vd.4H -> result` | `v7/A32/A64` | -| uint16x8_t vmlsq_u16(
     uint16x8_t a,
     uint16x8_t b,
     uint16x8_t c)
| `a -> Vd.8H`
`b -> Vn.8H`
`c -> Vm.8H` | `MLS Vd.8H,Vn.8H,Vm.8H` | `Vd.8H -> result` | `v7/A32/A64` | -| uint32x2_t vmls_u32(
     uint32x2_t a,
     uint32x2_t b,
     uint32x2_t c)
| `a -> Vd.2S`
`b -> Vn.2S`
`c -> Vm.2S` | `MLS Vd.2S,Vn.2S,Vm.2S` | `Vd.2S -> result` | `v7/A32/A64` | -| uint32x4_t vmlsq_u32(
     uint32x4_t a,
     uint32x4_t b,
     uint32x4_t c)
| `a -> Vd.4S`
`b -> Vn.4S`
`c -> Vm.4S` | `MLS Vd.4S,Vn.4S,Vm.4S` | `Vd.4S -> result` | `v7/A32/A64` | -| float32x2_t vmls_f32(
     float32x2_t a,
     float32x2_t b,
     float32x2_t c)
| `N/A` | `RESULT[I] = vsub(a[i], vmul(b[i], c[i])) for i = 0 to 1`
`Final instruction sequence is implementation defined` | `N/A` | `v7/A32/A64` | -| float32x4_t vmlsq_f32(
     float32x4_t a,
     float32x4_t b,
     float32x4_t c)
| `N/A` | `RESULT[I] = vsub(a[i], vmul(b[i], c[i])) for i = 0 to 3`
`Final instruction sequence is implementation defined` | `N/A` | `v7/A32/A64` | -| float64x1_t vmls_f64(
     float64x1_t a,
     float64x1_t b,
     float64x1_t c)
| `N/A` | `RESULT[I] = vsub(a[i], vmul(b[i], c[i])) for i = 0`
`Final instruction sequence is implementation defined` | `N/A` | `A64` | -| float64x2_t vmlsq_f64(
     float64x2_t a,
     float64x2_t b,
     float64x2_t c)
| `N/A` | `RESULT[I] = vsub(a[i], vmul(b[i], c[i])) for i = 0 to 1`
`Final instruction sequence is implementation defined` | `N/A` | `A64` | +| Intrinsic | Argument preparation | AArch64 Instruction | Result | Supported architectures | +|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------|-----------------------------------------------------------|--------------------|---------------------------| +| int8x8_t vmla_s8(
     int8x8_t a,
     int8x8_t b,
     int8x8_t c)
| `a -> Vd.8B`
`b -> Vn.8B`
`c -> Vm.8B` | `MLA Vd.8B,Vn.8B,Vm.8B` | `Vd.8B -> result` | `v7/A32/A64` | +| int8x16_t vmlaq_s8(
     int8x16_t a,
     int8x16_t b,
     int8x16_t c)
| `a -> Vd.16B`
`b -> Vn.16B`
`c -> Vm.16B` | `MLA Vd.16B,Vn.16B,Vm.16B` | `Vd.16B -> result` | `v7/A32/A64` | +| int16x4_t vmla_s16(
     int16x4_t a,
     int16x4_t b,
     int16x4_t c)
| `a -> Vd.4H`
`b -> Vn.4H`
`c -> Vm.4H` | `MLA Vd.4H,Vn.4H,Vm.4H` | `Vd.4H -> result` | `v7/A32/A64` | +| int16x8_t vmlaq_s16(
     int16x8_t a,
     int16x8_t b,
     int16x8_t c)
| `a -> Vd.8H`
`b -> Vn.8H`
`c -> Vm.8H` | `MLA Vd.8H,Vn.8H,Vm.8H` | `Vd.8H -> result` | `v7/A32/A64` | +| int32x2_t vmla_s32(
     int32x2_t a,
     int32x2_t b,
     int32x2_t c)
| `a -> Vd.2S`
`b -> Vn.2S`
`c -> Vm.2S` | `MLA Vd.2S,Vn.2S,Vm.2S` | `Vd.2S -> result` | `v7/A32/A64` | +| int32x4_t vmlaq_s32(
     int32x4_t a,
     int32x4_t b,
     int32x4_t c)
| `a -> Vd.4S`
`b -> Vn.4S`
`c -> Vm.4S` | `MLA Vd.4S,Vn.4S,Vm.4S` | `Vd.4S -> result` | `v7/A32/A64` | +| uint8x8_t vmla_u8(
     uint8x8_t a,
     uint8x8_t b,
     uint8x8_t c)
| `a -> Vd.8B`
`b -> Vn.8B`
`c -> Vm.8B` | `MLA Vd.8B,Vn.8B,Vm.8B` | `Vd.8B -> result` | `v7/A32/A64` | +| uint8x16_t vmlaq_u8(
     uint8x16_t a,
     uint8x16_t b,
     uint8x16_t c)
| `a -> Vd.16B`
`b -> Vn.16B`
`c -> Vm.16B` | `MLA Vd.16B,Vn.16B,Vm.16B` | `Vd.16B -> result` | `v7/A32/A64` | +| uint16x4_t vmla_u16(
     uint16x4_t a,
     uint16x4_t b,
     uint16x4_t c)
| `a -> Vd.4H`
`b -> Vn.4H`
`c -> Vm.4H` | `MLA Vd.4H,Vn.4H,Vm.4H` | `Vd.4H -> result` | `v7/A32/A64` | +| uint16x8_t vmlaq_u16(
     uint16x8_t a,
     uint16x8_t b,
     uint16x8_t c)
| `a -> Vd.8H`
`b -> Vn.8H`
`c -> Vm.8H` | `MLA Vd.8H,Vn.8H,Vm.8H` | `Vd.8H -> result` | `v7/A32/A64` | +| uint32x2_t vmla_u32(
     uint32x2_t a,
     uint32x2_t b,
     uint32x2_t c)
| `a -> Vd.2S`
`b -> Vn.2S`
`c -> Vm.2S` | `MLA Vd.2S,Vn.2S,Vm.2S` | `Vd.2S -> result` | `v7/A32/A64` | +| uint32x4_t vmlaq_u32(
     uint32x4_t a,
     uint32x4_t b,
     uint32x4_t c)
| `a -> Vd.4S`
`b -> Vn.4S`
`c -> Vm.4S` | `MLA Vd.4S,Vn.4S,Vm.4S` | `Vd.4S -> result` | `v7/A32/A64` | +| float32x2_t vmla_f32(
     float32x2_t a,
     float32x2_t b,
     float32x2_t c)
| `N/A` | `RESULT[I] = vadd(a[i], vmul(b[i], c[i])) for i = 0 to 1` | `N/A` | `v7/A32/A64` | +| float32x4_t vmlaq_f32(
     float32x4_t a,
     float32x4_t b,
     float32x4_t c)
| `N/A` | `RESULT[I] = vadd(a[i], vmul(b[i], c[i])) for i = 0 to 3` | `N/A` | `v7/A32/A64` | +| float64x1_t vmla_f64(
     float64x1_t a,
     float64x1_t b,
     float64x1_t c)
| `N/A` | `RESULT[I] = vadd(a[i], vmul(b[i], c[i])) for i = 0` | `N/A` | `A64` | +| float64x2_t vmlaq_f64(
     float64x2_t a,
     float64x2_t b,
     float64x2_t c)
| `N/A` | `RESULT[I] = vadd(a[i], vmul(b[i], c[i])) for i = 0 to 1` | `N/A` | `A64` | +| int8x8_t vmls_s8(
     int8x8_t a,
     int8x8_t b,
     int8x8_t c)
| `a -> Vd.8B`
`b -> Vn.8B`
`c -> Vm.8B` | `MLS Vd.8B,Vn.8B,Vm.8B` | `Vd.8B -> result` | `v7/A32/A64` | +| int8x16_t vmlsq_s8(
     int8x16_t a,
     int8x16_t b,
     int8x16_t c)
| `a -> Vd.16B`
`b -> Vn.16B`
`c -> Vm.16B` | `MLS Vd.16B,Vn.16B,Vm.16B` | `Vd.16B -> result` | `v7/A32/A64` | +| int16x4_t vmls_s16(
     int16x4_t a,
     int16x4_t b,
     int16x4_t c)
| `a -> Vd.4H`
`b -> Vn.4H`
`c -> Vm.4H` | `MLS Vd.4H,Vn.4H,Vm.4H` | `Vd.4H -> result` | `v7/A32/A64` | +| int16x8_t vmlsq_s16(
     int16x8_t a,
     int16x8_t b,
     int16x8_t c)
| `a -> Vd.8H`
`b -> Vn.8H`
`c -> Vm.8H` | `MLS Vd.8H,Vn.8H,Vm.8H` | `Vd.8H -> result` | `v7/A32/A64` | +| int32x2_t vmls_s32(
     int32x2_t a,
     int32x2_t b,
     int32x2_t c)
| `a -> Vd.2S`
`b -> Vn.2S`
`c -> Vm.2S` | `MLS Vd.2S,Vn.2S,Vm.2S` | `Vd.2S -> result` | `v7/A32/A64` | +| int32x4_t vmlsq_s32(
     int32x4_t a,
     int32x4_t b,
     int32x4_t c)
| `a -> Vd.4S`
`b -> Vn.4S`
`c -> Vm.4S` | `MLS Vd.4S,Vn.4S,Vm.4S` | `Vd.4S -> result` | `v7/A32/A64` | +| uint8x8_t vmls_u8(
     uint8x8_t a,
     uint8x8_t b,
     uint8x8_t c)
| `a -> Vd.8B`
`b -> Vn.8B`
`c -> Vm.8B` | `MLS Vd.8B,Vn.8B,Vm.8B` | `Vd.8B -> result` | `v7/A32/A64` | +| uint8x16_t vmlsq_u8(
     uint8x16_t a,
     uint8x16_t b,
     uint8x16_t c)
| `a -> Vd.16B`
`b -> Vn.16B`
`c -> Vm.16B` | `MLS Vd.16B,Vn.16B,Vm.16B` | `Vd.16B -> result` | `v7/A32/A64` | +| uint16x4_t vmls_u16(
     uint16x4_t a,
     uint16x4_t b,
     uint16x4_t c)
| `a -> Vd.4H`
`b -> Vn.4H`
`c -> Vm.4H` | `MLS Vd.4H,Vn.4H,Vm.4H` | `Vd.4H -> result` | `v7/A32/A64` | +| uint16x8_t vmlsq_u16(
     uint16x8_t a,
     uint16x8_t b,
     uint16x8_t c)
| `a -> Vd.8H`
`b -> Vn.8H`
`c -> Vm.8H` | `MLS Vd.8H,Vn.8H,Vm.8H` | `Vd.8H -> result` | `v7/A32/A64` | +| uint32x2_t vmls_u32(
     uint32x2_t a,
     uint32x2_t b,
     uint32x2_t c)
| `a -> Vd.2S`
`b -> Vn.2S`
`c -> Vm.2S` | `MLS Vd.2S,Vn.2S,Vm.2S` | `Vd.2S -> result` | `v7/A32/A64` | +| uint32x4_t vmlsq_u32(
     uint32x4_t a,
     uint32x4_t b,
     uint32x4_t c)
| `a -> Vd.4S`
`b -> Vn.4S`
`c -> Vm.4S` | `MLS Vd.4S,Vn.4S,Vm.4S` | `Vd.4S -> result` | `v7/A32/A64` | +| float32x2_t vmls_f32(
     float32x2_t a,
     float32x2_t b,
     float32x2_t c)
| `N/A` | `RESULT[I] = vsub(a[i], vmul(b[i], c[i])) for i = 0 to 1` | `N/A` | `v7/A32/A64` | +| float32x4_t vmlsq_f32(
     float32x4_t a,
     float32x4_t b,
     float32x4_t c)
| `N/A` | `RESULT[I] = vsub(a[i], vmul(b[i], c[i])) for i = 0 to 3` | `N/A` | `v7/A32/A64` | +| float64x1_t vmls_f64(
     float64x1_t a,
     float64x1_t b,
     float64x1_t c)
| `N/A` | `RESULT[i] = vsub(a[i], vmul(b[i], c[i])) for i = 0` | `N/A` | `A64` | +| float64x2_t vmlsq_f64(
     float64x2_t a,
     float64x2_t b,
     float64x2_t c)
| `N/A` | `RESULT[I] = vsub(a[i], vmul(b[i], c[i])) for i = 0 to 1` | `N/A` | `A64` | ##### Multiply-accumulate and widen @@ -2663,95 +2663,95 @@ The intrinsics in this section are guarded by the macro ``__ARM_NEON``. #### Vector multiply-accumulate by scalar -| Intrinsic | Argument preparation | AArch64 Instruction | Result | Supported architectures | -|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------|-------------------|---------------------------| -| int16x4_t vmla_lane_s16(
     int16x4_t a,
     int16x4_t b,
     int16x4_t v,
     const int lane)
| `a -> Vd.4H`
`b -> Vn.4H`
`v -> Vm.4H`
`0 <= lane <= 3` | `MLA Vd.4H,Vn.4H,Vm.H[lane]` | `Vd.4H -> result` | `v7/A32/A64` | -| int16x8_t vmlaq_lane_s16(
     int16x8_t a,
     int16x8_t b,
     int16x4_t v,
     const int lane)
| `a -> Vd.8H`
`b -> Vn.8H`
`v -> Vm.4H`
`0 <= lane <= 3` | `MLA Vd.8H,Vn.8H,Vm.H[lane]` | `Vd.8H -> result` | `v7/A32/A64` | -| int32x2_t vmla_lane_s32(
     int32x2_t a,
     int32x2_t b,
     int32x2_t v,
     const int lane)
| `a -> Vd.2S`
`b -> Vn.2S`
`v -> Vm.2S`
`0 <= lane <= 1` | `MLA Vd.2S,Vn.2S,Vm.S[lane]` | `Vd.2S -> result` | `v7/A32/A64` | -| int32x4_t vmlaq_lane_s32(
     int32x4_t a,
     int32x4_t b,
     int32x2_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4S`
`v -> Vm.2S`
`0 <= lane <= 1` | `MLA Vd.4S,Vn.4S,Vm.S[lane]` | `Vd.4S -> result` | `v7/A32/A64` | -| uint16x4_t vmla_lane_u16(
     uint16x4_t a,
     uint16x4_t b,
     uint16x4_t v,
     const int lane)
| `a -> Vd.4H`
`b -> Vn.4H`
`v -> Vm.4H`
`0 <= lane <= 3` | `MLA Vd.4H,Vn.4H,Vm.H[lane]` | `Vd.4H -> result` | `v7/A32/A64` | -| uint16x8_t vmlaq_lane_u16(
     uint16x8_t a,
     uint16x8_t b,
     uint16x4_t v,
     const int lane)
| `a -> Vd.8H`
`b -> Vn.8H`
`v -> Vm.4H`
`0 <= lane <= 3` | `MLA Vd.8H,Vn.8H,Vm.H[lane]` | `Vd.8H -> result` | `v7/A32/A64` | -| uint32x2_t vmla_lane_u32(
     uint32x2_t a,
     uint32x2_t b,
     uint32x2_t v,
     const int lane)
| `a -> Vd.2S`
`b -> Vn.2S`
`v -> Vm.2S`
`0 <= lane <= 1` | `MLA Vd.2S,Vn.2S,Vm.S[lane]` | `Vd.2S -> result` | `v7/A32/A64` | -| uint32x4_t vmlaq_lane_u32(
     uint32x4_t a,
     uint32x4_t b,
     uint32x2_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4S`
`v -> Vm.2S`
`0 <= lane <= 1` | `MLA Vd.4S,Vn.4S,Vm.S[lane]` | `Vd.4S -> result` | `v7/A32/A64` | -| float32x2_t vmla_lane_f32(
     float32x2_t a,
     float32x2_t b,
     float32x2_t v,
     const int lane)
| `0 <= lane <= 1` | `RESULT[I] = vadd(a[i], vmul(b[i], v[lane])) for i = 0 to 1`
`Final instruction sequence is implementation defined` | `N/A` | `v7/A32/A64` | -| float32x4_t vmlaq_lane_f32(
     float32x4_t a,
     float32x4_t b,
     float32x2_t v,
     const int lane)
| `0 <= lane <= 1` | `RESULT[I] = vadd(a[i], vmul(b[i], v[lane])) for i = 0 to 3`
`Final instruction sequence is implementation defined` | `N/A` | `v7/A32/A64` | -| int16x4_t vmla_laneq_s16(
     int16x4_t a,
     int16x4_t b,
     int16x8_t v,
     const int lane)
| `a -> Vd.4H`
`b -> Vn.4H`
`v -> Vm.8H`
`0 <= lane <= 7` | `MLA Vd.4H,Vn.4H,Vm.H[lane]` | `Vd.4H -> result` | `A64` | -| int16x8_t vmlaq_laneq_s16(
     int16x8_t a,
     int16x8_t b,
     int16x8_t v,
     const int lane)
| `a -> Vd.8H`
`b -> Vn.8H`
`v -> Vm.8H`
`0 <= lane <= 7` | `MLA Vd.8H,Vn.8H,Vm.H[lane]` | `Vd.8H -> result` | `A64` | -| int32x2_t vmla_laneq_s32(
     int32x2_t a,
     int32x2_t b,
     int32x4_t v,
     const int lane)
| `a -> Vd.2S`
`b -> Vn.2S`
`v -> Vm.4S`
`0 <= lane <= 3` | `MLA Vd.2S,Vn.2S,Vm.S[lane]` | `Vd.2S -> result` | `A64` | -| int32x4_t vmlaq_laneq_s32(
     int32x4_t a,
     int32x4_t b,
     int32x4_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4S`
`v -> Vm.4S`
`0 <= lane <= 3` | `MLA Vd.4S,Vn.4S,Vm.S[lane]` | `Vd.4S -> result` | `A64` | -| uint16x4_t vmla_laneq_u16(
     uint16x4_t a,
     uint16x4_t b,
     uint16x8_t v,
     const int lane)
| `a -> Vd.4H`
`b -> Vn.4H`
`v -> Vm.8H`
`0 <= lane <= 7` | `MLA Vd.4H,Vn.4H,Vm.H[lane]` | `Vd.4H -> result` | `A64` | -| uint16x8_t vmlaq_laneq_u16(
     uint16x8_t a,
     uint16x8_t b,
     uint16x8_t v,
     const int lane)
| `a -> Vd.8H`
`b -> Vn.8H`
`v -> Vm.8H`
`0 <= lane <= 7` | `MLA Vd.8H,Vn.8H,Vm.H[lane]` | `Vd.8H -> result` | `A64` | -| uint32x2_t vmla_laneq_u32(
     uint32x2_t a,
     uint32x2_t b,
     uint32x4_t v,
     const int lane)
| `a -> Vd.2S`
`b -> Vn.2S`
`v -> Vm.4S`
`0 <= lane <= 3` | `MLA Vd.2S,Vn.2S,Vm.S[lane]` | `Vd.2S -> result` | `A64` | -| uint32x4_t vmlaq_laneq_u32(
     uint32x4_t a,
     uint32x4_t b,
     uint32x4_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4S`
`v -> Vm.4S`
`0 <= lane <= 3` | `MLA Vd.4S,Vn.4S,Vm.S[lane]` | `Vd.4S -> result` | `A64` | -| float32x2_t vmla_laneq_f32(
     float32x2_t a,
     float32x2_t b,
     float32x4_t v,
     const int lane)
| `0 <= lane <= 3` | `RESULT[I] = vadd(a[i], vmul(b[i], v[lane])) for i = 0 to 1`
`Final instruction sequence is implementation defined` | `N/A` | `A64` | -| float32x4_t vmlaq_laneq_f32(
     float32x4_t a,
     float32x4_t b,
     float32x4_t v,
     const int lane)
| `0 <= lane <= 3` | `RESULT[I] = vadd(a[i], vmul(b[i], v[lane])) for i = 0 to 3`
`Final instruction sequence is implementation defined` | `N/A` | `A64` | -| int32x4_t vmlal_lane_s16(
     int32x4_t a,
     int16x4_t b,
     int16x4_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4H`
`v -> Vm.4H`
`0 <= lane <= 3` | `SMLAL Vd.4S,Vn.4H,Vm.H[lane]` | `Vd.4S -> result` | `v7/A32/A64` | -| int64x2_t vmlal_lane_s32(
     int64x2_t a,
     int32x2_t b,
     int32x2_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.2S`
`v -> Vm.2S`
`0 <= lane <= 1` | `SMLAL Vd.2D,Vn.2S,Vm.S[lane]` | `Vd.2D -> result` | `v7/A32/A64` | -| uint32x4_t vmlal_lane_u16(
     uint32x4_t a,
     uint16x4_t b,
     uint16x4_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4H`
`v -> Vm.4H`
`0 <= lane <= 3` | `UMLAL Vd.4S,Vn.4H,Vm.H[lane]` | `Vd.4S -> result` | `v7/A32/A64` | -| uint64x2_t vmlal_lane_u32(
     uint64x2_t a,
     uint32x2_t b,
     uint32x2_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.2S`
`v -> Vm.2S`
`0 <= lane <= 1` | `UMLAL Vd.2D,Vn.2S,Vm.S[lane]` | `Vd.2D -> result` | `v7/A32/A64` | -| int32x4_t vmlal_high_lane_s16(
     int32x4_t a,
     int16x8_t b,
     int16x4_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.8H`
`v -> Vm.4H`
`0 <= lane <= 3` | `SMLAL2 Vd.4S,Vn.8H,Vm.H[lane]` | `Vd.4S -> result` | `A64` | -| int64x2_t vmlal_high_lane_s32(
     int64x2_t a,
     int32x4_t b,
     int32x2_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.4S`
`v -> Vm.2S`
`0 <= lane <= 1` | `SMLAL2 Vd.2D,Vn.4S,Vm.S[lane]` | `Vd.2D -> result` | `A64` | -| uint32x4_t vmlal_high_lane_u16(
     uint32x4_t a,
     uint16x8_t b,
     uint16x4_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.8H`
`v -> Vm.4H`
`0 <= lane <= 3` | `UMLAL2 Vd.4S,Vn.8H,Vm.H[lane]` | `Vd.4S -> result` | `A64` | -| uint64x2_t vmlal_high_lane_u32(
     uint64x2_t a,
     uint32x4_t b,
     uint32x2_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.4S`
`v -> Vm.2S`
`0 <= lane <= 1` | `UMLAL2 Vd.2D,Vn.4S,Vm.S[lane]` | `Vd.2D -> result` | `A64` | -| int32x4_t vmlal_laneq_s16(
     int32x4_t a,
     int16x4_t b,
     int16x8_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4H`
`v -> Vm.8H`
`0 <= lane <= 7` | `SMLAL Vd.4S,Vn.4H,Vm.H[lane]` | `Vd.4S -> result` | `A64` | -| int64x2_t vmlal_laneq_s32(
     int64x2_t a,
     int32x2_t b,
     int32x4_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.2S`
`v -> Vm.4S`
`0 <= lane <= 3` | `SMLAL Vd.2D,Vn.2S,Vm.S[lane]` | `Vd.2D -> result` | `A64` | -| uint32x4_t vmlal_laneq_u16(
     uint32x4_t a,
     uint16x4_t b,
     uint16x8_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4H`
`v -> Vm.8H`
`0 <= lane <= 7` | `UMLAL Vd.4S,Vn.4H,Vm.H[lane]` | `Vd.4S -> result` | `A64` | -| uint64x2_t vmlal_laneq_u32(
     uint64x2_t a,
     uint32x2_t b,
     uint32x4_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.2S`
`v -> Vm.4S`
`0 <= lane <= 3` | `UMLAL Vd.2D,Vn.2S,Vm.S[lane]` | `Vd.2D -> result` | `A64` | -| int32x4_t vmlal_high_laneq_s16(
     int32x4_t a,
     int16x8_t b,
     int16x8_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.8H`
`v -> Vm.8H`
`0 <= lane <= 7` | `SMLAL2 Vd.4S,Vn.8H,Vm.H[lane]` | `Vd.4S -> result` | `A64` | -| int64x2_t vmlal_high_laneq_s32(
     int64x2_t a,
     int32x4_t b,
     int32x4_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.4S`
`v -> Vm.4S`
`0 <= lane <= 3` | `SMLAL2 Vd.2D,Vn.4S,Vm.S[lane]` | `Vd.2D -> result` | `A64` | -| uint32x4_t vmlal_high_laneq_u16(
     uint32x4_t a,
     uint16x8_t b,
     uint16x8_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.8H`
`v -> Vm.8H`
`0 <= lane <= 7` | `UMLAL2 Vd.4S,Vn.8H,Vm.H[lane]` | `Vd.4S -> result` | `A64` | -| uint64x2_t vmlal_high_laneq_u32(
     uint64x2_t a,
     uint32x4_t b,
     uint32x4_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.4S`
`v -> Vm.4S`
`0 <= lane <= 3` | `UMLAL2 Vd.2D,Vn.4S,Vm.S[lane]` | `Vd.2D -> result` | `A64` | -| int16x4_t vmla_n_s16(
     int16x4_t a,
     int16x4_t b,
     int16_t c)
| `a -> Vd.4H`
`b -> Vn.4H`
`c -> Vm.H[0]` | `MLA Vd.4H,Vn.4H,Vm.H[0]` | `Vd.4H -> result` | `v7/A32/A64` | -| int16x8_t vmlaq_n_s16(
     int16x8_t a,
     int16x8_t b,
     int16_t c)
| `a -> Vd.8H`
`b -> Vn.8H`
`c -> Vm.H[0]` | `MLA Vd.8H,Vn.8H,Vm.H[0]` | `Vd.8H -> result` | `v7/A32/A64` | -| int32x2_t vmla_n_s32(
     int32x2_t a,
     int32x2_t b,
     int32_t c)
| `a -> Vd.2S`
`b -> Vn.2S`
`c -> Vm.S[0]` | `MLA Vd.2S,Vn.2S,Vm.S[0]` | `Vd.2S -> result` | `v7/A32/A64` | -| int32x4_t vmlaq_n_s32(
     int32x4_t a,
     int32x4_t b,
     int32_t c)
| `a -> Vd.4S`
`b -> Vn.4S`
`c -> Vm.S[0]` | `MLA Vd.4S,Vn.4S,Vm.S[0]` | `Vd.4S -> result` | `v7/A32/A64` | -| uint16x4_t vmla_n_u16(
     uint16x4_t a,
     uint16x4_t b,
     uint16_t c)
| `a -> Vd.4H`
`b -> Vn.4H`
`c -> Vm.H[0]` | `MLA Vd.4H,Vn.4H,Vm.H[0]` | `Vd.4H -> result` | `v7/A32/A64` | -| uint16x8_t vmlaq_n_u16(
     uint16x8_t a,
     uint16x8_t b,
     uint16_t c)
| `a -> Vd.8H`
`b -> Vn.8H`
`c -> Vm.H[0]` | `MLA Vd.8H,Vn.8H,Vm.H[0]` | `Vd.8H -> result` | `v7/A32/A64` | -| uint32x2_t vmla_n_u32(
     uint32x2_t a,
     uint32x2_t b,
     uint32_t c)
| `a -> Vd.2S`
`b -> Vn.2S`
`c -> Vm.S[0]` | `MLA Vd.2S,Vn.2S,Vm.S[0]` | `Vd.2S -> result` | `v7/A32/A64` | -| uint32x4_t vmlaq_n_u32(
     uint32x4_t a,
     uint32x4_t b,
     uint32_t c)
| `a -> Vd.4S`
`b -> Vn.4S`
`c -> Vm.S[0]` | `MLA Vd.4S,Vn.4S,Vm.S[0]` | `Vd.4S -> result` | `v7/A32/A64` | -| float32x2_t vmla_n_f32(
     float32x2_t a,
     float32x2_t b,
     float32_t c)
| `N/A` | `RESULT[I] = vadd(a[i], vmul(b[i], c)) for i = 0 to 1`
`Final instruction sequence is implementation defined` | `N/A` | `v7/A32/A64` | -| float32x4_t vmlaq_n_f32(
     float32x4_t a,
     float32x4_t b,
     float32_t c)
| `N/A` | `RESULT[I] = vadd(a[i], vmul(b[i], c)) for i = 0 to 3`
`Final instruction sequence is implementation defined` | `N/A` | `v7/A32/A64` | +| Intrinsic | Argument preparation | AArch64 Instruction | Result | Supported architectures | +|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------|--------------------------------------------------------------|-------------------|---------------------------| +| int16x4_t vmla_lane_s16(
     int16x4_t a,
     int16x4_t b,
     int16x4_t v,
     const int lane)
| `a -> Vd.4H`
`b -> Vn.4H`
`v -> Vm.4H`
`0 <= lane <= 3` | `MLA Vd.4H,Vn.4H,Vm.H[lane]` | `Vd.4H -> result` | `v7/A32/A64` | +| int16x8_t vmlaq_lane_s16(
     int16x8_t a,
     int16x8_t b,
     int16x4_t v,
     const int lane)
| `a -> Vd.8H`
`b -> Vn.8H`
`v -> Vm.4H`
`0 <= lane <= 3` | `MLA Vd.8H,Vn.8H,Vm.H[lane]` | `Vd.8H -> result` | `v7/A32/A64` | +| int32x2_t vmla_lane_s32(
     int32x2_t a,
     int32x2_t b,
     int32x2_t v,
     const int lane)
| `a -> Vd.2S`
`b -> Vn.2S`
`v -> Vm.2S`
`0 <= lane <= 1` | `MLA Vd.2S,Vn.2S,Vm.S[lane]` | `Vd.2S -> result` | `v7/A32/A64` | +| int32x4_t vmlaq_lane_s32(
     int32x4_t a,
     int32x4_t b,
     int32x2_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4S`
`v -> Vm.2S`
`0 <= lane <= 1` | `MLA Vd.4S,Vn.4S,Vm.S[lane]` | `Vd.4S -> result` | `v7/A32/A64` | +| uint16x4_t vmla_lane_u16(
     uint16x4_t a,
     uint16x4_t b,
     uint16x4_t v,
     const int lane)
| `a -> Vd.4H`
`b -> Vn.4H`
`v -> Vm.4H`
`0 <= lane <= 3` | `MLA Vd.4H,Vn.4H,Vm.H[lane]` | `Vd.4H -> result` | `v7/A32/A64` | +| uint16x8_t vmlaq_lane_u16(
     uint16x8_t a,
     uint16x8_t b,
     uint16x4_t v,
     const int lane)
| `a -> Vd.8H`
`b -> Vn.8H`
`v -> Vm.4H`
`0 <= lane <= 3` | `MLA Vd.8H,Vn.8H,Vm.H[lane]` | `Vd.8H -> result` | `v7/A32/A64` | +| uint32x2_t vmla_lane_u32(
     uint32x2_t a,
     uint32x2_t b,
     uint32x2_t v,
     const int lane)
| `a -> Vd.2S`
`b -> Vn.2S`
`v -> Vm.2S`
`0 <= lane <= 1` | `MLA Vd.2S,Vn.2S,Vm.S[lane]` | `Vd.2S -> result` | `v7/A32/A64` | +| uint32x4_t vmlaq_lane_u32(
     uint32x4_t a,
     uint32x4_t b,
     uint32x2_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4S`
`v -> Vm.2S`
`0 <= lane <= 1` | `MLA Vd.4S,Vn.4S,Vm.S[lane]` | `Vd.4S -> result` | `v7/A32/A64` | +| float32x2_t vmla_lane_f32(
     float32x2_t a,
     float32x2_t b,
     float32x2_t v,
     const int lane)
| `0 <= lane <= 1` | `RESULT[i] = vadd(a[i], vmul(b[i], v[lane])) for i = 0 to 1` | `N/A` | `v7/A32/A64` | +| float32x4_t vmlaq_lane_f32(
     float32x4_t a,
     float32x4_t b,
     float32x2_t v,
     const int lane)
| `0 <= lane <= 1` | `RESULT[i] = vadd(a[i], vmul(b[i], v[lane])) for i = 0 to 3` | `N/A` | `v7/A32/A64` | +| int16x4_t vmla_laneq_s16(
     int16x4_t a,
     int16x4_t b,
     int16x8_t v,
     const int lane)
| `a -> Vd.4H`
`b -> Vn.4H`
`v -> Vm.8H`
`0 <= lane <= 7` | `MLA Vd.4H,Vn.4H,Vm.H[lane]` | `Vd.4H -> result` | `A64` | +| int16x8_t vmlaq_laneq_s16(
     int16x8_t a,
     int16x8_t b,
     int16x8_t v,
     const int lane)
| `a -> Vd.8H`
`b -> Vn.8H`
`v -> Vm.8H`
`0 <= lane <= 7` | `MLA Vd.8H,Vn.8H,Vm.H[lane]` | `Vd.8H -> result` | `A64` | +| int32x2_t vmla_laneq_s32(
     int32x2_t a,
     int32x2_t b,
     int32x4_t v,
     const int lane)
| `a -> Vd.2S`
`b -> Vn.2S`
`v -> Vm.4S`
`0 <= lane <= 3` | `MLA Vd.2S,Vn.2S,Vm.S[lane]` | `Vd.2S -> result` | `A64` | +| int32x4_t vmlaq_laneq_s32(
     int32x4_t a,
     int32x4_t b,
     int32x4_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4S`
`v -> Vm.4S`
`0 <= lane <= 3` | `MLA Vd.4S,Vn.4S,Vm.S[lane]` | `Vd.4S -> result` | `A64` | +| uint16x4_t vmla_laneq_u16(
     uint16x4_t a,
     uint16x4_t b,
     uint16x8_t v,
     const int lane)
| `a -> Vd.4H`
`b -> Vn.4H`
`v -> Vm.8H`
`0 <= lane <= 7` | `MLA Vd.4H,Vn.4H,Vm.H[lane]` | `Vd.4H -> result` | `A64` | +| uint16x8_t vmlaq_laneq_u16(
     uint16x8_t a,
     uint16x8_t b,
     uint16x8_t v,
     const int lane)
| `a -> Vd.8H`
`b -> Vn.8H`
`v -> Vm.8H`
`0 <= lane <= 7` | `MLA Vd.8H,Vn.8H,Vm.H[lane]` | `Vd.8H -> result` | `A64` | +| uint32x2_t vmla_laneq_u32(
     uint32x2_t a,
     uint32x2_t b,
     uint32x4_t v,
     const int lane)
| `a -> Vd.2S`
`b -> Vn.2S`
`v -> Vm.4S`
`0 <= lane <= 3` | `MLA Vd.2S,Vn.2S,Vm.S[lane]` | `Vd.2S -> result` | `A64` | +| uint32x4_t vmlaq_laneq_u32(
     uint32x4_t a,
     uint32x4_t b,
     uint32x4_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4S`
`v -> Vm.4S`
`0 <= lane <= 3` | `MLA Vd.4S,Vn.4S,Vm.S[lane]` | `Vd.4S -> result` | `A64` | +| float32x2_t vmla_laneq_f32(
     float32x2_t a,
     float32x2_t b,
     float32x4_t v,
     const int lane)
| `0 <= lane <= 3` | `RESULT[i] = vadd(a[i], vmul(b[i], v[lane])) for i = 0 to 1` | `N/A` | `A64` | +| float32x4_t vmlaq_laneq_f32(
     float32x4_t a,
     float32x4_t b,
     float32x4_t v,
     const int lane)
| `0 <= lane <= 3` | `RESULT[i] = vadd(a[i], vmul(b[i], v[lane])) for i = 0 to 3` | `N/A` | `A64` | +| int32x4_t vmlal_lane_s16(
     int32x4_t a,
     int16x4_t b,
     int16x4_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4H`
`v -> Vm.4H`
`0 <= lane <= 3` | `SMLAL Vd.4S,Vn.4H,Vm.H[lane]` | `Vd.4S -> result` | `v7/A32/A64` | +| int64x2_t vmlal_lane_s32(
     int64x2_t a,
     int32x2_t b,
     int32x2_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.2S`
`v -> Vm.2S`
`0 <= lane <= 1` | `SMLAL Vd.2D,Vn.2S,Vm.S[lane]` | `Vd.2D -> result` | `v7/A32/A64` | +| uint32x4_t vmlal_lane_u16(
     uint32x4_t a,
     uint16x4_t b,
     uint16x4_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4H`
`v -> Vm.4H`
`0 <= lane <= 3` | `UMLAL Vd.4S,Vn.4H,Vm.H[lane]` | `Vd.4S -> result` | `v7/A32/A64` | +| uint64x2_t vmlal_lane_u32(
     uint64x2_t a,
     uint32x2_t b,
     uint32x2_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.2S`
`v -> Vm.2S`
`0 <= lane <= 1` | `UMLAL Vd.2D,Vn.2S,Vm.S[lane]` | `Vd.2D -> result` | `v7/A32/A64` | +| int32x4_t vmlal_high_lane_s16(
     int32x4_t a,
     int16x8_t b,
     int16x4_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.8H`
`v -> Vm.4H`
`0 <= lane <= 3` | `SMLAL2 Vd.4S,Vn.8H,Vm.H[lane]` | `Vd.4S -> result` | `A64` | +| int64x2_t vmlal_high_lane_s32(
     int64x2_t a,
     int32x4_t b,
     int32x2_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.4S`
`v -> Vm.2S`
`0 <= lane <= 1` | `SMLAL2 Vd.2D,Vn.4S,Vm.S[lane]` | `Vd.2D -> result` | `A64` | +| uint32x4_t vmlal_high_lane_u16(
     uint32x4_t a,
     uint16x8_t b,
     uint16x4_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.8H`
`v -> Vm.4H`
`0 <= lane <= 3` | `UMLAL2 Vd.4S,Vn.8H,Vm.H[lane]` | `Vd.4S -> result` | `A64` | +| uint64x2_t vmlal_high_lane_u32(
     uint64x2_t a,
     uint32x4_t b,
     uint32x2_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.4S`
`v -> Vm.2S`
`0 <= lane <= 1` | `UMLAL2 Vd.2D,Vn.4S,Vm.S[lane]` | `Vd.2D -> result` | `A64` | +| int32x4_t vmlal_laneq_s16(
     int32x4_t a,
     int16x4_t b,
     int16x8_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4H`
`v -> Vm.8H`
`0 <= lane <= 7` | `SMLAL Vd.4S,Vn.4H,Vm.H[lane]` | `Vd.4S -> result` | `A64` | +| int64x2_t vmlal_laneq_s32(
     int64x2_t a,
     int32x2_t b,
     int32x4_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.2S`
`v -> Vm.4S`
`0 <= lane <= 3` | `SMLAL Vd.2D,Vn.2S,Vm.S[lane]` | `Vd.2D -> result` | `A64` | +| uint32x4_t vmlal_laneq_u16(
     uint32x4_t a,
     uint16x4_t b,
     uint16x8_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4H`
`v -> Vm.8H`
`0 <= lane <= 7` | `UMLAL Vd.4S,Vn.4H,Vm.H[lane]` | `Vd.4S -> result` | `A64` | +| uint64x2_t vmlal_laneq_u32(
     uint64x2_t a,
     uint32x2_t b,
     uint32x4_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.2S`
`v -> Vm.4S`
`0 <= lane <= 3` | `UMLAL Vd.2D,Vn.2S,Vm.S[lane]` | `Vd.2D -> result` | `A64` | +| int32x4_t vmlal_high_laneq_s16(
     int32x4_t a,
     int16x8_t b,
     int16x8_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.8H`
`v -> Vm.8H`
`0 <= lane <= 7` | `SMLAL2 Vd.4S,Vn.8H,Vm.H[lane]` | `Vd.4S -> result` | `A64` | +| int64x2_t vmlal_high_laneq_s32(
     int64x2_t a,
     int32x4_t b,
     int32x4_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.4S`
`v -> Vm.4S`
`0 <= lane <= 3` | `SMLAL2 Vd.2D,Vn.4S,Vm.S[lane]` | `Vd.2D -> result` | `A64` | +| uint32x4_t vmlal_high_laneq_u16(
     uint32x4_t a,
     uint16x8_t b,
     uint16x8_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.8H`
`v -> Vm.8H`
`0 <= lane <= 7` | `UMLAL2 Vd.4S,Vn.8H,Vm.H[lane]` | `Vd.4S -> result` | `A64` | +| uint64x2_t vmlal_high_laneq_u32(
     uint64x2_t a,
     uint32x4_t b,
     uint32x4_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.4S`
`v -> Vm.4S`
`0 <= lane <= 3` | `UMLAL2 Vd.2D,Vn.4S,Vm.S[lane]` | `Vd.2D -> result` | `A64` | +| int16x4_t vmla_n_s16(
     int16x4_t a,
     int16x4_t b,
     int16_t c)
| `a -> Vd.4H`
`b -> Vn.4H`
`c -> Vm.H[0]` | `MLA Vd.4H,Vn.4H,Vm.H[0]` | `Vd.4H -> result` | `v7/A32/A64` | +| int16x8_t vmlaq_n_s16(
     int16x8_t a,
     int16x8_t b,
     int16_t c)
| `a -> Vd.8H`
`b -> Vn.8H`
`c -> Vm.H[0]` | `MLA Vd.8H,Vn.8H,Vm.H[0]` | `Vd.8H -> result` | `v7/A32/A64` | +| int32x2_t vmla_n_s32(
     int32x2_t a,
     int32x2_t b,
     int32_t c)
| `a -> Vd.2S`
`b -> Vn.2S`
`c -> Vm.S[0]` | `MLA Vd.2S,Vn.2S,Vm.S[0]` | `Vd.2S -> result` | `v7/A32/A64` | +| int32x4_t vmlaq_n_s32(
     int32x4_t a,
     int32x4_t b,
     int32_t c)
| `a -> Vd.4S`
`b -> Vn.4S`
`c -> Vm.S[0]` | `MLA Vd.4S,Vn.4S,Vm.S[0]` | `Vd.4S -> result` | `v7/A32/A64` | +| uint16x4_t vmla_n_u16(
     uint16x4_t a,
     uint16x4_t b,
     uint16_t c)
| `a -> Vd.4H`
`b -> Vn.4H`
`c -> Vm.H[0]` | `MLA Vd.4H,Vn.4H,Vm.H[0]` | `Vd.4H -> result` | `v7/A32/A64` | +| uint16x8_t vmlaq_n_u16(
     uint16x8_t a,
     uint16x8_t b,
     uint16_t c)
| `a -> Vd.8H`
`b -> Vn.8H`
`c -> Vm.H[0]` | `MLA Vd.8H,Vn.8H,Vm.H[0]` | `Vd.8H -> result` | `v7/A32/A64` | +| uint32x2_t vmla_n_u32(
     uint32x2_t a,
     uint32x2_t b,
     uint32_t c)
| `a -> Vd.2S`
`b -> Vn.2S`
`c -> Vm.S[0]` | `MLA Vd.2S,Vn.2S,Vm.S[0]` | `Vd.2S -> result` | `v7/A32/A64` | +| uint32x4_t vmlaq_n_u32(
     uint32x4_t a,
     uint32x4_t b,
     uint32_t c)
| `a -> Vd.4S`
`b -> Vn.4S`
`c -> Vm.S[0]` | `MLA Vd.4S,Vn.4S,Vm.S[0]` | `Vd.4S -> result` | `v7/A32/A64` | +| float32x2_t vmla_n_f32(
     float32x2_t a,
     float32x2_t b,
     float32_t c)
| `N/A` | `RESULT[i] = vadd(a[i], vmul(b[i], c)) for i = 0 to 1` | `N/A` | `v7/A32/A64` | +| float32x4_t vmlaq_n_f32(
     float32x4_t a,
     float32x4_t b,
     float32_t c)
| `N/A` | `RESULT[I] = vadd(a[i], vmul(b[i], c)) for i = 0 to 3` | `N/A` | `v7/A32/A64` | #### Vector multiply-subtract by scalar -| Intrinsic | Argument preparation | AArch64 Instruction | Result | Supported architectures | -|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------|-------------------|---------------------------| -| int16x4_t vmls_lane_s16(
     int16x4_t a,
     int16x4_t b,
     int16x4_t v,
     const int lane)
| `a -> Vd.4H`
`b -> Vn.4H`
`v -> Vm.4H`
`0 <= lane <= 3` | `MLS Vd.4H,Vn.4H,Vm.H[lane]` | `Vd.4H -> result` | `v7/A32/A64` | -| int16x8_t vmlsq_lane_s16(
     int16x8_t a,
     int16x8_t b,
     int16x4_t v,
     const int lane)
| `a -> Vd.8H`
`b -> Vn.8H`
`v -> Vm.4H`
`0 <= lane <= 3` | `MLS Vd.8H,Vn.8H,Vm.H[lane]` | `Vd.8H -> result` | `v7/A32/A64` | -| int32x2_t vmls_lane_s32(
     int32x2_t a,
     int32x2_t b,
     int32x2_t v,
     const int lane)
| `a -> Vd.2S`
`b -> Vn.2S`
`v -> Vm.2S`
`0 <= lane <= 1` | `MLS Vd.2S,Vn.2S,Vm.S[lane]` | `Vd.2S -> result` | `v7/A32/A64` | -| int32x4_t vmlsq_lane_s32(
     int32x4_t a,
     int32x4_t b,
     int32x2_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4S`
`v -> Vm.2S`
`0 <= lane <= 1` | `MLS Vd.4S,Vn.4S,Vm.S[lane]` | `Vd.4S -> result` | `v7/A32/A64` | -| uint16x4_t vmls_lane_u16(
     uint16x4_t a,
     uint16x4_t b,
     uint16x4_t v,
     const int lane)
| `a -> Vd.4H`
`b -> Vn.4H`
`v -> Vm.4H`
`0 <= lane <= 3` | `MLS Vd.4H,Vn.4H,Vm.H[lane]` | `Vd.4H -> result` | `v7/A32/A64` | -| uint16x8_t vmlsq_lane_u16(
     uint16x8_t a,
     uint16x8_t b,
     uint16x4_t v,
     const int lane)
| `a -> Vd.8H`
`b -> Vn.8H`
`v -> Vm.4H`
`0 <= lane <= 3` | `MLS Vd.8H,Vn.8H,Vm.H[lane]` | `Vd.8H -> result` | `v7/A32/A64` | -| uint32x2_t vmls_lane_u32(
     uint32x2_t a,
     uint32x2_t b,
     uint32x2_t v,
     const int lane)
| `a -> Vd.2S`
`b -> Vn.2S`
`v -> Vm.2S`
`0 <= lane <= 1` | `MLS Vd.2S,Vn.2S,Vm.S[lane]` | `Vd.2S -> result` | `v7/A32/A64` | -| uint32x4_t vmlsq_lane_u32(
     uint32x4_t a,
     uint32x4_t b,
     uint32x2_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4S`
`v -> Vm.2S`
`0 <= lane <= 1` | `MLS Vd.4S,Vn.4S,Vm.S[lane]` | `Vd.4S -> result` | `v7/A32/A64` | -| float32x2_t vmls_lane_f32(
     float32x2_t a,
     float32x2_t b,
     float32x2_t v,
     const int lane)
| `0 <= lane <= 1` | `RESULT[I] = vsub(a[i], vmul(b[i], v[lane])) for i = 0 to 1`
`Final instruction sequence is implementation defined` | `N/A` | `v7/A32/A64` | -| float32x4_t vmlsq_lane_f32(
     float32x4_t a,
     float32x4_t b,
     float32x2_t v,
     const int lane)
| `0 <= lane <= 1` | `RESULT[I] = vsub(a[i], vmul(b[i], v[lane])) for i = 0 to 3`
`Final instruction sequence is implementation defined` | `N/A` | `v7/A32/A64` | -| int16x4_t vmls_laneq_s16(
     int16x4_t a,
     int16x4_t b,
     int16x8_t v,
     const int lane)
| `a -> Vd.4H`
`b -> Vn.4H`
`v -> Vm.8H`
`0 <= lane <= 7` | `MLS Vd.4H,Vn.4H,Vm.H[lane]` | `Vd.4H -> result` | `A64` | -| int16x8_t vmlsq_laneq_s16(
     int16x8_t a,
     int16x8_t b,
     int16x8_t v,
     const int lane)
| `a -> Vd.8H`
`b -> Vn.8H`
`v -> Vm.8H`
`0 <= lane <= 7` | `MLS Vd.8H,Vn.8H,Vm.H[lane]` | `Vd.8H -> result` | `A64` | -| int32x2_t vmls_laneq_s32(
     int32x2_t a,
     int32x2_t b,
     int32x4_t v,
     const int lane)
| `a -> Vd.2S`
`b -> Vn.2S`
`v -> Vm.4S`
`0 <= lane <= 3` | `MLS Vd.2S,Vn.2S,Vm.S[lane]` | `Vd.2S -> result` | `A64` | -| int32x4_t vmlsq_laneq_s32(
     int32x4_t a,
     int32x4_t b,
     int32x4_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4S`
`v -> Vm.4S`
`0 <= lane <= 3` | `MLS Vd.4S,Vn.4S,Vm.S[lane]` | `Vd.4S -> result` | `A64` | -| uint16x4_t vmls_laneq_u16(
     uint16x4_t a,
     uint16x4_t b,
     uint16x8_t v,
     const int lane)
| `a -> Vd.4H`
`b -> Vn.4H`
`v -> Vm.8H`
`0 <= lane <= 7` | `MLS Vd.4H,Vn.4H,Vm.H[lane]` | `Vd.4H -> result` | `A64` | -| uint16x8_t vmlsq_laneq_u16(
     uint16x8_t a,
     uint16x8_t b,
     uint16x8_t v,
     const int lane)
| `a -> Vd.8H`
`b -> Vn.8H`
`v -> Vm.8H`
`0 <= lane <= 7` | `MLS Vd.8H,Vn.8H,Vm.H[lane]` | `Vd.8H -> result` | `A64` | -| uint32x2_t vmls_laneq_u32(
     uint32x2_t a,
     uint32x2_t b,
     uint32x4_t v,
     const int lane)
| `a -> Vd.2S`
`b -> Vn.2S`
`v -> Vm.4S`
`0 <= lane <= 3` | `MLS Vd.2S,Vn.2S,Vm.S[lane]` | `Vd.2S -> result` | `A64` | -| uint32x4_t vmlsq_laneq_u32(
     uint32x4_t a,
     uint32x4_t b,
     uint32x4_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4S`
`v -> Vm.4S`
`0 <= lane <= 3` | `MLS Vd.4S,Vn.4S,Vm.S[lane]` | `Vd.4S -> result` | `A64` | -| float32x2_t vmls_laneq_f32(
     float32x2_t a,
     float32x2_t b,
     float32x4_t v,
     const int lane)
| `0 <= lane <= 3` | `RESULT[I] = vsub(a[i], vmul(b[i], v[lane])) for i = 0 to 1`
`Final instruction sequence is implementation defined` | `N/A` | `A64` | -| float32x4_t vmlsq_laneq_f32(
     float32x4_t a,
     float32x4_t b,
     float32x4_t v,
     const int lane)
| `0 <= lane <= 3` | `RESULT[I] = vsub(a[i], vmul(b[i], v[lane])) for i = 0 to 3`
`Final instruction sequence is implementation defined` | `N/A` | `A64` | -| int32x4_t vmlsl_lane_s16(
     int32x4_t a,
     int16x4_t b,
     int16x4_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4H`
`v -> Vm.4H`
`0 <= lane <= 3` | `SMLSL Vd.4S,Vn.4H,Vm.H[lane]` | `Vd.4S -> result` | `v7/A32/A64` | -| int64x2_t vmlsl_lane_s32(
     int64x2_t a,
     int32x2_t b,
     int32x2_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.2S`
`v -> Vm.2S`
`0 <= lane <= 1` | `SMLSL Vd.2D,Vn.2S,Vm.S[lane]` | `Vd.2D -> result` | `v7/A32/A64` | -| uint32x4_t vmlsl_lane_u16(
     uint32x4_t a,
     uint16x4_t b,
     uint16x4_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4H`
`v -> Vm.4H`
`0 <= lane <= 3` | `UMLSL Vd.4S,Vn.4H,Vm.H[lane]` | `Vd.4S -> result` | `v7/A32/A64` | -| uint64x2_t vmlsl_lane_u32(
     uint64x2_t a,
     uint32x2_t b,
     uint32x2_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.2S`
`v -> Vm.2S`
`0 <= lane <= 1` | `UMLSL Vd.2D,Vn.2S,Vm.S[lane]` | `Vd.2D -> result` | `v7/A32/A64` | -| int32x4_t vmlsl_high_lane_s16(
     int32x4_t a,
     int16x8_t b,
     int16x4_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.8H`
`v -> Vm.4H`
`0 <= lane <= 3` | `SMLSL2 Vd.4S,Vn.8H,Vm.H[lane]` | `Vd.4S -> result` | `A64` | -| int64x2_t vmlsl_high_lane_s32(
     int64x2_t a,
     int32x4_t b,
     int32x2_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.4S`
`v -> Vm.2S`
`0 <= lane <= 1` | `SMLSL2 Vd.2D,Vn.4S,Vm.S[lane]` | `Vd.2D -> result` | `A64` | -| uint32x4_t vmlsl_high_lane_u16(
     uint32x4_t a,
     uint16x8_t b,
     uint16x4_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.8H`
`v -> Vm.4H`
`0 <= lane <= 3` | `UMLSL2 Vd.4S,Vn.8H,Vm.H[lane]` | `Vd.4S -> result` | `A64` | -| uint64x2_t vmlsl_high_lane_u32(
     uint64x2_t a,
     uint32x4_t b,
     uint32x2_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.4S`
`v -> Vm.2S`
`0 <= lane <= 1` | `UMLSL2 Vd.2D,Vn.4S,Vm.S[lane]` | `Vd.2D -> result` | `A64` | -| int32x4_t vmlsl_laneq_s16(
     int32x4_t a,
     int16x4_t b,
     int16x8_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4H`
`v -> Vm.8H`
`0 <= lane <= 7` | `SMLSL Vd.4S,Vn.4H,Vm.H[lane]` | `Vd.4S -> result` | `A64` | -| int64x2_t vmlsl_laneq_s32(
     int64x2_t a,
     int32x2_t b,
     int32x4_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.2S`
`v -> Vm.4S`
`0 <= lane <= 3` | `SMLSL Vd.2D,Vn.2S,Vm.S[lane]` | `Vd.2D -> result` | `A64` | -| uint32x4_t vmlsl_laneq_u16(
     uint32x4_t a,
     uint16x4_t b,
     uint16x8_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4H`
`v -> Vm.8H`
`0 <= lane <= 7` | `UMLSL Vd.4S,Vn.4H,Vm.H[lane]` | `Vd.4S -> result` | `A64` | -| uint64x2_t vmlsl_laneq_u32(
     uint64x2_t a,
     uint32x2_t b,
     uint32x4_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.2S`
`v -> Vm.4S`
`0 <= lane <= 3` | `UMLSL Vd.2D,Vn.2S,Vm.S[lane]` | `Vd.2D -> result` | `A64` | -| int32x4_t vmlsl_high_laneq_s16(
     int32x4_t a,
     int16x8_t b,
     int16x8_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.8H`
`v -> Vm.8H`
`0 <= lane <= 7` | `SMLSL2 Vd.4S,Vn.8H,Vm.H[lane]` | `Vd.4S -> result` | `A64` | -| int64x2_t vmlsl_high_laneq_s32(
     int64x2_t a,
     int32x4_t b,
     int32x4_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.4S`
`v -> Vm.4S`
`0 <= lane <= 3` | `SMLSL2 Vd.2D,Vn.4S,Vm.S[lane]` | `Vd.2D -> result` | `A64` | -| uint32x4_t vmlsl_high_laneq_u16(
     uint32x4_t a,
     uint16x8_t b,
     uint16x8_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.8H`
`v -> Vm.8H`
`0 <= lane <= 7` | `UMLSL2 Vd.4S,Vn.8H,Vm.H[lane]` | `Vd.4S -> result` | `A64` | -| uint64x2_t vmlsl_high_laneq_u32(
     uint64x2_t a,
     uint32x4_t b,
     uint32x4_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.4S`
`v -> Vm.4S`
`0 <= lane <= 3` | `UMLSL2 Vd.2D,Vn.4S,Vm.S[lane]` | `Vd.2D -> result` | `A64` | +| Intrinsic | Argument preparation | AArch64 Instruction | Result | Supported architectures | +|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------|--------------------------------------------------------------|-------------------|---------------------------| +| int16x4_t vmls_lane_s16(
     int16x4_t a,
     int16x4_t b,
     int16x4_t v,
     const int lane)
| `a -> Vd.4H`
`b -> Vn.4H`
`v -> Vm.4H`
`0 <= lane <= 3` | `MLS Vd.4H,Vn.4H,Vm.H[lane]` | `Vd.4H -> result` | `v7/A32/A64` | +| int16x8_t vmlsq_lane_s16(
     int16x8_t a,
     int16x8_t b,
     int16x4_t v,
     const int lane)
| `a -> Vd.8H`
`b -> Vn.8H`
`v -> Vm.4H`
`0 <= lane <= 3` | `MLS Vd.8H,Vn.8H,Vm.H[lane]` | `Vd.8H -> result` | `v7/A32/A64` | +| int32x2_t vmls_lane_s32(
     int32x2_t a,
     int32x2_t b,
     int32x2_t v,
     const int lane)
| `a -> Vd.2S`
`b -> Vn.2S`
`v -> Vm.2S`
`0 <= lane <= 1` | `MLS Vd.2S,Vn.2S,Vm.S[lane]` | `Vd.2S -> result` | `v7/A32/A64` | +| int32x4_t vmlsq_lane_s32(
     int32x4_t a,
     int32x4_t b,
     int32x2_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4S`
`v -> Vm.2S`
`0 <= lane <= 1` | `MLS Vd.4S,Vn.4S,Vm.S[lane]` | `Vd.4S -> result` | `v7/A32/A64` | +| uint16x4_t vmls_lane_u16(
     uint16x4_t a,
     uint16x4_t b,
     uint16x4_t v,
     const int lane)
| `a -> Vd.4H`
`b -> Vn.4H`
`v -> Vm.4H`
`0 <= lane <= 3` | `MLS Vd.4H,Vn.4H,Vm.H[lane]` | `Vd.4H -> result` | `v7/A32/A64` | +| uint16x8_t vmlsq_lane_u16(
     uint16x8_t a,
     uint16x8_t b,
     uint16x4_t v,
     const int lane)
| `a -> Vd.8H`
`b -> Vn.8H`
`v -> Vm.4H`
`0 <= lane <= 3` | `MLS Vd.8H,Vn.8H,Vm.H[lane]` | `Vd.8H -> result` | `v7/A32/A64` | +| uint32x2_t vmls_lane_u32(
     uint32x2_t a,
     uint32x2_t b,
     uint32x2_t v,
     const int lane)
| `a -> Vd.2S`
`b -> Vn.2S`
`v -> Vm.2S`
`0 <= lane <= 1` | `MLS Vd.2S,Vn.2S,Vm.S[lane]` | `Vd.2S -> result` | `v7/A32/A64` | +| uint32x4_t vmlsq_lane_u32(
     uint32x4_t a,
     uint32x4_t b,
     uint32x2_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4S`
`v -> Vm.2S`
`0 <= lane <= 1` | `MLS Vd.4S,Vn.4S,Vm.S[lane]` | `Vd.4S -> result` | `v7/A32/A64` | +| float32x2_t vmls_lane_f32(
     float32x2_t a,
     float32x2_t b,
     float32x2_t v,
     const int lane)
| `0 <= lane <= 1` | `RESULT[I] = vsub(a[i], vmul(b[i], v[lane])) for i = 0 to 1` | `N/A` | `v7/A32/A64` | +| float32x4_t vmlsq_lane_f32(
     float32x4_t a,
     float32x4_t b,
     float32x2_t v,
     const int lane)
| `0 <= lane <= 1` | `RESULT[I] = vsub(a[i], vmul(b[i], v[lane])) for i = 0 to 3` | `N/A` | `v7/A32/A64` | +| int16x4_t vmls_laneq_s16(
     int16x4_t a,
     int16x4_t b,
     int16x8_t v,
     const int lane)
| `a -> Vd.4H`
`b -> Vn.4H`
`v -> Vm.8H`
`0 <= lane <= 7` | `MLS Vd.4H,Vn.4H,Vm.H[lane]` | `Vd.4H -> result` | `A64` | +| int16x8_t vmlsq_laneq_s16(
     int16x8_t a,
     int16x8_t b,
     int16x8_t v,
     const int lane)
| `a -> Vd.8H`
`b -> Vn.8H`
`v -> Vm.8H`
`0 <= lane <= 7` | `MLS Vd.8H,Vn.8H,Vm.H[lane]` | `Vd.8H -> result` | `A64` | +| int32x2_t vmls_laneq_s32(
     int32x2_t a,
     int32x2_t b,
     int32x4_t v,
     const int lane)
| `a -> Vd.2S`
`b -> Vn.2S`
`v -> Vm.4S`
`0 <= lane <= 3` | `MLS Vd.2S,Vn.2S,Vm.S[lane]` | `Vd.2S -> result` | `A64` | +| int32x4_t vmlsq_laneq_s32(
     int32x4_t a,
     int32x4_t b,
     int32x4_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4S`
`v -> Vm.4S`
`0 <= lane <= 3` | `MLS Vd.4S,Vn.4S,Vm.S[lane]` | `Vd.4S -> result` | `A64` | +| uint16x4_t vmls_laneq_u16(
     uint16x4_t a,
     uint16x4_t b,
     uint16x8_t v,
     const int lane)
| `a -> Vd.4H`
`b -> Vn.4H`
`v -> Vm.8H`
`0 <= lane <= 7` | `MLS Vd.4H,Vn.4H,Vm.H[lane]` | `Vd.4H -> result` | `A64` | +| uint16x8_t vmlsq_laneq_u16(
     uint16x8_t a,
     uint16x8_t b,
     uint16x8_t v,
     const int lane)
| `a -> Vd.8H`
`b -> Vn.8H`
`v -> Vm.8H`
`0 <= lane <= 7` | `MLS Vd.8H,Vn.8H,Vm.H[lane]` | `Vd.8H -> result` | `A64` | +| uint32x2_t vmls_laneq_u32(
     uint32x2_t a,
     uint32x2_t b,
     uint32x4_t v,
     const int lane)
| `a -> Vd.2S`
`b -> Vn.2S`
`v -> Vm.4S`
`0 <= lane <= 3` | `MLS Vd.2S,Vn.2S,Vm.S[lane]` | `Vd.2S -> result` | `A64` | +| uint32x4_t vmlsq_laneq_u32(
     uint32x4_t a,
     uint32x4_t b,
     uint32x4_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4S`
`v -> Vm.4S`
`0 <= lane <= 3` | `MLS Vd.4S,Vn.4S,Vm.S[lane]` | `Vd.4S -> result` | `A64` | +| float32x2_t vmls_laneq_f32(
     float32x2_t a,
     float32x2_t b,
     float32x4_t v,
     const int lane)
| `0 <= lane <= 3` | `RESULT[I] = vsub(a[i], vmul(b[i], v[lane])) for i = 0 to 1` | `N/A` | `A64` | +| float32x4_t vmlsq_laneq_f32(
     float32x4_t a,
     float32x4_t b,
     float32x4_t v,
     const int lane)
| `0 <= lane <= 3` | `RESULT[I] = vsub(a[i], vmul(b[i], v[lane])) for i = 0 to 3` | `N/A` | `A64` | +| int32x4_t vmlsl_lane_s16(
     int32x4_t a,
     int16x4_t b,
     int16x4_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4H`
`v -> Vm.4H`
`0 <= lane <= 3` | `SMLSL Vd.4S,Vn.4H,Vm.H[lane]` | `Vd.4S -> result` | `v7/A32/A64` | +| int64x2_t vmlsl_lane_s32(
     int64x2_t a,
     int32x2_t b,
     int32x2_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.2S`
`v -> Vm.2S`
`0 <= lane <= 1` | `SMLSL Vd.2D,Vn.2S,Vm.S[lane]` | `Vd.2D -> result` | `v7/A32/A64` | +| uint32x4_t vmlsl_lane_u16(
     uint32x4_t a,
     uint16x4_t b,
     uint16x4_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4H`
`v -> Vm.4H`
`0 <= lane <= 3` | `UMLSL Vd.4S,Vn.4H,Vm.H[lane]` | `Vd.4S -> result` | `v7/A32/A64` | +| uint64x2_t vmlsl_lane_u32(
     uint64x2_t a,
     uint32x2_t b,
     uint32x2_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.2S`
`v -> Vm.2S`
`0 <= lane <= 1` | `UMLSL Vd.2D,Vn.2S,Vm.S[lane]` | `Vd.2D -> result` | `v7/A32/A64` | +| int32x4_t vmlsl_high_lane_s16(
     int32x4_t a,
     int16x8_t b,
     int16x4_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.8H`
`v -> Vm.4H`
`0 <= lane <= 3` | `SMLSL2 Vd.4S,Vn.8H,Vm.H[lane]` | `Vd.4S -> result` | `A64` | +| int64x2_t vmlsl_high_lane_s32(
     int64x2_t a,
     int32x4_t b,
     int32x2_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.4S`
`v -> Vm.2S`
`0 <= lane <= 1` | `SMLSL2 Vd.2D,Vn.4S,Vm.S[lane]` | `Vd.2D -> result` | `A64` | +| uint32x4_t vmlsl_high_lane_u16(
     uint32x4_t a,
     uint16x8_t b,
     uint16x4_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.8H`
`v -> Vm.4H`
`0 <= lane <= 3` | `UMLSL2 Vd.4S,Vn.8H,Vm.H[lane]` | `Vd.4S -> result` | `A64` | +| uint64x2_t vmlsl_high_lane_u32(
     uint64x2_t a,
     uint32x4_t b,
     uint32x2_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.4S`
`v -> Vm.2S`
`0 <= lane <= 1` | `UMLSL2 Vd.2D,Vn.4S,Vm.S[lane]` | `Vd.2D -> result` | `A64` | +| int32x4_t vmlsl_laneq_s16(
     int32x4_t a,
     int16x4_t b,
     int16x8_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4H`
`v -> Vm.8H`
`0 <= lane <= 7` | `SMLSL Vd.4S,Vn.4H,Vm.H[lane]` | `Vd.4S -> result` | `A64` | +| int64x2_t vmlsl_laneq_s32(
     int64x2_t a,
     int32x2_t b,
     int32x4_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.2S`
`v -> Vm.4S`
`0 <= lane <= 3` | `SMLSL Vd.2D,Vn.2S,Vm.S[lane]` | `Vd.2D -> result` | `A64` | +| uint32x4_t vmlsl_laneq_u16(
     uint32x4_t a,
     uint16x4_t b,
     uint16x8_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.4H`
`v -> Vm.8H`
`0 <= lane <= 7` | `UMLSL Vd.4S,Vn.4H,Vm.H[lane]` | `Vd.4S -> result` | `A64` | +| uint64x2_t vmlsl_laneq_u32(
     uint64x2_t a,
     uint32x2_t b,
     uint32x4_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.2S`
`v -> Vm.4S`
`0 <= lane <= 3` | `UMLSL Vd.2D,Vn.2S,Vm.S[lane]` | `Vd.2D -> result` | `A64` | +| int32x4_t vmlsl_high_laneq_s16(
     int32x4_t a,
     int16x8_t b,
     int16x8_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.8H`
`v -> Vm.8H`
`0 <= lane <= 7` | `SMLSL2 Vd.4S,Vn.8H,Vm.H[lane]` | `Vd.4S -> result` | `A64` | +| int64x2_t vmlsl_high_laneq_s32(
     int64x2_t a,
     int32x4_t b,
     int32x4_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.4S`
`v -> Vm.4S`
`0 <= lane <= 3` | `SMLSL2 Vd.2D,Vn.4S,Vm.S[lane]` | `Vd.2D -> result` | `A64` | +| uint32x4_t vmlsl_high_laneq_u16(
     uint32x4_t a,
     uint16x8_t b,
     uint16x8_t v,
     const int lane)
| `a -> Vd.4S`
`b -> Vn.8H`
`v -> Vm.8H`
`0 <= lane <= 7` | `UMLSL2 Vd.4S,Vn.8H,Vm.H[lane]` | `Vd.4S -> result` | `A64` | +| uint64x2_t vmlsl_high_laneq_u32(
     uint64x2_t a,
     uint32x4_t b,
     uint32x4_t v,
     const int lane)
| `a -> Vd.2D`
`b -> Vn.4S`
`v -> Vm.4S`
`0 <= lane <= 3` | `UMLSL2 Vd.2D,Vn.4S,Vm.S[lane]` | `Vd.2D -> result` | `A64` | #### Vector multiply by scalar @@ -2829,34 +2829,34 @@ The intrinsics in this section are guarded by the macro ``__ARM_NEON``. #### Vector multiply-accumulate by scalar and widen -| Intrinsic | Argument preparation | AArch64 Instruction | Result | Supported architectures | -|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------|------------------------------------------------------------------------------------------------------------------|-------------------|---------------------------| -| int32x4_t vmlal_n_s16(
     int32x4_t a,
     int16x4_t b,
     int16_t c)
| `a -> Vd.4S`
`b -> Vn.4H`
`c -> Vm.H[0]` | `SMLAL Vd.4S,Vn.4H,Vm.H[0]` | `Vd.4S -> result` | `v7/A32/A64` | -| int64x2_t vmlal_n_s32(
     int64x2_t a,
     int32x2_t b,
     int32_t c)
| `a -> Vd.2D`
`b -> Vn.2S`
`c -> Vm.S[0]` | `SMLAL Vd.2D,Vn.2S,Vm.S[0]` | `Vd.2D -> result` | `v7/A32/A64` | -| uint32x4_t vmlal_n_u16(
     uint32x4_t a,
     uint16x4_t b,
     uint16_t c)
| `a -> Vd.4S`
`b -> Vn.4H`
`c -> Vm.H[0]` | `UMLAL Vd.4S,Vn.4H,Vm.H[0]` | `Vd.4S -> result` | `v7/A32/A64` | -| uint64x2_t vmlal_n_u32(
     uint64x2_t a,
     uint32x2_t b,
     uint32_t c)
| `a -> Vd.2D`
`b -> Vn.2S`
`c -> Vm.S[0]` | `UMLAL Vd.2D,Vn.2S,Vm.S[0]` | `Vd.2D -> result` | `v7/A32/A64` | -| int32x4_t vmlal_high_n_s16(
     int32x4_t a,
     int16x8_t b,
     int16_t c)
| `a -> Vd.4S`
`b -> Vn.8H`
`c -> Vm.H[0]` | `SMLAL2 Vd.4S,Vn.8H,Vm.H[0]` | `Vd.4S -> result` | `A64` | -| int64x2_t vmlal_high_n_s32(
     int64x2_t a,
     int32x4_t b,
     int32_t c)
| `a -> Vd.2D`
`b -> Vn.4S`
`c -> Vm.S[0]` | `SMLAL2 Vd.2D,Vn.4S,Vm.S[0]` | `Vd.2D -> result` | `A64` | -| uint32x4_t vmlal_high_n_u16(
     uint32x4_t a,
     uint16x8_t b,
     uint16_t c)
| `a -> Vd.4S`
`b -> Vn.8H`
`c -> Vm.H[0]` | `UMLAL2 Vd.4S,Vn.8H,Vm.H[0]` | `Vd.4S -> result` | `A64` | -| uint64x2_t vmlal_high_n_u32(
     uint64x2_t a,
     uint32x4_t b,
     uint32_t c)
| `a -> Vd.2D`
`b -> Vn.4S`
`c -> Vm.S[0]` | `UMLAL2 Vd.2D,Vn.4S,Vm.S[0]` | `Vd.2D -> result` | `A64` | -| int16x4_t vmls_n_s16(
     int16x4_t a,
     int16x4_t b,
     int16_t c)
| `a -> Vd.4H`
`b -> Vn.4H`
`c -> Vm.H[0]` | `MLS Vd.4H,Vn.4H,Vm.H[0]` | `Vd.4H -> result` | `v7/A32/A64` | -| int16x8_t vmlsq_n_s16(
     int16x8_t a,
     int16x8_t b,
     int16_t c)
| `a -> Vd.8H`
`b -> Vn.8H`
`c -> Vm.H[0]` | `MLS Vd.8H,Vn.8H,Vm.H[0]` | `Vd.8H -> result` | `v7/A32/A64` | -| int32x2_t vmls_n_s32(
     int32x2_t a,
     int32x2_t b,
     int32_t c)
| `a -> Vd.2S`
`b -> Vn.2S`
`c -> Vm.S[0]` | `MLS Vd.2S,Vn.2S,Vm.S[0]` | `Vd.2S -> result` | `v7/A32/A64` | -| int32x4_t vmlsq_n_s32(
     int32x4_t a,
     int32x4_t b,
     int32_t c)
| `a -> Vd.4S`
`b -> Vn.4S`
`c -> Vm.S[0]` | `MLS Vd.4S,Vn.4S,Vm.S[0]` | `Vd.4S -> result` | `v7/A32/A64` | -| uint16x4_t vmls_n_u16(
     uint16x4_t a,
     uint16x4_t b,
     uint16_t c)
| `a -> Vd.4H`
`b -> Vn.4H`
`c -> Vm.H[0]` | `MLS Vd.4H,Vn.4H,Vm.H[0]` | `Vd.4H -> result` | `v7/A32/A64` | -| uint16x8_t vmlsq_n_u16(
     uint16x8_t a,
     uint16x8_t b,
     uint16_t c)
| `a -> Vd.8H`
`b -> Vn.8H`
`c -> Vm.H[0]` | `MLS Vd.8H,Vn.8H,Vm.H[0]` | `Vd.8H -> result` | `v7/A32/A64` | -| uint32x2_t vmls_n_u32(
     uint32x2_t a,
     uint32x2_t b,
     uint32_t c)
| `a -> Vd.2S`
`b -> Vn.2S`
`c -> Vm.S[0]` | `MLS Vd.2S,Vn.2S,Vm.S[0]` | `Vd.2S -> result` | `v7/A32/A64` | -| uint32x4_t vmlsq_n_u32(
     uint32x4_t a,
     uint32x4_t b,
     uint32_t c)
| `a -> Vd.4S`
`b -> Vn.4S`
`c -> Vm.S[0]` | `MLS Vd.4S,Vn.4S,Vm.S[0]` | `Vd.4S -> result` | `v7/A32/A64` | -| float32x2_t vmls_n_f32(
     float32x2_t a,
     float32x2_t b,
     float32_t c)
| `N/A` | `RESULT[I] = vsub(a[i], vmul(b[i], c)) for i = 0 to 1`
`Final instruction sequence is implementation defined` | `N/A` | `v7/A32/A64` | -| float32x4_t vmlsq_n_f32(
     float32x4_t a,
     float32x4_t b,
     float32_t c)
| `N/A` | `RESULT[I] = vsub(a[i], vmul(b[i], c)) for i = 0 to 3`
`Final instruction sequence is implementation defined` | `N/A` | `v7/A32/A64` | -| int32x4_t vmlsl_n_s16(
     int32x4_t a,
     int16x4_t b,
     int16_t c)
| `a -> Vd.4S`
`b -> Vn.4H`
`c -> Vm.H[0]` | `SMLSL Vd.4S,Vn.4H,Vm.H[0]` | `Vd.4S -> result` | `v7/A32/A64` | -| int64x2_t vmlsl_n_s32(
     int64x2_t a,
     int32x2_t b,
     int32_t c)
| `a -> Vd.2D`
`b -> Vn.2S`
`c -> Vm.S[0]` | `SMLSL Vd.2D,Vn.2S,Vm.S[0]` | `Vd.2D -> result` | `v7/A32/A64` | -| uint32x4_t vmlsl_n_u16(
     uint32x4_t a,
     uint16x4_t b,
     uint16_t c)
| `a -> Vd.4S`
`b -> Vn.4H`
`c -> Vm.H[0]` | `UMLSL Vd.4S,Vn.4H,Vm.H[0]` | `Vd.4S -> result` | `v7/A32/A64` | -| uint64x2_t vmlsl_n_u32(
     uint64x2_t a,
     uint32x2_t b,
     uint32_t c)
| `a -> Vd.2D`
`b -> Vn.2S`
`c -> Vm.S[0]` | `UMLSL Vd.2D,Vn.2S,Vm.S[0]` | `Vd.2D -> result` | `v7/A32/A64` | -| int32x4_t vmlsl_high_n_s16(
     int32x4_t a,
     int16x8_t b,
     int16_t c)
| `a -> Vd.4S`
`b -> Vn.8H`
`c -> Vm.H[0]` | `SMLSL2 Vd.4S,Vn.8H,Vm.H[0]` | `Vd.4S -> result` | `A64` | -| int64x2_t vmlsl_high_n_s32(
     int64x2_t a,
     int32x4_t b,
     int32_t c)
| `a -> Vd.2D`
`b -> Vn.4S`
`c -> Vm.S[0]` | `SMLSL2 Vd.2D,Vn.4S,Vm.S[0]` | `Vd.2D -> result` | `A64` | -| uint32x4_t vmlsl_high_n_u16(
     uint32x4_t a,
     uint16x8_t b,
     uint16_t c)
| `a -> Vd.4S`
`b -> Vn.8H`
`c -> Vm.H[0]` | `UMLSL2 Vd.4S,Vn.8H,Vm.H[0]` | `Vd.4S -> result` | `A64` | -| uint64x2_t vmlsl_high_n_u32(
     uint64x2_t a,
     uint32x4_t b,
     uint32_t c)
| `a -> Vd.2D`
`b -> Vn.4S`
`c -> Vm.S[0]` | `UMLSL2 Vd.2D,Vn.4S,Vm.S[0]` | `Vd.2D -> result` | `A64` | +| Intrinsic | Argument preparation | AArch64 Instruction | Result | Supported architectures | +|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------|--------------------------------------------------------|-------------------|---------------------------| +| int32x4_t vmlal_n_s16(
     int32x4_t a,
     int16x4_t b,
     int16_t c)
| `a -> Vd.4S`
`b -> Vn.4H`
`c -> Vm.H[0]` | `SMLAL Vd.4S,Vn.4H,Vm.H[0]` | `Vd.4S -> result` | `v7/A32/A64` | +| int64x2_t vmlal_n_s32(
     int64x2_t a,
     int32x2_t b,
     int32_t c)
| `a -> Vd.2D`
`b -> Vn.2S`
`c -> Vm.S[0]` | `SMLAL Vd.2D,Vn.2S,Vm.S[0]` | `Vd.2D -> result` | `v7/A32/A64` | +| uint32x4_t vmlal_n_u16(
     uint32x4_t a,
     uint16x4_t b,
     uint16_t c)
| `a -> Vd.4S`
`b -> Vn.4H`
`c -> Vm.H[0]` | `UMLAL Vd.4S,Vn.4H,Vm.H[0]` | `Vd.4S -> result` | `v7/A32/A64` | +| uint64x2_t vmlal_n_u32(
     uint64x2_t a,
     uint32x2_t b,
     uint32_t c)
| `a -> Vd.2D`
`b -> Vn.2S`
`c -> Vm.S[0]` | `UMLAL Vd.2D,Vn.2S,Vm.S[0]` | `Vd.2D -> result` | `v7/A32/A64` | +| int32x4_t vmlal_high_n_s16(
     int32x4_t a,
     int16x8_t b,
     int16_t c)
| `a -> Vd.4S`
`b -> Vn.8H`
`c -> Vm.H[0]` | `SMLAL2 Vd.4S,Vn.8H,Vm.H[0]` | `Vd.4S -> result` | `A64` | +| int64x2_t vmlal_high_n_s32(
     int64x2_t a,
     int32x4_t b,
     int32_t c)
| `a -> Vd.2D`
`b -> Vn.4S`
`c -> Vm.S[0]` | `SMLAL2 Vd.2D,Vn.4S,Vm.S[0]` | `Vd.2D -> result` | `A64` | +| uint32x4_t vmlal_high_n_u16(
     uint32x4_t a,
     uint16x8_t b,
     uint16_t c)
| `a -> Vd.4S`
`b -> Vn.8H`
`c -> Vm.H[0]` | `UMLAL2 Vd.4S,Vn.8H,Vm.H[0]` | `Vd.4S -> result` | `A64` | +| uint64x2_t vmlal_high_n_u32(
     uint64x2_t a,
     uint32x4_t b,
     uint32_t c)
| `a -> Vd.2D`
`b -> Vn.4S`
`c -> Vm.S[0]` | `UMLAL2 Vd.2D,Vn.4S,Vm.S[0]` | `Vd.2D -> result` | `A64` | +| int16x4_t vmls_n_s16(
     int16x4_t a,
     int16x4_t b,
     int16_t c)
| `a -> Vd.4H`
`b -> Vn.4H`
`c -> Vm.H[0]` | `MLS Vd.4H,Vn.4H,Vm.H[0]` | `Vd.4H -> result` | `v7/A32/A64` | +| int16x8_t vmlsq_n_s16(
     int16x8_t a,
     int16x8_t b,
     int16_t c)
| `a -> Vd.8H`
`b -> Vn.8H`
`c -> Vm.H[0]` | `MLS Vd.8H,Vn.8H,Vm.H[0]` | `Vd.8H -> result` | `v7/A32/A64` | +| int32x2_t vmls_n_s32(
     int32x2_t a,
     int32x2_t b,
     int32_t c)
| `a -> Vd.2S`
`b -> Vn.2S`
`c -> Vm.S[0]` | `MLS Vd.2S,Vn.2S,Vm.S[0]` | `Vd.2S -> result` | `v7/A32/A64` | +| int32x4_t vmlsq_n_s32(
     int32x4_t a,
     int32x4_t b,
     int32_t c)
| `a -> Vd.4S`
`b -> Vn.4S`
`c -> Vm.S[0]` | `MLS Vd.4S,Vn.4S,Vm.S[0]` | `Vd.4S -> result` | `v7/A32/A64` | +| uint16x4_t vmls_n_u16(
     uint16x4_t a,
     uint16x4_t b,
     uint16_t c)
| `a -> Vd.4H`
`b -> Vn.4H`
`c -> Vm.H[0]` | `MLS Vd.4H,Vn.4H,Vm.H[0]` | `Vd.4H -> result` | `v7/A32/A64` | +| uint16x8_t vmlsq_n_u16(
     uint16x8_t a,
     uint16x8_t b,
     uint16_t c)
| `a -> Vd.8H`
`b -> Vn.8H`
`c -> Vm.H[0]` | `MLS Vd.8H,Vn.8H,Vm.H[0]` | `Vd.8H -> result` | `v7/A32/A64` | +| uint32x2_t vmls_n_u32(
     uint32x2_t a,
     uint32x2_t b,
     uint32_t c)
| `a -> Vd.2S`
`b -> Vn.2S`
`c -> Vm.S[0]` | `MLS Vd.2S,Vn.2S,Vm.S[0]` | `Vd.2S -> result` | `v7/A32/A64` | +| uint32x4_t vmlsq_n_u32(
     uint32x4_t a,
     uint32x4_t b,
     uint32_t c)
| `a -> Vd.4S`
`b -> Vn.4S`
`c -> Vm.S[0]` | `MLS Vd.4S,Vn.4S,Vm.S[0]` | `Vd.4S -> result` | `v7/A32/A64` | +| float32x2_t vmls_n_f32(
     float32x2_t a,
     float32x2_t b,
     float32_t c)
| `N/A` | `RESULT[I] = vsub(a[i], vmul(b[i], c)) for i = 0 to 1` | `N/A` | `v7/A32/A64` | +| float32x4_t vmlsq_n_f32(
     float32x4_t a,
     float32x4_t b,
     float32_t c)
| `N/A` | `RESULT[I] = vsub(a[i], vmul(b[i], c)) for i = 0 to 3` | `N/A` | `v7/A32/A64` | +| int32x4_t vmlsl_n_s16(
     int32x4_t a,
     int16x4_t b,
     int16_t c)
| `a -> Vd.4S`
`b -> Vn.4H`
`c -> Vm.H[0]` | `SMLSL Vd.4S,Vn.4H,Vm.H[0]` | `Vd.4S -> result` | `v7/A32/A64` | +| int64x2_t vmlsl_n_s32(
     int64x2_t a,
     int32x2_t b,
     int32_t c)
| `a -> Vd.2D`
`b -> Vn.2S`
`c -> Vm.S[0]` | `SMLSL Vd.2D,Vn.2S,Vm.S[0]` | `Vd.2D -> result` | `v7/A32/A64` | +| uint32x4_t vmlsl_n_u16(
     uint32x4_t a,
     uint16x4_t b,
     uint16_t c)
| `a -> Vd.4S`
`b -> Vn.4H`
`c -> Vm.H[0]` | `UMLSL Vd.4S,Vn.4H,Vm.H[0]` | `Vd.4S -> result` | `v7/A32/A64` | +| uint64x2_t vmlsl_n_u32(
     uint64x2_t a,
     uint32x2_t b,
     uint32_t c)
| `a -> Vd.2D`
`b -> Vn.2S`
`c -> Vm.S[0]` | `UMLSL Vd.2D,Vn.2S,Vm.S[0]` | `Vd.2D -> result` | `v7/A32/A64` | +| int32x4_t vmlsl_high_n_s16(
     int32x4_t a,
     int16x8_t b,
     int16_t c)
| `a -> Vd.4S`
`b -> Vn.8H`
`c -> Vm.H[0]` | `SMLSL2 Vd.4S,Vn.8H,Vm.H[0]` | `Vd.4S -> result` | `A64` | +| int64x2_t vmlsl_high_n_s32(
     int64x2_t a,
     int32x4_t b,
     int32_t c)
| `a -> Vd.2D`
`b -> Vn.4S`
`c -> Vm.S[0]` | `SMLSL2 Vd.2D,Vn.4S,Vm.S[0]` | `Vd.2D -> result` | `A64` | +| uint32x4_t vmlsl_high_n_u16(
     uint32x4_t a,
     uint16x8_t b,
     uint16_t c)
| `a -> Vd.4S`
`b -> Vn.8H`
`c -> Vm.H[0]` | `UMLSL2 Vd.4S,Vn.8H,Vm.H[0]` | `Vd.4S -> result` | `A64` | +| uint64x2_t vmlsl_high_n_u32(
     uint64x2_t a,
     uint32x4_t b,
     uint32_t c)
| `a -> Vd.2D`
`b -> Vn.4S`
`c -> Vm.S[0]` | `UMLSL2 Vd.2D,Vn.4S,Vm.S[0]` | `Vd.2D -> result` | `A64` | #### Fused multiply-accumulate by scalar diff --git a/tools/intrinsic_db/advsimd.csv b/tools/intrinsic_db/advsimd.csv index 53681581..4fb14cde 100644 --- a/tools/intrinsic_db/advsimd.csv +++ b/tools/intrinsic_db/advsimd.csv @@ -209,10 +209,10 @@ uint16x4_t vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) a -> Vd.4H;b -> Vn uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) a -> Vd.8H;b -> Vn.8H;c -> Vm.8H MLA Vd.8H,Vn.8H,Vm.8H Vd.8H -> result v7/A32/A64 uint32x2_t vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) a -> Vd.2S;b -> Vn.2S;c -> Vm.2S MLA Vd.2S,Vn.2S,Vm.2S Vd.2S -> result v7/A32/A64 uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) a -> Vd.4S;b -> Vn.4S;c -> Vm.4S MLA Vd.4S,Vn.4S,Vm.4S Vd.4S -> result v7/A32/A64 -float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c) N/A RESULT[I] = vadd(a[i], vmul(b[i], c[i])) for i = 0 to 1;Final instruction sequence is implementation defined N/A v7/A32/A64 -float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) N/A RESULT[I] = vadd(a[i], vmul(b[i], c[i])) for i = 0 to 3;Final instruction sequence is implementation defined N/A v7/A32/A64 -float64x1_t vmla_f64(float64x1_t a, float64x1_t b, float64x1_t c) N/A RESULT[I] = vadd(a[i], vmul(b[i], c[i])) for i = 0;Final instruction sequence is implementation defined N/A A64 -float64x2_t vmlaq_f64(float64x2_t a, float64x2_t b, float64x2_t c) N/A RESULT[I] = vadd(a[i], vmul(b[i], c[i])) for i = 0 to 1;Final instruction sequence is implementation defined N/A A64 +float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c) N/A RESULT[I] = vadd(a[i], vmul(b[i], c[i])) for i = 0 to 1 N/A v7/A32/A64 +float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) N/A RESULT[I] = vadd(a[i], vmul(b[i], c[i])) for i = 0 to 3 N/A v7/A32/A64 +float64x1_t vmla_f64(float64x1_t a, float64x1_t b, float64x1_t c) N/A RESULT[I] = vadd(a[i], vmul(b[i], c[i])) 
for i = 0 N/A A64 +float64x2_t vmlaq_f64(float64x2_t a, float64x2_t b, float64x2_t c) N/A RESULT[I] = vadd(a[i], vmul(b[i], c[i])) for i = 0 to 1 N/A A64 int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) a -> Vd.8H;b -> Vn.8B;c -> Vm.8B SMLAL Vd.8H,Vn.8B,Vm.8B Vd.8H -> result v7/A32/A64 int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) a -> Vd.4S;b -> Vn.4H;c -> Vm.4H SMLAL Vd.4S,Vn.4H,Vm.4H Vd.4S -> result v7/A32/A64 int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) a -> Vd.2D;b -> Vn.2S;c -> Vm.2S SMLAL Vd.2D,Vn.2S,Vm.2S Vd.2D -> result v7/A32/A64 @@ -237,10 +237,10 @@ uint16x4_t vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) a -> Vd.4H;b -> Vn uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) a -> Vd.8H;b -> Vn.8H;c -> Vm.8H MLS Vd.8H,Vn.8H,Vm.8H Vd.8H -> result v7/A32/A64 uint32x2_t vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) a -> Vd.2S;b -> Vn.2S;c -> Vm.2S MLS Vd.2S,Vn.2S,Vm.2S Vd.2S -> result v7/A32/A64 uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) a -> Vd.4S;b -> Vn.4S;c -> Vm.4S MLS Vd.4S,Vn.4S,Vm.4S Vd.4S -> result v7/A32/A64 -float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c) N/A RESULT[I] = vsub(a[i], vmul(b[i], c[i])) for i = 0 to 1;Final instruction sequence is implementation defined N/A v7/A32/A64 -float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) N/A RESULT[I] = vsub(a[i], vmul(b[i], c[i])) for i = 0 to 3;Final instruction sequence is implementation defined N/A v7/A32/A64 -float64x1_t vmls_f64(float64x1_t a, float64x1_t b, float64x1_t c) N/A RESULT[I] = vsub(a[i], vmul(b[i], c[i])) for i = 0;Final instruction sequence is implementation defined N/A A64 -float64x2_t vmlsq_f64(float64x2_t a, float64x2_t b, float64x2_t c) N/A RESULT[I] = vsub(a[i], vmul(b[i], c[i])) for i = 0 to 1;Final instruction sequence is implementation defined N/A A64 +float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c) N/A RESULT[I] = vsub(a[i], vmul(b[i], c[i])) for i = 
0 to 1 N/A v7/A32/A64 +float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) N/A RESULT[I] = vsub(a[i], vmul(b[i], c[i])) for i = 0 to 3 N/A v7/A32/A64 +float64x1_t vmls_f64(float64x1_t a, float64x1_t b, float64x1_t c) N/A RESULT[I] = vsub(a[i], vmul(b[i], c[i])) for i = 0 N/A A64 +float64x2_t vmlsq_f64(float64x2_t a, float64x2_t b, float64x2_t c) N/A RESULT[I] = vsub(a[i], vmul(b[i], c[i])) for i = 0 to 1 N/A A64 int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) a -> Vd.8H;b -> Vn.8B;c -> Vm.8B SMLSL Vd.8H,Vn.8B,Vm.8B Vd.8H -> result v7/A32/A64 int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) a -> Vd.4S;b -> Vn.4H;c -> Vm.4H SMLSL Vd.4S,Vn.4H,Vm.4H Vd.4S -> result v7/A32/A64 int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) a -> Vd.2D;b -> Vn.2S;c -> Vm.2S SMLSL Vd.2D,Vn.2S,Vm.2S Vd.2D -> result v7/A32/A64 @@ -1342,8 +1342,8 @@ uint16x4_t vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __builtin_con uint16x8_t vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __builtin_constant_p(lane)) a -> Vd.8H;b -> Vn.8H;v -> Vm.4H;0 <= lane <= 3 MLA Vd.8H,Vn.8H,Vm.H[lane] Vd.8H -> result v7/A32/A64 uint32x2_t vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __builtin_constant_p(lane)) a -> Vd.2S;b -> Vn.2S;v -> Vm.2S;0 <= lane <= 1 MLA Vd.2S,Vn.2S,Vm.S[lane] Vd.2S -> result v7/A32/A64 uint32x4_t vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __builtin_constant_p(lane)) a -> Vd.4S;b -> Vn.4S;v -> Vm.2S;0 <= lane <= 1 MLA Vd.4S,Vn.4S,Vm.S[lane] Vd.4S -> result v7/A32/A64 -float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __builtin_constant_p(lane)) 0 <= lane <= 1 RESULT[I] = vadd(a[i], vmul(b[i], v[lane])) for i = 0 to 1;Final instruction sequence is implementation defined N/A v7/A32/A64 -float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __builtin_constant_p(lane)) 0 <= lane <= 1 RESULT[I] = vadd(a[i], vmul(b[i], v[lane])) for i = 0 to 3;Final instruction sequence is 
implementation defined N/A v7/A32/A64 +float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __builtin_constant_p(lane)) 0 <= lane <= 1 RESULT[I] = vadd(a[i], vmul(b[i], v[lane])) for i = 0 to 1 N/A v7/A32/A64 +float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __builtin_constant_p(lane)) 0 <= lane <= 1 RESULT[I] = vadd(a[i], vmul(b[i], v[lane])) for i = 0 to 3 N/A v7/A32/A64 int16x4_t vmla_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v, __builtin_constant_p(lane)) a -> Vd.4H;b -> Vn.4H;v -> Vm.8H;0 <= lane <= 7 MLA Vd.4H,Vn.4H,Vm.H[lane] Vd.4H -> result A64 int16x8_t vmlaq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v, __builtin_constant_p(lane)) a -> Vd.8H;b -> Vn.8H;v -> Vm.8H;0 <= lane <= 7 MLA Vd.8H,Vn.8H,Vm.H[lane] Vd.8H -> result A64 int32x2_t vmla_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v, __builtin_constant_p(lane)) a -> Vd.2S;b -> Vn.2S;v -> Vm.4S;0 <= lane <= 3 MLA Vd.2S,Vn.2S,Vm.S[lane] Vd.2S -> result A64 @@ -1352,8 +1352,8 @@ uint16x4_t vmla_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v, __builtin_co uint16x8_t vmlaq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v, __builtin_constant_p(lane)) a -> Vd.8H;b -> Vn.8H;v -> Vm.8H;0 <= lane <= 7 MLA Vd.8H,Vn.8H,Vm.H[lane] Vd.8H -> result A64 uint32x2_t vmla_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v, __builtin_constant_p(lane)) a -> Vd.2S;b -> Vn.2S;v -> Vm.4S;0 <= lane <= 3 MLA Vd.2S,Vn.2S,Vm.S[lane] Vd.2S -> result A64 uint32x4_t vmlaq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v, __builtin_constant_p(lane)) a -> Vd.4S;b -> Vn.4S;v -> Vm.4S;0 <= lane <= 3 MLA Vd.4S,Vn.4S,Vm.S[lane] Vd.4S -> result A64 -float32x2_t vmla_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v, __builtin_constant_p(lane)) 0 <= lane <= 3 RESULT[I] = vadd(a[i], vmul(b[i], v[lane])) for i = 0 to 1;Final instruction sequence is implementation defined N/A A64 -float32x4_t vmlaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v, __builtin_constant_p(lane)) 0 <= 
lane <= 3 RESULT[I] = vadd(a[i], vmul(b[i], v[lane])) for i = 0 to 3;Final instruction sequence is implementation defined N/A A64 +float32x2_t vmla_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v, __builtin_constant_p(lane)) 0 <= lane <= 3 RESULT[I] = vadd(a[i], vmul(b[i], v[lane])) for i = 0 to 1 N/A A64 +float32x4_t vmlaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v, __builtin_constant_p(lane)) 0 <= lane <= 3 RESULT[I] = vadd(a[i], vmul(b[i], v[lane])) for i = 0 to 3 N/A A64 int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __builtin_constant_p(lane)) a -> Vd.4S;b -> Vn.4H;v -> Vm.4H;0 <= lane <= 3 SMLAL Vd.4S,Vn.4H,Vm.H[lane] Vd.4S -> result v7/A32/A64 int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __builtin_constant_p(lane)) a -> Vd.2D;b -> Vn.2S;v -> Vm.2S;0 <= lane <= 1 SMLAL Vd.2D,Vn.2S,Vm.S[lane] Vd.2D -> result v7/A32/A64 uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __builtin_constant_p(lane)) a -> Vd.4S;b -> Vn.4H;v -> Vm.4H;0 <= lane <= 3 UMLAL Vd.4S,Vn.4H,Vm.H[lane] Vd.4S -> result v7/A32/A64 @@ -1390,8 +1390,8 @@ uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __builtin_con uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __builtin_constant_p(lane)) a -> Vd.8H;b -> Vn.8H;v -> Vm.4H;0 <= lane <= 3 MLS Vd.8H,Vn.8H,Vm.H[lane] Vd.8H -> result v7/A32/A64 uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __builtin_constant_p(lane)) a -> Vd.2S;b -> Vn.2S;v -> Vm.2S;0 <= lane <= 1 MLS Vd.2S,Vn.2S,Vm.S[lane] Vd.2S -> result v7/A32/A64 uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __builtin_constant_p(lane)) a -> Vd.4S;b -> Vn.4S;v -> Vm.2S;0 <= lane <= 1 MLS Vd.4S,Vn.4S,Vm.S[lane] Vd.4S -> result v7/A32/A64 -float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __builtin_constant_p(lane)) 0 <= lane <= 1 RESULT[I] = vsub(a[i], vmul(b[i], v[lane])) for i = 0 to 1;Final instruction sequence is 
implementation defined N/A v7/A32/A64 -float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __builtin_constant_p(lane)) 0 <= lane <= 1 RESULT[I] = vsub(a[i], vmul(b[i], v[lane])) for i = 0 to 3;Final instruction sequence is implementation defined N/A v7/A32/A64 +float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __builtin_constant_p(lane)) 0 <= lane <= 1 RESULT[I] = vsub(a[i], vmul(b[i], v[lane])) for i = 0 to 1 N/A v7/A32/A64 +float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __builtin_constant_p(lane)) 0 <= lane <= 1 RESULT[I] = vsub(a[i], vmul(b[i], v[lane])) for i = 0 to 3 N/A v7/A32/A64 int16x4_t vmls_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v, __builtin_constant_p(lane)) a -> Vd.4H;b -> Vn.4H;v -> Vm.8H;0 <= lane <= 7 MLS Vd.4H,Vn.4H,Vm.H[lane] Vd.4H -> result A64 int16x8_t vmlsq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v, __builtin_constant_p(lane)) a -> Vd.8H;b -> Vn.8H;v -> Vm.8H;0 <= lane <= 7 MLS Vd.8H,Vn.8H,Vm.H[lane] Vd.8H -> result A64 int32x2_t vmls_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v, __builtin_constant_p(lane)) a -> Vd.2S;b -> Vn.2S;v -> Vm.4S;0 <= lane <= 3 MLS Vd.2S,Vn.2S,Vm.S[lane] Vd.2S -> result A64 @@ -1400,8 +1400,8 @@ uint16x4_t vmls_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v, __builtin_co uint16x8_t vmlsq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v, __builtin_constant_p(lane)) a -> Vd.8H;b -> Vn.8H;v -> Vm.8H;0 <= lane <= 7 MLS Vd.8H,Vn.8H,Vm.H[lane] Vd.8H -> result A64 uint32x2_t vmls_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v, __builtin_constant_p(lane)) a -> Vd.2S;b -> Vn.2S;v -> Vm.4S;0 <= lane <= 3 MLS Vd.2S,Vn.2S,Vm.S[lane] Vd.2S -> result A64 uint32x4_t vmlsq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v, __builtin_constant_p(lane)) a -> Vd.4S;b -> Vn.4S;v -> Vm.4S;0 <= lane <= 3 MLS Vd.4S,Vn.4S,Vm.S[lane] Vd.4S -> result A64 -float32x2_t vmls_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v, 
__builtin_constant_p(lane)) 0 <= lane <= 3 RESULT[I] = vsub(a[i], vmul(b[i], v[lane])) for i = 0 to 1;Final instruction sequence is implementation defined N/A A64 -float32x4_t vmlsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v, __builtin_constant_p(lane)) 0 <= lane <= 3 RESULT[I] = vsub(a[i], vmul(b[i], v[lane])) for i = 0 to 3;Final instruction sequence is implementation defined N/A A64 +float32x2_t vmls_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v, __builtin_constant_p(lane)) 0 <= lane <= 3 RESULT[I] = vsub(a[i], vmul(b[i], v[lane])) for i = 0 to 1 N/A A64 +float32x4_t vmlsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v, __builtin_constant_p(lane)) 0 <= lane <= 3 RESULT[I] = vsub(a[i], vmul(b[i], v[lane])) for i = 0 to 3 N/A A64 int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __builtin_constant_p(lane)) a -> Vd.4S;b -> Vn.4H;v -> Vm.4H;0 <= lane <= 3 SMLSL Vd.4S,Vn.4H,Vm.H[lane] Vd.4S -> result v7/A32/A64 int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __builtin_constant_p(lane)) a -> Vd.2D;b -> Vn.2S;v -> Vm.2S;0 <= lane <= 1 SMLSL Vd.2D,Vn.2S,Vm.S[lane] Vd.2D -> result v7/A32/A64 uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __builtin_constant_p(lane)) a -> Vd.4S;b -> Vn.4H;v -> Vm.4H;0 <= lane <= 3 UMLSL Vd.4S,Vn.4H,Vm.H[lane] Vd.4S -> result v7/A32/A64 @@ -1550,8 +1550,8 @@ uint16x4_t vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) a -> Vd.4H;b -> Vn uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) a -> Vd.8H;b -> Vn.8H;c -> Vm.H[0] MLA Vd.8H,Vn.8H,Vm.H[0] Vd.8H -> result v7/A32/A64 uint32x2_t vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) a -> Vd.2S;b -> Vn.2S;c -> Vm.S[0] MLA Vd.2S,Vn.2S,Vm.S[0] Vd.2S -> result v7/A32/A64 uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) a -> Vd.4S;b -> Vn.4S;c -> Vm.S[0] MLA Vd.4S,Vn.4S,Vm.S[0] Vd.4S -> result v7/A32/A64 -float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c) N/A RESULT[I] = 
vadd(a[i], vmul(b[i], c)) for i = 0 to 1;Final instruction sequence is implementation defined N/A v7/A32/A64 -float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c) N/A RESULT[I] = vadd(a[i], vmul(b[i], c)) for i = 0 to 3;Final instruction sequence is implementation defined N/A v7/A32/A64 +float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c) N/A RESULT[I] = vadd(a[i], vmul(b[i], c)) for i = 0 to 1 N/A v7/A32/A64 +float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c) N/A RESULT[I] = vadd(a[i], vmul(b[i], c)) for i = 0 to 3 N/A v7/A32/A64 int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) a -> Vd.4S;b -> Vn.4H;c -> Vm.H[0] SMLAL Vd.4S,Vn.4H,Vm.H[0] Vd.4S -> result v7/A32/A64 int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) a -> Vd.2D;b -> Vn.2S;c -> Vm.S[0] SMLAL Vd.2D,Vn.2S,Vm.S[0] Vd.2D -> result v7/A32/A64 uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) a -> Vd.4S;b -> Vn.4H;c -> Vm.H[0] UMLAL Vd.4S,Vn.4H,Vm.H[0] Vd.4S -> result v7/A32/A64 @@ -1572,8 +1572,8 @@ uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) a -> Vd.4H;b -> Vn uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) a -> Vd.8H;b -> Vn.8H;c -> Vm.H[0] MLS Vd.8H,Vn.8H,Vm.H[0] Vd.8H -> result v7/A32/A64 uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) a -> Vd.2S;b -> Vn.2S;c -> Vm.S[0] MLS Vd.2S,Vn.2S,Vm.S[0] Vd.2S -> result v7/A32/A64 uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) a -> Vd.4S;b -> Vn.4S;c -> Vm.S[0] MLS Vd.4S,Vn.4S,Vm.S[0] Vd.4S -> result v7/A32/A64 -float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c) N/A RESULT[I] = vsub(a[i], vmul(b[i], c)) for i = 0 to 1;Final instruction sequence is implementation defined N/A v7/A32/A64 -float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c) N/A RESULT[I] = vsub(a[i], vmul(b[i], c)) for i = 0 to 3;Final instruction sequence is implementation defined N/A v7/A32/A64 +float32x2_t vmls_n_f32(float32x2_t a, 
float32x2_t b, float32_t c) N/A RESULT[I] = vsub(a[i], vmul(b[i], c)) for i = 0 to 1 N/A v7/A32/A64 +float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c) N/A RESULT[I] = vsub(a[i], vmul(b[i], c)) for i = 0 to 3 N/A v7/A32/A64 int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) a -> Vd.4S;b -> Vn.4H;c -> Vm.H[0] SMLSL Vd.4S,Vn.4H,Vm.H[0] Vd.4S -> result v7/A32/A64 int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) a -> Vd.2D;b -> Vn.2S;c -> Vm.S[0] SMLSL Vd.2D,Vn.2S,Vm.S[0] Vd.2D -> result v7/A32/A64 uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) a -> Vd.4S;b -> Vn.4H;c -> Vm.H[0] UMLSL Vd.4S,Vn.4H,Vm.H[0] Vd.4S -> result v7/A32/A64