diff --git a/main/acle.md b/main/acle.md
index 3b066e93..7e0aae37 100644
--- a/main/acle.md
+++ b/main/acle.md
@@ -465,6 +465,7 @@ Armv8.4-A [[ARMARMv84]](#ARMARMv84). Support is added for the Dot Product intrin
* Added feature test macro for FEAT_SSVE_FEXPA.
* Added feature test macro for FEAT_CSSC.
+* Improved documentation for VMLA/VMLS intrinsics for floats.
### References
diff --git a/neon_intrinsics/advsimd.md b/neon_intrinsics/advsimd.md
index a87ad725..31d716a2 100644
--- a/neon_intrinsics/advsimd.md
+++ b/neon_intrinsics/advsimd.md
@@ -393,40 +393,40 @@ The intrinsics in this section are guarded by the macro ``__ARM_NEON``.
##### Multiply-accumulate
-| Intrinsic | Argument preparation | AArch64 Instruction | Result | Supported architectures |
-|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------|---------------------------------------------------|--------------------|---------------------------|
-| int8x8_t vmla_s8(
int8x8_t a,
int8x8_t b,
int8x8_t c) | `a -> Vd.8B`
`b -> Vn.8B`
`c -> Vm.8B` | `MLA Vd.8B,Vn.8B,Vm.8B` | `Vd.8B -> result` | `v7/A32/A64` |
-| int8x16_t vmlaq_s8(
int8x16_t a,
int8x16_t b,
int8x16_t c) | `a -> Vd.16B`
`b -> Vn.16B`
`c -> Vm.16B` | `MLA Vd.16B,Vn.16B,Vm.16B` | `Vd.16B -> result` | `v7/A32/A64` |
-| int16x4_t vmla_s16(
int16x4_t a,
int16x4_t b,
int16x4_t c) | `a -> Vd.4H`
`b -> Vn.4H`
`c -> Vm.4H` | `MLA Vd.4H,Vn.4H,Vm.4H` | `Vd.4H -> result` | `v7/A32/A64` |
-| int16x8_t vmlaq_s16(
int16x8_t a,
int16x8_t b,
int16x8_t c) | `a -> Vd.8H`
`b -> Vn.8H`
`c -> Vm.8H` | `MLA Vd.8H,Vn.8H,Vm.8H` | `Vd.8H -> result` | `v7/A32/A64` |
-| int32x2_t vmla_s32(
int32x2_t a,
int32x2_t b,
int32x2_t c) | `a -> Vd.2S`
`b -> Vn.2S`
`c -> Vm.2S` | `MLA Vd.2S,Vn.2S,Vm.2S` | `Vd.2S -> result` | `v7/A32/A64` |
-| int32x4_t vmlaq_s32(
int32x4_t a,
int32x4_t b,
int32x4_t c) | `a -> Vd.4S`
`b -> Vn.4S`
`c -> Vm.4S` | `MLA Vd.4S,Vn.4S,Vm.4S` | `Vd.4S -> result` | `v7/A32/A64` |
-| uint8x8_t vmla_u8(
uint8x8_t a,
uint8x8_t b,
uint8x8_t c) | `a -> Vd.8B`
`b -> Vn.8B`
`c -> Vm.8B` | `MLA Vd.8B,Vn.8B,Vm.8B` | `Vd.8B -> result` | `v7/A32/A64` |
-| uint8x16_t vmlaq_u8(
uint8x16_t a,
uint8x16_t b,
uint8x16_t c) | `a -> Vd.16B`
`b -> Vn.16B`
`c -> Vm.16B` | `MLA Vd.16B,Vn.16B,Vm.16B` | `Vd.16B -> result` | `v7/A32/A64` |
-| uint16x4_t vmla_u16(
uint16x4_t a,
uint16x4_t b,
uint16x4_t c) | `a -> Vd.4H`
`b -> Vn.4H`
`c -> Vm.4H` | `MLA Vd.4H,Vn.4H,Vm.4H` | `Vd.4H -> result` | `v7/A32/A64` |
-| uint16x8_t vmlaq_u16(
uint16x8_t a,
uint16x8_t b,
uint16x8_t c) | `a -> Vd.8H`
`b -> Vn.8H`
`c -> Vm.8H` | `MLA Vd.8H,Vn.8H,Vm.8H` | `Vd.8H -> result` | `v7/A32/A64` |
-| uint32x2_t vmla_u32(
uint32x2_t a,
uint32x2_t b,
uint32x2_t c) | `a -> Vd.2S`
`b -> Vn.2S`
`c -> Vm.2S` | `MLA Vd.2S,Vn.2S,Vm.2S` | `Vd.2S -> result` | `v7/A32/A64` |
-| uint32x4_t vmlaq_u32(
uint32x4_t a,
uint32x4_t b,
uint32x4_t c) | `a -> Vd.4S`
`b -> Vn.4S`
`c -> Vm.4S` | `MLA Vd.4S,Vn.4S,Vm.4S` | `Vd.4S -> result` | `v7/A32/A64` |
-| float32x2_t vmla_f32(
float32x2_t a,
float32x2_t b,
float32x2_t c) | `N/A` | `RESULT[I] = a[i] + (b[i] * c[i]) for i = 0 to 1` | `N/A` | `v7/A32/A64` |
-| float32x4_t vmlaq_f32(
float32x4_t a,
float32x4_t b,
float32x4_t c) | `N/A` | `RESULT[I] = a[i] + (b[i] * c[i]) for i = 0 to 3` | `N/A` | `v7/A32/A64` |
-| float64x1_t vmla_f64(
float64x1_t a,
float64x1_t b,
float64x1_t c) | `N/A` | `RESULT[I] = a[i] + (b[i] * c[i]) for i = 0` | `N/A` | `A64` |
-| float64x2_t vmlaq_f64(
float64x2_t a,
float64x2_t b,
float64x2_t c) | `N/A` | `RESULT[I] = a[i] + (b[i] * c[i]) for i = 0 to 1` | `N/A` | `A64` |
-| int8x8_t vmls_s8(
int8x8_t a,
int8x8_t b,
int8x8_t c) | `a -> Vd.8B`
`b -> Vn.8B`
`c -> Vm.8B` | `MLS Vd.8B,Vn.8B,Vm.8B` | `Vd.8B -> result` | `v7/A32/A64` |
-| int8x16_t vmlsq_s8(
int8x16_t a,
int8x16_t b,
int8x16_t c) | `a -> Vd.16B`
`b -> Vn.16B`
`c -> Vm.16B` | `MLS Vd.16B,Vn.16B,Vm.16B` | `Vd.16B -> result` | `v7/A32/A64` |
-| int16x4_t vmls_s16(
int16x4_t a,
int16x4_t b,
int16x4_t c) | `a -> Vd.4H`
`b -> Vn.4H`
`c -> Vm.4H` | `MLS Vd.4H,Vn.4H,Vm.4H` | `Vd.4H -> result` | `v7/A32/A64` |
-| int16x8_t vmlsq_s16(
int16x8_t a,
int16x8_t b,
int16x8_t c) | `a -> Vd.8H`
`b -> Vn.8H`
`c -> Vm.8H` | `MLS Vd.8H,Vn.8H,Vm.8H` | `Vd.8H -> result` | `v7/A32/A64` |
-| int32x2_t vmls_s32(
int32x2_t a,
int32x2_t b,
int32x2_t c) | `a -> Vd.2S`
`b -> Vn.2S`
`c -> Vm.2S` | `MLS Vd.2S,Vn.2S,Vm.2S` | `Vd.2S -> result` | `v7/A32/A64` |
-| int32x4_t vmlsq_s32(
int32x4_t a,
int32x4_t b,
int32x4_t c) | `a -> Vd.4S`
`b -> Vn.4S`
`c -> Vm.4S` | `MLS Vd.4S,Vn.4S,Vm.4S` | `Vd.4S -> result` | `v7/A32/A64` |
-| uint8x8_t vmls_u8(
uint8x8_t a,
uint8x8_t b,
uint8x8_t c) | `a -> Vd.8B`
`b -> Vn.8B`
`c -> Vm.8B` | `MLS Vd.8B,Vn.8B,Vm.8B` | `Vd.8B -> result` | `v7/A32/A64` |
-| uint8x16_t vmlsq_u8(
uint8x16_t a,
uint8x16_t b,
uint8x16_t c) | `a -> Vd.16B`
`b -> Vn.16B`
`c -> Vm.16B` | `MLS Vd.16B,Vn.16B,Vm.16B` | `Vd.16B -> result` | `v7/A32/A64` |
-| uint16x4_t vmls_u16(
uint16x4_t a,
uint16x4_t b,
uint16x4_t c) | `a -> Vd.4H`
`b -> Vn.4H`
`c -> Vm.4H` | `MLS Vd.4H,Vn.4H,Vm.4H` | `Vd.4H -> result` | `v7/A32/A64` |
-| uint16x8_t vmlsq_u16(
uint16x8_t a,
uint16x8_t b,
uint16x8_t c) | `a -> Vd.8H`
`b -> Vn.8H`
`c -> Vm.8H` | `MLS Vd.8H,Vn.8H,Vm.8H` | `Vd.8H -> result` | `v7/A32/A64` |
-| uint32x2_t vmls_u32(
uint32x2_t a,
uint32x2_t b,
uint32x2_t c) | `a -> Vd.2S`
`b -> Vn.2S`
`c -> Vm.2S` | `MLS Vd.2S,Vn.2S,Vm.2S` | `Vd.2S -> result` | `v7/A32/A64` |
-| uint32x4_t vmlsq_u32(
uint32x4_t a,
uint32x4_t b,
uint32x4_t c) | `a -> Vd.4S`
`b -> Vn.4S`
`c -> Vm.4S` | `MLS Vd.4S,Vn.4S,Vm.4S` | `Vd.4S -> result` | `v7/A32/A64` |
-| float32x2_t vmls_f32(
float32x2_t a,
float32x2_t b,
float32x2_t c) | `N/A` | `RESULT[I] = a[i] - (b[i] * c[i]) for i = 0 to 1` | `N/A` | `v7/A32/A64` |
-| float32x4_t vmlsq_f32(
float32x4_t a,
float32x4_t b,
float32x4_t c) | `N/A` | `RESULT[I] = a[i] - (b[i] * c[i]) for i = 0 to 3` | `N/A` | `v7/A32/A64` |
-| float64x1_t vmls_f64(
float64x1_t a,
float64x1_t b,
float64x1_t c) | `N/A` | `RESULT[I] = a[i] - (b[i] * c[i]) for i = 0` | `N/A` | `A64` |
-| float64x2_t vmlsq_f64(
float64x2_t a,
float64x2_t b,
float64x2_t c) | `N/A` | `RESULT[I] = a[i] - (b[i] * c[i]) for i = 0 to 1` | `N/A` | `A64` |
+| Intrinsic | Argument preparation | AArch64 Instruction | Result | Supported architectures |
+|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------|-----------------------------------------------------------|--------------------|---------------------------|
+| int8x8_t vmla_s8(
int8x8_t a,
int8x8_t b,
int8x8_t c) | `a -> Vd.8B`
`b -> Vn.8B`
`c -> Vm.8B` | `MLA Vd.8B,Vn.8B,Vm.8B` | `Vd.8B -> result` | `v7/A32/A64` |
+| int8x16_t vmlaq_s8(
int8x16_t a,
int8x16_t b,
int8x16_t c) | `a -> Vd.16B`
`b -> Vn.16B`
`c -> Vm.16B` | `MLA Vd.16B,Vn.16B,Vm.16B` | `Vd.16B -> result` | `v7/A32/A64` |
+| int16x4_t vmla_s16(
int16x4_t a,
int16x4_t b,
int16x4_t c) | `a -> Vd.4H`
`b -> Vn.4H`
`c -> Vm.4H` | `MLA Vd.4H,Vn.4H,Vm.4H` | `Vd.4H -> result` | `v7/A32/A64` |
+| int16x8_t vmlaq_s16(
int16x8_t a,
int16x8_t b,
int16x8_t c) | `a -> Vd.8H`
`b -> Vn.8H`
`c -> Vm.8H` | `MLA Vd.8H,Vn.8H,Vm.8H` | `Vd.8H -> result` | `v7/A32/A64` |
+| int32x2_t vmla_s32(
int32x2_t a,
int32x2_t b,
int32x2_t c) | `a -> Vd.2S`
`b -> Vn.2S`
`c -> Vm.2S` | `MLA Vd.2S,Vn.2S,Vm.2S` | `Vd.2S -> result` | `v7/A32/A64` |
+| int32x4_t vmlaq_s32(
int32x4_t a,
int32x4_t b,
int32x4_t c) | `a -> Vd.4S`
`b -> Vn.4S`
`c -> Vm.4S` | `MLA Vd.4S,Vn.4S,Vm.4S` | `Vd.4S -> result` | `v7/A32/A64` |
+| uint8x8_t vmla_u8(
uint8x8_t a,
uint8x8_t b,
uint8x8_t c) | `a -> Vd.8B`
`b -> Vn.8B`
`c -> Vm.8B` | `MLA Vd.8B,Vn.8B,Vm.8B` | `Vd.8B -> result` | `v7/A32/A64` |
+| uint8x16_t vmlaq_u8(
uint8x16_t a,
uint8x16_t b,
uint8x16_t c) | `a -> Vd.16B`
`b -> Vn.16B`
`c -> Vm.16B` | `MLA Vd.16B,Vn.16B,Vm.16B` | `Vd.16B -> result` | `v7/A32/A64` |
+| uint16x4_t vmla_u16(
uint16x4_t a,
uint16x4_t b,
uint16x4_t c) | `a -> Vd.4H`
`b -> Vn.4H`
`c -> Vm.4H` | `MLA Vd.4H,Vn.4H,Vm.4H` | `Vd.4H -> result` | `v7/A32/A64` |
+| uint16x8_t vmlaq_u16(
uint16x8_t a,
uint16x8_t b,
uint16x8_t c) | `a -> Vd.8H`
`b -> Vn.8H`
`c -> Vm.8H` | `MLA Vd.8H,Vn.8H,Vm.8H` | `Vd.8H -> result` | `v7/A32/A64` |
+| uint32x2_t vmla_u32(
uint32x2_t a,
uint32x2_t b,
uint32x2_t c) | `a -> Vd.2S`
`b -> Vn.2S`
`c -> Vm.2S` | `MLA Vd.2S,Vn.2S,Vm.2S` | `Vd.2S -> result` | `v7/A32/A64` |
+| uint32x4_t vmlaq_u32(
uint32x4_t a,
uint32x4_t b,
uint32x4_t c) | `a -> Vd.4S`
`b -> Vn.4S`
`c -> Vm.4S` | `MLA Vd.4S,Vn.4S,Vm.4S` | `Vd.4S -> result` | `v7/A32/A64` |
+| float32x2_t vmla_f32(
float32x2_t a,
float32x2_t b,
float32x2_t c) | `N/A` | `RESULT[i] = vadd(a[i], vmul(b[i], c[i])) for i = 0 to 1` | `N/A` | `v7/A32/A64` |
+| float32x4_t vmlaq_f32(
float32x4_t a,
float32x4_t b,
float32x4_t c) | `N/A` | `RESULT[i] = vadd(a[i], vmul(b[i], c[i])) for i = 0 to 3` | `N/A` | `v7/A32/A64` |
+| float64x1_t vmla_f64(
float64x1_t a,
float64x1_t b,
float64x1_t c) | `N/A` | `RESULT[i] = vadd(a[i], vmul(b[i], c[i])) for i = 0` | `N/A` | `A64` |
+| float64x2_t vmlaq_f64(
float64x2_t a,
float64x2_t b,
float64x2_t c) | `N/A` | `RESULT[i] = vadd(a[i], vmul(b[i], c[i])) for i = 0 to 1` | `N/A` | `A64` |
+| int8x8_t vmls_s8(
int8x8_t a,
int8x8_t b,
int8x8_t c) | `a -> Vd.8B`
`b -> Vn.8B`
`c -> Vm.8B` | `MLS Vd.8B,Vn.8B,Vm.8B` | `Vd.8B -> result` | `v7/A32/A64` |
+| int8x16_t vmlsq_s8(
int8x16_t a,
int8x16_t b,
int8x16_t c) | `a -> Vd.16B`
`b -> Vn.16B`
`c -> Vm.16B` | `MLS Vd.16B,Vn.16B,Vm.16B` | `Vd.16B -> result` | `v7/A32/A64` |
+| int16x4_t vmls_s16(
int16x4_t a,
int16x4_t b,
int16x4_t c) | `a -> Vd.4H`
`b -> Vn.4H`
`c -> Vm.4H` | `MLS Vd.4H,Vn.4H,Vm.4H` | `Vd.4H -> result` | `v7/A32/A64` |
+| int16x8_t vmlsq_s16(
int16x8_t a,
int16x8_t b,
int16x8_t c) | `a -> Vd.8H`
`b -> Vn.8H`
`c -> Vm.8H` | `MLS Vd.8H,Vn.8H,Vm.8H` | `Vd.8H -> result` | `v7/A32/A64` |
+| int32x2_t vmls_s32(
int32x2_t a,
int32x2_t b,
int32x2_t c) | `a -> Vd.2S`
`b -> Vn.2S`
`c -> Vm.2S` | `MLS Vd.2S,Vn.2S,Vm.2S` | `Vd.2S -> result` | `v7/A32/A64` |
+| int32x4_t vmlsq_s32(
int32x4_t a,
int32x4_t b,
int32x4_t c) | `a -> Vd.4S`
`b -> Vn.4S`
`c -> Vm.4S` | `MLS Vd.4S,Vn.4S,Vm.4S` | `Vd.4S -> result` | `v7/A32/A64` |
+| uint8x8_t vmls_u8(
uint8x8_t a,
uint8x8_t b,
uint8x8_t c) | `a -> Vd.8B`
`b -> Vn.8B`
`c -> Vm.8B` | `MLS Vd.8B,Vn.8B,Vm.8B` | `Vd.8B -> result` | `v7/A32/A64` |
+| uint8x16_t vmlsq_u8(
uint8x16_t a,
uint8x16_t b,
uint8x16_t c) | `a -> Vd.16B`
`b -> Vn.16B`
`c -> Vm.16B` | `MLS Vd.16B,Vn.16B,Vm.16B` | `Vd.16B -> result` | `v7/A32/A64` |
+| uint16x4_t vmls_u16(
uint16x4_t a,
uint16x4_t b,
uint16x4_t c) | `a -> Vd.4H`
`b -> Vn.4H`
`c -> Vm.4H` | `MLS Vd.4H,Vn.4H,Vm.4H` | `Vd.4H -> result` | `v7/A32/A64` |
+| uint16x8_t vmlsq_u16(
uint16x8_t a,
uint16x8_t b,
uint16x8_t c) | `a -> Vd.8H`
`b -> Vn.8H`
`c -> Vm.8H` | `MLS Vd.8H,Vn.8H,Vm.8H` | `Vd.8H -> result` | `v7/A32/A64` |
+| uint32x2_t vmls_u32(
uint32x2_t a,
uint32x2_t b,
uint32x2_t c) | `a -> Vd.2S`
`b -> Vn.2S`
`c -> Vm.2S` | `MLS Vd.2S,Vn.2S,Vm.2S` | `Vd.2S -> result` | `v7/A32/A64` |
+| uint32x4_t vmlsq_u32(
uint32x4_t a,
uint32x4_t b,
uint32x4_t c) | `a -> Vd.4S`
`b -> Vn.4S`
`c -> Vm.4S` | `MLS Vd.4S,Vn.4S,Vm.4S` | `Vd.4S -> result` | `v7/A32/A64` |
+| float32x2_t vmls_f32(
float32x2_t a,
float32x2_t b,
float32x2_t c) | `N/A` | `RESULT[i] = vsub(a[i], vmul(b[i], c[i])) for i = 0 to 1` | `N/A` | `v7/A32/A64` |
+| float32x4_t vmlsq_f32(
float32x4_t a,
float32x4_t b,
float32x4_t c) | `N/A` | `RESULT[i] = vsub(a[i], vmul(b[i], c[i])) for i = 0 to 3` | `N/A` | `v7/A32/A64` |
+| float64x1_t vmls_f64(
float64x1_t a,
float64x1_t b,
float64x1_t c) | `N/A` | `RESULT[i] = vsub(a[i], vmul(b[i], c[i])) for i = 0` | `N/A` | `A64` |
+| float64x2_t vmlsq_f64(
float64x2_t a,
float64x2_t b,
float64x2_t c) | `N/A` | `RESULT[i] = vsub(a[i], vmul(b[i], c[i])) for i = 0 to 1` | `N/A` | `A64` |
##### Multiply-accumulate and widen
@@ -2663,95 +2663,95 @@ The intrinsics in this section are guarded by the macro ``__ARM_NEON``.
#### Vector multiply-accumulate by scalar
-| Intrinsic | Argument preparation | AArch64 Instruction | Result | Supported architectures |
-|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------|------------------------------------------------------|-------------------|---------------------------|
-| int16x4_t vmla_lane_s16(
int16x4_t a,
int16x4_t b,
int16x4_t v,
const int lane) | `a -> Vd.4H`
`b -> Vn.4H`
`v -> Vm.4H`
`0 <= lane <= 3` | `MLA Vd.4H,Vn.4H,Vm.H[lane]` | `Vd.4H -> result` | `v7/A32/A64` |
-| int16x8_t vmlaq_lane_s16(
int16x8_t a,
int16x8_t b,
int16x4_t v,
const int lane) | `a -> Vd.8H`
`b -> Vn.8H`
`v -> Vm.4H`
`0 <= lane <= 3` | `MLA Vd.8H,Vn.8H,Vm.H[lane]` | `Vd.8H -> result` | `v7/A32/A64` |
-| int32x2_t vmla_lane_s32(
int32x2_t a,
int32x2_t b,
int32x2_t v,
const int lane) | `a -> Vd.2S`
`b -> Vn.2S`
`v -> Vm.2S`
`0 <= lane <= 1` | `MLA Vd.2S,Vn.2S,Vm.S[lane]` | `Vd.2S -> result` | `v7/A32/A64` |
-| int32x4_t vmlaq_lane_s32(
int32x4_t a,
int32x4_t b,
int32x2_t v,
const int lane) | `a -> Vd.4S`
`b -> Vn.4S`
`v -> Vm.2S`
`0 <= lane <= 1` | `MLA Vd.4S,Vn.4S,Vm.S[lane]` | `Vd.4S -> result` | `v7/A32/A64` |
-| uint16x4_t vmla_lane_u16(
uint16x4_t a,
uint16x4_t b,
uint16x4_t v,
const int lane) | `a -> Vd.4H`
`b -> Vn.4H`
`v -> Vm.4H`
`0 <= lane <= 3` | `MLA Vd.4H,Vn.4H,Vm.H[lane]` | `Vd.4H -> result` | `v7/A32/A64` |
-| uint16x8_t vmlaq_lane_u16(
uint16x8_t a,
uint16x8_t b,
uint16x4_t v,
const int lane) | `a -> Vd.8H`
`b -> Vn.8H`
`v -> Vm.4H`
`0 <= lane <= 3` | `MLA Vd.8H,Vn.8H,Vm.H[lane]` | `Vd.8H -> result` | `v7/A32/A64` |
-| uint32x2_t vmla_lane_u32(
uint32x2_t a,
uint32x2_t b,
uint32x2_t v,
const int lane) | `a -> Vd.2S`
`b -> Vn.2S`
`v -> Vm.2S`
`0 <= lane <= 1` | `MLA Vd.2S,Vn.2S,Vm.S[lane]` | `Vd.2S -> result` | `v7/A32/A64` |
-| uint32x4_t vmlaq_lane_u32(
uint32x4_t a,
uint32x4_t b,
uint32x2_t v,
const int lane) | `a -> Vd.4S`
`b -> Vn.4S`
`v -> Vm.2S`
`0 <= lane <= 1` | `MLA Vd.4S,Vn.4S,Vm.S[lane]` | `Vd.4S -> result` | `v7/A32/A64` |
-| float32x2_t vmla_lane_f32(
float32x2_t a,
float32x2_t b,
float32x2_t v,
const int lane) | `0 <= lane <= 1` | `RESULT[I] = a[i] + (b[i] * v[lane]) for i = 0 to 1` | `N/A` | `v7/A32/A64` |
-| float32x4_t vmlaq_lane_f32(
float32x4_t a,
float32x4_t b,
float32x2_t v,
const int lane) | `0 <= lane <= 1` | `RESULT[I] = a[i] + (b[i] * v[lane]) for i = 0 to 3` | `N/A` | `v7/A32/A64` |
-| int16x4_t vmla_laneq_s16(
int16x4_t a,
int16x4_t b,
int16x8_t v,
const int lane) | `a -> Vd.4H`
`b -> Vn.4H`
`v -> Vm.8H`
`0 <= lane <= 7` | `MLA Vd.4H,Vn.4H,Vm.H[lane]` | `Vd.4H -> result` | `A64` |
-| int16x8_t vmlaq_laneq_s16(
int16x8_t a,
int16x8_t b,
int16x8_t v,
const int lane) | `a -> Vd.8H`
`b -> Vn.8H`
`v -> Vm.8H`
`0 <= lane <= 7` | `MLA Vd.8H,Vn.8H,Vm.H[lane]` | `Vd.8H -> result` | `A64` |
-| int32x2_t vmla_laneq_s32(
int32x2_t a,
int32x2_t b,
int32x4_t v,
const int lane) | `a -> Vd.2S`
`b -> Vn.2S`
`v -> Vm.4S`
`0 <= lane <= 3` | `MLA Vd.2S,Vn.2S,Vm.S[lane]` | `Vd.2S -> result` | `A64` |
-| int32x4_t vmlaq_laneq_s32(
int32x4_t a,
int32x4_t b,
int32x4_t v,
const int lane) | `a -> Vd.4S`
`b -> Vn.4S`
`v -> Vm.4S`
`0 <= lane <= 3` | `MLA Vd.4S,Vn.4S,Vm.S[lane]` | `Vd.4S -> result` | `A64` |
-| uint16x4_t vmla_laneq_u16(
uint16x4_t a,
uint16x4_t b,
uint16x8_t v,
const int lane) | `a -> Vd.4H`
`b -> Vn.4H`
`v -> Vm.8H`
`0 <= lane <= 7` | `MLA Vd.4H,Vn.4H,Vm.H[lane]` | `Vd.4H -> result` | `A64` |
-| uint16x8_t vmlaq_laneq_u16(
uint16x8_t a,
uint16x8_t b,
uint16x8_t v,
const int lane) | `a -> Vd.8H`
`b -> Vn.8H`
`v -> Vm.8H`
`0 <= lane <= 7` | `MLA Vd.8H,Vn.8H,Vm.H[lane]` | `Vd.8H -> result` | `A64` |
-| uint32x2_t vmla_laneq_u32(
uint32x2_t a,
uint32x2_t b,
uint32x4_t v,
const int lane) | `a -> Vd.2S`
`b -> Vn.2S`
`v -> Vm.4S`
`0 <= lane <= 3` | `MLA Vd.2S,Vn.2S,Vm.S[lane]` | `Vd.2S -> result` | `A64` |
-| uint32x4_t vmlaq_laneq_u32(
uint32x4_t a,
uint32x4_t b,
uint32x4_t v,
const int lane) | `a -> Vd.4S`
`b -> Vn.4S`
`v -> Vm.4S`
`0 <= lane <= 3` | `MLA Vd.4S,Vn.4S,Vm.S[lane]` | `Vd.4S -> result` | `A64` |
-| float32x2_t vmla_laneq_f32(
float32x2_t a,
float32x2_t b,
float32x4_t v,
const int lane) | `0 <= lane <= 3` | `RESULT[I] = a[i] + (b[i] * v[lane]) for i = 0 to 1` | `N/A` | `A64` |
-| float32x4_t vmlaq_laneq_f32(
float32x4_t a,
float32x4_t b,
float32x4_t v,
const int lane) | `0 <= lane <= 3` | `RESULT[I] = a[i] + (b[i] * v[lane]) for i = 0 to 3` | `N/A` | `A64` |
-| int32x4_t vmlal_lane_s16(
int32x4_t a,
int16x4_t b,
int16x4_t v,
const int lane) | `a -> Vd.4S`
`b -> Vn.4H`
`v -> Vm.4H`
`0 <= lane <= 3` | `SMLAL Vd.4S,Vn.4H,Vm.H[lane]` | `Vd.4S -> result` | `v7/A32/A64` |
-| int64x2_t vmlal_lane_s32(
int64x2_t a,
int32x2_t b,
int32x2_t v,
const int lane) | `a -> Vd.2D`
`b -> Vn.2S`
`v -> Vm.2S`
`0 <= lane <= 1` | `SMLAL Vd.2D,Vn.2S,Vm.S[lane]` | `Vd.2D -> result` | `v7/A32/A64` |
-| uint32x4_t vmlal_lane_u16(
uint32x4_t a,
uint16x4_t b,
uint16x4_t v,
const int lane) | `a -> Vd.4S`
`b -> Vn.4H`
`v -> Vm.4H`
`0 <= lane <= 3` | `UMLAL Vd.4S,Vn.4H,Vm.H[lane]` | `Vd.4S -> result` | `v7/A32/A64` |
-| uint64x2_t vmlal_lane_u32(
uint64x2_t a,
uint32x2_t b,
uint32x2_t v,
const int lane) | `a -> Vd.2D`
`b -> Vn.2S`
`v -> Vm.2S`
`0 <= lane <= 1` | `UMLAL Vd.2D,Vn.2S,Vm.S[lane]` | `Vd.2D -> result` | `v7/A32/A64` |
-| int32x4_t vmlal_high_lane_s16(
int32x4_t a,
int16x8_t b,
int16x4_t v,
const int lane) | `a -> Vd.4S`
`b -> Vn.8H`
`v -> Vm.4H`
`0 <= lane <= 3` | `SMLAL2 Vd.4S,Vn.8H,Vm.H[lane]` | `Vd.4S -> result` | `A64` |
-| int64x2_t vmlal_high_lane_s32(
int64x2_t a,
int32x4_t b,
int32x2_t v,
const int lane) | `a -> Vd.2D`
`b -> Vn.4S`
`v -> Vm.2S`
`0 <= lane <= 1` | `SMLAL2 Vd.2D,Vn.4S,Vm.S[lane]` | `Vd.2D -> result` | `A64` |
-| uint32x4_t vmlal_high_lane_u16(
uint32x4_t a,
uint16x8_t b,
uint16x4_t v,
const int lane) | `a -> Vd.4S`
`b -> Vn.8H`
`v -> Vm.4H`
`0 <= lane <= 3` | `UMLAL2 Vd.4S,Vn.8H,Vm.H[lane]` | `Vd.4S -> result` | `A64` |
-| uint64x2_t vmlal_high_lane_u32(
uint64x2_t a,
uint32x4_t b,
uint32x2_t v,
const int lane) | `a -> Vd.2D`
`b -> Vn.4S`
`v -> Vm.2S`
`0 <= lane <= 1` | `UMLAL2 Vd.2D,Vn.4S,Vm.S[lane]` | `Vd.2D -> result` | `A64` |
-| int32x4_t vmlal_laneq_s16(
int32x4_t a,
int16x4_t b,
int16x8_t v,
const int lane) | `a -> Vd.4S`
`b -> Vn.4H`
`v -> Vm.8H`
`0 <= lane <= 7` | `SMLAL Vd.4S,Vn.4H,Vm.H[lane]` | `Vd.4S -> result` | `A64` |
-| int64x2_t vmlal_laneq_s32(
int64x2_t a,
int32x2_t b,
int32x4_t v,
const int lane) | `a -> Vd.2D`
`b -> Vn.2S`
`v -> Vm.4S`
`0 <= lane <= 3` | `SMLAL Vd.2D,Vn.2S,Vm.S[lane]` | `Vd.2D -> result` | `A64` |
-| uint32x4_t vmlal_laneq_u16(
uint32x4_t a,
uint16x4_t b,
uint16x8_t v,
const int lane) | `a -> Vd.4S`
`b -> Vn.4H`
`v -> Vm.8H`
`0 <= lane <= 7` | `UMLAL Vd.4S,Vn.4H,Vm.H[lane]` | `Vd.4S -> result` | `A64` |
-| uint64x2_t vmlal_laneq_u32(
uint64x2_t a,
uint32x2_t b,
uint32x4_t v,
const int lane) | `a -> Vd.2D`
`b -> Vn.2S`
`v -> Vm.4S`
`0 <= lane <= 3` | `UMLAL Vd.2D,Vn.2S,Vm.S[lane]` | `Vd.2D -> result` | `A64` |
-| int32x4_t vmlal_high_laneq_s16(
int32x4_t a,
int16x8_t b,
int16x8_t v,
const int lane) | `a -> Vd.4S`
`b -> Vn.8H`
`v -> Vm.8H`
`0 <= lane <= 7` | `SMLAL2 Vd.4S,Vn.8H,Vm.H[lane]` | `Vd.4S -> result` | `A64` |
-| int64x2_t vmlal_high_laneq_s32(
int64x2_t a,
int32x4_t b,
int32x4_t v,
const int lane) | `a -> Vd.2D`
`b -> Vn.4S`
`v -> Vm.4S`
`0 <= lane <= 3` | `SMLAL2 Vd.2D,Vn.4S,Vm.S[lane]` | `Vd.2D -> result` | `A64` |
-| uint32x4_t vmlal_high_laneq_u16(
uint32x4_t a,
uint16x8_t b,
uint16x8_t v,
const int lane) | `a -> Vd.4S`
`b -> Vn.8H`
`v -> Vm.8H`
`0 <= lane <= 7` | `UMLAL2 Vd.4S,Vn.8H,Vm.H[lane]` | `Vd.4S -> result` | `A64` |
-| uint64x2_t vmlal_high_laneq_u32(
uint64x2_t a,
uint32x4_t b,
uint32x4_t v,
const int lane) | `a -> Vd.2D`
`b -> Vn.4S`
`v -> Vm.4S`
`0 <= lane <= 3` | `UMLAL2 Vd.2D,Vn.4S,Vm.S[lane]` | `Vd.2D -> result` | `A64` |
-| int16x4_t vmla_n_s16(
int16x4_t a,
int16x4_t b,
int16_t c) | `a -> Vd.4H`
`b -> Vn.4H`
`c -> Vm.H[0]` | `MLA Vd.4H,Vn.4H,Vm.H[0]` | `Vd.4H -> result` | `v7/A32/A64` |
-| int16x8_t vmlaq_n_s16(
int16x8_t a,
int16x8_t b,
int16_t c) | `a -> Vd.8H`
`b -> Vn.8H`
`c -> Vm.H[0]` | `MLA Vd.8H,Vn.8H,Vm.H[0]` | `Vd.8H -> result` | `v7/A32/A64` |
-| int32x2_t vmla_n_s32(
int32x2_t a,
int32x2_t b,
int32_t c) | `a -> Vd.2S`
`b -> Vn.2S`
`c -> Vm.S[0]` | `MLA Vd.2S,Vn.2S,Vm.S[0]` | `Vd.2S -> result` | `v7/A32/A64` |
-| int32x4_t vmlaq_n_s32(
int32x4_t a,
int32x4_t b,
int32_t c) | `a -> Vd.4S`
`b -> Vn.4S`
`c -> Vm.S[0]` | `MLA Vd.4S,Vn.4S,Vm.S[0]` | `Vd.4S -> result` | `v7/A32/A64` |
-| uint16x4_t vmla_n_u16(
uint16x4_t a,
uint16x4_t b,
uint16_t c) | `a -> Vd.4H`
`b -> Vn.4H`
`c -> Vm.H[0]` | `MLA Vd.4H,Vn.4H,Vm.H[0]` | `Vd.4H -> result` | `v7/A32/A64` |
-| uint16x8_t vmlaq_n_u16(
uint16x8_t a,
uint16x8_t b,
uint16_t c) | `a -> Vd.8H`
`b -> Vn.8H`
`c -> Vm.H[0]` | `MLA Vd.8H,Vn.8H,Vm.H[0]` | `Vd.8H -> result` | `v7/A32/A64` |
-| uint32x2_t vmla_n_u32(
uint32x2_t a,
uint32x2_t b,
uint32_t c) | `a -> Vd.2S`
`b -> Vn.2S`
`c -> Vm.S[0]` | `MLA Vd.2S,Vn.2S,Vm.S[0]` | `Vd.2S -> result` | `v7/A32/A64` |
-| uint32x4_t vmlaq_n_u32(
uint32x4_t a,
uint32x4_t b,
uint32_t c) | `a -> Vd.4S`
`b -> Vn.4S`
`c -> Vm.S[0]` | `MLA Vd.4S,Vn.4S,Vm.S[0]` | `Vd.4S -> result` | `v7/A32/A64` |
-| float32x2_t vmla_n_f32(
float32x2_t a,
float32x2_t b,
float32_t c) | `N/A` | `RESULT[I] = a[i] + (b[i] * c) for i = 0 to 1` | `N/A` | `v7/A32/A64` |
-| float32x4_t vmlaq_n_f32(
float32x4_t a,
float32x4_t b,
float32_t c) | `N/A` | `RESULT[I] = a[i] + (b[i] * c) for i = 0 to 3` | `N/A` | `v7/A32/A64` |
+| Intrinsic | Argument preparation | AArch64 Instruction | Result | Supported architectures |
+|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------|--------------------------------------------------------------|-------------------|---------------------------|
+| int16x4_t vmla_lane_s16(
int16x4_t a,
int16x4_t b,
int16x4_t v,
const int lane) | `a -> Vd.4H`
`b -> Vn.4H`
`v -> Vm.4H`
`0 <= lane <= 3` | `MLA Vd.4H,Vn.4H,Vm.H[lane]` | `Vd.4H -> result` | `v7/A32/A64` |
+| int16x8_t vmlaq_lane_s16(
int16x8_t a,
int16x8_t b,
int16x4_t v,
const int lane) | `a -> Vd.8H`
`b -> Vn.8H`
`v -> Vm.4H`
`0 <= lane <= 3` | `MLA Vd.8H,Vn.8H,Vm.H[lane]` | `Vd.8H -> result` | `v7/A32/A64` |
+| int32x2_t vmla_lane_s32(
int32x2_t a,
int32x2_t b,
int32x2_t v,
const int lane) | `a -> Vd.2S`
`b -> Vn.2S`
`v -> Vm.2S`
`0 <= lane <= 1` | `MLA Vd.2S,Vn.2S,Vm.S[lane]` | `Vd.2S -> result` | `v7/A32/A64` |
+| int32x4_t vmlaq_lane_s32(
int32x4_t a,
int32x4_t b,
int32x2_t v,
const int lane) | `a -> Vd.4S`
`b -> Vn.4S`
`v -> Vm.2S`
`0 <= lane <= 1` | `MLA Vd.4S,Vn.4S,Vm.S[lane]` | `Vd.4S -> result` | `v7/A32/A64` |
+| uint16x4_t vmla_lane_u16(
uint16x4_t a,
uint16x4_t b,
uint16x4_t v,
const int lane) | `a -> Vd.4H`
`b -> Vn.4H`
`v -> Vm.4H`
`0 <= lane <= 3` | `MLA Vd.4H,Vn.4H,Vm.H[lane]` | `Vd.4H -> result` | `v7/A32/A64` |
+| uint16x8_t vmlaq_lane_u16(
uint16x8_t a,
uint16x8_t b,
uint16x4_t v,
const int lane) | `a -> Vd.8H`
`b -> Vn.8H`
`v -> Vm.4H`
`0 <= lane <= 3` | `MLA Vd.8H,Vn.8H,Vm.H[lane]` | `Vd.8H -> result` | `v7/A32/A64` |
+| uint32x2_t vmla_lane_u32(
uint32x2_t a,
uint32x2_t b,
uint32x2_t v,
const int lane) | `a -> Vd.2S`
`b -> Vn.2S`
`v -> Vm.2S`
`0 <= lane <= 1` | `MLA Vd.2S,Vn.2S,Vm.S[lane]` | `Vd.2S -> result` | `v7/A32/A64` |
+| uint32x4_t vmlaq_lane_u32(
uint32x4_t a,
uint32x4_t b,
uint32x2_t v,
const int lane) | `a -> Vd.4S`
`b -> Vn.4S`
`v -> Vm.2S`
`0 <= lane <= 1` | `MLA Vd.4S,Vn.4S,Vm.S[lane]` | `Vd.4S -> result` | `v7/A32/A64` |
+| float32x2_t vmla_lane_f32(
float32x2_t a,
float32x2_t b,
float32x2_t v,
const int lane) | `0 <= lane <= 1` | `RESULT[i] = vadd(a[i], vmul(b[i], v[lane])) for i = 0 to 1` | `N/A` | `v7/A32/A64` |
+| float32x4_t vmlaq_lane_f32(
float32x4_t a,
float32x4_t b,
float32x2_t v,
const int lane) | `0 <= lane <= 1` | `RESULT[i] = vadd(a[i], vmul(b[i], v[lane])) for i = 0 to 3` | `N/A` | `v7/A32/A64` |
+| int16x4_t vmla_laneq_s16(
int16x4_t a,
int16x4_t b,
int16x8_t v,
const int lane) | `a -> Vd.4H`
`b -> Vn.4H`
`v -> Vm.8H`
`0 <= lane <= 7` | `MLA Vd.4H,Vn.4H,Vm.H[lane]` | `Vd.4H -> result` | `A64` |
+| int16x8_t vmlaq_laneq_s16(
int16x8_t a,
int16x8_t b,
int16x8_t v,
const int lane) | `a -> Vd.8H`
`b -> Vn.8H`
`v -> Vm.8H`
`0 <= lane <= 7` | `MLA Vd.8H,Vn.8H,Vm.H[lane]` | `Vd.8H -> result` | `A64` |
+| int32x2_t vmla_laneq_s32(
int32x2_t a,
int32x2_t b,
int32x4_t v,
const int lane) | `a -> Vd.2S`
`b -> Vn.2S`
`v -> Vm.4S`
`0 <= lane <= 3` | `MLA Vd.2S,Vn.2S,Vm.S[lane]` | `Vd.2S -> result` | `A64` |
+| int32x4_t vmlaq_laneq_s32(
int32x4_t a,
int32x4_t b,
int32x4_t v,
const int lane) | `a -> Vd.4S`
`b -> Vn.4S`
`v -> Vm.4S`
`0 <= lane <= 3` | `MLA Vd.4S,Vn.4S,Vm.S[lane]` | `Vd.4S -> result` | `A64` |
+| uint16x4_t vmla_laneq_u16(
uint16x4_t a,
uint16x4_t b,
uint16x8_t v,
const int lane) | `a -> Vd.4H`
`b -> Vn.4H`
`v -> Vm.8H`
`0 <= lane <= 7` | `MLA Vd.4H,Vn.4H,Vm.H[lane]` | `Vd.4H -> result` | `A64` |
+| uint16x8_t vmlaq_laneq_u16(
uint16x8_t a,
uint16x8_t b,
uint16x8_t v,
const int lane) | `a -> Vd.8H`
`b -> Vn.8H`
`v -> Vm.8H`
`0 <= lane <= 7` | `MLA Vd.8H,Vn.8H,Vm.H[lane]` | `Vd.8H -> result` | `A64` |
+| uint32x2_t vmla_laneq_u32(
uint32x2_t a,
uint32x2_t b,
uint32x4_t v,
const int lane) | `a -> Vd.2S`
`b -> Vn.2S`
`v -> Vm.4S`
`0 <= lane <= 3` | `MLA Vd.2S,Vn.2S,Vm.S[lane]` | `Vd.2S -> result` | `A64` |
+| uint32x4_t vmlaq_laneq_u32(
uint32x4_t a,
uint32x4_t b,
uint32x4_t v,
const int lane) | `a -> Vd.4S`
`b -> Vn.4S`
`v -> Vm.4S`
`0 <= lane <= 3` | `MLA Vd.4S,Vn.4S,Vm.S[lane]` | `Vd.4S -> result` | `A64` |
+| float32x2_t vmla_laneq_f32(
float32x2_t a,
float32x2_t b,
float32x4_t v,
const int lane) | `0 <= lane <= 3` | `RESULT[i] = vadd(a[i], vmul(b[i], v[lane])) for i = 0 to 1` | `N/A` | `A64` |
+| float32x4_t vmlaq_laneq_f32(
float32x4_t a,
float32x4_t b,
float32x4_t v,
const int lane) | `0 <= lane <= 3` | `RESULT[i] = vadd(a[i], vmul(b[i], v[lane])) for i = 0 to 3` | `N/A` | `A64` |
+| int32x4_t vmlal_lane_s16(
int32x4_t a,
int16x4_t b,
int16x4_t v,
const int lane) | `a -> Vd.4S`
`b -> Vn.4H`
`v -> Vm.4H`
`0 <= lane <= 3` | `SMLAL Vd.4S,Vn.4H,Vm.H[lane]` | `Vd.4S -> result` | `v7/A32/A64` |
+| int64x2_t vmlal_lane_s32(
int64x2_t a,
int32x2_t b,
int32x2_t v,
const int lane) | `a -> Vd.2D`
`b -> Vn.2S`
`v -> Vm.2S`
`0 <= lane <= 1` | `SMLAL Vd.2D,Vn.2S,Vm.S[lane]` | `Vd.2D -> result` | `v7/A32/A64` |
+| uint32x4_t vmlal_lane_u16(
uint32x4_t a,
uint16x4_t b,
uint16x4_t v,
const int lane) | `a -> Vd.4S`
`b -> Vn.4H`
`v -> Vm.4H`
`0 <= lane <= 3` | `UMLAL Vd.4S,Vn.4H,Vm.H[lane]` | `Vd.4S -> result` | `v7/A32/A64` |
+| uint64x2_t vmlal_lane_u32(
uint64x2_t a,
uint32x2_t b,
uint32x2_t v,
const int lane) | `a -> Vd.2D`
`b -> Vn.2S`
`v -> Vm.2S`
`0 <= lane <= 1` | `UMLAL Vd.2D,Vn.2S,Vm.S[lane]` | `Vd.2D -> result` | `v7/A32/A64` |
+| int32x4_t vmlal_high_lane_s16(
int32x4_t a,
int16x8_t b,
int16x4_t v,
const int lane) | `a -> Vd.4S`
`b -> Vn.8H`
`v -> Vm.4H`
`0 <= lane <= 3` | `SMLAL2 Vd.4S,Vn.8H,Vm.H[lane]` | `Vd.4S -> result` | `A64` |
+| int64x2_t vmlal_high_lane_s32(
int64x2_t a,
int32x4_t b,
int32x2_t v,
const int lane) | `a -> Vd.2D`
`b -> Vn.4S`
`v -> Vm.2S`
`0 <= lane <= 1` | `SMLAL2 Vd.2D,Vn.4S,Vm.S[lane]` | `Vd.2D -> result` | `A64` |
+| uint32x4_t vmlal_high_lane_u16(
uint32x4_t a,
uint16x8_t b,
uint16x4_t v,
const int lane) | `a -> Vd.4S`
`b -> Vn.8H`
`v -> Vm.4H`
`0 <= lane <= 3` | `UMLAL2 Vd.4S,Vn.8H,Vm.H[lane]` | `Vd.4S -> result` | `A64` |
+| uint64x2_t vmlal_high_lane_u32(
uint64x2_t a,
uint32x4_t b,
uint32x2_t v,
const int lane) | `a -> Vd.2D`
`b -> Vn.4S`
`v -> Vm.2S`
`0 <= lane <= 1` | `UMLAL2 Vd.2D,Vn.4S,Vm.S[lane]` | `Vd.2D -> result` | `A64` |
+| int32x4_t vmlal_laneq_s16(
int32x4_t a,
int16x4_t b,
int16x8_t v,
const int lane) | `a -> Vd.4S`
`b -> Vn.4H`
`v -> Vm.8H`
`0 <= lane <= 7` | `SMLAL Vd.4S,Vn.4H,Vm.H[lane]` | `Vd.4S -> result` | `A64` |
+| int64x2_t vmlal_laneq_s32(
int64x2_t a,
int32x2_t b,
int32x4_t v,
const int lane) | `a -> Vd.2D`
`b -> Vn.2S`
`v -> Vm.4S`
`0 <= lane <= 3` | `SMLAL Vd.2D,Vn.2S,Vm.S[lane]` | `Vd.2D -> result` | `A64` |
+| uint32x4_t vmlal_laneq_u16(
uint32x4_t a,
uint16x4_t b,
uint16x8_t v,
const int lane) | `a -> Vd.4S`
`b -> Vn.4H`
`v -> Vm.8H`
`0 <= lane <= 7` | `UMLAL Vd.4S,Vn.4H,Vm.H[lane]` | `Vd.4S -> result` | `A64` |
+| uint64x2_t vmlal_laneq_u32(
uint64x2_t a,
uint32x2_t b,
uint32x4_t v,
const int lane) | `a -> Vd.2D`
`b -> Vn.2S`
`v -> Vm.4S`
`0 <= lane <= 3` | `UMLAL Vd.2D,Vn.2S,Vm.S[lane]` | `Vd.2D -> result` | `A64` |
+| int32x4_t vmlal_high_laneq_s16(
int32x4_t a,
int16x8_t b,
int16x8_t v,
const int lane) | `a -> Vd.4S`
`b -> Vn.8H`
`v -> Vm.8H`
`0 <= lane <= 7` | `SMLAL2 Vd.4S,Vn.8H,Vm.H[lane]` | `Vd.4S -> result` | `A64` |
+| int64x2_t vmlal_high_laneq_s32(
int64x2_t a,
int32x4_t b,
int32x4_t v,
const int lane) | `a -> Vd.2D`
`b -> Vn.4S`
`v -> Vm.4S`
`0 <= lane <= 3` | `SMLAL2 Vd.2D,Vn.4S,Vm.S[lane]` | `Vd.2D -> result` | `A64` |
+| uint32x4_t vmlal_high_laneq_u16(
uint32x4_t a,
uint16x8_t b,
uint16x8_t v,
const int lane) | `a -> Vd.4S`
`b -> Vn.8H`
`v -> Vm.8H`
`0 <= lane <= 7` | `UMLAL2 Vd.4S,Vn.8H,Vm.H[lane]` | `Vd.4S -> result` | `A64` |
+| uint64x2_t vmlal_high_laneq_u32(
uint64x2_t a,
uint32x4_t b,
uint32x4_t v,
const int lane) | `a -> Vd.2D`
`b -> Vn.4S`
`v -> Vm.4S`
`0 <= lane <= 3` | `UMLAL2 Vd.2D,Vn.4S,Vm.S[lane]` | `Vd.2D -> result` | `A64` |
+| int16x4_t vmla_n_s16(
int16x4_t a,
int16x4_t b,
int16_t c) | `a -> Vd.4H`
`b -> Vn.4H`
`c -> Vm.H[0]` | `MLA Vd.4H,Vn.4H,Vm.H[0]` | `Vd.4H -> result` | `v7/A32/A64` |
+| int16x8_t vmlaq_n_s16(
int16x8_t a,
int16x8_t b,
int16_t c) | `a -> Vd.8H`
`b -> Vn.8H`
`c -> Vm.H[0]` | `MLA Vd.8H,Vn.8H,Vm.H[0]` | `Vd.8H -> result` | `v7/A32/A64` |
+| int32x2_t vmla_n_s32(
int32x2_t a,
int32x2_t b,
int32_t c) | `a -> Vd.2S`
`b -> Vn.2S`
`c -> Vm.S[0]` | `MLA Vd.2S,Vn.2S,Vm.S[0]` | `Vd.2S -> result` | `v7/A32/A64` |
+| int32x4_t vmlaq_n_s32(
int32x4_t a,
int32x4_t b,
int32_t c) | `a -> Vd.4S`
`b -> Vn.4S`
`c -> Vm.S[0]` | `MLA Vd.4S,Vn.4S,Vm.S[0]` | `Vd.4S -> result` | `v7/A32/A64` |
+| uint16x4_t vmla_n_u16(
uint16x4_t a,
uint16x4_t b,
uint16_t c) | `a -> Vd.4H`
`b -> Vn.4H`
`c -> Vm.H[0]` | `MLA Vd.4H,Vn.4H,Vm.H[0]` | `Vd.4H -> result` | `v7/A32/A64` |
+| uint16x8_t vmlaq_n_u16(
uint16x8_t a,
uint16x8_t b,
uint16_t c) | `a -> Vd.8H`
`b -> Vn.8H`
`c -> Vm.H[0]` | `MLA Vd.8H,Vn.8H,Vm.H[0]` | `Vd.8H -> result` | `v7/A32/A64` |
+| uint32x2_t vmla_n_u32(
uint32x2_t a,
uint32x2_t b,
uint32_t c) | `a -> Vd.2S`
`b -> Vn.2S`
`c -> Vm.S[0]` | `MLA Vd.2S,Vn.2S,Vm.S[0]` | `Vd.2S -> result` | `v7/A32/A64` |
+| uint32x4_t vmlaq_n_u32(
uint32x4_t a,
uint32x4_t b,
uint32_t c) | `a -> Vd.4S`
`b -> Vn.4S`
`c -> Vm.S[0]` | `MLA Vd.4S,Vn.4S,Vm.S[0]` | `Vd.4S -> result` | `v7/A32/A64` |
+| float32x2_t vmla_n_f32(
float32x2_t a,
float32x2_t b,
float32_t c) | `N/A` | `RESULT[i] = vadd(a[i], vmul(b[i], c)) for i = 0 to 1` | `N/A` | `v7/A32/A64` |
+| float32x4_t vmlaq_n_f32(
float32x4_t a,
float32x4_t b,
float32_t c) | `N/A` | `RESULT[i] = vadd(a[i], vmul(b[i], c)) for i = 0 to 3` | `N/A` | `v7/A32/A64` |
#### Vector multiply-subtract by scalar
-| Intrinsic | Argument preparation | AArch64 Instruction | Result | Supported architectures |
-|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------|------------------------------------------------------|-------------------|---------------------------|
-| int16x4_t vmls_lane_s16(
int16x4_t a,
int16x4_t b,
int16x4_t v,
const int lane) | `a -> Vd.4H`
`b -> Vn.4H`
`v -> Vm.4H`
`0 <= lane <= 3` | `MLS Vd.4H,Vn.4H,Vm.H[lane]` | `Vd.4H -> result` | `v7/A32/A64` |
-| int16x8_t vmlsq_lane_s16(
int16x8_t a,
int16x8_t b,
int16x4_t v,
const int lane) | `a -> Vd.8H`
`b -> Vn.8H`
`v -> Vm.4H`
`0 <= lane <= 3` | `MLS Vd.8H,Vn.8H,Vm.H[lane]` | `Vd.8H -> result` | `v7/A32/A64` |
-| int32x2_t vmls_lane_s32(
int32x2_t a,
int32x2_t b,
int32x2_t v,
const int lane) | `a -> Vd.2S`
`b -> Vn.2S`
`v -> Vm.2S`
`0 <= lane <= 1` | `MLS Vd.2S,Vn.2S,Vm.S[lane]` | `Vd.2S -> result` | `v7/A32/A64` |
-| int32x4_t vmlsq_lane_s32(
int32x4_t a,
int32x4_t b,
int32x2_t v,
const int lane) | `a -> Vd.4S`
`b -> Vn.4S`
`v -> Vm.2S`
`0 <= lane <= 1` | `MLS Vd.4S,Vn.4S,Vm.S[lane]` | `Vd.4S -> result` | `v7/A32/A64` |
-| uint16x4_t vmls_lane_u16(
uint16x4_t a,
uint16x4_t b,
uint16x4_t v,
const int lane) | `a -> Vd.4H`
`b -> Vn.4H`
`v -> Vm.4H`
`0 <= lane <= 3` | `MLS Vd.4H,Vn.4H,Vm.H[lane]` | `Vd.4H -> result` | `v7/A32/A64` |
-| uint16x8_t vmlsq_lane_u16(
uint16x8_t a,
uint16x8_t b,
uint16x4_t v,
const int lane) | `a -> Vd.8H`
`b -> Vn.8H`
`v -> Vm.4H`
`0 <= lane <= 3` | `MLS Vd.8H,Vn.8H,Vm.H[lane]` | `Vd.8H -> result` | `v7/A32/A64` |
-| uint32x2_t vmls_lane_u32(
uint32x2_t a,
uint32x2_t b,
uint32x2_t v,
const int lane) | `a -> Vd.2S`
`b -> Vn.2S`
`v -> Vm.2S`
`0 <= lane <= 1` | `MLS Vd.2S,Vn.2S,Vm.S[lane]` | `Vd.2S -> result` | `v7/A32/A64` |
-| uint32x4_t vmlsq_lane_u32(
uint32x4_t a,
uint32x4_t b,
uint32x2_t v,
const int lane) | `a -> Vd.4S`
`b -> Vn.4S`
`v -> Vm.2S`
`0 <= lane <= 1` | `MLS Vd.4S,Vn.4S,Vm.S[lane]` | `Vd.4S -> result` | `v7/A32/A64` |
-| float32x2_t vmls_lane_f32(
float32x2_t a,
float32x2_t b,
float32x2_t v,
const int lane) | `0 <= lane <= 1` | `RESULT[I] = a[i] - (b[i] * v[lane]) for i = 0 to 1` | `N/A` | `v7/A32/A64` |
-| float32x4_t vmlsq_lane_f32(
float32x4_t a,
float32x4_t b,
float32x2_t v,
const int lane) | `0 <= lane <= 1` | `RESULT[I] = a[i] - (b[i] * v[lane]) for i = 0 to 3` | `N/A` | `v7/A32/A64` |
-| int16x4_t vmls_laneq_s16(
int16x4_t a,
int16x4_t b,
int16x8_t v,
const int lane) | `a -> Vd.4H`
`b -> Vn.4H`
`v -> Vm.8H`
`0 <= lane <= 7` | `MLS Vd.4H,Vn.4H,Vm.H[lane]` | `Vd.4H -> result` | `A64` |
-| int16x8_t vmlsq_laneq_s16(
int16x8_t a,
int16x8_t b,
int16x8_t v,
const int lane) | `a -> Vd.8H`
`b -> Vn.8H`
`v -> Vm.8H`
`0 <= lane <= 7` | `MLS Vd.8H,Vn.8H,Vm.H[lane]` | `Vd.8H -> result` | `A64` |
-| int32x2_t vmls_laneq_s32(
int32x2_t a,
int32x2_t b,
int32x4_t v,
const int lane) | `a -> Vd.2S`
`b -> Vn.2S`
`v -> Vm.4S`
`0 <= lane <= 3` | `MLS Vd.2S,Vn.2S,Vm.S[lane]` | `Vd.2S -> result` | `A64` |
-| int32x4_t vmlsq_laneq_s32(
int32x4_t a,
int32x4_t b,
int32x4_t v,
const int lane) | `a -> Vd.4S`
`b -> Vn.4S`
`v -> Vm.4S`
`0 <= lane <= 3` | `MLS Vd.4S,Vn.4S,Vm.S[lane]` | `Vd.4S -> result` | `A64` |
-| uint16x4_t vmls_laneq_u16(
uint16x4_t a,
uint16x4_t b,
uint16x8_t v,
const int lane) | `a -> Vd.4H`
`b -> Vn.4H`
`v -> Vm.8H`
`0 <= lane <= 7` | `MLS Vd.4H,Vn.4H,Vm.H[lane]` | `Vd.4H -> result` | `A64` |
-| uint16x8_t vmlsq_laneq_u16(
uint16x8_t a,
uint16x8_t b,
uint16x8_t v,
const int lane) | `a -> Vd.8H`
`b -> Vn.8H`
`v -> Vm.8H`
`0 <= lane <= 7` | `MLS Vd.8H,Vn.8H,Vm.H[lane]` | `Vd.8H -> result` | `A64` |
-| uint32x2_t vmls_laneq_u32(
uint32x2_t a,
uint32x2_t b,
uint32x4_t v,
const int lane) | `a -> Vd.2S`
`b -> Vn.2S`
`v -> Vm.4S`
`0 <= lane <= 3` | `MLS Vd.2S,Vn.2S,Vm.S[lane]` | `Vd.2S -> result` | `A64` |
-| uint32x4_t vmlsq_laneq_u32(
uint32x4_t a,
uint32x4_t b,
uint32x4_t v,
const int lane) | `a -> Vd.4S`
`b -> Vn.4S`
`v -> Vm.4S`
`0 <= lane <= 3` | `MLS Vd.4S,Vn.4S,Vm.S[lane]` | `Vd.4S -> result` | `A64` |
-| float32x2_t vmls_laneq_f32(
float32x2_t a,
float32x2_t b,
float32x4_t v,
const int lane) | `0 <= lane <= 3` | `RESULT[I] = a[i] - (b[i] * v[lane]) for i = 0 to 1` | `N/A` | `A64` |
-| float32x4_t vmlsq_laneq_f32(
float32x4_t a,
float32x4_t b,
float32x4_t v,
const int lane) | `0 <= lane <= 3` | `RESULT[I] = a[i] - (b[i] * v[lane]) for i = 0 to 3` | `N/A` | `A64` |
-| int32x4_t vmlsl_lane_s16(
int32x4_t a,
int16x4_t b,
int16x4_t v,
const int lane) | `a -> Vd.4S`
`b -> Vn.4H`
`v -> Vm.4H`
`0 <= lane <= 3` | `SMLSL Vd.4S,Vn.4H,Vm.H[lane]` | `Vd.4S -> result` | `v7/A32/A64` |
-| int64x2_t vmlsl_lane_s32(
int64x2_t a,
int32x2_t b,
int32x2_t v,
const int lane) | `a -> Vd.2D`
`b -> Vn.2S`
`v -> Vm.2S`
`0 <= lane <= 1` | `SMLSL Vd.2D,Vn.2S,Vm.S[lane]` | `Vd.2D -> result` | `v7/A32/A64` |
-| uint32x4_t vmlsl_lane_u16(
uint32x4_t a,
uint16x4_t b,
uint16x4_t v,
const int lane) | `a -> Vd.4S`
`b -> Vn.4H`
`v -> Vm.4H`
`0 <= lane <= 3` | `UMLSL Vd.4S,Vn.4H,Vm.H[lane]` | `Vd.4S -> result` | `v7/A32/A64` |
-| uint64x2_t vmlsl_lane_u32(
uint64x2_t a,
uint32x2_t b,
uint32x2_t v,
const int lane) | `a -> Vd.2D`
`b -> Vn.2S`
`v -> Vm.2S`
`0 <= lane <= 1` | `UMLSL Vd.2D,Vn.2S,Vm.S[lane]` | `Vd.2D -> result` | `v7/A32/A64` |
-| int32x4_t vmlsl_high_lane_s16(
int32x4_t a,
int16x8_t b,
int16x4_t v,
const int lane) | `a -> Vd.4S`
`b -> Vn.8H`
`v -> Vm.4H`
`0 <= lane <= 3` | `SMLSL2 Vd.4S,Vn.8H,Vm.H[lane]` | `Vd.4S -> result` | `A64` |
-| int64x2_t vmlsl_high_lane_s32(
int64x2_t a,
int32x4_t b,
int32x2_t v,
const int lane) | `a -> Vd.2D`
`b -> Vn.4S`
`v -> Vm.2S`
`0 <= lane <= 1` | `SMLSL2 Vd.2D,Vn.4S,Vm.S[lane]` | `Vd.2D -> result` | `A64` |
-| uint32x4_t vmlsl_high_lane_u16(
uint32x4_t a,
uint16x8_t b,
uint16x4_t v,
const int lane) | `a -> Vd.4S`
`b -> Vn.8H`
`v -> Vm.4H`
`0 <= lane <= 3` | `UMLSL2 Vd.4S,Vn.8H,Vm.H[lane]` | `Vd.4S -> result` | `A64` |
-| uint64x2_t vmlsl_high_lane_u32(
uint64x2_t a,
uint32x4_t b,
uint32x2_t v,
const int lane) | `a -> Vd.2D`
`b -> Vn.4S`
`v -> Vm.2S`
`0 <= lane <= 1` | `UMLSL2 Vd.2D,Vn.4S,Vm.S[lane]` | `Vd.2D -> result` | `A64` |
-| int32x4_t vmlsl_laneq_s16(
int32x4_t a,
int16x4_t b,
int16x8_t v,
const int lane) | `a -> Vd.4S`
`b -> Vn.4H`
`v -> Vm.8H`
`0 <= lane <= 7` | `SMLSL Vd.4S,Vn.4H,Vm.H[lane]` | `Vd.4S -> result` | `A64` |
-| int64x2_t vmlsl_laneq_s32(
int64x2_t a,
int32x2_t b,
int32x4_t v,
const int lane) | `a -> Vd.2D`
`b -> Vn.2S`
`v -> Vm.4S`
`0 <= lane <= 3` | `SMLSL Vd.2D,Vn.2S,Vm.S[lane]` | `Vd.2D -> result` | `A64` |
-| uint32x4_t vmlsl_laneq_u16(
uint32x4_t a,
uint16x4_t b,
uint16x8_t v,
const int lane) | `a -> Vd.4S`
`b -> Vn.4H`
`v -> Vm.8H`
`0 <= lane <= 7` | `UMLSL Vd.4S,Vn.4H,Vm.H[lane]` | `Vd.4S -> result` | `A64` |
-| uint64x2_t vmlsl_laneq_u32(
uint64x2_t a,
uint32x2_t b,
uint32x4_t v,
const int lane) | `a -> Vd.2D`
`b -> Vn.2S`
`v -> Vm.4S`
`0 <= lane <= 3` | `UMLSL Vd.2D,Vn.2S,Vm.S[lane]` | `Vd.2D -> result` | `A64` |
-| int32x4_t vmlsl_high_laneq_s16(
int32x4_t a,
int16x8_t b,
int16x8_t v,
const int lane) | `a -> Vd.4S`
`b -> Vn.8H`
`v -> Vm.8H`
`0 <= lane <= 7` | `SMLSL2 Vd.4S,Vn.8H,Vm.H[lane]` | `Vd.4S -> result` | `A64` |
-| int64x2_t vmlsl_high_laneq_s32(
int64x2_t a,
int32x4_t b,
int32x4_t v,
const int lane) | `a -> Vd.2D`
`b -> Vn.4S`
`v -> Vm.4S`
`0 <= lane <= 3` | `SMLSL2 Vd.2D,Vn.4S,Vm.S[lane]` | `Vd.2D -> result` | `A64` |
-| uint32x4_t vmlsl_high_laneq_u16(
uint32x4_t a,
uint16x8_t b,
uint16x8_t v,
const int lane) | `a -> Vd.4S`
`b -> Vn.8H`
`v -> Vm.8H`
`0 <= lane <= 7` | `UMLSL2 Vd.4S,Vn.8H,Vm.H[lane]` | `Vd.4S -> result` | `A64` |
-| uint64x2_t vmlsl_high_laneq_u32(
uint64x2_t a,
uint32x4_t b,
uint32x4_t v,
const int lane) | `a -> Vd.2D`
`b -> Vn.4S`
`v -> Vm.4S`
`0 <= lane <= 3` | `UMLSL2 Vd.2D,Vn.4S,Vm.S[lane]` | `Vd.2D -> result` | `A64` |
+| Intrinsic | Argument preparation | AArch64 Instruction | Result | Supported architectures |
+|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------|--------------------------------------------------------------|-------------------|---------------------------|
+| int16x4_t vmls_lane_s16(
int16x4_t a,
int16x4_t b,
int16x4_t v,
const int lane) | `a -> Vd.4H`
`b -> Vn.4H`
`v -> Vm.4H`
`0 <= lane <= 3` | `MLS Vd.4H,Vn.4H,Vm.H[lane]` | `Vd.4H -> result` | `v7/A32/A64` |
+| int16x8_t vmlsq_lane_s16(
int16x8_t a,
int16x8_t b,
int16x4_t v,
const int lane) | `a -> Vd.8H`
`b -> Vn.8H`
`v -> Vm.4H`
`0 <= lane <= 3` | `MLS Vd.8H,Vn.8H,Vm.H[lane]` | `Vd.8H -> result` | `v7/A32/A64` |
+| int32x2_t vmls_lane_s32(
int32x2_t a,
int32x2_t b,
int32x2_t v,
const int lane) | `a -> Vd.2S`
`b -> Vn.2S`
`v -> Vm.2S`
`0 <= lane <= 1` | `MLS Vd.2S,Vn.2S,Vm.S[lane]` | `Vd.2S -> result` | `v7/A32/A64` |
+| int32x4_t vmlsq_lane_s32(
int32x4_t a,
int32x4_t b,
int32x2_t v,
const int lane) | `a -> Vd.4S`
`b -> Vn.4S`
`v -> Vm.2S`
`0 <= lane <= 1` | `MLS Vd.4S,Vn.4S,Vm.S[lane]` | `Vd.4S -> result` | `v7/A32/A64` |
+| uint16x4_t vmls_lane_u16(
uint16x4_t a,
uint16x4_t b,
uint16x4_t v,
const int lane) | `a -> Vd.4H`
`b -> Vn.4H`
`v -> Vm.4H`
`0 <= lane <= 3` | `MLS Vd.4H,Vn.4H,Vm.H[lane]` | `Vd.4H -> result` | `v7/A32/A64` |
+| uint16x8_t vmlsq_lane_u16(
uint16x8_t a,
uint16x8_t b,
uint16x4_t v,
const int lane) | `a -> Vd.8H`
`b -> Vn.8H`
`v -> Vm.4H`
`0 <= lane <= 3` | `MLS Vd.8H,Vn.8H,Vm.H[lane]` | `Vd.8H -> result` | `v7/A32/A64` |
+| uint32x2_t vmls_lane_u32(
uint32x2_t a,
uint32x2_t b,
uint32x2_t v,
const int lane) | `a -> Vd.2S`
`b -> Vn.2S`
`v -> Vm.2S`
`0 <= lane <= 1` | `MLS Vd.2S,Vn.2S,Vm.S[lane]` | `Vd.2S -> result` | `v7/A32/A64` |
+| uint32x4_t vmlsq_lane_u32(
uint32x4_t a,
uint32x4_t b,
uint32x2_t v,
const int lane) | `a -> Vd.4S`
`b -> Vn.4S`
`v -> Vm.2S`
`0 <= lane <= 1` | `MLS Vd.4S,Vn.4S,Vm.S[lane]` | `Vd.4S -> result` | `v7/A32/A64` |
+| float32x2_t vmls_lane_f32(
float32x2_t a,
float32x2_t b,
float32x2_t v,
const int lane) | `0 <= lane <= 1` | `RESULT[I] = vsub(a[i], vmul(b[i], v[lane])) for i = 0 to 1` | `N/A` | `v7/A32/A64` |
+| float32x4_t vmlsq_lane_f32(
float32x4_t a,
float32x4_t b,
float32x2_t v,
const int lane) | `0 <= lane <= 1` | `RESULT[I] = vsub(a[i], vmul(b[i], v[lane])) for i = 0 to 3` | `N/A` | `v7/A32/A64` |
+| int16x4_t vmls_laneq_s16(
int16x4_t a,
int16x4_t b,
int16x8_t v,
const int lane) | `a -> Vd.4H`
`b -> Vn.4H`
`v -> Vm.8H`
`0 <= lane <= 7` | `MLS Vd.4H,Vn.4H,Vm.H[lane]` | `Vd.4H -> result` | `A64` |
+| int16x8_t vmlsq_laneq_s16(
int16x8_t a,
int16x8_t b,
int16x8_t v,
const int lane) | `a -> Vd.8H`
`b -> Vn.8H`
`v -> Vm.8H`
`0 <= lane <= 7` | `MLS Vd.8H,Vn.8H,Vm.H[lane]` | `Vd.8H -> result` | `A64` |
+| int32x2_t vmls_laneq_s32(
int32x2_t a,
int32x2_t b,
int32x4_t v,
const int lane) | `a -> Vd.2S`
`b -> Vn.2S`
`v -> Vm.4S`
`0 <= lane <= 3` | `MLS Vd.2S,Vn.2S,Vm.S[lane]` | `Vd.2S -> result` | `A64` |
+| int32x4_t vmlsq_laneq_s32(
int32x4_t a,
int32x4_t b,
int32x4_t v,
const int lane) | `a -> Vd.4S`
`b -> Vn.4S`
`v -> Vm.4S`
`0 <= lane <= 3` | `MLS Vd.4S,Vn.4S,Vm.S[lane]` | `Vd.4S -> result` | `A64` |
+| uint16x4_t vmls_laneq_u16(
uint16x4_t a,
uint16x4_t b,
uint16x8_t v,
const int lane) | `a -> Vd.4H`
`b -> Vn.4H`
`v -> Vm.8H`
`0 <= lane <= 7` | `MLS Vd.4H,Vn.4H,Vm.H[lane]` | `Vd.4H -> result` | `A64` |
+| uint16x8_t vmlsq_laneq_u16(
uint16x8_t a,
uint16x8_t b,
uint16x8_t v,
const int lane) | `a -> Vd.8H`
`b -> Vn.8H`
`v -> Vm.8H`
`0 <= lane <= 7` | `MLS Vd.8H,Vn.8H,Vm.H[lane]` | `Vd.8H -> result` | `A64` |
+| uint32x2_t vmls_laneq_u32(
uint32x2_t a,
uint32x2_t b,
uint32x4_t v,
const int lane) | `a -> Vd.2S`
`b -> Vn.2S`
`v -> Vm.4S`
`0 <= lane <= 3` | `MLS Vd.2S,Vn.2S,Vm.S[lane]` | `Vd.2S -> result` | `A64` |
+| uint32x4_t vmlsq_laneq_u32(
uint32x4_t a,
uint32x4_t b,
uint32x4_t v,
const int lane) | `a -> Vd.4S`
`b -> Vn.4S`
`v -> Vm.4S`
`0 <= lane <= 3` | `MLS Vd.4S,Vn.4S,Vm.S[lane]` | `Vd.4S -> result` | `A64` |
+| float32x2_t vmls_laneq_f32(
float32x2_t a,
float32x2_t b,
float32x4_t v,
const int lane) | `0 <= lane <= 3` | `RESULT[I] = vsub(a[i], vmul(b[i], v[lane])) for i = 0 to 1` | `N/A` | `A64` |
+| float32x4_t vmlsq_laneq_f32(
float32x4_t a,
float32x4_t b,
float32x4_t v,
const int lane) | `0 <= lane <= 3` | `RESULT[I] = vsub(a[i], vmul(b[i], v[lane])) for i = 0 to 3` | `N/A` | `A64` |
+| int32x4_t vmlsl_lane_s16(
int32x4_t a,
int16x4_t b,
int16x4_t v,
const int lane) | `a -> Vd.4S`
`b -> Vn.4H`
`v -> Vm.4H`
`0 <= lane <= 3` | `SMLSL Vd.4S,Vn.4H,Vm.H[lane]` | `Vd.4S -> result` | `v7/A32/A64` |
+| int64x2_t vmlsl_lane_s32(
int64x2_t a,
int32x2_t b,
int32x2_t v,
const int lane) | `a -> Vd.2D`
`b -> Vn.2S`
`v -> Vm.2S`
`0 <= lane <= 1` | `SMLSL Vd.2D,Vn.2S,Vm.S[lane]` | `Vd.2D -> result` | `v7/A32/A64` |
+| uint32x4_t vmlsl_lane_u16(
uint32x4_t a,
uint16x4_t b,
uint16x4_t v,
const int lane) | `a -> Vd.4S`
`b -> Vn.4H`
`v -> Vm.4H`
`0 <= lane <= 3` | `UMLSL Vd.4S,Vn.4H,Vm.H[lane]` | `Vd.4S -> result` | `v7/A32/A64` |
+| uint64x2_t vmlsl_lane_u32(
uint64x2_t a,
uint32x2_t b,
uint32x2_t v,
const int lane) | `a -> Vd.2D`
`b -> Vn.2S`
`v -> Vm.2S`
`0 <= lane <= 1` | `UMLSL Vd.2D,Vn.2S,Vm.S[lane]` | `Vd.2D -> result` | `v7/A32/A64` |
+| int32x4_t vmlsl_high_lane_s16(
int32x4_t a,
int16x8_t b,
int16x4_t v,
const int lane) | `a -> Vd.4S`
`b -> Vn.8H`
`v -> Vm.4H`
`0 <= lane <= 3` | `SMLSL2 Vd.4S,Vn.8H,Vm.H[lane]` | `Vd.4S -> result` | `A64` |
+| int64x2_t vmlsl_high_lane_s32(
int64x2_t a,
int32x4_t b,
int32x2_t v,
const int lane) | `a -> Vd.2D`
`b -> Vn.4S`
`v -> Vm.2S`
`0 <= lane <= 1` | `SMLSL2 Vd.2D,Vn.4S,Vm.S[lane]` | `Vd.2D -> result` | `A64` |
+| uint32x4_t vmlsl_high_lane_u16(
uint32x4_t a,
uint16x8_t b,
uint16x4_t v,
const int lane) | `a -> Vd.4S`
`b -> Vn.8H`
`v -> Vm.4H`
`0 <= lane <= 3` | `UMLSL2 Vd.4S,Vn.8H,Vm.H[lane]` | `Vd.4S -> result` | `A64` |
+| uint64x2_t vmlsl_high_lane_u32(
uint64x2_t a,
uint32x4_t b,
uint32x2_t v,
const int lane) | `a -> Vd.2D`
`b -> Vn.4S`
`v -> Vm.2S`
`0 <= lane <= 1` | `UMLSL2 Vd.2D,Vn.4S,Vm.S[lane]` | `Vd.2D -> result` | `A64` |
+| int32x4_t vmlsl_laneq_s16(
int32x4_t a,
int16x4_t b,
int16x8_t v,
const int lane) | `a -> Vd.4S`
`b -> Vn.4H`
`v -> Vm.8H`
`0 <= lane <= 7` | `SMLSL Vd.4S,Vn.4H,Vm.H[lane]` | `Vd.4S -> result` | `A64` |
+| int64x2_t vmlsl_laneq_s32(
int64x2_t a,
int32x2_t b,
int32x4_t v,
const int lane) | `a -> Vd.2D`
`b -> Vn.2S`
`v -> Vm.4S`
`0 <= lane <= 3` | `SMLSL Vd.2D,Vn.2S,Vm.S[lane]` | `Vd.2D -> result` | `A64` |
+| uint32x4_t vmlsl_laneq_u16(
uint32x4_t a,
uint16x4_t b,
uint16x8_t v,
const int lane) | `a -> Vd.4S`
`b -> Vn.4H`
`v -> Vm.8H`
`0 <= lane <= 7` | `UMLSL Vd.4S,Vn.4H,Vm.H[lane]` | `Vd.4S -> result` | `A64` |
+| uint64x2_t vmlsl_laneq_u32(
uint64x2_t a,
uint32x2_t b,
uint32x4_t v,
const int lane) | `a -> Vd.2D`
`b -> Vn.2S`
`v -> Vm.4S`
`0 <= lane <= 3` | `UMLSL Vd.2D,Vn.2S,Vm.S[lane]` | `Vd.2D -> result` | `A64` |
+| int32x4_t vmlsl_high_laneq_s16(
int32x4_t a,
int16x8_t b,
int16x8_t v,
const int lane) | `a -> Vd.4S`
`b -> Vn.8H`
`v -> Vm.8H`
`0 <= lane <= 7` | `SMLSL2 Vd.4S,Vn.8H,Vm.H[lane]` | `Vd.4S -> result` | `A64` |
+| int64x2_t vmlsl_high_laneq_s32(
int64x2_t a,
int32x4_t b,
int32x4_t v,
const int lane) | `a -> Vd.2D`
`b -> Vn.4S`
`v -> Vm.4S`
`0 <= lane <= 3` | `SMLSL2 Vd.2D,Vn.4S,Vm.S[lane]` | `Vd.2D -> result` | `A64` |
+| uint32x4_t vmlsl_high_laneq_u16(
uint32x4_t a,
uint16x8_t b,
uint16x8_t v,
const int lane) | `a -> Vd.4S`
`b -> Vn.8H`
`v -> Vm.8H`
`0 <= lane <= 7` | `UMLSL2 Vd.4S,Vn.8H,Vm.H[lane]` | `Vd.4S -> result` | `A64` |
+| uint64x2_t vmlsl_high_laneq_u32(
uint64x2_t a,
uint32x4_t b,
uint32x4_t v,
const int lane) | `a -> Vd.2D`
`b -> Vn.4S`
`v -> Vm.4S`
`0 <= lane <= 3` | `UMLSL2 Vd.2D,Vn.4S,Vm.S[lane]` | `Vd.2D -> result` | `A64` |
#### Vector multiply by scalar
@@ -2829,34 +2829,34 @@ The intrinsics in this section are guarded by the macro ``__ARM_NEON``.
#### Vector multiply-accumulate by scalar and widen
-| Intrinsic | Argument preparation | AArch64 Instruction | Result | Supported architectures |
-|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------|------------------------------------------------|-------------------|---------------------------|
-| int32x4_t vmlal_n_s16(
int32x4_t a,
int16x4_t b,
int16_t c) | `a -> Vd.4S`
`b -> Vn.4H`
`c -> Vm.H[0]` | `SMLAL Vd.4S,Vn.4H,Vm.H[0]` | `Vd.4S -> result` | `v7/A32/A64` |
-| int64x2_t vmlal_n_s32(
int64x2_t a,
int32x2_t b,
int32_t c) | `a -> Vd.2D`
`b -> Vn.2S`
`c -> Vm.S[0]` | `SMLAL Vd.2D,Vn.2S,Vm.S[0]` | `Vd.2D -> result` | `v7/A32/A64` |
-| uint32x4_t vmlal_n_u16(
uint32x4_t a,
uint16x4_t b,
uint16_t c) | `a -> Vd.4S`
`b -> Vn.4H`
`c -> Vm.H[0]` | `UMLAL Vd.4S,Vn.4H,Vm.H[0]` | `Vd.4S -> result` | `v7/A32/A64` |
-| uint64x2_t vmlal_n_u32(
uint64x2_t a,
uint32x2_t b,
uint32_t c) | `a -> Vd.2D`
`b -> Vn.2S`
`c -> Vm.S[0]` | `UMLAL Vd.2D,Vn.2S,Vm.S[0]` | `Vd.2D -> result` | `v7/A32/A64` |
-| int32x4_t vmlal_high_n_s16(
int32x4_t a,
int16x8_t b,
int16_t c) | `a -> Vd.4S`
`b -> Vn.8H`
`c -> Vm.H[0]` | `SMLAL2 Vd.4S,Vn.8H,Vm.H[0]` | `Vd.4S -> result` | `A64` |
-| int64x2_t vmlal_high_n_s32(
int64x2_t a,
int32x4_t b,
int32_t c) | `a -> Vd.2D`
`b -> Vn.4S`
`c -> Vm.S[0]` | `SMLAL2 Vd.2D,Vn.4S,Vm.S[0]` | `Vd.2D -> result` | `A64` |
-| uint32x4_t vmlal_high_n_u16(
uint32x4_t a,
uint16x8_t b,
uint16_t c) | `a -> Vd.4S`
`b -> Vn.8H`
`c -> Vm.H[0]` | `UMLAL2 Vd.4S,Vn.8H,Vm.H[0]` | `Vd.4S -> result` | `A64` |
-| uint64x2_t vmlal_high_n_u32(
uint64x2_t a,
uint32x4_t b,
uint32_t c) | `a -> Vd.2D`
`b -> Vn.4S`
`c -> Vm.S[0]` | `UMLAL2 Vd.2D,Vn.4S,Vm.S[0]` | `Vd.2D -> result` | `A64` |
-| int16x4_t vmls_n_s16(
int16x4_t a,
int16x4_t b,
int16_t c) | `a -> Vd.4H`
`b -> Vn.4H`
`c -> Vm.H[0]` | `MLS Vd.4H,Vn.4H,Vm.H[0]` | `Vd.4H -> result` | `v7/A32/A64` |
-| int16x8_t vmlsq_n_s16(
int16x8_t a,
int16x8_t b,
int16_t c) | `a -> Vd.8H`
`b -> Vn.8H`
`c -> Vm.H[0]` | `MLS Vd.8H,Vn.8H,Vm.H[0]` | `Vd.8H -> result` | `v7/A32/A64` |
-| int32x2_t vmls_n_s32(
int32x2_t a,
int32x2_t b,
int32_t c) | `a -> Vd.2S`
`b -> Vn.2S`
`c -> Vm.S[0]` | `MLS Vd.2S,Vn.2S,Vm.S[0]` | `Vd.2S -> result` | `v7/A32/A64` |
-| int32x4_t vmlsq_n_s32(
int32x4_t a,
int32x4_t b,
int32_t c) | `a -> Vd.4S`
`b -> Vn.4S`
`c -> Vm.S[0]` | `MLS Vd.4S,Vn.4S,Vm.S[0]` | `Vd.4S -> result` | `v7/A32/A64` |
-| uint16x4_t vmls_n_u16(
uint16x4_t a,
uint16x4_t b,
uint16_t c) | `a -> Vd.4H`
`b -> Vn.4H`
`c -> Vm.H[0]` | `MLS Vd.4H,Vn.4H,Vm.H[0]` | `Vd.4H -> result` | `v7/A32/A64` |
-| uint16x8_t vmlsq_n_u16(
uint16x8_t a,
uint16x8_t b,
uint16_t c) | `a -> Vd.8H`
`b -> Vn.8H`
`c -> Vm.H[0]` | `MLS Vd.8H,Vn.8H,Vm.H[0]` | `Vd.8H -> result` | `v7/A32/A64` |
-| uint32x2_t vmls_n_u32(
uint32x2_t a,
uint32x2_t b,
uint32_t c) | `a -> Vd.2S`
`b -> Vn.2S`
`c -> Vm.S[0]` | `MLS Vd.2S,Vn.2S,Vm.S[0]` | `Vd.2S -> result` | `v7/A32/A64` |
-| uint32x4_t vmlsq_n_u32(
uint32x4_t a,
uint32x4_t b,
uint32_t c) | `a -> Vd.4S`
`b -> Vn.4S`
`c -> Vm.S[0]` | `MLS Vd.4S,Vn.4S,Vm.S[0]` | `Vd.4S -> result` | `v7/A32/A64` |
-| float32x2_t vmls_n_f32(
float32x2_t a,
float32x2_t b,
float32_t c) | `N/A` | `RESULT[I] = a[i] - (b[i] * c) for i = 0 to 1` | `N/A` | `v7/A32/A64` |
-| float32x4_t vmlsq_n_f32(
float32x4_t a,
float32x4_t b,
float32_t c) | `N/A` | `RESULT[I] = a[i] - (b[i] * c) for i = 0 to 3` | `N/A` | `v7/A32/A64` |
-| int32x4_t vmlsl_n_s16(
int32x4_t a,
int16x4_t b,
int16_t c) | `a -> Vd.4S`
`b -> Vn.4H`
`c -> Vm.H[0]` | `SMLSL Vd.4S,Vn.4H,Vm.H[0]` | `Vd.4S -> result` | `v7/A32/A64` |
-| int64x2_t vmlsl_n_s32(
int64x2_t a,
int32x2_t b,
int32_t c) | `a -> Vd.2D`
`b -> Vn.2S`
`c -> Vm.S[0]` | `SMLSL Vd.2D,Vn.2S,Vm.S[0]` | `Vd.2D -> result` | `v7/A32/A64` |
-| uint32x4_t vmlsl_n_u16(
uint32x4_t a,
uint16x4_t b,
uint16_t c) | `a -> Vd.4S`
`b -> Vn.4H`
`c -> Vm.H[0]` | `UMLSL Vd.4S,Vn.4H,Vm.H[0]` | `Vd.4S -> result` | `v7/A32/A64` |
-| uint64x2_t vmlsl_n_u32(
uint64x2_t a,
uint32x2_t b,
uint32_t c) | `a -> Vd.2D`
`b -> Vn.2S`
`c -> Vm.S[0]` | `UMLSL Vd.2D,Vn.2S,Vm.S[0]` | `Vd.2D -> result` | `v7/A32/A64` |
-| int32x4_t vmlsl_high_n_s16(
int32x4_t a,
int16x8_t b,
int16_t c) | `a -> Vd.4S`
`b -> Vn.8H`
`c -> Vm.H[0]` | `SMLSL2 Vd.4S,Vn.8H,Vm.H[0]` | `Vd.4S -> result` | `A64` |
-| int64x2_t vmlsl_high_n_s32(
int64x2_t a,
int32x4_t b,
int32_t c) | `a -> Vd.2D`
`b -> Vn.4S`
`c -> Vm.S[0]` | `SMLSL2 Vd.2D,Vn.4S,Vm.S[0]` | `Vd.2D -> result` | `A64` |
-| uint32x4_t vmlsl_high_n_u16(
uint32x4_t a,
uint16x8_t b,
uint16_t c) | `a -> Vd.4S`
`b -> Vn.8H`
`c -> Vm.H[0]` | `UMLSL2 Vd.4S,Vn.8H,Vm.H[0]` | `Vd.4S -> result` | `A64` |
-| uint64x2_t vmlsl_high_n_u32(
uint64x2_t a,
uint32x4_t b,
uint32_t c) | `a -> Vd.2D`
`b -> Vn.4S`
`c -> Vm.S[0]` | `UMLSL2 Vd.2D,Vn.4S,Vm.S[0]` | `Vd.2D -> result` | `A64` |
+| Intrinsic | Argument preparation | AArch64 Instruction | Result | Supported architectures |
+|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------|--------------------------------------------------------|-------------------|---------------------------|
+| int32x4_t vmlal_n_s16(
int32x4_t a,
int16x4_t b,
int16_t c) | `a -> Vd.4S`
`b -> Vn.4H`
`c -> Vm.H[0]` | `SMLAL Vd.4S,Vn.4H,Vm.H[0]` | `Vd.4S -> result` | `v7/A32/A64` |
+| int64x2_t vmlal_n_s32(
int64x2_t a,
int32x2_t b,
int32_t c) | `a -> Vd.2D`
`b -> Vn.2S`
`c -> Vm.S[0]` | `SMLAL Vd.2D,Vn.2S,Vm.S[0]` | `Vd.2D -> result` | `v7/A32/A64` |
+| uint32x4_t vmlal_n_u16(
uint32x4_t a,
uint16x4_t b,
uint16_t c) | `a -> Vd.4S`
`b -> Vn.4H`
`c -> Vm.H[0]` | `UMLAL Vd.4S,Vn.4H,Vm.H[0]` | `Vd.4S -> result` | `v7/A32/A64` |
+| uint64x2_t vmlal_n_u32(
uint64x2_t a,
uint32x2_t b,
uint32_t c) | `a -> Vd.2D`
`b -> Vn.2S`
`c -> Vm.S[0]` | `UMLAL Vd.2D,Vn.2S,Vm.S[0]` | `Vd.2D -> result` | `v7/A32/A64` |
+| int32x4_t vmlal_high_n_s16(
int32x4_t a,
int16x8_t b,
int16_t c) | `a -> Vd.4S`
`b -> Vn.8H`
`c -> Vm.H[0]` | `SMLAL2 Vd.4S,Vn.8H,Vm.H[0]` | `Vd.4S -> result` | `A64` |
+| int64x2_t vmlal_high_n_s32(
int64x2_t a,
int32x4_t b,
int32_t c) | `a -> Vd.2D`
`b -> Vn.4S`
`c -> Vm.S[0]` | `SMLAL2 Vd.2D,Vn.4S,Vm.S[0]` | `Vd.2D -> result` | `A64` |
+| uint32x4_t vmlal_high_n_u16(
uint32x4_t a,
uint16x8_t b,
uint16_t c) | `a -> Vd.4S`
`b -> Vn.8H`
`c -> Vm.H[0]` | `UMLAL2 Vd.4S,Vn.8H,Vm.H[0]` | `Vd.4S -> result` | `A64` |
+| uint64x2_t vmlal_high_n_u32(
uint64x2_t a,
uint32x4_t b,
uint32_t c) | `a -> Vd.2D`
`b -> Vn.4S`
`c -> Vm.S[0]` | `UMLAL2 Vd.2D,Vn.4S,Vm.S[0]` | `Vd.2D -> result` | `A64` |
+| int16x4_t vmls_n_s16(
int16x4_t a,
int16x4_t b,
int16_t c) | `a -> Vd.4H`
`b -> Vn.4H`
`c -> Vm.H[0]` | `MLS Vd.4H,Vn.4H,Vm.H[0]` | `Vd.4H -> result` | `v7/A32/A64` |
+| int16x8_t vmlsq_n_s16(
int16x8_t a,
int16x8_t b,
int16_t c) | `a -> Vd.8H`
`b -> Vn.8H`
`c -> Vm.H[0]` | `MLS Vd.8H,Vn.8H,Vm.H[0]` | `Vd.8H -> result` | `v7/A32/A64` |
+| int32x2_t vmls_n_s32(
int32x2_t a,
int32x2_t b,
int32_t c) | `a -> Vd.2S`
`b -> Vn.2S`
`c -> Vm.S[0]` | `MLS Vd.2S,Vn.2S,Vm.S[0]` | `Vd.2S -> result` | `v7/A32/A64` |
+| int32x4_t vmlsq_n_s32(
int32x4_t a,
int32x4_t b,
int32_t c) | `a -> Vd.4S`
`b -> Vn.4S`
`c -> Vm.S[0]` | `MLS Vd.4S,Vn.4S,Vm.S[0]` | `Vd.4S -> result` | `v7/A32/A64` |
+| uint16x4_t vmls_n_u16(
uint16x4_t a,
uint16x4_t b,
uint16_t c) | `a -> Vd.4H`
`b -> Vn.4H`
`c -> Vm.H[0]` | `MLS Vd.4H,Vn.4H,Vm.H[0]` | `Vd.4H -> result` | `v7/A32/A64` |
+| uint16x8_t vmlsq_n_u16(
uint16x8_t a,
uint16x8_t b,
uint16_t c) | `a -> Vd.8H`
`b -> Vn.8H`
`c -> Vm.H[0]` | `MLS Vd.8H,Vn.8H,Vm.H[0]` | `Vd.8H -> result` | `v7/A32/A64` |
+| uint32x2_t vmls_n_u32(
uint32x2_t a,
uint32x2_t b,
uint32_t c) | `a -> Vd.2S`
`b -> Vn.2S`
`c -> Vm.S[0]` | `MLS Vd.2S,Vn.2S,Vm.S[0]` | `Vd.2S -> result` | `v7/A32/A64` |
+| uint32x4_t vmlsq_n_u32(
uint32x4_t a,
uint32x4_t b,
uint32_t c) | `a -> Vd.4S`
`b -> Vn.4S`
`c -> Vm.S[0]` | `MLS Vd.4S,Vn.4S,Vm.S[0]` | `Vd.4S -> result` | `v7/A32/A64` |
+| float32x2_t vmls_n_f32(
float32x2_t a,
float32x2_t b,
float32_t c) | `N/A` | `RESULT[I] = vsub(a[i], vmul(b[i], c)) for i = 0 to 1` | `N/A` | `v7/A32/A64` |
+| float32x4_t vmlsq_n_f32(
float32x4_t a,
float32x4_t b,
float32_t c) | `N/A` | `RESULT[I] = vsub(a[i], vmul(b[i], c)) for i = 0 to 3` | `N/A` | `v7/A32/A64` |
+| int32x4_t vmlsl_n_s16(
int32x4_t a,
int16x4_t b,
int16_t c) | `a -> Vd.4S`
`b -> Vn.4H`
`c -> Vm.H[0]` | `SMLSL Vd.4S,Vn.4H,Vm.H[0]` | `Vd.4S -> result` | `v7/A32/A64` |
+| int64x2_t vmlsl_n_s32(
int64x2_t a,
int32x2_t b,
int32_t c) | `a -> Vd.2D`
`b -> Vn.2S`
`c -> Vm.S[0]` | `SMLSL Vd.2D,Vn.2S,Vm.S[0]` | `Vd.2D -> result` | `v7/A32/A64` |
+| uint32x4_t vmlsl_n_u16(
uint32x4_t a,
uint16x4_t b,
uint16_t c) | `a -> Vd.4S`
`b -> Vn.4H`
`c -> Vm.H[0]` | `UMLSL Vd.4S,Vn.4H,Vm.H[0]` | `Vd.4S -> result` | `v7/A32/A64` |
+| uint64x2_t vmlsl_n_u32(
uint64x2_t a,
uint32x2_t b,
uint32_t c) | `a -> Vd.2D`
`b -> Vn.2S`
`c -> Vm.S[0]` | `UMLSL Vd.2D,Vn.2S,Vm.S[0]` | `Vd.2D -> result` | `v7/A32/A64` |
+| int32x4_t vmlsl_high_n_s16(
int32x4_t a,
int16x8_t b,
int16_t c) | `a -> Vd.4S`
`b -> Vn.8H`
`c -> Vm.H[0]` | `SMLSL2 Vd.4S,Vn.8H,Vm.H[0]` | `Vd.4S -> result` | `A64` |
+| int64x2_t vmlsl_high_n_s32(
int64x2_t a,
int32x4_t b,
int32_t c) | `a -> Vd.2D`
`b -> Vn.4S`
`c -> Vm.S[0]` | `SMLSL2 Vd.2D,Vn.4S,Vm.S[0]` | `Vd.2D -> result` | `A64` |
+| uint32x4_t vmlsl_high_n_u16(
uint32x4_t a,
uint16x8_t b,
uint16_t c) | `a -> Vd.4S`
`b -> Vn.8H`
`c -> Vm.H[0]` | `UMLSL2 Vd.4S,Vn.8H,Vm.H[0]` | `Vd.4S -> result` | `A64` |
+| uint64x2_t vmlsl_high_n_u32(
uint64x2_t a,
uint32x4_t b,
uint32_t c) | `a -> Vd.2D`
`b -> Vn.4S`
`c -> Vm.S[0]` | `UMLSL2 Vd.2D,Vn.4S,Vm.S[0]` | `Vd.2D -> result` | `A64` |
#### Fused multiply-accumulate by scalar
diff --git a/tools/intrinsic_db/advsimd.csv b/tools/intrinsic_db/advsimd.csv
index 7b51c965..4fb14cde 100644
--- a/tools/intrinsic_db/advsimd.csv
+++ b/tools/intrinsic_db/advsimd.csv
@@ -209,10 +209,10 @@ uint16x4_t vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) a -> Vd.4H;b -> Vn
uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) a -> Vd.8H;b -> Vn.8H;c -> Vm.8H MLA Vd.8H,Vn.8H,Vm.8H Vd.8H -> result v7/A32/A64
uint32x2_t vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) a -> Vd.2S;b -> Vn.2S;c -> Vm.2S MLA Vd.2S,Vn.2S,Vm.2S Vd.2S -> result v7/A32/A64
uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) a -> Vd.4S;b -> Vn.4S;c -> Vm.4S MLA Vd.4S,Vn.4S,Vm.4S Vd.4S -> result v7/A32/A64
-float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c) N/A RESULT[I] = a[i] + (b[i] * c[i]) for i = 0 to 1 N/A v7/A32/A64
-float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) N/A RESULT[I] = a[i] + (b[i] * c[i]) for i = 0 to 3 N/A v7/A32/A64
-float64x1_t vmla_f64(float64x1_t a, float64x1_t b, float64x1_t c) N/A RESULT[I] = a[i] + (b[i] * c[i]) for i = 0 N/A A64
-float64x2_t vmlaq_f64(float64x2_t a, float64x2_t b, float64x2_t c) N/A RESULT[I] = a[i] + (b[i] * c[i]) for i = 0 to 1 N/A A64
+float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c) N/A RESULT[I] = vadd(a[i], vmul(b[i], c[i])) for i = 0 to 1 N/A v7/A32/A64
+float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) N/A RESULT[I] = vadd(a[i], vmul(b[i], c[i])) for i = 0 to 3 N/A v7/A32/A64
+float64x1_t vmla_f64(float64x1_t a, float64x1_t b, float64x1_t c) N/A RESULT[I] = vadd(a[i], vmul(b[i], c[i])) for i = 0 N/A A64
+float64x2_t vmlaq_f64(float64x2_t a, float64x2_t b, float64x2_t c) N/A RESULT[I] = vadd(a[i], vmul(b[i], c[i])) for i = 0 to 1 N/A A64
int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) a -> Vd.8H;b -> Vn.8B;c -> Vm.8B SMLAL Vd.8H,Vn.8B,Vm.8B Vd.8H -> result v7/A32/A64
int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) a -> Vd.4S;b -> Vn.4H;c -> Vm.4H SMLAL Vd.4S,Vn.4H,Vm.4H Vd.4S -> result v7/A32/A64
int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) a -> Vd.2D;b -> Vn.2S;c -> Vm.2S SMLAL Vd.2D,Vn.2S,Vm.2S Vd.2D -> result v7/A32/A64
@@ -237,10 +237,10 @@ uint16x4_t vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) a -> Vd.4H;b -> Vn
uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) a -> Vd.8H;b -> Vn.8H;c -> Vm.8H MLS Vd.8H,Vn.8H,Vm.8H Vd.8H -> result v7/A32/A64
uint32x2_t vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) a -> Vd.2S;b -> Vn.2S;c -> Vm.2S MLS Vd.2S,Vn.2S,Vm.2S Vd.2S -> result v7/A32/A64
uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) a -> Vd.4S;b -> Vn.4S;c -> Vm.4S MLS Vd.4S,Vn.4S,Vm.4S Vd.4S -> result v7/A32/A64
-float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c) N/A RESULT[I] = a[i] - (b[i] * c[i]) for i = 0 to 1 N/A v7/A32/A64
-float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) N/A RESULT[I] = a[i] - (b[i] * c[i]) for i = 0 to 3 N/A v7/A32/A64
-float64x1_t vmls_f64(float64x1_t a, float64x1_t b, float64x1_t c) N/A RESULT[I] = a[i] - (b[i] * c[i]) for i = 0 N/A A64
-float64x2_t vmlsq_f64(float64x2_t a, float64x2_t b, float64x2_t c) N/A RESULT[I] = a[i] - (b[i] * c[i]) for i = 0 to 1 N/A A64
+float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c) N/A RESULT[I] = vsub(a[i], vmul(b[i], c[i])) for i = 0 to 1 N/A v7/A32/A64
+float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) N/A RESULT[I] = vsub(a[i], vmul(b[i], c[i])) for i = 0 to 3 N/A v7/A32/A64
+float64x1_t vmls_f64(float64x1_t a, float64x1_t b, float64x1_t c) N/A RESULT[I] = vsub(a[i], vmul(b[i], c[i])) for i = 0 N/A A64
+float64x2_t vmlsq_f64(float64x2_t a, float64x2_t b, float64x2_t c) N/A RESULT[I] = vsub(a[i], vmul(b[i], c[i])) for i = 0 to 1 N/A A64
int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) a -> Vd.8H;b -> Vn.8B;c -> Vm.8B SMLSL Vd.8H,Vn.8B,Vm.8B Vd.8H -> result v7/A32/A64
int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) a -> Vd.4S;b -> Vn.4H;c -> Vm.4H SMLSL Vd.4S,Vn.4H,Vm.4H Vd.4S -> result v7/A32/A64
int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) a -> Vd.2D;b -> Vn.2S;c -> Vm.2S SMLSL Vd.2D,Vn.2S,Vm.2S Vd.2D -> result v7/A32/A64
@@ -1342,8 +1342,8 @@ uint16x4_t vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __builtin_con
uint16x8_t vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __builtin_constant_p(lane)) a -> Vd.8H;b -> Vn.8H;v -> Vm.4H;0 <= lane <= 3 MLA Vd.8H,Vn.8H,Vm.H[lane] Vd.8H -> result v7/A32/A64
uint32x2_t vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __builtin_constant_p(lane)) a -> Vd.2S;b -> Vn.2S;v -> Vm.2S;0 <= lane <= 1 MLA Vd.2S,Vn.2S,Vm.S[lane] Vd.2S -> result v7/A32/A64
uint32x4_t vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __builtin_constant_p(lane)) a -> Vd.4S;b -> Vn.4S;v -> Vm.2S;0 <= lane <= 1 MLA Vd.4S,Vn.4S,Vm.S[lane] Vd.4S -> result v7/A32/A64
-float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __builtin_constant_p(lane)) 0 <= lane <= 1 RESULT[I] = a[i] + (b[i] * v[lane]) for i = 0 to 1 N/A v7/A32/A64
-float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __builtin_constant_p(lane)) 0 <= lane <= 1 RESULT[I] = a[i] + (b[i] * v[lane]) for i = 0 to 3 N/A v7/A32/A64
+float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __builtin_constant_p(lane)) 0 <= lane <= 1 RESULT[I] = vadd(a[i], vmul(b[i], v[lane])) for i = 0 to 1 N/A v7/A32/A64
+float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __builtin_constant_p(lane)) 0 <= lane <= 1 RESULT[I] = vadd(a[i], vmul(b[i], v[lane])) for i = 0 to 3 N/A v7/A32/A64
int16x4_t vmla_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v, __builtin_constant_p(lane)) a -> Vd.4H;b -> Vn.4H;v -> Vm.8H;0 <= lane <= 7 MLA Vd.4H,Vn.4H,Vm.H[lane] Vd.4H -> result A64
int16x8_t vmlaq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v, __builtin_constant_p(lane)) a -> Vd.8H;b -> Vn.8H;v -> Vm.8H;0 <= lane <= 7 MLA Vd.8H,Vn.8H,Vm.H[lane] Vd.8H -> result A64
int32x2_t vmla_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v, __builtin_constant_p(lane)) a -> Vd.2S;b -> Vn.2S;v -> Vm.4S;0 <= lane <= 3 MLA Vd.2S,Vn.2S,Vm.S[lane] Vd.2S -> result A64
@@ -1352,8 +1352,8 @@ uint16x4_t vmla_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v, __builtin_co
uint16x8_t vmlaq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v, __builtin_constant_p(lane)) a -> Vd.8H;b -> Vn.8H;v -> Vm.8H;0 <= lane <= 7 MLA Vd.8H,Vn.8H,Vm.H[lane] Vd.8H -> result A64
uint32x2_t vmla_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v, __builtin_constant_p(lane)) a -> Vd.2S;b -> Vn.2S;v -> Vm.4S;0 <= lane <= 3 MLA Vd.2S,Vn.2S,Vm.S[lane] Vd.2S -> result A64
uint32x4_t vmlaq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v, __builtin_constant_p(lane)) a -> Vd.4S;b -> Vn.4S;v -> Vm.4S;0 <= lane <= 3 MLA Vd.4S,Vn.4S,Vm.S[lane] Vd.4S -> result A64
-float32x2_t vmla_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v, __builtin_constant_p(lane)) 0 <= lane <= 3 RESULT[I] = a[i] + (b[i] * v[lane]) for i = 0 to 1 N/A A64
-float32x4_t vmlaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v, __builtin_constant_p(lane)) 0 <= lane <= 3 RESULT[I] = a[i] + (b[i] * v[lane]) for i = 0 to 3 N/A A64
+float32x2_t vmla_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v, __builtin_constant_p(lane)) 0 <= lane <= 3 RESULT[I] = vadd(a[i], vmul(b[i], v[lane])) for i = 0 to 1 N/A A64
+float32x4_t vmlaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v, __builtin_constant_p(lane)) 0 <= lane <= 3 RESULT[I] = vadd(a[i], vmul(b[i], v[lane])) for i = 0 to 3 N/A A64
int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __builtin_constant_p(lane)) a -> Vd.4S;b -> Vn.4H;v -> Vm.4H;0 <= lane <= 3 SMLAL Vd.4S,Vn.4H,Vm.H[lane] Vd.4S -> result v7/A32/A64
int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __builtin_constant_p(lane)) a -> Vd.2D;b -> Vn.2S;v -> Vm.2S;0 <= lane <= 1 SMLAL Vd.2D,Vn.2S,Vm.S[lane] Vd.2D -> result v7/A32/A64
uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __builtin_constant_p(lane)) a -> Vd.4S;b -> Vn.4H;v -> Vm.4H;0 <= lane <= 3 UMLAL Vd.4S,Vn.4H,Vm.H[lane] Vd.4S -> result v7/A32/A64
@@ -1390,8 +1390,8 @@ uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __builtin_con
uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __builtin_constant_p(lane)) a -> Vd.8H;b -> Vn.8H;v -> Vm.4H;0 <= lane <= 3 MLS Vd.8H,Vn.8H,Vm.H[lane] Vd.8H -> result v7/A32/A64
uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __builtin_constant_p(lane)) a -> Vd.2S;b -> Vn.2S;v -> Vm.2S;0 <= lane <= 1 MLS Vd.2S,Vn.2S,Vm.S[lane] Vd.2S -> result v7/A32/A64
uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __builtin_constant_p(lane)) a -> Vd.4S;b -> Vn.4S;v -> Vm.2S;0 <= lane <= 1 MLS Vd.4S,Vn.4S,Vm.S[lane] Vd.4S -> result v7/A32/A64
-float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __builtin_constant_p(lane)) 0 <= lane <= 1 RESULT[I] = a[i] - (b[i] * v[lane]) for i = 0 to 1 N/A v7/A32/A64
-float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __builtin_constant_p(lane)) 0 <= lane <= 1 RESULT[I] = a[i] - (b[i] * v[lane]) for i = 0 to 3 N/A v7/A32/A64
+float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __builtin_constant_p(lane)) 0 <= lane <= 1 RESULT[I] = vsub(a[i], vmul(b[i], v[lane])) for i = 0 to 1 N/A v7/A32/A64
+float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __builtin_constant_p(lane)) 0 <= lane <= 1 RESULT[I] = vsub(a[i], vmul(b[i], v[lane])) for i = 0 to 3 N/A v7/A32/A64
int16x4_t vmls_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v, __builtin_constant_p(lane)) a -> Vd.4H;b -> Vn.4H;v -> Vm.8H;0 <= lane <= 7 MLS Vd.4H,Vn.4H,Vm.H[lane] Vd.4H -> result A64
int16x8_t vmlsq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v, __builtin_constant_p(lane)) a -> Vd.8H;b -> Vn.8H;v -> Vm.8H;0 <= lane <= 7 MLS Vd.8H,Vn.8H,Vm.H[lane] Vd.8H -> result A64
int32x2_t vmls_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v, __builtin_constant_p(lane)) a -> Vd.2S;b -> Vn.2S;v -> Vm.4S;0 <= lane <= 3 MLS Vd.2S,Vn.2S,Vm.S[lane] Vd.2S -> result A64
@@ -1400,8 +1400,8 @@ uint16x4_t vmls_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v, __builtin_co
uint16x8_t vmlsq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v, __builtin_constant_p(lane)) a -> Vd.8H;b -> Vn.8H;v -> Vm.8H;0 <= lane <= 7 MLS Vd.8H,Vn.8H,Vm.H[lane] Vd.8H -> result A64
uint32x2_t vmls_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v, __builtin_constant_p(lane)) a -> Vd.2S;b -> Vn.2S;v -> Vm.4S;0 <= lane <= 3 MLS Vd.2S,Vn.2S,Vm.S[lane] Vd.2S -> result A64
uint32x4_t vmlsq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v, __builtin_constant_p(lane)) a -> Vd.4S;b -> Vn.4S;v -> Vm.4S;0 <= lane <= 3 MLS Vd.4S,Vn.4S,Vm.S[lane] Vd.4S -> result A64
-float32x2_t vmls_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v, __builtin_constant_p(lane)) 0 <= lane <= 3 RESULT[I] = a[i] - (b[i] * v[lane]) for i = 0 to 1 N/A A64
-float32x4_t vmlsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v, __builtin_constant_p(lane)) 0 <= lane <= 3 RESULT[I] = a[i] - (b[i] * v[lane]) for i = 0 to 3 N/A A64
+float32x2_t vmls_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v, __builtin_constant_p(lane)) 0 <= lane <= 3 RESULT[I] = vsub(a[i], vmul(b[i], v[lane])) for i = 0 to 1 N/A A64
+float32x4_t vmlsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v, __builtin_constant_p(lane)) 0 <= lane <= 3 RESULT[I] = vsub(a[i], vmul(b[i], v[lane])) for i = 0 to 3 N/A A64
int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __builtin_constant_p(lane)) a -> Vd.4S;b -> Vn.4H;v -> Vm.4H;0 <= lane <= 3 SMLSL Vd.4S,Vn.4H,Vm.H[lane] Vd.4S -> result v7/A32/A64
int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __builtin_constant_p(lane)) a -> Vd.2D;b -> Vn.2S;v -> Vm.2S;0 <= lane <= 1 SMLSL Vd.2D,Vn.2S,Vm.S[lane] Vd.2D -> result v7/A32/A64
uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __builtin_constant_p(lane)) a -> Vd.4S;b -> Vn.4H;v -> Vm.4H;0 <= lane <= 3 UMLSL Vd.4S,Vn.4H,Vm.H[lane] Vd.4S -> result v7/A32/A64
@@ -1550,8 +1550,8 @@ uint16x4_t vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) a -> Vd.4H;b -> Vn
uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) a -> Vd.8H;b -> Vn.8H;c -> Vm.H[0] MLA Vd.8H,Vn.8H,Vm.H[0] Vd.8H -> result v7/A32/A64
uint32x2_t vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) a -> Vd.2S;b -> Vn.2S;c -> Vm.S[0] MLA Vd.2S,Vn.2S,Vm.S[0] Vd.2S -> result v7/A32/A64
uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) a -> Vd.4S;b -> Vn.4S;c -> Vm.S[0] MLA Vd.4S,Vn.4S,Vm.S[0] Vd.4S -> result v7/A32/A64
-float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c) N/A RESULT[I] = a[i] + (b[i] * c) for i = 0 to 1 N/A v7/A32/A64
-float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c) N/A RESULT[I] = a[i] + (b[i] * c) for i = 0 to 3 N/A v7/A32/A64
+float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c) N/A RESULT[I] = vadd(a[i], vmul(b[i], c)) for i = 0 to 1 N/A v7/A32/A64
+float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c) N/A RESULT[I] = vadd(a[i], vmul(b[i], c)) for i = 0 to 3 N/A v7/A32/A64
int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) a -> Vd.4S;b -> Vn.4H;c -> Vm.H[0] SMLAL Vd.4S,Vn.4H,Vm.H[0] Vd.4S -> result v7/A32/A64
int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) a -> Vd.2D;b -> Vn.2S;c -> Vm.S[0] SMLAL Vd.2D,Vn.2S,Vm.S[0] Vd.2D -> result v7/A32/A64
uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) a -> Vd.4S;b -> Vn.4H;c -> Vm.H[0] UMLAL Vd.4S,Vn.4H,Vm.H[0] Vd.4S -> result v7/A32/A64
@@ -1572,8 +1572,8 @@ uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) a -> Vd.4H;b -> Vn
uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) a -> Vd.8H;b -> Vn.8H;c -> Vm.H[0] MLS Vd.8H,Vn.8H,Vm.H[0] Vd.8H -> result v7/A32/A64
uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) a -> Vd.2S;b -> Vn.2S;c -> Vm.S[0] MLS Vd.2S,Vn.2S,Vm.S[0] Vd.2S -> result v7/A32/A64
uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) a -> Vd.4S;b -> Vn.4S;c -> Vm.S[0] MLS Vd.4S,Vn.4S,Vm.S[0] Vd.4S -> result v7/A32/A64
-float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c) N/A RESULT[I] = a[i] - (b[i] * c) for i = 0 to 1 N/A v7/A32/A64
-float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c) N/A RESULT[I] = a[i] - (b[i] * c) for i = 0 to 3 N/A v7/A32/A64
+float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c) N/A RESULT[I] = vsub(a[i], vmul(b[i], c)) for i = 0 to 1 N/A v7/A32/A64
+float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c) N/A RESULT[I] = vsub(a[i], vmul(b[i], c)) for i = 0 to 3 N/A v7/A32/A64
int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) a -> Vd.4S;b -> Vn.4H;c -> Vm.H[0] SMLSL Vd.4S,Vn.4H,Vm.H[0] Vd.4S -> result v7/A32/A64
int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) a -> Vd.2D;b -> Vn.2S;c -> Vm.S[0] SMLSL Vd.2D,Vn.2S,Vm.S[0] Vd.2D -> result v7/A32/A64
uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) a -> Vd.4S;b -> Vn.4H;c -> Vm.H[0] UMLSL Vd.4S,Vn.4H,Vm.H[0] Vd.4S -> result v7/A32/A64