Skip to content

Commit c3aa2c6

Browse files
easyaspi314 and mr-c
authored and committed
NEON: properly implement _high intrinsics
High intrinsics merely have an implicit vget_high or vcombine. There is no need to complicate them further.
1 parent 11a6182 commit c3aa2c6

File tree

9 files changed

+68
-324
lines changed

9 files changed

+68
-324
lines changed

simde/arm/neon/addl_high.h

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,8 @@
2828
#if !defined(SIMDE_ARM_NEON_ADDL_HIGH_H)
2929
#define SIMDE_ARM_NEON_ADDL_HIGH_H
3030

31-
#include "add.h"
32-
#include "movl.h"
33-
#include "movl_high.h"
31+
#include "addl.h"
32+
#include "get_high.h"
3433
#include "types.h"
3534

3635
HEDLEY_DIAGNOSTIC_PUSH
@@ -43,7 +42,7 @@ simde_vaddl_high_s8(simde_int8x16_t a, simde_int8x16_t b) {
4342
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
4443
return vaddl_high_s8(a, b);
4544
#else
46-
return simde_vaddq_s16(simde_vmovl_high_s8(a), simde_vmovl_high_s8(b));
45+
return simde_vaddl_s8(simde_vget_high_s8(a), simde_vget_high_s8(b));
4746
#endif
4847
}
4948
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -57,7 +56,7 @@ simde_vaddl_high_s16(simde_int16x8_t a, simde_int16x8_t b) {
5756
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
5857
return vaddl_high_s16(a, b);
5958
#else
60-
return simde_vaddq_s32(simde_vmovl_high_s16(a), simde_vmovl_high_s16(b));
59+
return simde_vaddl_s16(simde_vget_high_s16(a), simde_vget_high_s16(b));
6160
#endif
6261
}
6362
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -71,7 +70,7 @@ simde_vaddl_high_s32(simde_int32x4_t a, simde_int32x4_t b) {
7170
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
7271
return vaddl_high_s32(a, b);
7372
#else
74-
return simde_vaddq_s64(simde_vmovl_high_s32(a), simde_vmovl_high_s32(b));
73+
return simde_vaddl_s32(simde_vget_high_s32(a), simde_vget_high_s32(b));
7574
#endif
7675
}
7776
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -85,7 +84,7 @@ simde_vaddl_high_u8(simde_uint8x16_t a, simde_uint8x16_t b) {
8584
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
8685
return vaddl_high_u8(a, b);
8786
#else
88-
return simde_vaddq_u16(simde_vmovl_high_u8(a), simde_vmovl_high_u8(b));
87+
return simde_vaddl_u8(simde_vget_high_u8(a), simde_vget_high_u8(b));
8988
#endif
9089
}
9190
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -99,7 +98,7 @@ simde_vaddl_high_u16(simde_uint16x8_t a, simde_uint16x8_t b) {
9998
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
10099
return vaddl_high_u16(a, b);
101100
#else
102-
return simde_vaddq_u32(simde_vmovl_high_u16(a), simde_vmovl_high_u16(b));
101+
return simde_vaddl_u16(simde_vget_high_u16(a), simde_vget_high_u16(b));
103102
#endif
104103
}
105104
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -113,7 +112,7 @@ simde_vaddl_high_u32(simde_uint32x4_t a, simde_uint32x4_t b) {
113112
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
114113
return vaddl_high_u32(a, b);
115114
#else
116-
return simde_vaddq_u64(simde_vmovl_high_u32(a), simde_vmovl_high_u32(b));
115+
return simde_vaddl_u32(simde_vget_high_u32(a), simde_vget_high_u32(b));
117116
#endif
118117
}
119118
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)

simde/arm/neon/addw_high.h

Lines changed: 8 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,8 @@
2828
#define SIMDE_ARM_NEON_ADDW_HIGH_H
2929

3030
#include "types.h"
31-
#include "movl_high.h"
32-
#include "add.h"
31+
#include "get_high.h"
32+
#include "addw.h"
3333

3434
HEDLEY_DIAGNOSTIC_PUSH
3535
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
@@ -40,19 +40,8 @@ simde_int16x8_t
4040
simde_vaddw_high_s8(simde_int16x8_t a, simde_int8x16_t b) {
4141
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
4242
return vaddw_high_s8(a, b);
43-
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
44-
return simde_vaddq_s16(a, simde_vmovl_high_s8(b));
4543
#else
46-
simde_int16x8_private r_;
47-
simde_int16x8_private a_ = simde_int16x8_to_private(a);
48-
simde_int8x16_private b_ = simde_int8x16_to_private(b);
49-
50-
SIMDE_VECTORIZE
51-
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
52-
r_.values[i] = a_.values[i] + b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)];
53-
}
54-
55-
return simde_int16x8_from_private(r_);
44+
return simde_vaddw_s8(a, simde_vget_high_s8(b));
5645
#endif
5746
}
5847
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -65,19 +54,8 @@ simde_int32x4_t
6554
simde_vaddw_high_s16(simde_int32x4_t a, simde_int16x8_t b) {
6655
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
6756
return vaddw_high_s16(a, b);
68-
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
69-
return simde_vaddq_s32(a, simde_vmovl_high_s16(b));
7057
#else
71-
simde_int32x4_private r_;
72-
simde_int32x4_private a_ = simde_int32x4_to_private(a);
73-
simde_int16x8_private b_ = simde_int16x8_to_private(b);
74-
75-
SIMDE_VECTORIZE
76-
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
77-
r_.values[i] = a_.values[i] + b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)];
78-
}
79-
80-
return simde_int32x4_from_private(r_);
58+
return simde_vaddw_s16(a, simde_vget_high_s16(b));
8159
#endif
8260
}
8361
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -90,19 +68,8 @@ simde_int64x2_t
9068
simde_vaddw_high_s32(simde_int64x2_t a, simde_int32x4_t b) {
9169
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
9270
return vaddw_high_s32(a, b);
93-
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
94-
return simde_vaddq_s64(a, simde_vmovl_high_s32(b));
9571
#else
96-
simde_int64x2_private r_;
97-
simde_int64x2_private a_ = simde_int64x2_to_private(a);
98-
simde_int32x4_private b_ = simde_int32x4_to_private(b);
99-
100-
SIMDE_VECTORIZE
101-
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
102-
r_.values[i] = a_.values[i] + b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)];
103-
}
104-
105-
return simde_int64x2_from_private(r_);
72+
return simde_vaddw_s32(a, simde_vget_high_s32(b));
10673
#endif
10774
}
10875
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -115,19 +82,8 @@ simde_uint16x8_t
11582
simde_vaddw_high_u8(simde_uint16x8_t a, simde_uint8x16_t b) {
11683
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
11784
return vaddw_high_u8(a, b);
118-
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
119-
return simde_vaddq_u16(a, simde_vmovl_high_u8(b));
12085
#else
121-
simde_uint16x8_private r_;
122-
simde_uint16x8_private a_ = simde_uint16x8_to_private(a);
123-
simde_uint8x16_private b_ = simde_uint8x16_to_private(b);
124-
125-
SIMDE_VECTORIZE
126-
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
127-
r_.values[i] = a_.values[i] + b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)];
128-
}
129-
130-
return simde_uint16x8_from_private(r_);
86+
return simde_vaddw_u8(a, simde_vget_high_u8(b));
13187
#endif
13288
}
13389
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -140,19 +96,8 @@ simde_uint32x4_t
14096
simde_vaddw_high_u16(simde_uint32x4_t a, simde_uint16x8_t b) {
14197
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
14298
return vaddw_high_u16(a, b);
143-
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
144-
return simde_vaddq_u32(a, simde_vmovl_high_u16(b));
14599
#else
146-
simde_uint32x4_private r_;
147-
simde_uint32x4_private a_ = simde_uint32x4_to_private(a);
148-
simde_uint16x8_private b_ = simde_uint16x8_to_private(b);
149-
150-
SIMDE_VECTORIZE
151-
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
152-
r_.values[i] = a_.values[i] + b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)];
153-
}
154-
155-
return simde_uint32x4_from_private(r_);
100+
return simde_vaddw_u16(a, simde_vget_high_u16(b));
156101
#endif
157102
}
158103
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -165,19 +110,8 @@ simde_uint64x2_t
165110
simde_vaddw_high_u32(simde_uint64x2_t a, simde_uint32x4_t b) {
166111
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
167112
return vaddw_high_u32(a, b);
168-
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
169-
return simde_vaddq_u64(a, simde_vmovl_high_u32(b));
170113
#else
171-
simde_uint64x2_private r_;
172-
simde_uint64x2_private a_ = simde_uint64x2_to_private(a);
173-
simde_uint32x4_private b_ = simde_uint32x4_to_private(b);
174-
175-
SIMDE_VECTORIZE
176-
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
177-
r_.values[i] = a_.values[i] + b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)];
178-
}
179-
180-
return simde_uint64x2_from_private(r_);
114+
return simde_vaddw_u32(a, simde_vget_high_u32(b));
181115
#endif
182116
}
183117
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)

simde/arm/neon/mlal_high.h

Lines changed: 8 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,8 @@
2828
#if !defined(SIMDE_ARM_NEON_MLAL_HIGH_H)
2929
#define SIMDE_ARM_NEON_MLAL_HIGH_H
3030

31-
#include "movl_high.h"
32-
#include "mla.h"
31+
#include "get_high.h"
32+
#include "mlal.h"
3333
#include "types.h"
3434

3535
HEDLEY_DIAGNOSTIC_PUSH
@@ -42,7 +42,7 @@ simde_vmlal_high_s8(simde_int16x8_t a, simde_int8x16_t b, simde_int8x16_t c) {
4242
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
4343
return vmlal_high_s8(a, b, c);
4444
#else
45-
return simde_vmlaq_s16(a, simde_vmovl_high_s8(b), simde_vmovl_high_s8(c));
45+
return simde_vmlal_s8(a, simde_vget_high_s8(b), simde_vget_high_s8(c));
4646
#endif
4747
}
4848
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -56,7 +56,7 @@ simde_vmlal_high_s16(simde_int32x4_t a, simde_int16x8_t b, simde_int16x8_t c) {
5656
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
5757
return vmlal_high_s16(a, b, c);
5858
#else
59-
return simde_vmlaq_s32(a, simde_vmovl_high_s16(b), simde_vmovl_high_s16(c));
59+
return simde_vmlal_s16(a, simde_vget_high_s16(b), simde_vget_high_s16(c));
6060
#endif
6161
}
6262
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -70,22 +70,7 @@ simde_vmlal_high_s32(simde_int64x2_t a, simde_int32x4_t b, simde_int32x4_t c) {
7070
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
7171
return vmlal_high_s32(a, b, c);
7272
#else
73-
simde_int64x2_private
74-
r_,
75-
a_ = simde_int64x2_to_private(a),
76-
b_ = simde_int64x2_to_private(simde_vmovl_high_s32(b)),
77-
c_ = simde_int64x2_to_private(simde_vmovl_high_s32(c));
78-
79-
#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
80-
r_.values = (b_.values * c_.values) + a_.values;
81-
#else
82-
SIMDE_VECTORIZE
83-
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
84-
r_.values[i] = (b_.values[i] * c_.values[i]) + a_.values[i];
85-
}
86-
#endif
87-
88-
return simde_int64x2_from_private(r_);
73+
return simde_vmlal_s32(a, simde_vget_high_s32(b), simde_vget_high_s32(c));
8974
#endif
9075
}
9176
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -99,7 +84,7 @@ simde_vmlal_high_u8(simde_uint16x8_t a, simde_uint8x16_t b, simde_uint8x16_t c)
9984
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
10085
return vmlal_high_u8(a, b, c);
10186
#else
102-
return simde_vmlaq_u16(a, simde_vmovl_high_u8(b), simde_vmovl_high_u8(c));
87+
return simde_vmlal_u8(a, simde_vget_high_u8(b), simde_vget_high_u8(c));
10388
#endif
10489
}
10590
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -113,7 +98,7 @@ simde_vmlal_high_u16(simde_uint32x4_t a, simde_uint16x8_t b, simde_uint16x8_t c)
11398
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
11499
return vmlal_high_u16(a, b, c);
115100
#else
116-
return simde_vmlaq_u32(a, simde_vmovl_high_u16(b), simde_vmovl_high_u16(c));
101+
return simde_vmlal_u16(a, simde_vget_high_u16(b), simde_vget_high_u16(c));
117102
#endif
118103
}
119104
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -127,22 +112,7 @@ simde_vmlal_high_u32(simde_uint64x2_t a, simde_uint32x4_t b, simde_uint32x4_t c)
127112
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
128113
return vmlal_high_u32(a, b, c);
129114
#else
130-
simde_uint64x2_private
131-
r_,
132-
a_ = simde_uint64x2_to_private(a),
133-
b_ = simde_uint64x2_to_private(simde_vmovl_high_u32(b)),
134-
c_ = simde_uint64x2_to_private(simde_vmovl_high_u32(c));
135-
136-
#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
137-
r_.values = (b_.values * c_.values) + a_.values;
138-
#else
139-
SIMDE_VECTORIZE
140-
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
141-
r_.values[i] = (b_.values[i] * c_.values[i]) + a_.values[i];
142-
}
143-
#endif
144-
145-
return simde_uint64x2_from_private(r_);
115+
return simde_vmlal_u32(a, simde_vget_high_u32(b), simde_vget_high_u32(c));
146116
#endif
147117
}
148118
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)

simde/arm/neon/mlal_high_n.h

Lines changed: 6 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,8 @@
2727
#if !defined(SIMDE_ARM_NEON_MLAL_HIGH_N_H)
2828
#define SIMDE_ARM_NEON_MLAL_HIGH_N_H
2929

30-
#include "movl_high.h"
31-
#include "dup_n.h"
32-
#include "mla.h"
30+
#include "get_high.h"
31+
#include "mlal_n.h"
3332
#include "types.h"
3433

3534
HEDLEY_DIAGNOSTIC_PUSH
@@ -42,7 +41,7 @@ simde_vmlal_high_n_s16(simde_int32x4_t a, simde_int16x8_t b, int16_t c) {
4241
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
4342
return vmlal_high_n_s16(a, b, c);
4443
#else
45-
return simde_vmlaq_s32(a, simde_vmovl_high_s16(b), simde_vdupq_n_s32(c));
44+
return simde_vmlal_n_s16(a, simde_vget_high_s16(b), c);
4645
#endif
4746
}
4847
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -56,22 +55,7 @@ simde_vmlal_high_n_s32(simde_int64x2_t a, simde_int32x4_t b, int32_t c) {
5655
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
5756
return vmlal_high_n_s32(a, b, c);
5857
#else
59-
simde_int64x2_private
60-
r_,
61-
a_ = simde_int64x2_to_private(a),
62-
b_ = simde_int64x2_to_private(simde_vmovl_high_s32(b)),
63-
c_ = simde_int64x2_to_private(simde_vdupq_n_s64(c));
64-
65-
#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
66-
r_.values = (b_.values * c_.values) + a_.values;
67-
#else
68-
SIMDE_VECTORIZE
69-
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
70-
r_.values[i] = (b_.values[i] * c_.values[i]) + a_.values[i];
71-
}
72-
#endif
73-
74-
return simde_int64x2_from_private(r_);
58+
return simde_vmlal_n_s32(a, simde_vget_high_s32(b), c);
7559
#endif
7660
}
7761
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -85,7 +69,7 @@ simde_vmlal_high_n_u16(simde_uint32x4_t a, simde_uint16x8_t b, uint16_t c) {
8569
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
8670
return vmlal_high_n_u16(a, b, c);
8771
#else
88-
return simde_vmlaq_u32(a, simde_vmovl_high_u16(b), simde_vdupq_n_u32(c));
72+
return simde_vmlal_n_u16(a, simde_vget_high_u16(b), c);
8973
#endif
9074
}
9175
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -99,22 +83,7 @@ simde_vmlal_high_n_u32(simde_uint64x2_t a, simde_uint32x4_t b, uint32_t c) {
9983
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
10084
return vmlal_high_n_u32(a, b, c);
10185
#else
102-
simde_uint64x2_private
103-
r_,
104-
a_ = simde_uint64x2_to_private(a),
105-
b_ = simde_uint64x2_to_private(simde_vmovl_high_u32(b)),
106-
c_ = simde_uint64x2_to_private(simde_vdupq_n_u64(c));
107-
108-
#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
109-
r_.values = (b_.values * c_.values) + a_.values;
110-
#else
111-
SIMDE_VECTORIZE
112-
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
113-
r_.values[i] = (b_.values[i] * c_.values[i]) + a_.values[i];
114-
}
115-
#endif
116-
117-
return simde_uint64x2_from_private(r_);
86+
return simde_vmlal_n_u32(a, simde_vget_high_u32(b), c);
11887
#endif
11988
}
12089
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)

0 commit comments

Comments
 (0)