Skip to content

Commit 102ef03

Browse files
FargkJaccovG
authored and committed
put saturation in store functions for eltwise functions
1 parent f9faa29 commit 102ef03

File tree

2 files changed

+18
-7
lines changed

2 files changed

+18
-7
lines changed

lib/src/kernels/eltwise/mli_krn_eltwise.h

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,7 @@ static inline void __attribute__ ((always_inline)) eltwise_op_add_fx (
183183

184184
const v2q15_t broadcast_val_v2 = fx_create_v2q15(broadcast_val, broadcast_val);
185185
for (int idx = 0; idx < out_size / 2; idx++) {
186-
mli_prv_store_2_samples(out, fx_add_v2q15(broadcast_val_v2, mli_prv_load_2_samples(vec)));
186+
mli_prv_sat_and_store_2_samples(out, fx_add_v2q15(broadcast_val_v2, mli_prv_load_2_samples(vec)));
187187
vec += 2;
188188
out += 2;
189189
}
@@ -222,7 +222,7 @@ static inline void __attribute__ ((always_inline)) eltwise_op_sub_fx (
222222
const v2q15_t broadcast_val_v2 = fx_create_v2q15(broadcast_val, broadcast_val);
223223
// Vector minus scalar
224224
for (int idx = 0; idx < op1_size / 2; idx++) {
225-
mli_prv_store_2_samples(out, fx_sub_v2q15(mli_prv_load_2_samples(op1), broadcast_val_v2));
225+
mli_prv_sat_and_store_2_samples(out, fx_sub_v2q15(mli_prv_load_2_samples(op1), broadcast_val_v2));
226226
op1 += 2;
227227
out += 2;
228228
}
@@ -234,7 +234,7 @@ static inline void __attribute__ ((always_inline)) eltwise_op_sub_fx (
234234
const v2q15_t broadcast_val_v2 = fx_create_v2q15(broadcast_val, broadcast_val);
235235
// Scalar minus Vector
236236
for (int idx = 0; idx < op2_size / 2; idx++) {
237-
mli_prv_store_2_samples(out, fx_sub_v2q15(broadcast_val_v2, mli_prv_load_2_samples(op2)));
237+
mli_prv_sat_and_store_2_samples(out, fx_sub_v2q15(broadcast_val_v2, mli_prv_load_2_samples(op2)));
238238
op2 += 2;
239239
out += 2;
240240
}
@@ -274,7 +274,7 @@ static inline void __attribute__ ((always_inline)) eltwise_op_max_fx (
274274

275275
const v2q15_t broadcast_val_v2 = fx_create_v2q15(broadcast_val, broadcast_val);
276276
for (int idx = 0; idx < out_size / 2; idx++) {
277-
mli_prv_store_2_samples(out, fx_max_v2q15(broadcast_val_v2, mli_prv_load_2_samples(vec)));
277+
mli_prv_sat_and_store_2_samples(out, fx_max_v2q15(broadcast_val_v2, mli_prv_load_2_samples(vec)));
278278
vec += 2;
279279
out += 2;
280280
}
@@ -314,7 +314,7 @@ static inline void __attribute__ ((always_inline)) eltwise_op_min_fx (
314314

315315
const v2q15_t broadcast_val_v2 = fx_create_v2q15(broadcast_val, broadcast_val);
316316
for (int idx = 0; idx < out_size / 2; idx++) {
317-
mli_prv_store_2_samples(out, fx_min_v2q15(broadcast_val_v2, mli_prv_load_2_samples(vec)));
317+
mli_prv_sat_and_store_2_samples(out, fx_min_v2q15(broadcast_val_v2, mli_prv_load_2_samples(vec)));
318318
vec += 2;
319319
out += 2;
320320
}
@@ -357,7 +357,7 @@ static inline void __attribute__ ((always_inline)) eltwise_op_mul_fx (
357357
if ((out_size & 0x3) || (out_size < 0x7)) {
358358
for (int j = 0; j < (out_size & 0x3); j++) {
359359
auto acc = mli_prv_init_accu((io_T)0);
360-
mli_prv_load_mac(&acc, vec++, (const MLI_PTR(io_T) __restrict) & broadcast_val);
360+
mli_prv_load_mac(&acc, vec++, (const io_T *__restrict) &broadcast_val);
361361
mli_prv_clip_and_store_output(out++, &acc, mul_out_shift);
362362
}
363363
for (int j = 0; j < (out_size & ~0x3) / 2; j++) {
@@ -431,7 +431,7 @@ static inline void __attribute__ ((always_inline)) eltwise_op_mul_with_restricts
431431
if ((out_size & 0x3) || (out_size < 0x7)) {
432432
for (int j = 0; j < (out_size & 0x3); j++) {
433433
auto acc = mli_prv_init_accu((io_T)0);
434-
mli_prv_load_mac(&acc, vec++, (const MLI_PTR(io_T) __restrict) & broadcast_val);
434+
mli_prv_load_mac(&acc, vec++, (const io_T *__restrict) &broadcast_val);
435435
mli_prv_clip_and_store_output(out++, &acc, mul_out_shift);
436436
}
437437
for (int j = 0; j < (out_size & ~0x3) / 2; j++) {

lib/src/private/mli_prv_load_store.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,17 @@ static inline void __attribute__ ((always_inline)) mli_prv_store_2_samples (MLI_
6666
*(MLI_PTR (v2q15_t)) out = data;
6767
}
6868

69+
70+
// Saturate a two-lane q15 vector and store it as two 8-bit samples (int8_t overload).
//   out  - destination pointer; written through as a single v2i8_t.
//   data - two-lane v2q15_t value to be saturated, converted and stored.
static inline void __attribute__ ((always_inline)) mli_prv_sat_and_store_2_samples (MLI_PTR (int8_t) __restrict out, v2q15_t data) {
71+
// Per-lane argument handed to fx_sat_v2q15; presumably the target width in bits (8) -- TODO confirm against the FXAPI reference.
const v2u16_t sat_v2= {8, 8};
72+
// Saturate both lanes, convert the result element-wise to v2i8_t, then write both samples with one store.
*(MLI_PTR (v2i8_t)) out = __builtin_convertvector (fx_sat_v2q15(data, sat_v2), v2i8_t);
73+
}
74+
75+
// Store a two-lane q15 vector as two 16-bit samples (int16_t overload).
//   out  - destination pointer; written through as a single v2q15_t.
//   data - two-lane v2q15_t value, stored unchanged.
static inline void __attribute__ ((always_inline)) mli_prv_sat_and_store_2_samples (MLI_PTR (int16_t) __restrict out, v2q15_t data) {
76+
/* No additional saturation is needed here: it is already built into the 16-bit FXAPI functions. */
77+
*(MLI_PTR (v2q15_t)) out = data;
78+
}
79+
6980
// Load a single 8-bit sample and return it inside a two-lane q15 vector.
//   in - source pointer; one value is read through it as q7_t.
// Returns fx_create_v2q15(sample cast to q15_t, 0), i.e. the second argument is a constant zero.
// NOTE(review): which physical lane the sample ends up in depends on fx_create_v2q15's argument order -- confirm.
static inline v2q15_t __attribute__ ((always_inline)) mli_prv_load_1_sample (const MLI_PTR (int8_t) __restrict in) {
7081
return fx_create_v2q15((q15_t) (*(MLI_PTR (q7_t)) in), 0);
7182
}

0 commit comments

Comments
 (0)