|
18 | 18 | #include "arc_vector.h"
19 | 19 |
|
20 | 20 | const int unroll_factor[2][5] = { |
21 | | - { |
22 | | - /* ELTWISE_ADD_NO_CONVERT = */ 1, |
23 | | - /* ELTWISE_SUB_NO_CONVERT = */ 1, |
24 | | - /* ELTWISE_MUL_NO_CONVERT = */ 4, |
25 | | - /* ELTWISE_MAX_NO_CONVERT = */ 4, |
26 | | - /* ELTWISE_MIN_NO_CONVERT = */ 4 |
27 | | - } , |
28 | | - { |
29 | | - /* ELTWISE_ADD_CONVERT = */ 1, |
30 | | - /* ELTWISE_SUB_CONVERT = */ 1, |
31 | | - /* ELTWISE_MUL_CONVERT = */ 3, |
32 | | - /* ELTWISE_MAX_CONVERT = */ 3, |
33 | | - /* ELTWISE_MIN_CONVERT = */ 3 |
34 | | - } |
| 21 | +{ |
| 22 | +/* ELTWISE_ADD_NO_CONVERT = */ 1, |
| 23 | +/* ELTWISE_SUB_NO_CONVERT = */ 1, |
| 24 | +/* ELTWISE_MUL_NO_CONVERT = */ 4, |
| 25 | +/* ELTWISE_MAX_NO_CONVERT = */ 4, |
| 26 | +/* ELTWISE_MIN_NO_CONVERT = */ 4 |
| 27 | +} , |
| 28 | +{ |
| 29 | +/* ELTWISE_ADD_CONVERT = */ 1, |
| 30 | +/* ELTWISE_SUB_CONVERT = */ 1, |
| 31 | +/* ELTWISE_MUL_CONVERT = */ 4, |
| 32 | +/* ELTWISE_MAX_CONVERT = */ 3, |
| 33 | +/* ELTWISE_MIN_CONVERT = */ 3 |
| 34 | +} |
35 | 35 | }; |
36 | 36 |
|
37 | 37 | namespace mli { |
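For orientation (this sketch is not part of the change itself): the table above maps each element-wise function to its loop unroll factor, and the first index selects between the no-convert and convert variants of the kernel. A hypothetical lookup could look like the following, where the names `convert` and `func_type` are assumptions rather than identifiers taken from this diff:

    /* Hypothetical lookup sketch; 'convert' and 'func_type' are assumed names. */
    const bool convert = true;   /* convert variant of the kernel */
    const int  func_type = 2;    /* index of the MUL entry, per the comments above */
    const int  unroll = unroll_factor[convert][func_type];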
@@ -296,51 +296,61 @@ MLI_FORCE_INLINE vNx4char_t eltwise_perform_operation<vNx4char_t, vNx4char_t, EL |
296 | 296 | const int pre_op_shift1, |
297 | 297 | const int pre_op_shift2, |
298 | 298 | const int post_op_shift) { |
299 | | - MLI_ASSERT(post_op_shift > 3); |
300 | 299 | vNx4char_t res; |
| 300 | + const int headroom = 3; |
| 301 | + const int hi_comp = 16; |
| 302 | + const int acc_len = 32; |
| 303 | + const int out_len = 8; |
| 304 | + const int target_out_shift = acc_len - out_len - headroom; |
| 305 | + const int preshift = mli_math_min_fx(mli_math_max_fx(post_op_shift - target_out_shift, 0), headroom); |
| 306 | + const int shift = post_op_shift - hi_comp - preshift; |
| 307 | + const int shift_left = mli_math_max_fx(1 - shift, 0); |
| 308 | + const int shift_right = mli_math_max_fx(shift, 1); |
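    /*
     * Illustrative arithmetic for the shift setup above (post_op_shift = 20 is
     * chosen purely as an example value):
     *   target_out_shift = 32 - 8 - 3 = 21
     *   preshift         = min(max(20 - 21, 0), 3) = 0
     *   shift            = 20 - 16 - 0 = 4
     *   shift_left       = max(1 - 4, 0) = 0
     *   shift_right      = max(4, 1)     = 4
     * So in this example the accumulator is not pre-shifted and a remaining
     * right shift of 4 is applied after the 16-bit scaling further down.
     */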
301 | 309 |
|
302 | 310 | #if defined(__Xvec_guard_bit_option) && __Xvec_guard_bit_option != 0 |
303 | 311 | /* |
304 | 312 | * res = ((op1 - in_offset1) * (op2 - in_offset2) * scale_factor1 >> post_op_shift) + out_offset |
305 | | - * acc_init = in_offset1 * in_offset2 * scale_factor + out_offset << post_op_shift |
306 | | - * term1 = op1 * op2 * scale_factor1 // 31 bit |
307 | | - * term2 = - op2 * in_offset1 * scale_factor1 // 32 bit |
308 | | - * term3 = - op1 * in_offset2 * scale_factor1 // 32 bit |
| 313 | + * acc_init = in_offset1 * in_offset2
| 314 | + * term1 =   op1 * op2
| 315 | + * term2 = - op2 * in_offset1
| 316 | + * term3 = - op1 * in_offset2
| 317 | + * acc = acc_init + term1 + term2 + term3
| 318 | + * res = ((acc * scale_factor1) >> post_op_shift) + out_offset
309 | 319 | */ |
| 320 | + |
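    /*
     * Why the accumulator is seeded with in_offset1 * in_offset2: expanding the
     * offset-corrected product gives
     *   (op1 - in_offset1) * (op2 - in_offset2)
     *     = op1 * op2 - op1 * in_offset2 - op2 * in_offset1 + in_offset1 * in_offset2
     * so the mac below adds op1 * op2 and the two msub calls remove the cross terms.
     */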
310 | 321 | int16_t acc_init = in_offset1 * in_offset2; |
311 | 322 | vNx4accshort_t acc16 = mli_math_init_accu<int16_t, vNx4accshort_t>(acc_init); |
312 | 323 | acc16 = mli_math_mac_fx(acc16, op1, op2); |
313 | 324 | acc16 = mli_math_msub_fx(acc16, op2, (vNx4char_t)(int8_t)in_offset1); |
314 | 325 | acc16 = mli_math_msub_fx(acc16, op1, (vNx4char_t)(int8_t)in_offset2); |
315 | | - vNx4short_t vacc16 = mli_math_acc_cast_fx<vNx4short_t, vNx4accshort_t>(acc16); |
316 | | - vNx4int_t acc = mli_math_mul_fx<vNx4short_t, vNx4int_t>(vacc16, scale_factor1); |
317 | | - acc = mli_math_asr_rnd_fx(acc, post_op_shift); |
318 | | - acc = mli_math_add_fx(acc, (vNx4int_t) out_offset); |
319 | | - res = mli_math_cast_fx<vNx4int_t, vNx4char_t>(acc); |
320 | | -#else |
| 326 | + |
321 | 327 | /* |
322 | | - * Each operand is 9 bit. The first multiplier output is 18 bit. After scaling with positive 15 bit scale_factor, |
323 | | - * The second multiplier output is 32 bits. A headroom of 3 is sufficient to add the offset, round and compensate. |
324 | | - * |
325 | | - * Note: Minimum shift value is 15 |
326 | | - */ |
327 | | - |
328 | | - const int preshift_sf = 3; |
329 | | - const int mask = (1 << preshift_sf) - 1; |
| 328 | + * If we preshift, we can continue the operations in 16 bits: only 8 bits are needed from the
| 329 | + * mul_hi output, with a headroom of 3 bits.
| 330 | + */ |
| 331 | + |
| 332 | + vNx4short_t vacc16 = mli_math_acc_cast_fx<vNx4short_t, vNx4accshort_t>(acc16, preshift); |
| 333 | + |
| 334 | + |
| 335 | +#else |
| 336 | + |
330 | 337 | vNx4short_t op1_offset = to_vNx4short_t(op1) - in_offset1; |
331 | 338 | vNx4short_t op2_offset = to_vNx4short_t(op2) - in_offset2; |
332 | | - vNx4int_t temp1 = mli_math_mul_fx<vNx4short_t, vNx4int_t>(op1_offset, op2_offset); |
333 | | - vNx4int_t temp2 = (scale_factor1 & mask); |
334 | | - vNx4int_t offset = out_offset; |
335 | | - vNx4accint_t acc = mli_math_mul_fx_low(temp1, temp2); |
336 | | - acc = mli_math_asr_fx(acc, preshift_sf); |
337 | | - temp2 = (scale_factor1 >> preshift_sf); |
338 | | - acc = mli_math_mac_fx_low(acc, temp1, temp2); |
339 | | - acc = mli_math_asr_rnd_fx(acc, post_op_shift - preshift_sf); |
340 | | - acc = mli_math_add(acc, offset); |
341 | | - res = mli_math_acc_cast_fx<vNx4char_t, vNx4accint_t>(acc); |
| 339 | + vNx4int_t acc32 = mli_math_mul_fx<vNx4short_t, vNx4int_t>(op1_offset, op2_offset); |
| 340 | + |
| 341 | + /* |
| 342 | + * If we preshift, we can continue the operations in 16 bits: only 8 bits are needed from the
| 343 | + * mul_hi output, with a headroom of 3 bits.
| 344 | + */ |
| 345 | + |
| 346 | + vNx4short_t vacc16 = mli_math_cast_fx<vNx4int_t, vNx4short_t>(acc32, preshift); |
342 | 347 | #endif |
343 | 348 |
|
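    /*
     * The scaling below relies on mul_fx_high returning the upper 16 bits of the
     * 16x16 product (an implicit >> 16); the hi_comp term in the shift computation
     * above appears to compensate for exactly that.
     */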
| 349 | + vacc16 = mli_math_asl_fx(vacc16, shift_left); |
| 350 | + vNx4short_t accu_scaled = mli_math_mul_fx_high(vacc16, scale_factor1); |
| 351 | + accu_scaled = mli_math_asr_rnd_fx(accu_scaled, shift_right); |
| 352 | + accu_scaled = mli_math_add_fx(accu_scaled, (vNx4short_t) out_offset); |
| 353 | + res = mli_math_cast_fx<vNx4short_t, vNx4char_t>(accu_scaled); |
344 | 354 |
|
345 | 355 | return res; |
346 | 356 | } |
@@ -549,6 +559,7 @@ void eltwise_innerloop( |
549 | 559 | idx_out += num_lanes; |
550 | 560 | } |
551 | 561 | } |
| 562 | + |
552 | 563 | template<> |
553 | 564 | MLI_FORCE_INLINE void eltwise_innerloop<int16_t, ELTWISE_MAX, false>( |
554 | 565 | const MLI_PTR(int16_t) __restrict op1_ptr, |
|