@@ -29,8 +29,8 @@ const int unroll_factor[2][5] = {
2929/* ELTWISE_ADD_CONVERT = */ 1 ,
3030/* ELTWISE_SUB_CONVERT = */ 1 ,
3131/* ELTWISE_MUL_CONVERT = */ 4 ,
32- /* ELTWISE_MAX_CONVERT = */ 3 ,
33- /* ELTWISE_MIN_CONVERT = */ 3
32+ /* ELTWISE_MAX_CONVERT = */ 4 ,
33+ /* ELTWISE_MIN_CONVERT = */ 4
3434}
3535};
3636
@@ -373,7 +373,7 @@ MLI_FORCE_INLINE vNx2short_t eltwise_perform_operation<vNx2short_t, vNx2short_t,
373373 vNx2short_t res;
374374 res = mli_math_max_fx (op1, op2);
375375 if (post_op_shift > 0 ) {
376- res = mli_math_asr_rnd_fx (res, post_op_shift);
376+ res = mli_math_asr_rnd_fx (res, post_op_shift);
377377 } else {
378378 res = mli_math_asl_fx (res, -post_op_shift);
379379 }
@@ -417,16 +417,17 @@ MLI_FORCE_INLINE vNx4char_t eltwise_perform_operation<vNx4char_t, vNx4char_t, EL
417417 const int pre_op_shift2,
418418 const int post_op_shift) {
419419 vNx4char_t res;
420- int32_t acc_init = (out_offset << post_op_shift) - scale_factor1 * in_offset1;
421- #ifdef ROUND_UP
422- acc_init += ((1 << post_op_shift) >> 1 ); // rounding half up //
423- #else
424- #error Rounding mode not supported
425- #endif
426- vNx4accint_t accu = mli_math_init_accu<int32_t , vNx4accint_t>(acc_init);
420+ constexpr int mul_hi_shift = 16 ;
421+ int shift = post_op_shift - mul_hi_shift;
422+ int shift_left = mli_math_max_fx (1 - shift, 0 );
423+ int shift_right = mli_math_max_fx (shift, 1 );
427424 vNx4short_t max = to_vNx4short_t (mli_math_max_fx (op1, op2));
428- accu = mli_math_mac_fx (accu, max, scale_factor1);
429- res = mli_math_acc_cast_fx<vNx4char_t, vNx4accint_t, false >(accu, post_op_shift);
425+ max = mli_math_sub_fx (max, (vNx4short_t)in_offset1);
426+ max = mli_math_asl_fx (max, shift_left);
427+ vNx4short_t max_scaled = mli_math_mul_fx_high (max, scale_factor1);
428+ max_scaled = mli_math_asr_rnd_fx (max_scaled, shift_right);
429+ max_scaled = mli_math_add_fx (max_scaled, (vNx4short_t) out_offset);
430+ res = mli_math_cast_fx<vNx4short_t, vNx4char_t>(max_scaled);
430431 return res;
431432}
432433
@@ -491,17 +492,17 @@ MLI_FORCE_INLINE vNx4char_t eltwise_perform_operation<vNx4char_t, vNx4char_t, EL
491492 const int pre_op_shift2,
492493 const int post_op_shift) {
493494 vNx4char_t res;
494- int32_t acc_init = (out_offset << post_op_shift) - scale_factor1 * in_offset1;
495-
496- #ifdef ROUND_UP
497- acc_init += ((1 << post_op_shift) >> 1 ); // rounding half up //
498- #else
499- #error Rounding mode not supported
500- #endif
501- vNx4accint_t accu = mli_math_init_accu<int32_t , vNx4accint_t>(acc_init);
495+ constexpr int mul_hi_shift = 16 ;
496+ int shift = post_op_shift - mul_hi_shift;
497+ int shift_left = mli_math_max_fx (1 - shift, 0 );
498+ int shift_right = mli_math_max_fx (shift, 1 );
502499 vNx4short_t max = to_vNx4short_t (mli_math_min_fx (op1, op2));
503- accu = mli_math_mac_fx (accu, max, scale_factor1);
504- res = mli_math_acc_cast_fx<vNx4char_t, vNx4accint_t, false >(accu, post_op_shift);
500+ max = mli_math_sub_fx (max, (vNx4short_t)in_offset1);
501+ max = mli_math_asl_fx (max, shift_left);
502+ vNx4short_t max_scaled = mli_math_mul_fx_high (max, scale_factor1);
503+ max_scaled = mli_math_asr_rnd_fx (max_scaled, shift_right);
504+ max_scaled = mli_math_add_fx (max_scaled, (vNx4short_t) out_offset);
505+ res = mli_math_cast_fx<vNx4short_t, vNx4char_t>(max_scaled);
505506 return res;
506507
507508}
@@ -572,8 +573,8 @@ MLI_FORCE_INLINE void eltwise_innerloop<int16_t, ELTWISE_MAX, false>(
572573 int idx2,
573574 int idx_out,
574575 const int count,
575- int16_t op1_s,
576- int16_t op2_s,
576+ int16_t op1_s,
577+ int16_t op2_s,
577578 const bool scalar_op1,
578579 const bool scalar_op2,
579580 const int16_t in_offset1,
@@ -628,8 +629,8 @@ MLI_FORCE_INLINE void eltwise_innerloop<int16_t, ELTWISE_MIN, false>(
628629 int idx2,
629630 int idx_out,
630631 const int count,
631- int16_t op1_s,
632- int16_t op2_s,
632+ int16_t op1_s,
633+ int16_t op2_s,
633634 const bool scalar_op1,
634635 const bool scalar_op2,
635636 const int16_t in_offset1,
0 commit comments