Optimize sa8 eltwise max/min: fold rounding and out_offset into one pre-shifted offset

Hakim7267 · JaccovG · commit a353acfb201f · 2021-04-13T17:41:15.000+02:00
diff --git a/lib/src/kernels/eltwise/impl/mli_krn_eltwise_vdsp.h b/lib/src/kernels/eltwise/impl/mli_krn_eltwise_vdsp.h
@@ -421,13 +421,19 @@ MLI_FORCE_INLINE vNx4char_t eltwise_perform_operation<vNx4char_t, vNx4char_t, EL
     int shift = post_op_shift - mul_hi_shift;
     int shift_left = mli_math_max_fx(1 - shift, 0);
     int shift_right = mli_math_max_fx(shift, 1);
+    // Since shift is limited to 23, shift_right is limited to 7, so out_offset can be pre-shifted left by shift_right.
+    int16_t offset = out_offset << shift_right;
+#ifdef ROUND_UP
+    offset += ((1 << shift_right) >> 1);
+#else
+    #error Rounding mode not supported
+#endif
     vNx4short_t max = to_vNx4short_t(mli_math_max_fx(op1, op2));
     max = mli_math_sub_fx(max, (vNx4short_t)in_offset1);
     max = mli_math_asl_fx(max, shift_left);
     vNx4short_t max_scaled = mli_math_mul_fx_high(max, scale_factor1);
-    max_scaled = mli_math_asr_rnd_fx(max_scaled, shift_right);
-    max_scaled = mli_math_add_fx(max_scaled, (vNx4short_t) out_offset);
-    res = mli_math_cast_fx<vNx4short_t, vNx4char_t>(max_scaled);
+    max_scaled = mli_math_add_fx(max_scaled, (vNx4short_t) offset);
+    res = mli_math_cast_fx<vNx4short_t, vNx4char_t, false>(max_scaled, shift_right);
     return res;
 }
 
@@ -496,13 +502,19 @@ MLI_FORCE_INLINE vNx4char_t eltwise_perform_operation<vNx4char_t, vNx4char_t, EL
     int shift = post_op_shift - mul_hi_shift;
     int shift_left = mli_math_max_fx(1 - shift, 0);
     int shift_right = mli_math_max_fx(shift, 1);
+    // Since shift is limited to 23, shift_right is limited to 7, so out_offset can be pre-shifted left by shift_right.
+    int16_t offset = out_offset << shift_right;
+#ifdef ROUND_UP
+    offset += ((1 << shift_right) >> 1);
+#else
+    #error Rounding mode not supported
+#endif
     vNx4short_t max = to_vNx4short_t(mli_math_min_fx(op1, op2));
     max = mli_math_sub_fx(max, (vNx4short_t)in_offset1);
     max = mli_math_asl_fx(max, shift_left);
     vNx4short_t max_scaled = mli_math_mul_fx_high(max, scale_factor1);
-    max_scaled = mli_math_asr_rnd_fx(max_scaled, shift_right);
-    max_scaled = mli_math_add_fx(max_scaled, (vNx4short_t) out_offset);
-    res = mli_math_cast_fx<vNx4short_t, vNx4char_t>(max_scaled);
+    max_scaled = mli_math_add_fx(max_scaled, (vNx4short_t) offset);
+    res = mli_math_cast_fx<vNx4short_t, vNx4char_t, false>(max_scaled, shift_right);
     return res;
 
 }
diff --git a/lib/src/pal/mli_math.h b/lib/src/pal/mli_math.h
@@ -33,7 +33,7 @@ template <typename io_T> MLI_FORCE_INLINE io_T mli_math_ashift_right_fx(io_T in_
 template < typename out_T > MLI_FORCE_INLINE out_T mli_math_cast_ptr_to_scalar_fx(void *src);
 template < typename in_T > MLI_FORCE_INLINE void *mli_math_cast_scalar_to_ptr_fx(in_T src);
 
-template <typename in_T, typename out_T> MLI_FORCE_INLINE out_T mli_math_cast_fx(in_T in_val, int shift_right);
+template <typename in_T, typename out_T, bool round = true > MLI_FORCE_INLINE out_T mli_math_cast_fx(in_T in_val, int shift_right);
 template <typename in_T, typename out_T> MLI_FORCE_INLINE out_T mli_math_cast_fx(in_T in_val);
 
 #if defined(__Xvec_width) && !defined(MLI_BUILD_REFERENCE)
diff --git a/lib/src/pal/vdsp/mli_math.h b/lib/src/pal/vdsp/mli_math.h
@@ -1035,6 +1035,15 @@ MLI_FORCE_INLINE vNx4short_t mli_math_cast_fx(vNx4short_t in_val, int shift_righ
     return acc;
 }
 
+template<>  // round = false specialization: shift right, bound to the int8 range, then narrow to vNx4char_t
+MLI_FORCE_INLINE vNx4char_t mli_math_cast_fx<vNx4short_t, vNx4char_t, false >(vNx4short_t in_val, int shift_right) {
+    MLI_EXTRA_ASSERT(shift_right >= 0);  // precondition: shift_right must be non-negative
+    vNx4short_t acc = in_val;
+    acc = mli_math_asr_fx(acc, shift_right);  // uses the non-rnd shift helper; the eltwise max/min callers in this patch pre-add the rounding term
+    acc = mli_math_bound_range_fx(acc, INT8_MIN, INT8_MAX);  // bound to [INT8_MIN, INT8_MAX] before narrowing
+    return to_vNx4char_t(acc);  // convert the vNx4short_t vector to vNx4char_t
+}
+
 template<>
 MLI_FORCE_INLINE vNx4char_t mli_math_cast_fx(vNx4short_t in_val, int shift_right) {
     MLI_EXTRA_ASSERT(shift_right >= 0);