optimize eltwise mul preshift

AhmedHussein535 · JaccovG · commit 791c39c05c27 · 2021-04-13T12:29:35.000+02:00
diff --git a/lib/src/kernels/eltwise/impl/mli_krn_eltwise_ref.h b/lib/src/kernels/eltwise/impl/mli_krn_eltwise_ref.h
@@ -277,11 +277,12 @@ void eltwise_prepare_and_run(
             scale16_2 = scale16_1;
             post_op_shift -= shift;
         } else if (func_type == ELTWISE_MUL) {
-            int64_t scale_factor = mli_math_asl_fx<int64_t>(scale_1, IN_SCALE_SHIFT);
-            scale_factor = ((scale_factor * scale_2) / scale_out);
-            post_op_shift = IN_SCALE_SHIFT + shift1 + shift2 - shift_out;
             int shift;
-            scale16_1 = mli_math_norm_cast_fx<int64_t, int16_t>(scale_factor, &shift);
+            scale_factor1 = scale_1 * scale_2;
+            scale_factor1 = mli_math_norm_cast_fx<int32_t, int32_t>(scale_factor1, &shift);
+            scale_factor1 = (scale_factor1 / scale_out);
+            post_op_shift = shift1 + shift2 - shift_out - shift;
+            scale16_1 = mli_math_norm_cast_fx<int32_t, int16_t>(scale_factor1, &shift);
             post_op_shift -= shift;
             shift = MAX(post_op_shift - MUL_MAX_SHIFT, 0) + MIN(MUL_MAX_SHIFT + post_op_shift, 0);
             scale16_1 = mli_math_asr_rnd_fx<int16_t>(scale16_1, shift);
diff --git a/lib/src/kernels/eltwise/impl/mli_krn_eltwise_vdsp.h b/lib/src/kernels/eltwise/impl/mli_krn_eltwise_vdsp.h
@@ -319,6 +319,11 @@ MLI_FORCE_INLINE vNx4char_t eltwise_perform_operation<vNx4char_t, vNx4char_t, EL
      */
 
     int16_t acc_init = in_offset1 * in_offset2;
+#ifdef ROUND_UP
+    acc_init += ((1 << preshift) >> 1); /* rounding half up */
+#else
+    #error Rounding mode not supported
+#endif
     vNx4accshort_t acc16 = mli_math_init_accu<int16_t, vNx4accshort_t>(acc_init);
     acc16 = mli_math_mac_fx(acc16, op1, op2);
     acc16 = mli_math_msub_fx(acc16, op2, (vNx4char_t)(int8_t)in_offset1);
@@ -329,9 +334,7 @@ MLI_FORCE_INLINE vNx4char_t eltwise_perform_operation<vNx4char_t, vNx4char_t, EL
      * mul_hi output. with headroom of 3 bits.
      */
 
-    vNx4short_t vacc16 = mli_math_acc_cast_fx<vNx4short_t, vNx4accshort_t>(acc16, preshift);
-
-
+    vNx4short_t vacc16 = mli_math_acc_cast_fx<vNx4short_t, vNx4accshort_t, false>(acc16, preshift);
 #else
 
     vNx4short_t op1_offset = to_vNx4short_t(op1) - in_offset1;
diff --git a/lib/src/pal/dsp/mli_math.h b/lib/src/pal/dsp/mli_math.h
@@ -126,9 +126,9 @@ MLI_FORCE_INLINE int mli_math_norm_fx(mli_acc40_t acc) {
 }
 
 template<typename in_T, typename out_T>
-MLI_FORCE_INLINE out_T mli_math_norm_cast_fx(in_T val , int *norm_shift) {
-    int cast_shift = (sizeof(in_T) - sizeof(out_T)) * 8;
-    int norm = mli_math_norm_fx<in_T, in_T>(val);
+MLI_FORCE_INLINE out_T mli_math_norm_cast_fx(in_T val , int32_t *norm_shift) {
+    int32_t cast_shift = (sizeof(in_T) - sizeof(out_T)) * 8;
+    int32_t norm = mli_math_norm_fx<in_T, int32_t>(val);
     *norm_shift = cast_shift - norm;
     return mli_math_cast_fx<in_T, out_T>(val, *norm_shift);
 }
diff --git a/lib/src/pal/ref/mli_math.h b/lib/src/pal/ref/mli_math.h
@@ -141,9 +141,9 @@ MLI_FORCE_INLINE o_T mli_math_norm_fx(T x)
 }
 
 template<typename in_T, typename out_T>
-MLI_FORCE_INLINE out_T mli_math_norm_cast_fx(in_T val , int *norm_shift) {
-    int cast_shift = (sizeof(in_T) - sizeof(out_T)) * 8;
-    int norm = mli_math_norm_fx<in_T, int>(val);
+MLI_FORCE_INLINE out_T mli_math_norm_cast_fx(in_T val , int32_t *norm_shift) {
+    int32_t cast_shift = (sizeof(in_T) - sizeof(out_T)) * 8;
+    int32_t norm = mli_math_norm_fx<in_T, int32_t>(val);
     *norm_shift = cast_shift - norm;
     return mli_math_cast_fx<in_T, out_T>(val, *norm_shift);
 }
diff --git a/lib/src/pal/vdsp/mli_math.h b/lib/src/pal/vdsp/mli_math.h
@@ -1508,9 +1508,9 @@ MLI_FORCE_INLINE vNx4int_t mli_math_norm_fx(vNx4accint_t x) {
 }
 
 template<typename in_T, typename out_T>
-MLI_FORCE_INLINE out_T mli_math_norm_cast_fx(in_T val , int *norm_shift) {
-    int cast_shift = (sizeof(in_T) - sizeof(out_T)) * 8;
-    int norm = mli_math_norm_fx<in_T, in_T>(val);
+MLI_FORCE_INLINE out_T mli_math_norm_cast_fx(in_T val , int32_t *norm_shift) {
+    int32_t cast_shift = (sizeof(in_T) - sizeof(out_T)) * 8;
+    int32_t norm = mli_math_norm_fx<in_T, int32_t>(val);
     *norm_shift = cast_shift - norm;
     return mli_math_cast_fx<in_T, out_T>(val, *norm_shift);
 }

File: lib/src/pal/dsp/mli_math.h
Original file line number	Diff line number	Diff line change
`@@ -126,9 +126,9 @@ MLI_FORCE_INLINE int mli_math_norm_fx(mli_acc40_t acc) {`
`126`	`126`	`}`
`127`	`127`
`128`	`128`	`template<typename in_T, typename out_T>`
`129`		`-MLI_FORCE_INLINE out_T mli_math_norm_cast_fx(in_T val , int *norm_shift) {`
`130`		`- int cast_shift = (sizeof(in_T) - sizeof(out_T)) * 8;`
`131`		`- int norm = mli_math_norm_fx<in_T, in_T>(val);`
	`129`	`+MLI_FORCE_INLINE out_T mli_math_norm_cast_fx(in_T val , int32_t *norm_shift) {`
	`130`	`+ int32_t cast_shift = (sizeof(in_T) - sizeof(out_T)) * 8;`
	`131`	`+ int32_t norm = mli_math_norm_fx<in_T, int32_t>(val);`
`132`	`132`	`*norm_shift = cast_shift - norm;`
`133`	`133`	`return mli_math_cast_fx<in_T, out_T>(val, *norm_shift);`
`134`	`134`	`}`
File: lib/src/pal/ref/mli_math.h
Original file line number	Diff line number	Diff line change
`@@ -141,9 +141,9 @@ MLI_FORCE_INLINE o_T mli_math_norm_fx(T x)`
`141`	`141`	`}`
`142`	`142`
`143`	`143`	`template<typename in_T, typename out_T>`
`144`		`-MLI_FORCE_INLINE out_T mli_math_norm_cast_fx(in_T val , int *norm_shift) {`
`145`		`- int cast_shift = (sizeof(in_T) - sizeof(out_T)) * 8;`
`146`		`- int norm = mli_math_norm_fx<in_T, int>(val);`
	`144`	`+MLI_FORCE_INLINE out_T mli_math_norm_cast_fx(in_T val , int32_t *norm_shift) {`
	`145`	`+ int32_t cast_shift = (sizeof(in_T) - sizeof(out_T)) * 8;`
	`146`	`+ int32_t norm = mli_math_norm_fx<in_T, int32_t>(val);`
`147`	`147`	`*norm_shift = cast_shift - norm;`
`148`	`148`	`return mli_math_cast_fx<in_T, out_T>(val, *norm_shift);`
`149`	`149`	`}`
File: lib/src/pal/vdsp/mli_math.h
Original file line number	Diff line number	Diff line change
`@@ -1508,9 +1508,9 @@ MLI_FORCE_INLINE vNx4int_t mli_math_norm_fx(vNx4accint_t x) {`
`1508`	`1508`	`}`
`1509`	`1509`
`1510`	`1510`	`template<typename in_T, typename out_T>`
`1511`		`-MLI_FORCE_INLINE out_T mli_math_norm_cast_fx(in_T val , int *norm_shift) {`
`1512`		`- int cast_shift = (sizeof(in_T) - sizeof(out_T)) * 8;`
`1513`		`- int norm = mli_math_norm_fx<in_T, in_T>(val);`
	`1511`	`+MLI_FORCE_INLINE out_T mli_math_norm_cast_fx(in_T val , int32_t *norm_shift) {`
	`1512`	`+ int32_t cast_shift = (sizeof(in_T) - sizeof(out_T)) * 8;`
	`1513`	`+ int32_t norm = mli_math_norm_fx<in_T, int32_t>(val);`
`1514`	`1514`	`*norm_shift = cast_shift - norm;`
`1515`	`1515`	`return mli_math_cast_fx<in_T, out_T>(val, *norm_shift);`
`1516`	`1516`	`}`