eltwise max/min negative shift

Ahmed Abdelhakim (Si-Vision) · JaccovG · commit 6b8f858f2eff · 2021-04-13T14:45:58.000+02:00
diff --git a/lib/src/kernels/eltwise/impl/mli_krn_eltwise_ref.h b/lib/src/kernels/eltwise/impl/mli_krn_eltwise_ref.h
@@ -21,6 +21,11 @@
 #define INT32_TO_INT16 16
 #define IN_SCALE_SHIFT 32
 #define MUL_MAX_SHIFT 31
+/*
+ * For max/min, a shift of more than 23 bits is never needed, because the
+ * scaled result ((max - in_offset) * scale) is limited to 24 bits, including the sign bit.
+ */
+#define MAX_MIN_UPPER_LIMIT_SHIFT 23
 
 namespace mli {
 namespace krn {
@@ -264,18 +269,16 @@ void eltwise_prepare_and_run(
         shift2 = in2->el_params.sa.scale_frac_bits.mem.i8;
         shift_out = out->el_params.sa.scale_frac_bits.mem.i8;
         if (func_type == ELTWISE_MAX || func_type == ELTWISE_MIN) {
-            in_scale_fx1 = mli_math_asr_rnd_fx<int32_t>(scale_1,
-                    (int32_t) shift1 - frac_bits_fx16);
-            out_scale_fx = mli_math_asr_rnd_fx<int32_t>(scale_out,
-                    (int32_t) shift_out - frac_bits_fx16);
-            scale_factor1 = mli_math_asr_rnd_fx<int32_t>(in_scale_fx1, -INT32_TO_INT16);
-            scale_factor1 /= out_scale_fx;
-            post_op_shift = INT32_TO_INT16;
-            int norm1 = (scale_factor1 != 0) ? mli_math_norm_fx<int32_t, int>(scale_factor1) : 0;
-            int shift = MAX(INT32_TO_INT16 - norm1, 0);
-            scale16_1 = mli_math_cast_fx<int32_t, int16_t>(scale_factor1, shift);
-            scale16_2 = scale16_1;
+            int32_t scale_factor = mli_math_asl_fx<int32_t>(scale_1, INT32_TO_INT16);
+            scale_factor = scale_factor / scale_out;
+            post_op_shift = INT32_TO_INT16 + shift1 - shift_out;
+            int shift;
+            scale16_1 = mli_math_norm_cast_fx<int32_t, int16_t>(scale_factor, &shift);
+            post_op_shift -= shift;
+            shift = MAX(post_op_shift - MAX_MIN_UPPER_LIMIT_SHIFT, 0) + MIN(MUL_MAX_SHIFT + post_op_shift, 0);
+            scale16_1 = mli_math_asr_rnd_fx<int16_t>(scale16_1, shift);
             post_op_shift -= shift;
+            scale16_2 = scale16_1;
         } else if (func_type == ELTWISE_MUL) {
             int shift;
             scale_factor1 = scale_1 * scale_2;
diff --git a/lib/src/kernels/eltwise/impl/mli_krn_eltwise_vdsp.h b/lib/src/kernels/eltwise/impl/mli_krn_eltwise_vdsp.h
@@ -29,8 +29,8 @@ const int unroll_factor[2][5] = {
 /* ELTWISE_ADD_CONVERT = */ 1,
 /* ELTWISE_SUB_CONVERT = */ 1,
 /* ELTWISE_MUL_CONVERT = */ 4,
-/* ELTWISE_MAX_CONVERT = */ 3,
-/* ELTWISE_MIN_CONVERT = */ 3
+/* ELTWISE_MAX_CONVERT = */ 4,
+/* ELTWISE_MIN_CONVERT = */ 4
 }
 };
 
@@ -373,7 +373,7 @@ MLI_FORCE_INLINE vNx2short_t eltwise_perform_operation<vNx2short_t, vNx2short_t,
     vNx2short_t res;
     res = mli_math_max_fx(op1, op2);
     if (post_op_shift > 0) {
-    	res = mli_math_asr_rnd_fx(res, post_op_shift);
+        res = mli_math_asr_rnd_fx(res, post_op_shift);
     } else {
         res = mli_math_asl_fx(res, -post_op_shift);
     }
@@ -417,16 +417,17 @@ MLI_FORCE_INLINE vNx4char_t eltwise_perform_operation<vNx4char_t, vNx4char_t, EL
         const int pre_op_shift2,
         const int post_op_shift) {
     vNx4char_t res;
-    int32_t acc_init = (out_offset << post_op_shift) - scale_factor1 * in_offset1;
-#ifdef ROUND_UP
-    acc_init += ((1 << post_op_shift) >> 1); // rounding half up //
-#else
-    #error Rounding mode not supported
-#endif
-    vNx4accint_t accu = mli_math_init_accu<int32_t, vNx4accint_t>(acc_init);
+    constexpr int mul_hi_shift = 16;
+    int shift = post_op_shift - mul_hi_shift;
+    int shift_left = mli_math_max_fx(1 - shift, 0);
+    int shift_right = mli_math_max_fx(shift, 1);
     vNx4short_t max = to_vNx4short_t(mli_math_max_fx(op1, op2));
-    accu = mli_math_mac_fx(accu, max, scale_factor1);
-    res = mli_math_acc_cast_fx<vNx4char_t, vNx4accint_t, false>(accu, post_op_shift);
+    max = mli_math_sub_fx(max, (vNx4short_t)in_offset1);
+    max = mli_math_asl_fx(max, shift_left);
+    vNx4short_t max_scaled = mli_math_mul_fx_high(max, scale_factor1);
+    max_scaled = mli_math_asr_rnd_fx(max_scaled, shift_right);
+    max_scaled = mli_math_add_fx(max_scaled, (vNx4short_t) out_offset);
+    res = mli_math_cast_fx<vNx4short_t, vNx4char_t>(max_scaled);
     return res;
 }
 
@@ -491,17 +492,17 @@ MLI_FORCE_INLINE vNx4char_t eltwise_perform_operation<vNx4char_t, vNx4char_t, EL
         const int pre_op_shift2,
         const int post_op_shift) {
     vNx4char_t res;
-    int32_t acc_init = (out_offset << post_op_shift) - scale_factor1 * in_offset1;
-
-#ifdef ROUND_UP
-    acc_init += ((1 << post_op_shift) >> 1); // rounding half up //
-#else
-    #error Rounding mode not supported
-#endif
-    vNx4accint_t accu = mli_math_init_accu<int32_t, vNx4accint_t>(acc_init);
+    constexpr int mul_hi_shift = 16;
+    int shift = post_op_shift - mul_hi_shift;
+    int shift_left = mli_math_max_fx(1 - shift, 0);
+    int shift_right = mli_math_max_fx(shift, 1);
     vNx4short_t max = to_vNx4short_t(mli_math_min_fx(op1, op2));
-    accu = mli_math_mac_fx(accu, max, scale_factor1);
-    res = mli_math_acc_cast_fx<vNx4char_t, vNx4accint_t, false>(accu, post_op_shift);
+    max = mli_math_sub_fx(max, (vNx4short_t)in_offset1);
+    max = mli_math_asl_fx(max, shift_left);
+    vNx4short_t max_scaled = mli_math_mul_fx_high(max, scale_factor1);
+    max_scaled = mli_math_asr_rnd_fx(max_scaled, shift_right);
+    max_scaled = mli_math_add_fx(max_scaled, (vNx4short_t) out_offset);
+    res = mli_math_cast_fx<vNx4short_t, vNx4char_t>(max_scaled);
     return res;
 
 }
@@ -572,8 +573,8 @@ MLI_FORCE_INLINE void eltwise_innerloop<int16_t, ELTWISE_MAX, false>(
         int idx2,
         int idx_out,
         const int count,
-		int16_t op1_s,
-		int16_t op2_s,
+        int16_t op1_s,
+        int16_t op2_s,
         const bool scalar_op1,
         const bool scalar_op2,
         const int16_t in_offset1,
@@ -628,8 +629,8 @@ MLI_FORCE_INLINE void eltwise_innerloop<int16_t, ELTWISE_MIN, false>(
         int idx2,
         int idx_out,
         const int count,
-		int16_t op1_s,
-		int16_t op2_s,
+        int16_t op1_s,
+        int16_t op2_s,
         const bool scalar_op1,
         const bool scalar_op2,
         const int16_t in_offset1,
diff --git a/user_tests/tests/mli_krn_eltwise/tests_mli_krn_eltwise.cc b/user_tests/tests/mli_krn_eltwise/tests_mli_krn_eltwise.cc
@@ -57,9 +57,9 @@ const crc32_calc                                    test_1_chksum_sa8{ 0xd48163e
                                                     test_5_chksum_sa8{ 0x9A14384C },
                   test_6_chksum_fx16{ 0xfc026def }, test_6_chksum_sa8{ 0x3a54561 },
                   test_7_chksum_fx16{ 0x488ed527 }, test_7_chksum_sa8{ 0xD4B7515B },
-                  test_8_chksum_fx16{ 0x68889D84 }, test_8_chksum_sa8{ 0x168B3B32 },
-                  test_9_chksum_fx16{ 0x9417F3D7 }, test_9_chksum_sa8{ 0xA83B910E },
-                  test_10_chksum_fx16{ 0xD728E430 }, test_10_chksum_sa8{ 0xE34DA6B0 },
+                  test_8_chksum_fx16{ 0x68889D84 }, test_8_chksum_sa8{ 0x2D86F301 },
+                  test_9_chksum_fx16{ 0x9417F3D7 }, test_9_chksum_sa8{ 0x351016DF },
+                  test_10_chksum_fx16{ 0xD728E430 }, test_10_chksum_sa8{ 0xDC1A832D },
                   test_11_chksum_fx16{ 0xBF03F2E0 }, test_11_chksum_sa8{ 0xD36B7E94 };
 
 // Platform Specific CRC Results