|
18 | 18 | #include "arc_vector.h"
19 | 19 |
|
20 | 20 | const int unroll_factor[2][5] = { |
21 | | - { |
22 | | - /* ELTWISE_ADD_NO_CONVERT = */ 1, |
23 | | - /* ELTWISE_SUB_NO_CONVERT = */ 1, |
24 | | - /* ELTWISE_MUL_NO_CONVERT = */ 4, |
25 | | - /* ELTWISE_MAX_NO_CONVERT = */ 4, |
26 | | - /* ELTWISE_MIN_NO_CONVERT = */ 4 |
27 | | - } , |
28 | | - { |
29 | | - /* ELTWISE_ADD_CONVERT = */ 1, |
30 | | - /* ELTWISE_SUB_CONVERT = */ 1, |
31 | | - /* ELTWISE_MUL_CONVERT = */ 3, |
32 | | - /* ELTWISE_MAX_CONVERT = */ 3, |
33 | | - /* ELTWISE_MIN_CONVERT = */ 3 |
34 | | - } |
| 21 | +{ |
| 22 | +/* ELTWISE_ADD_NO_CONVERT = */ 1, |
| 23 | +/* ELTWISE_SUB_NO_CONVERT = */ 1, |
| 24 | +/* ELTWISE_MUL_NO_CONVERT = */ 4, |
| 25 | +/* ELTWISE_MAX_NO_CONVERT = */ 4, |
| 26 | +/* ELTWISE_MIN_NO_CONVERT = */ 4 |
| 27 | +} , |
| 28 | +{ |
| 29 | +/* ELTWISE_ADD_CONVERT = */ 1, |
| 30 | +/* ELTWISE_SUB_CONVERT = */ 1, |
| 31 | +/* ELTWISE_MUL_CONVERT = */ 4, |
| 32 | +/* ELTWISE_MAX_CONVERT = */ 3, |
| 33 | +/* ELTWISE_MIN_CONVERT = */ 3 |
| 34 | +} |
35 | 35 | }; |
36 | 36 |
|
37 | 37 | namespace mli { |
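For orientation (this sketch is not part of the change itself): the table above maps each element-wise function to its loop unroll factor, and the first index selects between the no-convert and convert variants of the kernel. A hypothetical lookup could look like the following, where the names `convert` and `func_type` are assumptions rather than identifiers taken from this diff:

    /* Hypothetical lookup sketch; 'convert' and 'func_type' are assumed names. */
    const bool convert = true;   /* convert variant of the kernel */
    const int  func_type = 2;    /* index of the MUL entry, per the comments above */
    const int  unroll = unroll_factor[convert][func_type];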
@@ -296,51 +296,61 @@ MLI_FORCE_INLINE vNx4char_t eltwise_perform_operation<vNx4char_t, vNx4char_t, EL |
296 | 296 | const int pre_op_shift1, |
297 | 297 | const int pre_op_shift2, |
298 | 298 | const int post_op_shift) { |
299 | | - MLI_ASSERT(post_op_shift > 3); |
300 | 299 | vNx4char_t res; |
| 300 | + const int headroom = 3; |
| 301 | + const int hi_comp = 16; |
| 302 | + const int acc_len = 32; |
| 303 | + const int out_len = 8; |
| 304 | + const int target_out_shift = acc_len - out_len - headroom; |
| 305 | + const int preshift = mli_math_min_fx(mli_math_max_fx(post_op_shift - target_out_shift, 0), headroom); |
| 306 | + const int shift = post_op_shift - hi_comp - preshift; |
| 307 | + const int shift_left = mli_math_max_fx(1 - shift, 0); |
| 308 | + const int shift_right = mli_math_max_fx(shift, 1); |
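    /*
     * Illustrative arithmetic for the shift setup above (post_op_shift = 20 is
     * chosen purely as an example value):
     *   target_out_shift = 32 - 8 - 3 = 21
     *   preshift         = min(max(20 - 21, 0), 3) = 0
     *   shift            = 20 - 16 - 0 = 4
     *   shift_left       = max(1 - 4, 0) = 0
     *   shift_right      = max(4, 1)     = 4
     * So in this example the accumulator is not pre-shifted and a remaining
     * right shift of 4 is applied after the 16-bit scaling further down.
     */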
301 | 309 |
|
302 | 310 | #if defined(__Xvec_guard_bit_option) && __Xvec_guard_bit_option != 0 |
303 | 311 | /* |
304 | 312 | * res = ((op1 - in_offset1) * (op2 - in_offset2) * scale_factor1 >> post_op_shift) + out_offset |
305 | | - * acc_init = in_offset1 * in_offset2 * scale_factor + out_offset << post_op_shift |
306 | | - * term1 = op1 * op2 * scale_factor1 // 31 bit |
307 | | - * term2 = - op2 * in_offset1 * scale_factor1 // 32 bit |
308 | | - * term3 = - op1 * in_offset2 * scale_factor1 // 32 bit |
| 313 | + * acc_init = in_offset1 * in_offset2
| 314 | + * term1 =   op1 * op2
| 315 | + * term2 = - op2 * in_offset1
| 316 | + * term3 = - op1 * in_offset2
| 317 | + * acc = acc_init + term1 + term2 + term3
| 318 | + * res = ((acc * scale_factor1) >> post_op_shift) + out_offset
309 | 319 | */ |
| 320 | + |
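    /*
     * Why the accumulator is seeded with in_offset1 * in_offset2: expanding the
     * offset-corrected product gives
     *   (op1 - in_offset1) * (op2 - in_offset2)
     *     = op1 * op2 - op1 * in_offset2 - op2 * in_offset1 + in_offset1 * in_offset2
     * so the mac below adds op1 * op2 and the two msub calls remove the cross terms.
     */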
310 | 321 | int16_t acc_init = in_offset1 * in_offset2; |
311 | 322 | vNx4accshort_t acc16 = mli_math_init_accu<int16_t, vNx4accshort_t>(acc_init); |
312 | 323 | acc16 = mli_math_mac_fx(acc16, op1, op2); |
313 | 324 | acc16 = mli_math_msub_fx(acc16, op2, (vNx4char_t)(int8_t)in_offset1); |
314 | 325 | acc16 = mli_math_msub_fx(acc16, op1, (vNx4char_t)(int8_t)in_offset2); |
315 | | - vNx4short_t vacc16 = mli_math_acc_cast_fx<vNx4short_t, vNx4accshort_t>(acc16); |
316 | | - vNx4int_t acc = mli_math_mul_fx<vNx4short_t, vNx4int_t>(vacc16, scale_factor1); |
317 | | - acc = mli_math_asr_rnd_fx(acc, post_op_shift); |
318 | | - acc = mli_math_add_fx(acc, (vNx4int_t) out_offset); |
319 | | - res = mli_math_cast_fx<vNx4int_t, vNx4char_t>(acc); |
320 | | -#else |
| 326 | + |
321 | 327 | /* |
322 | | - * Each operand is 9 bit. The first multiplier output is 18 bit. After scaling with positive 15 bit scale_factor, |
323 | | - * The second multiplier output is 32 bits. A headroom of 3 is sufficient to add the offset, round and compensate. |
324 | | - * |
325 | | - * Note: Minimum shift value is 15 |
326 | | - */ |
327 | | - |
328 | | - const int preshift_sf = 3; |
329 | | - const int mask = (1 << preshift_sf) - 1; |
| 328 | + * If we preshift, we can continue the operations in 16 bits: only 8 bits are needed from the
| 329 | + * mul_hi output, with a headroom of 3 bits.
| 330 | + */ |
| 331 | + |
| 332 | + vNx4short_t vacc16 = mli_math_acc_cast_fx<vNx4short_t, vNx4accshort_t>(acc16, preshift); |
| 333 | + |
| 334 | + |
| 335 | +#else |
| 336 | + |
330 | 337 | vNx4short_t op1_offset = to_vNx4short_t(op1) - in_offset1; |
331 | 338 | vNx4short_t op2_offset = to_vNx4short_t(op2) - in_offset2; |
332 | | - vNx4int_t temp1 = mli_math_mul_fx<vNx4short_t, vNx4int_t>(op1_offset, op2_offset); |
333 | | - vNx4int_t temp2 = (scale_factor1 & mask); |
334 | | - vNx4int_t offset = out_offset; |
335 | | - vNx4accint_t acc = mli_math_mul_fx_low(temp1, temp2); |
336 | | - acc = mli_math_asr_fx(acc, preshift_sf); |
337 | | - temp2 = (scale_factor1 >> preshift_sf); |
338 | | - acc = mli_math_mac_fx_low(acc, temp1, temp2); |
339 | | - acc = mli_math_asr_rnd_fx(acc, post_op_shift - preshift_sf); |
340 | | - acc = mli_math_add(acc, offset); |
341 | | - res = mli_math_acc_cast_fx<vNx4char_t, vNx4accint_t>(acc); |
| 339 | + vNx4int_t acc32 = mli_math_mul_fx<vNx4short_t, vNx4int_t>(op1_offset, op2_offset); |
| 340 | + |
| 341 | + /* |
| 342 | + * If we preshift, we can continue the operations in 16 bits: only 8 bits are needed from the
| 343 | + * mul_hi output, with a headroom of 3 bits.
| 344 | + */ |
| 345 | + |
| 346 | + vNx4short_t vacc16 = mli_math_cast_fx<vNx4int_t, vNx4short_t>(acc32, preshift); |
342 | 347 | #endif |
343 | 348 |
|
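    /*
     * The scaling below relies on mul_fx_high returning the upper 16 bits of the
     * 16x16 product (an implicit >> 16); the hi_comp term in the shift computation
     * above appears to compensate for exactly that.
     */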
| 349 | + vacc16 = mli_math_asl_fx(vacc16, shift_left); |
| 350 | + vNx4short_t accu_scaled = mli_math_mul_fx_high(vacc16, scale_factor1); |
| 351 | + accu_scaled = mli_math_asr_rnd_fx(accu_scaled, shift_right); |
| 352 | + accu_scaled = mli_math_add_fx(accu_scaled, (vNx4short_t) out_offset); |
| 353 | + res = mli_math_cast_fx<vNx4short_t, vNx4char_t>(accu_scaled); |
344 | 354 |
|
345 | 355 | return res; |
346 | 356 | } |
@@ -549,6 +559,7 @@ void eltwise_innerloop( |
549 | 559 | idx_out += num_lanes; |
550 | 560 | } |
551 | 561 | } |
| 562 | + |
552 | 563 | template<> |
553 | 564 | MLI_FORCE_INLINE void eltwise_innerloop<int16_t, ELTWISE_MAX, false>( |
554 | 565 | const MLI_PTR(int16_t) __restrict op1_ptr, |
|