Increase LSTM accuracy

AhmedHussein535 · dzakhar · commit 8edb47fbf133 · 2021-08-18T18:04:52.000+03:00
diff --git a/lib/src/kernels/common/impl/mli_krn_lstm_cell_ref.h b/lib/src/kernels/common/impl/mli_krn_lstm_cell_ref.h
@@ -84,8 +84,8 @@ MLI_FORCE_INLINE void lstm_cell_prepare_and_run(
         ir_asym_params.sa.scale_frac_bits.capacity = 0;
         ir_tensor.el_params = ir_asym_params;
     } else {
-        // 1sign and 3 integer bits for TANH/SIGM input is enough
-        ir_tensor.el_params.fx.frac_bits = (sizeof(io_T) * 8) - 1 - 3;
+        // [-32, 32) is enough for TANH/SIGM input: 10 frac bits leave 1 sign + 5 integer bits (assumes a 16-bit container - TODO confirm)
+        ir_tensor.el_params.fx.frac_bits = 10;
         ir_tensor.el_params.fx.frac_bits = MIN(ir_tensor.el_params.fx.frac_bits, in->el_params.fx.frac_bits + weights_in->el_params.fx.frac_bits);
     }
 
diff --git a/lib/src/kernels/common/impl/mli_krn_lstm_cell_vdsp.h b/lib/src/kernels/common/impl/mli_krn_lstm_cell_vdsp.h
@@ -86,8 +86,8 @@ MLI_FORCE_INLINE void lstm_cell_prepare_and_run(
         ir_asym_params.sa.scale_frac_bits.capacity = 0;
         ir_tensor.el_params = ir_asym_params;
     } else {
-        // 1sign and 3 integer bits for TANH/SIGM input is enough
-        ir_tensor.el_params.fx.frac_bits = (sizeof(io_T) * 8) - 1 - 3;
+        // [-32, 32) is enough for TANH/SIGM input: 10 frac bits leave 1 sign + 5 integer bits (assumes a 16-bit container - TODO confirm)
+        ir_tensor.el_params.fx.frac_bits = 10;
         ir_tensor.el_params.fx.frac_bits = MIN(ir_tensor.el_params.fx.frac_bits, in->el_params.fx.frac_bits + weights_in->el_params.fx.frac_bits);
     }
 
diff --git a/lib/src/pal/dsp/mli_math.h b/lib/src/pal/dsp/mli_math.h
@@ -186,7 +186,7 @@ template <> MLI_FORCE_INLINE v2q15_t mli_math_sub_fx(v2q15_t L, v2q15_t R) {
 
 // Maximum of two fx operands
 //========================================================================
-template < typename io_T > 
+template < typename io_T >
 MLI_FORCE_INLINE io_T mli_math_max_fx(io_T L, io_T R) {
     return MAX(L, R);
 }
@@ -196,19 +196,19 @@ MLI_FORCE_INLINE l_T mli_math_max_fx(l_T L, r_T R) {
     return MAX(L, R);
 }
 
-template <> 
+template <>
 MLI_FORCE_INLINE v2q15_t mli_math_max_fx(v2q15_t L, v2q15_t R) {
     return fx_max_v2q15(L, R);
 }
 
-template <typename l_T, typename r_T> 
+template <typename l_T, typename r_T>
 MLI_FORCE_INLINE v2q15_t mli_math_max_fx(v2q15_t L, r_T R) {
     return fx_max_v2q15(L, fx_replic_v2q15(R));
 }
 
 // Minimum of two fx operands
 //========================================================================
-template < typename io_T > 
+template < typename io_T >
 MLI_FORCE_INLINE io_T mli_math_min_fx(io_T L, io_T R) {
     return MIN(L, R);
 }
@@ -218,12 +218,12 @@ MLI_FORCE_INLINE l_T mli_math_min_fx(l_T L, r_T R) {
     return (L < R) ? L : R;
 }
 
-template <> 
+template <>
 MLI_FORCE_INLINE v2q15_t mli_math_min_fx(v2q15_t L, v2q15_t R) {
     return fx_min_v2q15(L, R);
 }
 
-template <typename l_T, typename r_T> 
+template <typename l_T, typename r_T>
 MLI_FORCE_INLINE v2q15_t mli_math_min_fx(v2q15_t L, r_T R) {
     return fx_min_v2q15(L, fx_replic_v2q15(R));
 }
@@ -327,6 +327,11 @@ template <> MLI_FORCE_INLINE mli_acc32_t mli_math_acc_ashift_fx(mli_acc32_t acc,
 }
 
 template <> MLI_FORCE_INLINE mli_acc40_t mli_math_acc_ashift_fx(mli_acc40_t acc, int shift_right) {
+    if (shift_right > 0) { // rounding term is added only for a real right shift
+        mli_acc40_t rnd = {((1ll << shift_right) >> 1)}; // 2^(shift_right - 1): half the weight of the lowest bit kept after the shift
+        acc = fx_add_a40(acc, rnd); // bias before shifting so the result is rounded rather than truncated (presumably a saturating add - TODO confirm)
+    }
+
     return fx_asr_a40(acc, shift_right);
 }
 
@@ -386,7 +391,7 @@ template < typename in_T > MLI_FORCE_INLINE void *mli_math_cast_scalar_to_ptr_fx
 
 // Comparators
 //========================================================================
-template < typename io_T > 
+template < typename io_T >
 static MLI_FORCE_INLINE bool mli_prv_less_than_1(io_T value, uint8_t frac_bits) {
     if (frac_bits >= sizeof(io_T) * 8 - 1)
         return true;
diff --git a/user_tests/tests/mli_krn_gru_cell/tests_mli_krn_gru_cell.cc b/user_tests/tests/mli_krn_gru_cell/tests_mli_krn_gru_cell.cc
@@ -68,17 +68,6 @@ const crc32_calc test_1_chksum_fx16{ 0x93713917 }, test_1_chksum_fx16_fx8_fx8{ 0
                  test_8_chksum_fx16{ 0xDBDD80AD }, test_8_chksum_fx16_fx8_fx8{ 0x5D935ADE }, test_8_chksum_sa8{ 0x71E73A61 };
 
 #elif defined(CRC_RM_CONVERGENT)
-// TODO: remove after fixing mli_math_acc_ashift_fx() and supporting acc40 shift with round
-#if defined(__FXAPI__)
-const crc32_calc test_1_chksum_fx16{ 0x898EF9AC }, test_1_chksum_fx16_fx8_fx8{ 0xF3E45489 }, test_1_chksum_sa8{ 0x605D7927 },
-                 test_2_chksum_fx16{ 0x898EF9AC }, test_2_chksum_fx16_fx8_fx8{ 0xF3E45489 }, test_2_chksum_sa8{ 0x605D7927 },
-                 test_3_chksum_fx16{ 0xE14A4F30 }, test_3_chksum_fx16_fx8_fx8{ 0x0D9F97BB }, test_3_chksum_sa8{ 0x6A03698A },
-                 test_4_chksum_fx16{ 0xEBCB8726 }, test_4_chksum_fx16_fx8_fx8{ 0xBA61FDE2 }, test_4_chksum_sa8{ 0x3F8041AD },
-                 test_5_chksum_fx16{ 0x4E35CC3A }, test_5_chksum_fx16_fx8_fx8{ 0x63209ADF }, test_5_chksum_sa8{ 0xE36EB137 },
-                 test_6_chksum_fx16{ 0x44B4042C }, test_6_chksum_fx16_fx8_fx8{ 0xD4DEF086 }, test_6_chksum_sa8{ 0xB6ED9910 },
-                 test_7_chksum_fx16{ 0x697A5BA9 }, test_7_chksum_fx16_fx8_fx8{ 0x60AEE5D5 }, test_7_chksum_sa8{ 0xD54F47E2 },
-                 test_8_chksum_fx16{ 0xBDDA2972 }, test_8_chksum_fx16_fx8_fx8{ 0x1B477499 }, test_8_chksum_sa8{ 0x427B2A3F };
-#else
 const crc32_calc test_1_chksum_fx16{ 0x93713917 }, test_1_chksum_fx16_fx8_fx8{ 0xF3E45489 }, test_1_chksum_sa8{ 0x605D7927 },
                  test_2_chksum_fx16{ 0x93713917 }, test_2_chksum_fx16_fx8_fx8{ 0xF3E45489 }, test_2_chksum_sa8{ 0x605D7927 },
                  test_3_chksum_fx16{ 0xEA93E0FF }, test_3_chksum_fx16_fx8_fx8{ 0x0D9F97BB }, test_3_chksum_sa8{ 0x6A03698A },
@@ -87,7 +76,6 @@ const crc32_calc test_1_chksum_fx16{ 0x93713917 }, test_1_chksum_fx16_fx8_fx8{ 0
                  test_6_chksum_fx16{ 0x5F51D618 }, test_6_chksum_fx16_fx8_fx8{ 0xD4DEF086 }, test_6_chksum_sa8{ 0xB6ED9910 },
                  test_7_chksum_fx16{ 0xF35521BE }, test_7_chksum_fx16_fx8_fx8{ 0x60AEE5D5 }, test_7_chksum_sa8{ 0xD54F47E2 },
                  test_8_chksum_fx16{ 0xDBDD80AD }, test_8_chksum_fx16_fx8_fx8{ 0x1B477499 }, test_8_chksum_sa8{ 0x427B2A3F };
-#endif
 #else // Not defined CRC_*
 const crc32_calc  test_1_chksum_fx16, test_1_chksum_fx16_fx8_fx8, test_1_chksum_sa8,
                   test_2_chksum_fx16, test_2_chksum_fx16_fx8_fx8, test_2_chksum_sa8,
diff --git a/user_tests/tests/mli_krn_lstm_cell/tests_mli_krn_lstm_cell.cc b/user_tests/tests/mli_krn_lstm_cell/tests_mli_krn_lstm_cell.cc
diff --git a/user_tests/tests/mli_krn_rnn_dense/tests_mli_krn_rnn_dense.cc b/user_tests/tests/mli_krn_rnn_dense/tests_mli_krn_rnn_dense.cc