Fix LSTM requantization (keep the intermediate result in vNx4int_t)

AhmedHussein535 · JaccovG · commit 46a8b1f5788a · 2021-04-13T14:44:56.000+02:00
diff --git a/lib/src/bricks/impl/mli_krn_rnn_dense_op_ref.h b/lib/src/bricks/impl/mli_krn_rnn_dense_op_ref.h
@@ -140,7 +140,7 @@ static inline void rnn_dense_op(
                     /* row_step= */ 1, /* ch_step= */ 1);
             accu = mli_math_add_fx(accu, other_additives[idx]);
 
-            acc_ir = mli::krn::ir_rnn_result_requantize(accu, &in_to_out_quant_params[idx]);
+            acc_ir = mli::krn::ir_rnn_result_requantize<acc_T>(accu, &in_to_out_quant_params[idx]);
             acc_res_ir = mli_math_add_fx(acc_res_ir, acc_ir);
             accu = mli_math_mul_fx<io_T, acc_T>(0, 0);
         }
diff --git a/lib/src/bricks/impl/mli_krn_rnn_dense_op_vdsp.h b/lib/src/bricks/impl/mli_krn_rnn_dense_op_vdsp.h
@@ -94,6 +94,10 @@ static inline void rnn_dense_op_stacked(
     dense_out_ptr -= gates_num * out_elements;
 }
 
+MLI_FORCE_INLINE vNx4int_t mli_math_add_accus(vNx4int_t L, vNx4int_t R) {
+    return mli_math_add_fx(L, R);
+}
+
 MLI_FORCE_INLINE vNx2accint_t mli_math_add_accus(vNx2accint_t L, vNx2accint_t R) {
 	return mli_math_add(L, R);
 }
@@ -128,16 +132,16 @@ static inline void rnn_dense_op(
         quant_T * in_to_out_quant_params,
         const io_T val_min_limit,
         const io_T val_max_limit) {
-
+    typedef typename std::conditional<std::is_same<acc_T, vNx4accshort_t>::value, vNx4int_t, acc_T>::type ir_T;
     int num_lanes = get_number_lanes<acc_T>();
 
     for (int o_idx = 0; o_idx < out_elements; o_idx += num_lanes) {
         int remaining_ch = out_elements - o_idx;
         int current_chs = MIN(remaining_ch, num_lanes); // number of channels computed in this loop iteration
 
         acc_T accu = mli_prv_init_accu<acc_T>();
-        acc_T acc_ir = mli_prv_init_accu<acc_T>();
-        acc_T acc_res_ir = mli_prv_init_accu<acc_T>();
+        ir_T acc_ir = mli_prv_init_accu<ir_T>();
+        ir_T acc_res_ir = mli_prv_init_accu<ir_T>();
 
         auto output_params = adjust_quant_params_v(&in_to_out_quant_params[0], 0);
         accu = mli::krn::bias_additive(&bias[o_idx], accu, &output_params, /* add_preshift_rnd */ false);
@@ -150,7 +154,7 @@ static inline void rnn_dense_op(
 
             /* TODO: can be optimized using adjust_quant_params_v, and also optimize ir_rnn_result_requantize function */
             mli::krn::ref::adjust_quant_params(&in_to_out_quant_params[idx], o_idx);
-            acc_ir = mli::krn::ir_rnn_result_requantize(accu, &in_to_out_quant_params[idx]);
+            acc_ir = mli::krn::ir_rnn_result_requantize<acc_T, ir_T>(accu, &in_to_out_quant_params[idx]);
 
             acc_res_ir = mli_math_add_accus(acc_res_ir, acc_ir);
             accu = mli_prv_init_accu<acc_T>();
diff --git a/lib/src/bricks/impl/mli_prv_quant_vdsp.h b/lib/src/bricks/impl/mli_prv_quant_vdsp.h
@@ -503,6 +503,25 @@ MLI_FORCE_INLINE void ir_result_cast_relu_store_v(
     mli_prv_store_n_samples(o_ptr, out, num);
 }
 
+template <>
+MLI_FORCE_INLINE void ir_result_cast_relu_store_v(
+        MLI_CONV_OUT_PTR(int8_t) __restrict o_ptr,
+        vNx4int_t acc,
+        const s8asym_quant_specific_out_params_v* quant_params,
+        const int16_t val_min_limit,
+        const int16_t val_max_limit,
+        int num) {
+
+    vNx4short_t accu_scaled = mli_math_cast_fx<vNx4int_t, vNx4short_t>(acc);
+    accu_scaled = mli_math_add_fx<vNx4short_t>(accu_scaled, quant_params->out_offset);
+
+    accu_scaled = mli_math_min_fx(accu_scaled, val_max_limit);
+    accu_scaled = mli_math_max_fx(accu_scaled, val_min_limit);
+
+    vNx4char_t out = to_vNx4char_t(accu_scaled);
+    mli_prv_store_n_samples(o_ptr, out, num);
+}
+
 template <>
 MLI_FORCE_INLINE void ir_result_cast_relu_store_v(
         MLI_CONV_OUT_PTR(int16_t) __restrict o_ptr,
@@ -537,19 +556,19 @@ MLI_FORCE_INLINE void ir_result_cast_relu_store_v(
     mli_prv_store_n_samples(o_ptr, out, num);
 }
 
-template <typename acc_T>
-MLI_FORCE_INLINE acc_T ir_rnn_result_requantize(
+template <typename acc_T, typename out_T=acc_T>
+MLI_FORCE_INLINE out_T ir_rnn_result_requantize(
 		const acc_T acc,
 		const fx_quant_specific_params* params) {
     const int in_to_ir_shift = params->out_shift;
     int shift_right = mli_math_max_fx(in_to_ir_shift, 0);
     int shift_left = mli_math_max_fx(-in_to_ir_shift, 0);
-    acc_T acc_shifted = mli_math_asl_fx(acc, shift_left);
-    return mli_math_asr_rnd_fx<acc_T, int>(acc_shifted, shift_right);
+    out_T acc_shifted = mli_math_asl_fx(acc, shift_left);
+    return mli_math_asr_rnd_fx<out_T, int>(acc_shifted, shift_right);
 }
 
 template <>
-MLI_FORCE_INLINE vNx4accshort_t ir_rnn_result_requantize(
+MLI_FORCE_INLINE vNx4int_t ir_rnn_result_requantize(
         const vNx4accshort_t acc,
         const s8asym_quant_specific_params* params) {
 
@@ -578,28 +597,7 @@ MLI_FORCE_INLINE vNx4accshort_t ir_rnn_result_requantize(
     acc_shifted = mli_math_asr_rnd_fx(acc_shifted, shift_right);
     acc_shifted = mli_math_asl_fx(acc_shifted, shift_left);
 
-#if (__Xvec_guard_bit_option == 0)
-    vNx4short_t acc_short = mli_math_cast_fx<vNx4int_t, vNx4short_t>(acc_shifted);
-    vNx4accshort_t res = mli_math_init_accu_add<vNx4short_t, vNx4accshort_t>(acc_short, (vNx4short_t)0);
-#else
-    vNx4int_t norm;
-    vNx4short_t acc_short = mli_math_norm_cast_fx</*left_shift*/ false>(acc_shifted , &norm);
-
-    constexpr int guard_bits = 8;
-    vNx4int_t mask = (1 << norm) - 1;
-    vNx4int_t acc_shifted_low = acc_shifted & mask;
-    // If the norm is more than the number of guardbits,
-    // so the masked_acc has to be shifted, since the result is shifted with max shift equals to number of guardbits.
-    vNx4int_t mask_shift = mli_math_max_fx(norm - guard_bits, 0);
-    acc_shifted_low = mli_math_asr_fx(acc_shifted_low, mask_shift);
-
-    norm = mli_math_min_fx(norm, guard_bits);
-    vNx4accshort_t res = mli_math_init_accu_add<vNx4short_t, vNx4accshort_t>(acc_short, (vNx4short_t)0);
-    res = mli_math_asl_fx<vNx4accshort_t, vNx4short_t>(res, to_vNx4short_t(norm));
-    res = mli_math_add(res, to_vNx4short_t(acc_shifted_low));
-#endif
-
-    return res;
+    return acc_shifted;
 }
 
 } // namespace vdsp
diff --git a/lib/src/bricks/mli_prv_quant_decl.h b/lib/src/bricks/mli_prv_quant_decl.h
@@ -420,16 +420,17 @@ MLI_FORCE_INLINE void ir_result_cast_relu_store_v(
         int num);
 #endif
 
-template <typename acc_T, typename quant_T>
-MLI_FORCE_INLINE acc_T ir_rnn_result_requantize(
+template <typename acc_T, typename out_T, typename quant_T>
+MLI_FORCE_INLINE out_T ir_rnn_result_requantize(
         const acc_T acc, const quant_T* params);
-template <typename acc_T>
-MLI_FORCE_INLINE acc_T ir_rnn_result_requantize(
+
+template <typename acc_T, typename out_T>
+MLI_FORCE_INLINE out_T ir_rnn_result_requantize(
         const acc_T acc, const fx_quant_specific_params* params);
 
 #if defined(__Xvec_width)
 template <>
-MLI_FORCE_INLINE vNx4accshort_t ir_rnn_result_requantize(
+MLI_FORCE_INLINE vNx4int_t ir_rnn_result_requantize(
         const vNx4accshort_t acc,
         const s8asym_quant_specific_params* params);
 #endif
diff --git a/lib/src/pal/vdsp/mli_prv_dsp.h b/lib/src/pal/vdsp/mli_prv_dsp.h
@@ -286,6 +286,11 @@ MLI_FORCE_INLINE vNx4accshort_t mli_prv_init_accu<vNx4accshort_t>() {
     return vvcmpy((vNx4char_t)0, (int8_t)0);
 }
 
+template<>
+MLI_FORCE_INLINE vNx4int_t mli_prv_init_accu<vNx4int_t>() {
+    return ((vNx4int_t) (0));
+}
+
 MLI_FORCE_INLINE vNx4accshort_t mli_prv_init_accu(vNx4char_t l, int8_t r) {
     return vvcmpy(l, r);
 }
@@ -299,7 +304,6 @@ MLI_FORCE_INLINE vNx2accint_t mli_prv_init_accu(vNx2short_t l, int16_t r) {
     return vvcmpy(l, r);
 }
 
-
 template<>
 MLI_FORCE_INLINE vNx4accint_t mli_prv_init_accu<vNx4accint_t>() {
     vNx4accint_t r;

Original file line number	Diff line number	Diff line change
`@@ -140,7 +140,7 @@ static inline void rnn_dense_op(`
`140`	`140`	`/* row_step= */ 1, /* ch_step= */ 1);`
`141`	`141`	`accu = mli_math_add_fx(accu, other_additives[idx]);`
`142`	`142`
`143`		`- acc_ir = mli::krn::ir_rnn_result_requantize(accu, &in_to_out_quant_params[idx]);`
	`143`	`+ acc_ir = mli::krn::ir_rnn_result_requantize<acc_T>(accu, &in_to_out_quant_params[idx]);`
`144`	`144`	`acc_res_ir = mli_math_add_fx(acc_res_ir, acc_ir);`
`145`	`145`	`accu = mli_math_mul_fx<io_T, acc_T>(0, 0);`
`146`	`146`	`}`
Original file line number	Diff line number	Diff line change
`@@ -286,6 +286,11 @@ MLI_FORCE_INLINE vNx4accshort_t mli_prv_init_accu<vNx4accshort_t>() {`
`286`	`286`	`return vvcmpy((vNx4char_t)0, (int8_t)0);`
`287`	`287`	`}`
`288`	`288`
	`289`	`+template<>`
	`290`	`+MLI_FORCE_INLINE vNx4int_t mli_prv_init_accu<vNx4int_t>() {`
	`291`	`+ return ((vNx4int_t) (0));`
	`292`	`+}`
	`293`	`+`
`289`	`294`	`MLI_FORCE_INLINE vNx4accshort_t mli_prv_init_accu(vNx4char_t l, int8_t r) {`
`290`	`295`	`return vvcmpy(l, r);`
`291`	`296`	`}`
`@@ -299,7 +304,6 @@ MLI_FORCE_INLINE vNx2accint_t mli_prv_init_accu(vNx2short_t l, int16_t r) {`
`299`	`304`	`return vvcmpy(l, r);`
`300`	`305`	`}`
`301`	`306`
`302`		`-`
`303`	`307`	`template<>`
`304`	`308`	`MLI_FORCE_INLINE vNx4accint_t mli_prv_init_accu<vNx4accint_t>() {`
`305`	`309`	`vNx4accint_t r;`