foss-for-synopsys-dwc-arc-processors
diff --git a/lib/src/bricks/impl/mli_krn_rnn_dense_op_ref.h b/lib/src/bricks/impl/mli_krn_rnn_dense_op_ref.h
Lines changed: 10 additions & 14 deletions
diff --git a/lib/src/bricks/impl/mli_krn_rnn_dense_op_vdsp.h b/lib/src/bricks/impl/mli_krn_rnn_dense_op_vdsp.h
Lines changed: 35 additions & 14 deletions
diff --git a/lib/src/bricks/impl/mli_prv_quant_ref.h b/lib/src/bricks/impl/mli_prv_quant_ref.h
Lines changed: 54 additions & 13 deletions
diff --git a/lib/src/bricks/impl/mli_prv_quant_vdsp.h b/lib/src/bricks/impl/mli_prv_quant_vdsp.h
Lines changed: 102 additions & 23 deletions
@@ -122,9 +122,11 @@ static inline void rnn_dense_op(
     }
 
     for (int o_idx = 0; o_idx < out_elements; o_idx++) {
-        io_T out_val = 0; 
+
         acc_T accu = mli_math_mul_fx<io_T, acc_T>(0, 0);
-        acc_T prev_step = mli_math_mul_fx<io_T, acc_T>(0, 0);
+        acc_T acc_ir = mli_math_mul_fx<io_T, acc_T>(0, 0);
+        acc_T acc_res_ir = mli_math_mul_fx<io_T, acc_T>(0, 0);
+
         accu = mli::krn::bias_additive(&bias[o_idx], accu, &in_to_out_quant_params[0]);
 
         for(int idx = 0; idx < inputs_num; idx++) {
@@ -137,20 +139,14 @@ static inline void rnn_dense_op(
                     in_elements[idx], /* height= */ 1, /* ch= */ 1, w_ch_out_mem_strides[idx], 
                     /* row_step= */ 1, /* ch_step= */ 1);
             accu = mli_math_add_fx(accu, other_additives[idx]);
-            accu = mli_math_add_fx(accu, prev_step);
-
-            if(inputs_num - idx != 1) {
-                prev_step = mli::krn::ref::ir_rnn_result_requantize(accu, &in_to_out_quant_params[idx],
-                                &in_to_out_quant_params[idx+1], /* krn_idx= */ 0);
-                accu = mli_math_mul_fx<io_T, acc_T>(0, 0);
-            } else {
-                out_val = mli::krn::ref::result_cast<io_T, acc_T, quant_T>(accu, &in_to_out_quant_params[idx]);
-            }
+
+            acc_ir = mli::krn::ir_rnn_result_requantize(accu, &in_to_out_quant_params[idx]);
+            acc_res_ir = mli_math_add_fx(acc_res_ir, acc_ir);
+            accu = mli_math_mul_fx<io_T, acc_T>(0, 0);
         }
 
-        out_val = MIN(out_val, val_max_limit);
-        out_val = MAX(out_val, val_min_limit);
-        out[o_idx] = out_val;
+        out[o_idx] = mli::krn::ir_result_cast_relu_store<io_T, acc_T, quant_T>(acc_res_ir,
+        		&in_to_out_quant_params[inputs_num - 1], val_min_limit, val_max_limit);
     }
 }
 
 
@@ -94,6 +94,27 @@ static inline void rnn_dense_op_stacked(
     dense_out_ptr -= gates_num * out_elements;
 }
 
+MLI_FORCE_INLINE vNx2accint_t mli_math_add_accus(vNx2accint_t L, vNx2accint_t R) {  // Accumulator add, vNx2accint_t overload.
+	return mli_math_add(L, R);  // forwarded unchanged to mli_math_add
+}
+
+MLI_FORCE_INLINE vNx4accint_t mli_math_add_accus(vNx4accint_t L, vNx4accint_t R) {  // Accumulator add, vNx4accint_t overload.
+	return mli_math_add(L, R);  // forwarded unchanged to mli_math_add
+}
+
+MLI_FORCE_INLINE vNx4accshort_t mli_math_add_accus(vNx4accshort_t L, vNx4accshort_t R) {  // Accumulator add, vNx4accshort_t overload; the path is selected by __Xvec_guard_bit_option.
+#if (__Xvec_guard_bit_option == 0)
+	vNx4short_t L_short = mli_math_acc_cast_fx<vNx4short_t, vNx4accshort_t>(L);  // bring both accumulators down to vNx4short_t
+	vNx4short_t R_short = mli_math_acc_cast_fx<vNx4short_t, vNx4accshort_t>(R);
+
+	vNx4short_t res = mli_math_add_fx<vNx4short_t>(L_short, R_short);  // add in the short domain; NOTE(review): presumably done to avoid accumulator overflow when there are no guard bits - confirm
+
+	return mli_math_init_accu_add<vNx4short_t, vNx4accshort_t>(res, (vNx4short_t)0);  // rebuild an accumulator from the sum (second operand is zero)
+#else
+	return mli_math_add(L, R);  // any other option value: add the accumulators directly
+#endif
+}
+
 template <typename io_T, typename w_T, typename b_T, typename acc_T, typename quant_T>
 static inline void rnn_dense_op(
         const MLI_PTR(io_T) __restrict * inputs,
@@ -109,12 +130,14 @@ static inline void rnn_dense_op(
         const io_T val_max_limit) {
 
     int num_lanes = get_number_lanes<acc_T>();
+
     for (int o_idx = 0; o_idx < out_elements; o_idx += num_lanes) {
         int remaining_ch = out_elements - o_idx;
         int current_chs = MIN(remaining_ch, num_lanes); // number of channels computed in this loop iteration
 
-        acc_T accu = mli_math_mul_fx<io_T, acc_T>(0, 0);
-        acc_T prev_step = mli_math_mul_fx<io_T, acc_T>(0, 0);
+        acc_T accu = mli_prv_init_accu<acc_T>();
+        acc_T acc_ir = mli_prv_init_accu<acc_T>();
+        acc_T acc_res_ir = mli_prv_init_accu<acc_T>();
 
         auto output_params = adjust_quant_params_v(&in_to_out_quant_params[0], 0);
         accu = mli::krn::bias_additive(&bias[o_idx], accu, &output_params, /* add_preshift_rnd */ false);
@@ -124,20 +147,18 @@ static inline void rnn_dense_op(
             output_params = adjust_quant_params_v(&in_to_out_quant_params[idx], 0);
             accu = dotprod_inputzp_1D_v(inputs[idx], &weights[idx][o_idx], accu, in_elements[idx],
                     1, w_ch_out_mem_strides[idx], &in_to_out_quant_params[idx]);
-            accu = mli_math_add(accu, prev_step);
-
-            if(inputs_num - idx != 1) {
-                mli::krn::ref::adjust_quant_params(&in_to_out_quant_params[idx], o_idx);
-                prev_step = mli::krn::ir_rnn_result_requantize(accu, &in_to_out_quant_params[idx],
-                                &in_to_out_quant_params[idx + 1], /* krn_idx= */ 0);
-                accu = mli_math_mul_fx<io_T, acc_T>(0, 0);
-            } else {
-                // Cast result to output type with scaling
-                mli::krn::result_cast_relu_store_v(&out[o_idx], accu, &output_params,
-                        val_min_limit, val_max_limit, current_chs, /* add_preshift_rnd */ true);
-            }
+
+            /* TODO: optimize by using adjust_quant_params_v here; ir_rnn_result_requantize itself can also be optimized */
+            mli::krn::ref::adjust_quant_params(&in_to_out_quant_params[idx], o_idx);
+            acc_ir = mli::krn::ir_rnn_result_requantize(accu, &in_to_out_quant_params[idx]);
+
+            acc_res_ir = mli_math_add_accus(acc_res_ir, acc_ir);
+            accu = mli_prv_init_accu<acc_T>();
         }
 
+        // Cast result to output type with scaling
+        mli::krn::ir_result_cast_relu_store_v(&out[o_idx], acc_res_ir, &output_params,
+                                val_min_limit, val_max_limit, current_chs);
     }
 }
 
 
@@ -399,14 +399,12 @@ static MLI_FORCE_INLINE void result_cast_relu_store(
         const int16_t val_max_limit) {
 
     o_T out = result_cast<o_T, acc_T, quant_T>(acc, quant_params);
-    out = MIN(out, val_max_limit);
-    out = MAX(out, val_min_limit);
+    out = mli_math_min_fx(out, val_max_limit);
+    out = mli_math_max_fx(out, val_min_limit);
 
     *o_ptr = (o_T) out;
 }
 
-
-
 template <typename io_T, typename acc_T, typename b_T, mli_math_type math_type>
 MLI_FORCE_INLINE io_T result_cast(const acc_T acc, const b_T bias, const int32_t out_mul,
                                const conv_math_params* math_params) {
@@ -438,21 +436,64 @@ MLI_FORCE_INLINE int8_t result_cast<int8_t, mli_acc32_t, int32_t, S8ASYM_MATH>(
     return out_val;
 }
 
+template <>  // Specialization: mli_acc40_t accumulator -> int16_t result.
+MLI_FORCE_INLINE int16_t ir_result_cast_relu_store(  // Despite "store" in the name, the value is returned; no output pointer is taken.
+        const mli_acc40_t acc,
+        const fx_quant_specific_params* math_params,  // not referenced in this body
+        const int16_t val_min_limit,
+        const int16_t val_max_limit) {
+    int16_t out_val = mli_math_cast_fx<mli_acc40_t, int16_t>(acc);  // cast with no shift argument
+    out_val = mli_math_min_fx(out_val, val_max_limit);  // apply the upper limit first...
+    out_val = mli_math_max_fx(out_val, val_min_limit);  // ...then the lower limit
+    return out_val;
+}
+
+template <>  // Specialization: mli_acc32_t accumulator -> int16_t result.
+MLI_FORCE_INLINE int16_t ir_result_cast_relu_store(  // Despite "store" in the name, the value is returned; no output pointer is taken.
+        const mli_acc32_t acc,
+        const fx_quant_specific_params* math_params,  // not referenced in this body
+        const int16_t val_min_limit,
+        const int16_t val_max_limit) {
+	int16_t out_val = mli_math_cast_fx<mli_acc32_t, int16_t>(acc);  // cast with no shift argument
+    out_val = mli_math_min_fx(out_val, val_max_limit);  // apply the upper limit first...
+    out_val = mli_math_max_fx(out_val, val_min_limit);  // ...then the lower limit
+    return out_val;
+}
+
+template <>  // Specialization: mli_acc32_t accumulator -> int8_t result, using s8asym parameters.
+MLI_FORCE_INLINE int8_t ir_result_cast_relu_store(  // Despite "store" in the name, the value is returned; no output pointer is taken.
+        const mli_acc32_t acc,
+        const s8asym_quant_specific_params* quant_params,  // only out_offset is read here
+		const int8_t val_min_limit,
+		const int8_t val_max_limit) {
+
+    const int16_t out_no_offset = mli_math_cast_fx<int32_t, int16_t>(acc);  // accumulator cast to int16_t, no shift argument
+    int8_t out_val = mli_math_cast_fx<int16_t, int8_t>(mli_math_add_fx(out_no_offset, quant_params->out_offset), 0);  // add out_offset, then cast to int8_t with a shift argument of 0
+
+    out_val = mli_math_min_fx(out_val, val_max_limit);  // apply the upper limit first...
+    out_val = mli_math_max_fx(out_val, val_min_limit);  // ...then the lower limit
+
+    return out_val;
+}
+
 template <typename acc_T>
-MLI_FORCE_INLINE acc_T ir_rnn_result_requantize(const acc_T acc, const fx_quant_specific_params* current_params,
-                                                        const fx_quant_specific_params* next_params, int krn_idx) {
-    const int shift = current_params->out_shift - next_params->out_shift;
-    return mli_math_acc_ashift_fx<acc_T>(acc, shift);
+MLI_FORCE_INLINE acc_T ir_rnn_result_requantize(  // New signature: current params only; the removed form also took next_params and krn_idx.
+		const acc_T acc,
+		const fx_quant_specific_params* current_params) {
+    const int in_to_ir_shift = current_params->out_shift;  // out_shift alone (the removed code subtracted next_params->out_shift)
+    return mli_math_acc_ashift_fx<acc_T>(acc, in_to_ir_shift);  // shift the accumulator by that amount via mli_math_acc_ashift_fx
 }
 
 template <>
-MLI_FORCE_INLINE mli_acc32_t ir_rnn_result_requantize(const mli_acc32_t acc, const s8asym_quant_specific_params* current_params,
-                                                        const s8asym_quant_specific_params* next_params, int krn_idx) {
-    const int32_t mul = current_params->out_mul / next_params->weight_scales[krn_idx];
-    const int shift = current_params->out_shift - next_params->weight_shifts[krn_idx];
+MLI_FORCE_INLINE mli_acc32_t ir_rnn_result_requantize(  // mli_acc32_t / s8asym specialization; next_params and krn_idx are no longer taken.
+		const mli_acc32_t acc,
+		const s8asym_quant_specific_params* current_params) {
+
+    const int32_t mul = current_params->out_mul;  // multiplier used as-is (the removed code divided it by next_params->weight_scales[krn_idx])
+    const int in_to_ir_shift = current_params->out_shift;  // shift used as-is (the removed code subtracted next_params->weight_shifts[krn_idx])
 
     auto accu_scaled = mli_math_mul_fx<int32_t, int64_t>(acc, mul);
-    auto out_no_offset = mli_math_cast_fx<int64_t, int32_t>(accu_scaled, shift);
+    auto out_no_offset = mli_math_cast_fx<int64_t, int32_t>(accu_scaled, in_to_ir_shift);  // cast the int64_t product to int32_t, passing in_to_ir_shift as the shift argument
     return out_no_offset;
 }
 
 
@@ -318,8 +318,8 @@ MLI_FORCE_INLINE vNx4short_t mli_prv_convert_sa8_fx16(
         const int16_t zero_point,
         const int16_t scale,
 		const int shift) {
-    int shift_right = MAX(shift, 0);
-    int shift_left = MAX(-shift, 0);
+    int shift_right = mli_math_max_fx(shift, 0);
+    int shift_left = mli_math_max_fx(-shift, 0);
     vNx4short_t in_biased_shifted_no_zp = mli_math_sub_fx<vNx4short_t>(in_val, zero_point);
     vNx4int_t in_scaled = mli_math_mul_fx<vNx4short_t, vNx4int_t>(in_biased_shifted_no_zp, scale);
     vNx4short_t res = mli_math_cast_fx<vNx4int_t, vNx4short_t>(in_scaled, shift_right);
@@ -423,8 +423,8 @@ MLI_FORCE_INLINE void result_cast_relu_store_v(
 
     accu_scaled = accu_scaled + quant_params->out_offset;
 
-    accu_scaled = MIN(accu_scaled, val_max_limit);
-    accu_scaled = MAX(accu_scaled, val_min_limit);
+    accu_scaled = mli_math_min_fx(accu_scaled, val_max_limit);
+    accu_scaled = mli_math_max_fx(accu_scaled, val_min_limit);
 
     vNx4char_t out = to_vNx4char_t(accu_scaled);
     mli_prv_store_n_samples(o_ptr, out, num);
@@ -442,8 +442,8 @@ MLI_FORCE_INLINE void result_cast_relu_store_v(
 
     vNx4char_t out = mli_math_acc_cast_fx<vNx4char_t, vNx4accshort_t>(acc, quant_params->out_shift);
 
-    out = MIN(out, val_max_limit);
-    out = MAX(out, val_min_limit);
+    out = mli_math_min_fx(out, val_max_limit);
+    out = mli_math_max_fx(out, val_min_limit);
 
     mli_prv_store_n_samples(o_ptr, out, num);
 }
@@ -460,8 +460,8 @@ MLI_FORCE_INLINE void result_cast_relu_store_v(
 
     vNx2short_t out = mli_math_acc_cast_fx<vNx2short_t, vNx2accint_t>(acc, quant_params->out_shift);
 
-    out = MIN(out, val_max_limit);
-    out = MAX(out, val_min_limit);
+    out = mli_math_min_fx(out, val_max_limit);
+    out = mli_math_max_fx(out, val_min_limit);
 
     mli_prv_store_n_samples(o_ptr, out, num);
 }
@@ -478,32 +478,83 @@ MLI_FORCE_INLINE void result_cast_relu_store_v(
 
     vNx4short_t out = mli_math_acc_cast_fx<vNx4short_t, vNx4accint_t>(acc, quant_params->out_shift);
 
-    out = MIN(out, val_max_limit);
-    out = MAX(out, val_min_limit);
+    out = mli_math_min_fx(out, val_max_limit);
+    out = mli_math_max_fx(out, val_min_limit);
+
+    mli_prv_store_n_samples(o_ptr, out, num);
+}
+
+template <>  // Specialization: vNx4accshort_t accumulator, int8_t output, s8asym output parameters.
+MLI_FORCE_INLINE void ir_result_cast_relu_store_v(  // Casts, offsets and limits the accumulator, then stores num samples at o_ptr.
+        MLI_CONV_OUT_PTR(int8_t) __restrict o_ptr,
+        vNx4accshort_t acc,
+        const s8asym_quant_specific_out_params_v* quant_params,  // only out_offset is read here
+        const int16_t val_min_limit,
+        const int16_t val_max_limit,
+        int num) {
+
+	vNx4short_t accu_scaled = mli_math_acc_cast_fx<vNx4short_t, vNx4accshort_t>(acc);  // accumulator cast to vNx4short_t, no shift argument
+	accu_scaled = mli_math_add_fx<vNx4short_t>(accu_scaled, quant_params->out_offset);  // add the output offset
+
+	accu_scaled = mli_math_min_fx(accu_scaled, val_max_limit);  // apply the upper limit first...
+    accu_scaled = mli_math_max_fx(accu_scaled, val_min_limit);  // ...then the lower limit
+
+    vNx4char_t out = to_vNx4char_t(accu_scaled);  // convert to vNx4char_t only after limiting
+    mli_prv_store_n_samples(o_ptr, out, num);  // store num samples
+}
+
+template <>  // Specialization: vNx2accint_t accumulator, int16_t output.
+MLI_FORCE_INLINE void ir_result_cast_relu_store_v(  // Casts and limits the accumulator, then stores num samples at o_ptr.
+        MLI_CONV_OUT_PTR(int16_t) __restrict o_ptr,
+        vNx2accint_t acc,
+        const fx_quant_specific_params* quant_params,  // not referenced in this body
+        const int16_t val_min_limit,
+        const int16_t val_max_limit,
+        int num) {
+
+    vNx2short_t out = mli_math_acc_cast_fx<vNx2short_t, vNx2accint_t>(acc);  // accumulator cast to vNx2short_t, no shift argument
+
+    out = mli_math_min_fx(out, val_max_limit);  // apply the upper limit first...
+    out = mli_math_max_fx(out, val_min_limit);  // ...then the lower limit
+
+    mli_prv_store_n_samples(o_ptr, out, num);  // store num samples
+}
+
+template <>  // Specialization: vNx4accint_t accumulator, int16_t output.
+MLI_FORCE_INLINE void ir_result_cast_relu_store_v(  // Casts and limits the accumulator, then stores num samples at o_ptr.
+        MLI_CONV_OUT_PTR(int16_t) __restrict o_ptr,
+        vNx4accint_t acc,
+        const fx_quant_specific_params* quant_params,  // not referenced in this body
+        const int16_t val_min_limit,
+        const int16_t val_max_limit,
+        int num) {
+
+    vNx4short_t out = mli_math_acc_cast_fx<vNx4short_t, vNx4accint_t>(acc);  // accumulator cast to vNx4short_t, no shift argument
+
+    out = mli_math_min_fx(out, val_max_limit);  // apply the upper limit first...
+    out = mli_math_max_fx(out, val_min_limit);  // ...then the lower limit
 
     mli_prv_store_n_samples(o_ptr, out, num);
 }
 
 template <typename acc_T>
-MLI_FORCE_INLINE acc_T ir_rnn_result_requantize(const acc_T acc, const fx_quant_specific_params* current_params,
-                                                const fx_quant_specific_params* next_params, int krn_idx) {
-    const int shift = current_params->out_shift - next_params->out_shift;
-    int shift_right = MAX(shift, 0);
-    int shift_left = MAX(-shift, 0);
+MLI_FORCE_INLINE acc_T ir_rnn_result_requantize(  // New signature: one params pointer; the removed form also took next_params and krn_idx.
+		const acc_T acc,
+		const fx_quant_specific_params* params) {
+    const int in_to_ir_shift = params->out_shift;  // out_shift alone (the removed code subtracted next_params->out_shift)
+    int shift_right = mli_math_max_fx(in_to_ir_shift, 0);  // non-negative part of the shift; passed to mli_math_asr_rnd_fx below
+    int shift_left = mli_math_max_fx(-in_to_ir_shift, 0);  // magnitude of a negative shift; passed to mli_math_asl_fx below
     acc_T acc_shifted = mli_math_asl_fx(acc, shift_left);
     return mli_math_asr_rnd_fx<acc_T, int>(acc_shifted, shift_right);
 }
 
 template <>
 MLI_FORCE_INLINE vNx4accshort_t ir_rnn_result_requantize(
         const vNx4accshort_t acc,
-        const s8asym_quant_specific_params* current_params,
-        const s8asym_quant_specific_params* next_params, int krn_idx) {
-
-    MLI_ASSERT(krn_idx == 0);
+        const s8asym_quant_specific_params* params) {
 
-    const int32_t mul = current_params->out_mul / next_params->weight_scales[0];
-    const int shift = current_params->out_shift - next_params->weight_shifts[0];
+    const int32_t mul = params->out_mul;
+    const int in_to_ir_shift = params->out_shift;
 
     int mul_norm = mli_math_norm_fx<int32_t, int32_t>(mul);
     int32_t mul_shifted = mul << mul_norm;
@@ -512,14 +563,42 @@ MLI_FORCE_INLINE vNx4accshort_t ir_rnn_result_requantize(
     vNx4int_t acc_norm = mli_math_norm_fx<vNx4int_t, vNx4int_t>(acc_int);
     acc_int = mli_math_asl_fx<vNx4int_t, vNx4int_t>(acc_int, acc_norm);
 
-    vNx4int_t total_shift = mli_math_add_fx<vNx4int_t>(acc_norm, (mul_norm + shift));
     vNx4int_t acc_scaled = mli_math_mul_fx_high(acc_int, mul_shifted);
-    vNx4int_t acc_shifted = mli_math_asr_rnd_fx(acc_scaled, total_shift);
 
+    constexpr int mul_high_shift = 32;
+    constexpr int max_int_shift = 30;
+    vNx4int_t total_shift = mli_math_add_fx<vNx4int_t>(acc_norm, (mul_norm - mul_high_shift + in_to_ir_shift));
+    vNx4int_t shift_left = mli_math_max_fx(-total_shift, 0);
+    vNx4int_t shift_right = mli_math_max_fx(total_shift, 0);
+
+    vNx4int_t preshift = mli_math_max_fx(shift_right - max_int_shift, 0);
+    shift_right = shift_right - preshift;
+
+    vNx4int_t acc_shifted = mli_math_asr_fx(acc_scaled, preshift);
+    acc_shifted = mli_math_asr_rnd_fx(acc_shifted, shift_right);
+    acc_shifted = mli_math_asl_fx(acc_shifted, shift_left);
+
+#if (__Xvec_guard_bit_option == 0)
+    vNx4short_t acc_short = mli_math_cast_fx<vNx4int_t, vNx4short_t>(acc_shifted);
+    vNx4accshort_t res = mli_math_init_accu_add<vNx4short_t, vNx4accshort_t>(acc_short, (vNx4short_t)0);
+#else
     vNx4int_t norm;
     vNx4short_t acc_short = mli_math_norm_cast_fx</*left_shift*/ false>(acc_shifted , &norm);
+
+    constexpr int guard_bits = 8;
+    vNx4int_t mask = (1 << norm) - 1;
+    vNx4int_t acc_shifted_low = acc_shifted & mask;
+    // If norm exceeds the number of guard bits, the masked low part (acc_shifted_low) must be
+    // shifted right by the excess, because the result is shifted left by at most guard_bits.
+    vNx4int_t mask_shift = mli_math_max_fx(norm - guard_bits, 0);
+    acc_shifted_low = mli_math_asr_fx(acc_shifted_low, mask_shift);
+
+    norm = mli_math_min_fx(norm, guard_bits);
     vNx4accshort_t res = mli_math_init_accu_add<vNx4short_t, vNx4accshort_t>(acc_short, (vNx4short_t)0);
     res = mli_math_asl_fx<vNx4accshort_t, vNx4short_t>(res, to_vNx4short_t(norm));
+    res = mli_math_add(res, to_vNx4short_t(acc_shifted_low));
+#endif
+
     return res;
 }