@@ -94,6 +94,27 @@ static inline void rnn_dense_op_stacked(
9494 dense_out_ptr -= gates_num * out_elements;
9595}
9696
97+ MLI_FORCE_INLINE vNx2accint_t mli_math_add_accus (vNx2accint_t L, vNx2accint_t R) {
98+ return mli_math_add (L, R);
99+ }
100+
101+ MLI_FORCE_INLINE vNx4accint_t mli_math_add_accus (vNx4accint_t L, vNx4accint_t R) {
102+ return mli_math_add (L, R);
103+ }
104+
105+ MLI_FORCE_INLINE vNx4accshort_t mli_math_add_accus (vNx4accshort_t L, vNx4accshort_t R) {
106+ #if (__Xvec_guard_bit_option == 0)
107+ vNx4short_t L_short = mli_math_acc_cast_fx<vNx4short_t, vNx4accshort_t>(L);
108+ vNx4short_t R_short = mli_math_acc_cast_fx<vNx4short_t, vNx4accshort_t>(R);
109+
110+ vNx4short_t res = mli_math_add_fx<vNx4short_t>(L_short, R_short);
111+
112+ return mli_math_init_accu_add<vNx4short_t, vNx4accshort_t>(res, (vNx4short_t)0 );
113+ #else
114+ return mli_math_add (L, R);
115+ #endif
116+ }
117+
97118template <typename io_T, typename w_T, typename b_T, typename acc_T, typename quant_T>
98119static inline void rnn_dense_op (
99120 const MLI_PTR (io_T) __restrict * inputs,
@@ -125,7 +146,8 @@ static inline void rnn_dense_op(
125146 output_params = adjust_quant_params_v (&in_to_out_quant_params[idx], 0 );
126147 accu = dotprod_inputzp_1D_v (inputs[idx], &weights[idx][o_idx], accu, in_elements[idx],
127148 1 , w_ch_out_mem_strides[idx], &in_to_out_quant_params[idx]);
128- accu = mli_math_add (accu, prev_step);
149+
150+ accu = mli_math_add_accus (accu, prev_step);
129151
130152 if (inputs_num - idx != 1 ) {
131153 mli::krn::ref::adjust_quant_params (&in_to_out_quant_params[idx], o_idx);
0 commit comments