@@ -130,6 +130,8 @@ static inline void ip_op(
130130 const int out_shift,
131131 const int16_t input_offset,
132132 const int16_t output_offset) {
133+ const int left_shift = out_shift > 0 ? 0 : -out_shift;
134+ const int right_shift = out_shift > 0 ? out_shift : 0 ;
133135 // Matrix-Vector multiplication
134136 // ==============================
135137 if (_Rarely (in_elements < 8 )) {
@@ -143,14 +145,16 @@ static inline void ip_op(
143145 weights++;
144146 }
145147 in -= in_elements;
148+
149+ accu = mli_math_acc_ashift_fx (accu, -left_shift);
146150 accu = mli_math_scale_mul<acc_T, true >(accu, out_mul);
147151
148152 // adding the output offset needs to happen after the output mul and output shift
149153 // but before the cast to the output container size.
150154 // because the cast and shift are combined in one function, the output offset is
151- // added before, and multiplied with 1<< out_shift to compensate.
152- accu = mli_math_mac_fx (accu, (int16_t )(1 <<out_shift ), (io_T)output_offset);
153- out[o_idx] = mli_math_acc_cast_fx<io_T, acc_T> (accu, out_shift );
155+ // added before, and multiplied with 1<< right_shift to compensate.
156+ accu = mli_math_mac_fx (accu, (int16_t )(1 <<right_shift ), (io_T)output_offset);
157+ out[o_idx] = mli_math_acc_cast_fx<io_T, acc_T> (accu, right_shift );
154158 }
155159 } else {
156160 if ((in_elements & 0x3 ) == 0 ) {
@@ -165,15 +169,16 @@ LOOP_PIPELINE_ENABLE
165169 weights += 4 ;
166170 }
167171 in -= in_elements;
168-
172+
173+ accu = mli_math_acc_ashift_fx (accu, -left_shift);
169174 accu = mli_math_scale_mul<acc_T, true >(accu, out_mul);
170175
171176 // adding the output offset needs to happen after the output mul and output shift
172177 // but before the cast to the output container size.
173178 // because the cast and shift are combined in one function, the output offset is
174- // added before, and multiplied with 1<< out_shift to compensate.
175- accu = mli_math_mac_fx (accu, (int16_t )(1 <<out_shift ), (io_T)output_offset);
176- out[o_idx] = mli_math_acc_cast_fx<io_T, acc_T> (accu, out_shift );
179+ // added before, and multiplied with 1<< right_shift to compensate.
180+ accu = mli_math_mac_fx (accu, (int16_t )(1 <<right_shift ), (io_T)output_offset);
181+ out[o_idx] = mli_math_acc_cast_fx<io_T, acc_T> (accu, right_shift );
177182 }
178183 } else {
179184 for (int o_idx = 0 ; o_idx < out_elements; o_idx++) {
@@ -197,14 +202,16 @@ LOOP_PIPELINE_ENABLE
197202 weights += 4 ;
198203 }
199204 in -= in_elements;
205+
206+ accu = mli_math_acc_ashift_fx (accu, -left_shift);
200207 accu = mli_math_scale_mul<acc_T, true >(accu, out_mul);
201208
202209 // adding the output offset needs to happen after the output mul and output shift
203210 // but before the cast to the output container size.
204211 // because the cast and shift are combined in one function, the output offset is
205- // added before, and multiplied with 1<< out_shift to compensate.
206- accu = mli_math_mac_fx (accu, (int16_t )(1 <<out_shift ), (io_T)output_offset);
207- out[o_idx] = mli_math_acc_cast_fx<io_T, acc_T> (accu, out_shift );
212+ // added before, and multiplied with 1<< right_shift to compensate.
213+ accu = mli_math_mac_fx (accu, (int16_t )(1 <<right_shift ), (io_T)output_offset);
214+ out[o_idx] = mli_math_acc_cast_fx<io_T, acc_T> (accu, right_shift );
208215 }
209216 }
210217 }
@@ -224,7 +231,7 @@ static void fully_connected_prepare_and_run(
224231 MLI_CONV_OUT_PTR (io_T) out_ptr = (MLI_CONV_OUT_PTR (io_T)) (out->data );
225232
226233 int ch_out = bias->shape [0 ];
227- int in_sz = mli_prv_count_elem_num (in) ;
234+ int in_sz = weights-> shape [ 1 ] ;
228235
229236 // Define shift values
230237 int bias_shift = mli_prv_calc_shift (in, weights, bias);
0 commit comments