@@ -106,31 +106,28 @@ static __attribute__ ((always_inline)) void depthwise_convolution2D_hwcn_nopad(
106106 acc_T bias_add_ch1 = bias_additive (*biases++, 0x0 , &v2quant_params[0 ]);
107107 acc_T bias_add_ch2 = bias_additive (*biases++, 0x0 , &v2quant_params[1 ]);
108108
109- v2accum40_t v2acc_weights_add = {bias_add_ch1, bias_add_ch2};
109+ __v2i32_t v2acc_weights_add = {bias_add_ch1, bias_add_ch2};
110110 v2acc_weights_add = weights_additive_v (w_ptr, &v2acc_weights_add, &quant_params, kernel_width, kernel_height,
111111 krn_col_step, krn_row_step);
112- __v2i32_t v2acc_weights_add_int = {fx_q31_cast_nf_a40 (fx_get_v2a40 (v2acc_weights_add, 0 )),
113- fx_q31_cast_nf_a40 (fx_get_v2a40 (v2acc_weights_add, 1 ))};
114112 __builtin_assume (amount_rows > 0 );
115113 for (int H_idx = 0 ; H_idx < amount_rows; H_idx++) {
116114 __builtin_assume (amount_columns > 0 );
117115
118- __v2i32_t v2accu_dotprod = v2acc_weights_add_int ;
116+ __v2i32_t v2accu_dotprod = v2acc_weights_add ;
119117 dotprod2D_hwc_v (in_ptr, w_ptr, &v2accu_dotprod, kernel_width, kernel_height,
120118 in_col_step, in_row_step, krn_col_step, krn_row_step);
119+ in_ptr += in_increment_clmn_loop;
121120 for (int W_idx = 0 ; W_idx < amount_columns; W_idx++) {
122-
123121 // Cast result to output type
124122 mli_prv_clip_relu_store_output_v (out_ptr, &v2accu_dotprod, v2quant_params, val_min_limit, val_max_limit);
125-
126- in_ptr += in_increment_clmn_loop;
127123 out_ptr += out_increment_clmn_loop;
128124
129- v2accu_dotprod = v2acc_weights_add_int ;
125+ v2accu_dotprod = v2acc_weights_add ;
130126 dotprod2D_hwc_v (in_ptr, w_ptr, &v2accu_dotprod, kernel_width, kernel_height,
131127 in_col_step, in_row_step, krn_col_step, krn_row_step);
128+ in_ptr += in_increment_clmn_loop;
132129 } // for W_idx
133- in_ptr += in_increment_row_loop;
130+ in_ptr += in_increment_row_loop - stride_width * filters * in_ch ;
134131 out_ptr += out_increment_row_loop;
135132 } // for H_idx
136133 in_ptr -= in_compensation_row_loop;
@@ -156,10 +153,9 @@ static __attribute__ ((always_inline)) void depthwise_convolution2D_hwcn_nopad(
156153 // Convolution core. Here the calculation is performed in an unfolded-expression way:
157154 // out_val = (x-x_zp)*(w) + b = -sum_i(w*x_zp) + sum(x*w) + b
158155 // ============================================
159- __v2i32_t accu = { 0 , 0 } ;
156+ __v2i32_t accu = v2global_other_additives ;
160157 accu = dotprod2D_inp_width_v (in_ptr, w_ptr, &accu, kernel_width, kernel_height,
161158 in_col_step, in_row_step, krn_col_step, krn_row_step, in_increment_clmn_loop);
162- accu += v2global_other_additives;
163159
164160 // Cast result to output type
165161 mli_prv_clip_relu_store_output_inp_width_v (out_ptr, &accu, &quant_params, val_min_limit, val_max_limit, out_increment_clmn_loop);
0 commit comments