@@ -83,9 +83,9 @@ static __attribute__ ((always_inline)) void depthwise_convolution2D_hwcn_nopad(
8383 const int out_increment_in_ch_loop = channel_per_loop - out_compensation_row_loop;
8484 const int out_increment_in_ch_loop_v = channels_per_loop_v - out_compensation_row_loop;
8585
86- MLI_PTR (io_T) __restrict in_ptr = (MLI_PTR (io_T) __restrict)in_ftrs;
86+ const MLI_PTR (io_T) __restrict in_ptr = (MLI_PTR (io_T) __restrict)in_ftrs;
8787 MLI_CONV_OUT_PTR (io_T) __restrict out_ptr = (MLI_CONV_OUT_PTR (io_T) __restrict)out_ftrs;
88- MLI_PTR (w_T) __restrict w_ptr = (MLI_PTR (w_T) __restrict)weights;
88+ const MLI_PTR (w_T) __restrict w_ptr = (MLI_PTR (w_T) __restrict)weights;
8989 // MLI_PTR(w_T) __restrict w_ptr_local = (MLI_PTR(w_T) __restrict)weights;
9090 int out_ch_idx = 0 ;
9191
@@ -114,20 +114,21 @@ static __attribute__ ((always_inline)) void depthwise_convolution2D_hwcn_nopad(
114114 __builtin_assume (amount_columns > 0 );
115115
116116 __v2i32_t v2accu_dotprod = v2acc_weights_add;
117- dotprod2D_hwc_v (in_ptr, w_ptr, &v2accu_dotprod, kernel_width, kernel_height,
118- in_col_step, in_row_step, krn_col_step, krn_row_step);
119- in_ptr += in_increment_clmn_loop;
120117 for (int W_idx = 0 ; W_idx < amount_columns; W_idx++) {
118+ dotprod2D_hwc_v (&in_ptr, &w_ptr, &v2accu_dotprod, kernel_width, kernel_height,
119+ in_col_step, in_row_step, krn_col_step, krn_row_step);
120+ // compensate for the input tensor pointer increment done inside dotprod2D_hwc_v
121+ in_ptr += in_increment_clmn_loop - kernel_height * in_row_step;
122+ // compensate for the weights pointer increment done inside dotprod2D_hwc_v
123+ w_ptr -= kernel_height * krn_row_step;
124+
121125 // Cast result to output type
122126 mli_prv_clip_relu_store_output_v (out_ptr, &v2accu_dotprod, v2quant_params, val_min_limit, val_max_limit);
123127 out_ptr += out_increment_clmn_loop;
124128
125129 v2accu_dotprod = v2acc_weights_add;
126- dotprod2D_hwc_v (in_ptr, w_ptr, &v2accu_dotprod, kernel_width, kernel_height,
127- in_col_step, in_row_step, krn_col_step, krn_row_step);
128- in_ptr += in_increment_clmn_loop;
129130 } // for W_idx
130- in_ptr += in_increment_row_loop - stride_width * filters * in_ch ;
131+ in_ptr += in_increment_row_loop;
131132 out_ptr += out_increment_row_loop;
132133 } // for H_idx
133134 in_ptr -= in_compensation_row_loop;
@@ -154,28 +155,31 @@ static __attribute__ ((always_inline)) void depthwise_convolution2D_hwcn_nopad(
154155 // out_val = (x-x_zp)*(w) + b) = -sum_i(w*x_zp) + sum(x*w) + b
155156 // ============================================
156157 __v2i32_t accu = v2global_other_additives;
157- accu = dotprod2D_inp_width_v (in_ptr, w_ptr, &accu, kernel_width, kernel_height,
158+ accu = dotprod2D_inp_width_v (& in_ptr, & w_ptr, &accu, kernel_width, kernel_height,
158159 in_col_step, in_row_step, krn_col_step, krn_row_step, in_increment_clmn_loop);
159-
160+ // compensate for the input tensor pointer increment done inside dotprod2D_inp_width_v
161+ in_ptr += 2 * in_increment_clmn_loop - kernel_height * in_row_step;
162+ // compensate for the weights pointer increment done inside dotprod2D_inp_width_v
163+ w_ptr -= kernel_height * krn_row_step;
160164 // Cast result to output type
161165 mli_prv_clip_relu_store_output_inp_width_v (out_ptr, &accu, &quant_params, val_min_limit, val_max_limit, out_increment_clmn_loop);
162-
163- in_ptr += 2 * in_increment_clmn_loop;
164166 out_ptr += 2 * out_increment_clmn_loop;
165167 } // for W_idx
166168 if ( amount_columns & 0x1 ) {
167169 // Convolution core. Here calculations performes in a unfolded expression way:
168170 // out_val = (x-x_zp)*(w) + b) = -sum_i(w*x_zp) + sum(x*w) + b
169171 // ============================================
170172 acc_T accu = 0 ;
171- accu = dotprod2D (in_ptr, w_ptr, accu, kernel_width, kernel_height,
173+ accu = dotprod2D (& in_ptr, & w_ptr, accu, kernel_width, kernel_height,
172174 in_col_step, in_row_step, krn_col_step, krn_row_step);
175+ // compensate for the input tensor pointer increment done inside dotprod2D
176+ in_ptr += in_increment_clmn_loop - kernel_height * in_row_step;
177+ // compensate for the weights pointer increment done inside dotprod2D
178+ w_ptr -= kernel_height * krn_row_step;
173179 accu += global_other_additives;
174180
175181 // Cast result to output type
176182 mli_prv_clip_relu_store_output (out_ptr, accu, &quant_params, val_min_limit, val_max_limit);
177-
178- in_ptr += in_increment_clmn_loop;
179183 out_ptr += out_increment_clmn_loop;
180184 }
181185 in_ptr += in_increment_row_loop;
@@ -882,3 +886,4 @@ LOOP_PIPELINE_ENABLE
882886
883887#endif // _MLI_KRN_CONV2D_HWC_H_
884888
889+
0 commit comments