2323 * Targets:
2424 *
2525 ******************************************************************************/
26+
2627template <typename io_T>
2728static inline void __attribute__ ((always_inline)) avepool_chw_nopad(
2829 const int row_beg,
@@ -48,6 +49,9 @@ static inline void __attribute__((always_inline)) avepool_chw_nopad(
4849 (void )padding_bot;
4950
5051 const int kernel_size = kernel_width * kernel_height;
52+ int16_t mul = 0 ;
53+ int shift = 0 ;
54+ get_mul_shift_value (kernel_size, kernel_size, &mul, &shift);
5155
5256 MLI_OUT_PTR (io_T) __restrict p_out_ftrs = out_ftrs + row_beg * out_width + clmn_beg;
5357 MLI_PTR (io_T) __restrict in_ptr = (MLI_PTR (io_T))in_ftrs + in_width * (row_beg * stride_height - padding_top) +
@@ -59,9 +63,9 @@ static inline void __attribute__((always_inline)) avepool_chw_nopad(
5963 for (int j = 0 ; j < (row_end - row_beg); j++) {
6064 for (int k = 0 ; k < (clmn_end - clmn_beg); k++) {
6165 accum40_t accum_40 = fx_create_a40 (0x0 , 0x0 );
62- reduce_sum2D (&accum_40, in_ptr, kernel_width, kernel_height, in_width);
66+ reduce_sum2D (&accum_40, in_ptr, kernel_width, kernel_height, in_width, mul );
6367 // Write results
64- mli_prv_clip_div_and_store_result (p_out_ftrs, kernel_size, accum_40 );
68+ mli_prv_shift_clip_and_store_output (p_out_ftrs, &accum_40, shift );
6569
6670 p_out_ftrs++;
6771 in_ptr += stride_width;
@@ -99,6 +103,9 @@ static inline void __attribute__((always_inline)) avepool_chw_nopad_even(
99103 (void )padding_bot;
100104
101105 const int kernel_size = kernel_width * kernel_height;
106+ int16_t mul = 0 ;
107+ int shift = 0 ;
108+ get_mul_shift_value (kernel_size, kernel_size, &mul, &shift);
102109
103110 MLI_OUT_PTR (io_T) __restrict p_out_ftrs = out_ftrs + row_beg * out_width + clmn_beg;
104111 MLI_PTR (io_T) __restrict in_ptr = (MLI_PTR (io_T))in_ftrs + in_width * (row_beg * stride_height - padding_top) +
@@ -110,11 +117,10 @@ static inline void __attribute__((always_inline)) avepool_chw_nopad_even(
110117 for (int j = 0 ; j < (row_end - row_beg); j++) {
111118 for (int k = 0 ; k < (clmn_end - clmn_beg); k++) {
112119 // Core Sum
113-
114120 accum40_t accum_40 = fx_create_a40 (0x0 , 0x0 );
115- reduce_sum2D_even (&accum_40, in_ptr, kernel_width, kernel_height, in_width);
121+ reduce_sum2D_even (&accum_40, in_ptr, kernel_width, kernel_height, in_width, mul );
116122 // Write results
117- mli_prv_clip_div_and_store_result (p_out_ftrs, kernel_size, accum_40 );
123+ mli_prv_shift_clip_and_store_output (p_out_ftrs, &accum_40, shift );
118124
119125 p_out_ftrs++;
120126 in_ptr += stride_width;
@@ -150,6 +156,7 @@ static inline void __attribute__((always_inline)) avepool_chw(
150156 const int padding_bot) {
151157 (void )padding_right;
152158 (void )padding_bot;
159+ unsigned int max_kernel_size = kernel_width * kernel_height;
153160
154161 MLI_OUT_PTR (io_T) __restrict out_ptr = out_ftrs + clmn_beg * out_width + clmn_beg;
155162 for (int ch_idx = 0 ; ch_idx < channels_num; ch_idx++) {
@@ -168,6 +175,9 @@ static inline void __attribute__((always_inline)) avepool_chw(
168175 int clmns = kernel_width + right_comp + left_comp;
169176
170177 const int kernel_size = rows * clmns;
178+ int16_t mul = 0 ;
179+ int shift = 0 ;
180+ get_mul_shift_value (kernel_size, max_kernel_size, &mul, &shift);
171181
172182 const MLI_PTR (io_T) __restrict in_ptr =
173183 in_ftrs + // starting point
@@ -176,9 +186,9 @@ static inline void __attribute__((always_inline)) avepool_chw(
176186 (W_idx * stride_width) - padding_left - left_comp; // move to column
177187
178188 accum40_t accum_40 = fx_create_a40 (0x0 , 0x0 );
179- reduce_sum2D (&accum_40, in_ptr, clmns, rows, in_width);
189+ reduce_sum2D (&accum_40, in_ptr, clmns, rows, in_width, mul );
180190 // Write results
181- mli_prv_clip_div_and_store_result (&p_out_ftrs[W_idx], kernel_size, accum_40 );
191+ mli_prv_shift_clip_and_store_output (&p_out_ftrs[W_idx], &accum_40, shift );
182192
183193 } // W_idx
184194 out_ptr += out_width + clmn_beg - clmn_end;
@@ -213,21 +223,21 @@ static inline void __attribute__((always_inline)) avepool_chw_k4x4_str1_nopad(
213223
214224 MLI_ASSERT (stride_width == 1 );
215225 MLI_ASSERT (stride_height == 1 );
226+ MLI_ASSERT (kernel_width == 4 );
227+ MLI_ASSERT (kernel_height == 4 );
216228
217229 MLI_OUT_PTR (io_T) __restrict p_out_ftrs = out_ftrs + row_beg * out_width + clmn_beg;
218230 MLI_PTR (io_T) __restrict in_ptr = (MLI_PTR (io_T))in_ftrs + in_width * (row_beg * stride_height - padding_top) +
219231 (clmn_beg * stride_width - padding_left);
220232 const int delta_W = (clmn_end - clmn_beg);
221233 const int delta_H = (row_end - row_beg);
222- const int kernel_size = (kernel_width * kernel_height);
223234
224235 for (int ch_idx = 0 ; ch_idx < channels_num; ch_idx++) {
225236 for (int j = 0 ; j < (row_end - row_beg); j++) {
226237 for (int k = 0 ; k < (clmn_end - clmn_beg); k++) {
227238 accum40_t accum_40 = fx_create_a40 (0x0 , 0x0 );
228- reduce_sum2D_even (&accum_40, (const MLI_PTR (io_T))in_ptr, kernel_width, kernel_height, in_width);
229-
230- mli_prv_clip_div_and_store_result (p_out_ftrs, kernel_size, accum_40);
239+ reduce_sum2D_even (&accum_40, (const MLI_PTR (io_T))in_ptr, kernel_width, kernel_height, in_width, 1 );
240+ mli_prv_shift_clip_and_store_output (p_out_ftrs, &accum_40, 4 );
231241
232242 p_out_ftrs++;
233243 in_ptr += stride_width;
@@ -269,17 +279,18 @@ static inline void __attribute__((always_inline)) avepool_chw_nopad_k2x2(
269279 (clmn_beg * stride_width - padding_left);
270280 const int delta_W = (clmn_end - clmn_beg);
271281 const int delta_H = (row_end - row_beg);
272- const int kernel_size = (kernel_width * kernel_height);
282+
283+ MLI_ASSERT (kernel_width == 2 );
284+ MLI_ASSERT (kernel_height == 2 );
273285
274286 for (int ch_idx = 0 ; ch_idx < channels_num; ch_idx++) {
275287 for (int j = 0 ; j < (row_end - row_beg); j++) {
276288 for (int k = 0 ; k < (clmn_end - clmn_beg); k++) {
277289 // Core Sum
278290
279291 accum40_t accum_40 = fx_create_a40 (0x0 , 0x0 );
280- reduce_sum2D_even (&accum_40, in_ptr, kernel_width, kernel_height, in_width);
281-
282- mli_prv_clip_div_and_store_result (p_out_ftrs, kernel_size, accum_40);
292+ reduce_sum2D_even (&accum_40, in_ptr, kernel_width, kernel_height, in_width, 1 );
293+ mli_prv_shift_clip_and_store_output (p_out_ftrs, &accum_40, 2 );
283294
284295 p_out_ftrs++;
285296 in_ptr += stride_width;
@@ -317,6 +328,9 @@ static inline void __attribute__((always_inline)) avepool_chw_nopad_k4_Nx2_N_eve
317328 (void )padding_bot;
318329
319330 const int kernel_size = kernel_height * kernel_width;
331+ int16_t mul = 0 ;
332+ int shift = 0 ;
333+ get_mul_shift_value (kernel_size, kernel_size, &mul, &shift);
320334
321335 MLI_OUT_PTR (io_T) __restrict p_out_ftrs = out_ftrs + row_beg * out_width + clmn_beg;
322336 MLI_PTR (io_T) __restrict in_ptr = (MLI_PTR (io_T))in_ftrs + in_width * (row_beg * stride_height - padding_top) +
@@ -329,9 +343,8 @@ static inline void __attribute__((always_inline)) avepool_chw_nopad_k4_Nx2_N_eve
329343 for (int k = 0 ; k < (clmn_end - clmn_beg); k++) {
330344 // Core Sum
331345 accum40_t accum_40 = fx_create_a40 (0x0 , 0x0 );
332- reduce_sum2D_even (&accum_40, in_ptr, kernel_width, kernel_height, in_width);
333-
334- mli_prv_clip_div_and_store_result (p_out_ftrs, kernel_size, accum_40);
346+ reduce_sum2D_even (&accum_40, in_ptr, kernel_width, kernel_height, in_width, mul);
347+ mli_prv_shift_clip_and_store_output (p_out_ftrs, &accum_40, shift);
335348
336349 p_out_ftrs++;
337350 in_ptr += stride_width;
0 commit comments