Skip to content

Commit 93deccf

Browse files
committed
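Performance optimizations for avepool: replace the per-output division (mli_prv_clip_div_and_store_result) with a multiplier/shift pair from get_mul_shift_value, applied via reduce_sum2D* and mli_prv_shift_clip_and_store_output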
1 parent eb99058 commit 93deccf

File tree

5 files changed

+214
-219
lines changed

5 files changed

+214
-219
lines changed

lib/src/kernels/pooling/mli_krn_avepool_chw.h

Lines changed: 31 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
* Targets:
2424
*
2525
******************************************************************************/
26+
2627
template <typename io_T>
2728
static inline void __attribute__((always_inline)) avepool_chw_nopad(
2829
const int row_beg,
@@ -48,6 +49,9 @@ static inline void __attribute__((always_inline)) avepool_chw_nopad(
4849
(void)padding_bot;
4950

5051
const int kernel_size = kernel_width * kernel_height;
52+
int16_t mul = 0;
53+
int shift = 0;
54+
get_mul_shift_value(kernel_size, kernel_size, &mul, &shift);
5155

5256
MLI_OUT_PTR(io_T) __restrict p_out_ftrs = out_ftrs + row_beg * out_width + clmn_beg;
5357
MLI_PTR(io_T) __restrict in_ptr = (MLI_PTR(io_T))in_ftrs + in_width * (row_beg * stride_height - padding_top) +
@@ -59,9 +63,9 @@ static inline void __attribute__((always_inline)) avepool_chw_nopad(
5963
for (int j = 0; j < (row_end - row_beg); j++) {
6064
for (int k = 0; k < (clmn_end - clmn_beg); k++) {
6165
accum40_t accum_40 = fx_create_a40(0x0, 0x0);
62-
reduce_sum2D(&accum_40, in_ptr, kernel_width, kernel_height, in_width);
66+
reduce_sum2D(&accum_40, in_ptr, kernel_width, kernel_height, in_width, mul);
6367
// Write results
64-
mli_prv_clip_div_and_store_result(p_out_ftrs, kernel_size, accum_40);
68+
mli_prv_shift_clip_and_store_output(p_out_ftrs, &accum_40, shift);
6569

6670
p_out_ftrs++;
6771
in_ptr += stride_width;
@@ -99,6 +103,9 @@ static inline void __attribute__((always_inline)) avepool_chw_nopad_even(
99103
(void)padding_bot;
100104

101105
const int kernel_size = kernel_width * kernel_height;
106+
int16_t mul = 0;
107+
int shift = 0;
108+
get_mul_shift_value(kernel_size, kernel_size, &mul, &shift);
102109

103110
MLI_OUT_PTR(io_T) __restrict p_out_ftrs = out_ftrs + row_beg * out_width + clmn_beg;
104111
MLI_PTR(io_T) __restrict in_ptr = (MLI_PTR(io_T))in_ftrs + in_width * (row_beg * stride_height - padding_top) +
@@ -110,11 +117,10 @@ static inline void __attribute__((always_inline)) avepool_chw_nopad_even(
110117
for (int j = 0; j < (row_end - row_beg); j++) {
111118
for (int k = 0; k < (clmn_end - clmn_beg); k++) {
112119
// Core Sum
113-
114120
accum40_t accum_40 = fx_create_a40(0x0, 0x0);
115-
reduce_sum2D_even(&accum_40, in_ptr, kernel_width, kernel_height, in_width);
121+
reduce_sum2D_even(&accum_40, in_ptr, kernel_width, kernel_height, in_width, mul);
116122
// Write results
117-
mli_prv_clip_div_and_store_result(p_out_ftrs, kernel_size, accum_40);
123+
mli_prv_shift_clip_and_store_output(p_out_ftrs, &accum_40, shift);
118124

119125
p_out_ftrs++;
120126
in_ptr += stride_width;
@@ -150,6 +156,7 @@ static inline void __attribute__((always_inline)) avepool_chw(
150156
const int padding_bot) {
151157
(void)padding_right;
152158
(void)padding_bot;
159+
unsigned int max_kernel_size = kernel_width * kernel_height;
153160

154161
MLI_OUT_PTR(io_T) __restrict out_ptr = out_ftrs + clmn_beg * out_width + clmn_beg;
155162
for (int ch_idx = 0; ch_idx < channels_num; ch_idx++) {
@@ -168,6 +175,9 @@ static inline void __attribute__((always_inline)) avepool_chw(
168175
int clmns = kernel_width + right_comp + left_comp;
169176

170177
const int kernel_size = rows * clmns;
178+
int16_t mul = 0;
179+
int shift = 0;
180+
get_mul_shift_value(kernel_size, max_kernel_size, &mul, &shift);
171181

172182
const MLI_PTR(io_T) __restrict in_ptr =
173183
in_ftrs + // starting point
@@ -176,9 +186,9 @@ static inline void __attribute__((always_inline)) avepool_chw(
176186
(W_idx * stride_width) - padding_left - left_comp; // move to column
177187

178188
accum40_t accum_40 = fx_create_a40(0x0, 0x0);
179-
reduce_sum2D(&accum_40, in_ptr, clmns, rows, in_width);
189+
reduce_sum2D(&accum_40, in_ptr, clmns, rows, in_width, mul);
180190
// Write results
181-
mli_prv_clip_div_and_store_result(&p_out_ftrs[W_idx], kernel_size, accum_40);
191+
mli_prv_shift_clip_and_store_output(&p_out_ftrs[W_idx], &accum_40, shift);
182192

183193
} // W_idx
184194
out_ptr += out_width + clmn_beg - clmn_end;
@@ -213,21 +223,21 @@ static inline void __attribute__((always_inline)) avepool_chw_k4x4_str1_nopad(
213223

214224
MLI_ASSERT(stride_width == 1);
215225
MLI_ASSERT(stride_height == 1);
226+
MLI_ASSERT(kernel_width == 4);
227+
MLI_ASSERT(kernel_height == 4);
216228

217229
MLI_OUT_PTR(io_T) __restrict p_out_ftrs = out_ftrs + row_beg * out_width + clmn_beg;
218230
MLI_PTR(io_T) __restrict in_ptr = (MLI_PTR(io_T))in_ftrs + in_width * (row_beg * stride_height - padding_top) +
219231
(clmn_beg * stride_width - padding_left);
220232
const int delta_W = (clmn_end - clmn_beg);
221233
const int delta_H = (row_end - row_beg);
222-
const int kernel_size = (kernel_width * kernel_height);
223234

224235
for (int ch_idx = 0; ch_idx < channels_num; ch_idx++) {
225236
for (int j = 0; j < (row_end - row_beg); j++) {
226237
for (int k = 0; k < (clmn_end - clmn_beg); k++) {
227238
accum40_t accum_40 = fx_create_a40(0x0, 0x0);
228-
reduce_sum2D_even(&accum_40, (const MLI_PTR(io_T))in_ptr, kernel_width, kernel_height, in_width);
229-
230-
mli_prv_clip_div_and_store_result(p_out_ftrs, kernel_size, accum_40);
239+
reduce_sum2D_even(&accum_40, (const MLI_PTR(io_T))in_ptr, kernel_width, kernel_height, in_width, 1);
240+
mli_prv_shift_clip_and_store_output(p_out_ftrs, &accum_40, 4);
231241

232242
p_out_ftrs++;
233243
in_ptr += stride_width;
@@ -269,17 +279,18 @@ static inline void __attribute__((always_inline)) avepool_chw_nopad_k2x2(
269279
(clmn_beg * stride_width - padding_left);
270280
const int delta_W = (clmn_end - clmn_beg);
271281
const int delta_H = (row_end - row_beg);
272-
const int kernel_size = (kernel_width * kernel_height);
282+
283+
MLI_ASSERT(kernel_width == 2);
284+
MLI_ASSERT(kernel_height == 2);
273285

274286
for (int ch_idx = 0; ch_idx < channels_num; ch_idx++) {
275287
for (int j = 0; j < (row_end - row_beg); j++) {
276288
for (int k = 0; k < (clmn_end - clmn_beg); k++) {
277289
// Core Sum
278290

279291
accum40_t accum_40 = fx_create_a40(0x0, 0x0);
280-
reduce_sum2D_even(&accum_40, in_ptr, kernel_width, kernel_height, in_width);
281-
282-
mli_prv_clip_div_and_store_result(p_out_ftrs, kernel_size, accum_40);
292+
reduce_sum2D_even(&accum_40, in_ptr, kernel_width, kernel_height, in_width, 1);
293+
mli_prv_shift_clip_and_store_output(p_out_ftrs, &accum_40, 2);
283294

284295
p_out_ftrs++;
285296
in_ptr += stride_width;
@@ -317,6 +328,9 @@ static inline void __attribute__((always_inline)) avepool_chw_nopad_k4_Nx2_N_eve
317328
(void)padding_bot;
318329

319330
const int kernel_size = kernel_height * kernel_width;
331+
int16_t mul = 0;
332+
int shift = 0;
333+
get_mul_shift_value(kernel_size, kernel_size, &mul, &shift);
320334

321335
MLI_OUT_PTR(io_T) __restrict p_out_ftrs = out_ftrs + row_beg * out_width + clmn_beg;
322336
MLI_PTR(io_T) __restrict in_ptr = (MLI_PTR(io_T))in_ftrs + in_width * (row_beg * stride_height - padding_top) +
@@ -329,9 +343,8 @@ static inline void __attribute__((always_inline)) avepool_chw_nopad_k4_Nx2_N_eve
329343
for (int k = 0; k < (clmn_end - clmn_beg); k++) {
330344
// Core Sum
331345
accum40_t accum_40 = fx_create_a40(0x0, 0x0);
332-
reduce_sum2D_even(&accum_40, in_ptr, kernel_width, kernel_height, in_width);
333-
334-
mli_prv_clip_div_and_store_result(p_out_ftrs, kernel_size, accum_40);
346+
reduce_sum2D_even(&accum_40, in_ptr, kernel_width, kernel_height, in_width, mul);
347+
mli_prv_shift_clip_and_store_output(p_out_ftrs, &accum_40, shift);
335348

336349
p_out_ftrs++;
337350
in_ptr += stride_width;

lib/src/kernels/pooling/mli_krn_avepool_hwc_fx16.cc

Lines changed: 14 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include "mli_config.h"
1515
#include "mli_debug.h"
1616
#include "mli_helpers_api.h"
17+
#include "mli_krn_reduce_sum2d_chw.h"
1718
#include "mli_prv_dsp.h"
1819

1920
#ifdef __FXAPI__
@@ -44,8 +45,6 @@ extern "C" {
4445
*
4546
******************************************************************************/
4647

47-
static inline int32_t reduce_sum2D_hwc(MLI_PTR(int16_t) in, uint32_t width, uint32_t height, uint32_t channels, uint32_t in_row_step);
48-
4948
mli_status mli_krn_avepool_hwc_fx16(const mli_tensor* in, const mli_pool_cfg* cfg, mli_tensor* out) {
5049
mli_status ret = MLI_CHECK_STATUS(mli_chk_avepool_hwc_fx16(in, cfg, out), __func__);
5150
if (ret != MLI_STATUS_OK) return ret;
@@ -85,6 +84,10 @@ mli_status mli_krn_avepool_hwc_fx16(const mli_tensor* in, const mli_pool_cfg* cf
8584
const int32_t clmn_end = out_width - CEIL_DIV(padding_right, stride_width);
8685

8786
const int32_t kernel_size = kernel_width * kernel_height;
87+
int16_t mul = 0;
88+
int shift = 0;
89+
get_mul_shift_value(kernel_size, kernel_size, &mul, &shift);
90+
8891
for (int ch_idx = 0; ch_idx < channels_num; ch_idx++) {
8992
for (int H_idx = row_beg; H_idx < row_end; H_idx++) {
9093
for (int W_idx = clmn_beg; W_idx < clmn_end; W_idx++) {
@@ -96,12 +99,12 @@ mli_status mli_krn_avepool_hwc_fx16(const mli_tensor* in, const mli_pool_cfg* cf
9699
ch_idx; // move to channel
97100

98101
// Core Sum
99-
int32_t accum_32 = reduce_sum2D_hwc(in_ptr, kernel_width, kernel_height, channels_num, in_width);
102+
accum40_t accu = reduce_sum2D_hwc(in_ptr, kernel_width, kernel_height, channels_num, in_width, mul);
100103

101104
// Write results
102105
MLI_OUT_PTR(int16_t)
103106
p_out_ftrs = (MLI_OUT_PTR(int16_t))(out_ftrs + ch_idx + (H_idx * out_width + W_idx) * channels_num);
104-
mli_prv_clip_div_and_store_result(p_out_ftrs, kernel_size, accum_32);
107+
mli_prv_shift_clip_and_store_output(p_out_ftrs, &accu, shift);
105108
}
106109
}
107110
}
@@ -154,6 +157,11 @@ mli_status mli_krn_avepool_hwc_fx16(const mli_tensor* in, const mli_pool_cfg* cf
154157

155158
int32_t rows = kernel_height - top_comp - bottom_comp;
156159
int32_t clmns = kernel_width - right_comp - left_comp;
160+
unsigned int max_kernel_size = kernel_width * kernel_height;
161+
int kernel_size = rows * clmns;
162+
int16_t mul = 0;
163+
int shift = 0;
164+
get_mul_shift_value(kernel_size, max_kernel_size, &mul, &shift);
157165

158166
MLI_PTR(int16_t)
159167
in_ptr = in_ftrs + // starting point
@@ -163,12 +171,12 @@ mli_status mli_krn_avepool_hwc_fx16(const mli_tensor* in, const mli_pool_cfg* cf
163171
ch_idx;
164172

165173
// Core Sum
166-
int32_t accum_32 = reduce_sum2D_hwc(in_ptr, clmns, rows, channels_num, in_width);
174+
accum40_t accu = reduce_sum2D_hwc(in_ptr, clmns, rows, channels_num, in_width, mul);
167175

168176
// Write result
169177
MLI_OUT_PTR(int16_t)
170178
p_out_ftrs = (MLI_OUT_PTR(int16_t))(out_ftrs + ch_idx + (H_idx * out_width + W_idx) * channels_num);
171-
mli_prv_clip_div_and_store_result(p_out_ftrs, (int32_t)(rows * clmns), accum_32);
179+
mli_prv_shift_clip_and_store_output(p_out_ftrs, &accu, shift);
172180
}
173181
}
174182
}
@@ -185,17 +193,6 @@ mli_status mli_krn_avepool_hwc_fx16(const mli_tensor* in, const mli_pool_cfg* cf
185193
return MLI_STATUS_OK;
186194
}
187195

188-
static inline int32_t reduce_sum2D_hwc(MLI_PTR(int16_t) in, uint32_t width, uint32_t height, uint32_t channels, uint32_t in_row_step) {
189-
int32_t acc = 0;
190-
for (int row = 0; row < height; row++) {
191-
for (int clmn = 0; clmn < width; clmn++) {
192-
acc += in[clmn * channels];
193-
}
194-
in += in_row_step * channels;
195-
}
196-
return acc;
197-
}
198-
199196
#pragma code()
200197

201198
#ifdef __cplusplus

lib/src/kernels/pooling/mli_krn_avepool_hwc_fx8.cc

Lines changed: 14 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include "mli_config.h"
1515
#include "mli_debug.h"
1616
#include "mli_helpers_api.h"
17+
#include "mli_krn_reduce_sum2d_chw.h"
1718
#include "mli_prv_dsp.h"
1819
#include "mli_prv_tensor.h"
1920

@@ -85,7 +86,10 @@ mli_status mli_krn_avepool_hwc_fx8(const mli_tensor* in, const mli_pool_cfg* cfg
8586
const int32_t clmn_beg = CEIL_DIV(padding_left, stride_width);
8687
const int32_t clmn_end = out_width - CEIL_DIV(padding_right, stride_width);
8788

88-
int32_t divider = (kernel_height * kernel_width);
89+
const int32_t kernel_size = kernel_width * kernel_height;
90+
int16_t mul = 0;
91+
int shift = 0;
92+
get_mul_shift_value(kernel_size, kernel_size, &mul, &shift);
8993
for (int ch_idx = 0; ch_idx < channels_num; ch_idx++) {
9094
for (int H_idx = row_beg; H_idx < row_end; H_idx++) {
9195
for (int W_idx = clmn_beg; W_idx < clmn_end; W_idx++) {
@@ -97,11 +101,11 @@ mli_status mli_krn_avepool_hwc_fx8(const mli_tensor* in, const mli_pool_cfg* cfg
97101
ch_idx; // move to channel
98102

99103
// Core Sum
100-
int32_t accum_32 = reduce_sum2D_hwc(in_ptr, kernel_width, kernel_height, channels_num, in_width);
104+
accum40_t accu = reduce_sum2D_hwc(in_ptr, kernel_width, kernel_height, channels_num, in_width, mul);
101105

102106
MLI_OUT_PTR(int8_t)
103107
p_out_ftrs = (MLI_OUT_PTR(int8_t))(out_ftrs + ch_idx + (H_idx * out_width + W_idx) * channels_num);
104-
mli_prv_clip_div_and_store_result(p_out_ftrs, divider, accum_32);
108+
mli_prv_shift_clip_and_store_output(p_out_ftrs, &accu, shift);
105109
}
106110
}
107111
}
@@ -154,6 +158,11 @@ mli_status mli_krn_avepool_hwc_fx8(const mli_tensor* in, const mli_pool_cfg* cfg
154158

155159
int32_t rows = kernel_height - top_comp - bottom_comp;
156160
int32_t clmns = kernel_width - right_comp - left_comp;
161+
unsigned int max_kernel_size = kernel_width * kernel_height;
162+
int kernel_size = rows * clmns;
163+
int16_t mul = 0;
164+
int shift = 0;
165+
get_mul_shift_value(kernel_size, max_kernel_size, &mul, &shift);
157166

158167
MLI_PTR(int8_t)
159168
in_ptr = in_ftrs + // starting point
@@ -163,12 +172,12 @@ mli_status mli_krn_avepool_hwc_fx8(const mli_tensor* in, const mli_pool_cfg* cfg
163172
ch_idx;
164173

165174
// Core Sum
166-
int32_t accum_32 = reduce_sum2D_hwc(in_ptr, clmns, rows, channels_num, in_width);
175+
accum40_t accu = reduce_sum2D_hwc(in_ptr, clmns, rows, channels_num, in_width, mul);
167176

168177
// Write result
169178
MLI_OUT_PTR(int8_t)
170179
p_out_ftrs = (MLI_OUT_PTR(int8_t))(out_ftrs + ch_idx + (H_idx * out_width + W_idx) * channels_num);
171-
mli_prv_clip_div_and_store_result(p_out_ftrs, (int32_t)(rows * clmns), accum_32);
180+
mli_prv_shift_clip_and_store_output(p_out_ftrs, &accu, shift);
172181
}
173182
}
174183
}
@@ -185,17 +194,6 @@ mli_status mli_krn_avepool_hwc_fx8(const mli_tensor* in, const mli_pool_cfg* cfg
185194
return MLI_STATUS_OK;
186195
}
187196

188-
static inline int32_t reduce_sum2D_hwc(MLI_PTR(int8_t) in, uint32_t width, uint32_t height, uint32_t channels, uint32_t in_row_step) {
189-
int32_t acc = 0;
190-
for (int row = 0; row < height; row++) {
191-
for (int clmn = 0; clmn < width; clmn++) {
192-
acc += in[clmn * channels];
193-
}
194-
in += in_row_step * channels;
195-
}
196-
return acc;
197-
}
198-
199197
#pragma code()
200198

201199
#ifdef __cplusplus

0 commit comments

Comments
 (0)