foss-for-synopsys-dwc-arc-processors
diff --git a/lib/src/kernels/pooling/mli_krn_avepool_chw.h b/lib/src/kernels/pooling/mli_krn_avepool_chw.h
Lines changed: 31 additions & 18 deletions
diff --git a/lib/src/kernels/pooling/mli_krn_avepool_hwc_fx16.cc b/lib/src/kernels/pooling/mli_krn_avepool_hwc_fx16.cc
Lines changed: 14 additions & 17 deletions
diff --git a/lib/src/kernels/pooling/mli_krn_avepool_hwc_fx8.cc b/lib/src/kernels/pooling/mli_krn_avepool_hwc_fx8.cc
Lines changed: 14 additions & 16 deletions
@@ -23,6 +23,7 @@
  * Targets:
  *
  ******************************************************************************/
+
 template <typename io_T>
 static inline void __attribute__((always_inline)) avepool_chw_nopad(
         const int row_beg,
@@ -48,6 +49,9 @@ static inline void __attribute__((always_inline)) avepool_chw_nopad(
     (void)padding_bot;
 
     const int kernel_size = kernel_width * kernel_height;
+    int16_t mul = 0;
+    int shift = 0;
+    get_mul_shift_value(kernel_size, kernel_size, &mul, &shift);
 
     MLI_OUT_PTR(io_T) __restrict p_out_ftrs = out_ftrs + row_beg * out_width + clmn_beg;
     MLI_PTR(io_T) __restrict in_ptr = (MLI_PTR(io_T))in_ftrs + in_width * (row_beg * stride_height - padding_top) +
@@ -59,9 +63,9 @@ static inline void __attribute__((always_inline)) avepool_chw_nopad(
         for (int j = 0; j < (row_end - row_beg); j++) {
             for (int k = 0; k < (clmn_end - clmn_beg); k++) {
                 accum40_t accum_40 = fx_create_a40(0x0, 0x0);
-                reduce_sum2D(&accum_40, in_ptr, kernel_width, kernel_height, in_width);
+                reduce_sum2D(&accum_40, in_ptr, kernel_width, kernel_height, in_width, mul);
                 // Write results
-                mli_prv_clip_div_and_store_result(p_out_ftrs, kernel_size, accum_40);
+                mli_prv_shift_clip_and_store_output(p_out_ftrs, &accum_40, shift);
 
                 p_out_ftrs++;
                 in_ptr += stride_width;
@@ -99,6 +103,9 @@ static inline void __attribute__((always_inline)) avepool_chw_nopad_even(
     (void)padding_bot;
 
     const int kernel_size = kernel_width * kernel_height;
+    int16_t mul = 0;
+    int shift = 0;
+    get_mul_shift_value(kernel_size, kernel_size, &mul, &shift);
 
     MLI_OUT_PTR(io_T) __restrict p_out_ftrs = out_ftrs + row_beg * out_width + clmn_beg;
     MLI_PTR(io_T) __restrict in_ptr = (MLI_PTR(io_T))in_ftrs + in_width * (row_beg * stride_height - padding_top) +
@@ -110,11 +117,10 @@ static inline void __attribute__((always_inline)) avepool_chw_nopad_even(
         for (int j = 0; j < (row_end - row_beg); j++) {
             for (int k = 0; k < (clmn_end - clmn_beg); k++) {
                 // Core Sum
-
                 accum40_t accum_40 = fx_create_a40(0x0, 0x0);
-                reduce_sum2D_even(&accum_40, in_ptr, kernel_width, kernel_height, in_width);
+                reduce_sum2D_even(&accum_40, in_ptr, kernel_width, kernel_height, in_width, mul);
                 // Write results
-                mli_prv_clip_div_and_store_result(p_out_ftrs, kernel_size, accum_40);
+                mli_prv_shift_clip_and_store_output(p_out_ftrs, &accum_40, shift);
 
                 p_out_ftrs++;
                 in_ptr += stride_width;
@@ -150,6 +156,7 @@ static inline void __attribute__((always_inline)) avepool_chw(
         const int padding_bot) {
     (void)padding_right;
     (void)padding_bot;
+    unsigned int max_kernel_size = kernel_width * kernel_height;
 
     MLI_OUT_PTR(io_T) __restrict out_ptr = out_ftrs + clmn_beg * out_width + clmn_beg;
     for (int ch_idx = 0; ch_idx < channels_num; ch_idx++) {
@@ -168,6 +175,9 @@ static inline void __attribute__((always_inline)) avepool_chw(
                 int clmns = kernel_width + right_comp + left_comp;
 
                 const int kernel_size = rows * clmns;
+                int16_t mul = 0;
+                int shift = 0;
+                get_mul_shift_value(kernel_size, max_kernel_size, &mul, &shift);
 
                 const MLI_PTR(io_T) __restrict in_ptr =
                         in_ftrs +                                                      // starting point
@@ -176,9 +186,9 @@ static inline void __attribute__((always_inline)) avepool_chw(
                         (W_idx * stride_width) - padding_left - left_comp;             // move to column
 
                 accum40_t accum_40 = fx_create_a40(0x0, 0x0);
-                reduce_sum2D(&accum_40, in_ptr, clmns, rows, in_width);
+                reduce_sum2D(&accum_40, in_ptr, clmns, rows, in_width, mul);
                 // Write results
-                mli_prv_clip_div_and_store_result(&p_out_ftrs[W_idx], kernel_size, accum_40);
+                mli_prv_shift_clip_and_store_output(&p_out_ftrs[W_idx], &accum_40, shift);
 
             }  // W_idx
             out_ptr += out_width + clmn_beg - clmn_end;
@@ -213,21 +223,21 @@ static inline void __attribute__((always_inline)) avepool_chw_k4x4_str1_nopad(
 
     MLI_ASSERT(stride_width == 1);
     MLI_ASSERT(stride_height == 1);
+    MLI_ASSERT(kernel_width == 4);
+    MLI_ASSERT(kernel_height == 4);
 
     MLI_OUT_PTR(io_T) __restrict p_out_ftrs = out_ftrs + row_beg * out_width + clmn_beg;
     MLI_PTR(io_T) __restrict in_ptr = (MLI_PTR(io_T))in_ftrs + in_width * (row_beg * stride_height - padding_top) +
            (clmn_beg * stride_width - padding_left);
     const int delta_W = (clmn_end - clmn_beg);
     const int delta_H = (row_end - row_beg);
-    const int kernel_size = (kernel_width * kernel_height);
 
     for (int ch_idx = 0; ch_idx < channels_num; ch_idx++) {
         for (int j = 0; j < (row_end - row_beg); j++) {
             for (int k = 0; k < (clmn_end - clmn_beg); k++) {
                 accum40_t accum_40 = fx_create_a40(0x0, 0x0);
-                reduce_sum2D_even(&accum_40, (const MLI_PTR(io_T))in_ptr, kernel_width, kernel_height, in_width);
-
-                mli_prv_clip_div_and_store_result(p_out_ftrs, kernel_size, accum_40);
+                reduce_sum2D_even(&accum_40, (const MLI_PTR(io_T))in_ptr, kernel_width, kernel_height, in_width, 1);
+                mli_prv_shift_clip_and_store_output(p_out_ftrs, &accum_40, 4);
 
                 p_out_ftrs++;
                 in_ptr += stride_width;
@@ -269,17 +279,18 @@ static inline void __attribute__((always_inline)) avepool_chw_nopad_k2x2(
            (clmn_beg * stride_width - padding_left);
     const int delta_W = (clmn_end - clmn_beg);
     const int delta_H = (row_end - row_beg);
-    const int kernel_size = (kernel_width * kernel_height);
+
+    MLI_ASSERT(kernel_width == 2);
+    MLI_ASSERT(kernel_height == 2);
 
     for (int ch_idx = 0; ch_idx < channels_num; ch_idx++) {
         for (int j = 0; j < (row_end - row_beg); j++) {
             for (int k = 0; k < (clmn_end - clmn_beg); k++) {
                 // Core Sum
 
                 accum40_t accum_40 = fx_create_a40(0x0, 0x0);
-                reduce_sum2D_even(&accum_40, in_ptr, kernel_width, kernel_height, in_width);
-
-                mli_prv_clip_div_and_store_result(p_out_ftrs, kernel_size, accum_40);
+                reduce_sum2D_even(&accum_40, in_ptr, kernel_width, kernel_height, in_width, 1);
+                mli_prv_shift_clip_and_store_output(p_out_ftrs, &accum_40, 2);
 
                 p_out_ftrs++;
                 in_ptr += stride_width;
@@ -317,6 +328,9 @@ static inline void __attribute__((always_inline)) avepool_chw_nopad_k4_Nx2_N_eve
     (void)padding_bot;
 
     const int kernel_size = kernel_height * kernel_width;
+    int16_t mul = 0;
+    int shift = 0;
+    get_mul_shift_value(kernel_size, kernel_size, &mul, &shift);
 
     MLI_OUT_PTR(io_T) __restrict p_out_ftrs = out_ftrs + row_beg * out_width + clmn_beg;
     MLI_PTR(io_T) __restrict in_ptr = (MLI_PTR(io_T))in_ftrs + in_width * (row_beg * stride_height - padding_top) +
@@ -329,9 +343,8 @@ static inline void __attribute__((always_inline)) avepool_chw_nopad_k4_Nx2_N_eve
             for (int k = 0; k < (clmn_end - clmn_beg); k++) {
                 // Core Sum
                 accum40_t accum_40 = fx_create_a40(0x0, 0x0);
-                reduce_sum2D_even(&accum_40, in_ptr, kernel_width, kernel_height, in_width);
-
-                mli_prv_clip_div_and_store_result(p_out_ftrs, kernel_size, accum_40);
+                reduce_sum2D_even(&accum_40, in_ptr, kernel_width, kernel_height, in_width, mul);
+                mli_prv_shift_clip_and_store_output(p_out_ftrs, &accum_40, shift);
 
                 p_out_ftrs++;
                 in_ptr += stride_width;
 
@@ -14,6 +14,7 @@
 #include "mli_config.h"
 #include "mli_debug.h"
 #include "mli_helpers_api.h"
+#include "mli_krn_reduce_sum2d_chw.h"
 #include "mli_prv_dsp.h"
 
 #ifdef __FXAPI__
@@ -44,8 +45,6 @@ extern "C" {
  *
  ******************************************************************************/
 
-static inline int32_t reduce_sum2D_hwc(MLI_PTR(int16_t) in, uint32_t width, uint32_t height, uint32_t channels, uint32_t in_row_step);
-
 mli_status mli_krn_avepool_hwc_fx16(const mli_tensor* in, const mli_pool_cfg* cfg, mli_tensor* out) {
     mli_status ret = MLI_CHECK_STATUS(mli_chk_avepool_hwc_fx16(in, cfg, out), __func__);
     if (ret != MLI_STATUS_OK) return ret;
@@ -85,6 +84,10 @@ mli_status mli_krn_avepool_hwc_fx16(const mli_tensor* in, const mli_pool_cfg* cf
         const int32_t clmn_end = out_width - CEIL_DIV(padding_right, stride_width);
 
         const int32_t kernel_size = kernel_width * kernel_height;
+        int16_t mul = 0;
+        int shift = 0;
+        get_mul_shift_value(kernel_size, kernel_size, &mul, &shift);
+
         for (int ch_idx = 0; ch_idx < channels_num; ch_idx++) {
             for (int H_idx = row_beg; H_idx < row_end; H_idx++) {
                 for (int W_idx = clmn_beg; W_idx < clmn_end; W_idx++) {
@@ -96,12 +99,12 @@ mli_status mli_krn_avepool_hwc_fx16(const mli_tensor* in, const mli_pool_cfg* cf
                              ch_idx;                                                            // move to channel
 
                     // Core Sum
-                    int32_t accum_32 = reduce_sum2D_hwc(in_ptr, kernel_width, kernel_height, channels_num, in_width);
+                    accum40_t accu = reduce_sum2D_hwc(in_ptr, kernel_width, kernel_height, channels_num, in_width, mul);
 
                     // Write results
                     MLI_OUT_PTR(int16_t)
                     p_out_ftrs = (MLI_OUT_PTR(int16_t))(out_ftrs + ch_idx + (H_idx * out_width + W_idx) * channels_num);
-                    mli_prv_clip_div_and_store_result(p_out_ftrs, kernel_size, accum_32);
+                    mli_prv_shift_clip_and_store_output(p_out_ftrs, &accu, shift);
                 }
             }
         }
@@ -154,6 +157,11 @@ mli_status mli_krn_avepool_hwc_fx16(const mli_tensor* in, const mli_pool_cfg* cf
 
                         int32_t rows = kernel_height - top_comp - bottom_comp;
                         int32_t clmns = kernel_width - right_comp - left_comp;
+                        unsigned int max_kernel_size = kernel_width * kernel_height;
+                        int kernel_size = rows * clmns;
+                        int16_t mul = 0;
+                        int shift = 0;
+                        get_mul_shift_value(kernel_size, max_kernel_size, &mul, &shift);
 
                         MLI_PTR(int16_t)
                         in_ptr = in_ftrs +  // starting point
@@ -163,12 +171,12 @@ mli_status mli_krn_avepool_hwc_fx16(const mli_tensor* in, const mli_pool_cfg* cf
                                  ch_idx;
 
                         // Core Sum
-                        int32_t accum_32 = reduce_sum2D_hwc(in_ptr, clmns, rows, channels_num, in_width);
+                        accum40_t accu = reduce_sum2D_hwc(in_ptr, clmns, rows, channels_num, in_width, mul);
 
                         // Write result
                         MLI_OUT_PTR(int16_t)
                         p_out_ftrs = (MLI_OUT_PTR(int16_t))(out_ftrs + ch_idx + (H_idx * out_width + W_idx) * channels_num);
-                        mli_prv_clip_div_and_store_result(p_out_ftrs, (int32_t)(rows * clmns), accum_32);
+                        mli_prv_shift_clip_and_store_output(p_out_ftrs, &accu, shift);
                     }
                 }
             }
@@ -185,17 +193,6 @@ mli_status mli_krn_avepool_hwc_fx16(const mli_tensor* in, const mli_pool_cfg* cf
     return MLI_STATUS_OK;
 }
 
-static inline int32_t reduce_sum2D_hwc(MLI_PTR(int16_t) in, uint32_t width, uint32_t height, uint32_t channels, uint32_t in_row_step) {
-    int32_t acc = 0;
-    for (int row = 0; row < height; row++) {
-        for (int clmn = 0; clmn < width; clmn++) {
-            acc += in[clmn * channels];
-        }
-        in += in_row_step * channels;
-    }
-    return acc;
-}
-
 #pragma code()
 
 #ifdef __cplusplus
 
@@ -14,6 +14,7 @@
 #include "mli_config.h"
 #include "mli_debug.h"
 #include "mli_helpers_api.h"
+#include "mli_krn_reduce_sum2d_chw.h"
 #include "mli_prv_dsp.h"
 #include "mli_prv_tensor.h"
 
@@ -85,7 +86,10 @@ mli_status mli_krn_avepool_hwc_fx8(const mli_tensor* in, const mli_pool_cfg* cfg
         const int32_t clmn_beg = CEIL_DIV(padding_left, stride_width);
         const int32_t clmn_end = out_width - CEIL_DIV(padding_right, stride_width);
 
-        int32_t divider = (kernel_height * kernel_width);
+        const int32_t kernel_size = kernel_width * kernel_height;
+        int16_t mul = 0;
+        int shift = 0;
+        get_mul_shift_value(kernel_size, kernel_size, &mul, &shift);
         for (int ch_idx = 0; ch_idx < channels_num; ch_idx++) {
             for (int H_idx = row_beg; H_idx < row_end; H_idx++) {
                 for (int W_idx = clmn_beg; W_idx < clmn_end; W_idx++) {
@@ -97,11 +101,11 @@ mli_status mli_krn_avepool_hwc_fx8(const mli_tensor* in, const mli_pool_cfg* cfg
                              ch_idx;                                                            // move to channel
 
                     // Core Sum
-                    int32_t accum_32 = reduce_sum2D_hwc(in_ptr, kernel_width, kernel_height, channels_num, in_width);
+                    accum40_t accu = reduce_sum2D_hwc(in_ptr, kernel_width, kernel_height, channels_num, in_width, mul);
 
                     MLI_OUT_PTR(int8_t)
                     p_out_ftrs = (MLI_OUT_PTR(int8_t))(out_ftrs + ch_idx + (H_idx * out_width + W_idx) * channels_num);
-                    mli_prv_clip_div_and_store_result(p_out_ftrs, divider, accum_32);
+                    mli_prv_shift_clip_and_store_output(p_out_ftrs, &accu, shift);
                 }
             }
         }
@@ -154,6 +158,11 @@ mli_status mli_krn_avepool_hwc_fx8(const mli_tensor* in, const mli_pool_cfg* cfg
 
                         int32_t rows = kernel_height - top_comp - bottom_comp;
                         int32_t clmns = kernel_width - right_comp - left_comp;
+                        unsigned int max_kernel_size = kernel_width * kernel_height;
+                        int kernel_size = rows * clmns;
+                        int16_t mul = 0;
+                        int shift = 0;
+                        get_mul_shift_value(kernel_size, max_kernel_size, &mul, &shift);
 
                         MLI_PTR(int8_t)
                         in_ptr = in_ftrs +  // starting point
@@ -163,12 +172,12 @@ mli_status mli_krn_avepool_hwc_fx8(const mli_tensor* in, const mli_pool_cfg* cfg
                                  ch_idx;
 
                         // Core Sum
-                        int32_t accum_32 = reduce_sum2D_hwc(in_ptr, clmns, rows, channels_num, in_width);
+                        accum40_t accu = reduce_sum2D_hwc(in_ptr, clmns, rows, channels_num, in_width, mul);
 
                         // Write result
                         MLI_OUT_PTR(int8_t)
                         p_out_ftrs = (MLI_OUT_PTR(int8_t))(out_ftrs + ch_idx + (H_idx * out_width + W_idx) * channels_num);
-                        mli_prv_clip_div_and_store_result(p_out_ftrs, (int32_t)(rows * clmns), accum_32);
+                        mli_prv_shift_clip_and_store_output(p_out_ftrs, &accu, shift);
                     }
                 }
             }
@@ -185,17 +194,6 @@ mli_status mli_krn_avepool_hwc_fx8(const mli_tensor* in, const mli_pool_cfg* cfg
     return MLI_STATUS_OK;
 }
 
-static inline int32_t reduce_sum2D_hwc(MLI_PTR(int8_t) in, uint32_t width, uint32_t height, uint32_t channels, uint32_t in_row_step) {
-    int32_t acc = 0;
-    for (int row = 0; row < height; row++) {
-        for (int clmn = 0; clmn < width; clmn++) {
-            acc += in[clmn * channels];
-        }
-        in += in_row_step * channels;
-    }
-    return acc;
-}
-
 #pragma code()
 
 #ifdef __cplusplus