improvements for 8bit AGU

JaccovG · JaccovG · commit 60e9e29d1107 · 2019-04-16T17:33:38.000+02:00
diff --git a/lib/src/kernels/convolution/mli_krn_conv2d_chw.h b/lib/src/kernels/convolution/mli_krn_conv2d_chw.h
@@ -924,7 +924,7 @@ static inline void __attribute__ ((always_inline)) conv2d_row_str1 (
 /* optimized function that can do both the borders and the main part
  * for multiple kernel sizes and padding sizes. */
 template < typename io_T, typename w_T >
-static inline void __attribute__ ((always_inline)) conv2d_chw_str1 (
+static inline void __attribute__ ((always_inline)) conv2d_chw_str1_impl (
         const MLI_PTR (io_T) __restrict in_ftrs,
         const MLI_PTR (w_T) __restrict weights,
         const MLI_PTR (w_T) __restrict biases,
@@ -1324,4 +1324,72 @@ static inline void __attribute__ ((always_inline)) conv2d_chw (
     }
 }
 
+template < typename io_T, typename w_T >
+static inline void __attribute__ ((always_inline)) conv2d_chw_str1 (
+        const MLI_PTR (io_T) __restrict in_ftrs,
+        const MLI_PTR (w_T) __restrict weights,
+        const MLI_PTR (w_T) __restrict biases,
+        MLI_CONV_OUT_PTR (io_T) __restrict out_ftrs,
+        const rect_t * const perception_area,
+        const int bias_shift,
+        const int out_shift,
+        const int16_t val_min_limit,
+        const int16_t val_max_limit,
+        const int in_ch, const int in_width, const int in_height,
+        const int out_ch, const int out_width, const int out_height,
+        const int kernel_height, const int kernel_width,
+        const int stride_height, const int stride_width,
+        const int padding_top, const int padding_bot,
+        const int padding_left, const int padding_right,
+        const int fixed_padding, const int depthwise) {
+    /* Generic entry point: pass every argument through unchanged to conv2d_chw_str1_impl. */
+    conv2d_chw_str1_impl(
+        in_ftrs, weights, biases, out_ftrs, perception_area,
+        bias_shift, out_shift,
+        val_min_limit, val_max_limit,
+        in_ch, in_width, in_height,
+        out_ch, out_width, out_height,
+        kernel_height, kernel_width,
+        stride_height, stride_width,
+        padding_top, padding_bot, padding_left, padding_right,
+        fixed_padding, depthwise);
+}
+
+#if !defined __Xxy
+/* 8-bit overload, built only when __Xxy is not defined (platforms without AGU).
+ * There conv2d_chw gives better performance for 8bit, because the _dmachbl and
+ * _dmachbm are used, and they have integrated sign extension from 8 to 16bit.
+ * For platforms with AGU, the sign extension is done by the AGU, so this
+ * overload is compiled out. */
+static inline void __attribute__ ((always_inline)) conv2d_chw_str1 (
+        const MLI_PTR (int8_t) __restrict in_ftrs,
+        const MLI_PTR (int8_t) __restrict weights,
+        const MLI_PTR (int8_t) __restrict biases,
+        MLI_CONV_OUT_PTR (int8_t) __restrict out_ftrs,
+        const rect_t * const perception_area,
+        const int bias_shift,
+        const int out_shift,
+        const int16_t val_min_limit,
+        const int16_t val_max_limit,
+        const int in_ch, const int in_width, const int in_height,
+        const int out_ch, const int out_width, const int out_height,
+        const int kernel_height, const int kernel_width,
+        const int stride_height, const int stride_width,
+        const int padding_top, const int padding_bot,
+        const int padding_left, const int padding_right,
+        const int fixed_padding, const int depthwise) {
+    /* Delegate to conv2d_chw with the argument list passed through unchanged. */
+    conv2d_chw(
+        in_ftrs, weights, biases, out_ftrs, perception_area,
+        bias_shift, out_shift,
+        val_min_limit, val_max_limit,
+        in_ch, in_width, in_height,
+        out_ch, out_width, out_height,
+        kernel_height, kernel_width,
+        stride_height, stride_width,
+        padding_top, padding_bot, padding_left, padding_right,
+        fixed_padding, depthwise);
+}
+#endif
+
 #endif // _MLI_KRN_CONV2D_CHW_H_
diff --git a/lib/src/kernels/convolution/mli_krn_conv2d_chw_fx8.cc b/lib/src/kernels/convolution/mli_krn_conv2d_chw_fx8.cc
@@ -499,7 +499,7 @@ mli_status mli_krn_conv2d_chw_fx8_k2x2_str1_krnpad(
 
     mli_prv_fx_init_dsp_ctrl();
 
-    conv2d_chw(
+    conv2d_chw_str1(
         in_ftrs, wt, bs, out_ftrs, &cent_area,
         bias_shift, out_shift,
         val_limit.min, val_limit.max,
@@ -597,7 +597,7 @@ mli_status mli_krn_conv2d_chw_fx8_k2x2_ch1_str1_krnpad(
 
     mli_prv_fx_init_dsp_ctrl();
 
-    conv2d_chw(
+    conv2d_chw_str1(
         in_ftrs, wt, bs, out_ftrs, &cent_area,
         bias_shift, out_shift,
         val_limit.min, val_limit.max,
@@ -695,7 +695,7 @@ mli_status mli_krn_conv2d_chw_fx8_k3x3_str1_krnpad(
 
     mli_prv_fx_init_dsp_ctrl();
 
-    conv2d_chw(
+    conv2d_chw_str1(
         in_ftrs, wt, bs, out_ftrs, &cent_area,
         bias_shift, out_shift,
         val_limit.min, val_limit.max,
@@ -793,7 +793,7 @@ mli_status mli_krn_conv2d_chw_fx8_k3x3_ch1_str1_krnpad(
 
     mli_prv_fx_init_dsp_ctrl();
 
-    conv2d_chw(
+    conv2d_chw_str1(
         in_ftrs, wt, bs, out_ftrs, &cent_area,
         bias_shift, out_shift,
         val_limit.min, val_limit.max,
@@ -891,7 +891,7 @@ mli_status mli_krn_conv2d_chw_fx8_k4x4_str1_krnpad(
 
     mli_prv_fx_init_dsp_ctrl();
 
-    conv2d_chw(
+    conv2d_chw_str1(
         in_ftrs, wt, bs, out_ftrs, &cent_area,
         bias_shift, out_shift,
         val_limit.min, val_limit.max,
@@ -989,7 +989,7 @@ mli_status mli_krn_conv2d_chw_fx8_k4x4_ch1_str1_krnpad(
 
     mli_prv_fx_init_dsp_ctrl();
 
-    conv2d_chw(
+    conv2d_chw_str1(
         in_ftrs, wt, bs, out_ftrs, &cent_area,
         bias_shift, out_shift,
         val_limit.min, val_limit.max,
@@ -1087,7 +1087,7 @@ mli_status mli_krn_conv2d_chw_fx8_k5x5_str1_krnpad(
 
     mli_prv_fx_init_dsp_ctrl();
 
-    conv2d_chw(
+    conv2d_chw_str1(
         in_ftrs, wt, bs, out_ftrs, &cent_area,
         bias_shift, out_shift,
         val_limit.min, val_limit.max,
@@ -1185,7 +1185,7 @@ mli_status mli_krn_conv2d_chw_fx8_k5x5_ch1_str1_krnpad(
 
     mli_prv_fx_init_dsp_ctrl();
 
-    conv2d_chw(
+    conv2d_chw_str1(
         in_ftrs, wt, bs, out_ftrs, &cent_area,
         bias_shift, out_shift,
         val_limit.min, val_limit.max,
@@ -1283,7 +1283,7 @@ mli_status mli_krn_conv2d_chw_fx8_k6x6_str1_krnpad(
 
     mli_prv_fx_init_dsp_ctrl();
 
-    conv2d_chw(
+    conv2d_chw_str1(
         in_ftrs, wt, bs, out_ftrs, &cent_area,
         bias_shift, out_shift,
         val_limit.min, val_limit.max,
@@ -1381,7 +1381,7 @@ mli_status mli_krn_conv2d_chw_fx8_k6x6_ch1_str1_krnpad(
 
     mli_prv_fx_init_dsp_ctrl();
 
-    conv2d_chw(
+    conv2d_chw_str1(
         in_ftrs, wt, bs, out_ftrs, &cent_area,
         bias_shift, out_shift,
         val_limit.min, val_limit.max,
@@ -1479,7 +1479,7 @@ mli_status mli_krn_conv2d_chw_fx8_k7x7_str1_krnpad(
 
     mli_prv_fx_init_dsp_ctrl();
 
-    conv2d_chw(
+    conv2d_chw_str1(
         in_ftrs, wt, bs, out_ftrs, &cent_area,
         bias_shift, out_shift,
         val_limit.min, val_limit.max,
@@ -1577,7 +1577,7 @@ mli_status mli_krn_conv2d_chw_fx8_k7x7_ch1_str1_krnpad(
 
     mli_prv_fx_init_dsp_ctrl();
 
-    conv2d_chw(
+    conv2d_chw_str1(
         in_ftrs, wt, bs, out_ftrs, &cent_area,
         bias_shift, out_shift,
         val_limit.min, val_limit.max,
diff --git a/lib/src/private/mli_prv_load_store.h b/lib/src/private/mli_prv_load_store.h
@@ -21,7 +21,7 @@
 
 
 static inline v2q15_t __attribute__ ((always_inline)) mli_prv_load_2_samples (const MLI_PTR (int8_t) __restrict in) {
-#ifndef _ARC
+#if defined __Xxy
     return __builtin_convertvector (*(MLI_PTR (v2i8_t)) in, v2q15_t);
 #else
     int16_t two8bitvalues = *(MLI_PTR (int16_t)) in;