@@ -924,7 +924,7 @@ static inline void __attribute__ ((always_inline)) conv2d_row_str1 (
924924/* optimized function that can do both the borders and the main part
925925 * for multiple kernel sizes and padding sizes. */
926926template < typename io_T, typename w_T >
927- static inline void __attribute__ ((always_inline)) conv2d_chw_str1 (
927+ static inline void __attribute__ ((always_inline)) conv2d_chw_str1_impl (
928928 const MLI_PTR (io_T) __restrict in_ftrs,
929929 const MLI_PTR (w_T) __restrict weights,
930930 const MLI_PTR (w_T) __restrict biases,
@@ -1324,4 +1324,72 @@ static inline void __attribute__ ((always_inline)) conv2d_chw (
13241324 }
13251325}
13261326
1327+ template < typename io_T, typename w_T >
1328+ static inline void __attribute__ ((always_inline)) conv2d_chw_str1 (
1329+ const MLI_PTR (io_T) __restrict in_ftrs,
1330+ const MLI_PTR (w_T) __restrict weights,
1331+ const MLI_PTR (w_T) __restrict biases,
1332+ MLI_CONV_OUT_PTR (io_T) __restrict out_ftrs,
1333+ const rect_t * const perception_area,
1334+ const int bias_shift,
1335+ const int out_shift,
1336+ const int16_t val_min_limit,
1337+ const int16_t val_max_limit,
1338+ const int in_ch, const int in_width, const int in_height,
1339+ const int out_ch, const int out_width, const int out_height,
1340+ const int kernel_height, const int kernel_width,
1341+ const int stride_height, const int stride_width,
1342+ const int padding_top, const int padding_bot,
1343+ const int padding_left, const int padding_right,
1344+ const int fixed_padding, const int depthwise) {
1345+
1346+ conv2d_chw_str1_impl (
1347+ in_ftrs, weights, biases, out_ftrs, perception_area,
1348+ bias_shift, out_shift,
1349+ val_min_limit, val_max_limit,
1350+ in_ch, in_width, in_height,
1351+ out_ch, out_width, out_height,
1352+ kernel_height, kernel_width,
1353+ stride_height, stride_width,
1354+ padding_top, padding_bot, padding_left, padding_right,
1355+ fixed_padding, depthwise);
1356+ }
1357+
1358+ #if !defined __Xxy
1359+ /* For platforms without AGU, conv2d_chw gives better performance for 8bit,
1360+ * because the _dmachbl and _dmachbm are used, and they have integrated
1361+ * sign extention from 8 to 16bit.
1362+ * For platforms with AGU, the sign extention is done by the AGU
1363+ */
1364+ static inline void __attribute__ ((always_inline)) conv2d_chw_str1 (
1365+ const MLI_PTR (int8_t ) __restrict in_ftrs,
1366+ const MLI_PTR (int8_t ) __restrict weights,
1367+ const MLI_PTR (int8_t ) __restrict biases,
1368+ MLI_CONV_OUT_PTR (int8_t ) __restrict out_ftrs,
1369+ const rect_t * const perception_area,
1370+ const int bias_shift,
1371+ const int out_shift,
1372+ const int16_t val_min_limit,
1373+ const int16_t val_max_limit,
1374+ const int in_ch, const int in_width, const int in_height,
1375+ const int out_ch, const int out_width, const int out_height,
1376+ const int kernel_height, const int kernel_width,
1377+ const int stride_height, const int stride_width,
1378+ const int padding_top, const int padding_bot,
1379+ const int padding_left, const int padding_right,
1380+ const int fixed_padding, const int depthwise) {
1381+
1382+ conv2d_chw (
1383+ in_ftrs, weights, biases, out_ftrs, perception_area,
1384+ bias_shift, out_shift,
1385+ val_min_limit, val_max_limit,
1386+ in_ch, in_width, in_height,
1387+ out_ch, out_width, out_height,
1388+ kernel_height, kernel_width,
1389+ stride_height, stride_width,
1390+ padding_top, padding_bot, padding_left, padding_right,
1391+ fixed_padding, depthwise);
1392+ }
1393+ #endif
1394+
13271395#endif // _MLI_KRN_CONV2D_CHW_H_
0 commit comments