Skip to content

Commit 60e9e29

Browse files
committed
improvements for 8bit AGU
1 parent 5c1de19 commit 60e9e29

File tree

3 files changed

+82
-14
lines changed

3 files changed

+82
-14
lines changed

lib/src/kernels/convolution/mli_krn_conv2d_chw.h

Lines changed: 69 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -924,7 +924,7 @@ static inline void __attribute__ ((always_inline)) conv2d_row_str1 (
924924
/* optimized function that can do both the borders and the main part
925925
* for multiple kernel sizes and padding sizes. */
926926
template < typename io_T, typename w_T >
927-
static inline void __attribute__ ((always_inline)) conv2d_chw_str1 (
927+
static inline void __attribute__ ((always_inline)) conv2d_chw_str1_impl (
928928
const MLI_PTR (io_T) __restrict in_ftrs,
929929
const MLI_PTR (w_T) __restrict weights,
930930
const MLI_PTR (w_T) __restrict biases,
@@ -1324,4 +1324,72 @@ static inline void __attribute__ ((always_inline)) conv2d_chw (
13241324
}
13251325
}
13261326

1327+
/* Generic (templated on io_T / w_T) conv2d_chw_str1 entry point.
 * It is a pure forwarding wrapper: every argument is handed, unchanged and in
 * the same order, to conv2d_chw_str1_impl, and nothing is returned.
 * NOTE(review): presumably kept as a separate layer so that type-specific
 * overloads can share the conv2d_chw_str1 name -- confirm against callers. */
template < typename io_T, typename w_T >
1328+
static inline void __attribute__ ((always_inline)) conv2d_chw_str1 (
1329+
const MLI_PTR (io_T) __restrict in_ftrs,
1330+
const MLI_PTR (w_T) __restrict weights,
1331+
const MLI_PTR (w_T) __restrict biases,
1332+
MLI_CONV_OUT_PTR (io_T) __restrict out_ftrs,
1333+
const rect_t * const perception_area,
1334+
const int bias_shift,
1335+
const int out_shift,
1336+
const int16_t val_min_limit,
1337+
const int16_t val_max_limit,
1338+
const int in_ch, const int in_width, const int in_height,
1339+
const int out_ch, const int out_width, const int out_height,
1340+
const int kernel_height, const int kernel_width,
1341+
const int stride_height, const int stride_width,
1342+
const int padding_top, const int padding_bot,
1343+
const int padding_left, const int padding_right,
1344+
const int fixed_padding, const int depthwise) {
1345+
1346+
/* Delegate to the implementation function; no argument is modified here. */
conv2d_chw_str1_impl(
1347+
in_ftrs, weights, biases, out_ftrs, perception_area,
1348+
bias_shift, out_shift,
1349+
val_min_limit, val_max_limit,
1350+
in_ch, in_width, in_height,
1351+
out_ch, out_width, out_height,
1352+
kernel_height, kernel_width,
1353+
stride_height, stride_width,
1354+
padding_top, padding_bot, padding_left, padding_right,
1355+
fixed_padding, depthwise);
1356+
}
1357+
1358+
#if !defined __Xxy
1359+
/* For platforms without AGU, conv2d_chw gives better performance for 8bit,
1360+
* because the _dmachbl and _dmachbm are used, and they have integrated
1361+
* sign extension from 8 to 16bit.
1362+
* For platforms with AGU, the sign extension is done by the AGU.
1363+
*/
1364+
/* Non-template int8_t overload of conv2d_chw_str1.
 * All pointer parameters are int8_t based; every argument is forwarded,
 * unchanged and in the same order, to conv2d_chw rather than to
 * conv2d_chw_str1_impl. Nothing is returned. */
static inline void __attribute__ ((always_inline)) conv2d_chw_str1 (
1365+
const MLI_PTR (int8_t) __restrict in_ftrs,
1366+
const MLI_PTR (int8_t) __restrict weights,
1367+
const MLI_PTR (int8_t) __restrict biases,
1368+
MLI_CONV_OUT_PTR (int8_t) __restrict out_ftrs,
1369+
const rect_t * const perception_area,
1370+
const int bias_shift,
1371+
const int out_shift,
1372+
const int16_t val_min_limit,
1373+
const int16_t val_max_limit,
1374+
const int in_ch, const int in_width, const int in_height,
1375+
const int out_ch, const int out_width, const int out_height,
1376+
const int kernel_height, const int kernel_width,
1377+
const int stride_height, const int stride_width,
1378+
const int padding_top, const int padding_bot,
1379+
const int padding_left, const int padding_right,
1380+
const int fixed_padding, const int depthwise) {
1381+
1382+
/* Route the 8-bit case to the general conv2d_chw kernel. */
conv2d_chw(
1383+
in_ftrs, weights, biases, out_ftrs, perception_area,
1384+
bias_shift, out_shift,
1385+
val_min_limit, val_max_limit,
1386+
in_ch, in_width, in_height,
1387+
out_ch, out_width, out_height,
1388+
kernel_height, kernel_width,
1389+
stride_height, stride_width,
1390+
padding_top, padding_bot, padding_left, padding_right,
1391+
fixed_padding, depthwise);
1392+
}
1393+
#endif
1394+
13271395
#endif // _MLI_KRN_CONV2D_CHW_H_

lib/src/kernels/convolution/mli_krn_conv2d_chw_fx8.cc

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -499,7 +499,7 @@ mli_status mli_krn_conv2d_chw_fx8_k2x2_str1_krnpad(
499499

500500
mli_prv_fx_init_dsp_ctrl();
501501

502-
conv2d_chw(
502+
conv2d_chw_str1(
503503
in_ftrs, wt, bs, out_ftrs, &cent_area,
504504
bias_shift, out_shift,
505505
val_limit.min, val_limit.max,
@@ -597,7 +597,7 @@ mli_status mli_krn_conv2d_chw_fx8_k2x2_ch1_str1_krnpad(
597597

598598
mli_prv_fx_init_dsp_ctrl();
599599

600-
conv2d_chw(
600+
conv2d_chw_str1(
601601
in_ftrs, wt, bs, out_ftrs, &cent_area,
602602
bias_shift, out_shift,
603603
val_limit.min, val_limit.max,
@@ -695,7 +695,7 @@ mli_status mli_krn_conv2d_chw_fx8_k3x3_str1_krnpad(
695695

696696
mli_prv_fx_init_dsp_ctrl();
697697

698-
conv2d_chw(
698+
conv2d_chw_str1(
699699
in_ftrs, wt, bs, out_ftrs, &cent_area,
700700
bias_shift, out_shift,
701701
val_limit.min, val_limit.max,
@@ -793,7 +793,7 @@ mli_status mli_krn_conv2d_chw_fx8_k3x3_ch1_str1_krnpad(
793793

794794
mli_prv_fx_init_dsp_ctrl();
795795

796-
conv2d_chw(
796+
conv2d_chw_str1(
797797
in_ftrs, wt, bs, out_ftrs, &cent_area,
798798
bias_shift, out_shift,
799799
val_limit.min, val_limit.max,
@@ -891,7 +891,7 @@ mli_status mli_krn_conv2d_chw_fx8_k4x4_str1_krnpad(
891891

892892
mli_prv_fx_init_dsp_ctrl();
893893

894-
conv2d_chw(
894+
conv2d_chw_str1(
895895
in_ftrs, wt, bs, out_ftrs, &cent_area,
896896
bias_shift, out_shift,
897897
val_limit.min, val_limit.max,
@@ -989,7 +989,7 @@ mli_status mli_krn_conv2d_chw_fx8_k4x4_ch1_str1_krnpad(
989989

990990
mli_prv_fx_init_dsp_ctrl();
991991

992-
conv2d_chw(
992+
conv2d_chw_str1(
993993
in_ftrs, wt, bs, out_ftrs, &cent_area,
994994
bias_shift, out_shift,
995995
val_limit.min, val_limit.max,
@@ -1087,7 +1087,7 @@ mli_status mli_krn_conv2d_chw_fx8_k5x5_str1_krnpad(
10871087

10881088
mli_prv_fx_init_dsp_ctrl();
10891089

1090-
conv2d_chw(
1090+
conv2d_chw_str1(
10911091
in_ftrs, wt, bs, out_ftrs, &cent_area,
10921092
bias_shift, out_shift,
10931093
val_limit.min, val_limit.max,
@@ -1185,7 +1185,7 @@ mli_status mli_krn_conv2d_chw_fx8_k5x5_ch1_str1_krnpad(
11851185

11861186
mli_prv_fx_init_dsp_ctrl();
11871187

1188-
conv2d_chw(
1188+
conv2d_chw_str1(
11891189
in_ftrs, wt, bs, out_ftrs, &cent_area,
11901190
bias_shift, out_shift,
11911191
val_limit.min, val_limit.max,
@@ -1283,7 +1283,7 @@ mli_status mli_krn_conv2d_chw_fx8_k6x6_str1_krnpad(
12831283

12841284
mli_prv_fx_init_dsp_ctrl();
12851285

1286-
conv2d_chw(
1286+
conv2d_chw_str1(
12871287
in_ftrs, wt, bs, out_ftrs, &cent_area,
12881288
bias_shift, out_shift,
12891289
val_limit.min, val_limit.max,
@@ -1381,7 +1381,7 @@ mli_status mli_krn_conv2d_chw_fx8_k6x6_ch1_str1_krnpad(
13811381

13821382
mli_prv_fx_init_dsp_ctrl();
13831383

1384-
conv2d_chw(
1384+
conv2d_chw_str1(
13851385
in_ftrs, wt, bs, out_ftrs, &cent_area,
13861386
bias_shift, out_shift,
13871387
val_limit.min, val_limit.max,
@@ -1479,7 +1479,7 @@ mli_status mli_krn_conv2d_chw_fx8_k7x7_str1_krnpad(
14791479

14801480
mli_prv_fx_init_dsp_ctrl();
14811481

1482-
conv2d_chw(
1482+
conv2d_chw_str1(
14831483
in_ftrs, wt, bs, out_ftrs, &cent_area,
14841484
bias_shift, out_shift,
14851485
val_limit.min, val_limit.max,
@@ -1577,7 +1577,7 @@ mli_status mli_krn_conv2d_chw_fx8_k7x7_ch1_str1_krnpad(
15771577

15781578
mli_prv_fx_init_dsp_ctrl();
15791579

1580-
conv2d_chw(
1580+
conv2d_chw_str1(
15811581
in_ftrs, wt, bs, out_ftrs, &cent_area,
15821582
bias_shift, out_shift,
15831583
val_limit.min, val_limit.max,

lib/src/private/mli_prv_load_store.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121

2222

2323
static inline v2q15_t __attribute__ ((always_inline)) mli_prv_load_2_samples (const MLI_PTR (int8_t) __restrict in) {
24-
#ifndef _ARC
24+
#if defined __Xxy
2525
return __builtin_convertvector (*(MLI_PTR (v2i8_t)) in, v2q15_t);
2626
#else
2727
int16_t two8bitvalues = *(MLI_PTR (int16_t)) in;

0 commit comments

Comments
 (0)