Skip to content

Commit 9f6a273

Browse files
authored
Merge pull request #109 from foss-for-synopsys-dwc-arc-processors/opt_pipe_pointwise
Opt pipe pointwise
2 parents e3c1f22 + 017e38a commit 9f6a273

File tree

1 file changed

+26
-8
lines changed

1 file changed

+26
-8
lines changed

lib/src/kernels/convolution/mli_krn_conv2d_hwc.h

Lines changed: 26 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -785,10 +785,31 @@ static __attribute__ ((always_inline)) void pointwise_convolution2D_hwc_nopad(
785785
int even_in_ch = in_ch & (~0x3);
786786

787787
if ((in_ch & 0x3) == 0) {
788-
for (int H_idx = row_begin; H_idx < row_end; H_idx++) {
788+
for (int H_idx = 0; H_idx < amount_rows; H_idx++) {
789+
#if !defined(_ARCVER_ARCv2HS)
789790
int32_t init_accum_val = weights_add;
790791
acc_T accu = mli_prv_init_accu(init_accum_val);
791-
for (int W_idx = clmn_begin; W_idx < clmn_end; W_idx++) {
792+
for (int j = 0; j < (in_ch / 4); j++) {
793+
mli_prv_load_mac_vec4(&accu, in_ptr, w_ptr);
794+
in_ptr += 4;
795+
w_ptr += 4;
796+
}
797+
accu += bias_add;
798+
799+
// Cast result to output type, apply the built-in ReLU, and write the result
800+
mli_prv_clip_relu_store_output(out_ptr, accu, &quant_params, val_min_limit, val_max_limit);
801+
out_ptr += out_ch;
802+
in_ptr += in_ch * (stride_width - 1);
803+
w_ptr -= in_ch;
804+
805+
for (int W_idx = 1; W_idx < amount_columns; W_idx++) {
806+
init_accum_val = weights_add;
807+
accu = mli_prv_init_accu(init_accum_val);
808+
#else
809+
for (int W_idx = 0; W_idx < amount_columns; W_idx++) {
810+
int32_t init_accum_val = weights_add;
811+
acc_T accu = mli_prv_init_accu(init_accum_val);
812+
#endif
792813

793814
LOOP_PIPELINE_ENABLE
794815
for (int j = 0; j < (in_ch / 4); j++) {
@@ -803,18 +824,15 @@ LOOP_PIPELINE_ENABLE
803824
out_ptr += out_ch;
804825
in_ptr += in_ch * (stride_width - 1);
805826
w_ptr -= in_ch;
806-
807-
init_accum_val = weights_add;
808-
accu = mli_prv_init_accu(init_accum_val);
809-
} // for W_idx
827+
} // for W_idx
810828
out_ptr += out_width * out_ch - out_compensation_clmn_loop;
811829
in_ptr += stride_height * in_width * in_ch - in_compensation_clmn_loop;
812830
} // for H_idx
813831
} else {
814-
for (int H_idx = row_begin; H_idx < row_end; H_idx++) {
832+
for (int H_idx = 0; H_idx < amount_rows; H_idx++) {
815833
int32_t init_accum_val = weights_add;
816834
acc_T accu = mli_prv_init_accu(init_accum_val);
817-
for (int W_idx = clmn_begin; W_idx < clmn_end; W_idx++) {
835+
for (int W_idx = 0; W_idx < amount_columns; W_idx++) {
818836

819837
for (int k = 0; k < odd_rest_of_in_ch; k++) {
820838
mli_prv_load_mac(&accu, in_ptr++, w_ptr++);

0 commit comments

Comments
 (0)