@@ -785,10 +785,31 @@ static __attribute__ ((always_inline)) void pointwise_convolution2D_hwc_nopad(
785785 int even_in_ch = in_ch & (~0x3 );
786786
787787 if ((in_ch & 0x3 ) == 0 ) {
788- for (int H_idx = row_begin; H_idx < row_end; H_idx++) {
788+ for (int H_idx = 0 ; H_idx < amount_rows; H_idx++) {
789+ #if !defined(_ARCVER_ARCv2HS)
789790 int32_t init_accum_val = weights_add;
790791 acc_T accu = mli_prv_init_accu (init_accum_val);
791- for (int W_idx = clmn_begin; W_idx < clmn_end; W_idx++) {
792+ for (int j = 0 ; j < (in_ch / 4 ); j++) {
793+ mli_prv_load_mac_vec4 (&accu, in_ptr, w_ptr);
794+ in_ptr += 4 ;
795+ w_ptr += 4 ;
796+ }
797+ accu += bias_add;
798+
799+ // Cast result to output type, apply built-in ReLU, and write result
800+ mli_prv_clip_relu_store_output (out_ptr, accu, &quant_params, val_min_limit, val_max_limit);
801+ out_ptr += out_ch;
802+ in_ptr += in_ch * (stride_width - 1 );
803+ w_ptr -= in_ch;
804+
805+ for (int W_idx = 1 ; W_idx < amount_columns; W_idx++) {
806+ init_accum_val = weights_add;
807+ accu = mli_prv_init_accu (init_accum_val);
808+ #else
809+ for (int W_idx = 0 ; W_idx < amount_columns; W_idx++) {
810+ int32_t init_accum_val = weights_add;
811+ acc_T accu = mli_prv_init_accu (init_accum_val);
812+ #endif
792813
793814LOOP_PIPELINE_ENABLE
794815 for (int j = 0 ; j < (in_ch / 4 ); j++) {
@@ -803,18 +824,15 @@ LOOP_PIPELINE_ENABLE
803824 out_ptr += out_ch;
804825 in_ptr += in_ch * (stride_width - 1 );
805826 w_ptr -= in_ch;
806-
807- init_accum_val = weights_add;
808- accu = mli_prv_init_accu (init_accum_val);
809- } // for W_idx
827+ } // for W_idx
810828 out_ptr += out_width * out_ch - out_compensation_clmn_loop;
811829 in_ptr += stride_height * in_width * in_ch - in_compensation_clmn_loop;
812830 } // for H_idx
813831 } else {
814- for (int H_idx = row_begin ; H_idx < row_end ; H_idx++) {
832+ for (int H_idx = 0 ; H_idx < amount_rows ; H_idx++) {
815833 int32_t init_accum_val = weights_add;
816834 acc_T accu = mli_prv_init_accu (init_accum_val);
817- for (int W_idx = clmn_begin ; W_idx < clmn_end ; W_idx++) {
835+ for (int W_idx = 0 ; W_idx < amount_columns ; W_idx++) {
818836
819837 for (int k = 0 ; k < odd_rest_of_in_ch; k++) {
820838 mli_prv_load_mac (&accu, in_ptr++, w_ptr++);
0 commit comments