Skip to content

Commit 9702171

Browse files
kiyaevJaccovG
authored and committed
Optimize a bit depthwise_conv2d
1 parent e50c2fc commit 9702171

File tree

1 file changed

+7
-11
lines changed

1 file changed

+7
-11
lines changed

lib/src/kernels/convolution/mli_krn_conv2d_hwc.h

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -106,31 +106,28 @@ static __attribute__ ((always_inline)) void depthwise_convolution2D_hwcn_nopad(
106106
acc_T bias_add_ch1 = bias_additive(*biases++, 0x0, &v2quant_params[0]);
107107
acc_T bias_add_ch2 = bias_additive(*biases++, 0x0, &v2quant_params[1]);
108108

109-
v2accum40_t v2acc_weights_add = {bias_add_ch1, bias_add_ch2};
109+
__v2i32_t v2acc_weights_add = {bias_add_ch1, bias_add_ch2};
110110
v2acc_weights_add = weights_additive_v(w_ptr, &v2acc_weights_add, &quant_params, kernel_width, kernel_height,
111111
krn_col_step, krn_row_step);
112-
__v2i32_t v2acc_weights_add_int = {fx_q31_cast_nf_a40(fx_get_v2a40(v2acc_weights_add, 0)),
113-
fx_q31_cast_nf_a40(fx_get_v2a40(v2acc_weights_add, 1))};
114112
__builtin_assume(amount_rows > 0);
115113
for (int H_idx = 0; H_idx < amount_rows; H_idx++) {
116114
__builtin_assume(amount_columns > 0);
117115

118-
__v2i32_t v2accu_dotprod = v2acc_weights_add_int;
116+
__v2i32_t v2accu_dotprod = v2acc_weights_add;
119117
dotprod2D_hwc_v(in_ptr, w_ptr, &v2accu_dotprod, kernel_width, kernel_height,
120118
in_col_step, in_row_step, krn_col_step, krn_row_step);
119+
in_ptr += in_increment_clmn_loop;
121120
for (int W_idx = 0; W_idx < amount_columns; W_idx++) {
122-
123121
// Cast result to output type
124122
mli_prv_clip_relu_store_output_v(out_ptr, &v2accu_dotprod, v2quant_params, val_min_limit, val_max_limit);
125-
126-
in_ptr += in_increment_clmn_loop;
127123
out_ptr += out_increment_clmn_loop;
128124

129-
v2accu_dotprod = v2acc_weights_add_int;
125+
v2accu_dotprod = v2acc_weights_add;
130126
dotprod2D_hwc_v(in_ptr, w_ptr, &v2accu_dotprod, kernel_width, kernel_height,
131127
in_col_step, in_row_step, krn_col_step, krn_row_step);
128+
in_ptr += in_increment_clmn_loop;
132129
} // for W_idx
133-
in_ptr += in_increment_row_loop;
130+
in_ptr += in_increment_row_loop - stride_width * filters * in_ch;
134131
out_ptr += out_increment_row_loop;
135132
} // for H_idx
136133
in_ptr -= in_compensation_row_loop;
@@ -156,10 +153,9 @@ static __attribute__ ((always_inline)) void depthwise_convolution2D_hwcn_nopad(
156153
// Convolution core. Here the calculations are performed in an unfolded-expression way:
157154
// out_val = sum_i((x-x_zp)*w) + b = -sum_i(w*x_zp) + sum_i(x*w) + b
158155
//============================================
159-
__v2i32_t accu = {0, 0};
156+
__v2i32_t accu = v2global_other_additives;
160157
accu = dotprod2D_inp_width_v(in_ptr, w_ptr, &accu, kernel_width, kernel_height,
161158
in_col_step, in_row_step, krn_col_step, krn_row_step, in_increment_clmn_loop);
162-
accu += v2global_other_additives;
163159

164160
// Cast result to output type
165161
mli_prv_clip_relu_store_output_inp_width_v(out_ptr, &accu, &quant_params, val_min_limit, val_max_limit, out_increment_clmn_loop);

0 commit comments

Comments
 (0)