Skip to content

Commit becb08c

Browse files
committed
optimize depthwise using sw pipeline
1 parent 30ba117 commit becb08c

File tree

10 files changed

+307
-208
lines changed

10 files changed

+307
-208
lines changed

cmake/settings.cmake

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,8 +71,8 @@ if (ARC)
7171
endif()
7272

7373
list(APPEND MLI_PLATFORM_FLAGS
74-
-Hon=Long_enums
75-
"SHELL: -mllvm -gen-lpcc=false"
74+
-Hon=Long_enums -Wcg,-arc-vdsp-AA=1
75+
"SHELL: -mllvm -gen-lpcc=false -mllvm -arc-sort-out-copy=true -mllvm -arc-vdsp-copy=3"
7676
)
7777
if (DEFINED BUILDLIB_DIR)
7878
list(APPEND MLI_PLATFORM_LINK_OPTIONS

lib/src/bricks/impl/mli_krn_dotprod_vdsp.h

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,53 @@ static MLI_FORCE_INLINE acc_T dotprod2D_vv(
170170
#pragma clang diagnostic pop
171171
}
172172

173+
// 2D dot product that accumulates `height` rows of `width` columns into
// `accu`, reading the input rows through addresses prepared up front in a
// vector (`addr_vec`) instead of stepping one pointer from row to row.
//
// Parameters:
//   in             - pointer to the first input element of row 0.
//   krn            - pointer to the first kernel element of row 0.
//   accu           - incoming accumulator; the updated value is returned.
//   width, height  - number of columns / rows to accumulate.
//   in_col_step    - pointer increment applied to the input per column.
//   in_row_step    - distance between input rows; scaled by sizeof(io_T)
//                    below when the per-row addresses are built.
//   kern_col_step  - pointer increment applied to the kernel per column.
//   kern_row_step  - distance between the starts of consecutive kernel rows.
//
// Returns: the accumulator after all width * height MAC steps.
//
// NOTE(review): mli_prv_mac_load_v_v is defined elsewhere; presumably it loads
// one vector from each pointer and multiply-accumulates into accu - confirm.
template <typename io_T, typename w_T, typename acc_T>
174+
static MLI_FORCE_INLINE acc_T dotprod2D_vv_ptrvector(
175+
const MLI_PTR(io_T) __restrict in,
176+
const MLI_PTR(w_T) __restrict krn,
177+
acc_T accu,
178+
const int width,
179+
const int height,
180+
int in_col_step,
181+
int in_row_step,
182+
int kern_col_step,
183+
int kern_row_step) {
184+
// Keep the unadjusted row step: it is the value used to build the per-row
// addresses below.
int in_row_step_orig = in_row_step;
185+
// NOTE(review): the adjusted in_row_step is not read again anywhere in this
// function body; only in_row_step_orig is used from here on.
in_row_step -= width * in_col_step;
186+
// The column loops advance krn by width * kern_col_step per row, so the row
// step is reduced by that amount; adding it after a row lands on the start
// of the next kernel row.
kern_row_step -= width * kern_col_step;
187+
188+
vNint_t addr_vec = 0;
189+
int i = 0;
190+
int offset = in_row_step_orig * sizeof(io_T);
191+
#pragma clang loop unroll(full)
192+
// Fill lanes 0 .. height-2 with the offsets of rows 1 .. height-1 relative
// to `in`, each a multiple of in_row_step_orig * sizeof(io_T).
// NOTE(review): nothing here checks that height - 1 fits in the number of
// lanes of vNint_t - confirm that callers guarantee this.
for (int row = 1; row < height; row++) {
193+
addr_vec[i++] = offset;
194+
offset += in_row_step_orig * sizeof(io_T);
195+
}
196+
// Rewind the lane index so the second pass reads the lanes in the same order.
i = 0;
197+
// Turn the offsets into absolute addresses by adding the input base address.
// NOTE(review): presumably the scalar is added to every lane, and the cast
// assumes a pointer fits in an int on the target - confirm both.
addr_vec += (int)in;
198+
199+
// Row 0 is read directly through `in`. Unlike the loops below, this loop
// carries no unroll pragma.
for (int clmn = 0; clmn < width; clmn++) {
200+
accu = mli_prv_mac_load_v_v(accu, krn, in);
201+
in += in_col_step;
202+
krn += kern_col_step;
203+
}
204+
krn += kern_row_step;
205+
206+
#pragma clang loop unroll(full)
207+
// Rows 1 .. height-1: take each row's start address from the prepared
// vector rather than deriving it from the previous row's pointer.
for (int row = 1; row < height; row++) {
208+
MLI_PTR(io_T) __restrict in_ptr = (MLI_PTR(io_T))addr_vec[i++];
209+
#pragma clang loop unroll(full)
210+
for (int clmn = 0; clmn < width; clmn++) {
211+
accu = mli_prv_mac_load_v_v(accu, krn, in_ptr);
212+
in_ptr += in_col_step;
213+
krn += kern_col_step;
214+
}
215+
krn += kern_row_step;
216+
}
217+
return accu;
218+
}
219+
173220
template < typename in_T, typename w_T, typename acc_T >
174221
static MLI_FORCE_INLINE acc_T dotprod3D_v_pad (
175222
const MLI_PTR (in_T) __restrict in,

lib/src/bricks/mli_krn_dotprod.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ using mli::krn::vdsp::dotprod1D_v;
3333
using mli::krn::vdsp::dotprod1D_v_unroll;
3434
using mli::krn::ref::dotprod2D;
3535
using mli::krn::vdsp::dotprod2D_vv;
36+
using mli::krn::vdsp::dotprod2D_vv_ptrvector;
3637
using mli::krn::ref::dotprod3D;
3738
using mli::krn::vdsp::dotprod3D_v;
3839
using mli::krn::vdsp::dotprod3D_v_unroll;

lib/src/bricks/mli_krn_dotprod_decl.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -268,6 +268,18 @@ static MLI_FORCE_INLINE acc_T dotprod2D_vv(
268268
int kern_col_step,
269269
int kern_row_step);
270270

271+
// Forward declaration of the 2D dot product variant whose definition (in
// impl/mli_krn_dotprod_vdsp.h) reads input rows 1 .. height-1 through
// addresses prepared in a vector. Takes the input and kernel pointers, the
// incoming accumulator, the width/height of the area to accumulate, and the
// column/row steps for input and kernel; returns the updated accumulator.
template <typename io_T, typename w_T, typename acc_T>
272+
static MLI_FORCE_INLINE acc_T dotprod2D_vv_ptrvector(
273+
const MLI_PTR(io_T) __restrict in,
274+
const MLI_PTR(w_T) __restrict krn,
275+
acc_T accu,
276+
const int width,
277+
const int height,
278+
int in_col_step,
279+
int in_row_step,
280+
int kern_col_step,
281+
int kern_row_step);
282+
271283
template < typename in_T, typename w_T, typename acc_T, bool fixed_size = false >
272284
static MLI_FORCE_INLINE acc_T dotprod3D_v (
273285
const MLI_PTR (in_T) __restrict in,

lib/src/kernels/convolution/impl/mli_krn_convolution_dsp.h

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ namespace dsp {
2626
//========================================================
2727
// Depthwise convolution 2D template
2828
//========================================================
29-
template <typename io_T, typename w_T, typename b_T, typename acc_T>
29+
template <typename io_T, typename w_T, typename b_T, typename acc_T, int fix_kernel_width, int fix_kernel_height>
3030
MLI_FORCE_INLINE void depthwise_convolution2D_hwcn_nopad(
3131
const tensor_private_t<MLI_PTR(io_T)> &in,
3232
const conv2d_weights_tensor_private_t<MLI_PTR(w_T)> &w,
@@ -171,7 +171,7 @@ MLI_FORCE_INLINE void depthwise_convolution2D_hwcn_nopad(
171171
} // for ch_mult_idx
172172
}
173173

174-
template <typename io_T, typename w_T, typename b_T, typename acc_T>
174+
template <typename io_T, typename w_T, typename b_T, typename acc_T, int fix_kernel_width, int fix_kernel_height>
175175
MLI_FORCE_INLINE void depthwise_convolution2D_hwcn(
176176
const tensor_private_t<MLI_PTR(io_T)> &in,
177177
const conv2d_weights_tensor_private_t<MLI_PTR(w_T)> &w,
@@ -330,7 +330,7 @@ MLI_FORCE_INLINE void depthwise_convolution2D_hwcn(
330330
}
331331
}
332332

333-
template <typename io_T, typename w_T, typename b_T, typename acc_T>
333+
template <typename io_T, typename w_T, typename b_T, typename acc_T, int fix_kernel_width, int fix_kernel_height>
334334
MLI_FORCE_INLINE void depthwise_convolution2D_hwcn_nopad(
335335
const tensor_private_t<MLI_PTR(io_T)> &in,
336336
const conv2d_weights_tensor_private_t<MLI_PTR(w_T)> &w,
@@ -344,7 +344,7 @@ MLI_FORCE_INLINE void depthwise_convolution2D_hwcn_nopad(
344344
const int dilation_height, const int dilation_width,
345345
const int padding_top, const int padding_left,
346346
const int padding_bot, const int padding_right) {
347-
mli::krn::ref::depthwise_convolution2D<io_T, w_T, b_T, acc_T, fx_quant_specific_params>(
347+
mli::krn::ref::depthwise_convolution2D<io_T, w_T, b_T, acc_T, fx_quant_specific_params, fix_kernel_width, fix_kernel_height>(
348348
in, w, biases, out, perception_area, quant_params,
349349
val_min_limit, val_max_limit,
350350
stride_height, stride_width,
@@ -353,7 +353,7 @@ MLI_FORCE_INLINE void depthwise_convolution2D_hwcn_nopad(
353353
padding_bot, padding_right);
354354
}
355355

356-
template <typename io_T, typename w_T, typename b_T, typename acc_T>
356+
template <typename io_T, typename w_T, typename b_T, typename acc_T, int fix_kernel_width, int fix_kernel_height>
357357
MLI_FORCE_INLINE void depthwise_convolution2D_hwcn(
358358
const tensor_private_t<MLI_PTR(io_T)> &in,
359359
const conv2d_weights_tensor_private_t<MLI_PTR(w_T)> &w,
@@ -367,7 +367,7 @@ MLI_FORCE_INLINE void depthwise_convolution2D_hwcn(
367367
const int dilation_height, const int dilation_width,
368368
const int padding_top, const int padding_left,
369369
const int padding_bot, const int padding_right) {
370-
mli::krn::ref::depthwise_convolution2D<io_T, w_T, b_T, acc_T, fx_quant_specific_params>(
370+
mli::krn::ref::depthwise_convolution2D<io_T, w_T, b_T, acc_T, fx_quant_specific_params, fix_kernel_width, fix_kernel_height>(
371371
in, w, biases, out, perception_area, quant_params,
372372
val_min_limit, val_max_limit,
373373
stride_height, stride_width,
@@ -377,7 +377,7 @@ MLI_FORCE_INLINE void depthwise_convolution2D_hwcn(
377377

378378
}
379379

380-
template <typename io_T, typename w_T, typename b_T, typename acc_T, typename quant_T>
380+
template <typename io_T, typename w_T, typename b_T, typename acc_T, typename quant_T, int fix_kernel_width, int fix_kernel_height>
381381
MLI_FORCE_INLINE void depthwise_convolution2D(
382382
const tensor_private_t<MLI_PTR(io_T)> &in,
383383
const conv2d_weights_tensor_private_t<MLI_PTR(w_T)> &w,
@@ -402,7 +402,7 @@ MLI_FORCE_INLINE void depthwise_convolution2D(
402402

403403
if ((perception_area_nopad.row_end > perception_area_nopad.row_beg)
404404
&& (perception_area_nopad.clmn_end > perception_area_nopad.clmn_beg)){
405-
depthwise_convolution2D_hwcn_nopad<io_T, w_T, b_T, acc_T>(
405+
depthwise_convolution2D_hwcn_nopad<io_T, w_T, b_T, acc_T, fix_kernel_width, fix_kernel_height>(
406406
in, w, biases, out, perception_area_nopad, quant_params,
407407
val_min_limit, val_max_limit,
408408
stride_height, stride_width,
@@ -441,7 +441,7 @@ MLI_FORCE_INLINE void depthwise_convolution2D(
441441
perc_areas[areas_num++].clmn_end = out.width;
442442
}
443443
for(int i = 0; i < areas_num; i ++) {
444-
depthwise_convolution2D_hwcn<io_T, w_T, b_T, acc_T>(
444+
depthwise_convolution2D_hwcn<io_T, w_T, b_T, acc_T, fix_kernel_width, fix_kernel_height>(
445445
in, w, biases, out, perc_areas[i], quant_params,
446446
val_min_limit, val_max_limit,
447447
stride_height, stride_width,

lib/src/kernels/convolution/impl/mli_krn_convolution_ref.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ MLI_FORCE_INLINE void convolution2D(
131131
//========================================================
132132
// Unified Depthwise convolution 2D template
133133
//========================================================
134-
template <typename io_T, typename w_T, typename b_T, typename acc_T, typename quant_T>
134+
template <typename io_T, typename w_T, typename b_T, typename acc_T, typename quant_T, int fix_kernel_width, int fix_kernel_height>
135135
MLI_FORCE_INLINE void depthwise_convolution2D(
136136
const tensor_private_t<MLI_PTR(io_T)> &in,
137137
const conv2d_weights_tensor_private_t<MLI_PTR(w_T)> &weights,
@@ -220,7 +220,7 @@ MLI_FORCE_INLINE void depthwise_convolution2D(
220220
} // for H_idx
221221
}
222222

223-
template <typename io_T, typename w_T, typename b_T, typename acc_T, typename quant_T>
223+
template <typename io_T, typename w_T, typename b_T, typename acc_T, typename quant_T, int fix_kernel_width, int fix_kernel_height>
224224
MLI_FORCE_INLINE void depthwise_convolution2D_wrapper(
225225
MLI_PTR(io_T) __restrict in_ptr,
226226
MLI_PTR(w_T) __restrict w_ptr,
@@ -245,7 +245,7 @@ MLI_FORCE_INLINE void depthwise_convolution2D_wrapper(
245245
weights_.ptr = w_ptr;
246246
out_.ptr = out_ptr;
247247

248-
mli::krn::depthwise_convolution2D<io_T, w_T, b_T, acc_T, quant_T>(
248+
mli::krn::depthwise_convolution2D<io_T, w_T, b_T, acc_T, quant_T, fix_kernel_width, fix_kernel_height>(
249249
in_, weights_, biases, out_, perception_area, quant_params,
250250
val_min_limit, val_max_limit,
251251
stride_height, stride_width, dilation_height, dilation_width,
@@ -358,7 +358,7 @@ MLI_FORCE_INLINE void conv2d_prepare_and_run(
358358
padding_top, padding_left,
359359
padding_bot, padding_right);
360360
} else {
361-
depthwise_convolution2D_wrapper<io_T, w_T, b_T, acc_T, quant_T>(
361+
depthwise_convolution2D_wrapper<io_T, w_T, b_T, acc_T, quant_T, fix_kernel_width, fix_kernel_height>(
362362
in_prv.ptr, weights_prv.ptr, out_prv.ptr,
363363
in_prv, weights_prv, bs, out_prv, cent_area, params,
364364
(io_T)val_limit.min, (io_T)val_limit.max,

0 commit comments

Comments
 (0)