Skip to content

Commit 536cdb2

Browse files
kiyaevJaccovG
authored and committed
Use pointers to pointers for depthwise convolution and create specific functions for conv2d to save the pointers' state after a function call
1 parent 9702171 commit 536cdb2

File tree

4 files changed

+206
-16
lines changed

4 files changed

+206
-16
lines changed

lib/src/kernels/convolution/mli_krn_conv2d_hwc.h

Lines changed: 21 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -83,9 +83,9 @@ static __attribute__ ((always_inline)) void depthwise_convolution2D_hwcn_nopad(
8383
const int out_increment_in_ch_loop = channel_per_loop - out_compensation_row_loop;
8484
const int out_increment_in_ch_loop_v = channels_per_loop_v - out_compensation_row_loop;
8585

86-
MLI_PTR(io_T) __restrict in_ptr = (MLI_PTR(io_T) __restrict)in_ftrs;
86+
const MLI_PTR(io_T) __restrict in_ptr = (MLI_PTR(io_T) __restrict)in_ftrs;
8787
MLI_CONV_OUT_PTR(io_T) __restrict out_ptr = (MLI_CONV_OUT_PTR(io_T) __restrict)out_ftrs;
88-
MLI_PTR(w_T) __restrict w_ptr = (MLI_PTR(w_T) __restrict)weights;
88+
const MLI_PTR(w_T) __restrict w_ptr = (MLI_PTR(w_T) __restrict)weights;
8989
// MLI_PTR(w_T) __restrict w_ptr_local = (MLI_PTR(w_T) __restrict)weights;
9090
int out_ch_idx = 0;
9191

@@ -114,20 +114,21 @@ static __attribute__ ((always_inline)) void depthwise_convolution2D_hwcn_nopad(
114114
__builtin_assume(amount_columns > 0);
115115

116116
__v2i32_t v2accu_dotprod = v2acc_weights_add;
117-
dotprod2D_hwc_v(in_ptr, w_ptr, &v2accu_dotprod, kernel_width, kernel_height,
118-
in_col_step, in_row_step, krn_col_step, krn_row_step);
119-
in_ptr += in_increment_clmn_loop;
120117
for (int W_idx = 0; W_idx < amount_columns; W_idx++) {
118+
dotprod2D_hwc_v(&in_ptr, &w_ptr, &v2accu_dotprod, kernel_width, kernel_height,
119+
in_col_step, in_row_step, krn_col_step, krn_row_step);
120+
//compensate for the increment of the input tensor pointer done inside dotprod2D_hwc_v
121+
in_ptr += in_increment_clmn_loop - kernel_height * in_row_step;
122+
//compensate for the increment of the weights pointer done inside dotprod2D_hwc_v
123+
w_ptr -= kernel_height * krn_row_step;
124+
121125
// Cast result to output type
122126
mli_prv_clip_relu_store_output_v(out_ptr, &v2accu_dotprod, v2quant_params, val_min_limit, val_max_limit);
123127
out_ptr += out_increment_clmn_loop;
124128

125129
v2accu_dotprod = v2acc_weights_add;
126-
dotprod2D_hwc_v(in_ptr, w_ptr, &v2accu_dotprod, kernel_width, kernel_height,
127-
in_col_step, in_row_step, krn_col_step, krn_row_step);
128-
in_ptr += in_increment_clmn_loop;
129130
} // for W_idx
130-
in_ptr += in_increment_row_loop - stride_width * filters * in_ch;
131+
in_ptr += in_increment_row_loop;
131132
out_ptr += out_increment_row_loop;
132133
} // for H_idx
133134
in_ptr -= in_compensation_row_loop;
@@ -154,28 +155,31 @@ static __attribute__ ((always_inline)) void depthwise_convolution2D_hwcn_nopad(
154155
// out_val = (x-x_zp)*(w) + b) = -sum_i(w*x_zp) + sum(x*w) + b
155156
//============================================
156157
__v2i32_t accu = v2global_other_additives;
157-
accu = dotprod2D_inp_width_v(in_ptr, w_ptr, &accu, kernel_width, kernel_height,
158+
accu = dotprod2D_inp_width_v(&in_ptr, &w_ptr, &accu, kernel_width, kernel_height,
158159
in_col_step, in_row_step, krn_col_step, krn_row_step, in_increment_clmn_loop);
159-
160+
//compensate for the increment of the input tensor pointer done inside dotprod2D_inp_width_v
161+
in_ptr += 2 * in_increment_clmn_loop - kernel_height * in_row_step;
162+
//compensate for the increment of the weights pointer done inside dotprod2D_inp_width_v
163+
w_ptr -= kernel_height * krn_row_step;
160164
// Cast result to output type
161165
mli_prv_clip_relu_store_output_inp_width_v(out_ptr, &accu, &quant_params, val_min_limit, val_max_limit, out_increment_clmn_loop);
162-
163-
in_ptr += 2 * in_increment_clmn_loop;
164166
out_ptr += 2 * out_increment_clmn_loop;
165167
} // for W_idx
166168
if( amount_columns & 0x1) {
167169
// Convolution core. Here calculations performes in a unfolded expression way:
168170
// out_val = (x-x_zp)*(w) + b) = -sum_i(w*x_zp) + sum(x*w) + b
169171
//============================================
170172
acc_T accu = 0;
171-
accu = dotprod2D(in_ptr, w_ptr, accu, kernel_width, kernel_height,
173+
accu = dotprod2D(&in_ptr, &w_ptr, accu, kernel_width, kernel_height,
172174
in_col_step, in_row_step, krn_col_step, krn_row_step);
175+
//compensate for the increment of the input tensor pointer done inside dotprod2D
176+
in_ptr += in_increment_clmn_loop - kernel_height * in_row_step;
177+
//compensate for the increment of the weights pointer done inside dotprod2D
178+
w_ptr -= kernel_height * krn_row_step;
173179
accu += global_other_additives;
174180

175181
// Cast result to output type
176182
mli_prv_clip_relu_store_output(out_ptr, accu, &quant_params, val_min_limit, val_max_limit);
177-
178-
in_ptr += in_increment_clmn_loop;
179183
out_ptr += out_increment_clmn_loop;
180184
}
181185
in_ptr += in_increment_row_loop;
@@ -882,3 +886,4 @@ LOOP_PIPELINE_ENABLE
882886

883887
#endif // _MLI_KRN_CONV2D_HWC_H_
884888

889+

lib/src/kernels/convolution/mli_krn_dotprod.h

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,130 @@ static void __attribute__ ((always_inline)) dotprod2D_hwc_v (
7575
}
7676
}
7777

78+
//The function uses pointers to pointers for in and krn.
79+
//The caller of the function should compensate for the increment
80+
//done inside this function.
81+
template < typename in_T, typename w_T, typename acc_T >
82+
static void __attribute__ ((always_inline)) dotprod2D_hwc_d (
83+
const MLI_PTR(in_T) __restrict *in,
84+
const MLI_PTR(w_T) __restrict *krn,
85+
acc_T * accu,
86+
const int width,
87+
const int height,
88+
int in_col_step,
89+
int in_row_step,
90+
int kern_col_step,
91+
int kern_row_step) {
92+
in_row_step -= width * in_col_step;
93+
kern_row_step -= width * kern_col_step;
94+
95+
#pragma clang loop unroll(full)
96+
for (int32_t row = 0; row < height; row++) {
97+
#pragma clang loop unroll(full)
98+
for (int32_t clmn = 0; clmn < width; clmn++) {
99+
mli_prv_load_mac_vec2 (accu, *in, *krn);
100+
*krn += kern_col_step;
101+
*in += in_col_step;
102+
}
103+
*in += in_row_step;
104+
*krn += kern_row_step;
105+
}
106+
}
107+
108+
//The function uses pointers to pointers for in and krn.
109+
//The caller of the function should compensate for the increment
110+
//done inside this function.
111+
template < typename in_T, typename w_T, typename acc_T >
112+
static void __attribute__ ((always_inline)) dotprod2D_hwc_v (
113+
const MLI_PTR(in_T) __restrict *in,
114+
const MLI_PTR(w_T) __restrict *krn,
115+
acc_T * accu,
116+
const int width,
117+
const int height,
118+
int in_col_step,
119+
int in_row_step,
120+
int kern_col_step,
121+
int kern_row_step) {
122+
123+
in_row_step -= width * in_col_step;
124+
kern_row_step -= width * kern_col_step;
125+
#pragma clang loop unroll(full)
126+
for (int32_t row = 0; row < height; row++) {
127+
#pragma clang loop unroll(full)
128+
for (int32_t clmn = 0; clmn < width; clmn++) {
129+
v2q15_t k_v = mli_prv_load_2_samples(*krn);
130+
*krn += kern_col_step;
131+
v2q15_t tx = mli_prv_load_2_samples(*in);
132+
*in += in_col_step;
133+
mli_math_mac_fx_vec2 (accu, tx, k_v);
134+
}
135+
*in += in_row_step;
136+
*krn += kern_row_step;
137+
}
138+
}
139+
140+
//The function uses pointers to pointers for in and krn.
141+
//The caller of the function should compensate for the increment
142+
//done inside this function.
143+
template <typename io_T, typename w_T, typename acc_T>
144+
static acc_T __attribute__ ((always_inline)) dotprod2D_inp_width_v(
145+
const MLI_PTR(io_T) __restrict *inp,
146+
const MLI_PTR(w_T) __restrict *krn,
147+
acc_T *accu,
148+
const int width,
149+
const int height,
150+
int in_col_step,
151+
int in_row_step,
152+
int kern_col_step,
153+
int kern_row_step,
154+
int in_width_step) {
155+
in_row_step -= width * in_col_step;
156+
kern_row_step -= width * kern_col_step;
157+
#pragma clang loop unroll(full)
158+
for (int row = 0; row < height; row++) {
159+
#pragma clang loop unroll(full)
160+
for (int clmn = 0; clmn < width; clmn++) {
161+
int16_t k = **krn;
162+
v2q15_t k_v = { k, k };
163+
v2q15_t in_v = {(*inp)[0], (*inp)[in_width_step]};
164+
mli_math_mac_fx_vec2(accu, in_v, k_v);
165+
*inp += in_col_step;
166+
*krn += kern_col_step;
167+
}
168+
*inp += in_row_step;
169+
*krn += kern_row_step;
170+
}
171+
return *accu;
172+
}
173+
174+
//The function uses pointers to pointers for in and krn.
175+
//The caller of the function should compensate for the increment
176+
//done inside this function.
177+
template <typename io_T, typename w_T, typename acc_T>
178+
static acc_T __attribute__ ((always_inline)) dotprod2D(
179+
const MLI_PTR(io_T) __restrict *in,
180+
const MLI_PTR(w_T) __restrict *krn,
181+
acc_T accu,
182+
const int width,
183+
const int height,
184+
int in_col_step,
185+
int in_row_step,
186+
int kern_col_step,
187+
int kern_row_step) {
188+
in_row_step -= width * in_col_step;
189+
kern_row_step -= width * kern_col_step;
190+
for (int row = 0; row < height; row++) {
191+
for (int clmn = 0; clmn < width; clmn++) {
192+
accu = mli_math_mac_fx(accu, (**in), (**krn));
193+
*in += in_col_step;
194+
*krn += kern_col_step;
195+
}
196+
*in += in_row_step;
197+
*krn += kern_row_step;
198+
}
199+
return accu;
200+
}
201+
78202
template < typename in_T, typename w_T, typename acc_T >
79203
static void __attribute__ ((always_inline)) dotprod2D_hwc_d (
80204
const MLI_PTR(in_T) __restrict in,

lib/src/kernels/pooling/mli_krn_reduce_sum2d.h

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -298,6 +298,48 @@ inline acc_T __attribute__((always_inline)) reduce_sum2D(
298298
return accu;
299299
}
300300

301+
//The function uses pointers to pointers for in.
302+
//The caller of the function should compensate for the increment
303+
//done inside this function.
304+
template <typename io_T, typename acc_T>
305+
static inline acc_T __attribute__((always_inline)) reduce_sum2D_v(
306+
const MLI_PTR(io_T) __restrict *in,
307+
const int16_t mul,
308+
acc_T *v2acc,
309+
310+
const int width,
311+
const int height,
312+
int in_col_step,
313+
int in_row_step) {
314+
315+
v2q15_t v2mul = {mul, mul};
316+
if (width == 1){
317+
#pragma clang loop unroll(full)
318+
for (int row = 0; row < height; row++) {
319+
mli_math_mac_fx_vec2(v2acc, mli_prv_load_2_samples(*in), v2mul);
320+
*in += in_row_step;
321+
}
322+
} else if (height == 1){
323+
#pragma clang loop unroll(full)
324+
for (int clmn = 0; clmn < width; clmn++) {
325+
mli_math_mac_fx_vec2(v2acc, mli_prv_load_2_samples(*in), v2mul);
326+
*in += in_col_step;
327+
}
328+
} else {
329+
in_row_step -= width * in_col_step;
330+
#pragma clang loop unroll(full)
331+
for (int row = 0; row < height; row++) {
332+
#pragma clang loop unroll(full)
333+
for (int clmn = 0; clmn < width; clmn++) {
334+
mli_math_mac_fx_vec2(v2acc, mli_prv_load_2_samples(*in), v2mul);
335+
*in += in_col_step;
336+
}
337+
*in += in_row_step;
338+
}
339+
}
340+
return *v2acc;
341+
}
342+
301343
template <typename io_T, typename acc_T>
302344
static inline acc_T __attribute__((always_inline)) reduce_sum2D_v(
303345
const MLI_PTR(io_T) __restrict in,

lib/src/private/mli_prv_quant.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,25 @@ inline mli_acc32_t __attribute__ ((always_inline)) weights_additive(
125125
return init_accum;
126126
}
127127

128+
//The function uses pointers to pointers for weights.
129+
//The caller of the function should compensate for the increment
130+
//done inside this function.
131+
template <typename acc_T>
132+
inline acc_T __attribute__ ((always_inline)) weights_additive_v(
133+
const MLI_PTR(int8_t) __restrict *weights, acc_T *init_accum,
134+
const s8asym_quant_specific_params* quant_params,
135+
const int width, const int height, int col_step, int row_step) {
136+
137+
// returns -(in_zero_point * cumsum(weights)) For S8ASYM
138+
if (quant_params->in_offset != 0) {
139+
acc_T tmp_acc = reduce_sum2D_v(weights, -quant_params->in_offset, init_accum, width, height, col_step, row_step);
140+
//compensite increment of weights pointer from reduce_sum2D_v function
141+
weights -= height * row_step;
142+
return tmp_acc;
143+
} else {
144+
return *init_accum;
145+
}
146+
}
128147

129148
template <typename acc_T>
130149
inline acc_T __attribute__ ((always_inline)) weights_additive_v(

0 commit comments

Comments
 (0)