Skip to content

Commit 27b8d8f

Browse files
committed
Optimize conv kernel for HS45d
1 parent 0bbea32 commit 27b8d8f

File tree

9 files changed

+730
-125
lines changed

9 files changed

+730
-125
lines changed

lib/src/bricks/impl/mli_krn_dotprod_vdsp.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -358,7 +358,7 @@ static MLI_FORCE_INLINE vNx4short_t make_vindex2(
358358
int in_row_step,
359359
int unroll = 1,
360360
int in_unroll_step = 0) {
361-
vNx4short_t vindex;
361+
vNx4short_t vindex = 0;
362362
int vec_length = (sizeof(vNx2short_t) / sizeof(short));
363363
int idx = 0;
364364
MLI_ASSERT(width * height * unroll <= 2 * vec_length);
Lines changed: 104 additions & 100 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
/*
2-
* Copyright 2019-2020, Synopsys, Inc.
2+
* Copyright 2019-2025, Synopsys, Inc.
33
* All rights reserved.
44
*
55
* This source code is licensed under the BSD-3-Clause license found in
66
* the LICENSE file in the root directory of this source tree.
77
*
88
*/
99

10-
#ifndef _MLI_KRN_FULLY_CONNECTED_H_
11-
#define _MLI_KRN_FULLY_CONNECTED_H_
10+
#ifndef _MLI_KRN_FULLY_CONNECTED_DSP_H_
11+
#define _MLI_KRN_FULLY_CONNECTED_DSP_H_
1212

1313
#include "mli_config.h"
1414
#include "mli_debug.h"
@@ -19,110 +19,114 @@
1919
#include "math.h"
2020
#include "mli_prv_quant.h"
2121

22-
//================================================
23-
// Old version of optimized fully connected code
24-
//================================================
25-
26-
#if 0
27-
template <typename io_T, typename w_T>
28-
static MLI_FORCE_INLINE void full_connection(
29-
const MLI_PTR(io_T) __restrict in_ptr,
30-
const MLI_PTR(w_T) __restrict w_ptr,
31-
const MLI_PTR(w_T) bias_p,
32-
MLI_CONV_OUT_PTR(io_T) __restrict o_ptr,
33-
const int ch_out,
34-
const int inp_size,
22+
namespace mli {
23+
namespace krn {
24+
namespace dsp {
25+
26+
#pragma MLI_CODE_SECTION_START(".mli_lib")
27+
28+
//========================================================
29+
// Unified IP (Inner Product) template
30+
//========================================================
31+
template <typename io_T, typename w_T, typename b_T, typename acc_T, typename quant_T, bool no_zp>
32+
MLI_FORCE_INLINE void inner_product(
33+
const MLI_PTR(io_T) __restrict in,
34+
const MLI_PTR(w_T) __restrict weights,
35+
const MLI_PTR(b_T) __restrict biases,
36+
MLI_CONV_OUT_PTR(io_T) __restrict out,
37+
const int in_elements,
38+
const int out_elements,
3539
const int w_ch_out_mem_stride,
36-
const int bias_shift,
37-
const int out_shift) {
38-
if (_Rarely(inp_size < 8)) {
39-
for (int i = 0; i < ch_out; i++) {
40-
auto ip_out = mli_prv_init_accu_with_bias(in_ptr, *bias_p++, bias_shift);
41-
for (int j = 0; j < inp_size; j++) {
42-
mli_prv_load_mac(&ip_out, in_ptr++, w_ptr++);
43-
}
44-
in_ptr -= inp_size;
45-
w_ptr += w_ch_out_mem_stride - inp_size;
46-
47-
mli_prv_clip_and_store_output(o_ptr++, &ip_out, out_shift);
48-
}
49-
} else {
50-
if ((inp_size & 0x3) == 0) {
51-
const MLI_PTR(io_T) start_in_ptr = in_ptr;
52-
for (int i = 0; i < ch_out; i++) {
53-
auto ip_out = mli_prv_init_accu_with_bias(in_ptr, *bias_p++, bias_shift);
54-
55-
LOOP_PIPELINE_ENABLE
56-
LOOP_PIPELINE_ENABLE_BACKTRACKING
57-
for (int j = 0; j < (inp_size / 4); j++) {
58-
mli_prv_load_mac_vec4(&ip_out, in_ptr, w_ptr);
59-
in_ptr += 4;
60-
w_ptr += 4;
40+
quant_T quant_params,
41+
const io_T val_min_limit,
42+
const io_T val_max_limit) {
43+
// Unified Inner Product for both quantization schemes: MLI_FX (symmetric data, scales are power of two)
44+
// and s8asym (asymmetric data, scales of any value)
45+
// Calculation implies dotproduct and bias add:
46+
// out_val = sum_i(x_r * w_r) + b_r
47+
//
48+
// Considering asymmetric types (x_r = (x - x_zp) and w_r = (w - w_zp)), plus bias b_r:
49+
// out_val = sum_i((x-x_zp)*(w-w_zp)) + b_r
50+
//
51+
// after expanding the brackets:
52+
// out_val = sum_i(x*w) - sum_i(w*x_zp) - sum_i(x*w_zp) + sum_i(w_zp*x_zp) + b_r
53+
// where:
54+
// sum_i(x*w) - generic dotproduct which can't be avoided for any type
55+
// -sum_i(w*x_zp) - weights_additive.
56+
// Always zero for FX and can be reused in output channel calculations for s8asym
57+
// -sum_i(x*w_zp) - in_additive
58+
// Always zero for both FX and TF_s8asym assuming symmetric weights (w_zp == 0)
59+
// sum_i(w_zp*x_zp)- zp_additive
60+
// Always zero for both FX and TF_s8asym assuming symmetric weights (w_zp == 0)
61+
// b_r - bias_additive
62+
// (must be of the same type as accumulator, that may require bias re-quantization)
63+
//============================================
64+
#ifdef __Xdsp_wide
65+
if (quant_params_get_weigths_zeropoint(&quant_params) == 0 && (out_elements & 1) == 0) {
66+
int32_t bias_e = 1 << (((const fx_quant_specific_params *)(void*)&quant_params)->bias_shift);
67+
if(std::is_same<acc_T, mli_acc40_t>::value && std::is_same<io_T, int16_t>::value && std::is_same<w_T, int16_t>::value && bias_e < ((1 << 15) - 1)) {
68+
v2q15_t vval_max_limit = fx_replic_v2q15(val_max_limit);
69+
v2q15_t vval_min_limit = fx_replic_v2q15(val_min_limit);
70+
71+
int remaining_part = in_elements & 3;
72+
for (int o_idx = 0; o_idx < out_elements; o_idx += 2) {
73+
v2i16_t vbias_e = fx_create_v2i16(bias_e, bias_e);
74+
int32_t out_shift = ((const fx_quant_specific_params *)(void *)&quant_params)->out_shift - 16;
75+
v2accum40_t accu = fx_v2a40_mpy_v2q15(*(const v2q15_t *__restrict)(void *)&biases[o_idx], vbias_e);
76+
77+
for (int i = 0; i < in_elements - remaining_part; i += 4) {
78+
accu = fx_v2a40_mac_v2q15(accu, fx_replic_v2q15(in[i + 0]), *(const v2q15_t *__restrict)(void *)&weights[o_idx + (i + 0) * w_ch_out_mem_stride]);
79+
accu = fx_v2a40_mac_v2q15(accu, fx_replic_v2q15(in[i + 1]), *(const v2q15_t *__restrict)(void *)&weights[o_idx + (i + 1) * w_ch_out_mem_stride]);
80+
accu = fx_v2a40_mac_v2q15(accu, fx_replic_v2q15(in[i + 2]), *(const v2q15_t *__restrict)(void *)&weights[o_idx + (i + 2) * w_ch_out_mem_stride]);
81+
accu = fx_v2a40_mac_v2q15(accu, fx_replic_v2q15(in[i + 3]), *(const v2q15_t *__restrict)(void *)&weights[o_idx + (i + 3) * w_ch_out_mem_stride]);
6182
}
62-
in_ptr -= inp_size;
63-
w_ptr += w_ch_out_mem_stride - inp_size;
64-
MLI_EXTRA_ASSERT(start_in_ptr == in_ptr);
65-
66-
mli_prv_clip_and_store_output(o_ptr++, &ip_out, out_shift);
67-
}
68-
} else {
69-
const MLI_PTR(io_T) start_in_ptr = in_ptr;
70-
for (int i = 0; i < ch_out; i++) {
71-
auto ip_out = mli_prv_init_accu_with_bias(in_ptr, *bias_p++, bias_shift);
72-
73-
int odd_rest_of_inp_size = (inp_size & 0x3);
74-
for (int k = 0; k < odd_rest_of_inp_size; k++) {
75-
mli_prv_load_mac(&ip_out, in_ptr++, w_ptr++);
83+
84+
for (int r = (in_elements - remaining_part); r < in_elements; r++) {
85+
accu = fx_v2a40_mac_v2q15(accu, fx_replic_v2q15(in[r]), *(const v2q15_t *__restrict)(void *)&weights[o_idx + r * w_ch_out_mem_stride]);
7686
}
7787

78-
int even_inp_size = inp_size - odd_rest_of_inp_size;
79-
LOOP_PIPELINE_ENABLE
80-
LOOP_PIPELINE_ENABLE_BACKTRACKING
81-
for (int j = 0; j < (even_inp_size / 4); j++) {
82-
mli_prv_load_mac_vec4(&ip_out, in_ptr, w_ptr);
83-
in_ptr += 4;
84-
w_ptr += 4;
85-
}
86-
in_ptr -= inp_size;
87-
w_ptr += w_ch_out_mem_stride - inp_size;
88-
MLI_EXTRA_ASSERT(start_in_ptr == in_ptr);
89-
90-
mli_prv_clip_and_store_output(o_ptr++, &ip_out, out_shift);
88+
// Cast result to output type, apply built-in ReLU and write result
89+
v2q15_t out_val = fx_v2q15_cast_nf_asr_rnd_v2a40(accu, out_shift);
90+
out_val = fx_min_v2q15(out_val, vval_max_limit);
91+
out_val = fx_max_v2q15(out_val, vval_min_limit);
92+
93+
v2q15_t *__restrict out_ptr = (v2q15_t *__restrict)(void *)&out[o_idx];
94+
*out_ptr = out_val;
9195
}
96+
} else {
97+
mli::krn::ref::inner_product<io_T, w_T, b_T, acc_T, quant_T, no_zp>(in,
98+
weights,
99+
biases,
100+
out,
101+
in_elements,
102+
out_elements,
103+
w_ch_out_mem_stride,
104+
quant_params,
105+
val_min_limit,
106+
val_max_limit);
92107
}
93-
}
94-
}
108+
}
109+
else
110+
#endif // #ifdef __Xdsp_wide
111+
{
112+
mli::krn::ref::inner_product<io_T, w_T, b_T, acc_T, quant_T, no_zp>(in,
113+
weights,
114+
biases,
115+
out,
116+
in_elements,
117+
out_elements,
118+
w_ch_out_mem_stride,
119+
quant_params,
120+
val_min_limit,
121+
val_max_limit);
122+
}
95123

96-
template <typename io_T, typename w_T>
97-
static MLI_FORCE_INLINE void fully_connected_prepare_and_run_fx(
98-
const mli_tensor* in,
99-
const mli_tensor* weights,
100-
const mli_tensor* bias,
101-
mli_tensor* out) {
102-
mli_prv_fx_init_dsp_ctrl();
103-
104-
const MLI_PTR(io_T) in_ptr = mli_prv_tensor_data_ptr<MLI_PTR(io_T)>(in);
105-
const MLI_PTR(w_T) w_ptr = mli_prv_tensor_data_ptr<MLI_PTR(w_T)>(weights);
106-
const MLI_PTR(w_T) b_ptr = mli_prv_tensor_data_ptr<MLI_PTR(w_T)>(bias);
107-
MLI_CONV_OUT_PTR(io_T) out_ptr = mli_prv_tensor_data_ptr<MLI_CONV_OUT_PTR(io_T)>(out);
108-
109-
int ch_out = weights->shape[0];
110-
int in_sz = mli_prv_count_elem_num(in);
111-
int w_ch_out_mem_stride_from_tensor = weights->mem_stride[0];
112-
int w_ch_out_mem_stride = (w_ch_out_mem_stride_from_tensor != 0) ?
113-
w_ch_out_mem_stride_from_tensor : in_sz;
124+
}
114125

115-
// Define shift values
116-
const int bias_shift = mli_prv_calc_shift(in, weights, bias);
117-
const int out_shift = mli_prv_calc_shift(in, weights, out);
126+
#pragma MLI_CODE_SECTION_END()
118127

119-
// Run basic calculation
120-
full_connection<io_T, w_T>(in_ptr, w_ptr, b_ptr, out_ptr, ch_out, in_sz, w_ch_out_mem_stride,
121-
bias_shift, out_shift);
128+
} // namespace dsp
129+
} // namespace krn
130+
} // namespace mli
122131

123-
// fill output tensor parameters
124-
out->el_type = in->el_type;
125-
out->shape[0] = ch_out;
126-
out->rank = 1;
127-
}
128-
#endif
132+
#endif // _MLI_KRN_FULLY_CONNECTED_DSP_H_

lib/src/kernels/common/mli_krn_fully_connected.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright 2019-2020, Synopsys, Inc.
2+
* Copyright 2019-2025, Synopsys, Inc.
33
* All rights reserved.
44
*
55
* This source code is licensed under the BSD-3-Clause license found in
@@ -32,7 +32,7 @@ using mli::krn::vdsp::inner_product;
3232
using mli::krn::ref::fully_connected_prepare_and_run;
3333

3434
#elif !defined(MLI_BUILD_REFERENCE) && defined(__FXAPI__)
35-
using mli::krn::ref::inner_product;
35+
using mli::krn::dsp::inner_product;
3636
using mli::krn::ref::fully_connected_prepare_and_run;
3737

3838
#else
@@ -56,7 +56,7 @@ using mli::krn::ref::fully_connected_prepare_and_run;
5656
#endif
5757

5858
#if !defined(MLI_BUILD_REFERENCE) && defined(__FXAPI__)
59-
//#include "impl/mli_krn_fully_connected_dsp.h"
59+
#include "impl/mli_krn_fully_connected_dsp.h"
6060
#endif
6161

6262
#endif //_MLI_KRN_FULLY_CONNECTED_H_

lib/src/kernels/common/mli_krn_fully_connected_decl.h

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright 2020-2020, Synopsys, Inc.
2+
* Copyright 2020-2025, Synopsys, Inc.
33
* All rights reserved.
44
*
55
* This source code is licensed under the BSD-3-Clause license found in
@@ -55,7 +55,18 @@ MLI_FORCE_INLINE void fully_connected_prepare_and_run(
5555
// DSP
5656
////////////////////////////////////////////////////////////////////////////////
5757
namespace dsp {
58-
58+
template <typename io_T, typename w_T, typename b_T, typename acc_T, typename quant_T, bool no_zp>
59+
MLI_FORCE_INLINE void inner_product(
60+
const MLI_PTR(io_T) __restrict in,
61+
const MLI_PTR(w_T) __restrict weights,
62+
const MLI_PTR(b_T) __restrict biases,
63+
MLI_CONV_OUT_PTR(io_T) __restrict out,
64+
const int in_elements,
65+
const int out_elements,
66+
const int w_ch_out_mem_stride,
67+
quant_T quant_params,
68+
const io_T val_min_limit,
69+
const io_T val_max_limit);
5970
} // namespace dsp
6071

6172
////////////////////////////////////////////////////////////////////////////////

0 commit comments

Comments
 (0)