@@ -1,14 +1,14 @@
 /*
-* Copyright 2019-2020, Synopsys, Inc.
+* Copyright 2019-2025, Synopsys, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-3-Clause license found in
 * the LICENSE file in the root directory of this source tree.
 *
 */

-#ifndef _MLI_KRN_FULLY_CONNECTED_H_
-#define _MLI_KRN_FULLY_CONNECTED_H_
+#ifndef _MLI_KRN_FULLY_CONNECTED_DSP_H_
+#define _MLI_KRN_FULLY_CONNECTED_DSP_H_

 #include "mli_config.h"
 #include "mli_debug.h"
@@ -19,110 +19,114 @@
 #include "math.h"
 #include "mli_prv_quant.h"

-//================================================
-// Old version of optimized fully connected code
-//================================================
-
-#if 0
-template <typename io_T, typename w_T>
-static MLI_FORCE_INLINE void full_connection(
-        const MLI_PTR(io_T) __restrict in_ptr,
-        const MLI_PTR(w_T) __restrict w_ptr,
-        const MLI_PTR(w_T) bias_p,
-        MLI_CONV_OUT_PTR(io_T) __restrict o_ptr,
-        const int ch_out,
-        const int inp_size,
+namespace mli {
+namespace krn {
+namespace dsp {
+
+#pragma MLI_CODE_SECTION_START(".mli_lib")
+
+//========================================================
+// Unified IP (Inner Product) template
+//========================================================
+template <typename io_T, typename w_T, typename b_T, typename acc_T, typename quant_T, bool no_zp>
+MLI_FORCE_INLINE void inner_product(
+        const MLI_PTR(io_T) __restrict in,
+        const MLI_PTR(w_T) __restrict weights,
+        const MLI_PTR(b_T) __restrict biases,
+        MLI_CONV_OUT_PTR(io_T) __restrict out,
+        const int in_elements,
+        const int out_elements,
         const int w_ch_out_mem_stride,
-        const int bias_shift,
-        const int out_shift) {
-    if (_Rarely(inp_size < 8)) {
-        for (int i = 0; i < ch_out; i++) {
-            auto ip_out = mli_prv_init_accu_with_bias(in_ptr, *bias_p++, bias_shift);
-            for (int j = 0; j < inp_size; j++) {
-                mli_prv_load_mac(&ip_out, in_ptr++, w_ptr++);
-            }
-            in_ptr -= inp_size;
-            w_ptr += w_ch_out_mem_stride - inp_size;
-
-            mli_prv_clip_and_store_output(o_ptr++, &ip_out, out_shift);
-        }
-    } else {
-        if ((inp_size & 0x3) == 0) {
-            const MLI_PTR(io_T) start_in_ptr = in_ptr;
-            for (int i = 0; i < ch_out; i++) {
-                auto ip_out = mli_prv_init_accu_with_bias(in_ptr, *bias_p++, bias_shift);
-
-LOOP_PIPELINE_ENABLE
-LOOP_PIPELINE_ENABLE_BACKTRACKING
-                for (int j = 0; j < (inp_size / 4); j++) {
-                    mli_prv_load_mac_vec4(&ip_out, in_ptr, w_ptr);
-                    in_ptr += 4;
-                    w_ptr += 4;
+        quant_T quant_params,
+        const io_T val_min_limit,
+        const io_T val_max_limit) {
+    // Unified Inner Product for both quantization schemes: MLI_FX (symmetric data, scales are powers of two)
+    // and s8asym (asymmetric data, scales of any value)
+    // The calculation implies a dot product and a bias add:
+    //        out_val = sum_i(x_r * w_r) + b_r
+    //
+    // Considering asymmetric types (x_r = (x - x_zp) and w_r = (w - w_zp)):
+    //        out_val = sum_i((x - x_zp) * (w - w_zp)) + b_r
+    //
+    // Expanding the brackets:
+    //        out_val = sum_i(x*w) - sum_i(w*x_zp) - sum_i(x*w_zp) + sum_i(w_zp*x_zp) + b_r
+    // where:
+    //        sum_i(x*w)       - generic dot product which can't be avoided for any type
+    //        -sum_i(w*x_zp)   - weights_additive.
+    //                           Always zero for FX; can be reused across output channel calculations for s8asym
+    //        -sum_i(x*w_zp)   - in_additive
+    //                           Always zero for both FX and TF_s8asym assuming symmetric weights (w_zp == 0)
+    //        sum_i(w_zp*x_zp) - zp_additive
+    //                           Always zero for both FX and TF_s8asym assuming symmetric weights (w_zp == 0)
+    //        b_r              - bias_additive
+    //                           (must be of the same type as the accumulator, which may require bias re-quantization)
+    //============================================
+#ifdef __Xdsp_wide
+    if (quant_params_get_weigths_zeropoint(&quant_params) == 0 && (out_elements & 1) == 0) {
+        int32_t bias_e = 1 << (((const fx_quant_specific_params *)(void*)&quant_params)->bias_shift);
+        if (std::is_same<acc_T, mli_acc40_t>::value && std::is_same<io_T, int16_t>::value && std::is_same<w_T, int16_t>::value && bias_e < ((1 << 15) - 1)) {
+            v2q15_t vval_max_limit = fx_replic_v2q15(val_max_limit);
+            v2q15_t vval_min_limit = fx_replic_v2q15(val_min_limit);
+
+            int remaining_part = in_elements & 3;
+            for (int o_idx = 0; o_idx < out_elements; o_idx += 2) {
+                v2i16_t vbias_e = fx_create_v2i16(bias_e, bias_e);
+                int32_t out_shift = ((const fx_quant_specific_params *)(void *)&quant_params)->out_shift - 16;
+                v2accum40_t accu = fx_v2a40_mpy_v2q15(*(const v2q15_t *__restrict)(void *)&biases[o_idx], vbias_e);
+
+                for (int i = 0; i < in_elements - remaining_part; i += 4) {
+                    accu = fx_v2a40_mac_v2q15(accu, fx_replic_v2q15(in[i + 0]), *(const v2q15_t *__restrict)(void *)&weights[o_idx + (i + 0) * w_ch_out_mem_stride]);
+                    accu = fx_v2a40_mac_v2q15(accu, fx_replic_v2q15(in[i + 1]), *(const v2q15_t *__restrict)(void *)&weights[o_idx + (i + 1) * w_ch_out_mem_stride]);
+                    accu = fx_v2a40_mac_v2q15(accu, fx_replic_v2q15(in[i + 2]), *(const v2q15_t *__restrict)(void *)&weights[o_idx + (i + 2) * w_ch_out_mem_stride]);
+                    accu = fx_v2a40_mac_v2q15(accu, fx_replic_v2q15(in[i + 3]), *(const v2q15_t *__restrict)(void *)&weights[o_idx + (i + 3) * w_ch_out_mem_stride]);
                 }
-                in_ptr -= inp_size;
-                w_ptr += w_ch_out_mem_stride - inp_size;
-                MLI_EXTRA_ASSERT(start_in_ptr == in_ptr);
-
-                mli_prv_clip_and_store_output(o_ptr++, &ip_out, out_shift);
-            }
-        } else {
-            const MLI_PTR(io_T) start_in_ptr = in_ptr;
-            for (int i = 0; i < ch_out; i++) {
-                auto ip_out = mli_prv_init_accu_with_bias(in_ptr, *bias_p++, bias_shift);
-
-                int odd_rest_of_inp_size = (inp_size & 0x3);
-                for (int k = 0; k < odd_rest_of_inp_size; k++) {
-                    mli_prv_load_mac(&ip_out, in_ptr++, w_ptr++);
+
+                for (int r = (in_elements - remaining_part); r < in_elements; r++) {
+                    accu = fx_v2a40_mac_v2q15(accu, fx_replic_v2q15(in[r]), *(const v2q15_t *__restrict)(void *)&weights[o_idx + r * w_ch_out_mem_stride]);
                 }

-                int even_inp_size = inp_size - odd_rest_of_inp_size;
-LOOP_PIPELINE_ENABLE
-LOOP_PIPELINE_ENABLE_BACKTRACKING
-                for (int j = 0; j < (even_inp_size / 4); j++) {
-                    mli_prv_load_mac_vec4(&ip_out, in_ptr, w_ptr);
-                    in_ptr += 4;
-                    w_ptr += 4;
-                }
-                in_ptr -= inp_size;
-                w_ptr += w_ch_out_mem_stride - inp_size;
-                MLI_EXTRA_ASSERT(start_in_ptr == in_ptr);
-
-                mli_prv_clip_and_store_output(o_ptr++, &ip_out, out_shift);
+                // Cast the result to the output type, apply the built-in ReLU, and write the result
+                v2q15_t out_val = fx_v2q15_cast_nf_asr_rnd_v2a40(accu, out_shift);
+                out_val = fx_min_v2q15(out_val, vval_max_limit);
+                out_val = fx_max_v2q15(out_val, vval_min_limit);
+
+                v2q15_t *__restrict out_ptr = (v2q15_t *__restrict)(void *)&out[o_idx];
+                *out_ptr = out_val;
             }
+        } else {
+            mli::krn::ref::inner_product<io_T, w_T, b_T, acc_T, quant_T, no_zp>(in,
+                    weights,
+                    biases,
+                    out,
+                    in_elements,
+                    out_elements,
+                    w_ch_out_mem_stride,
+                    quant_params,
+                    val_min_limit,
+                    val_max_limit);
         }
-    }
-}
+    }
+    else
+#endif // #ifdef __Xdsp_wide
+    {
+        mli::krn::ref::inner_product<io_T, w_T, b_T, acc_T, quant_T, no_zp>(in,
+                weights,
+                biases,
+                out,
+                in_elements,
+                out_elements,
+                w_ch_out_mem_stride,
+                quant_params,
+                val_min_limit,
+                val_max_limit);
+    }

-template <typename io_T, typename w_T>
-static MLI_FORCE_INLINE void fully_connected_prepare_and_run_fx(
-        const mli_tensor* in,
-        const mli_tensor* weights,
-        const mli_tensor* bias,
-        mli_tensor* out) {
-    mli_prv_fx_init_dsp_ctrl();
-
-    const MLI_PTR(io_T) in_ptr = mli_prv_tensor_data_ptr<MLI_PTR(io_T)>(in);
-    const MLI_PTR(w_T) w_ptr = mli_prv_tensor_data_ptr<MLI_PTR(w_T)>(weights);
-    const MLI_PTR(w_T) b_ptr = mli_prv_tensor_data_ptr<MLI_PTR(w_T)>(bias);
-    MLI_CONV_OUT_PTR(io_T) out_ptr = mli_prv_tensor_data_ptr<MLI_CONV_OUT_PTR(io_T)>(out);
-
-    int ch_out = weights->shape[0];
-    int in_sz = mli_prv_count_elem_num(in);
-    int w_ch_out_mem_stride_from_tensor = weights->mem_stride[0];
-    int w_ch_out_mem_stride = (w_ch_out_mem_stride_from_tensor != 0) ?
-            w_ch_out_mem_stride_from_tensor : in_sz;
+}

-    // Define shift values
-    const int bias_shift = mli_prv_calc_shift(in, weights, bias);
-    const int out_shift = mli_prv_calc_shift(in, weights, out);
+#pragma MLI_CODE_SECTION_END()

-    // Run basic calculation
-    full_connection<io_T, w_T>(in_ptr, w_ptr, b_ptr, out_ptr, ch_out, in_sz, w_ch_out_mem_stride,
-            bias_shift, out_shift);
+} // namespace dsp
+} // namespace krn
+} // namespace mli

-    // fill output tensor parameters
-    out->el_type = in->el_type;
-    out->shape[0] = ch_out;
-    out->rank = 1;
-}
-#endif
+#endif // _MLI_KRN_FULLY_CONNECTED_DSP_H_
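
Note (not part of the diff): the comment block introduced in `inner_product` derives how the asymmetric dot product splits into a plain dot product plus `weights_additive`, `in_additive`, `zp_additive`, and `bias_additive`. The standalone sketch below checks that decomposition numerically for one output channel. It is plain C++ with illustrative values only; it does not use the MLI types or the ARC fx intrinsics.

```cpp
// Numeric check of: sum_i((x-x_zp)*(w-w_zp)) + b_r
//   == sum_i(x*w) - sum_i(w*x_zp) - sum_i(x*w_zp) + sum_i(w_zp*x_zp) + b_r
// All names and values here are illustrative, not MLI library code.
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

int main() {
    const std::vector<int32_t> x = {12, -7, 33, 5};   // input samples
    const std::vector<int32_t> w = {-2, 9, 4, -5};    // weights of one output channel
    const int32_t x_zp = 3;   // input zero point (asymmetric input)
    const int32_t w_zp = 0;   // symmetric weights: in_additive and zp_additive vanish
    const int32_t b_r = 100;  // bias already in accumulator scale

    // Direct form.
    int32_t direct = b_r;
    for (std::size_t i = 0; i < x.size(); ++i)
        direct += (x[i] - x_zp) * (w[i] - w_zp);

    // Decomposed form: generic dot product plus the additives named in the comment.
    int32_t dot = 0, weights_additive = 0, in_additive = 0, zp_additive = 0;
    for (std::size_t i = 0; i < x.size(); ++i) {
        dot += x[i] * w[i];
        weights_additive -= w[i] * x_zp;  // -sum_i(w * x_zp), reusable per output channel
        in_additive      -= x[i] * w_zp;  // -sum_i(x * w_zp), zero for symmetric weights
        zp_additive      += w_zp * x_zp;  //  sum_i(w_zp * x_zp), zero for symmetric weights
    }
    const int32_t decomposed = dot + weights_additive + in_additive + zp_additive + b_r;

    assert(direct == decomposed);
    return 0;
}
```

With symmetric weights (`w_zp == 0`) the `in_additive` and `zp_additive` terms drop out, which is exactly the precondition the DSP path checks through `quant_params_get_weigths_zeropoint`.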
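
Note (not part of the diff): the `__Xdsp_wide` branch processes two adjacent output channels per iteration. Each `fx_v2a40_mac_v2q15` broadcasts one input sample against a `v2q15_t` pair of weights loaded from `&weights[o_idx + i * w_ch_out_mem_stride]`, which is why the path requires an even `out_elements`. Below is a minimal scalar sketch of that pairing under the same weight layout; `int64_t` stands in for the 40-bit accumulators, and the bias pre-scaling, rounding shift, and saturation of the real kernel are omitted. Names are illustrative only.

```cpp
#include <cstdint>

// Scalar sketch of the two-channels-per-iteration pattern (not MLI library code).
void inner_product_pairwise_sketch(const int16_t* in, const int16_t* weights,
                                   const int16_t* biases, int16_t* out,
                                   int in_elements, int out_elements,
                                   int w_ch_out_mem_stride) {
    // Requires an even number of output channels, as the DSP path does.
    for (int o_idx = 0; o_idx < out_elements; o_idx += 2) {
        int64_t acc0 = biases[o_idx];
        int64_t acc1 = biases[o_idx + 1];
        for (int i = 0; i < in_elements; ++i) {
            // Weights for channels (o_idx, o_idx + 1) sit next to each other,
            // one row of the weight matrix apart per input element.
            const int16_t* w_pair = &weights[o_idx + i * w_ch_out_mem_stride];
            acc0 += static_cast<int64_t>(in[i]) * w_pair[0];
            acc1 += static_cast<int64_t>(in[i]) * w_pair[1];
        }
        out[o_idx]     = static_cast<int16_t>(acc0);  // real kernel: shift, round, clamp
        out[o_idx + 1] = static_cast<int16_t>(acc1);
    }
}
```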