Skip to content

Commit ecacab1

Browse files
committed
Add latest changes from the dev branch
2 parents 625ad7f + a7922d8 commit ecacab1

File tree

58 files changed

+1477
-763
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

58 files changed

+1477
-763
lines changed

cmake/settings.cmake

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,8 +71,8 @@ if (ARC)
7171
endif()
7272

7373
list(APPEND MLI_PLATFORM_FLAGS
74-
-Hon=Long_enums
75-
"SHELL: -mllvm -gen-lpcc=false"
74+
-Hon=Long_enums -Wcg,-arc-vdsp-AA=1
75+
"SHELL: -mllvm -gen-lpcc=false -mllvm -arc-sort-out-copy=true -mllvm -arc-vdsp-copy=3"
7676
)
7777
if (DEFINED BUILDLIB_DIR)
7878
list(APPEND MLI_PLATFORM_LINK_OPTIONS

doc/documents/mli_kernels/trans_leaky_relu.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,8 @@ satisfy the following conditions before calling the function:
9696
single scale factor and a single zero offset.
9797

9898
- Zero offset of ``in`` and ``out`` tensors must be within [-128, 127] range.
99+
100+
- Zero offset of ``slope_coeffs`` tensor must be within [-16384, 16383] range.
99101

100102
Depending on the debug level (see section :ref:`err_codes`) this function performs a parameter
101103
check and returns the result as an ``mli_status`` code as described in section :ref:`kernl_sp_conf`.

doc/documents/mli_kernels/trans_param_relu.rst

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,9 @@ satisfy the following conditions before calling the function:
126126
- ``in``, ``out`` and ``slope_coeff`` tensors must be quantized on the tensor level. This implies
127127
that the tensor contains a single scale factor and a single zero offset.
128128

129-
- Zero offset of ``in``, ``out`` and ``slope_coeffs`` tensors must be within [-128, 127] range.
129+
- Zero offset of ``in`` and ``out`` tensors must be within [-128, 127] range.
130+
131+
- Zero offset of ``slope_coeffs`` tensor must be within [-16384, 16383] range.
130132

131133
Depending on the debug level (see section :ref:`err_codes`) this function performs a parameter
132134
check and returns the result as an ``mli_status`` code as described in section :ref:`kernl_sp_conf`.

examples/example_cifar10_caffe/CMakeLists.txt

Lines changed: 37 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -12,18 +12,6 @@ file(GLOB temp
1212
../auxiliary/*.cc
1313
../auxiliary/*.c
1414
)
15-
add_executable(example_cifar10_caffe
16-
cifar10_coefficients_hwcn_small.c
17-
cifar10_model_hwcn.c
18-
ml_api_cifar10_caffe_main.c
19-
${temp}
20-
)
21-
22-
target_link_libraries(example_cifar10_caffe PUBLIC mli)
23-
24-
target_include_directories(example_cifar10_caffe PRIVATE
25-
../../examples/auxiliary
26-
)
2715

2816
if (ARC)
2917
set(EXAMPLE_FLAGS
@@ -47,24 +35,43 @@ else()
4735
set(EXAMPLE_FLAGS)
4836
endif()
4937

50-
target_compile_options(example_cifar10_caffe PRIVATE
51-
${MLI_PLATFORM_COMPILE_OPTIONS}
52-
${EXAMPLE_FLAGS}
53-
)
38+
function(ADD_DATA_TYPE DATA_TYPE BIT_DEPTH)
39+
set(TARGET example_cifar10_caffe_${DATA_TYPE})
40+
add_executable(${TARGET}
41+
cifar10_coefficients_hwcn_small.c
42+
cifar10_model_hwcn.c
43+
ml_api_cifar10_caffe_main.c
44+
${temp}
45+
)
46+
target_link_libraries(${TARGET} PUBLIC mli)
5447

55-
target_link_options(example_cifar10_caffe PRIVATE
56-
${MLI_PLATFORM_LINK_OPTIONS}
57-
${EXAMPLE_FLAGS}
58-
)
48+
target_include_directories(${TARGET} PRIVATE
49+
../../examples/auxiliary
50+
)
51+
target_compile_options(${TARGET} PRIVATE
52+
${MLI_PLATFORM_COMPILE_OPTIONS}
53+
${EXAMPLE_FLAGS}
54+
)
55+
target_compile_definitions(${TARGET} PRIVATE
56+
MODEL_BIT_DEPTH=${BIT_DEPTH})
57+
target_link_options(${TARGET} PRIVATE
58+
${MLI_PLATFORM_LINK_OPTIONS}
59+
${EXAMPLE_FLAGS}
60+
)
61+
if (ARC)
62+
target_link_options(${TARGET} PRIVATE
63+
-m
64+
-Coutput=./elf_files/${TARGET}.map
65+
)
66+
endif()
5967

60-
if (ARC)
61-
target_link_options(example_cifar10_caffe PRIVATE
62-
-m
63-
-Coutput=./elf_files/example_cifar10_caffe.map
64-
)
65-
endif()
68+
set_target_properties(${TARGET}
69+
PROPERTIES
70+
RUNTIME_OUTPUT_DIRECTORY "./elf_files$<0:>"
71+
)
72+
endfunction()
73+
74+
ADD_DATA_TYPE(FX16 16)
75+
ADD_DATA_TYPE(SA8 8)
76+
ADD_DATA_TYPE(FX16_FX8_FX8 816)
6677

67-
set_target_properties(example_cifar10_caffe
68-
PROPERTIES
69-
RUNTIME_OUTPUT_DIRECTORY "./elf_files$<0:>"
70-
)

examples/example_cifar10_caffe/Makefile

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,16 +15,25 @@ PUBLIC_DIR = ../..
1515
include $(PUBLIC_DIR)/make/rules.mk
1616

1717
BUILD_SUBDIR = examples$(PS)example_cifar10_caffe
18-
BUILD_TARGET = example_cifar10_caffe
18+
19+
20+
TYPES = \
21+
FX16 \
22+
SA8 \
23+
FX16_FX8_FX8
24+
25+
RUN_TARGETS = $(patsubst %,run_%,$(TYPES))
1926

2027
app: build
2128

22-
run: app
29+
run: run_FX16
30+
31+
$(RUN_TARGETS): run_%: app
2332
ifndef TCF_FILE
2433
$(BUILD_DIR)$(PS)$(BUILD_SUBDIR)$(PS)elf_files$(PS)example_cifar10_caffe_$* $(RUN_ARGS)
2534
else
26-
mdb $(BUILD_DIR)$(PS)$(BUILD_SUBDIR)$(PS)elf_files$(PS)example_cifar10_caffe.elf \
35+
mdb $(BUILD_DIR)$(PS)$(BUILD_SUBDIR)$(PS)elf_files$(PS)example_cifar10_caffe_$*.elf \
2736
-cl -nsim -cmd=run -off=cr_for_more -cmd=exit -tcf=$(TCF_FILE) -- $(RUN_ARGS)
2837
endif
2938

30-
.PHONY: app run
39+
.PHONY: app run $(RUN_TARGETS)

examples/example_cifar10_caffe/README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -144,15 +144,15 @@ For this reason you can build and check application with 8 and 16 bit depth of N
144144

145145
* 16 bit depth of coefficients and data (FX16) (default):
146146

147-
gmake TCF_FILE=../../hw/em9d.tcf EXT_CFLAGS="-DMODEL_BIT_DEPTH=16"
147+
gmake run_FX16 TCF_FILE=../../hw/em9d.tcf
148148

149149
* 8 bit depth of coefficients and data (SA8):
150150

151-
gmake TCF_FILE=../../hw/em9d.tcf EXT_CFLAGS="-DMODEL_BIT_DEPTH=8"
151+
gmake run_SA8 TCF_FILE=../../hw/em9d.tcf
152152

153153
* 8x16: 8 bit depth of coefficients and 16 bit depth of data (FX8 weights and FX16 data):
154154

155-
gmake TCF_FILE=../../hw/em9d.tcf EXT_CFLAGS="-DMODEL_BIT_DEPTH=816"
155+
gmake run_FX16_FX8_FX8 TCF_FILE=../../hw/em9d.tcf
156156

157157
Example application may be used in three modes:
158158
1. **Built-in input processing.** Uses only hard-coded vector for the single input model inference.

lib/mli_lib.cmake

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -73,24 +73,32 @@ set(MLI_LIB_PRIVATE_INCLUDES
7373
${MLI_LIB_CMAKE_DIR}/src/pal
7474
)
7575

76+
set(MLI_LIB_PRIVATE_COMPILE_OPTIONS )
77+
7678
if (ARC)
77-
set(MLI_LIB_PRIVATE_COMPILE_OPTIONS
79+
list(APPEND MLI_LIB_PRIVATE_COMPILE_OPTIONS
7880
-Hnocopyr
7981
-Hpurge
8082
-Hsdata0
8183
-Hdense_prologue
84+
-tcf_core_config
85+
)
86+
endif()
87+
88+
if (ARC)
89+
list(APPEND MLI_LIB_PRIVATE_COMPILE_OPTIONS
90+
-Werror
8291
-Wall
8392
-Wno-nonportable-include-path
84-
-tcf_core_config
8593
)
8694
elseif (MSVC)
87-
set(MLI_LIB_PRIVATE_COMPILE_OPTIONS
88-
/W3
95+
list(APPEND MLI_LIB_PRIVATE_COMPILE_OPTIONS
96+
/W2
97+
/WX
8998
)
9099
else()
91-
set(MLI_LIB_PRIVATE_COMPILE_OPTIONS
100+
list(APPEND MLI_LIB_PRIVATE_COMPILE_OPTIONS
92101
-Werror
93-
-Wno-nonportable-include-path
94102
)
95103
endif()
96104

lib/src/bricks/impl/mli_krn_dotprod_vdsp.h

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,53 @@ static MLI_FORCE_INLINE acc_T dotprod2D_vv(
170170
#pragma clang diagnostic pop
171171
}
172172

173+
template <typename io_T, typename w_T, typename acc_T>
174+
static MLI_FORCE_INLINE acc_T dotprod2D_vv_ptrvector(
175+
const MLI_PTR(io_T) __restrict in,
176+
const MLI_PTR(w_T) __restrict krn,
177+
acc_T accu,
178+
const int width,
179+
const int height,
180+
int in_col_step,
181+
int in_row_step,
182+
int kern_col_step,
183+
int kern_row_step) {
184+
int in_row_step_orig = in_row_step;
185+
in_row_step -= width * in_col_step;
186+
kern_row_step -= width * kern_col_step;
187+
188+
vNint_t addr_vec = 0;
189+
int i = 0;
190+
int offset = in_row_step_orig * sizeof(io_T);
191+
#pragma clang loop unroll(full)
192+
for (int row = 1; row < height; row++) {
193+
addr_vec[i++] = offset;
194+
offset += in_row_step_orig * sizeof(io_T);
195+
}
196+
i = 0;
197+
addr_vec += (int)in;
198+
199+
for (int clmn = 0; clmn < width; clmn++) {
200+
accu = mli_prv_mac_load_v_v(accu, krn, in);
201+
in += in_col_step;
202+
krn += kern_col_step;
203+
}
204+
krn += kern_row_step;
205+
206+
#pragma clang loop unroll(full)
207+
for (int row = 1; row < height; row++) {
208+
MLI_PTR(io_T) __restrict in_ptr = (MLI_PTR(io_T))addr_vec[i++];
209+
#pragma clang loop unroll(full)
210+
for (int clmn = 0; clmn < width; clmn++) {
211+
accu = mli_prv_mac_load_v_v(accu, krn, in_ptr);
212+
in_ptr += in_col_step;
213+
krn += kern_col_step;
214+
}
215+
krn += kern_row_step;
216+
}
217+
return accu;
218+
}
219+
173220
template < typename in_T, typename w_T, typename acc_T >
174221
static MLI_FORCE_INLINE acc_T dotprod3D_v_pad (
175222
const MLI_PTR (in_T) __restrict in,

lib/src/bricks/impl/mli_krn_rnn_dense_op_ref.h

Lines changed: 10 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -122,9 +122,11 @@ static inline void rnn_dense_op(
122122
}
123123

124124
for (int o_idx = 0; o_idx < out_elements; o_idx++) {
125-
io_T out_val = 0;
125+
126126
acc_T accu = mli_math_mul_fx<io_T, acc_T>(0, 0);
127-
acc_T prev_step = mli_math_mul_fx<io_T, acc_T>(0, 0);
127+
acc_T acc_ir = mli_math_mul_fx<io_T, acc_T>(0, 0);
128+
acc_T acc_res_ir = mli_math_mul_fx<io_T, acc_T>(0, 0);
129+
128130
accu = mli::krn::bias_additive(&bias[o_idx], accu, &in_to_out_quant_params[0]);
129131

130132
for(int idx = 0; idx < inputs_num; idx++) {
@@ -137,20 +139,14 @@ static inline void rnn_dense_op(
137139
in_elements[idx], /* height= */ 1, /* ch= */ 1, w_ch_out_mem_strides[idx],
138140
/* row_step= */ 1, /* ch_step= */ 1);
139141
accu = mli_math_add_fx(accu, other_additives[idx]);
140-
accu = mli_math_add_fx(accu, prev_step);
141-
142-
if(inputs_num - idx != 1) {
143-
prev_step = mli::krn::ref::ir_rnn_result_requantize(accu, &in_to_out_quant_params[idx],
144-
&in_to_out_quant_params[idx+1], /* krn_idx= */ 0);
145-
accu = mli_math_mul_fx<io_T, acc_T>(0, 0);
146-
} else {
147-
out_val = mli::krn::ref::result_cast<io_T, acc_T, quant_T>(accu, &in_to_out_quant_params[idx]);
148-
}
142+
143+
acc_ir = mli::krn::ir_rnn_result_requantize<acc_T>(accu, &in_to_out_quant_params[idx]);
144+
acc_res_ir = mli_math_add_fx(acc_res_ir, acc_ir);
145+
accu = mli_math_mul_fx<io_T, acc_T>(0, 0);
149146
}
150147

151-
out_val = MIN(out_val, val_max_limit);
152-
out_val = MAX(out_val, val_min_limit);
153-
out[o_idx] = out_val;
148+
out[o_idx] = mli::krn::ir_result_cast_relu_store<io_T, acc_T, quant_T>(acc_res_ir,
149+
&in_to_out_quant_params[inputs_num - 1], val_min_limit, val_max_limit);
154150
}
155151
}
156152

lib/src/bricks/impl/mli_krn_rnn_dense_op_vdsp.h

Lines changed: 40 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,31 @@ static inline void rnn_dense_op_stacked(
9494
dense_out_ptr -= gates_num * out_elements;
9595
}
9696

97+
MLI_FORCE_INLINE vNx4int_t mli_math_add_accus(vNx4int_t L, vNx4int_t R) {
98+
return mli_math_add_fx(L, R);
99+
}
100+
101+
MLI_FORCE_INLINE vNx2accint_t mli_math_add_accus(vNx2accint_t L, vNx2accint_t R) {
102+
return mli_math_add(L, R);
103+
}
104+
105+
MLI_FORCE_INLINE vNx4accint_t mli_math_add_accus(vNx4accint_t L, vNx4accint_t R) {
106+
return mli_math_add(L, R);
107+
}
108+
109+
MLI_FORCE_INLINE vNx4accshort_t mli_math_add_accus(vNx4accshort_t L, vNx4accshort_t R) {
110+
#if (__Xvec_guard_bit_option == 0)
111+
vNx4short_t L_short = mli_math_acc_cast_fx<vNx4short_t, vNx4accshort_t>(L);
112+
vNx4short_t R_short = mli_math_acc_cast_fx<vNx4short_t, vNx4accshort_t>(R);
113+
114+
vNx4short_t res = mli_math_add_fx<vNx4short_t>(L_short, R_short);
115+
116+
return mli_math_init_accu_add<vNx4short_t, vNx4accshort_t>(res, (vNx4short_t)0);
117+
#else
118+
return mli_math_add(L, R);
119+
#endif
120+
}
121+
97122
template <typename io_T, typename w_T, typename b_T, typename acc_T, typename quant_T>
98123
static inline void rnn_dense_op(
99124
const MLI_PTR(io_T) __restrict * inputs,
@@ -107,14 +132,16 @@ static inline void rnn_dense_op(
107132
quant_T * in_to_out_quant_params,
108133
const io_T val_min_limit,
109134
const io_T val_max_limit) {
110-
135+
typedef typename std::conditional<std::is_same<acc_T, vNx4accshort_t>::value, vNx4int_t, acc_T>::type ir_T;
111136
int num_lanes = get_number_lanes<acc_T>();
137+
112138
for (int o_idx = 0; o_idx < out_elements; o_idx += num_lanes) {
113139
int remaining_ch = out_elements - o_idx;
114140
int current_chs = MIN(remaining_ch, num_lanes); // number of channels computed in this loop iteration
115141

116-
acc_T accu = mli_math_mul_fx<io_T, acc_T>(0, 0);
117-
acc_T prev_step = mli_math_mul_fx<io_T, acc_T>(0, 0);
142+
acc_T accu = mli_prv_init_accu<acc_T>();
143+
ir_T acc_ir = mli_prv_init_accu<ir_T>();
144+
ir_T acc_res_ir = mli_prv_init_accu<ir_T>();
118145

119146
auto output_params = adjust_quant_params_v(&in_to_out_quant_params[0], 0);
120147
accu = mli::krn::bias_additive(&bias[o_idx], accu, &output_params, /* add_preshift_rnd */ false);
@@ -124,20 +151,18 @@ static inline void rnn_dense_op(
124151
output_params = adjust_quant_params_v(&in_to_out_quant_params[idx], 0);
125152
accu = dotprod_inputzp_1D_v(inputs[idx], &weights[idx][o_idx], accu, in_elements[idx],
126153
1, w_ch_out_mem_strides[idx], &in_to_out_quant_params[idx]);
127-
accu = mli_math_add(accu, prev_step);
128-
129-
if(inputs_num - idx != 1) {
130-
mli::krn::ref::adjust_quant_params(&in_to_out_quant_params[idx], o_idx);
131-
prev_step = mli::krn::ir_rnn_result_requantize(accu, &in_to_out_quant_params[idx],
132-
&in_to_out_quant_params[idx + 1], /* krn_idx= */ 0);
133-
accu = mli_math_mul_fx<io_T, acc_T>(0, 0);
134-
} else {
135-
// Cast result to output type with scaling
136-
mli::krn::result_cast_relu_store_v(&out[o_idx], accu, &output_params,
137-
val_min_limit, val_max_limit, current_chs, /* add_preshift_rnd */ true);
138-
}
154+
155+
/* TODO: can be optimized using adjust_quant_params_v, and also optimize ir_rnn_result_requantize function */
156+
mli::krn::ref::adjust_quant_params(&in_to_out_quant_params[idx], o_idx);
157+
acc_ir = mli::krn::ir_rnn_result_requantize<acc_T, ir_T>(accu, &in_to_out_quant_params[idx]);
158+
159+
acc_res_ir = mli_math_add_accus(acc_res_ir, acc_ir);
160+
accu = mli_prv_init_accu<acc_T>();
139161
}
140162

163+
// Cast result to output type with scaling
164+
mli::krn::ir_result_cast_relu_store_v(&out[o_idx], acc_res_ir, &output_params,
165+
val_min_limit, val_max_limit, current_chs);
141166
}
142167
}
143168

0 commit comments

Comments
 (0)