foss-for-synopsys-dwc-arc-processors
diff --git a/‎cmake/settings.cmake‎
Lines changed: 2 additions & 2 deletions b/‎cmake/settings.cmake‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎doc/documents/mli_kernels/trans_leaky_relu.rst‎
Lines changed: 2 additions & 0 deletions b/‎doc/documents/mli_kernels/trans_leaky_relu.rst‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎doc/documents/mli_kernels/trans_param_relu.rst‎
Lines changed: 3 additions & 1 deletion b/‎doc/documents/mli_kernels/trans_param_relu.rst‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎examples/example_cifar10_caffe/CMakeLists.txt‎
Lines changed: 37 additions & 30 deletions b/‎examples/example_cifar10_caffe/CMakeLists.txt‎
Lines changed: 37 additions & 30 deletions
diff --git a/‎examples/example_cifar10_caffe/Makefile‎
Lines changed: 13 additions & 4 deletions b/‎examples/example_cifar10_caffe/Makefile‎
Lines changed: 13 additions & 4 deletions
diff --git a/‎examples/example_cifar10_caffe/README.md‎
Lines changed: 3 additions & 3 deletions b/‎examples/example_cifar10_caffe/README.md‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎lib/mli_lib.cmake‎
Lines changed: 14 additions & 6 deletions b/‎lib/mli_lib.cmake‎
Lines changed: 14 additions & 6 deletions
diff --git a/‎lib/src/bricks/impl/mli_krn_dotprod_vdsp.h‎
Lines changed: 47 additions & 0 deletions b/‎lib/src/bricks/impl/mli_krn_dotprod_vdsp.h‎
Lines changed: 47 additions & 0 deletions
diff --git a/‎lib/src/bricks/impl/mli_krn_rnn_dense_op_ref.h‎
Lines changed: 10 additions & 14 deletions b/‎lib/src/bricks/impl/mli_krn_rnn_dense_op_ref.h‎
Lines changed: 10 additions & 14 deletions
diff --git a/‎lib/src/bricks/impl/mli_krn_rnn_dense_op_vdsp.h‎
Lines changed: 40 additions & 15 deletions b/‎lib/src/bricks/impl/mli_krn_rnn_dense_op_vdsp.h‎
Lines changed: 40 additions & 15 deletions
@@ -71,8 +71,8 @@ if (ARC)
     endif()
 
     list(APPEND MLI_PLATFORM_FLAGS
-        -Hon=Long_enums
-        "SHELL: -mllvm -gen-lpcc=false"
+        -Hon=Long_enums -Wcg,-arc-vdsp-AA=1 
+        "SHELL: -mllvm -gen-lpcc=false -mllvm -arc-sort-out-copy=true -mllvm -arc-vdsp-copy=3"
     )
     if (DEFINED BUILDLIB_DIR)
         list(APPEND MLI_PLATFORM_LINK_OPTIONS
 
@@ -96,6 +96,8 @@ satisfy the following conditions before calling the function:
    single scale factor and a single zero offset.
 
  - Zero offset of ``in`` and ``out`` tensors must be within [-128, 127] range.
+
+ - Zero offset of ``slope_coeffs`` tensor must be within [-16384, 16383] range.
 
 Depending on the debug level (see section :ref:`err_codes`) this function performs a parameter 
 check and returns the result as an ``mli_status`` code as described in section :ref:`kernl_sp_conf`.
@@ -126,7 +126,9 @@ satisfy the following conditions before calling the function:
  - ``in``, ``out`` and ``slope_coeff`` tensors must be quantized on the tensor level. This implies 
    that the tensor contains a single scale factor and a single zero offset.
 
- - Zero offset of ``in``, ``out`` and ``slope_coeffs`` tensors must be within [-128, 127] range.
+ - Zero offset of ``in`` and ``out`` tensors must be within [-128, 127] range.
+
+ - Zero offset of ``slope_coeffs`` tensor must be within [-16384, 16383] range.
 
 Depending on the debug level (see section :ref:`err_codes`) this function performs a parameter 
 check and returns the result as an ``mli_status`` code as described in section :ref:`kernl_sp_conf`.
@@ -12,18 +12,6 @@ file(GLOB temp
     ../auxiliary/*.cc
     ../auxiliary/*.c
 )
-add_executable(example_cifar10_caffe
-    cifar10_coefficients_hwcn_small.c
-    cifar10_model_hwcn.c
-    ml_api_cifar10_caffe_main.c
-    ${temp}
-)
-
-target_link_libraries(example_cifar10_caffe PUBLIC mli)
-
-target_include_directories(example_cifar10_caffe PRIVATE
-    ../../examples/auxiliary
-)
 
 if (ARC)
 set(EXAMPLE_FLAGS
@@ -47,24 +35,45 @@ else()
 set(EXAMPLE_FLAGS)
 endif()
 
-target_compile_options(example_cifar10_caffe PRIVATE
-    ${MLI_PLATFORM_COMPILE_OPTIONS}
-    ${EXAMPLE_FLAGS}
-)
+# ADD_DATA_TYPE(<DATA_TYPE> <BIT_DEPTH>)
+# Creates the executable example_cifar10_caffe_<DATA_TYPE> from the example
+# sources, compiled with MODEL_BIT_DEPTH=<BIT_DEPTH> and linked against mli.
+function(ADD_DATA_TYPE DATA_TYPE BIT_DEPTH)
+    set(example_target example_cifar10_caffe_${DATA_TYPE})
+
+    add_executable(${example_target}
+        cifar10_coefficients_hwcn_small.c
+        cifar10_model_hwcn.c
+        ml_api_cifar10_caffe_main.c
+        ${temp}
+    )
+    # $<0:> evaluates to an empty string; being a generator expression, it keeps
+    # multi-config generators from appending a per-configuration subdirectory.
+    set_target_properties(${example_target} PROPERTIES
+        RUNTIME_OUTPUT_DIRECTORY "./elf_files$<0:>"
+    )
 
-target_link_options(example_cifar10_caffe PRIVATE
-    ${MLI_PLATFORM_LINK_OPTIONS}
-    ${EXAMPLE_FLAGS}
-)
+    target_link_libraries(${example_target} PUBLIC mli)
+    target_include_directories(${example_target} PRIVATE ../../examples/auxiliary)
+    target_compile_definitions(${example_target} PRIVATE MODEL_BIT_DEPTH=${BIT_DEPTH})
+    target_compile_options(${example_target} PRIVATE
+        ${MLI_PLATFORM_COMPILE_OPTIONS}
+        ${EXAMPLE_FLAGS}
+    )
 
-if (ARC)
-target_link_options(example_cifar10_caffe PRIVATE
-    -m
-    -Coutput=./elf_files/example_cifar10_caffe.map
-)
-endif()
+    # Link options keep their order: platform and example flags first, then,
+    # for ARC only, -m and -Coutput pointing at a per-target .map file.
+    set(example_link_options ${MLI_PLATFORM_LINK_OPTIONS} ${EXAMPLE_FLAGS})
+    if (ARC)
+        list(APPEND example_link_options
+            -m
+            -Coutput=./elf_files/${example_target}.map
+        )
+    endif()
+    target_link_options(${example_target} PRIVATE ${example_link_options})
+endfunction()
+
+ADD_DATA_TYPE(FX16 16)
+ADD_DATA_TYPE(SA8 8)
+ADD_DATA_TYPE(FX16_FX8_FX8 816)
 
-set_target_properties(example_cifar10_caffe
-    PROPERTIES
-    RUNTIME_OUTPUT_DIRECTORY "./elf_files$<0:>"
-)
 
@@ -15,16 +15,29 @@ PUBLIC_DIR = ../..
 include $(PUBLIC_DIR)/make/rules.mk
 
 BUILD_SUBDIR = examples$(PS)example_cifar10_caffe
-BUILD_TARGET = example_cifar10_caffe
+
+# Data-type configurations of the example. Each entry <TYPE> has a matching
+# run_<TYPE> target that runs the example_cifar10_caffe_<TYPE> binary.
+TYPES = \
+	FX16 \
+	SA8 \
+	FX16_FX8_FX8
+
+RUN_TARGETS = $(patsubst %,run_%,$(TYPES))
 
 app: build
 
-run: app
+# Plain `run` is kept as a shortcut for the FX16 configuration.
+run: run_FX16
+
+# run_<TYPE>: $* expands to <TYPE>. Without TCF_FILE the binary is executed
+# directly; with TCF_FILE it is run under mdb (-nsim) using that TCF.
+$(RUN_TARGETS): run_%: app
 ifndef TCF_FILE
-	$(BUILD_DIR)$(PS)$(BUILD_SUBDIR)$(PS)elf_files$(PS)example_cifar10_caffe $(RUN_ARGS)
+	$(BUILD_DIR)$(PS)$(BUILD_SUBDIR)$(PS)elf_files$(PS)example_cifar10_caffe_$* $(RUN_ARGS)
 else
-	mdb $(BUILD_DIR)$(PS)$(BUILD_SUBDIR)$(PS)elf_files$(PS)example_cifar10_caffe.elf \
+	mdb $(BUILD_DIR)$(PS)$(BUILD_SUBDIR)$(PS)elf_files$(PS)example_cifar10_caffe_$*.elf \
 		-cl  -nsim -cmd=run -off=cr_for_more -cmd=exit -tcf=$(TCF_FILE) -- $(RUN_ARGS)
 endif
 
-.PHONY:	app run
+.PHONY:	app run $(RUN_TARGETS)
@@ -144,15 +144,15 @@ For this reason you can build and check application with 8 and 16 bit depth of N
 
 * 16 bit depth of coefficients and data (FX16) (default):
 
-       gmake TCF_FILE=../../hw/em9d.tcf EXT_CFLAGS="-DMODEL_BIT_DEPTH=16"
+       gmake run_FX16 TCF_FILE=../../hw/em9d.tcf
 
 * 8 bit depth of coefficients and data (SA8):
 
-       gmake TCF_FILE=../../hw/em9d.tcf EXT_CFLAGS="-DMODEL_BIT_DEPTH=8"
+       gmake run_SA8 TCF_FILE=../../hw/em9d.tcf
 
 * 8x16: 8 bit depth of coefficients and 16 bit depth of data (FX8 weights and FX16 data):
 
-       gmake TCF_FILE=../../hw/em9d.tcf EXT_CFLAGS="-DMODEL_BIT_DEPTH=816"
+       gmake run_FX16_FX8_FX8 TCF_FILE=../../hw/em9d.tcf
 
 Example application may be used in three modes:
 1. **Built-in input processing.** Uses only hard-coded vector for the single input model inference. 
 
@@ -73,24 +73,36 @@ set(MLI_LIB_PRIVATE_INCLUDES
     ${MLI_LIB_CMAKE_DIR}/src/pal
 )
 
+# Compile options applied privately to the MLI library. The list starts empty
+# so that each section below can append to it independently.
+set(MLI_LIB_PRIVATE_COMPILE_OPTIONS )
+
+# ARC-only toolchain options.
 if (ARC)
-    set(MLI_LIB_PRIVATE_COMPILE_OPTIONS
+    list(APPEND MLI_LIB_PRIVATE_COMPILE_OPTIONS
         -Hnocopyr
         -Hpurge
         -Hsdata0
         -Hdense_prologue
+        -tcf_core_config
+    )
+endif()
+
+# Warning policy per toolchain; every branch turns warnings into errors.
+if (ARC)
+    list(APPEND MLI_LIB_PRIVATE_COMPILE_OPTIONS
+        -Werror
         -Wall
         -Wno-nonportable-include-path
-        -tcf_core_config
     )
 elseif (MSVC)
-    set(MLI_LIB_PRIVATE_COMPILE_OPTIONS
-        /W3
+    list(APPEND MLI_LIB_PRIVATE_COMPILE_OPTIONS
+        /W2
+        /WX
     )
 else()
-    set(MLI_LIB_PRIVATE_COMPILE_OPTIONS
+    list(APPEND MLI_LIB_PRIVATE_COMPILE_OPTIONS
         -Werror
-        -Wno-nonportable-include-path
     )
 endif()
 
 
@@ -170,6 +170,65 @@ static MLI_FORCE_INLINE acc_T dotprod2D_vv(
 #pragma clang diagnostic pop
 }
 
+// Accumulates mli_prv_mac_load_v_v(accu, krn, in) over a width x height window and
+// returns the updated accumulator.
+//
+// Row 0 is read through `in`; rows 1..height-1 are read through pointers taken
+// from addr_vec, which is filled up front with in + row * in_row_step (in bytes).
+// krn advances by kern_col_step per column and by a net kern_row_step per row.
+//
+// NOTE(review): addr_vec gets one lane per extra row and holds addresses as int,
+// so this presumably requires height - 1 <= lanes of vNint_t and pointers that
+// fit in an int -- TODO confirm against callers/target.
+template <typename io_T, typename w_T, typename acc_T>
+static MLI_FORCE_INLINE acc_T dotprod2D_vv_ptrvector(
+        const MLI_PTR(io_T) __restrict in,
+        const MLI_PTR(w_T)  __restrict krn,
+        acc_T accu,
+        const int width,
+        const int height,
+        int in_col_step,
+        int in_row_step,
+        int kern_col_step,
+        int kern_row_step) {
+    // After `width` column steps, this remainder completes one kern_row_step.
+    kern_row_step -= width * kern_col_step;
+
+    // Byte offsets of rows 1..height-1 relative to `in`.
+    vNint_t addr_vec = 0;
+    int i = 0;
+    int offset = in_row_step * sizeof(io_T);
+#pragma clang loop unroll(full)
+    for (int row = 1; row < height; row++) {
+        addr_vec[i++] = offset;
+        offset += in_row_step * sizeof(io_T);
+    }
+    i = 0;
+    // Adding the base address turns the offsets into absolute addresses.
+    addr_vec += (int)in;
+
+    // Row 0 is walked directly through `in`.
+    for (int clmn = 0; clmn < width; clmn++) {
+        accu = mli_prv_mac_load_v_v(accu, krn, in);
+        in += in_col_step;
+        krn += kern_col_step;
+    }
+    krn += kern_row_step;
+
+#pragma clang loop unroll(full)
+    for (int row = 1; row < height; row++) {
+        MLI_PTR(io_T) __restrict in_ptr = (MLI_PTR(io_T))addr_vec[i++];
+#pragma clang loop unroll(full)
+        for (int clmn = 0; clmn < width; clmn++) {
+            accu = mli_prv_mac_load_v_v(accu, krn, in_ptr);
+            in_ptr += in_col_step;
+            krn += kern_col_step;
+        }
+        krn += kern_row_step;
+    }
+    return accu;
+}
+
 template < typename in_T, typename w_T, typename acc_T >
 static MLI_FORCE_INLINE acc_T dotprod3D_v_pad (
         const MLI_PTR (in_T) __restrict in,
 
@@ -122,9 +122,11 @@ static inline void rnn_dense_op(
     }
 
     for (int o_idx = 0; o_idx < out_elements; o_idx++) {
-        io_T out_val = 0; 
+
         acc_T accu = mli_math_mul_fx<io_T, acc_T>(0, 0);
-        acc_T prev_step = mli_math_mul_fx<io_T, acc_T>(0, 0);
+        acc_T acc_ir = mli_math_mul_fx<io_T, acc_T>(0, 0);
+        acc_T acc_res_ir = mli_math_mul_fx<io_T, acc_T>(0, 0);
+
         accu = mli::krn::bias_additive(&bias[o_idx], accu, &in_to_out_quant_params[0]);
 
         for(int idx = 0; idx < inputs_num; idx++) {
@@ -137,20 +139,14 @@ static inline void rnn_dense_op(
                     in_elements[idx], /* height= */ 1, /* ch= */ 1, w_ch_out_mem_strides[idx], 
                     /* row_step= */ 1, /* ch_step= */ 1);
             accu = mli_math_add_fx(accu, other_additives[idx]);
-            accu = mli_math_add_fx(accu, prev_step);
-
-            if(inputs_num - idx != 1) {
-                prev_step = mli::krn::ref::ir_rnn_result_requantize(accu, &in_to_out_quant_params[idx],
-                                &in_to_out_quant_params[idx+1], /* krn_idx= */ 0);
-                accu = mli_math_mul_fx<io_T, acc_T>(0, 0);
-            } else {
-                out_val = mli::krn::ref::result_cast<io_T, acc_T, quant_T>(accu, &in_to_out_quant_params[idx]);
-            }
+
+            acc_ir = mli::krn::ir_rnn_result_requantize<acc_T>(accu, &in_to_out_quant_params[idx]);
+            acc_res_ir = mli_math_add_fx(acc_res_ir, acc_ir);
+            accu = mli_math_mul_fx<io_T, acc_T>(0, 0);
         }
 
-        out_val = MIN(out_val, val_max_limit);
-        out_val = MAX(out_val, val_min_limit);
-        out[o_idx] = out_val;
+        out[o_idx] = mli::krn::ir_result_cast_relu_store<io_T, acc_T, quant_T>(acc_res_ir,
+        		&in_to_out_quant_params[inputs_num - 1], val_min_limit, val_max_limit);
     }
 }
 
 
@@ -94,6 +94,37 @@ static inline void rnn_dense_op_stacked(
     dense_out_ptr -= gates_num * out_elements;
 }
 
+// mli_math_add_accus: adds two accumulators of the same vector type. One overload
+// exists per accumulator type so callers can sum intermediate results generically.
+MLI_FORCE_INLINE vNx4int_t mli_math_add_accus(vNx4int_t L, vNx4int_t R) {
+    return mli_math_add_fx(L, R);
+}
+
+MLI_FORCE_INLINE vNx2accint_t mli_math_add_accus(vNx2accint_t L, vNx2accint_t R) {
+    return mli_math_add(L, R);
+}
+
+MLI_FORCE_INLINE vNx4accint_t mli_math_add_accus(vNx4accint_t L, vNx4accint_t R) {
+    return mli_math_add(L, R);
+}
+
+// Without guard bits (__Xvec_guard_bit_option == 0) the operands are cast to
+// vNx4short_t, added with mli_math_add_fx, and the sum is loaded back into an
+// accumulator; otherwise mli_math_add is applied to the accumulators directly.
+// NOTE(review): presumably done to saturate instead of wrapping -- TODO confirm.
+MLI_FORCE_INLINE vNx4accshort_t mli_math_add_accus(vNx4accshort_t L, vNx4accshort_t R) {
+#if (__Xvec_guard_bit_option == 0)
+    vNx4short_t L_short = mli_math_acc_cast_fx<vNx4short_t, vNx4accshort_t>(L);
+    vNx4short_t R_short = mli_math_acc_cast_fx<vNx4short_t, vNx4accshort_t>(R);
+
+    vNx4short_t res = mli_math_add_fx<vNx4short_t>(L_short, R_short);
+
+    return mli_math_init_accu_add<vNx4short_t, vNx4accshort_t>(res, (vNx4short_t)0);
+#else
+    return mli_math_add(L, R);
+#endif
+}
+
 template <typename io_T, typename w_T, typename b_T, typename acc_T, typename quant_T>
 static inline void rnn_dense_op(
         const MLI_PTR(io_T) __restrict * inputs,
@@ -107,14 +132,16 @@ static inline void rnn_dense_op(
         quant_T * in_to_out_quant_params,
         const io_T val_min_limit,
         const io_T val_max_limit) {
-
+    typedef typename std::conditional<std::is_same<acc_T, vNx4accshort_t>::value, vNx4int_t, acc_T>::type ir_T;
     int num_lanes = get_number_lanes<acc_T>();
+
     for (int o_idx = 0; o_idx < out_elements; o_idx += num_lanes) {
         int remaining_ch = out_elements - o_idx;
         int current_chs = MIN(remaining_ch, num_lanes); // number of channels computed in this loop iteration
 
-        acc_T accu = mli_math_mul_fx<io_T, acc_T>(0, 0);
-        acc_T prev_step = mli_math_mul_fx<io_T, acc_T>(0, 0);
+        acc_T accu = mli_prv_init_accu<acc_T>();
+        ir_T acc_ir = mli_prv_init_accu<ir_T>();
+        ir_T acc_res_ir = mli_prv_init_accu<ir_T>();
 
         auto output_params = adjust_quant_params_v(&in_to_out_quant_params[0], 0);
         accu = mli::krn::bias_additive(&bias[o_idx], accu, &output_params, /* add_preshift_rnd */ false);
@@ -124,20 +151,18 @@ static inline void rnn_dense_op(
             output_params = adjust_quant_params_v(&in_to_out_quant_params[idx], 0);
             accu = dotprod_inputzp_1D_v(inputs[idx], &weights[idx][o_idx], accu, in_elements[idx],
                     1, w_ch_out_mem_strides[idx], &in_to_out_quant_params[idx]);
-            accu = mli_math_add(accu, prev_step);
-
-            if(inputs_num - idx != 1) {
-                mli::krn::ref::adjust_quant_params(&in_to_out_quant_params[idx], o_idx);
-                prev_step = mli::krn::ir_rnn_result_requantize(accu, &in_to_out_quant_params[idx],
-                                &in_to_out_quant_params[idx + 1], /* krn_idx= */ 0);
-                accu = mli_math_mul_fx<io_T, acc_T>(0, 0);
-            } else {
-                // Cast result to output type with scaling
-                mli::krn::result_cast_relu_store_v(&out[o_idx], accu, &output_params,
-                        val_min_limit, val_max_limit, current_chs, /* add_preshift_rnd */ true);
-            }
+
+            /* TODO: can be optimized using adjust_quant_params_v, and also optimize ir_rnn_result_requantize function */
+            mli::krn::ref::adjust_quant_params(&in_to_out_quant_params[idx], o_idx);
+            acc_ir = mli::krn::ir_rnn_result_requantize<acc_T, ir_T>(accu, &in_to_out_quant_params[idx]);
+
+            acc_res_ir = mli_math_add_accus(acc_res_ir, acc_ir);
+            accu = mli_prv_init_accu<acc_T>();
         }
 
+        // Cast result to output type with scaling
+        mli::krn::ir_result_cast_relu_store_v(&out[o_idx], acc_res_ir, &output_params,
+                                val_min_limit, val_max_limit, current_chs);
     }
 }