Skip to content

Commit d908de4

Browse files
Dmitry Naumkin and JaccovG authored and committed
[gru] Reference version of GRU cell
1 parent ed323b6 commit d908de4

18 files changed

+2341
-21
lines changed

include/api/mli_helpers_api.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,10 @@ extern "C" {
5454
#define KRNL_DW_D_DIM_HW1N 2 // Depthwise convolution hwc kernel depth (must be == 1)
5555
#define KRNL_DW_N_DIM_HW1N 3 // Depthwise convolution hwc output channels
5656

57+
// for Recurrent kernels
58+
#define KRNL_RNN_W_IN_ELEMS_DIM 1 // Input elements dimension of RNN weights
59+
#define KRNL_RNN_W_OUT_ELEMS_DIM 2 // Output elements dimension of RNN weights
60+
5761
/**
5862
* @brief Count Number of Elements in Tensor
5963
*

include/api/mli_kernels_api.h

Lines changed: 64 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -600,14 +600,15 @@ mli_status mli_krn_fully_connected_sa8_sa8_sa32_ext_bias(
600600
/**
601601
* @brief Long Short Term Memory (LSTM) Cell
602602
*
603-
* @detail This kernel implements the default non-peephole implementation of long short term memory (LSTM) cell
603+
* @detail This kernel implements the default non-peephole implementation of long short term memory (LSTM) cell
604+
* with input (i), gate (g), forget (f) and out (o) gates
604605
*
605606
* This kernel implies sequential processing of the set of inputs vectors which is passed by input tensor of shape
606607
* (batch_size, N) where N is the length of the single frame. Both directions of processing (forward and backward)
607-
* are supported and defined by cfg structure. Kernel can output the bunch of results for according to each step of
608-
* processing, or only the last one in the sequence. Dense part of calculations uses scratch data from configuration
609-
* structure for results, and consequently output and previous output tensors might use the same memory if it is
610-
* acceptable to rewrite previous output data.
608+
* are supported and defined by cfg structure. Kernel can output the intermediate results of each step, or only the result
609+
* of the last step. Dense part of calculations uses scratch data from configuration structure for results,
610+
* and consequently output and previous output tensors might use the same memory if it is acceptable to rewrite
611+
* previous output data.
611612
*
612613
* For more info on primitive see MLI Documentation.
613614
*
@@ -658,6 +659,64 @@ mli_status mli_krn_lstm_cell_sa8_sa8_sa32(
658659
mli_tensor * cell,
659660
mli_tensor * out);
660661

662+
/**
663+
* @brief Gated Recurrent Unit (GRU) Cell
664+
*
665+
* @detail This kernel implements the Gated Recurrent Unit (GRU) cell with update (z), reset (r) and new (n) gates
666+
* in version where a reset gate is applied on the hidden state before matrix multiplication
667+
*
668+
* This kernel implies sequential processing of the set of inputs vectors which is passed by input tensor
669+
* of shape (batch_size, N) where N is the length of the single frame. Both directions of processing (forward and backward)
670+
* are supported and defined by cfg structure. Kernel can output the intermediate results of each step, or only the result
671+
* of the last step.
672+
*
673+
* For more info on primitive see MLI Documentation.
674+
*
675+
* @param in [I] Input feature tensor. Must be a tensor of shape (batch_size, input_elements).
676+
* @param prev_out [I] Previous output feature tensor. Must be a one-dimensional tensor of shape (out_elements).
677+
* @param weights_in [I] Input Weights tensor (set of 3 matrixes in the [z,r,n] order: 3-dimensional tensor)
678+
* @param weights_out [I] Hidden Weights tensor (set of 3 matrixes in the [z,r,n] order: 3-dimensional tensor)
679+
* @param bias [I] Biases tensor (set of 3 vectors in the [z,r,n] order: 2-dimensional tensor)
680+
* @param tanh_lut [I] LUT table structure prepared for the hyperbolic tangent activation
681+
* @param sigm_lut [I] LUT table structure prepared for sigmoid activation
682+
* @param cfg [I] RNN Configuration structure (for more info see @ref mli_rnn_cell_cfg)
683+
* @param out [O] Output feature tensor. Result will be stored here (single output or batch of outputs depending on mode)
684+
*
685+
* @return MLI status code
686+
*/
687+
mli_status mli_krn_gru_cell_fx16(
688+
const mli_tensor * in,
689+
const mli_tensor * prev_out,
690+
const mli_tensor * weights_in,
691+
const mli_tensor * weights_out,
692+
const mli_tensor * bias,
693+
const mli_lut * tanh_lut,
694+
const mli_lut * sigm_lut,
695+
const mli_rnn_cell_cfg * cfg,
696+
mli_tensor * out);
697+
698+
mli_status mli_krn_gru_cell_fx16_fx8_fx8(
699+
const mli_tensor * in,
700+
const mli_tensor * prev_out,
701+
const mli_tensor * weights_in,
702+
const mli_tensor * weights_out,
703+
const mli_tensor * bias,
704+
const mli_lut * tanh_lut,
705+
const mli_lut * sigm_lut,
706+
const mli_rnn_cell_cfg * cfg,
707+
mli_tensor * out);
708+
709+
mli_status mli_krn_gru_cell_sa8_sa8_sa32(
710+
const mli_tensor * in,
711+
const mli_tensor * prev_out,
712+
const mli_tensor * weights_in,
713+
const mli_tensor * weights_out,
714+
const mli_tensor * bias,
715+
const mli_lut * tanh_lut,
716+
const mli_lut * sigm_lut,
717+
const mli_rnn_cell_cfg * cfg,
718+
mli_tensor * out);
719+
661720
/**
662721
* @brief Basic Recurrent Neural Network Cell
663722
*

include/mli_types.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -375,7 +375,6 @@ typedef struct {
375375
mli_rnn_results results; /**< Results to preserve.*/
376376
mli_rnn_out_activation act; /**< Output activation type. */
377377
mli_data_container scratch_data; /**< Container to keep intermediate results. */
378-
uint32_t scratch_capacity; /**< Size of a memory pointed by scratch_data field. */
379378
} mli_rnn_cell_cfg;
380379

381380

lib/mli_lib.cmake

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ set(MLI_LIB_SOURCE_FILES
4848
${MLI_LIB_CMAKE_DIR}/src/kernels/diverse/mli_krn_argmax.cc
4949
${MLI_LIB_CMAKE_DIR}/src/kernels/diverse/mli_krn_permute_fx.cc
5050
${MLI_LIB_CMAKE_DIR}/src/kernels/common/mli_krn_lstm_cell.cc
51+
${MLI_LIB_CMAKE_DIR}/src/kernels/common/mli_krn_gru_cell.cc
5152
)
5253

5354
set(MLI_LIB_PUBLIC_INCLUDES

lib/src/bricks/impl/mli_krn_rnn_dense_op_ref.h

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ static inline void rnn_dense_op_stacked(
7171
MLI_CONV_OUT_PTR (io_T) dense_out_ptr = mli_prv_tensor_data_ptr<MLI_CONV_OUT_PTR (io_T)>(out);
7272

7373
for (int gate = 0; gate < gates_num; ++gate) {
74-
mli::krn::rnn_dense_op<io_T, w_T, b_T, acc_T, quant_T>(
74+
mli::krn::ref::rnn_dense_op<io_T, w_T, b_T, acc_T, quant_T>(
7575
inputs_ptr, weights_ptr, bias_ptr, dense_out_ptr, inputs_num, inputs_elements,
7676
out_elements, w_ch_out_mem_strides, in_to_out_quant_params,
7777
(io_T)val_limit.min, (io_T)val_limit.max);
@@ -128,23 +128,23 @@ static inline void rnn_dense_op(
128128
accu = mli::krn::bias_additive(&bias[o_idx], accu, &in_to_out_quant_params[0]);
129129

130130
for(int idx = 0; idx < inputs_num; idx++) {
131-
mli::krn::adjust_quant_params(&in_to_out_quant_params[idx], /* krn_idx= */ 0);
131+
mli::krn::ref::adjust_quant_params(&in_to_out_quant_params[idx], /* krn_idx= */ 0);
132132

133133
accu = dotprod1D(inputs[idx], &weights[idx][o_idx], accu, in_elements[idx],
134134
1, w_ch_out_mem_strides[idx]);
135135

136-
accu = mli::krn::weights_additive(&weights[idx][o_idx], accu, &in_to_out_quant_params[idx],
136+
accu = mli::krn::ref::weights_additive(&weights[idx][o_idx], accu, &in_to_out_quant_params[idx],
137137
in_elements[idx], /* height= */ 1, /* ch= */ 1, w_ch_out_mem_strides[idx],
138138
/* row_step= */ 1, /* ch_step= */ 1);
139139
accu = mli_math_add_fx(accu, other_additives[idx]);
140140
accu = mli_math_add_fx(accu, prev_step);
141141

142142
if(inputs_num - idx != 1) {
143-
prev_step = mli::krn::ir_rnn_result_requantize(accu, &in_to_out_quant_params[idx],
143+
prev_step = mli::krn::ref::ir_rnn_result_requantize(accu, &in_to_out_quant_params[idx],
144144
&in_to_out_quant_params[idx+1], /* krn_idx= */ 0);
145145
accu = mli_math_mul_fx<io_T, acc_T>(0, 0);
146146
} else {
147-
out_val = mli::krn::result_cast<io_T, acc_T, quant_T>(accu, &in_to_out_quant_params[idx]);
147+
out_val = mli::krn::ref::result_cast<io_T, acc_T, quant_T>(accu, &in_to_out_quant_params[idx]);
148148
}
149149
}
150150

lib/src/bricks/impl/mli_krn_rnn_dense_op_vdsp.h

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,79 @@ namespace mli {
2121
namespace krn {
2222
namespace vdsp {
2323

24+
// FX quantization carries no per-dimension weights metadata, so there is
// nothing to adjust for the stacked dense pass.
static inline void adjust_weights_dim_for_rnn_dense(fx_quant_specific_params* params) {
    (void)params;  // intentionally a no-op
}
27+
28+
// SA8 asymmetric quantization: clear the weights dimension before the stacked
// dense pass. NOTE(review): -1 presumably means "no per-channel axis" (scales
// treated as per-tensor) — confirm against the quant-params definition.
static inline void adjust_weights_dim_for_rnn_dense(s8asym_quant_specific_params* params) {
    params->weight_dim = -1;
}
31+
32+
template <typename io_T, typename w_T, typename b_T, typename acc_T, typename quant_T>
33+
static inline void rnn_dense_op_stacked(
34+
const MLI_PTR (io_T) * inputs_ptr,
35+
const mli_tensor ** weights,
36+
const mli_tensor * bias,
37+
const int gates_num,
38+
const int inputs_num,
39+
const int * inputs_elements,
40+
quant_T * in_to_out_quant_params,
41+
const int * w_ch_out_mem_strides,
42+
mli_tensor * out) {
43+
44+
constexpr bool asym = std::is_same<quant_T, s8asym_quant_specific_params>::value;
45+
46+
mli_relu_cfg relu_none = {MLI_RELU_NONE};
47+
mli_minmax_t val_limit = mli_prv_get_relu_limits<io_T, asym>(&relu_none, out);
48+
49+
const MLI_PTR (w_T) weights_ptr[MLI_RNN_MAX_INPUT];
50+
uint32_t weights_shift[MLI_RNN_MAX_INPUT];
51+
52+
const int16_t * weights_scales[MLI_RNN_MAX_INPUT];
53+
const int8_t * weights_scale_frac_bits[MLI_RNN_MAX_INPUT];
54+
55+
int out_elements = mli_prv_count_elem_num_part(bias, 1);
56+
57+
for(int idx = 0; idx < inputs_num; ++idx) {
58+
weights_ptr[idx] = mli_prv_tensor_data_ptr<MLI_PTR (w_T)>(weights[idx]);
59+
weights_shift[idx] = mli_prv_count_elem_num_part(weights[idx], 1);
60+
61+
weights_scales[idx] = weights[idx]->el_params.sa.scale.mem.pi16;
62+
weights_scale_frac_bits[idx] = weights[idx]->el_params.sa.scale_frac_bits.mem.pi8;
63+
64+
adjust_weights_dim_for_rnn_dense(&in_to_out_quant_params[idx]);
65+
}
66+
67+
const MLI_PTR (b_T) bias_ptr = mli_prv_tensor_data_ptr<MLI_PTR (b_T)>(bias);
68+
MLI_CONV_OUT_PTR (io_T) dense_out_ptr = mli_prv_tensor_data_ptr<MLI_CONV_OUT_PTR (io_T)>(out);
69+
70+
for (int gate = 0; gate < gates_num; ++gate) {
71+
rnn_dense_op<io_T, w_T, b_T, acc_T, quant_T>(
72+
inputs_ptr, weights_ptr, bias_ptr, dense_out_ptr, inputs_num, inputs_elements,
73+
out_elements, w_ch_out_mem_strides, in_to_out_quant_params,
74+
(io_T)val_limit.min, (io_T)val_limit.max);
75+
76+
for (int weight_idx = 0; weight_idx < inputs_num; ++weight_idx)
77+
weights_ptr[weight_idx] += weights_shift[weight_idx];
78+
79+
bias_ptr += out_elements;
80+
dense_out_ptr += out_elements;
81+
82+
if (asym) {
83+
for (int weight_idx = 0; weight_idx < inputs_num; ++weight_idx) {
84+
weights_scales[weight_idx]++;
85+
weights_scale_frac_bits[weight_idx]++;
86+
}
87+
}
88+
}
89+
90+
for (int weight_idx = 0; weight_idx < inputs_num; ++weight_idx)
91+
weights_ptr[weight_idx] -= gates_num * weights_shift[weight_idx];
92+
93+
bias_ptr -= gates_num * out_elements;
94+
dense_out_ptr -= gates_num * out_elements;
95+
}
96+
2497
template <typename io_T, typename w_T, typename b_T, typename acc_T, typename quant_T>
2598
static inline void rnn_dense_op(
2699
const MLI_PTR(io_T) __restrict * inputs,

lib/src/bricks/mli_krn_rnn_dense_op.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ namespace mli {
2424
namespace krn {
2525
#if !defined(MLI_BUILD_REFERENCE) && defined(__Xvec_width)
2626
using mli::krn::vdsp::rnn_dense_op;
27-
using mli::krn::ref::rnn_dense_op_stacked;
27+
using mli::krn::vdsp::rnn_dense_op_stacked;
2828

2929
#elif !defined(MLI_BUILD_REFERENCE) && defined(__FXAPI__)
3030
using mli::krn::ref::rnn_dense_op;

lib/src/bricks/mli_krn_rnn_dense_op_decl.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,18 @@ static MLI_FORCE_INLINE void rnn_dense_op(
9797
const io_T val_min_limit,
9898
const io_T val_max_limit);
9999

100+
template <typename io_T, typename w_T, typename b_T, typename acc_T, typename quant_T>
101+
static MLI_FORCE_INLINE void rnn_dense_op_stacked(
102+
const MLI_PTR (io_T) * inputs_ptr,
103+
const mli_tensor ** weights,
104+
const mli_tensor * bias,
105+
const int gates_num,
106+
const int inputs_num,
107+
const int * inputs_elements,
108+
quant_T * in_to_out_quant_params,
109+
const int * w_ch_out_mem_strides,
110+
mli_tensor * out);
111+
100112
} // namespace vdsp
101113

102114
} // namespace krn
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
/*
2+
* Copyright 2021, Synopsys, Inc.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-3-Clause license found in
6+
* the LICENSE file in the root directory of this source tree.
7+
*
8+
*/
9+
10+
#include "mli_krn_gru_cell.h"
11+
12+
#include "mli_check.h"
13+
#include "mli_config.h"
14+
#include "mli_debug.h"
15+
#include "mli_helpers_api.h"
16+
#include "mli_prv_activation_lut.h"
17+
#include "mli_types.h"
18+
19+
#ifdef __cplusplus
20+
extern "C" {
21+
#endif
22+
23+
typedef mli_acc32_t mli_sa8_sa8_sa32_accu_t;
24+
typedef mli_acc40_t mli_fx16_accu_t;
25+
typedef mli_acc32_t mli_fx16_fx8_fx8_accu_t;
26+
27+
#pragma MLI_CODE_SECTION_START(".mli_lib")
28+
29+
// GRU cell, fx16 data with fx16 weights.
// Validates all arguments via mli_chk_gru_cell_fx16 and, on success, dispatches
// to the shared templated implementation with a 40-bit accumulator.
// Returns the status produced by the argument check.
mli_status mli_krn_gru_cell_fx16 (
    const mli_tensor * in,
    const mli_tensor * prev_out,
    const mli_tensor * weights_in,
    const mli_tensor * weights_out,
    const mli_tensor * bias,
    const mli_lut * tanh_lut,
    const mli_lut * sigm_lut,
    const mli_rnn_cell_cfg * cfg,
    mli_tensor * out) {
    const mli_status status = MLI_CHECK_STATUS(mli_chk_gru_cell_fx16
        (in, prev_out, weights_in, weights_out, bias, tanh_lut, sigm_lut, cfg, out), __func__);

    if (status == MLI_STATUS_OK) {
        mli::krn::gru_cell_prepare_and_run<int16_t, int16_t, int16_t, mli_fx16_accu_t,
                mli::krn::fx_quant_specific_params>
                (in, prev_out, weights_in, weights_out, bias, tanh_lut, sigm_lut, cfg, out);
    }

    return status;
}
49+
50+
// GRU cell, fx16 data with fx8 weights and fx8 biases.
// Validates all arguments via mli_chk_gru_cell_fx16_fx8_fx8 and, on success,
// dispatches to the shared templated implementation with a 32-bit accumulator.
// Returns the status produced by the argument check.
mli_status mli_krn_gru_cell_fx16_fx8_fx8 (
    const mli_tensor * in,
    const mli_tensor * prev_out,
    const mli_tensor * weights_in,
    const mli_tensor * weights_out,
    const mli_tensor * bias,
    const mli_lut * tanh_lut,
    const mli_lut * sigm_lut,
    const mli_rnn_cell_cfg * cfg,
    mli_tensor * out) {
    const mli_status status = MLI_CHECK_STATUS(mli_chk_gru_cell_fx16_fx8_fx8
        (in, prev_out, weights_in, weights_out, bias, tanh_lut, sigm_lut, cfg, out), __func__);

    if (status == MLI_STATUS_OK) {
        mli::krn::gru_cell_prepare_and_run<int16_t, int8_t, int8_t, mli_fx16_fx8_fx8_accu_t,
                mli::krn::fx_quant_specific_params>
                (in, prev_out, weights_in, weights_out, bias, tanh_lut, sigm_lut, cfg, out);
    }

    return status;
}
70+
71+
// GRU cell, sa8 asymmetric data/weights with sa32 biases.
// Validates all arguments via mli_chk_gru_cell_sa8_sa8_sa32 and, on success,
// dispatches to the shared templated implementation with a 32-bit accumulator
// and the s8asym quantization parameters.
// Returns the status produced by the argument check.
mli_status mli_krn_gru_cell_sa8_sa8_sa32 (
    const mli_tensor * in,
    const mli_tensor * prev_out,
    const mli_tensor * weights_in,
    const mli_tensor * weights_out,
    const mli_tensor * bias,
    const mli_lut * tanh_lut,
    const mli_lut * sigm_lut,
    const mli_rnn_cell_cfg * cfg,
    mli_tensor * out) {
    const mli_status status = MLI_CHECK_STATUS(mli_chk_gru_cell_sa8_sa8_sa32
        (in, prev_out, weights_in, weights_out, bias, tanh_lut, sigm_lut, cfg, out), __func__);

    if (status == MLI_STATUS_OK) {
        mli::krn::gru_cell_prepare_and_run<int8_t, int8_t, int32_t, mli_sa8_sa8_sa32_accu_t,
                mli::krn::s8asym_quant_specific_params>
                (in, prev_out, weights_in, weights_out, bias, tanh_lut, sigm_lut, cfg, out);
    }

    return status;
}
91+
92+
#pragma MLI_CODE_SECTION_END()
93+
94+
#ifdef __cplusplus
95+
}
96+
#endif

0 commit comments

Comments (0)