
Commit a6568b9

Dmitry Naumkin authored and JaccovG committed
Patch for rnn_dense_op scales iteration and memstride weights iteration
1 parent b8105de commit a6568b9

File tree: 11 files changed, +524 / -185 lines


lib/src/bricks/impl/mli_krn_rnn_dense_op_ref.h

Lines changed: 42 additions & 17 deletions
@@ -32,6 +32,38 @@ static inline void adjust_weights_dim_for_rnn_dense(s8asym_quant_specific_params
     params->weight_dim = -1;
 }
 
+static inline void adjust_weights_scale_for_rnn_dense(
+        fx_quant_specific_params* params,
+        fx_quant_specific_params* initial_params) {
+    return;
+}
+
+static inline void adjust_weights_scale_for_rnn_dense(
+        s8asym_quant_specific_params* params,
+        s8asym_quant_specific_params* initial_params) {
+    if (initial_params->weight_dim != -1) {
+        params->weight_scales++;
+        params->weight_shifts++;
+    }
+}
+
+static inline void adjust_weights_scale_back_for_rnn_dense(
+        fx_quant_specific_params* params,
+        fx_quant_specific_params* initial_params,
+        int gates) {
+    return;
+}
+
+static inline void adjust_weights_scale_back_for_rnn_dense(
+        s8asym_quant_specific_params* params,
+        s8asym_quant_specific_params* initial_params,
+        int gates) {
+    if (initial_params->weight_dim != -1) {
+        params->weight_scales -= gates;
+        params->weight_shifts -= gates;
+    }
+}
+
 template <typename io_T, typename w_T, typename b_T, typename acc_T, typename quant_T>
 static inline void rnn_dense_op_stacked(
         const MLI_PTR (io_T) * inputs_ptr,
@@ -42,6 +74,7 @@ static inline void rnn_dense_op_stacked(
         const int * inputs_elements,
         quant_T * in_to_out_quant_params,
         const int * w_ch_out_mem_strides,
+        const int * w_gate_mem_strides,
         mli_tensor * out) {
 
     constexpr bool asym = std::is_same<quant_T, s8asym_quant_specific_params>::value;
@@ -50,20 +83,15 @@ static inline void rnn_dense_op_stacked(
     mli_minmax_t val_limit = mli_prv_get_relu_limits<io_T, asym>(&relu_none, out);
 
     const MLI_PTR (w_T) weights_ptr[MLI_RNN_MAX_INPUT];
+    quant_T initial_params[MLI_RNN_MAX_INPUT];
     uint32_t weights_shift[MLI_RNN_MAX_INPUT];
 
-    const int16_t * weights_scales[MLI_RNN_MAX_INPUT];
-    const int8_t * weights_scale_frac_bits[MLI_RNN_MAX_INPUT];
-
     int out_elements = mli_prv_count_elem_num_part(bias, 1);
 
     for(int idx = 0; idx < inputs_num; ++idx) {
         weights_ptr[idx] = mli_prv_tensor_data_ptr<MLI_PTR (w_T)>(weights[idx]);
-        weights_shift[idx] = mli_prv_count_elem_num_part(weights[idx], 1);
-
-        weights_scales[idx] = weights[idx]->el_params.sa.scale.mem.pi16;
-        weights_scale_frac_bits[idx] = weights[idx]->el_params.sa.scale_frac_bits.mem.pi8;
-
+        weights_shift[idx] = w_gate_mem_strides[idx];
+        initial_params[idx] = in_to_out_quant_params[idx];
         adjust_weights_dim_for_rnn_dense(&in_to_out_quant_params[idx]);
     }
 
@@ -76,22 +104,19 @@ static inline void rnn_dense_op_stacked(
                 out_elements, w_ch_out_mem_strides, in_to_out_quant_params,
                 (io_T)val_limit.min, (io_T)val_limit.max);
 
-        for (int weight_idx = 0; weight_idx < inputs_num; ++weight_idx)
+        for (int weight_idx = 0; weight_idx < inputs_num; ++weight_idx) {
             weights_ptr[weight_idx] += weights_shift[weight_idx];
+            adjust_weights_scale_for_rnn_dense(&in_to_out_quant_params[weight_idx], &initial_params[weight_idx]);
+        }
 
         bias_ptr += out_elements;
         dense_out_ptr += out_elements;
-
-        if (asym) {
-            for (int weight_idx = 0; weight_idx < inputs_num; ++weight_idx) {
-                weights_scales[weight_idx]++;
-                weights_scale_frac_bits[weight_idx]++;
-            }
-        }
     }
 
-    for (int weight_idx = 0; weight_idx < inputs_num; ++weight_idx)
+    for (int weight_idx = 0; weight_idx < inputs_num; ++weight_idx) {
         weights_ptr[weight_idx] -= gates_num * weights_shift[weight_idx];
+        adjust_weights_scale_back_for_rnn_dense(&in_to_out_quant_params[weight_idx], &initial_params[weight_idx], gates_num);
+    }
 
     bias_ptr -= gates_num * out_elements;
     dense_out_ptr -= gates_num * out_elements;
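The ref implementation above swaps the old runtime `if (asym)` branch for overload resolution on the quantization-parameter type. Below is a minimal standalone sketch of that pattern, not MLI code: the struct and function names (fx_params_sketch, s8asym_params_sketch, adjust_scale_sketch) are invented stand-ins for fx_quant_specific_params / s8asym_quant_specific_params and the adjust_weights_scale helpers, so the snippet compiles on its own.

```cpp
#include <cstdint>
#include <cstdio>

// Invented stand-in for fx_quant_specific_params: no per-channel scale data.
struct fx_params_sketch {
    int shift;
};

// Invented stand-in for s8asym_quant_specific_params: per-gate scale/shift arrays.
struct s8asym_params_sketch {
    const int16_t* weight_scales;
    const int8_t*  weight_shifts;
    int weight_dim;  // -1 means per-tensor quantization, otherwise per-channel
};

// fx overload: nothing to advance, the call compiles away.
inline void adjust_scale_sketch(fx_params_sketch*, const fx_params_sketch*) {}

// sa8 overload: if the original tensor had per-channel (per-gate) scales,
// step the scale/shift pointers to the next gate.
inline void adjust_scale_sketch(s8asym_params_sketch* p, const s8asym_params_sketch* initial) {
    if (initial->weight_dim != -1) {
        p->weight_scales++;
        p->weight_shifts++;
    }
}

inline void adjust_scale_back_sketch(fx_params_sketch*, const fx_params_sketch*, int) {}

inline void adjust_scale_back_sketch(s8asym_params_sketch* p, const s8asym_params_sketch* initial, int gates) {
    if (initial->weight_dim != -1) {
        p->weight_scales -= gates;
        p->weight_shifts -= gates;
    }
}

int main() {
    const int16_t scales[3] = {100, 200, 300};  // one scale per gate (made-up values)
    const int8_t  shifts[3] = {10, 11, 12};
    s8asym_params_sketch params{scales, shifts, /*weight_dim=*/0};
    const s8asym_params_sketch initial = params;  // snapshot, as rnn_dense_op_stacked now keeps

    const int gates_num = 3;
    for (int gate = 0; gate < gates_num; ++gate) {
        std::printf("gate %d: scale %d, shift %d\n", gate, params.weight_scales[0], params.weight_shifts[0]);
        adjust_scale_sketch(&params, &initial);               // advance to the next gate's scale
    }
    adjust_scale_back_sketch(&params, &initial, gates_num);   // rewind, as the loop after the gate loop does
    return 0;
}
```

Because the fx overload is an empty inline function, the per-gate scale bookkeeping disappears for fx types at compile time, which is the effect the removed `if (asym)` block achieved at run time.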

lib/src/bricks/impl/mli_krn_rnn_dense_op_vdsp.h

Lines changed: 42 additions & 17 deletions
@@ -29,6 +29,38 @@ static inline void adjust_weights_dim_for_rnn_dense(s8asym_quant_specific_params
     params->weight_dim = -1;
 }
 
+static inline void adjust_weights_scale_for_rnn_dense(
+        fx_quant_specific_params* params,
+        fx_quant_specific_params* initial_params) {
+    return;
+}
+
+static inline void adjust_weights_scale_for_rnn_dense(
+        s8asym_quant_specific_params* params,
+        s8asym_quant_specific_params* initial_params) {
+    if (initial_params->weight_dim != -1) {
+        params->weight_scales++;
+        params->weight_shifts++;
+    }
+}
+
+static inline void adjust_weights_scale_back_for_rnn_dense(
+        fx_quant_specific_params* params,
+        fx_quant_specific_params* initial_params,
+        int gates) {
+    return;
+}
+
+static inline void adjust_weights_scale_back_for_rnn_dense(
+        s8asym_quant_specific_params* params,
+        s8asym_quant_specific_params* initial_params,
+        int gates) {
+    if (initial_params->weight_dim != -1) {
+        params->weight_scales -= gates;
+        params->weight_shifts -= gates;
+    }
+}
+
 template <typename io_T, typename w_T, typename b_T, typename acc_T, typename quant_T>
 static inline void rnn_dense_op_stacked(
         const MLI_PTR (io_T) * inputs_ptr,
@@ -39,6 +71,7 @@ static inline void rnn_dense_op_stacked(
         const int * inputs_elements,
         quant_T * in_to_out_quant_params,
         const int * w_ch_out_mem_strides,
+        const int * w_gate_mem_strides,
         mli_tensor * out) {
 
     constexpr bool asym = std::is_same<quant_T, s8asym_quant_specific_params>::value;
@@ -47,20 +80,15 @@ static inline void rnn_dense_op_stacked(
     mli_minmax_t val_limit = mli_prv_get_relu_limits<io_T, asym>(&relu_none, out);
 
     const MLI_PTR (w_T) weights_ptr[MLI_RNN_MAX_INPUT];
+    quant_T initial_params[MLI_RNN_MAX_INPUT];
     uint32_t weights_shift[MLI_RNN_MAX_INPUT];
 
-    const int16_t * weights_scales[MLI_RNN_MAX_INPUT];
-    const int8_t * weights_scale_frac_bits[MLI_RNN_MAX_INPUT];
-
     int out_elements = mli_prv_count_elem_num_part(bias, 1);
 
     for(int idx = 0; idx < inputs_num; ++idx) {
         weights_ptr[idx] = mli_prv_tensor_data_ptr<MLI_PTR (w_T)>(weights[idx]);
-        weights_shift[idx] = mli_prv_count_elem_num_part(weights[idx], 1);
-
-        weights_scales[idx] = weights[idx]->el_params.sa.scale.mem.pi16;
-        weights_scale_frac_bits[idx] = weights[idx]->el_params.sa.scale_frac_bits.mem.pi8;
-
+        weights_shift[idx] = w_gate_mem_strides[idx];
+        initial_params[idx] = in_to_out_quant_params[idx];
         adjust_weights_dim_for_rnn_dense(&in_to_out_quant_params[idx]);
     }
 
@@ -73,22 +101,19 @@ static inline void rnn_dense_op_stacked(
                 out_elements, w_ch_out_mem_strides, in_to_out_quant_params,
                 (io_T)val_limit.min, (io_T)val_limit.max);
 
-        for (int weight_idx = 0; weight_idx < inputs_num; ++weight_idx)
+        for (int weight_idx = 0; weight_idx < inputs_num; ++weight_idx) {
             weights_ptr[weight_idx] += weights_shift[weight_idx];
+            adjust_weights_scale_for_rnn_dense(&in_to_out_quant_params[weight_idx], &initial_params[weight_idx]);
+        }
 
         bias_ptr += out_elements;
        dense_out_ptr += out_elements;
-
-        if (asym) {
-            for (int weight_idx = 0; weight_idx < inputs_num; ++weight_idx) {
-                weights_scales[weight_idx]++;
-                weights_scale_frac_bits[weight_idx]++;
-            }
-        }
     }
 
-    for (int weight_idx = 0; weight_idx < inputs_num; ++weight_idx)
+    for (int weight_idx = 0; weight_idx < inputs_num; ++weight_idx) {
         weights_ptr[weight_idx] -= gates_num * weights_shift[weight_idx];
+        adjust_weights_scale_back_for_rnn_dense(&in_to_out_quant_params[weight_idx], &initial_params[weight_idx], gates_num);
+    }
 
     bias_ptr -= gates_num * out_elements;
     dense_out_ptr -= gates_num * out_elements;
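The vdsp variant above carries the same change as the ref variant. The complementary piece in both is that the per-gate weight pointer now advances by a caller-supplied memory stride (w_gate_mem_strides) rather than by a counted number of elements, so gates that are padded apart in memory are still addressed correctly. A self-contained sketch with made-up sizes, none of which are library names:

```cpp
#include <cstdio>
#include <vector>

int main() {
    const int gates_num   = 3;
    const int gate_elems  = 4;  // elements actually used per gate (made-up)
    const int gate_stride = 6;  // per-gate mem stride; larger than gate_elems models padding between gates

    // One flat weights buffer holding gates_num gates, gate_stride elements apart.
    std::vector<float> weights(gates_num * gate_stride, 0.0f);
    for (int g = 0; g < gates_num; ++g)
        for (int e = 0; e < gate_elems; ++e)
            weights[g * gate_stride + e] = static_cast<float>(g * 100 + e);

    const float* weights_ptr = weights.data();
    for (int g = 0; g < gates_num; ++g) {
        std::printf("gate %d starts with weight %.0f\n", g, weights_ptr[0]);
        weights_ptr += gate_stride;            // advance by the gate stride, not by gate_elems
    }
    weights_ptr -= gates_num * gate_stride;    // rewind after the gate loop, mirroring the kernel
    std::printf("rewound to gate 0, weight %.0f\n", weights_ptr[0]);
    return 0;
}
```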

lib/src/bricks/mli_krn_rnn_dense_op_decl.h

Lines changed: 2 additions & 0 deletions
@@ -53,6 +53,7 @@ static MLI_FORCE_INLINE void rnn_dense_op_stacked(
         const int * inputs_elements,
         quant_T * in_to_out_quant_params,
         const int * w_ch_out_mem_strides,
+        const int * w_gate_mem_strides,
         mli_tensor * out);
 
 } // namespace ref
@@ -107,6 +108,7 @@ static MLI_FORCE_INLINE void rnn_dense_op_stacked(
         const int * inputs_elements,
         quant_T * in_to_out_quant_params,
         const int * w_ch_out_mem_strides,
+        const int * w_gate_mem_strides,
         mli_tensor * out);
 
 } // namespace vdsp

lib/src/kernels/common/mli_krn_gru_cell.h

Lines changed: 36 additions & 9 deletions
@@ -110,8 +110,19 @@ MLI_FORCE_INLINE void gru_cell_prepare_and_run(
 
     const int w_ch_out_mem_stride_from_tensors[] = {(int)weights_in->mem_stride[KRNL_RNN_W_IN_ELEMS_DIM],
                                                     (int)weights_out->mem_stride[KRNL_RNN_W_IN_ELEMS_DIM]};
-    const int w_ch_out_mem_strides[] = {(w_ch_out_mem_stride_from_tensors[0] != 0) ? w_ch_out_mem_stride_from_tensors[0] : gru_out_elements,
-                                        (w_ch_out_mem_stride_from_tensors[1] != 0) ? w_ch_out_mem_stride_from_tensors[1] : gru_out_elements};
+
+    const int w_gate_mem_stride_from_tensors[] = {(int)weights_in->mem_stride[0],
+                                                  (int)weights_out->mem_stride[0]};
+
+    const int w_ch_out_mem_strides[] = {(w_ch_out_mem_stride_from_tensors[0] != 0)
+            ? w_ch_out_mem_stride_from_tensors[0] : gru_out_elements,
+            (w_ch_out_mem_stride_from_tensors[1] != 0)
+            ? w_ch_out_mem_stride_from_tensors[1] : gru_out_elements};
+
+    const int w_gate_mem_strides[] = {(w_gate_mem_stride_from_tensors[0] != 0)
+            ? w_gate_mem_stride_from_tensors[0] : gru_out_elements * inputs_elements[0],
+            (w_gate_mem_stride_from_tensors[1] != 0)
+            ? w_gate_mem_stride_from_tensors[1] : gru_out_elements * inputs_elements[1]};
 
     // Paricular subtensors of intermediate tensor (mli_tensor.mem_stride[] should be zero and cannot be left uninitialized)
     mli_tensor reset_gate = {{ 0 }}, update_gate = {{ 0 }}, new_gate = {{ 0 }}; // Various gates to control info flow
@@ -123,13 +134,29 @@ MLI_FORCE_INLINE void gru_cell_prepare_and_run(
     mli_hlp_point_to_subtensor(&ir_tensor, &iterator, &update_gate); iterator.start_coord[0]++;
     mli_hlp_point_to_subtensor(&ir_tensor, &iterator, &reset_gate); iterator.start_coord[0]++;
     mli_hlp_point_to_subtensor(&ir_tensor, &iterator, &new_gate); iterator.start_coord[0]++;
-
-    mli_hlp_point_to_subtensor(weights_in, &weight_iterator, &w_in_new_g);
-    mli_hlp_point_to_subtensor(weights_out, &weight_iterator, &w_out_new_g);
     mli_hlp_point_to_subtensor(bias, &weight_iterator, &b_new_g);
 
-    const MLI_PTR (w_T) w_new_g_ptr[] = {mli_prv_tensor_data_ptr<MLI_PTR (w_T)>(&w_in_new_g),
-                                         mli_prv_tensor_data_ptr<MLI_PTR (w_T)>(&w_out_new_g)};
+    w_in_new_g.data = weights_in->data;
+    w_in_new_g.rank = 2;
+    w_in_new_g.shape[0] = weights_in->shape[1];
+    w_in_new_g.shape[1] = weights_in->shape[2];
+    w_in_new_g.el_params = weights_in->el_params;
+    w_in_new_g.el_type = weights_in->el_type;
+    mli_prv_tensor_inc_data_ptr<w_T*>(&w_in_new_g, num_gates * w_gate_mem_strides[0]);
+
+    w_out_new_g.data = weights_out->data;
+    w_out_new_g.rank = 2;
+    w_out_new_g.shape[0] = weights_out->shape[1];
+    w_out_new_g.shape[1] = weights_out->shape[2];
+    w_out_new_g.el_params = weights_out->el_params;
+    w_out_new_g.el_type = weights_out->el_type;
+    mli_prv_tensor_inc_data_ptr<w_T*>(&w_out_new_g, num_gates * w_gate_mem_strides[1]);
+
+    const MLI_PTR (w_T) w_new_g_ptr[] = {
+        mli_prv_tensor_data_ptr<MLI_PTR (w_T)>(weights_in) + num_gates * w_gate_mem_strides[0],
+        mli_prv_tensor_data_ptr<MLI_PTR (w_T)>(weights_out) + num_gates * w_gate_mem_strides[1]
+    };
+
     const MLI_PTR (b_T) b_new_g_ptr = mli_prv_tensor_data_ptr<MLI_PTR (b_T)>(&b_new_g);
 
     mli_tensor rnn_out = {{ 0 }};
@@ -172,7 +199,7 @@ MLI_FORCE_INLINE void gru_cell_prepare_and_run(
     //=======================================
     mli::krn::rnn_dense_op_stacked<io_T, w_T, b_T, acc_T, quant_T>(
         inputs_ptr, weights, bias, num_gates, num_inputs, inputs_elements,
-        in_to_out_params, w_ch_out_mem_strides, &ir_tensor);
+        in_to_out_params, w_ch_out_mem_strides, w_gate_mem_strides, &ir_tensor);
 
     // Step 2: Applying non-linearity
     //=======================================
@@ -256,7 +283,7 @@ MLI_FORCE_INLINE void gru_cell_prepare_and_run(
     mli::krn::eltwise_prepare_and_run<io_T, ELTWISE_MUL, /*convert*/ asym>(&new_gate, &update_gate, &temp);
     mli::krn::eltwise_prepare_and_run<io_T, ELTWISE_ADD, /*convert*/ asym>(&temp, &current_out, &rnn_out);
 
-    current_hidden.data.mem.void_p = rnn_out.data.mem.void_p;
+    current_hidden.data = rnn_out.data;
     current_hidden.el_params = rnn_out.el_params;
 
     // Step 6: Update pointers and tensors for next batch
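A quick numeric illustration of the stride selection introduced in the GRU hunks above (all values are made up; only the naming echoes the kernel). When the outermost mem_stride of a weight tensor is zero, the gate stride falls back to a dense [gates][in][out] layout, and the new-gate weights are then located a whole number of gate strides from the start of the tensor, which is what replaces the removed mli_hlp_point_to_subtensor() lookups:

```cpp
#include <cstdio>

int main() {
    // Hypothetical GRU sizes; the real values come from the weight and bias tensors.
    const int num_gates    = 2;    // gates laid out before the "new" gate in this example
    const int in_elements  = 8;    // inputs_elements[0] in the kernel
    const int out_elements = 16;   // gru_out_elements in the kernel

    // mem_stride[0] as it would come from the tensor; 0 means "contiguous, derive it".
    const int stride_from_tensor = 0;

    // Fallback mirrors the patch: a contiguous gate occupies out_elements * in_elements values.
    const int w_gate_mem_stride = (stride_from_tensor != 0)
            ? stride_from_tensor
            : out_elements * in_elements;

    // The new-gate weights start num_gates gate-strides past the start of the weights data.
    const int new_gate_offset = num_gates * w_gate_mem_stride;

    std::printf("gate stride = %d elements, new-gate offset = %d elements\n",
                w_gate_mem_stride, new_gate_offset);
    return 0;
}
```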

lib/src/kernels/common/mli_krn_lstm_cell.h

Lines changed: 14 additions & 3 deletions
@@ -90,8 +90,19 @@ MLI_FORCE_INLINE void lstm_cell_prepare_and_run(
 
     const int w_ch_out_mem_stride_from_tensors[] = {(int)weights_in->mem_stride[KRNL_RNN_W_IN_ELEMS_DIM],
                                                     (int)weights_out->mem_stride[KRNL_RNN_W_IN_ELEMS_DIM]};
-    const int w_ch_out_mem_strides[] = {(w_ch_out_mem_stride_from_tensors[0] != 0) ? w_ch_out_mem_stride_from_tensors[0] : lstm_out_elements,
-                                        (w_ch_out_mem_stride_from_tensors[1] != 0) ? w_ch_out_mem_stride_from_tensors[1] : lstm_out_elements};
+
+    const int w_gate_mem_stride_from_tensors[] = {(int)weights_in->mem_stride[0],
+                                                  (int)weights_out->mem_stride[0]};
+
+    const int w_ch_out_mem_strides[] = {(w_ch_out_mem_stride_from_tensors[0] != 0)
+            ? w_ch_out_mem_stride_from_tensors[0] : lstm_out_elements,
+            (w_ch_out_mem_stride_from_tensors[1] != 0)
+            ? w_ch_out_mem_stride_from_tensors[1] : lstm_out_elements};
+
+    const int w_gate_mem_strides[] = {(w_gate_mem_stride_from_tensors[0] != 0)
+            ? w_gate_mem_stride_from_tensors[0] : lstm_out_elements * inputs_elements[0],
+            (w_gate_mem_stride_from_tensors[1] != 0)
+            ? w_gate_mem_stride_from_tensors[1] : lstm_out_elements * inputs_elements[1]};
 
     // Paricular subtensors of intermediate tensor (mli_tensor.mem_stride[] should be zero and cannot be left uninitialized)
     mli_tensor in_gate = {{ 0 }}, forget_gate = {{ 0 }}, out_gate = {{ 0 }}; // Various gates to controll info flow
@@ -119,7 +130,7 @@ MLI_FORCE_INLINE void lstm_cell_prepare_and_run(
     //=======================================
     rnn_dense_op_stacked<io_T, w_T, b_T, acc_T, quant_T>(
         inputs_ptr, weights, bias, num_gates, num_inputs, inputs_elements,
-        in_to_out_params, w_ch_out_mem_strides, &ir_tensor);
+        in_to_out_params, w_ch_out_mem_strides, w_gate_mem_strides, &ir_tensor);
 
 
     // Step 2: Applying non-linearity
