Merge pull request #118 from foss-for-synopsys-dwc-arc-processors/slicing

JaccovG · web-flow · commit 4b6c6eed6539 · 2020-03-26T13:46:01.000+01:00
Add sub tensor functionality to support slicing
diff --git a/include/api/mli_helpers_api.h b/include/api/mli_helpers_api.h
@@ -111,6 +111,24 @@ mli_status mli_hlp_convert_tensor(mli_tensor *in, mli_tensor *out);
  */
 mli_status mli_hlp_point_to_subtensor(const mli_tensor *in, const mli_point_to_subtsr_cfg *cfg, mli_tensor *out);
 
+/**
+ * @brief Create a Sub-Tensor from a larger tensor
+ *
+ * @detail This function creates a view on a sub tensor of the input tensor. It only performs
+ * operations on pointers and doesn't copy data (the output points into the data of the input).
+ * For this reason, depending on the parameters, it can happen that the sub tensor contains
+ * data that is not adjacent in memory.
+ *
+ * For more info on primitive see MLI Documentation
+ *
+ * @param in      [I] Input tensor (of any shape)
+ * @param cfg     [I] Configuration structure (for more info see @ref mli_sub_tensor_cfg)
+ * @param out     [O] Output tensor. Result will be stored here
+ *
+ * @return MLI status code
+ */
+mli_status mli_hlp_create_subtensor(const mli_tensor *in, const mli_sub_tensor_cfg *cfg, mli_tensor *out);
+
 uint32_t mli_hlp_tensor_scale_shift(const mli_tensor *in);
 
 int16_t mli_hlp_tensor_scale(const mli_tensor *in, const uint32_t scale_idx);
diff --git a/include/mli_types.h b/include/mli_types.h
@@ -306,6 +306,20 @@ typedef struct {
     uint8_t first_out_dim_size;           /**< First output dimension size */
 } mli_point_to_subtsr_cfg;
 
+/**
+ * @brief Configuration for the create-subtensor helper (mli_hlp_create_subtensor)
+ *
+ * Describes the start coordinates and the sizes of the required sub tensor inside the input tensor.
+ * The size can be reduced in any dimension.
+ */
+typedef struct {
+    uint32_t offset[MLI_MAX_RANK];   /**< Sub tensor start coordinates in the input tensor.
+                                          The number of entries used is determined by the rank of the input tensor */
+    uint32_t size[MLI_MAX_RANK];     /**< Size of the sub tensor in elements per dimension.
+                                          The number of entries used is determined by the rank of the input tensor */
+    uint32_t sub_tensor_rank;        /**< Rank of the sub tensor that will be produced (not larger than the input rank) */
+} mli_sub_tensor_cfg;
+
 /**
  * @brief Data layout type for vision kernels (convolutions/pooloing mostly).
  *
diff --git a/lib/src/helpers/src/mli_helpers.c b/lib/src/helpers/src/mli_helpers.c
@@ -157,6 +157,68 @@ mli_status mli_hlp_point_to_subtensor(const mli_tensor *in, const mli_point_to_s
     return MLI_STATUS_OK;
 }
 
+mli_status mli_hlp_create_subtensor(const mli_tensor *in, const mli_sub_tensor_cfg *cfg, mli_tensor *out) { // 'out' becomes a view into 'in'; no data is copied
+    mli_status ret = MLI_CHECK_STATUS(mli_chk_create_subtensor(in, cfg, out), __func__); // validate arguments first
+    if (ret != MLI_STATUS_OK)
+        return ret;
+
+    const uint32_t elem_size = mli_hlp_tensor_element_size(in); // used below to turn an element offset into a byte offset
+    const uint32_t out_rank = cfg->sub_tensor_rank;
+    uint32_t mem_strides[MLI_MAX_RANK];
+    const uint32_t input_rank = in->rank;
+    const bool isAsym = (in->el_type == MLI_EL_ASYM_I8) || (in->el_type == MLI_EL_ASYM_I32);
+
+    // Use the input's mem_stride[] where it is set (non-zero); otherwise derive it from shape[] with an innermost stride of 1.
+    mem_strides[input_rank - 1] = in->mem_stride[input_rank - 1] != 0 ? in->mem_stride[input_rank - 1] : 1;
+    for (int i = input_rank - 2; i >= 0; i--) {
+        mem_strides[i] = in->mem_stride[i] != 0 ? in->mem_stride[i] : mem_strides[i+1] * in->shape[i+1];
+    }
+
+    // Compute the offset of the sub tensor's first element inside the input buffer: sum of offset[i] * stride[i].
+    int buf_offset = 0;
+    for (int i = 0; i < input_rank; i++) {
+        buf_offset += cfg->offset[i] * mem_strides[i];
+    }
+    buf_offset *= elem_size; // scale by the element size; the result is added to a char pointer below
+    out->data = (void *)((char *)in->data + buf_offset);
+    out->capacity = in->capacity - buf_offset; // capacity shrinks by the same offset
+
+    // Fill shape[] and mem_stride[] of the output tensor.
+    // If the sub_tensor_rank is smaller than the input rank, the dimensions with
+    // a size of 1 will be removed in the output shape starting from the first dimension
+    // until the requested sub_tensor_rank value is reached.
+    int out_idx = 0;
+    int skip_cnt = input_rank - out_rank; // number of size-1 dimensions that still have to be dropped
+    int out_asym_dim = -1;                // index of the quantized dimension in the output; stays -1 if that dimension is dropped
+    int out_asym_offset = 0;              // requested start coordinate along the quantized dimension
+    for (int in_idx = 0; in_idx < input_rank; in_idx++) {
+        if ((skip_cnt > 0) && (cfg->size[in_idx] == 1)) {
+            skip_cnt--;
+            continue;
+        }
+        out->shape[out_idx] = cfg->size[in_idx];
+        out->mem_stride[out_idx] = mem_strides[in_idx]; // strides are inherited from the input, so the view may be non-contiguous
+        if (isAsym && (in->el_params.asym.dim == in_idx)) {
+            out_asym_dim = out_idx;
+            out_asym_offset = cfg->offset[in_idx];
+        }
+        out_idx++;
+    }
+    // NOTE(review): if fewer than (input_rank - out_rank) sizes equal 1, out_idx ends above out_rank; no visible check rejects this - confirm.
+    out->rank = out_rank;
+    out->el_params = in->el_params;
+    out->el_type = in->el_type;
+    // For asym types with a non-negative dim, advance the scale / zero-point pointers by the offset along that dimension.
+    if (isAsym){
+        if (out->el_params.asym.dim >= 0) { // NOTE(review): if the quantized dimension was dropped above, the offset stays 0 and dim becomes -1 - confirm intended
+            out->el_params.asym.scale.pi16 += out_asym_offset;
+            out->el_params.asym.dim = out_asym_dim;
+            out->el_params.asym.zero_point.pi16 += out_asym_offset;
+        }
+    }
+    return MLI_STATUS_OK;
+}
+
 
 mli_status mli_hlp_convert_tensor(mli_tensor *in, mli_tensor *out) {
     mli_status ret = MLI_CHECK_STATUS(mli_chk_convert_tensor(in, out), __func__);
diff --git a/lib/src/kernels/common/mli_krn_fully_connected.h b/lib/src/kernels/common/mli_krn_fully_connected.h
@@ -130,6 +130,8 @@ static inline void ip_op(
         const int out_shift,
         const int16_t input_offset,
         const int16_t output_offset) {
+    const int left_shift = out_shift > 0 ? 0 : -out_shift;
+    const int right_shift = out_shift > 0 ? out_shift : 0;
     // Matrix-Vector multiplication
     //==============================
     if (_Rarely(in_elements < 8)) {
@@ -143,14 +145,16 @@ static inline void ip_op(
                 weights++;
             }
             in -= in_elements;
+
+            accu = mli_math_acc_ashift_fx(accu, -left_shift);
             accu = mli_math_scale_mul<acc_T, true>(accu, out_mul);
 
             // adding the output offset needs to happen after the output mul and output shift
             // but before the cast to the output container size.
             // because the cast and shift are combined in one function, the output offset is
-            // added before, and multiplied with 1<< out_shift to compensate.
-            accu = mli_math_mac_fx(accu, (int16_t)(1<<out_shift), (io_T)output_offset);
-            out[o_idx] = mli_math_acc_cast_fx<io_T, acc_T> (accu, out_shift);
+            // added before, and multiplied with 1<< right_shift to compensate.
+            accu = mli_math_mac_fx(accu, (int16_t)(1<<right_shift), (io_T)output_offset);
+            out[o_idx] = mli_math_acc_cast_fx<io_T, acc_T> (accu, right_shift);
         }
     } else {
         if ((in_elements & 0x3) == 0) {
@@ -165,15 +169,16 @@ LOOP_PIPELINE_ENABLE
                     weights += 4;
                 }
                 in -= in_elements;
-                
+
+                accu = mli_math_acc_ashift_fx(accu, -left_shift);
                 accu = mli_math_scale_mul<acc_T, true>(accu, out_mul);
 
                 // adding the output offset needs to happen after the output mul and output shift
                 // but before the cast to the output container size.
                 // because the cast and shift are combined in one function, the output offset is
-                // added before, and multiplied with 1<< out_shift to compensate.
-                accu = mli_math_mac_fx(accu, (int16_t)(1<<out_shift), (io_T)output_offset);
-                out[o_idx] = mli_math_acc_cast_fx<io_T, acc_T> (accu, out_shift);
+                // added before, and multiplied with 1<< right_shift to compensate.
+                accu = mli_math_mac_fx(accu, (int16_t)(1<<right_shift), (io_T)output_offset);
+                out[o_idx] = mli_math_acc_cast_fx<io_T, acc_T> (accu, right_shift);
             }
         } else {
             for (int o_idx = 0; o_idx < out_elements; o_idx++) {
@@ -197,14 +202,16 @@ LOOP_PIPELINE_ENABLE
                 weights += 4;
             }
             in -= in_elements;
+
+            accu = mli_math_acc_ashift_fx(accu, -left_shift);
             accu = mli_math_scale_mul<acc_T, true>(accu, out_mul);
 
             // adding the output offset needs to happen after the output mul and output shift
             // but before the cast to the output container size.
             // because the cast and shift are combined in one function, the output offset is
-            // added before, and multiplied with 1<< out_shift to compensate.
-            accu = mli_math_mac_fx(accu, (int16_t)(1<<out_shift), (io_T)output_offset);
-            out[o_idx] = mli_math_acc_cast_fx<io_T, acc_T> (accu, out_shift);
+            // added before, and multiplied with 1<< right_shift to compensate.
+            accu = mli_math_mac_fx(accu, (int16_t)(1<<right_shift), (io_T)output_offset);
+            out[o_idx] = mli_math_acc_cast_fx<io_T, acc_T> (accu, right_shift);
             }
         }
     }
@@ -224,7 +231,7 @@ static void fully_connected_prepare_and_run(
     MLI_CONV_OUT_PTR(io_T) out_ptr = (MLI_CONV_OUT_PTR(io_T)) (out->data);
 
     int ch_out = bias->shape[0];
-    int in_sz = mli_prv_count_elem_num(in);
+    int in_sz = weights->shape[1];
 
     // Define shift values
     int bias_shift = mli_prv_calc_shift(in, weights, bias);
diff --git a/lib/src/move/mli_mov_api.c b/lib/src/move/mli_mov_api.c
@@ -133,9 +133,20 @@ mli_status mli_mov_prepare(mli_mov_handle_t* h, const mli_tensor* src, const mli
         dst->mem_stride[i] = 0;
     }
 
-    dst->mem_stride[rank - 1] = cfg->dst_mem_stride[rank - 1] != 0 ? cfg->dst_mem_stride[rank - 1] : 1;
+    /* if destination memstride is provided in the configuration, use it.
+       if not, check if the output tensor provides a mem stride.
+       when no memstride is provided at all, compute the memstride based on the destination shape */
+    if (cfg->dst_mem_stride[rank - 1] != 0) {
+        dst->mem_stride[rank - 1] = cfg->dst_mem_stride[rank - 1];
+    } else if (dst->mem_stride[rank - 1] == 0) {
+        dst->mem_stride[rank - 1] = 1;
+    }
     for (int i = rank - 2; i >= 0; i--) {
-        dst->mem_stride[i] = cfg->dst_mem_stride[i] != 0 ? cfg->dst_mem_stride[i] : dst->mem_stride[i+1] * dst->shape[i + 1];
+        if (cfg->dst_mem_stride[i] != 0) {
+            dst->mem_stride[i] = cfg->dst_mem_stride[i];
+        } else if (dst->mem_stride[i] == 0) {
+            dst->mem_stride[i] = dst->mem_stride[i + 1] * dst->shape[i + 1];
+        }
     }
 
     // update state in the handle
diff --git a/lib/src/private/mli_check.h b/lib/src/private/mli_check.h
@@ -325,6 +325,7 @@ mli_status mli_chk_permute_fx16(const mli_tensor * in, const mli_permute_cfg * c
 mli_status mli_chk_count_elem_num(const mli_tensor *in, uint32_t start_dim);
 mli_status mli_chk_convert_tensor(mli_tensor *in, mli_tensor *out);
 mli_status mli_chk_point_to_subtensor(const mli_tensor *in, const mli_point_to_subtsr_cfg *cfg, mli_tensor *out);
+mli_status mli_chk_create_subtensor(const mli_tensor *in, const mli_sub_tensor_cfg *cfg, mli_tensor *out);
 
 #ifdef __cplusplus
 }
diff --git a/lib/src/private/src/mli_check.c b/lib/src/private/src/mli_check.c
@@ -1744,4 +1744,26 @@ mli_status mli_chk_point_to_subtensor(const mli_tensor *in, const mli_point_to_s
     return MLI_STATUS_OK;
 }
 
+mli_status mli_chk_create_subtensor(const mli_tensor *in, const mli_sub_tensor_cfg *cfg, mli_tensor *out) {
+    mli_status stat = MLI_STATUS_OK;
+    // Argument validation used by mli_hlp_create_subtensor(); returns MLI_STATUS_OK only when every check below passes.
+    // Check that in tensor is valid and out provides valid pointers
+    stat = MLI_CHECK_STATUS(mli_chk_tensor (in), "Bad input tensor");
+    if (stat != MLI_STATUS_OK) return stat;
+    if (MLI_CHECK(out != NULL , "Bad Output tensor  pointer")) return MLI_STATUS_BAD_TENSOR;
+    // The configuration must be present and must not request a rank larger than the input rank.
+    if (MLI_CHECK(cfg != NULL , "Bad cfg pointer")) return MLI_STATUS_BAD_FUNC_CFG;
+    if (MLI_CHECK(cfg->sub_tensor_rank <= in->rank, "incorrect number of coordinates"))
+        return MLI_STATUS_BAD_FUNC_CFG;
+    // For every input dimension the requested window [offset, offset + size) has to fit inside the input shape.
+    for (int i = 0; i < in->rank; i++) {
+        if (MLI_CHECK(cfg->offset[i] < in->shape[i], "bad config"))
+            return MLI_STATUS_BAD_FUNC_CFG;
+        if (MLI_CHECK(cfg->offset[i] + cfg->size[i] <= in->shape[i], "bad config")) // NOTE(review): uint32_t sum could wrap for very large size values - confirm
+            return MLI_STATUS_BAD_FUNC_CFG;
+    }
+
+    return MLI_STATUS_OK;
+}
+
 #pragma code()

Original file line number	Diff line number	Diff line change
`@@ -325,6 +325,7 @@ mli_status mli_chk_permute_fx16(const mli_tensor * in, const mli_permute_cfg * c`
`325`	`325`	`mli_status mli_chk_count_elem_num(const mli_tensor *in, uint32_t start_dim);`
`326`	`326`	`mli_status mli_chk_convert_tensor(mli_tensor *in, mli_tensor *out);`
`327`	`327`	`mli_status mli_chk_point_to_subtensor(const mli_tensor *in, const mli_point_to_subtsr_cfg *cfg, mli_tensor *out);`
	`328`	`+mli_status mli_chk_create_subtensor(const mli_tensor *in, const mli_sub_tensor_cfg *cfg, mli_tensor *out);`
`328`	`329`
`329`	`330`	`#ifdef __cplusplus`
`330`	`331`	`}`