foss-for-synopsys-dwc-arc-processors
diff --git a/‎include/api/mli_ref_compiler_api.hpp‎
Lines changed: 21 additions & 21 deletions b/‎include/api/mli_ref_compiler_api.hpp‎
Lines changed: 21 additions & 21 deletions
diff --git a/‎include/api/mli_ref_runtime_api.hpp‎
Lines changed: 1 addition & 1 deletion b/‎include/api/mli_ref_runtime_api.hpp‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎include/mli_compiler_api.hpp‎
Lines changed: 2 additions & 77 deletions b/‎include/mli_compiler_api.hpp‎
Lines changed: 2 additions & 77 deletions
diff --git a/‎include/mli_iterator.hpp‎
Lines changed: 25 additions & 3 deletions b/‎include/mli_iterator.hpp‎
Lines changed: 25 additions & 3 deletions
diff --git a/‎include/mli_kernels_factory.hpp‎
Lines changed: 24 additions & 2 deletions b/‎include/mli_kernels_factory.hpp‎
Lines changed: 24 additions & 2 deletions
diff --git a/‎include/mli_kernels_factory_ref.hpp‎
Lines changed: 17 additions & 2 deletions b/‎include/mli_kernels_factory_ref.hpp‎
Lines changed: 17 additions & 2 deletions
diff --git a/‎include/mli_types.hpp‎
Lines changed: 35 additions & 7 deletions b/‎include/mli_types.hpp‎
Lines changed: 35 additions & 7 deletions
@@ -35,7 +35,6 @@ class Conv2d_CS : public lib_mli::Conv2d_CS {
      * of all values in the related perception area of all channels of the input tensor.
      *
      * @deprecated
-     * Be carefull - you need to use another deprected method to set tiling - SetIterators
      * Be careful - conv2d I/O tensors of rank 4 are deprecated - new interfaces use rank 5
      * Be careful - this is the most deprecated Constructor
      *
@@ -165,18 +164,6 @@ class Conv2d_CS : public lib_mli::Conv2d_CS {
 
     unsigned GetRuntimeObjectSize() const override;
 
-    /**
-     * @deprecated
-     * Be carefull - conv2d I/O tensors of rank 4 are deprecated - new interfaces use rank 5
-     * Be carefull - don't use this method with new Conv2d_CS ctors - only with deprecated ctor that takes tensors
-     */
-    mli_status SetIterators(uint32_t output_total_size[4],
-                            uint32_t iteration_order[4],
-                            uint32_t input_first_inc[4],
-                            uint32_t input_inc[4],
-                            uint32_t output_first_inc[4],
-                            uint32_t output_inc[4],
-                            uint32_t weights_inc[4]) override;
 private:
 
     // Input, weights, weights zp(s), output tensors with offset buffer attached
@@ -1342,14 +1329,6 @@ class Clip_CS : public lib_mli::Clip_CS {
                                    const OffsetBuffer& encoded_params,
                                    const OffsetBuffer& descr) override;
 
-    /**
-     * @deprecated
-     */
-    mli_status SetIterators(uint32_t output_total_size[kClipIterRank],
-                            uint32_t iteration_order[kClipIterRank],
-                            uint32_t output_first_inc[kClipIterRank],
-                            uint32_t output_inc[kClipIterRank]) override;
-
 private:
     TensorIterator<OffsetBuffer, kClipRank, kClipIterRank> m_input;
     TensorIterator<OffsetBuffer, kClipRank, kClipIterRank> m_output;
@@ -1404,6 +1383,27 @@ class Prelu_CS : public lib_mli::Prelu_CS {
      * @param cfg [IN] PreluOpConfig structure
      * @param output [OUT] Output tensor (tile shape)
      */
+    Prelu_CS(const lib_mli::PlatformDescription pd,
+             const TensorIterator<NoBuffer, 4, 4> &input,
+             const PreluOpConfig &cfg,
+             const TensorIterator<NoBuffer, 4, 4> &output);
+
+    /**
+     * @brief Constructor to create a PReLU compiler support object.
+     *
+     * This constructor can be used to create a PReLU compiler support
+     * object. This kernel computes values of the output tensor scaled by
+     * the positive scale and shifted by the positive shift if the input value is
+     * greater than the input bias; otherwise it applies the negative scale and
+     * negative shift, for all values in the desired axis of the input tensor.
+     *
+     * @param pd        [IN]  Platform description
+     * @param input     [IN]  Input tensor (full shape)
+     * @param cfg       [IN]  PreluOpConfig structure
+     * @param enc_param [IN]  Encoded parameters tensor
+     * @param output    [OUT] Output tensor (tile shape)
+     */
     Prelu_CS(const lib_mli::PlatformDescription pd,
              const TensorIterator<NoBuffer, kPreluRank, kPreluIterRank> &input,
              const PreluOpConfig &cfg,
 
@@ -973,7 +973,7 @@ class Prelu : public ExecutionInterface {
     void GetIOSizesAndOffsets(uint32_t &enc_param_size, uint32_t &inp_bias_offset, 
                               uint32_t &posscale_offset, uint32_t &negscale_offset,
                               uint32_t &posshift_offset, uint32_t &negshift_offset, 
-                              uint32_t &out_bias_offset) const;
+                              uint32_t &out_bias_offset);
 
 private:
 
 
@@ -78,18 +78,6 @@ class CompilerGenericInterface {
         return MLI_STATUS_OK;
     }
 
-    /**
-     * @brief this function will return the vectorization in the input channel
-     *        dimension that is used by the platform.
-     */
-    virtual unsigned GetInputChannelMultiple() { return 1; };
-
-    /**
-     * @brief this function will return the vectorization in the output channel
-     *        dimension that is used by the platform.
-     */
-    virtual unsigned GetOutputChannelMultiple() { return 1; };
-
 // TODO add virtual destructor
 protected:
     bool m_issue_enable{false};
@@ -314,31 +302,6 @@ class Conv2d_CS : public CompilerGenericInterface {
                                             NOT_IMPLEMENTED_METHOD;
                                             return MLI_STATUS_OK; };
 
-    /**
-     * @brief Method to set iteration information used in the .Update()
-     *
-     * NOTE: the use of this method is optional. if there is a single tile, and the .Update() is not used,
-     *       this data doesn't need to be set.     
-     * All the increments are following the output tile iterator.
-     * @deprecated
-     * Be carefull - don't use this method with new Conv2d_CS ctors - only with deprecated ctor that takes tensors
-     *
-     * @param output_total_size[4] [I] total size in each dimension
-     * @param iteration_order[4] [I] which dimension of the output to iterate first.
-     * @param input_first_inc[4] [I] increment of the input buffer pointer for the first iteration in each dimension
-     * @param input_inc[4] [I] increment of the input buffer pointer for the other iterations in each dimension
-     * @param output_first_inc[4] [I] increment of the output buffer pointer for the first iteration in each dimension
-     * @param output_inc[4] [I] increment of the output buffer pointer for the other iterations in each dimension
-     * @param weights_inc[4] [I] increment of the weights buffer pointer for the other iterations in each dimension of the output iterator
-     */
-    virtual mli_status SetIterators(uint32_t output_total_size[4],
-                                    uint32_t iteration_order[4],
-                                    uint32_t input_first_inc[4],
-                                    uint32_t input_inc[4],
-                                    uint32_t output_first_inc[4],
-                                    uint32_t output_inc[4],
-                                    uint32_t weights_inc[4]) = 0;
-
 };
 
 /**
@@ -410,8 +373,8 @@ class Prelu_CS : public CompilerGenericInterface {
      * the weights buffer passed to the encode_weights function is in compiler memoryspace because the
      * encode function will write the encoded weights data there.
      */
-    virtual mli_status AttachBufferOffsets(Tensor<OffsetBuffer, kPreluRank> &input,
-                                           Tensor<OffsetBuffer, kPreluRank> &output,
+    virtual mli_status AttachBufferOffsets(Tensor<OffsetBuffer, 4> &input,
+                                           Tensor<OffsetBuffer, 4> &output,
                                            OffsetBuffer &params,
                                            OffsetBuffer &ctrl_buffer) { return MLI_STATUS_OK; }
 
@@ -434,26 +397,6 @@ class Prelu_CS : public CompilerGenericInterface {
                                            const OffsetBuffer &params,
                                            const OffsetBuffer &ctrl_buffer) { return MLI_STATUS_OK; }
 
-    /**
-     * @brief Method to set iteration information used in the .Update()
-     * @deprecated
-     *
-     * NOTE: the use of this method is optional. if there is a single tile, and the .Update() is not used,
-     *       this data doesn't need to be set.     
-     * All the increments are following the output tile iterator.
-     * @param output_total_size[4] [I] total size in each dimension
-     * @param iteration_order[4] [I] which dimension of the output to iterate first.
-     * @param input_first_inc[4] [I] increment of the input buffer pointer for the first iteration in each dimension
-     * @param input_inc[4] [I] increment of the input buffer pointer for the other iterations in each dimension
-     * @param output_first_inc[4] [I] increment of the output buffer pointer for the first iteration in each dimension
-     * @param output_inc[4] [I] increment of the output buffer pointer for the other iterations in each dimension
-     */
-    virtual mli_status SetIterators(uint32_t output_total_size[4],
-                                    uint32_t iteration_order[4],
-                                    uint32_t input_first_inc[4],
-                                    uint32_t input_inc[4],
-                                    uint32_t output_first_inc[4],
-                                    uint32_t output_inc[4]) { return MLI_STATUS_OK; }
 };
 
 
@@ -959,24 +902,6 @@ class Clip_CS : public CompilerGenericInterface {
                                            const OffsetBuffer& encoded_params,
                                            const OffsetBuffer& descr) = 0;
 
-    /**
-     * @brief Method to set iteration information used in the .Update()
-     *
-     * NOTE: the use of this method is optional. if there is a single tile, and the .Update() is not used,
-     *       this data doesn't need to be set.
-     * All the increments are following the output tile iterator.
-     * 
-     * @deprecated
-     * @param output_total_size[4] [I] total size in each dimension
-     * @param iteration_order[4] [I] which dimension of the output to iterate first.
-     * @param output_first_inc[4] [I] increment of the output buffer pointer for the first iteration in each dimension
-     * @param output_inc[4] [I] increment of the output buffer pointer for the other iterations in each dimension
-     */
-    virtual mli_status SetIterators(uint32_t output_total_size[kClipIterRank],
-                                    uint32_t iteration_order[kClipIterRank],
-                                    uint32_t output_first_inc[kClipIterRank],
-                                    uint32_t output_inc[kClipIterRank]) = 0;
-
 };
 
 /**
 
@@ -286,7 +286,15 @@ class IteratorCfg {
       }
     }
 
-    // Method to convert iterator granulatiries along certain dimension
+    /**
+     * @brief Convert the IteratorCfg granularities along a specific tensor dimension.
+     *
+     * @param tnsDim            [I] Tensor dimension to which vectorization is applied.
+     * @param old_vector_size   [I] Old vector size in bytes.
+     * @param new_vector_size   [I] New vector size in bytes.
+     */
     void ConvertGran(int32_t tnsDim, int32_t old_vector_size, int32_t new_vector_size) {
       for (uint32_t i = 0; i < iterRank; ++i) if (m_order[i] == tnsDim) {
         m_first_size[i] = CEIL_DIV(m_first_size[i] * old_vector_size, new_vector_size);
@@ -858,11 +866,25 @@ class TensorIterator {
     }
 
     void ApplyPrePadding(const uint32_t pre_padding[tensorRank]) {
-      m_config.ApplyPrePadding(pre_padding);
+      m_config.template ApplyPrePadding<tensorRank>(pre_padding);
     }
 
     void ApplyAlignsToSizes(const uint32_t aligns[tensorRank]) {
-      m_config.ApplyAlignsToSizes(aligns);
+      m_config.template ApplyAlignsToSizes<tensorRank>(aligns);
+    }
+
+    /**
+     * @brief Convert the TensorIterator granularity along the inner most dimension.
+     *
+     * The affected tensor dimension is get_rank() - 1 by default, or dimension 0
+     * when reverse_order is true. The iterator configuration is converted with
+     * the current get_elem_size() as the old vector size, and the same request
+     * is then forwarded to the full tensor.
+     *
+     * @param new_vector_size   [I] New Vector Size in bytes.
+     * @param reverse_order     [I] if false Vectorization is applied on the last dim, otherwise on first dim.
+     */
+    void ConvertGran(int new_vector_size, bool reverse_order = false) {
+      int tns_dim = reverse_order ? 0 : get_rank() - 1;  // tensor dimension whose granularity changes
+      m_config.ConvertGran(tns_dim, get_elem_size(), new_vector_size);  // NOTE(review): presumably has to run before the tensor call below, which may change the element size - confirm
+      m_full_tensor.ConvertGran(new_vector_size, reverse_order);  // the tensor derives its own dimension from reverse_order
     }
 
     /**
 
@@ -198,6 +198,7 @@ class KernelsFactory {
                                         int groups) { return nullptr; }
 
     /**
+     * @deprecated
      * @brief PReLU kernel Compiler Support interface factory
      * method
      *
@@ -212,11 +213,32 @@ class KernelsFactory {
      *
      * @return PReLU kernel Compiler Support interface object
      */
+    virtual lib_mli::Prelu_CS* Prelu_CS(void *kernel_buffer,  // overload taking rank-4 input/output iterators
+                                        const TensorIterator<NoBuffer, 4, 4> &input,
+                                        const PreluOpConfig &cfg,
+                                        const TensorIterator<NoBuffer, 4, 4> &output,
+                                        int groups) { return nullptr; }  // default implementation creates nothing and returns nullptr
+
+    /**
+     * @brief PReLU kernel Compiler Support interface factory
+     * method
+     *
+     * @param kernel_buffer [I] Pointer to the pre-allocated memory to store
+     *                          kernel Compiler Support object
+     * @param input         [I] TensorIterator object containing input Tensor shape and
+     *                          memory strides and IteratorCfg
+     * @param cfg           [I] Kernel configuration structure
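+     * @note NOTE(review): an `enc_param` TensorIterator (encoded parameters Tensor shape,
+     *       memory strides and IteratorCfg) was documented here, but the declaration
+     *       below takes only kernel_buffer, input, cfg and output - confirm and either
+     *       drop this entry or restore the parameter.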
+     * @param output        [I] TensorIterator object containing output Tensor shape
+     *                          and memory strides and IteratorCfg
+     *
+     * @return PReLU kernel Compiler Support interface object
+     */
     virtual lib_mli::Prelu_CS* Prelu_CS(void *kernel_buffer,
                                         const TensorIterator<NoBuffer, kPreluRank, kPreluIterRank> &input,
                                         const PreluOpConfig &cfg,
-                                        const TensorIterator<NoBuffer, kPreluRank, kPreluIterRank> &output,
-                                        int groups) { return nullptr; }                                    
+                                        const TensorIterator<NoBuffer, kPreluRank, kPreluIterRank> &output) { return nullptr; }                                                                 
 
     /**
      * @brief Prelu kernel Compiler Support interface
 
@@ -85,10 +85,14 @@ class KernelsFactory : public lib_mli::KernelsFactory {
 
     uint32_t Prelu_CS_GetSize() const override { return sizeof(lib_ref::Prelu_CS); }
 
+    /**
+     * @deprecated
+     * Be careful - Prelu I/O tensors of rank 4 are deprecated - new interfaces use rank 5
+     */
     lib_mli::Prelu_CS* Prelu_CS(void *kernel_buffer,
-                                const TensorIterator<NoBuffer, kPreluRank, kPreluIterRank> &input,
+                                const TensorIterator<NoBuffer, 4, 4> &input,
                                 const PreluOpConfig &cfg,
-                                const TensorIterator<NoBuffer, kPreluRank, kPreluIterRank> &output,
+                                const TensorIterator<NoBuffer, 4, 4> &output,
                                 int groups) override {
         /**
          * The MLI classes need to be 32 bit aligned
@@ -98,6 +102,17 @@ class KernelsFactory : public lib_mli::KernelsFactory {
         return new(kernel_buffer) lib_ref::Prelu_CS(m_pd, input, cfg, output);
     }
 
+    lib_mli::Prelu_CS* Prelu_CS(void *kernel_buffer,
+                                const TensorIterator<NoBuffer, kPreluRank, kPreluIterRank> &input,
+                                const PreluOpConfig &cfg,
+                                const TensorIterator<NoBuffer, kPreluRank, kPreluIterRank> &output) override {
+        /**
+         * Constructs a lib_ref::Prelu_CS inside the caller-provided kernel_buffer
+         * (placement new, no allocation here) and returns it through the
+         * lib_mli::Prelu_CS interface pointer.
+         *
+         * The MLI classes need to be 32 bit aligned
+         */
+        assert(kernel_buffer != nullptr);  // the caller must supply the storage
+        assert(((size_t) kernel_buffer % kMliAlignment) == 0);  // the buffer address must be a multiple of kMliAlignment
+        return new(kernel_buffer) lib_ref::Prelu_CS(m_pd, input, cfg, output); 
+    }
     uint32_t Move_CS_GetSize() const override { return sizeof(lib_ref::Move_CS); }
 
     lib_mli::Move_CS* Move_CS(void *kernel_buffer,
 
@@ -11,7 +11,7 @@
 
 #include <stdint.h>
 #include <assert.h>
-
+#include "mli_math_macros.h"
 
 namespace snps_arc::metaware::mli {
 
@@ -109,10 +109,9 @@ constexpr unsigned kRescaleRank = 4;
 constexpr unsigned kRescaleIterRank = 4;
 constexpr unsigned kRescaleParamRank = 1;
 
-constexpr unsigned kPreluRank = 4;
-constexpr unsigned kPreluIterRank = 4;
+constexpr unsigned kPreluRank = 5;
+constexpr unsigned kPreluIterRank = 5;
 constexpr unsigned kPreluParamRank = 2;
-constexpr unsigned kPreluParamIterRank = 2;
 
 constexpr unsigned kPoolRank = 4;
 constexpr unsigned kPoolIterRank = 4;
@@ -778,17 +777,21 @@ class Tensor {
   }
 
   // combine 'axis' and 'axis'+1 dimensions into one dimension if possible
-  Tensor<buf_T, maxRank-1> combine(uint32_t axis) const {
+  Tensor<buf_T, maxRank-1> combine(uint32_t axis, bool reverse_order = true) const {
     assert(axis < maxRank - 1);
     Tensor<buf_T, maxRank-1> tns;
     int s = 0;
-    for (int r = 0; r < maxRank; ++r) {
+    for (uint32_t r = 0; r < maxRank; ++r) {
       if (r < axis || r > axis) {
         tns.set_dim(s, shape_[r]);
         tns.set_mem_stride(s, mem_stride_[r]);
       } else {
         // combine 2 adjacent axis into 1
-        assert(mem_stride_[r+1] == shape_[r]*mem_stride_[r]);
+        if (reverse_order) {
+          assert(mem_stride_[r+1] == (int)shape_[r]*mem_stride_[r]);
+        } else {
+          assert(mem_stride_[r] == (int)shape_[r+1]*mem_stride_[r+1]);
+        }
         tns.set_dim(s, shape_[r]*shape_[r+1]);
         tns.set_mem_stride(s, mem_stride_[r]);
         ++r;
@@ -801,6 +804,31 @@ class Tensor {
     return tns;
   }
 
+  /**
+   * @brief Convert the Tensor granularity along the inner most dimension.
+   *
+   * The buffer element size is replaced by new_vector_size and the buffer size
+   * is rescaled by old element size / new_vector_size. The shape of the selected
+   * dimension (last one by default, dimension 0 when reverse_order is true) is
+   * rescaled with CEIL_DIV, and the memory strides of all other dimensions are
+   * rescaled to match. The selected dimension is asserted to have a memory
+   * stride of 1.
+   *
+   * @param new_vector_size   [I] New Vector Size in bytes.
+   * @param reverse_order     [I] if false Vectorization is applied on the last dim, otherwise on first dim.
+   */
+  void ConvertGran(int new_vector_size, bool reverse_order = false) {
+    int tns_dim = reverse_order ? 0 : rank_ - 1;  // dimension whose extent is rescaled
+    uint32_t old_vector_size = buf_.get_elem_size();  // captured before the element size is overwritten on the next line
+    buf_.set_elem_size(new_vector_size);
+    buf_.set_size(buf_.get_size()*old_vector_size/new_vector_size);  // plain integer division here, whereas the shape below uses CEIL_DIV
+    assert(mem_stride_[tns_dim] == 1);  // the converted dimension is required to be unit-stride
+    shape_[tns_dim] = CEIL_DIV(shape_[tns_dim] * old_vector_size, new_vector_size);
+    if (rank_ > 1) {  // strides of the other dimensions exist only for rank > 1
+      int32_t oldstride = mem_stride_[reverse_order ? tns_dim + 1 : tns_dim - 1];  // stride of the dimension adjacent to tns_dim
+      int32_t newstride = CEIL_DIV(oldstride * old_vector_size, new_vector_size);
+      // NOTE(review): the integer division by oldstride below is exact only if every
+      // rescaled stride is a multiple of oldstride - confirm this holds for all callers.
+      if (reverse_order)
+        for (int i = 1; i < rank_; ++i) mem_stride_[i] = mem_stride_[i] / oldstride * newstride;  // dimensions 1 .. rank_-1
+      else
+        for (int i = rank_ - 2; i >= 0; --i) mem_stride_[i] = mem_stride_[i] / oldstride * newstride;  // dimensions rank_-2 .. 0
+    }
+  }
+
   template <typename T>
   T read(uint32_t offset) const {
     return buf_.template read<T>(offset + offset_*get_elem_size()/sizeof(T));