Skip to content

Commit e7eae75

Browse files
essayedJaccovG
authored and committed
Conv2d/Prelu/ConvDW Updates
1 parent b26da3a commit e7eae75

File tree

14 files changed

+371
-339
lines changed

14 files changed

+371
-339
lines changed

include/api/mli_ref_compiler_api.hpp

Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,6 @@ class Conv2d_CS : public lib_mli::Conv2d_CS {
3535
* of all values in the related perception area of all channels of the input tensor.
3636
*
3737
* @deprecated
38-
* Be carefull - you need to use another deprected method to set tiling - SetIterators
3938
* Be careful - conv2d I/O tensors of rank 4 are deprecated - new interfaces use rank 5
4039
* Be careful - this is the most deprecated Constructor
4140
*
@@ -165,18 +164,6 @@ class Conv2d_CS : public lib_mli::Conv2d_CS {
165164

166165
unsigned GetRuntimeObjectSize() const override;
167166

168-
/**
169-
* @deprecated
170-
* Be carefull - conv2d I/O tensors of rank 4 are deprecated - new interfaces use rank 5
171-
* Be carefull - don't use this method with new Conv2d_CS ctors - only with deprecated ctor that takes tensors
172-
*/
173-
mli_status SetIterators(uint32_t output_total_size[4],
174-
uint32_t iteration_order[4],
175-
uint32_t input_first_inc[4],
176-
uint32_t input_inc[4],
177-
uint32_t output_first_inc[4],
178-
uint32_t output_inc[4],
179-
uint32_t weights_inc[4]) override;
180167
private:
181168

182169
// Input, weights, weights zp(s), output tensors with offset buffer attached
@@ -1342,14 +1329,6 @@ class Clip_CS : public lib_mli::Clip_CS {
13421329
const OffsetBuffer& encoded_params,
13431330
const OffsetBuffer& descr) override;
13441331

1345-
/**
1346-
* @deprecated
1347-
*/
1348-
mli_status SetIterators(uint32_t output_total_size[kClipIterRank],
1349-
uint32_t iteration_order[kClipIterRank],
1350-
uint32_t output_first_inc[kClipIterRank],
1351-
uint32_t output_inc[kClipIterRank]) override;
1352-
13531332
private:
13541333
TensorIterator<OffsetBuffer, kClipRank, kClipIterRank> m_input;
13551334
TensorIterator<OffsetBuffer, kClipRank, kClipIterRank> m_output;
@@ -1404,6 +1383,27 @@ class Prelu_CS : public lib_mli::Prelu_CS {
14041383
* @param cfg [IN] PreluOpConfig structure
14051384
* @param output [OUT] Output tensor (tile shape)
14061385
*/
1386+
Prelu_CS(const lib_mli::PlatformDescription pd,
1387+
const TensorIterator<NoBuffer, 4, 4> &input,
1388+
const PreluOpConfig &cfg,
1389+
const TensorIterator<NoBuffer, 4, 4> &output);
1390+
1391+
/**
1392+
* @brief Constructor to create a PReLU compiler support object.
1393+
*
1394+
* This constructor can be used to create a PReLU compiler support
1395+
* object. This kernel computes values of the output tensor scaled by
1396+
* positive scale and shifted by positive shift if the input value is
1397+
* greater than the input bias,
1398+
* otherwise it will apply negative scale and negative shift
1399+
* for all values in the desired axis of the input tensor
1400+
*
1401+
* @param pd [IN] Platform description
1402+
* @param input [IN] Input tensor (full shape)
1403+
* @param cfg [IN] PreluOpConfig structure
1404+
* @param enc_param [IN] Encoded parameters tensor
1405+
* @param output [OUT] Output tensor (tile shape)
1406+
*/
14071407
Prelu_CS(const lib_mli::PlatformDescription pd,
14081408
const TensorIterator<NoBuffer, kPreluRank, kPreluIterRank> &input,
14091409
const PreluOpConfig &cfg,

include/api/mli_ref_runtime_api.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -973,7 +973,7 @@ class Prelu : public ExecutionInterface {
973973
void GetIOSizesAndOffsets(uint32_t &enc_param_size, uint32_t &inp_bias_offset,
974974
uint32_t &posscale_offset, uint32_t &negscale_offset,
975975
uint32_t &posshift_offset, uint32_t &negshift_offset,
976-
uint32_t &out_bias_offset) const;
976+
uint32_t &out_bias_offset);
977977

978978
private:
979979

include/mli_compiler_api.hpp

Lines changed: 2 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -78,18 +78,6 @@ class CompilerGenericInterface {
7878
return MLI_STATUS_OK;
7979
}
8080

81-
/**
82-
* @brief this function will return the vectorization in the input channel
83-
* dimension that is used by the platform.
84-
*/
85-
virtual unsigned GetInputChannelMultiple() { return 1; };
86-
87-
/**
88-
* @brief this function will return the vectorization in the output channel
89-
* dimension that is used by the platform.
90-
*/
91-
virtual unsigned GetOutputChannelMultiple() { return 1; };
92-
9381
// TODO add virtual destructor
9482
protected:
9583
bool m_issue_enable{false};
@@ -314,31 +302,6 @@ class Conv2d_CS : public CompilerGenericInterface {
314302
NOT_IMPLEMENTED_METHOD;
315303
return MLI_STATUS_OK; };
316304

317-
/**
318-
* @brief Method to set iteration information used in the .Update()
319-
*
320-
* NOTE: the use of this method is optional. if there is a single tile, and the .Update() is not used,
321-
* this data doesn't need to be set.
322-
* All the increments are following the output tile iterator.
323-
* @deprecated
324-
* Be carefull - don't use this method with new Conv2d_CS ctors - only with deprecated ctor that takes tensors
325-
*
326-
* @param output_total_size[4] [I] total size in each dimension
327-
* @param iteration_order[4] [I] which dimension of the output to iterate first.
328-
* @param input_first_inc[4] [I] increment of the input buffer pointer for the first iteration in each dimension
329-
* @param input_inc[4] [I] increment of the input buffer pointer for the other iterations in each dimension
330-
* @param output_first_inc[4] [I] increment of the output buffer pointer for the first iteration in each dimension
331-
* @param output_inc[4] [I] increment of the output buffer pointer for the other iterations in each dimension
332-
* @param weights_inc[4] [I] increment of the weights buffer pointer for the other iterations in each dimension of the output iterator
333-
*/
334-
virtual mli_status SetIterators(uint32_t output_total_size[4],
335-
uint32_t iteration_order[4],
336-
uint32_t input_first_inc[4],
337-
uint32_t input_inc[4],
338-
uint32_t output_first_inc[4],
339-
uint32_t output_inc[4],
340-
uint32_t weights_inc[4]) = 0;
341-
342305
};
343306

344307
/**
@@ -410,8 +373,8 @@ class Prelu_CS : public CompilerGenericInterface {
410373
* the weights buffer passed to the encode_weights function is in compiler memoryspace because the
411374
* encode function will write the encoded weights data there.
412375
*/
413-
virtual mli_status AttachBufferOffsets(Tensor<OffsetBuffer, kPreluRank> &input,
414-
Tensor<OffsetBuffer, kPreluRank> &output,
376+
virtual mli_status AttachBufferOffsets(Tensor<OffsetBuffer, 4> &input,
377+
Tensor<OffsetBuffer, 4> &output,
415378
OffsetBuffer &params,
416379
OffsetBuffer &ctrl_buffer) { return MLI_STATUS_OK; }
417380

@@ -434,26 +397,6 @@ class Prelu_CS : public CompilerGenericInterface {
434397
const OffsetBuffer &params,
435398
const OffsetBuffer &ctrl_buffer) { return MLI_STATUS_OK; }
436399

437-
/**
438-
* @brief Method to set iteration information used in the .Update()
439-
* @deprecated
440-
*
441-
* NOTE: the use of this method is optional. if there is a single tile, and the .Update() is not used,
442-
* this data doesn't need to be set.
443-
* All the increments are following the output tile iterator.
444-
* @param output_total_size[4] [I] total size in each dimension
445-
* @param iteration_order[4] [I] which dimension of the output to iterate first.
446-
* @param input_first_inc[4] [I] increment of the input buffer pointer for the first iteration in each dimension
447-
* @param input_inc[4] [I] increment of the input buffer pointer for the other iterations in each dimension
448-
* @param output_first_inc[4] [I] increment of the output buffer pointer for the first iteration in each dimension
449-
* @param output_inc[4] [I] increment of the output buffer pointer for the other iterations in each dimension
450-
*/
451-
virtual mli_status SetIterators(uint32_t output_total_size[4],
452-
uint32_t iteration_order[4],
453-
uint32_t input_first_inc[4],
454-
uint32_t input_inc[4],
455-
uint32_t output_first_inc[4],
456-
uint32_t output_inc[4]) { return MLI_STATUS_OK; }
457400
};
458401

459402

@@ -959,24 +902,6 @@ class Clip_CS : public CompilerGenericInterface {
959902
const OffsetBuffer& encoded_params,
960903
const OffsetBuffer& descr) = 0;
961904

962-
/**
963-
* @brief Method to set iteration information used in the .Update()
964-
*
965-
* NOTE: the use of this method is optional. if there is a single tile, and the .Update() is not used,
966-
* this data doesn't need to be set.
967-
* All the increments are following the output tile iterator.
968-
*
969-
* @deprecated
970-
* @param output_total_size[4] [I] total size in each dimension
971-
* @param iteration_order[4] [I] which dimension of the output to iterate first.
972-
* @param output_first_inc[4] [I] increment of the output buffer pointer for the first iteration in each dimension
973-
* @param output_inc[4] [I] increment of the output buffer pointer for the other iterations in each dimension
974-
*/
975-
virtual mli_status SetIterators(uint32_t output_total_size[kClipIterRank],
976-
uint32_t iteration_order[kClipIterRank],
977-
uint32_t output_first_inc[kClipIterRank],
978-
uint32_t output_inc[kClipIterRank]) = 0;
979-
980905
};
981906

982907
/**

include/mli_iterator.hpp

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -286,7 +286,15 @@ class IteratorCfg {
286286
}
287287
}
288288

289-
// Method to convert iterator granulatiries along certain dimension
289+
/**
290+
* @brief
291+
*
292+
* Method to convert IteratorCfg granularities along specific dim
293+
*
294+
* @param tnsDim [I] Tensor dimension to apply vectorization to.
295+
* @param old_vector_size [I] Old Vector Size in bytes.
296+
* @param new_vector_size [I] New Vector Size in bytes.
297+
*/
290298
void ConvertGran(int32_t tnsDim, int32_t old_vector_size, int32_t new_vector_size) {
291299
for (uint32_t i = 0; i < iterRank; ++i) if (m_order[i] == tnsDim) {
292300
m_first_size[i] = CEIL_DIV(m_first_size[i] * old_vector_size, new_vector_size);
@@ -858,11 +866,25 @@ class TensorIterator {
858866
}
859867

860868
void ApplyPrePadding(const uint32_t pre_padding[tensorRank]) {
861-
m_config.ApplyPrePadding(pre_padding);
869+
m_config.template ApplyPrePadding<tensorRank>(pre_padding);
862870
}
863871

864872
void ApplyAlignsToSizes(const uint32_t aligns[tensorRank]) {
865-
m_config.ApplyAlignsToSizes(aligns);
873+
m_config.template ApplyAlignsToSizes<tensorRank>(aligns);
874+
}
875+
876+
/**
877+
* @brief
878+
*
879+
* Method to convert TensorIterator granularities along the innermost dimension
880+
*
881+
* @param new_vector_size [I] New Vector Size in bytes.
882+
* @param reverse_order [I] If false, vectorization is applied on the last dim; otherwise on the first dim.
883+
*/
884+
void ConvertGran(int new_vector_size, bool reverse_order = false) {
885+
int tns_dim = reverse_order ? 0 : get_rank() - 1;
886+
m_config.ConvertGran(tns_dim, get_elem_size(), new_vector_size);
887+
m_full_tensor.ConvertGran(new_vector_size, reverse_order);
866888
}
867889

868890
/**

include/mli_kernels_factory.hpp

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,7 @@ class KernelsFactory {
198198
int groups) { return nullptr; }
199199

200200
/**
201+
* @deprecated
201202
* @brief PReLU kernel Compiler Support interface factory
202203
* method
203204
*
@@ -212,11 +213,32 @@ class KernelsFactory {
212213
*
213214
* @return PReLU kernel Compiler Support interface object
214215
*/
216+
virtual lib_mli::Prelu_CS* Prelu_CS(void *kernel_buffer,
217+
const TensorIterator<NoBuffer, 4, 4> &input,
218+
const PreluOpConfig &cfg,
219+
const TensorIterator<NoBuffer, 4, 4> &output,
220+
int groups) { return nullptr; }
221+
222+
/**
223+
* @brief PReLU kernel Compiler Support interface factory
224+
* method
225+
*
226+
* @param kernel_buffer [I] Pointer to the pre-allocated memory to store
227+
* kernel Compiler Support object
228+
* @param input [I] TensorIterator object containing input Tensor shape and
229+
* memory strides and IteratorCfg
230+
* @param cfg [I] Kernel configuration structure
231+
* @param enc_param [I] TensorIterator object containing encoded parameters Tensor shape
232+
* and memory strides and IteratorCfg
233+
* @param output [I] TensorIterator object containing output Tensor shape
234+
* and memory strides and IteratorCfg
235+
*
236+
* @return PReLU kernel Compiler Support interface object
237+
*/
215238
virtual lib_mli::Prelu_CS* Prelu_CS(void *kernel_buffer,
216239
const TensorIterator<NoBuffer, kPreluRank, kPreluIterRank> &input,
217240
const PreluOpConfig &cfg,
218-
const TensorIterator<NoBuffer, kPreluRank, kPreluIterRank> &output,
219-
int groups) { return nullptr; }
241+
const TensorIterator<NoBuffer, kPreluRank, kPreluIterRank> &output) { return nullptr; }
220242

221243
/**
222244
* @brief Prelu kernel Compiler Support interface

include/mli_kernels_factory_ref.hpp

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,10 +85,14 @@ class KernelsFactory : public lib_mli::KernelsFactory {
8585

8686
uint32_t Prelu_CS_GetSize() const override { return sizeof(lib_ref::Prelu_CS); }
8787

88+
/**
89+
* @deprecated
90+
* Be careful - Prelu I/O tensors of rank 4 are deprecated - new interfaces use rank 5
91+
*/
8892
lib_mli::Prelu_CS* Prelu_CS(void *kernel_buffer,
89-
const TensorIterator<NoBuffer, kPreluRank, kPreluIterRank> &input,
93+
const TensorIterator<NoBuffer, 4, 4> &input,
9094
const PreluOpConfig &cfg,
91-
const TensorIterator<NoBuffer, kPreluRank, kPreluIterRank> &output,
95+
const TensorIterator<NoBuffer, 4, 4> &output,
9296
int groups) override {
9397
/**
9498
* The MLI classes need to be 32 bit aligned
@@ -98,6 +102,17 @@ class KernelsFactory : public lib_mli::KernelsFactory {
98102
return new(kernel_buffer) lib_ref::Prelu_CS(m_pd, input, cfg, output);
99103
}
100104

105+
lib_mli::Prelu_CS* Prelu_CS(void *kernel_buffer,
106+
const TensorIterator<NoBuffer, kPreluRank, kPreluIterRank> &input,
107+
const PreluOpConfig &cfg,
108+
const TensorIterator<NoBuffer, kPreluRank, kPreluIterRank> &output) override {
109+
/**
110+
* The MLI classes need to be 32 bit aligned
111+
*/
112+
assert(kernel_buffer != nullptr);
113+
assert(((size_t) kernel_buffer % kMliAlignment) == 0);
114+
return new(kernel_buffer) lib_ref::Prelu_CS(m_pd, input, cfg, output);
115+
}
101116
uint32_t Move_CS_GetSize() const override { return sizeof(lib_ref::Move_CS); }
102117

103118
lib_mli::Move_CS* Move_CS(void *kernel_buffer,

include/mli_types.hpp

Lines changed: 35 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212
#include <stdint.h>
1313
#include <assert.h>
14-
14+
#include "mli_math_macros.h"
1515

1616
namespace snps_arc::metaware::mli {
1717

@@ -109,10 +109,9 @@ constexpr unsigned kRescaleRank = 4;
109109
constexpr unsigned kRescaleIterRank = 4;
110110
constexpr unsigned kRescaleParamRank = 1;
111111

112-
constexpr unsigned kPreluRank = 4;
113-
constexpr unsigned kPreluIterRank = 4;
112+
constexpr unsigned kPreluRank = 5;
113+
constexpr unsigned kPreluIterRank = 5;
114114
constexpr unsigned kPreluParamRank = 2;
115-
constexpr unsigned kPreluParamIterRank = 2;
116115

117116
constexpr unsigned kPoolRank = 4;
118117
constexpr unsigned kPoolIterRank = 4;
@@ -778,17 +777,21 @@ class Tensor {
778777
}
779778

780779
// combine 'axis' and 'axis'+1 dimensions into one dimension if possible
781-
Tensor<buf_T, maxRank-1> combine(uint32_t axis) const {
780+
Tensor<buf_T, maxRank-1> combine(uint32_t axis, bool reverse_order = true) const {
782781
assert(axis < maxRank - 1);
783782
Tensor<buf_T, maxRank-1> tns;
784783
int s = 0;
785-
for (int r = 0; r < maxRank; ++r) {
784+
for (uint32_t r = 0; r < maxRank; ++r) {
786785
if (r < axis || r > axis) {
787786
tns.set_dim(s, shape_[r]);
788787
tns.set_mem_stride(s, mem_stride_[r]);
789788
} else {
790789
// combine 2 adjacent axes into 1
791-
assert(mem_stride_[r+1] == shape_[r]*mem_stride_[r]);
790+
if (reverse_order) {
791+
assert(mem_stride_[r+1] == (int)shape_[r]*mem_stride_[r]);
792+
} else {
793+
assert(mem_stride_[r] == (int)shape_[r+1]*mem_stride_[r+1]);
794+
}
792795
tns.set_dim(s, shape_[r]*shape_[r+1]);
793796
tns.set_mem_stride(s, mem_stride_[r]);
794797
++r;
@@ -801,6 +804,31 @@ class Tensor {
801804
return tns;
802805
}
803806

807+
/**
808+
* @brief
809+
*
810+
* Method to convert Tensor granularities along the innermost dimension
811+
*
812+
* @param new_vector_size [I] New Vector Size in bytes.
813+
* @param reverse_order [I] If false, vectorization is applied on the last dim; otherwise on the first dim.
814+
*/
815+
void ConvertGran(int new_vector_size, bool reverse_order = false) {
816+
int tns_dim = reverse_order ? 0 : rank_ - 1;
817+
uint32_t old_vector_size = buf_.get_elem_size();
818+
buf_.set_elem_size(new_vector_size);
819+
buf_.set_size(buf_.get_size()*old_vector_size/new_vector_size);
820+
assert(mem_stride_[tns_dim] == 1);
821+
shape_[tns_dim] = CEIL_DIV(shape_[tns_dim] * old_vector_size, new_vector_size);
822+
if (rank_ > 1) {
823+
int32_t oldstride = mem_stride_[reverse_order ? tns_dim + 1 : tns_dim - 1];
824+
int32_t newstride = CEIL_DIV(oldstride * old_vector_size, new_vector_size);
825+
if (reverse_order)
826+
for (int i = 1; i < rank_; ++i) mem_stride_[i] = mem_stride_[i] / oldstride * newstride;
827+
else
828+
for (int i = rank_ - 2; i >= 0; --i) mem_stride_[i] = mem_stride_[i] / oldstride * newstride;
829+
}
830+
}
831+
804832
template <typename T>
805833
T read(uint32_t offset) const {
806834
return buf_.template read<T>(offset + offset_*get_elem_size()/sizeof(T));

0 commit comments

Comments
 (0)