Skip to content

Commit 297fce6

Browse files
hanafyessayed
authored and committed
Adding Tiling for SumPool2D ref kernel
1 parent aed7477 commit 297fce6

File tree

13 files changed

+574
-412
lines changed

13 files changed

+574
-412
lines changed

include/api/mli_ref_compiler_api.hpp

Lines changed: 43 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -455,9 +455,9 @@ class MaxPool2D_CS : public lib_mli::MaxPool2D_CS {
455455
* @param output_tile_shape [O] Output tensor (tile shape, BHWC layout)
456456
*/
457457
MaxPool2D_CS(const lib_mli::PlatformDescription pd,
458-
const Tensor<NoBuffer, kMaxpoolRank> in,
458+
const Tensor<NoBuffer, kPoolRank> in,
459459
const PoolOpConfig &cfg,
460-
const Tensor<NoBuffer, kMaxpoolRank> output_tile_shape);
460+
const Tensor<NoBuffer, kPoolRank> output_tile_shape);
461461

462462
/**
463463
* @brief Constructor to create a MaxPool2D compiler support object.
@@ -472,21 +472,16 @@ class MaxPool2D_CS : public lib_mli::MaxPool2D_CS {
472472
* @param out [O] Output tensor iterator (BHWC layout)
473473
*/
474474
MaxPool2D_CS(const lib_mli::PlatformDescription pd,
475-
const TensorIterator<NoBuffer, kMaxpoolRank, kMaxpoolIterRank> in,
475+
const TensorIterator<NoBuffer, kPoolRank, kPoolIterRank> in,
476476
const PoolOpConfig& cfg,
477-
const TensorIterator<NoBuffer, kMaxpoolRank, kMaxpoolIterRank> out);
477+
const TensorIterator<NoBuffer, kPoolRank, kPoolIterRank> out);
478478

479479
unsigned GetKernelPrivateDataSize() const override;
480480
unsigned GetRuntimeObjectSize() const override;
481-
482-
/**
483-
* Tensor buffer sizes could depend on the platform and/or parameters.
484-
* These functions can be used to query how much memory needs to be allocated for
485-
* the input, weights and output tensors.
486-
* Note, that these sizes are for full tensors, not tiles.
487-
*/
481+
488482
unsigned GetInputBufferSize() const override;
489483
unsigned GetOutputBufferSize() const override;
484+
490485
/**
491486
* @return Always returns zero for reference kernel.
492487
*/
@@ -496,55 +491,72 @@ class MaxPool2D_CS : public lib_mli::MaxPool2D_CS {
496491
/**
497492
* @deprecated
498493
*/
499-
mli_status AttachBufferOffsets(const Tensor<OffsetBuffer, kMaxpoolRank> &input,
500-
const Tensor<OffsetBuffer, kMaxpoolRank> &output,
494+
mli_status AttachBufferOffsets(const Tensor<OffsetBuffer, kPoolRank> &input,
495+
const Tensor<OffsetBuffer, kPoolRank> &output,
501496
const OffsetBuffer &ctrl_buffer) override;
502497

503498
mli_status AttachBufferOffsets(const OffsetBuffer& input,
504499
const OffsetBuffer& output,
505500
const OffsetBuffer& ctrl_buffer) override;
506501

507502
private:
508-
TensorIterator<OffsetBuffer, kMaxpoolRank, kMaxpoolIterRank> m_input;
509-
TensorIterator<OffsetBuffer, kMaxpoolRank, kMaxpoolIterRank> m_output;
503+
TensorIterator<OffsetBuffer, kPoolRank, kPoolIterRank> m_input;
504+
TensorIterator<OffsetBuffer, kPoolRank, kPoolIterRank> m_output;
510505

511506
PoolOpConfig m_config;
512507

513-
uint32_t m_input_buffer_size;
514-
uint32_t m_output_buffer_size;
515-
516508
lib_mli::PlatformDescription m_pd;
517509
};
518510

519511
class SumPool2D_CS : public lib_mli::SumPool2D_CS {
520512
public:
521-
522513
SumPool2D_CS(const lib_mli::PlatformDescription pd,
523-
const Tensor<NoBuffer, 4> in,
514+
const Tensor<NoBuffer, kPoolRank> in,
524515
const PoolOpConfig &cfg,
525-
const Tensor<NoBuffer, 4> output_tile_shape);
516+
const Tensor<NoBuffer, kPoolRank> output_tile_shape);
517+
/**
518+
* @brief Constructor to create a SumPool2D compiler support object.
519+
*
520+
* This constructor can be used to create a Sum Pooling 2D compiler support
521+
* object. This kernel computes each value of the output tensor as the sum
522+
* of all values in the related perception area of a single channel of the input tensor.
523+
*
524+
* @param pd [I] Platform description
525+
* @param in [I] Input tensor iterator (BHWC layout)
526+
* @param cfg [I] PoolOpConfig structure
527+
* @param out [O] Output tensor iterator (BHWC layout)
528+
*/
529+
SumPool2D_CS(const lib_mli::PlatformDescription pd,
530+
const TensorIterator<NoBuffer, kPoolRank, kPoolIterRank> &in,
531+
const PoolOpConfig &cfg,
532+
const TensorIterator<NoBuffer, kPoolRank, kPoolIterRank> &out);
526533

527534
// From CompilerGenericInterface
528535
unsigned GetKernelPrivateDataSize() const override;
529536
unsigned GetRuntimeObjectSize() const override;
530-
mli_status GetKernelPrivateData(void* kernel_private_data_buffer) override;
531-
mli_status AttachBufferOffsets(const Tensor<OffsetBuffer, 4> &input,
532-
const Tensor<OffsetBuffer, 4> &output,
533-
const OffsetBuffer &ctrl_buffer) override;
534537

535-
// From SumPool2D_CS
536538
unsigned GetInputBufferSize() const override;
537539
unsigned GetOutputBufferSize() const override;
538540

541+
mli_status GetKernelPrivateData(void* kernel_private_data_buffer) override;
542+
543+
/**
544+
* @deprecated
545+
*/
546+
mli_status AttachBufferOffsets(const Tensor<OffsetBuffer, kPoolRank> &input,
547+
const Tensor<OffsetBuffer, kPoolRank> &output,
548+
const OffsetBuffer &ctrl_buffer) override;
549+
550+
mli_status AttachBufferOffsets(const OffsetBuffer& input,
551+
const OffsetBuffer& output,
552+
const OffsetBuffer& ctrl_buffer) override;
553+
539554
private:
540-
Tensor<OffsetBuffer, 4> m_in;
541-
Tensor<OffsetBuffer, 4> m_output;
555+
TensorIterator<OffsetBuffer, kPoolRank, kPoolIterRank> m_input;
556+
TensorIterator<OffsetBuffer, kPoolRank, kPoolIterRank> m_output;
542557

543558
PoolOpConfig m_config;
544559

545-
uint32_t m_input_buffer_size;
546-
uint32_t m_output_buffer_size;
547-
548560
lib_mli::PlatformDescription m_pd;
549561
};
550562

include/api/mli_ref_runtime_api.hpp

Lines changed: 45 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -284,14 +284,14 @@ class MaxPool2D : public ExecutionInterface {
284284
mli_status Update() override;
285285

286286
// TODO: remove this method and replace with usage of Move kernel (not possible now)
287-
void GetIOSizesAndOffsets(uint32_t input_size[kMaxpoolRank], uint32_t output_size[kMaxpoolRank],
288-
int32_t input_offsets[kMaxpoolRank], int32_t output_offsets[kMaxpoolRank]);
287+
void GetIOSizesAndOffsets(uint32_t input_size[kPoolRank], uint32_t output_size[kPoolRank],
288+
int32_t input_offsets[kPoolRank], int32_t output_offsets[kPoolRank]);
289289

290290
private:
291291
void UpdateTilePaddings();
292292

293-
TensorIterator<OffsetBuffer, kMaxpoolRank, kMaxpoolIterRank> m_input;
294-
TensorIterator<OffsetBuffer, kMaxpoolRank, kMaxpoolIterRank> m_output;
293+
TensorIterator<OffsetBuffer, kPoolRank, kPoolIterRank> m_input;
294+
TensorIterator<OffsetBuffer, kPoolRank, kPoolIterRank> m_output;
295295

296296
mli_pool_cfg m_cfg;
297297
int32_t m_input_batch_offset;
@@ -356,6 +356,30 @@ class FullyConnected : public ExecutionInterface {
356356
class SumPool2D : public ExecutionInterface {
357357

358358
public:
359+
/**
360+
* @brief Construct a new Sum Pooling 2D object
361+
*
362+
* This method will create and initialize the Sum Pooling 2D object using the information
363+
* stored in the kernel_private_data_buffer that has been computed at compile time
364+
* by the GetKernelPrivateData() method of SumPool2D_CS class
365+
*
366+
* This kernel computes each value of the output tensor as the sum
367+
* of all values in the related perception area of a single channel of the input tensor.
368+
*
369+
* @param kernel_private_data_buffer [I] Pointer to the compilation time computed initialization data.
370+
* @param size [I] Size of the data is used to check for coding errors.
371+
* @param membases[] [I] The kernel private data may contain offsets inside a (vector) memory.
372+
* At run-time specific locations in memory are allocated for
373+
* the graph, the membase array contains the start of
374+
* each memory region.
375+
* This base will be added to all memory offsets in the constructor
376+
* according to the memory ID associated with that offset.
377+
* Each platform can have different (number of) memories. For mli
378+
* this is completely transparent. Compiler needs to use the same
379+
* memory id's when attaching the buffers as are used by the
380+
* xop-interpreter to set the membases.
381+
* @param num_mems [I] Number of memory regions passed with membases array.
382+
*/
359383
SumPool2D(void* kernel_private_data_buffer, size_t size, uint64_t membases[], int num_mems);
360384

361385
mli_status Issue() override;
@@ -364,15 +388,29 @@ class SumPool2D : public ExecutionInterface {
364388

365389
mli_status Update() override;
366390

391+
// TODO: remove this method and replace with usage of Move kernel (not possible now)
392+
void GetIOSizesAndOffsets(uint32_t input_size[kPoolRank], uint32_t output_size[kPoolRank],
393+
int32_t input_offsets[kPoolRank], int32_t output_offsets[kPoolRank]);
394+
367395
private:
396+
void UpdateTilePaddings();
397+
368398
mli_pool_cfg m_cfg;
369-
mli_tensor m_input;
370-
mli_tensor m_output;
399+
400+
TensorIterator<OffsetBuffer, kPoolRank, kPoolIterRank> m_input;
401+
TensorIterator<OffsetBuffer, kPoolRank, kPoolIterRank> m_output;
402+
371403
int32_t m_input_batch_offset;
372404
int32_t m_output_batch_offset;
373-
uint32_t m_batch_number;
405+
374406
uint32_t m_i_elem_size;
375407
uint32_t m_o_elem_size;
408+
409+
// Tile state
410+
uint32_t m_tile_batch_size;
411+
mli_tensor m_tile_input;
412+
mli_tensor m_tile_output;
413+
mli_pool_cfg m_tile_cfg;
376414
};
377415

378416
/**

include/internal/mli_ref_private_types.hpp

Lines changed: 3 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -185,70 +185,15 @@ class MovePrivateData : public PrivateData {
185185
TensorIterator<OffsetBuffer, kMoveRank, kMoveIterRank> dst_it;
186186
};
187187

188-
/**
189-
* TODO: to remove this after Pool2DPrivateData will be updated
190-
* to do this SumPool2D_CS needs to be updated same way as MaxPool2D_CS
191-
*/
192-
class MaxPool2DPrivateData : public PrivateData {
193-
194-
public:
195-
MaxPool2DPrivateData(kernel_id_t id)
196-
: PrivateData(id, sizeof(MaxPool2DPrivateData)) {}
197-
198-
TensorIterator<OffsetBuffer, kMaxpoolRank, kMaxpoolIterRank> input;
199-
TensorIterator<OffsetBuffer, kMaxpoolRank, kMaxpoolIterRank> output;
200-
PoolOpConfig config;
201-
};
202-
203188
class Pool2DPrivateData : public PrivateData {
204189

205190
public:
206191
Pool2DPrivateData(kernel_id_t id)
207192
: PrivateData(id, sizeof(Pool2DPrivateData)) {}
208193

209-
OffsetBuffer input_buffer;
210-
OffsetBuffer output_buffer;
211-
212-
uint32_t input_b;
213-
uint32_t input_h;
214-
uint32_t input_w;
215-
uint32_t input_c;
216-
217-
int32_t input_b_stride;
218-
int32_t input_h_stride;
219-
int32_t input_w_stride;
220-
int32_t input_c_stride;
221-
222-
uint32_t output_b;
223-
uint32_t output_h;
224-
uint32_t output_w;
225-
uint32_t output_c;
226-
227-
int32_t output_b_stride;
228-
int32_t output_h_stride;
229-
int32_t output_w_stride;
230-
int32_t output_c_stride;
231-
232-
uint8_t kernel_height;
233-
uint8_t kernel_width;
234-
uint8_t stride_height;
235-
uint8_t stride_width;
236-
uint8_t padding_top;
237-
uint8_t padding_bottom;
238-
uint8_t padding_left;
239-
uint8_t padding_right;
240-
241-
// Tile Parameters BHWC
242-
bool m_use_tiling;
243-
uint32_t m_tile_total_input_size[4];
244-
uint32_t m_tile_total_output_size[4];
245-
uint32_t m_tile_iteration_order[4];
246-
uint32_t m_tile_first_size[4];
247-
uint32_t m_tile_size[4];
248-
uint32_t m_tile_input_first_inc[4];
249-
uint32_t m_tile_input_inc[4];
250-
uint32_t m_tile_output_first_inc[4];
251-
uint32_t m_tile_output_inc[4];
194+
TensorIterator<OffsetBuffer, kPoolRank, kPoolIterRank> input;
195+
TensorIterator<OffsetBuffer, kPoolRank, kPoolIterRank> output;
196+
PoolOpConfig config;
252197
};
253198

254199
class EltwisePrivateData : public PrivateData {

include/mli_compiler_api.hpp

Lines changed: 50 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -723,14 +723,18 @@ class MaxPool2D_CS : public CompilerGenericInterface {
723723

724724
/**
725725
* @brief Method to get the input buffer size
726-
*
726+
* @deprecated
727+
*
728+
* TODO: will be removed
727729
* @return Size of the input buffer in bytes
728730
*/
729731
virtual unsigned GetInputBufferSize() const = 0;
730732

731733
/**
732734
* @brief Method to get the output buffer size
733-
*
735+
* @deprecated
736+
*
737+
* TODO: will be removed
734738
* @return Size of the output buffer in bytes
735739
*/
736740
virtual unsigned GetOutputBufferSize() const = 0;
@@ -751,8 +755,8 @@ class MaxPool2D_CS : public CompilerGenericInterface {
751755
*
752756
* @return MLI status code
753757
*/
754-
virtual mli_status AttachBufferOffsets(const Tensor<OffsetBuffer, kMaxpoolRank> &input,
755-
const Tensor<OffsetBuffer, kMaxpoolRank> &output,
758+
virtual mli_status AttachBufferOffsets(const Tensor<OffsetBuffer, kPoolRank> &input,
759+
const Tensor<OffsetBuffer, kPoolRank> &output,
756760
const OffsetBuffer &ctrl_buffer) = 0;
757761

758762
/**
@@ -788,18 +792,56 @@ class SumPool2D_CS : public CompilerGenericInterface {
788792

789793
/**
790794
* @brief Methods to get buffer sizes
795+
*
796+
* @deprecated
797+
*
798+
* TODO: to be removed later
791799
*/
792800

793801
virtual unsigned GetInputBufferSize() const = 0;
794802
virtual unsigned GetOutputBufferSize() const = 0;
803+
795804
/**
796-
797-
* @brief Methods to set buffer offsets
805+
* @brief Method to set buffer memory offsets and memory IDs for the kernel
806+
*
807+
* @deprecated
808+
*
809+
* Compiler computes a memory map and buffer offsets are set using this method.
810+
* Compiler also needs to indicate in which memory the buffers reside.
811+
* These ID's need to match the array of memory bases that the xop-interpreter passes to
812+
* the Create function.
798813
*
814+
* In this method you specify offsets for tensors passed to the constructor.
815+
*
816+
* @param input [I] Tensor descriptor containing input OffsetBuffer and tensor shape and memory strides
817+
* @param output [I] Tensor descriptor containing output OffsetBuffer and tensor shape and memory strides
818+
* @param ctrl_buffer [I] data OffsetBuffer
819+
*
820+
* @return MLI status code
799821
*/
800-
virtual mli_status AttachBufferOffsets(const Tensor<OffsetBuffer, 4> &input,
801-
const Tensor<OffsetBuffer, 4> &output,
822+
virtual mli_status AttachBufferOffsets(const Tensor<OffsetBuffer, kPoolRank> &input,
823+
const Tensor<OffsetBuffer, kPoolRank> &output,
802824
const OffsetBuffer &ctrl_buffer) = 0;
825+
826+
/**
827+
* @brief Method to set buffer memory offsets and memory IDs for the kernel
828+
*
829+
* Compiler computes a memory map and buffer offsets are set using this method.
830+
* Compiler also needs to indicate in which memory the buffers reside.
831+
* These ID's need to match the array of memory bases that the xop-interpreter passes to
832+
* the Create function.
833+
*
834+
* In this method you specify offsets for tensors passed to the constructor.
835+
*
836+
* @param input [I] OffsetBuffer containing Memory Identifier and Offset in that memory
837+
* @param output [I] OffsetBuffer containing Memory Identifier and Offset in that memory
838+
* @param ctrl_buffer [I] OffsetBuffer containing Memory Identifier and Offset in that memory
839+
*
840+
* @return MLI status code
841+
*/
842+
virtual mli_status AttachBufferOffsets(const OffsetBuffer &input,
843+
const OffsetBuffer &output,
844+
const OffsetBuffer &ctrl_buffer) = 0;
803845
};
804846

805847

0 commit comments

Comments
 (0)