Skip to content

Commit 076baf8

Browse files
lamyaaJaccovG
authored and committed
MoveBroadcast With Tiling
1 parent 6008b95 commit 076baf8

File tree

4 files changed

+162
-103
lines changed

4 files changed

+162
-103
lines changed

include/api/mli_ref_runtime_api.hpp

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1137,12 +1137,19 @@ class MoveBroadcast : public ExecutionInterface {
11371137

11381138
mli_status Update() override;
11391139

1140+
// TODO: remove this method and replace with usage of Move kernel once it implemented.
1141+
void GetIOSizesAndOffsets(uint32_t input_size[kMoveBroadcastRank], uint32_t output_size[kMoveBroadcastRank],
1142+
int32_t input_offsets[kMoveBroadcastRank], int32_t output_offsets[kMoveBroadcastRank]);
1143+
1144+
11401145
private:
1141-
TensorIterator<InternalBuffer, kMoveBroadcastRank, kMoveBroadcastIterRank> m_src;
1142-
TensorIterator<InternalBuffer, kMoveBroadcastRank, kMoveBroadcastIterRank> m_dst;
1146+
TensorIterator<OffsetBuffer, kMoveBroadcastRank, kMoveBroadcastIterRank> m_src;
1147+
TensorIterator<OffsetBuffer, kMoveBroadcastRank, kMoveBroadcastIterRank> m_dst;
1148+
Tensor<InternalBuffer, kMoveBroadcastRank> m_tile_src;
1149+
Tensor<InternalBuffer, kMoveBroadcastRank> m_tile_dst;
11431150

11441151
template <typename buf_T, unsigned N>
1145-
void MoveBroadcastRun(TensorIterator<buf_T, N, N> src, TensorIterator<buf_T, N, N> dst);
1152+
void MoveBroadcastRun(Tensor<buf_T, N> &src, Tensor<buf_T, N> &dst);
11461153
};
11471154

11481155
} // namespace snps_arc::metaware::mli::ref

include/mli_types.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -137,8 +137,8 @@ constexpr unsigned kEltwiseRank = 4;
137137
constexpr short int kReduceMaxRank = 4;
138138
constexpr short int kReduceMaxIterRank = 4;
139139

140-
constexpr unsigned kMoveBroadcastRank = 5;
141-
constexpr unsigned kMoveBroadcastIterRank = 5;
140+
constexpr unsigned kMoveBroadcastRank = 4; // ToDo: when mli_tensor takes [rank=5] -> change rank from 4 to 5.
141+
constexpr unsigned kMoveBroadcastIterRank = 4; // ToDo: when mli_tensor takes [rank=5] -> change rank from 4 to 5.
142142

143143
constexpr short int kResizeDim = 2;
144144
constexpr short int kResizeBilinearRank = 4;

lib/src/move/mli_move_broadcast_runtime.cc

Lines changed: 59 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -19,30 +19,30 @@ namespace snps_arc::metaware::mli::ref {
1919

2020
MoveBroadcast::MoveBroadcast(void* kernel_private_data_buffer, size_t size, uint64_t membases[], int num_mems) {
2121
MLI_ASSERT(size == sizeof(MoveBroadcastPrivateData));
22-
MoveBroadcastPrivateData private_data;
23-
memcpy(&private_data, kernel_private_data_buffer, sizeof(MoveBroadcastPrivateData));
24-
MLI_ASSERT(private_data.kernel_id == kMoveBroadcastId);
25-
MLI_ASSERT(private_data.size == sizeof(MoveBroadcastPrivateData));
26-
27-
m_src = TensorIterator<InternalBuffer, kMoveBroadcastRank, kMoveBroadcastIterRank>(private_data.src, membases, num_mems);
28-
m_dst = TensorIterator<InternalBuffer, kMoveBroadcastRank, kMoveBroadcastIterRank>(private_data.dst, membases, num_mems);
29-
m_src.Reset();
30-
m_dst.Reset();
22+
MoveBroadcastPrivateData private_buffer;
23+
memcpy(&private_buffer, kernel_private_data_buffer, sizeof(MoveBroadcastPrivateData));
24+
MLI_ASSERT(private_buffer.kernel_id == kMoveBroadcastId);
25+
MLI_ASSERT(private_buffer.size == sizeof(MoveBroadcastPrivateData));
26+
27+
m_src = private_buffer.src;
28+
m_dst = private_buffer.dst;
29+
m_tile_src = Tensor<InternalBuffer, kMoveBroadcastRank>(m_src.GetSubTensor(), membases, num_mems);
30+
m_tile_dst = Tensor<InternalBuffer, kMoveBroadcastRank>(m_dst.GetSubTensor(), membases, num_mems);
3131
}
3232

3333
template <typename buf_T, unsigned N>
34-
int32_t tensor_read(TensorIterator<buf_T, N, N> tsr, uint32_t *index) {
34+
int32_t tensor_read(Tensor<buf_T, N> tsr, uint32_t *index) {
3535
int32_t result = 0;
36-
int32_t offset = tsr.get_tensor().get_offset(index);
37-
switch (tsr.get_tensor().get_elem_size()) {
36+
int32_t offset = tsr.get_offset(index);
37+
switch (tsr.get_elem_size()) {
3838
case sizeof(int8_t):
39-
result = tsr.get_tensor().template read<int8_t>(offset);
39+
result = tsr.template read<int8_t>(offset);
4040
break;
4141
case sizeof(int16_t):
42-
result = tsr.get_tensor().template read<int16_t>(offset);
42+
result = tsr.template read<int16_t>(offset);
4343
break;
4444
case sizeof(int32_t):
45-
result = tsr.get_tensor().template read<int32_t>(offset);
45+
result = tsr.template read<int32_t>(offset);
4646
break;
4747
default:
4848
MLI_ASSERT(false);
@@ -51,17 +51,17 @@ int32_t tensor_read(TensorIterator<buf_T, N, N> tsr, uint32_t *index) {
5151
}
5252

5353
template <typename buf_T, unsigned N>
54-
void tensor_write(TensorIterator<buf_T, N, N> tsr, uint32_t *index, int32_t value) {
55-
int32_t offset = tsr.get_tensor().get_offset(index);
56-
switch (tsr.get_tensor().get_elem_size()) {
54+
void tensor_write(Tensor<buf_T, N> tsr, uint32_t *index, int32_t value) {
55+
int32_t offset = tsr.get_offset(index);
56+
switch (tsr.get_elem_size()) {
5757
case sizeof(int8_t):
58-
tsr.get_tensor().template write<int8_t>(offset, value);
58+
tsr.template write<int8_t>(offset, value);
5959
break;
6060
case sizeof(int16_t):
61-
tsr.get_tensor().template write<int16_t>(offset, value);
61+
tsr.template write<int16_t>(offset, value);
6262
break;
6363
case sizeof(int32_t):
64-
tsr.get_tensor().template write<int32_t>(offset, value);
64+
tsr.template write<int32_t>(offset, value);
6565
break;
6666
default:
6767
MLI_ASSERT(false);
@@ -70,19 +70,19 @@ void tensor_write(TensorIterator<buf_T, N, N> tsr, uint32_t *index, int32_t valu
7070

7171
// Move Broadcast Core Function
7272
template <typename buf_T, unsigned N>
73-
void MoveBroadcast::MoveBroadcastRun(TensorIterator<buf_T, N, N> src, TensorIterator<buf_T, N, N> dst) {
73+
void MoveBroadcast::MoveBroadcastRun(Tensor<buf_T, N> &src, Tensor<buf_T, N> &dst) {
7474
uint32_t src_idx[N] = {0};
7575
uint32_t dst_idx[N] = {0};
7676
uint32_t src_shape[N] = {0};
7777
uint32_t dst_shape[N] = {0};
78-
uint32_t src_rank = src.get_tensor().get_rank();
79-
uint32_t dst_rank = dst.get_tensor().get_rank();
78+
uint32_t src_rank = src.get_rank();
79+
uint32_t dst_rank = dst.get_rank();
8080

8181
MLI_ASSERT(src_rank == dst_rank);
8282

8383
// get shapes
84-
src.get_full_shape(src_shape);
85-
dst.get_full_shape(dst_shape);
84+
src.get_dims(src_shape);
85+
dst.get_dims(dst_shape);
8686

8787
// Tensors with rank less than MLI_MAX_RANK, the tensor is automatically filled with 1's
8888
for (uint32_t i = src_rank; i < kMoveBroadcastRank; i++) {
@@ -96,12 +96,14 @@ void MoveBroadcast::MoveBroadcastRun(TensorIterator<buf_T, N, N> src, TensorIter
9696
for (int d1_cnt = 0; d1_cnt < (int)dst_shape[1]; d1_cnt++) {
9797
for (int d2_cnt = 0; d2_cnt < (int)dst_shape[2]; d2_cnt++) {
9898
for (int d3_cnt = 0; d3_cnt < (int)dst_shape[3]; d3_cnt++) {
99-
for (int d4_cnt = 0; d4_cnt < (int)dst_shape[4]; d4_cnt++) {
99+
// ToDo: when mli_tensor takes [rank=5]
100+
// for (int d4_cnt = 0; d4_cnt < (int)dst_shape[4]; d4_cnt++) {
100101
dst_idx[0] = d0_cnt;
101102
dst_idx[1] = d1_cnt;
102103
dst_idx[2] = d2_cnt;
103104
dst_idx[3] = d3_cnt;
104-
dst_idx[4] = d4_cnt;
105+
// dst_idx[4] = d4_cnt;
106+
105107
// inner loop for move broad cast.
106108
for (uint32_t i = 0; i < dst_rank; i++) {
107109
if(src_shape[i] != dst_shape[i]) {
@@ -114,15 +116,15 @@ void MoveBroadcast::MoveBroadcastRun(TensorIterator<buf_T, N, N> src, TensorIter
114116
}
115117
int32_t value = tensor_read<buf_T, N>(src, src_idx);
116118
tensor_write<buf_T, N>(dst, dst_idx, value);
117-
}
119+
// }
118120
}
119121
}
120122
}
121123
}
122124
}
123125

124-
mli_status MoveBroadcast::Issue() {
125-
MoveBroadcastRun<InternalBuffer, kMoveBroadcastRank>(m_src, m_dst);
126+
mli_status MoveBroadcast::Issue() {
127+
MoveBroadcastRun<InternalBuffer, kMoveBroadcastRank>(m_tile_src, m_tile_dst);
126128
return MLI_STATUS_OK;
127129
}
128130

@@ -131,7 +133,33 @@ mli_status MoveBroadcast::Prefetch() {
131133
}
132134

133135
mli_status MoveBroadcast::Update() {
136+
m_src.Next();
137+
m_dst.Next();
138+
139+
const auto src_tile_tensor = m_src.GetSubTensor();
140+
uint32_t src_tile_shape[kMoveBroadcastRank];
141+
src_tile_tensor.get_dims(src_tile_shape);
142+
m_tile_src = Tensor<InternalBuffer, kMoveBroadcastRank>(m_tile_src, src_tile_shape);
143+
144+
const auto dst_tile_tensor = m_dst.GetSubTensor();
145+
uint32_t dst_tile_shape[kMoveBroadcastRank];
146+
dst_tile_tensor.get_dims(dst_tile_shape);
147+
m_tile_dst = Tensor<InternalBuffer, kMoveBroadcastRank>(m_tile_dst, dst_tile_shape);
148+
134149
return MLI_STATUS_OK;
135150
}
136151

152+
void MoveBroadcast::GetIOSizesAndOffsets(uint32_t src_size[kMoveBroadcastRank], uint32_t dst_size[kMoveBroadcastRank],
153+
int32_t src_offsets[kMoveBroadcastRank], int32_t dst_offsets[kMoveBroadcastRank]) {
154+
155+
m_src.get_pos(src_offsets);
156+
m_dst.get_pos(dst_offsets);
157+
158+
const auto src_tile_tensor = m_src.GetSubTensor();
159+
src_tile_tensor.get_dims(src_size);
160+
161+
const auto dst_tile_tensor = m_dst.GetSubTensor();
162+
dst_tile_tensor.get_dims(dst_size);
163+
}
164+
137165
} // namespace snps_arc::metaware::mli::ref

0 commit comments

Comments (0)