
Commit fdbe2f0

msayed authored and JaccovG committed

MatMul tiling

1 parent e7eae75 commit fdbe2f0

File tree: 4 files changed, +132 −62 lines


lib/src/kernels/convolution/mli_krn_matmul_compiler.cc
Lines changed: 8 additions & 3 deletions

@@ -11,6 +11,8 @@
 #include "mli_ref_compiler_api.hpp"
 #include "mli_ref_runtime_api.hpp"
 #include "mli_service_functions.hpp"
+
+
 namespace snps_arc::metaware::mli::ref {
 
 MatMul_CS::MatMul_CS(const lib_mli::PlatformDescription &pd,
@@ -38,11 +40,13 @@ mli_status MatMul_CS::GetKernelPrivateData(void* kernel_private_data_buffer) {
 
   MatMulPrivateData prv_data;
 
+
   prv_data.m_in_left = m_in_left;
   prv_data.m_in_right = m_in_right;
   prv_data.m_output = m_output;
   prv_data.encoded_params = m_encoded_params;
 
+
   std::memcpy(kernel_private_data_buffer, (void *)&prv_data, sizeof(prv_data));
 
   return MLI_STATUS_OK;
@@ -69,10 +73,11 @@ mli_status MatMul_CS::EncodeParams(const Buffer &in_bias1,
   assert(in_bias1.get_size() + in_bias2.get_size() == encoded_params.get_size());
   assert(in_bias1.get_size() == in_bias2.get_size() == 1);
 
-  // in_zp must be int8_t
+
+  // in_zp type must be int8_t
   assert(in_bias1.get_elem_size() == sizeof(int8_t));
-  encoded_params.write<int8_t>(0, in_bias1.read<int8_t>(0));
-  encoded_params.write<int8_t>(1, in_bias2.read<int8_t>(0));
+  encoded_params.write<int8_t>(kMatMulHeightDim, in_bias1.read<int8_t>(0));
+  encoded_params.write<int8_t>(kMatMulWidthDim, in_bias2.read<int8_t>(0));
 
   return MLI_STATUS_OK;
 }
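
The EncodeParams change above replaces the bare indices 0 and 1 with the named dimension constants; the kernel in mli_krn_matmul_ref.hpp reads the two zero points back with the same constants. A minimal standalone sketch of that two-byte layout, assuming kMatMulHeightDim == 0 and kMatMulWidthDim == 1 (stand-in values; the real constants live in the MLI headers, and plain arrays stand in for the Buffer API):

#include <cassert>
#include <cstdint>

// Assumed values of the MLI dimension constants (stand-ins, not the
// library's definitions): the height-dim index holds the left-input zero
// point, the width-dim index holds the right-input zero point.
constexpr uint32_t kMatMulHeightDim = 0;
constexpr uint32_t kMatMulWidthDim = 1;

int main() {
  int8_t encoded_params[2];      // two-byte buffer built by EncodeParams
  const int8_t in_left_zp = -5;  // corresponds to in_bias1 above
  const int8_t in_right_zp = 3;  // corresponds to in_bias2 above

  // What EncodeParams writes...
  encoded_params[kMatMulHeightDim] = in_left_zp;
  encoded_params[kMatMulWidthDim] = in_right_zp;

  // ...is what the kernel reads back with the same constants.
  assert(encoded_params[kMatMulHeightDim] == in_left_zp);
  assert(encoded_params[kMatMulWidthDim] == in_right_zp);
  return 0;
}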

lib/src/kernels/convolution/mli_krn_matmul_ref.hpp
Lines changed: 19 additions & 11 deletions

@@ -28,18 +28,17 @@ namespace ref {
 #pragma MLI_CODE_SECTION_START(".mli_lib")
 
 
-template <typename in1_t, typename in2_t, typename out_t, uint32_t rank>
-void MatMul_prepare_and_run(Tensor<InternalBuffer, rank> &in_left,
-                            Tensor<InternalBuffer, rank> &in_right,
-                            Tensor<InternalBuffer, rank> &output,
+template <typename in1_t, typename in2_t, typename out_t>
+void MatMul_prepare_and_run(Tensor<InternalBuffer, kMatMulRank> &in_left,
+                            Tensor<InternalBuffer, kMatMulRank> &in_right,
+                            Tensor<InternalBuffer, kMatMulRank> &output,
                             InternalBuffer &encoded_params) {
   /**
    * layout = HW
    * W of left = H of right
    * output shape must be Hl x Wr
    * rank = 2
    */
-  MLI_ASSERT(rank == kMatMulRank);
   MLI_ASSERT(in_left.get_dim(kMatMulWidthDim) == in_right.get_dim(kMatMulHeightDim));
   MLI_ASSERT(output.get_dim(kMatMulHeightDim) == in_left.get_dim(kMatMulHeightDim));
   MLI_ASSERT(output.get_dim(kMatMulWidthDim) == in_right.get_dim(kMatMulWidthDim));
@@ -49,20 +48,29 @@ void MatMul_prepare_and_run(Tensor<InternalBuffer, rank> &in_left,
   in1_t val1;
   in2_t val2;
   out_t acc;
+
+  /**
+   * in_left_zp is the first element of the encoded buffer;
+   * in_right_zp is the second.
+   */
   int8_t in_left_zp = encoded_params.read<int8_t>(kMatMulHeightDim);
   int8_t in_right_zp = encoded_params.read<int8_t>(kMatMulWidthDim);
   uint32_t left_h = in_left.get_dim(kMatMulHeightDim);
   uint32_t right_w = in_right.get_dim(kMatMulWidthDim);
   uint32_t left_w = in_left.get_dim(kMatMulWidthDim);
-  for (uint32_t i = 0; i < left_h; ++i) {
-    for (uint32_t j = 0; j < right_w; ++j) {
+  int32_t left_mem_strides[kMatMulRank];
+  int32_t right_mem_strides[kMatMulRank];
+  in_left.get_mem_strides(left_mem_strides);
+  in_right.get_mem_strides(right_mem_strides);
+  for (uint32_t left_height_index = 0; left_height_index < left_h; ++left_height_index) {
+    for (uint32_t right_width_index = 0; right_width_index < right_w; ++right_width_index) {
       acc = 0;
-      for (uint32_t k = 0; k < left_w; ++k) {
-        val1 = in_left.template read<in1_t>(i * left_w + k) - in_left_zp;
-        val2 = in_right.template read<in2_t>(k * right_w + j) - in_right_zp;
+      for (uint32_t left_width_index = 0, right_height_index = 0; left_width_index < left_w; ++left_width_index, ++right_height_index) {
+        val1 = in_left.template read<in1_t>(left_height_index * left_mem_strides[0] + left_width_index) - in_left_zp;
+        val2 = in_right.template read<in2_t>(right_height_index * right_mem_strides[0] + right_width_index) - in_right_zp;
        acc += val1 * val2;
       }
-      output.template write<out_t>(i * right_w + j, static_cast<out_t>(acc));
+      output.template write<out_t>(left_height_index * right_w + right_width_index, static_cast<out_t>(acc));
     }
   }
 }
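
The key change in this kernel is the switch from i * left_w + k to stride-based indexing: once the inputs are tiles of a larger tensor, a row's pitch in memory (mem_strides[0]) can exceed the tile's logical width, so the width can no longer double as the row pitch. (Note that the output write still uses right_w as its row pitch, so the output tile is assumed contiguous.) A self-contained sketch of the same zero-point-aware inner product over plain row-major arrays; the function name and the out_stride parameter are illustrative, not MLI API:

#include <cassert>
#include <cstdint>

// Plain-array sketch of the zero-point-aware matmul loop above.
// left is H_l x W_l, right is W_l x W_r; strides are in elements and may
// exceed the logical widths when the matrices are tiles of a larger buffer.
void matmul_s8_s32(const int8_t* left, int32_t left_stride, int8_t left_zp,
                   const int8_t* right, int32_t right_stride, int8_t right_zp,
                   int32_t* out, int32_t out_stride,
                   uint32_t left_h, uint32_t left_w, uint32_t right_w) {
  for (uint32_t h = 0; h < left_h; ++h) {
    for (uint32_t w = 0; w < right_w; ++w) {
      int32_t acc = 0;
      for (uint32_t k = 0; k < left_w; ++k) {
        // Subtract the per-tensor zero points before accumulating,
        // exactly as the kernel does with in_left_zp / in_right_zp.
        int32_t v1 = left[h * left_stride + k] - left_zp;
        int32_t v2 = right[k * right_stride + w] - right_zp;
        acc += v1 * v2;
      }
      out[h * out_stride + w] = acc;
    }
  }
}

int main() {
  // 2x3 times 3x2, contiguous layout (stride == width), zero points 0.
  const int8_t a[6] = {1, 2, 3, 4, 5, 6};
  const int8_t b[6] = {1, 0, 0, 1, 1, 1};
  int32_t c[4] = {};
  matmul_s8_s32(a, 3, 0, b, 2, 0, c, 2, 2, 3, 2);
  assert(c[0] == 4 && c[1] == 5 && c[2] == 10 && c[3] == 11);
  return 0;
}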

lib/src/kernels/convolution/mli_krn_matmul_runtime.cc
Lines changed: 1 addition & 1 deletion

@@ -48,7 +48,7 @@ mli_status MatMul::Issue() {
   if (m_i_elem_size == sizeof(int8_t) &&
       m_o_elem_size == sizeof(int32_t)) {
 
-    MatMul_prepare_and_run<int8_t, int8_t, int32_t, kMatMulRank>
+    MatMul_prepare_and_run<int8_t, int8_t, int32_t>
       (m_tile_input_left, m_tile_input_right, m_tile_output, m_encoded_params);
   } else {
     // not supported yet

user_tests/tests/mli_krn_matmul_30/tests_mli_krn_matmul_30.cc
Lines changed: 104 additions & 47 deletions

@@ -51,7 +51,6 @@ struct MatMul_test_operands {
   int8_t in2_zp;
   tensor_quantizer out;
   uint32_t data_size;
-
   const quality_metrics threshold;
   const crc32_calc check_sum;
 };
@@ -90,19 +89,19 @@ static IO_DATA_ATTR int8_t g_mem_pool[kMemSize] = {0};
 
 constexpr int kTestsNum = sizeof(tests_list) / sizeof(tests_list[0]);
 
-template <typename in1_t, typename in2_t, typename out_t, uint32_t rank>
-void MatMul_prepare_and_run(Tensor<InternalBuffer, rank> &in_left,
-                            Tensor<InternalBuffer, rank> &in_right,
-                            Tensor<InternalBuffer, rank> &output,
-                            InternalBuffer &encoded_params);
 
 void prepare_phase(MatMul_test_operands* cur_test,
                    void*& MatMul_instance,
                    uint32_t& MatMul_instance_size,
                    void*& MatMul_conf_private,
                    uint32_t& MatMul_conf_private_size,
-                   uint32_t& output_size,
-                   uint32_t& output_offset ) {
+                   lib_mli::TensorIterator<lib_mli::OffsetBuffer, kMatMulRank, kMatMulIterRank> &input1_tensor,
+                   lib_mli::TensorIterator<lib_mli::OffsetBuffer, kMatMulRank, kMatMulIterRank> &input2_tensor,
+                   lib_mli::TensorIterator<lib_mli::OffsetBuffer, kMatMulRank, kMatMulIterRank> &output_tensor,
+                   uint32_t *input_tile_shape,
+                   uint32_t *output_tile_shape,
+                   int32_t *iteration_order ) {
+
 
   mli_data_container temp_in1_container{0};
   mli_data_container temp_in2_container{0};
@@ -125,9 +124,13 @@ void prepare_phase(MatMul_test_operands* cur_test,
   const lib_mli::Tensor<lib_mli::NoBuffer, kMatMulRank> in2_tensor(temp_input2_tensor.shape, temp_input2_tensor.mem_stride);
   const lib_mli::Tensor<lib_mli::NoBuffer, kMatMulRank> out_tensor(temp_output_tensor.shape, temp_output_tensor.mem_stride);
 
-  lib_mli::TensorIterator<lib_mli::NoBuffer, kMatMulRank, kMatMulIterRank> in1_tensor_it(in1_tensor);
-  lib_mli::TensorIterator<lib_mli::NoBuffer, kMatMulRank, kMatMulIterRank> in2_tensor_it(in2_tensor);
-  lib_mli::TensorIterator<lib_mli::NoBuffer, kMatMulRank, kMatMulIterRank> out_tensor_it(out_tensor);
+  lib_mli::TensorIterator<lib_mli::NoBuffer, kMatMulRank, kMatMulIterRank> in1_tensor_it(in1_tensor, input_tile_shape, iteration_order);
+  lib_mli::TensorIterator<lib_mli::NoBuffer, kMatMulRank, kMatMulIterRank> in2_tensor_it(in2_tensor, temp_input2_tensor.shape, iteration_order);
+  lib_mli::TensorIterator<lib_mli::NoBuffer, kMatMulRank, kMatMulIterRank> out_tensor_it(out_tensor, output_tile_shape, iteration_order);
+
+  input1_tensor = in1_tensor_it;
+  input2_tensor = in2_tensor_it;
+  output_tensor = out_tensor_it;
 
   lib_mli::PlatformDescription pd;
   lib_ref::KernelsFactory kernel_factory(pd);
@@ -156,26 +159,29 @@
 
   // MatMul Input1
   offset = &offsets[0];
-  uint32_t in1_size = lib_mli::service::GetBufferSize(lib_mli::kMatMulRank, temp_input1_tensor.shape, temp_input1_tensor.mem_stride) * elem_size;
+
+  uint32_t in1_size = lib_mli::service::GetBufferSize(lib_mli::kMatMulRank, input_tile_shape, temp_input1_tensor.mem_stride) * elem_size;
   lib_mli::OffsetBuffer MatMul_in1_buf{*offset, 0, in1_size, elem_size};
+  input1_tensor.set_buf(MatMul_in1_buf);
   uint32_t in1_mem_offset = *offset;
   *offset += in1_size;
 
   // MatMul Input2
   offset = &offsets[0];
   uint32_t in2_size = lib_mli::service::GetBufferSize(lib_mli::kMatMulRank, temp_input2_tensor.shape, temp_input2_tensor.mem_stride) * elem_size;
   lib_mli::OffsetBuffer MatMul_in2_buf{*offset, 0, in2_size, elem_size};
+  input2_tensor.set_buf(MatMul_in2_buf);
   uint32_t in2_mem_offset = *offset;
   *offset += in2_size;
 
   // MatMul Output
   offset = &offsets[0];
-  uint32_t out_size = lib_mli::service::GetBufferSize(lib_mli::kMatMulRank, temp_output_tensor.shape, temp_output_tensor.mem_stride) * sizeof(int32_t);
+
+  uint32_t out_size = lib_mli::service::GetBufferSize(lib_mli::kMatMulRank, output_tile_shape, temp_output_tensor.mem_stride) * sizeof(int32_t);
   lib_mli::OffsetBuffer MatMul_out_buf{*offset, 0, out_size, sizeof(int32_t)};
+  output_tensor.set_buf(MatMul_out_buf);
   uint32_t out_mem_offset = *offset;
   *offset += out_size;
-  output_offset = out_mem_offset;
-  output_size = out_size;
 
   // MatMul input zero point
   uint32_t inpzp_size = MatMul_op->GetEncodedParamsSize() * elem_size;
@@ -215,20 +221,6 @@ void prepare_phase(MatMul_test_operands* cur_test,
     const uint32_t idx = inpzp_mem_offset + i;
     g_mem_pool[idx] = encoded_zp_buf.read<int8_t>(i);
   }
-  /*copy in1 from scratch memory to g_mem_pool*/
-  int8_t* temp_mem = (int8_t*)g_scratch_mem_in1;
-  int8_t* dst_buffer = (int8_t*) (g_mem_pool + in1_mem_offset);
-  for (int idx = 0; idx < in1_size; idx++) {
-    dst_buffer[idx] = temp_mem[idx];
-  }
-
-  /*copy in2 from scratch memory to g_mem_pool*/
-  temp_mem = (int8_t*)g_scratch_mem_in2;
-  dst_buffer = (int8_t*) (g_mem_pool + in2_mem_offset);
-  for (int idx = 0; idx < in2_size; idx++) {
-    dst_buffer[idx] = temp_mem[idx];
-  }
 
   MatMul_instance = (int8_t*)g_mem_pool;
   MatMul_instance_size = MatMul_op->GetRuntimeObjectSize();
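
In prepare_phase, the input1 and output buffers are now sized from the tile shapes rather than the full tensor shapes, so the memory pool only holds one tile of each at a time; input2 keeps its full shape because every output row consumes the whole right-hand matrix, and the bulk copies into the pool move to the per-tile execution loop below. A rough sketch of the saving for a contiguous layout (illustrative arithmetic only; the real sizes come from lib_mli::service::GetBufferSize with mem strides):

#include <cstdint>
#include <cstdio>

int main() {
  // Example left input of 10x4 int8, tiled one row at a time as in main().
  const uint32_t full_shape[2] = {10, 4};
  const uint32_t tile_shape[2] = {1, 4};
  const uint32_t elem_size = sizeof(int8_t);

  const uint32_t full_bytes = full_shape[0] * full_shape[1] * elem_size; // 40
  const uint32_t tile_bytes = tile_shape[0] * tile_shape[1] * elem_size; // 4
  printf("full tensor: %u bytes, per-tile buffer: %u bytes\n",
         full_bytes, tile_bytes);
  return 0;
}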
@@ -243,10 +235,13 @@
 void execution_phase(const MatMul_test_operands* cur_test,
                      void* MatMul_instance,
                      uint32_t MatMul_instance_size,
-                     void* MatMul_conf_private,
-                     uint32_t MatMul_conf_private_size,
-                     uint32_t& output_size,
-                     uint32_t& output_offset) {
+                     void*& MatMul_conf_private,
+                     uint32_t& MatMul_conf_private_size,
+                     lib_mli::TensorIterator<lib_mli::OffsetBuffer, kMatMulRank, kMatMulIterRank> &input1_tensor,
+                     lib_mli::TensorIterator<lib_mli::OffsetBuffer, kMatMulRank, kMatMulIterRank> &input2_tensor,
+                     lib_mli::TensorIterator<lib_mli::OffsetBuffer, kMatMulRank, kMatMulIterRank> &output_tensor,
+                     uint32_t& num_tiles,
+                     int32_t* iteration_order) {
   // STEP 3: Execution phase
   //==================================================================

@@ -257,24 +252,57 @@ void execution_phase(const MatMul_test_operands* cur_test,
                                   MatMul_conf_private,
                                   MatMul_conf_private_size,
                                   membasis, sizeof(membasis) / sizeof(membasis[0]));
+
+
   assert(MatMul_run_op != nullptr);
   mli_status status = MLI_STATUS_OK;
 
+  uint32_t input1_tile_size[kMatMulRank]{};
+  uint32_t input2_tile_size[kMatMulRank]{};
+  uint32_t output_tile_size[kMatMulRank]{};
+  int32_t input1_tile_offsets[kMatMulRank]{};
+  int32_t input2_tile_offsets[kMatMulRank]{};
+  int32_t output_tile_offsets[kMatMulRank]{};
+  int32_t tile_input1_strides[kMatMulRank]{};
+  int32_t tile_input2_strides[kMatMulRank]{};
+  int32_t tile_output_strides[kMatMulRank]{};
+  const int32_t zero_offsets[kMatMulRank]{};
+
+  input1_tensor.get_mem_strides(tile_input1_strides);
+  input2_tensor.get_mem_strides(tile_input2_strides);
+  output_tensor.get_mem_strides(tile_output_strides);
+
+  for (size_t i = 0; i < num_tiles; i++) {
+    lib_ref::MatMul* pimpl = dynamic_cast<lib_ref::MatMul*>(MatMul_run_op);
+    pimpl->GetIOSizesAndOffsets(input1_tile_size, input2_tile_size, output_tile_size,
+                                input1_tile_offsets, input2_tile_offsets, output_tile_offsets);
+    input2_tile_offsets[0] = input2_tile_offsets[1] = 0;
+
     status = MatMul_run_op->Prefetch();
     assert(status == MLI_STATUS_OK);
 
+    // copy inputs from global buffer to local tile buffer
+    strided_copy_with_offsets(kMatMulRank, input1_tensor.get_buf().get_elem_size(),
+                              g_scratch_mem_in1, input1_tile_offsets, zero_offsets, tile_input1_strides,
+                              input1_tile_size, (int8_t*)(g_mem_pool + input1_tensor.get_buf().get_offset()));
+
+    strided_copy_with_offsets(kMatMulRank, input2_tensor.get_buf().get_elem_size(),
+                              g_scratch_mem_in2, input2_tile_offsets, zero_offsets, tile_input2_strides,
+                              input2_tile_size, (int8_t*)(g_mem_pool + input2_tensor.get_buf().get_offset()));
+
+
     status = MatMul_run_op->Issue();
     assert(status == MLI_STATUS_OK);
 
-  status = MatMul_run_op->Update();
-  assert(status == MLI_STATUS_OK);
-
+    // copy output from local tile buffer to global buffer
+    strided_copy_with_offsets(kMatMulRank, output_tensor.get_buf().get_elem_size(),
+                              (int8_t*)(g_mem_pool + output_tensor.get_buf().get_offset()),
+                              zero_offsets, output_tile_offsets, tile_output_strides,
+                              output_tile_size, (int8_t*)g_scratch_mem_out);
 
-  // copy output from g_mem_pool to scratch mem
-  int8_t *dest_mem = (int8_t*)g_scratch_mem_out;
-  int8_t *temp_buffer = (int8_t*) (g_mem_pool + output_offset);
-  for (int idx = 0; idx < output_size; idx++) {
-    dest_mem[idx] = temp_buffer[idx];
+    status = MatMul_run_op->Update();
+    assert(status == MLI_STATUS_OK);
   }
 }
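
Each iteration of the loop above follows the same sequence: query the current tile's sizes and offsets via GetIOSizesAndOffsets, Prefetch, stage the tile's inputs from the global scratch buffers into the local tile buffers, Issue, copy the tile's output back out, and Update to advance the iterators (input2's offsets are zeroed because the full right-hand matrix is reused for every tile). The traversal itself can be modeled with a small standalone program; the names here are illustrative, not MLI API, and the shapes match the 1-row tiles chosen in main() below:

#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t shape[2] = {10, 4}; // full tensor, H x W
  const uint32_t tile[2] = {1, 4};   // one full row per tile

  // Iteration order {0, 1}: height is the outer dimension, width the inner.
  for (uint32_t h = 0; h < shape[0]; h += tile[0]) {
    for (uint32_t w = 0; w < shape[1]; w += tile[1]) {
      // Clip the last tile in each dimension to the tensor boundary.
      const uint32_t th = (shape[0] - h < tile[0]) ? shape[0] - h : tile[0];
      const uint32_t tw = (shape[1] - w < tile[1]) ? shape[1] - w : tile[1];
      printf("tile at offset (%u, %u), size %u x %u\n", h, w, th, tw);
    }
  }
  return 0;
}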

@@ -343,16 +371,45 @@ int main() {
 
   void* MatMul_conf_private = nullptr;
   uint32_t MatMul_conf_private_size = 0;
-  uint32_t output_size = 0;
-  uint32_t output_offset = 0;
+
+  lib_mli::TensorIterator<lib_mli::OffsetBuffer, kMatMulRank, kMatMulIterRank> in1_tensor_iter;
+  lib_mli::TensorIterator<lib_mli::OffsetBuffer, kMatMulRank, kMatMulIterRank> in2_tensor_iter;
+  lib_mli::TensorIterator<lib_mli::OffsetBuffer, kMatMulRank, kMatMulIterRank> out_tensor_iter;
+
+
+  mli_data_container temp_in1_container{0};
+  mli_data_container temp_in2_container{0};
+  temp_in1_container.capacity = cur_test->in1.get_not_quantized_tensor(temp_in1_container).data.capacity;
+  temp_in2_container.capacity = cur_test->in2.get_not_quantized_tensor(temp_in2_container).data.capacity;
+
+  mli_tensor temp_input1_tensor = cur_test->in1.get_quantized_tensor(temp_in1_container);
+  mli_tensor temp_input2_tensor = cur_test->in2.get_quantized_tensor(temp_in2_container);
+
+  uint32_t input_tile_size[kMatMulRank] = {1, temp_input1_tensor.shape[1]};
+  uint32_t output_tile_size[kMatMulRank] = {1, temp_input2_tensor.shape[1]};
+  int32_t iteration_order[kMatMulRank] = {0, 1};
+  uint32_t shape[kMatMulRank] = {temp_input1_tensor.shape[0], temp_input1_tensor.shape[1]};
+
+  // tiling the height only: the width dimension must stay whole
+  assert(input_tile_size[1] == temp_input1_tensor.shape[1]);
+  assert(output_tile_size[1] == temp_input2_tensor.shape[1]);
+
+  // calculate the number of tiles needed
+  uint32_t num_tiles = 1;
+  for (int i = 0; i < kMatMulRank; i++) {
+    uint32_t tiles_per_dim = 1 + CEIL_DIV(shape[i] - input_tile_size[i], input_tile_size[i]);
+    num_tiles *= tiles_per_dim;
+  }
+
+
   /************ Prepare Phase *************/
-  prepare_phase(cur_test, MatMul_instance, MatMul_instance_size,
-                MatMul_conf_private, MatMul_conf_private_size, output_size, output_offset);
+  prepare_phase(cur_test, MatMul_instance, MatMul_instance_size, MatMul_conf_private, MatMul_conf_private_size,
+                in1_tensor_iter, in2_tensor_iter, out_tensor_iter, input_tile_size, output_tile_size, iteration_order);
 
 
   /************ Execution Phase *************/
-  execution_phase(cur_test, MatMul_instance, MatMul_instance_size,
-                  MatMul_conf_private, MatMul_conf_private_size, output_size, output_offset);
+  execution_phase(cur_test, MatMul_instance, MatMul_instance_size, MatMul_conf_private, MatMul_conf_private_size,
+                  in1_tensor_iter, in2_tensor_iter, out_tensor_iter, num_tiles, iteration_order);
 
 
   /************ Postprocess Phase *************/
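
The tile-count loop above is worth a quick sanity check. Assuming the usual CEIL_DIV(a, b) == (a + b - 1) / b (the macro's definition is not part of this diff), 1 + CEIL_DIV(shape[i] - tile[i], tile[i]) is simply ceil(shape[i] / tile[i]), with the leading 1 accounting for the first tile; note it assumes tile[i] <= shape[i], since the subtraction is unsigned. A compile-time check of a few cases:

#include <cstdint>

// Assumed CEIL_DIV semantics; the test's macro is defined elsewhere.
constexpr uint32_t ceil_div(uint32_t a, uint32_t b) { return (a + b - 1) / b; }

// Tiles along one dimension, as computed in main() above.
constexpr uint32_t tiles_per_dim(uint32_t extent, uint32_t tile) {
  return 1 + ceil_div(extent - tile, tile);
}

int main() {
  static_assert(tiles_per_dim(10, 1) == 10, "one row per tile");
  static_assert(tiles_per_dim(10, 4) == 3, "partial tile at the far edge");
  static_assert(tiles_per_dim(4, 4) == 1, "tile spans the whole dimension");
  return 0;
}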
