@@ -51,7 +51,6 @@ struct MatMul_test_operands {
   int8_t in2_zp;
   tensor_quantizer out;
   uint32_t data_size;
-
   const quality_metrics threshold;
   const crc32_calc check_sum;
 };
@@ -90,19 +89,19 @@ static IO_DATA_ATTR int8_t g_mem_pool[kMemSize] = {0};
 
 constexpr int kTestsNum = sizeof(tests_list) / sizeof(tests_list[0]);
 
-template <typename in1_t, typename in2_t, typename out_t, uint32_t rank>
-void MatMul_prepare_and_run(Tensor<InternalBuffer, rank> &in_left,
-                            Tensor<InternalBuffer, rank> &in_right,
-                            Tensor<InternalBuffer, rank> &output,
-                            InternalBuffer &encoded_params);
 
 void prepare_phase(MatMul_test_operands* cur_test,
                    void *& MatMul_instance,
                    uint32_t & MatMul_instance_size,
                    void *& MatMul_conf_private,
                    uint32_t & MatMul_conf_private_size,
-                   uint32_t & output_size,
-                   uint32_t & output_offset) {
+                   lib_mli::TensorIterator<lib_mli::OffsetBuffer, kMatMulRank, kMatMulIterRank> &input1_tensor,
+                   lib_mli::TensorIterator<lib_mli::OffsetBuffer, kMatMulRank, kMatMulIterRank> &input2_tensor,
+                   lib_mli::TensorIterator<lib_mli::OffsetBuffer, kMatMulRank, kMatMulIterRank> &output_tensor,
+                   uint32_t *input_tile_shape,
+                   uint32_t *output_tile_shape,
+                   int32_t *iteration_order) {
 
   mli_data_container temp_in1_container{0};
   mli_data_container temp_in2_container{0};
@@ -125,9 +124,13 @@ void prepare_phase(MatMul_test_operands* cur_test,
   const lib_mli::Tensor<lib_mli::NoBuffer, kMatMulRank> in2_tensor(temp_input2_tensor.shape, temp_input2_tensor.mem_stride);
   const lib_mli::Tensor<lib_mli::NoBuffer, kMatMulRank> out_tensor(temp_output_tensor.shape, temp_output_tensor.mem_stride);
 
-  lib_mli::TensorIterator<lib_mli::NoBuffer, kMatMulRank, kMatMulIterRank> in1_tensor_it(in1_tensor);
-  lib_mli::TensorIterator<lib_mli::NoBuffer, kMatMulRank, kMatMulIterRank> in2_tensor_it(in2_tensor);
-  lib_mli::TensorIterator<lib_mli::NoBuffer, kMatMulRank, kMatMulIterRank> out_tensor_it(out_tensor);
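+  // in1 and out now iterate tile by tile in iteration_order; in2 keeps its
+  // full shape, since each output tile consumes the whole right-hand operand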
+  lib_mli::TensorIterator<lib_mli::NoBuffer, kMatMulRank, kMatMulIterRank> in1_tensor_it(in1_tensor, input_tile_shape, iteration_order);
+  lib_mli::TensorIterator<lib_mli::NoBuffer, kMatMulRank, kMatMulIterRank> in2_tensor_it(in2_tensor, temp_input2_tensor.shape, iteration_order);
+  lib_mli::TensorIterator<lib_mli::NoBuffer, kMatMulRank, kMatMulIterRank> out_tensor_it(out_tensor, output_tile_shape, iteration_order);
+
+  input1_tensor = in1_tensor_it;
+  input2_tensor = in2_tensor_it;
+  output_tensor = out_tensor_it;
 
   lib_mli::PlatformDescription pd;
   lib_ref::KernelsFactory kernel_factory(pd);
@@ -156,26 +159,29 @@ void prepare_phase(MatMul_test_operands* cur_test,
 
   // MatMul Input1
   offset = &offsets[0];
-  uint32_t in1_size = lib_mli::service::GetBufferSize(lib_mli::kMatMulRank, temp_input1_tensor.shape, temp_input1_tensor.mem_stride) * elem_size;
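+  // size the in1 buffer for a single input tile rather than the full tensor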
+  uint32_t in1_size = lib_mli::service::GetBufferSize(lib_mli::kMatMulRank, input_tile_shape, temp_input1_tensor.mem_stride) * elem_size;
   lib_mli::OffsetBuffer MatMul_in1_buf{*offset, 0, in1_size, elem_size};
+  input1_tensor.set_buf(MatMul_in1_buf);
   uint32_t in1_mem_offset = *offset;
   *offset += in1_size;
 
   // MatMul Input2
   offset = &offsets[0];
   uint32_t in2_size = lib_mli::service::GetBufferSize(lib_mli::kMatMulRank, temp_input2_tensor.shape, temp_input2_tensor.mem_stride) * elem_size;
   lib_mli::OffsetBuffer MatMul_in2_buf{*offset, 0, in2_size, elem_size};
+  input2_tensor.set_buf(MatMul_in2_buf);
   uint32_t in2_mem_offset = *offset;
   *offset += in2_size;
 
   // MatMul Output
   offset = &offsets[0];
-  uint32_t out_size = lib_mli::service::GetBufferSize(lib_mli::kMatMulRank, temp_output_tensor.shape, temp_output_tensor.mem_stride) * sizeof(int32_t);
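+  // the output buffer likewise only needs to hold one output tile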
+  uint32_t out_size = lib_mli::service::GetBufferSize(lib_mli::kMatMulRank, output_tile_shape, temp_output_tensor.mem_stride) * sizeof(int32_t);
   lib_mli::OffsetBuffer MatMul_out_buf{*offset, 0, out_size, sizeof(int32_t)};
+  output_tensor.set_buf(MatMul_out_buf);
   uint32_t out_mem_offset = *offset;
   *offset += out_size;
-  output_offset = out_mem_offset;
-  output_size = out_size;
 
   // MatMul input zero point
   uint32_t inpzp_size = MatMul_op->GetEncodedParamsSize() * elem_size;
@@ -215,20 +221,6 @@ void prepare_phase(MatMul_test_operands* cur_test,
     const uint32_t idx = inpzp_mem_offset + i;
     g_mem_pool[idx] = encoded_zp_buf.read<int8_t>(i);
   }
-  /* copy in1 from scratch memory to g_mem_pool */
-  int8_t* temp_mem = (int8_t*)g_scratch_mem_in1;
-  int8_t* dst_buffer = (int8_t*)(g_mem_pool + in1_mem_offset);
-  for (int idx = 0; idx < in1_size; idx++) {
-    dst_buffer[idx] = temp_mem[idx];
-  }
-
-  /* copy in2 from scratch memory to g_mem_pool */
-  temp_mem = (int8_t*)g_scratch_mem_in2;
-  dst_buffer = (int8_t*)(g_mem_pool + in2_mem_offset);
-  for (int idx = 0; idx < in2_size; idx++) {
-    dst_buffer[idx] = temp_mem[idx];
-  }
 
   MatMul_instance = (int8_t*)g_mem_pool;
   MatMul_instance_size = MatMul_op->GetRuntimeObjectSize();
@@ -243,10 +235,13 @@ void prepare_phase(MatMul_test_operands* cur_test,
 void execution_phase(const MatMul_test_operands* cur_test,
                      void * MatMul_instance,
                      uint32_t MatMul_instance_size,
-                     void * MatMul_conf_private,
-                     uint32_t MatMul_conf_private_size,
-                     uint32_t & output_size,
-                     uint32_t & output_offset) {
+                     void *& MatMul_conf_private,
+                     uint32_t & MatMul_conf_private_size,
+                     lib_mli::TensorIterator<lib_mli::OffsetBuffer, kMatMulRank, kMatMulIterRank> &input1_tensor,
+                     lib_mli::TensorIterator<lib_mli::OffsetBuffer, kMatMulRank, kMatMulIterRank> &input2_tensor,
+                     lib_mli::TensorIterator<lib_mli::OffsetBuffer, kMatMulRank, kMatMulIterRank> &output_tensor,
+                     uint32_t & num_tiles,
+                     int32_t * iteration_order) {
   // STEP 3: Execution phase
   // ==================================================================
@@ -257,24 +252,57 @@ void execution_phase(const MatMul_test_operands* cur_test,
                      MatMul_conf_private,
                      MatMul_conf_private_size,
                      membasis, sizeof(membasis) / sizeof(membasis[0]));
+
   assert(MatMul_run_op != nullptr);
   mli_status status = MLI_STATUS_OK;
 
+  uint32_t input1_tile_size[kMatMulRank]{};
+  uint32_t input2_tile_size[kMatMulRank]{};
+  uint32_t output_tile_size[kMatMulRank]{};
+  int32_t input1_tile_offsets[kMatMulRank]{};
+  int32_t input2_tile_offsets[kMatMulRank]{};
+  int32_t output_tile_offsets[kMatMulRank]{};
+  int32_t tile_input1_strides[kMatMulRank]{};
+  int32_t tile_input2_strides[kMatMulRank]{};
+  int32_t tile_output_strides[kMatMulRank]{};
+  const int32_t zero_offsets[kMatMulRank]{};
+
+  input1_tensor.get_mem_strides(tile_input1_strides);
+  input2_tensor.get_mem_strides(tile_input2_strides);
+  output_tensor.get_mem_strides(tile_output_strides);
+
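+  // walk the workload tile by tile: query the current tile geometry from the
+  // kernel, stage the input tiles, run, then copy the output tile back out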
+  for (size_t i = 0; i < num_tiles; i++) {
+    lib_ref::MatMul* pimpl = dynamic_cast<lib_ref::MatMul*>(MatMul_run_op);
+    pimpl->GetIOSizesAndOffsets(input1_tile_size, input2_tile_size, output_tile_size,
+                                input1_tile_offsets, input2_tile_offsets, output_tile_offsets);
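+    // in2 is not tiled, so read the full right-hand operand from offset zero
+    // on every iteration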
+    input2_tile_offsets[0] = input2_tile_offsets[1] = 0;
+
   status = MatMul_run_op->Prefetch();
   assert(status == MLI_STATUS_OK);
 
+    // copy inputs from global buffer to local tile buffer
+    strided_copy_with_offsets(kMatMulRank, input1_tensor.get_buf().get_elem_size(),
+                              g_scratch_mem_in1, input1_tile_offsets, zero_offsets, tile_input1_strides,
+                              input1_tile_size, (int8_t*)(g_mem_pool + input1_tensor.get_buf().get_offset()));
+
+    strided_copy_with_offsets(kMatMulRank, input2_tensor.get_buf().get_elem_size(),
+                              g_scratch_mem_in2, input2_tile_offsets, zero_offsets, tile_input2_strides,
+                              input2_tile_size, (int8_t*)(g_mem_pool + input2_tensor.get_buf().get_offset()));
+
   status = MatMul_run_op->Issue();
   assert(status == MLI_STATUS_OK);
 
-  status = MatMul_run_op->Update();
-  assert(status == MLI_STATUS_OK);
-
+    // copy output from local tile buffer to global buffer
+    strided_copy_with_offsets(kMatMulRank, output_tensor.get_buf().get_elem_size(),
+                              (int8_t*)(g_mem_pool + output_tensor.get_buf().get_offset()),
+                              zero_offsets, output_tile_offsets, tile_output_strides,
+                              output_tile_size, (int8_t*)g_scratch_mem_out);
 
-  // copy output from g_mem_pool to scratch mem
-  int8_t *dest_mem = (int8_t*)g_scratch_mem_out;
-  int8_t *temp_buffer = (int8_t*)(g_mem_pool + output_offset);
-  for (int idx = 0; idx < output_size; idx++) {
-    dest_mem[idx] = temp_buffer[idx];
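+    // Update() moves the kernel's internal iterators on to the next tile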
+    status = MatMul_run_op->Update();
+    assert(status == MLI_STATUS_OK);
   }
 }
 
@@ -343,16 +371,45 @@ int main() {
 
     void * MatMul_conf_private = nullptr;
     uint32_t MatMul_conf_private_size = 0;
-    uint32_t output_size = 0;
-    uint32_t output_offset = 0;
+
+    lib_mli::TensorIterator<lib_mli::OffsetBuffer, kMatMulRank, kMatMulIterRank> in1_tensor_iter;
+    lib_mli::TensorIterator<lib_mli::OffsetBuffer, kMatMulRank, kMatMulIterRank> in2_tensor_iter;
+    lib_mli::TensorIterator<lib_mli::OffsetBuffer, kMatMulRank, kMatMulIterRank> out_tensor_iter;
+
+    mli_data_container temp_in1_container{0};
+    mli_data_container temp_in2_container{0};
+    temp_in1_container.capacity = cur_test->in1.get_not_quantized_tensor(temp_in1_container).data.capacity;
+    temp_in2_container.capacity = cur_test->in2.get_not_quantized_tensor(temp_in2_container).data.capacity;
+
+    mli_tensor temp_input1_tensor = cur_test->in1.get_quantized_tensor(temp_in1_container);
+    mli_tensor temp_input2_tensor = cur_test->in2.get_quantized_tensor(temp_in2_container);
+
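+    // one row per tile: in1 tiles are {1, in1_width}, output tiles are {1, out_width}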
+    uint32_t input_tile_size[kMatMulRank] = {1, temp_input1_tensor.shape[1]};
+    uint32_t output_tile_size[kMatMulRank] = {1, temp_input2_tensor.shape[1]};
+    int32_t iteration_order[kMatMulRank] = {0, 1};
+    uint32_t shape[kMatMulRank] = {temp_input1_tensor.shape[0], temp_input1_tensor.shape[1]};
+
+    // tiling the height (dim 0) only; dim 1 must stay whole
+    assert(input_tile_size[1] == temp_input1_tensor.shape[1]);
+    assert(output_tile_size[1] == temp_input2_tensor.shape[1]);
+
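+    // CEIL_DIV is assumed to round up (CEIL_DIV(a, b) == (a + b - 1) / b), so a
+    // partial tile at the edge still counts as a full tile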
+    // calculate number of tiles needed
+    uint32_t num_tiles = 1;
+    for (int i = 0; i < kMatMulRank; i++) {
+      uint32_t tiles_per_dim = 1 + CEIL_DIV(shape[i] - input_tile_size[i], input_tile_size[i]);
+      num_tiles *= tiles_per_dim;
+    }
+
     /************ Prepare Phase *************/
-    prepare_phase(cur_test, MatMul_instance, MatMul_instance_size,
-                  MatMul_conf_private, MatMul_conf_private_size, output_size, output_offset);
+    prepare_phase(cur_test, MatMul_instance, MatMul_instance_size, MatMul_conf_private, MatMul_conf_private_size,
+                  in1_tensor_iter, in2_tensor_iter, out_tensor_iter, input_tile_size, output_tile_size, iteration_order);
 
 
     /************ Execution Phase *************/
-    execution_phase(cur_test, MatMul_instance, MatMul_instance_size,
-                    MatMul_conf_private, MatMul_conf_private_size, output_size, output_offset);
+    execution_phase(cur_test, MatMul_instance, MatMul_instance_size, MatMul_conf_private, MatMul_conf_private_size,
+                    in1_tensor_iter, in2_tensor_iter, out_tensor_iter, num_tiles, iteration_order);
 
 
     /************ Postprocess Phase *************/