@@ -51,47 +51,18 @@ class GET_ROW_F32 {
5151 // All data should be aligned to 32. It's OK because all data is aligned to 32.
5252 pipe.InitBuffer (input_queue, BUFFER_NUM, local_buffer_size);
5353 pipe.InitBuffer (output_queue, BUFFER_NUM, local_buffer_size);
54- // printf("f32 BLOCK_IDX:%d get_row: init: ir:%d, dr:%d, n_elements:%d.\n", op_block_idx, ir, dr, n_elements);
5554 }
5655
5756 __aicore__ inline void copy_in (uint32_t offset, size_t len) {
58- size_t origin_len = len;
5957 LocalTensor<float > input_local = input_queue.AllocTensor <float >();
6058 const size_t elem_per_block = 32 / sizeof (float );
6159 size_t tail = len % elem_per_block;
6260 len = len & ~(elem_per_block - 1 );
6361
64- // printf("f32 BLOCK_IDX:%d get_row: Copy_in: offset:%d, len:%d, origin_len:%d, tail:%d, elem_per_block:%d.\n", op_block_idx, offset, len, origin_len, tail, elem_per_block);
6562 if (len > 0 )
6663 DataCopy (input_local, input_gm[offset], len);
67- // printf("f32 BLOCK_IDX:%d get_row: Copy_in executed: offset:%d, len:%d, origin_len:%d, tail:%d, elem_per_block:%d.\n", op_block_idx, offset, len, origin_len, tail, elem_per_block);
6864 if (tail != 0 ) {
69- #if 1
70- /* //printf("f32 BLOCK_IDX:%d get_row: Copy_in ENTER tail != 0: offset:%d, len:%d, origin_len:%d, tail:%d, elem_per_block:%d.\n", op_block_idx, offset, len, origin_len, tail, elem_per_block);
71- for (int i = 0; i < elem_per_block; i++) {
72- printf("f32 BLOCK_IDX:%d get_row: Copy_in: get value idx:%d, origin input local val:%f.\n", op_block_idx, i, input_local[len + i].GetValue(0));
73- }
74- //DumpTensor(input_gm[offset + len], 5, elem_per_block);
75- for (int i = 0; i < tail; i++) {
76- printf("f32 BLOCK_IDX:%d get_row: Copy_in: get value idx:%d, input local val:%f, input_gm:%f.\n", op_block_idx, len + i, input_local[len + i].GetValue(0), input_gm[offset + len + i]);
77- } */
7865 DataCopy (input_local[len], input_gm[offset + len], elem_per_block);
79- // clean
80- /* for (int i = tail; i < elem_per_block; i++) {
81- input_local[len + i].SetValue(0, 0);
82- }
83- for (int i = 0; i < elem_per_block; i++) {
84- printf("f32 BLOCK_IDX:%d get_row: Copy_in: get value idx:%d, after clean and copy, input local val:%f.\n", op_block_idx, i, input_local[len + i].GetValue(0));
85- } */
86- #endif
87- #if 0
88- DataCopyExtParams dataCopyParams;
89- dataCopyParams.blockCount = 1;
90- dataCopyParams.blockLen = tail * sizeof(float);
91- DataCopyPadExtParams<float> padParams;
92- DataCopyPad(input_local[len], input_gm[offset + len],
93- dataCopyParams, padParams);
94- #endif
9566 }
9667 input_queue.EnQue (input_local);
9768 }
@@ -104,31 +75,15 @@ class GET_ROW_F32 {
10475 if (len > 0 ) {
10576 DataCopy (output_gm[offset], output_local, len);
10677 }
107-
108- #if 1
10978 if (tail != 0 ) {
11079 for (size_t i = tail; i < elem_per_block; i++) {
11180 output_local[len + i].SetValue (0 , 0 );
11281 }
113- // printf("\nf32 BLOCK_IDX:%d get_row: Copy_Out AtomicAdd: offset:%d, len:%d, tail:%d, elem_per_block:%d.\n", op_block_idx, offset, len, tail, elem_per_block);
114- /* DumpTensor(output_gm[offset + len], 5, elem_per_block);
115- DumpTensor(output_local[len], 5, elem_per_block); */
11682 SetAtomicAdd<float >();
11783 DataCopy (output_gm[offset + len], output_local[len], elem_per_block);
11884 SetAtomicNone ();
119- /* DumpTensor(output_gm[offset + len], 5, elem_per_block); */
12085 }
121- #endif
122- #if 0
123- if(tail != 0) {
12486
125- DataCopyExtParams dataCopyParams;
126- dataCopyParams.blockCount = 1;
127- dataCopyParams.blockLen = tail * sizeof(float);
128- DataCopyPad(output_gm[offset + len], output_local[len],
129- dataCopyParams);
130- }
131- #endif
13287 output_queue.FreeTensor (output_local);
13388 }
13489
0 commit comments