@@ -51,47 +51,18 @@ class GET_ROW_F32 {
5151        //  All data should be aligned to 32. It's OK because all data is aligned to 32.
5252        pipe.InitBuffer (input_queue, BUFFER_NUM, local_buffer_size);
5353        pipe.InitBuffer (output_queue, BUFFER_NUM, local_buffer_size);
54-         //  printf("f32 BLOCK_IDX:%d get_row: init: ir:%d, dr:%d, n_elements:%d.\n", op_block_idx, ir, dr, n_elements);
5554    }
5655
// copy_in: stages `len` float elements, starting at input_gm[offset], into a
// tensor allocated from input_queue and then enqueues that tensor.
// The transfer is split in two: a leading part whose length is a multiple of
// elem_per_block (32 / sizeof(float) elements), and, when `len` is not such a
// multiple, one additional copy of a full elem_per_block for the remainder.
//   offset - index into input_gm at which the copy starts.
//   len    - number of float elements requested by the caller.
5756    __aicore__ inline  void  copy_in (uint32_t  offset, size_t  len) {
58-         size_t  origin_len = len;
5957        LocalTensor<float > input_local = input_queue.AllocTensor <float >();
            // Number of floats that fit in 32 bytes.
6058        const  size_t  elem_per_block = 32  / sizeof (float );
            // Elements left over after the last full group of elem_per_block.
6159        size_t  tail = len % elem_per_block;
            // Round len down to a multiple of elem_per_block; the mask form
            // relies on elem_per_block being a power of two.
6260        len = len & ~(elem_per_block - 1 );
6361
64-         // printf("f32 BLOCK_IDX:%d get_row: Copy_in: offset:%d, len:%d, origin_len:%d, tail:%d, elem_per_block:%d.\n", op_block_idx, offset, len, origin_len, tail, elem_per_block);
            // Copy the rounded-down part; skipped when fewer than
            // elem_per_block elements were requested.
6562        if  (len > 0 )
6663            DataCopy (input_local, input_gm[offset], len);
67-         // printf("f32 BLOCK_IDX:%d get_row: Copy_in executed: offset:%d, len:%d, origin_len:%d, tail:%d, elem_per_block:%d.\n", op_block_idx, offset, len, origin_len, tail, elem_per_block);
6864        if (tail != 0 ) {
69- #if  1 
70- /*              //printf("f32 BLOCK_IDX:%d get_row: Copy_in ENTER tail != 0: offset:%d, len:%d, origin_len:%d, tail:%d, elem_per_block:%d.\n", op_block_idx, offset, len, origin_len, tail, elem_per_block);
71-             for (int i = 0; i < elem_per_block; i++) { 
72-                 printf("f32 BLOCK_IDX:%d get_row: Copy_in: get value idx:%d, origin input local val:%f.\n", op_block_idx, i, input_local[len + i].GetValue(0)); 
73-             } 
74-             //DumpTensor(input_gm[offset + len], 5, elem_per_block); 
75-             for (int i = 0; i < tail; i++) { 
76-                 printf("f32 BLOCK_IDX:%d get_row: Copy_in: get value idx:%d, input local val:%f, input_gm:%f.\n", op_block_idx, len + i, input_local[len + i].GetValue(0), input_gm[offset + len + i]); 
77-             } */  
                // Copy the remainder as one full elem_per_block rather than
                // `tail` elements.
                // NOTE(review): presumably the last argument is an element
                // count, in which case this transfers elem_per_block - tail
                // elements beyond the requested range — confirm that input_gm
                // and input_local both have room for them.
7865            DataCopy (input_local[len], input_gm[offset + len], elem_per_block);
79-             //  clean
80- /*              for (int i = tail; i < elem_per_block; i++) {
81-                 input_local[len + i].SetValue(0, 0); 
82-             } 
83-             for (int i = 0; i < elem_per_block; i++) { 
84-                 printf("f32 BLOCK_IDX:%d get_row: Copy_in: get value idx:%d, after clean and copy, input local val:%f.\n", op_block_idx, i, input_local[len + i].GetValue(0)); 
85-             }  */  
86- #endif 
87- #if  0 
88-             DataCopyExtParams dataCopyParams;
89-             dataCopyParams.blockCount = 1;
90-             dataCopyParams.blockLen = tail * sizeof(float);
91-             DataCopyPadExtParams<float> padParams;
92-             DataCopyPad(input_local[len], input_gm[offset + len],
93-                         dataCopyParams, padParams);
94- #endif 
9566        }
            // Enqueue the filled tensor on input_queue.
9667        input_queue.EnQue (input_local);
9768    }
@@ -104,31 +75,15 @@ class GET_ROW_F32 {
10475        if  (len > 0 ) {
10576            DataCopy (output_gm[offset], output_local, len);
10677        }
107- 
108- #if  1 
10978        if (tail != 0 ) {
11079            for  (size_t  i = tail; i < elem_per_block; i++) {
11180                output_local[len + i].SetValue (0 , 0 );
11281            }
113-             // printf("\nf32 BLOCK_IDX:%d get_row: Copy_Out AtomicAdd: offset:%d, len:%d, tail:%d, elem_per_block:%d.\n", op_block_idx, offset, len, tail, elem_per_block);
114- /*              DumpTensor(output_gm[offset + len], 5, elem_per_block);
115-             DumpTensor(output_local[len], 5, elem_per_block); */  
11682            SetAtomicAdd<float >();
11783            DataCopy (output_gm[offset + len], output_local[len], elem_per_block);
11884            SetAtomicNone ();
119- /*              DumpTensor(output_gm[offset + len], 5, elem_per_block); */ 
12085        }
121- #endif 
122- #if  0 
123-         if(tail != 0) {
12486
125-             DataCopyExtParams dataCopyParams;
126-             dataCopyParams.blockCount = 1;
127-             dataCopyParams.blockLen = tail * sizeof(float);
128-             DataCopyPad(output_gm[offset + len], output_local[len],
129-                         dataCopyParams);
130-         }
131- #endif 
13287        output_queue.FreeTensor (output_local);
13388    }
13489
0 commit comments