@@ -13,7 +13,7 @@ class GET_ROW_F32 {
1313                                int64_t  *indices_ne_ub, size_t  *indices_nb_ub,
1414                                int64_t  *output_ne_ub, size_t  *output_nb_ub) {
1515        int64_t  op_block_num = GetBlockNum ();
16-         int64_t   op_block_idx = GetBlockIdx ();
16+         op_block_idx = GetBlockIdx ();
1717
1818        for  (int  i = 0 ; i < 4 ; i++) {
1919            input_ne[i] = input_ne_ub[i];
@@ -51,36 +51,84 @@ class GET_ROW_F32 {
5151        //  All data should be aligned to 32. It's OK because all data is aligned to 32.
5252        pipe.InitBuffer (input_queue, BUFFER_NUM, local_buffer_size);
5353        pipe.InitBuffer (output_queue, BUFFER_NUM, local_buffer_size);
54+         //  printf("f32 BLOCK_IDX:%d get_row: init: ir:%d, dr:%d, n_elements:%d.\n", op_block_idx, ir, dr, n_elements);
5455    }
5556
5657    __aicore__ inline  void  copy_in (uint32_t  offset, size_t  len) {
58+         size_t  origin_len = len;
5759        LocalTensor<float > input_local = input_queue.AllocTensor <float >();
58-         size_t  tail = len % 32 ;
59-         len = len & ~31 ;
60-         DataCopy (input_local, input_gm[offset], len);
60+         const  size_t  elem_per_block = 32  / sizeof (float );
61+         size_t  tail = len % elem_per_block;
62+         len = len & ~(elem_per_block - 1 );
63+ 
64+         // printf("f32 BLOCK_IDX:%d get_row: Copy_in: offset:%d, len:%d, origin_len:%d, tail:%d, elem_per_block:%d.\n", op_block_idx, offset, len, origin_len, tail, elem_per_block);
65+         if  (len > 0 )
66+             DataCopy (input_local, input_gm[offset], len);
67+         // printf("f32 BLOCK_IDX:%d get_row: Copy_in executed: offset:%d, len:%d, origin_len:%d, tail:%d, elem_per_block:%d.\n", op_block_idx, offset, len, origin_len, tail, elem_per_block);
6168        if (tail != 0 ) {
69+ #if  1 
70+ /*              //printf("f32 BLOCK_IDX:%d get_row: Copy_in ENTER tail != 0: offset:%d, len:%d, origin_len:%d, tail:%d, elem_per_block:%d.\n", op_block_idx, offset, len, origin_len, tail, elem_per_block);
71+             for (int i = 0; i < elem_per_block; i++) { 
72+                 printf("f32 BLOCK_IDX:%d get_row: Copy_in: get value idx:%d, origin input local val:%f.\n", op_block_idx, i, input_local[len + i].GetValue(0)); 
73+             } 
74+             //DumpTensor(input_gm[offset + len], 5, elem_per_block); 
75+             for (int i = 0; i < tail; i++) { 
76+                 printf("f32 BLOCK_IDX:%d get_row: Copy_in: get value idx:%d, input local val:%f, input_gm:%f.\n", op_block_idx, len + i, input_local[len + i].GetValue(0), input_gm[offset + len + i]); 
77+             } */  
78+             DataCopy (input_local[len], input_gm[offset + len], elem_per_block);
79+             //  clean
80+ /*              for (int i = tail; i < elem_per_block; i++) {
81+                 input_local[len + i].SetValue(0, 0); 
82+             } 
83+             for (int i = 0; i < elem_per_block; i++) { 
84+                 printf("f32 BLOCK_IDX:%d get_row: Copy_in: get value idx:%d, after clean and copy, input local val:%f.\n", op_block_idx, i, input_local[len + i].GetValue(0)); 
85+             }  */  
86+ #endif 
87+ #if  0 
6288            DataCopyExtParams dataCopyParams;
6389            dataCopyParams.blockCount = 1;
6490            dataCopyParams.blockLen = tail * sizeof(float);
6591            DataCopyPadExtParams<float> padParams;
6692            DataCopyPad(input_local[len], input_gm[offset + len],
6793                        dataCopyParams, padParams);
94+ #endif 
6895        }
6996        input_queue.EnQue (input_local);
7097    }
7198
7299    __aicore__ inline  void  copy_out (uint32_t  offset, size_t  len) {
73100        LocalTensor<float > output_local = output_queue.DeQue <float >();
74-         size_t  tail = len % 32 ;
75-         len = len & ~31 ;
76-         DataCopy (output_gm[offset], output_local, len);
101+         const  size_t  elem_per_block = 32  / sizeof (float );
102+         size_t  tail = len % elem_per_block;
103+         len = len & ~(elem_per_block - 1 );
104+         if  (len > 0 ) {
105+             DataCopy (output_gm[offset], output_local, len);
106+         }
107+ 
108+ #if  1 
77109        if (tail != 0 ) {
110+             for  (size_t  i = tail; i < elem_per_block; i++) {
111+                 output_local[len + i].SetValue (0 , 0 );
112+             }
113+             // printf("\nf32 BLOCK_IDX:%d get_row: Copy_Out AtomicAdd: offset:%d, len:%d, tail:%d, elem_per_block:%d.\n", op_block_idx, offset, len, tail, elem_per_block);
114+ /*              DumpTensor(output_gm[offset + len], 5, elem_per_block);
115+             DumpTensor(output_local[len], 5, elem_per_block); */  
116+             SetAtomicAdd<float >();
117+             DataCopy (output_gm[offset + len], output_local[len], elem_per_block);
118+             SetAtomicNone ();
119+ /*              DumpTensor(output_gm[offset + len], 5, elem_per_block); */ 
120+         }
121+ #endif 
122+ #if  0 
123+         if(tail != 0) {
124+ 
78125            DataCopyExtParams dataCopyParams;
79126            dataCopyParams.blockCount = 1;
80127            dataCopyParams.blockLen = tail * sizeof(float);
81128            DataCopyPad(output_gm[offset + len], output_local[len],
82129                        dataCopyParams);
83130        }
131+ #endif 
84132        output_queue.FreeTensor (output_local);
85133    }
86134
@@ -144,6 +192,7 @@ class GET_ROW_F32 {
144192    GlobalTensor<float > output_gm;
145193    TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
146194    TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
195+     int64_t  op_block_idx;
147196};
148197
149198template  <typename  T>
0 commit comments