@@ -13,7 +13,7 @@ class GET_ROW_F32 {
1313 int64_t *indices_ne_ub, size_t *indices_nb_ub,
1414 int64_t *output_ne_ub, size_t *output_nb_ub) {
1515 int64_t op_block_num = GetBlockNum ();
16- int64_t op_block_idx = GetBlockIdx ();
16+ op_block_idx = GetBlockIdx ();
1717
1818 for (int i = 0 ; i < 4 ; i++) {
1919 input_ne[i] = input_ne_ub[i];
@@ -51,25 +51,38 @@ class GET_ROW_F32 {
5151 // All data should be aligned to 32. That's OK because all data is already aligned to 32.
5252 pipe.InitBuffer (input_queue, BUFFER_NUM, local_buffer_size);
5353 pipe.InitBuffer (output_queue, BUFFER_NUM, local_buffer_size);
54+ // printf("f32 BLOCK_IDX:%d get_row: init: ir:%d, dr:%d, n_elements:%d.\n", op_block_idx, ir, dr, n_elements);
5455 }
5556
5657 __aicore__ inline void copy_in (uint32_t offset, size_t len) {
58+ size_t origin_len = len;
5759 LocalTensor<float > input_local = input_queue.AllocTensor <float >();
5860 const size_t elem_per_block = 32 / sizeof (float );
5961 size_t tail = len % elem_per_block;
60- len = len & ~elem_per_block;
61- DataCopy (input_local, input_gm[offset], len);
62+ len = len & ~(elem_per_block - 1 );
63+
64+ // printf("f32 BLOCK_IDX:%d get_row: Copy_in: offset:%d, len:%d, origin_len:%d, tail:%d, elem_per_block:%d.\n", op_block_idx, offset, len, origin_len, tail, elem_per_block);
65+ if (len > 0 )
66+ DataCopy (input_local, input_gm[offset], len);
67+ // printf("f32 BLOCK_IDX:%d get_row: Copy_in executed: offset:%d, len:%d, origin_len:%d, tail:%d, elem_per_block:%d.\n", op_block_idx, offset, len, origin_len, tail, elem_per_block);
6268 if (tail != 0 ) {
69+ #if 1
70+ /* //printf("f32 BLOCK_IDX:%d get_row: Copy_in ENTER tail != 0: offset:%d, len:%d, origin_len:%d, tail:%d, elem_per_block:%d.\n", op_block_idx, offset, len, origin_len, tail, elem_per_block);
71+ for (int i = 0; i < elem_per_block; i++) {
72+ printf("f32 BLOCK_IDX:%d get_row: Copy_in: get value idx:%d, origin input local val:%f.\n", op_block_idx, i, input_local[len + i].GetValue(0));
73+ }
74+ //DumpTensor(input_gm[offset + len], 5, elem_per_block);
75+ for (int i = 0; i < tail; i++) {
76+ printf("f32 BLOCK_IDX:%d get_row: Copy_in: get value idx:%d, input local val:%f, input_gm:%f.\n", op_block_idx, len + i, input_local[len + i].GetValue(0), input_gm[offset + len + i]);
77+ } */
6378 DataCopy (input_local[len], input_gm[offset + len], elem_per_block);
6479 // clean
65- for (int i = tail; i < elem_per_block; i++) {
66- input_local[len].SetValue (i , 0 );
80+ /* for (int i = tail; i < elem_per_block; i++) {
81+ input_local[len + i ].SetValue(0 , 0);
6782 }
68- #if 0
69- const float padVal = 0;
70- uint64_t mask0 = ((uint64_t)1ul << 8) - ((uint64_t)1ul << tail);
71- uint64_t mask[2] = {mask0, 0};
72- Duplicate<float>(input_local[len], padVal, mask, 1 /*no repeat*/, 1/*no gap in block*/, 8/*no gap between repeats*/);
83+ for (int i = 0; i < elem_per_block; i++) {
84+ printf("f32 BLOCK_IDX:%d get_row: Copy_in: get value idx:%d, after clean and copy, input local val:%f.\n", op_block_idx, i, input_local[len + i].GetValue(0));
85+ } */
7386#endif
7487#if 0
7588 DataCopyExtParams dataCopyParams;
@@ -87,27 +100,32 @@ class GET_ROW_F32 {
87100 LocalTensor<float > output_local = output_queue.DeQue <float >();
88101 const size_t elem_per_block = 32 / sizeof (float );
89102 size_t tail = len % elem_per_block;
90- len = len & ~elem_per_block;
91- // DataCopy(output_gm[offset], output_local, len);
103+ len = len & ~(elem_per_block - 1 );
104+ if (len > 0 ) {
105+ DataCopy (output_gm[offset], output_local, len);
106+ }
107+
108+ #if 1
92109 if (tail != 0 ) {
93- len += elem_per_block;
110+ // printf("\nf32 BLOCK_IDX:%d get_row: Copy_Out AtomicAdd: offset:%d, len:%d, tail:%d, elem_per_block:%d.\n", op_block_idx, offset, len, tail, elem_per_block);
111+ /* DumpTensor(output_gm[offset + len], 5, elem_per_block);
112+ DumpTensor(output_local[len], 5, elem_per_block); */
113+ SetAtomicAdd<float >();
114+ DataCopy (output_gm[offset + len], output_local[len], elem_per_block);
115+ SetAtomicNone ();
116+ /* DumpTensor(output_gm[offset + len], 5, elem_per_block); */
117+ }
118+ #endif
94119#if 0
120+ if(tail != 0) {
121+
95122 DataCopyExtParams dataCopyParams;
96123 dataCopyParams.blockCount = 1;
97124 dataCopyParams.blockLen = tail * sizeof(float);
98125 DataCopyPad(output_gm[offset + len], output_local[len],
99126 dataCopyParams);
100- #endif
101127 }
102- DataCopy (output_gm[offset], output_local, len);
103-
104- if (tail != 0 ) { // clean
105- for (int i = tail; i < elem_per_block; i++) {
106- output_gm[offset + len - elem_per_block].SetValue (i, 0 );
107- }
108- DataCacheCleanAndInvalid<float , CacheLine::SINGLE_CACHE_LINE>(output_gm[offset + len - elem_per_block]);
109- }
110-
128+ #endif
111129 output_queue.FreeTensor (output_local);
112130 }
113131
@@ -171,6 +189,7 @@ class GET_ROW_F32 {
171189 GlobalTensor<float > output_gm;
172190 TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
173191 TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
192+ int64_t op_block_idx;
174193};
175194
176195template <typename T>
0 commit comments