@@ -60,32 +60,64 @@ class GET_ROW_F16 {
6060
6161 __aicore__ inline void copy_in (uint32_t offset, size_t len) {
6262 LocalTensor<half> input_local = input_queue.AllocTensor <half>();
63- size_t tail = len % 32 ;
64- len = len & ~31 ;
63+ const size_t elem_per_block = 32 / sizeof (half);
64+ size_t tail = len % elem_per_block;
65+ len = len & ~elem_per_block;
6566 DataCopy (input_local, input_gm[offset], len);
6667 if (tail != 0 ) {
68+ DataCopy (input_local[len], input_gm[offset + len], elem_per_block);
69+ // clean
70+ for (int i = tail; i < elem_per_block; i++) {
71+ input_local[len].SetValue (i, 0 );
72+ }
73+ #if 0
74+ const half padVal = 0;
75+ uint64_t mask0 = ((uint64_t)1ul << 16) - ((uint64_t)1ul << tail);
76+ uint64_t mask[2] = {mask0, 0};
77+ Duplicate<half>(input_local[len], padVal, mask, 1 /*no repeat*/, 1/*no gap in block*/, 8/*no gap between ∂repeats*/);
78+ #endif
79+
80+ #if 0
6781 DataCopyExtParams dataCopyParams;
6882 dataCopyParams.blockCount = 1;
6983 dataCopyParams.blockLen = tail * sizeof(half);
7084 DataCopyPadExtParams<half> padParams;
7185 DataCopyPad(input_local[len], input_gm[offset + len],
7286 dataCopyParams, padParams);
87+
88+ uint16_t rightPadNum = 32 / sizeof(half) - tail;
89+ PadParams padParas{0, rightPadNum, 0};
90+ Pad(input_local[len], input_gm[offset + len], padParas, tilingData.padTilingData);
91+ #endif
7392 }
7493 input_queue.EnQue (input_local);
7594 }
7695
7796 __aicore__ inline void copy_out (uint32_t offset, size_t len) {
7897 LocalTensor<float > output_local = output_queue.DeQue <float >();
79- size_t tail = len % 32 ;
80- len = len & ~31 ;
81- DataCopy (output_gm[offset], output_local, len);
98+ const size_t elem_per_block = 32 / sizeof (float );
99+ size_t tail = len % elem_per_block;
100+ len = len & ~elem_per_block;
101+ // DataCopy(output_gm[offset], output_local, len);
82102 if (tail != 0 ) {
103+ len += elem_per_block;
104+ #if 0
83105 DataCopyExtParams dataCopyParams;
84106 dataCopyParams.blockCount = 1;
85107 dataCopyParams.blockLen = tail * sizeof(float);
86108 DataCopyPad(output_gm[offset + len], output_local[len],
87109 dataCopyParams);
110+ #endif
88111 }
112+ DataCopy (output_gm[offset], output_local, len);
113+ // clean
114+ if (tail != 0 ) {
115+ for (int i = tail; i < elem_per_block; i++) {
116+ output_gm[offset + len - elem_per_block].SetValue (i, 0 );
117+ }
118+ DataCacheCleanAndInvalid<float , CacheLine::SINGLE_CACHE_LINE>(output_gm[offset + len - elem_per_block]);
119+ }
120+
89121 output_queue.FreeTensor (output_local);
90122 }
91123
0 commit comments