@@ -12,6 +12,9 @@ class QUANTIZE_FLOAT_TO_Q4_0 {
1212 __aicore__ inline void init (GM_ADDR input, GM_ADDR output,
1313 int64_t *input_ne_ub, size_t *input_nb_ub,
1414 int64_t *output_ne_ub) {
15+ // TODO: fix test_case CPY(type_src=f16,type_dst=q4_0,ne=[256,4,4,4],
16+ // permute=[0,0,0,0]):
17+ // [CPY] NMSE = 0.000008343 > 0.000001000 FAIL
1518 int64_t op_block_num = GetBlockNum ();
1619 int64_t op_block_idx = GetBlockIdx ();
1720
@@ -61,13 +64,13 @@ class QUANTIZE_FLOAT_TO_Q4_0 {
6164 pipe.InitBuffer (input_queue, BUFFER_NUM, Group_Size * sizeof (SRC_T));
6265 pipe.InitBuffer (output_queue, BUFFER_NUM,
6366 Group_Size * sizeof (int8_t ) / 2 );
64- pipe.InitBuffer (cast_queue , BUFFER_NUM , Group_Size * sizeof (float ));
65- pipe.InitBuffer (work_queue, BUFFER_NUM , Group_Size* sizeof (float ));
66- pipe.InitBuffer (max_queue, BUFFER_NUM , Group_Size* sizeof (float ));
67- pipe.InitBuffer (min_queue, BUFFER_NUM , Group_Size* sizeof (float ));
68- pipe.InitBuffer (scale_queue, BUFFER_NUM, 16 * sizeof (half));
69- pipe.InitBuffer (int8_queue, BUFFER_NUM , Group_Size * sizeof (int8_t ));
70- pipe.InitBuffer (half_queue, BUFFER_NUM , Group_Size * sizeof (half));
67+ pipe.InitBuffer (cast_queue , 1 , Group_Size * sizeof (float ));
68+ pipe.InitBuffer (work_queue, 1 , Group_Size * sizeof (float ));
69+ pipe.InitBuffer (max_queue, 1 , Group_Size * sizeof (float ));
70+ pipe.InitBuffer (min_queue, 1 , Group_Size * sizeof (float ));
71+ pipe.InitBuffer (scale_queue, 1 , Group_Size / 2 * sizeof (half));
72+ pipe.InitBuffer (int8_queue, 1 , Group_Size * sizeof (int8_t ));
73+ pipe.InitBuffer (half_queue, 1 , Group_Size * sizeof (half));
7174 }
7275
7376 __aicore__ inline void copy_in (uint32_t offset) {
@@ -178,13 +181,15 @@ class QUANTIZE_FLOAT_TO_Q4_0 {
178181 for (int64_t j = 0 ; j < group_size_in_row; j++) {
179182 half scale = calculate_group (i, j);
180183 scale_local.SetValue (scale_local_offset++, scale);
181- if (scale_local_offset == 16 ) {
184+ // Flush scale_local to scale_gm once Group_Size / 2 scales are buffered.
185+ if (scale_local_offset == Group_Size / 2 ) {
182186 scale_local_offset = 0 ;
183187 // TODO: OPTIMIZE ME
184188 pipe_barrier (PIPE_ALL);
185- DataCopy (scale_gm[scale_global_offset], scale_local, 16 );
189+ DataCopy (scale_gm[scale_global_offset], scale_local,
190+ Group_Size / 2 );
186191 pipe_barrier (PIPE_ALL);
187- scale_global_offset += 16 ;
192+ scale_global_offset += Group_Size / 2 ;
188193 }
189194 }
190195 }
0 commit comments