
 // optimize me. Use template to avoid copy code.
 using namespace AscendC;
+#ifdef ASCEND_310P // the 310P does not support 4-bit get_row
+    extern "C" __global__ __aicore__ void ascendc_get_row_q4_0(
+        GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
+        GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
+        GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
+        // Just print an error here so that the remaining test cases can keep
+        // running; the test case that calls this operator will of course fail.
+        printf("Ascend310P does not support 4-bit get_row.\n");
+    }
+#else

 #define BUFFER_NUM 2

@@ -110,12 +119,9 @@ class GET_ROW_Q4_0 {
         LocalTensor<float> output_local = output_queue.AllocTensor<float>();

         // TODO: cast more data to speed up.
-#ifdef ASCEND_310P
-        // TODO: 310P support quantification
-#else
         Cast(cast_local, input_local, RoundMode::CAST_NONE, QK4_0);
         Cast(output_local, cast_local, RoundMode::CAST_NONE, QK4_0);
-#endif
+
         // Only mul need compile by group.
         half scale = scale_gm.GetValue(scale_offset);

@@ -194,3 +200,5 @@ extern "C" __global__ __aicore__ void ascendc_get_row_q4_0(
             indices_nb_ub, output_ne_ub, output_nb_ub);
     op.calculate();
 }
+
+#endif // #ifdef ASCEND_310P
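For reference, a minimal host-side sketch of the dequantization this operator performs, assuming llama.cpp's standard q4_0 block layout (one half-precision scale followed by QK4_0 = 32 quants packed two 4-bit values per byte). The struct and function names below are illustrative only and are not part of the CANN kernel; on device, the nibble unpacking, the int8 -> half -> float conversions (the Cast calls above), and the per-group multiply by the scale read from scale_gm appear to express the same arithmetic with AscendC tensor primitives.

// Illustrative reference only: dequantize one q4_0 block, y[i] = d * (q[i] - 8).
#include <cstdint>

constexpr int QK4_0_REF = 32;           // block size, matches QK4_0

struct block_q4_0_ref {                 // hypothetical mirror of ggml's block_q4_0
    float   d;                          // scale (stored as fp16 in ggml; float here for brevity)
    uint8_t qs[QK4_0_REF / 2];          // 32 quants, two 4-bit values per byte
};

static void dequantize_block_q4_0_ref(const block_q4_0_ref & b, float * y) {
    for (int j = 0; j < QK4_0_REF / 2; ++j) {
        const int q_lo = (b.qs[j] & 0x0F) - 8;  // low nibble  -> first half of the block
        const int q_hi = (b.qs[j] >> 4) - 8;    // high nibble -> second half of the block
        y[j]                 = b.d * q_lo;
        y[j + QK4_0_REF / 2] = b.d * q_hi;
    }
}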