 
 // optimize me. Use template to avoid copy code.
 using namespace AscendC;
+#ifdef ASCEND_310P // 310P does not support 4-bit get_row
+    extern "C" __global__ __aicore__ void ascendc_get_row_q4_0(
+        GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
+        GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
+        GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
+        // Just print an error message so that the remaining test cases can keep running; of course, any test case that calls this operator will fail.
+        printf("Ascend310P does not support 4-bit get_row.\n");
+    }
+#else
 
 #define BUFFER_NUM 2
 
@@ -110,12 +119,9 @@ class GET_ROW_Q4_0 { |
     LocalTensor<float> output_local = output_queue.AllocTensor<float>();
 
     // TODO: cast more data to speed up.
-#ifdef ASCEND_310P
-    // TODO: 310P support quantification
-#else
     Cast(cast_local, input_local, RoundMode::CAST_NONE, QK4_0);
     Cast(output_local, cast_local, RoundMode::CAST_NONE, QK4_0);
-#endif
+
     // Only mul need compile by group.
     half scale = scale_gm.GetValue(scale_offset);
 
@@ -194,3 +200,5 @@ extern "C" __global__ __aicore__ void ascendc_get_row_q4_0( |
             indices_nb_ub, output_ne_ub, output_nb_ub);
     op.calculate();
 }
+
+#endif // #ifdef ASCEND_310P
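
For context, below is a minimal CPU-side sketch of standard ggml q4_0 dequantization, the operation that the kernel's Cast-and-scale path (second hunk above) performs on-device. It assumes the usual q4_0 layout (QK4_0 = 32 weights per block, low/high nibbles offset by 8, one per-block scale, held here as a plain float); the on-device data layout may differ, e.g. in how the -8 offset is folded in during weight transformation, and the function name is purely illustrative.

#include <cstdint>

constexpr int QK4_0 = 32;

// Reference dequantization of one q4_0 block: unpack each 4-bit value,
// remove the +8 storage offset, and multiply by the per-block scale.
void dequantize_q4_0_block(const uint8_t *qs, float scale, float *out) {
    for (int i = 0; i < QK4_0 / 2; ++i) {
        out[i]              = ((qs[i] & 0x0F) - 8) * scale;  // low nibble -> elements 0..15
        out[i + QK4_0 / 2]  = ((qs[i] >> 4)   - 8) * scale;  // high nibble -> elements 16..31
    }
}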