
Commit cd9a324

CANN Support run F16 model on Ascend310P
1 parent 61715d5 commit cd9a324

File tree

6 files changed: +146 -35 lines changed

ggml/src/ggml-cann/aclnn_ops.cpp

Lines changed: 24 additions & 8 deletions

@@ -904,6 +904,7 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
         return;
     }
     // TODO: simplify
+    const size_t CANN_DUP_OP_SUPPORTED_MAX_ROWS = 65535;
     if (src->type == GGML_TYPE_F16) {
         if (dst->type == GGML_TYPE_Q8_0) {
             aclrtlaunch_ascendc_quantize_f16_q8_0(
@@ -931,7 +932,7 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
         if (src->nb[0] == src_type_size) {
             // src0 is contigous on first dimension, copy by rows
             int64_t rows_num = ggml_nrows(src);
-
+            GGML_ASSERT(rows_num <= CANN_DUP_OP_SUPPORTED_MAX_ROWS);
             aclrtlaunch_ascendc_dup_by_rows_fp16(
                 rows_num, ctx.stream(), src->data, dst->data,
                 ((ggml_tensor*)src->extra)->ne,
@@ -956,6 +957,7 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
         if (src->nb[0] == src_type_size) {
             // src0 is contigous on first dimension, copy by rows
             int64_t rows_num = ggml_nrows(src);
+            GGML_ASSERT(rows_num <= CANN_DUP_OP_SUPPORTED_MAX_ROWS);
             aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32(
                 rows_num, ctx.stream(), src->data, dst->data,
                 ((ggml_tensor*)src->extra)->ne,
@@ -999,6 +1001,7 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
         if (src->nb[0] == src_type_size) {
             // src0 is contigous on first dimension, copy by rows
             int64_t rows_num = ggml_nrows(src);
+            GGML_ASSERT(rows_num <= CANN_DUP_OP_SUPPORTED_MAX_ROWS);
             aclrtlaunch_ascendc_dup_by_rows_fp32(
                 rows_num, ctx.stream(), src->data, dst->data,
                 ((ggml_tensor*)src->extra)->ne,
@@ -1025,6 +1028,7 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
         if (src->nb[0] == src_type_size) {
             // src0 is contigous on first dimension, copy by rows
             int64_t rows_num = ggml_nrows(src);
+            GGML_ASSERT(rows_num <= CANN_DUP_OP_SUPPORTED_MAX_ROWS);
             aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16(
                 rows_num, ctx.stream(), src->data, dst->data,
                 ((ggml_tensor*)src->extra)->ne,
@@ -2312,15 +2316,26 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
 
     switch (src0->type) {
         case GGML_TYPE_F32:
-            aclrtlaunch_ascendc_get_row_f32(
-                24, ctx.stream(), src0->data, src1->data, dst->data,
-                ((ggml_tensor*)src0->extra)->ne,
-                ((ggml_tensor*)src0->extra)->nb,
-                ((ggml_tensor*)src1->extra)->ne,
-                ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
-                ((ggml_tensor*)dst->extra)->nb);
+        {
+            if ((src0->ne[0] % 8) != 0) {
+                size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] * ggml_type_size(GGML_TYPE_F32);
+                ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
+            }
+            aclrtlaunch_ascendc_get_row_f32(
+                24, ctx.stream(), src0->data, src1->data, dst->data,
+                ((ggml_tensor*)src0->extra)->ne,
+                ((ggml_tensor*)src0->extra)->nb,
+                ((ggml_tensor*)src1->extra)->ne,
+                ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
+                ((ggml_tensor*)dst->extra)->nb);
+        }
             break;
         case GGML_TYPE_F16:
+        {
+            if ((src0->ne[0] % 16) != 0) {
+                size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] * ggml_type_size(GGML_TYPE_F32); // out is also f32, even input is f16
+                ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
+            }
             aclrtlaunch_ascendc_get_row_f16(
                 24, ctx.stream(), src0->data, src1->data, dst->data,
                 ((ggml_tensor*)src0->extra)->ne,
@@ -2329,6 +2344,7 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                 ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
                 ((ggml_tensor*)dst->extra)->nb);
             break;
+        }
         case GGML_TYPE_Q4_0:
             aclrtlaunch_ascendc_get_row_q4_0(
                 24, ctx.stream(), src0->data, src1->data, dst->data,

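The memset guard added above works in tandem with the atomic-add tail writes introduced in the kernels below: when a row's width is not a whole number of 32-byte blocks (8 f32 elements, 16 f16 elements), the kernel flushes its final partial block under SetAtomicAdd<float>(), and an add only behaves like a store if the destination starts at zero. A minimal host-side sketch of that alignment test (hypothetical helper, not part of the commit):

    #include <cstddef>
    #include <cstdint>

    // Does a get_rows destination need pre-zeroing? Yes whenever the row
    // width is not a multiple of the 32-byte block, because the kernel's
    // tail write is an atomic add over one full block.
    static bool needs_dst_memset(int64_t ne0, size_t elem_size) {
        const size_t elem_per_block = 32 / elem_size; // 8 for f32, 16 for f16
        return (ne0 % elem_per_block) != 0;
    }

    // needs_dst_memset(4096, 4) -> false (4096 % 8 == 0, no memset needed)
    // needs_dst_memset(100, 2)  -> true  (100 % 16 != 0, dst must be zeroed)

This is the same condition as the `% 8` / `% 16` checks in the F32 and F16 cases above.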
ggml/src/ggml-cann/kernels/CMakeLists.txt

Lines changed: 8 additions & 0 deletions

@@ -30,4 +30,12 @@ ascendc_library(ascendc_kernels STATIC
     ${SRC_FILES}
 )
 
+string(FIND "${SOC_VERSION}" "ascend310p" FIRST_310P_INDEX)
+if(FIRST_310P_INDEX GREATER -1)
+    ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCEND_310P)
+    message(STATUS "Compile for Ascend310P.")
+else()
+    message(STATUS "Compile for Ascend910B.")
+endif()
+
 # ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)

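The ASCEND_310P define set by this branch is consumed with a plain #ifdef in the kernel sources (see get_row_q4_0.cpp below). A sketch of that consumption pattern; only the ASCEND_310P macro name comes from the commit, the constant is an illustrative stand-in:

    // Compile-time SoC dispatch: ASCEND_310P is defined by the CMake
    // branch above when SOC_VERSION contains "ascend310p".
    #ifdef ASCEND_310P
    constexpr bool kHasQ4DequantPath = false; // 310P: Cast-based dequant disabled below
    #else
    constexpr bool kHasQ4DequantPath = true;  // 910B keeps the Cast-based path
    #endif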
ggml/src/ggml-cann/kernels/dup.cpp

Lines changed: 18 additions & 12 deletions

@@ -51,23 +51,29 @@ class DupByRows {
 
     __aicore__ inline void copy_in() {
         LocalTensor<SRC_T> src_local = src_queue.AllocTensor<SRC_T>();
-
-        DataCopyExtParams dataCopyParams;
-        dataCopyParams.blockCount = 1;
-        dataCopyParams.blockLen = num_elem * sizeof(SRC_T);
-        DataCopyPadExtParams<SRC_T> padParams;
-        DataCopyPad(src_local, src_gm, dataCopyParams, padParams);
-
+        const size_t elem_per_block = 32 / sizeof(SRC_T);
+        size_t tail = num_elem % elem_per_block;
+        size_t cpy_elements_len = tail > 0 ? num_elem + 1 : num_elem;
+        DataCopy(src_local, src_gm, cpy_elements_len);
         src_queue.EnQue(src_local);
     }
 
     __aicore__ inline void copy_out() {
         LocalTensor<DST_T> dst_local = dst_queue.DeQue<DST_T>();
-
-        DataCopyExtParams dataCopyParams;
-        dataCopyParams.blockCount = 1;
-        dataCopyParams.blockLen = num_elem * sizeof(DST_T);
-        DataCopyPad(dst_gm, dst_local, dataCopyParams);
+        const size_t elem_per_block = 32 / sizeof(DST_T);
+        size_t tail = num_elem % elem_per_block;
+        size_t len = num_elem & ~(elem_per_block - 1);
+        if (len > 0) {
+            DataCopy(dst_gm, dst_local, len);
+        }
+        if(tail != 0) {
+            for (size_t i = tail; i < elem_per_block; i++) {
+                dst_local[len + i].SetValue(0, 0);
+            }
+            SetAtomicAdd<float>();
+            DataCopy(dst_gm[len], dst_local[len], elem_per_block);
+            SetAtomicNone();
+        }
 
         dst_queue.FreeTensor(dst_local);
     }

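The rewritten copy_out drops DataCopyPad in favour of a two-step scheme: a plain DataCopy for the block-aligned prefix, then one full 32-byte block for the tail, written under SetAtomicAdd<float>() with the padding lanes zeroed so the add cannot disturb neighbouring data. A plain-C++ model of the tail trick, assuming the destination starts zeroed (which the host-side aclrtMemset arranges for the get_rows kernels):

    #include <cstddef>
    #include <cstdio>
    #include <cstring>

    int main() {
        const size_t elem_per_block = 8;   // 32 bytes / sizeof(float)
        float src[13], dst[16] = {0};      // 13 elements -> tail of 5
        for (int i = 0; i < 13; i++) src[i] = i + 1.0f;

        size_t num_elem = 13;
        size_t tail = num_elem % elem_per_block;        // 5
        size_t len  = num_elem & ~(elem_per_block - 1); // 8

        memcpy(dst, src, len * sizeof(float));          // aligned-prefix DataCopy

        float block[8] = {0};                           // padding lanes zeroed
        memcpy(block, src + len, tail * sizeof(float)); // real tail elements
        for (size_t i = 0; i < elem_per_block; i++)
            dst[len + i] += block[i];                   // models the atomic-add DataCopy

        for (int i = 0; i < 13; i++) printf("%g ", dst[i]); // prints 1..13
        printf("\n");
        return 0;
    }

Because the padding lanes carry zero and the destination was zeroed, the atomic add reproduces a plain store exactly, while still issuing a whole aligned block as the hardware requires.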
ggml/src/ggml-cann/kernels/get_row_f16.cpp

Lines changed: 36 additions & 7 deletions

@@ -14,7 +14,7 @@ class GET_ROW_F16 {
                int64_t *output_ne_ub, size_t *output_nb_ub) {
         // TODO, use template for F16/f32
         int64_t op_block_num = GetBlockNum();
-        int64_t op_block_idx = GetBlockIdx();
+        op_block_idx = GetBlockIdx();
 
         for (int i = 0; i < 4; i++) {
             input_ne[i] = input_ne_ub[i];
@@ -59,33 +59,61 @@ class GET_ROW_F16 {
     }
 
     __aicore__ inline void copy_in(uint32_t offset, size_t len) {
+        size_t origin_len = len;
         LocalTensor<half> input_local = input_queue.AllocTensor<half>();
-        size_t tail = len % 32;
-        len = len & ~31;
-        DataCopy(input_local, input_gm[offset], len);
+        const size_t elem_per_block = 32 / sizeof(half);
+        size_t tail = len % elem_per_block;
+        len = len & ~(elem_per_block - 1);
         if(tail != 0) {
+            //printf("f16 get_row: copy_in: offset:%d, len:%d, origin_len:%d, tail:%d, elem_per_block:%d.\n", offset, len, origin_len, tail, elem_per_block);
+            //DumpTensor(input_local, 5, elem_per_block);
+            len += elem_per_block;
+            // DataCopy(input_local[len], input_gm[offset + len], elem_per_block);
+#if 0
             DataCopyExtParams dataCopyParams;
             dataCopyParams.blockCount = 1;
             dataCopyParams.blockLen = tail * sizeof(half);
             DataCopyPadExtParams<half> padParams;
             DataCopyPad(input_local[len], input_gm[offset + len],
                         dataCopyParams, padParams);
+#endif
         }
+        DataCopy(input_local, input_gm[offset], len);
         input_queue.EnQue(input_local);
     }
 
     __aicore__ inline void copy_out(uint32_t offset, size_t len) {
         LocalTensor<float> output_local = output_queue.DeQue<float>();
-        size_t tail = len % 32;
-        len = len & ~31;
-        DataCopy(output_gm[offset], output_local, len);
+        const size_t elem_per_block = 32 / sizeof(float);
+        size_t tail = len % elem_per_block;
+        len = len & ~(elem_per_block - 1);
+        if (len > 0) {
+            DataCopy(output_gm[offset], output_local, len);
+        }
+#if 1
+        if(tail != 0) {
+            /* printf("\nf16 BLOCK_IDX:%d get_row: Copy_Out AtomicAdd: offset:%d, len:%d, tail:%d, elem_per_block:%d.\n", op_block_idx, offset, len, tail, elem_per_block);
+            DumpTensor(output_gm, 5, elem_per_block); */
+            for (size_t i = tail; i < elem_per_block; i++) {
+                output_local[len + i].SetValue(0, 0);
+            }
+            // DumpTensor(output_local[len], 5, elem_per_block);
+            SetAtomicAdd<float>();
+            DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
+            SetAtomicNone();
+            // DumpTensor(output_gm, 5, elem_per_block);
+        }
+#endif
 
+#if 0
         if(tail != 0) {
            DataCopyExtParams dataCopyParams;
            dataCopyParams.blockCount = 1;
            dataCopyParams.blockLen = tail * sizeof(float);
            DataCopyPad(output_gm[offset + len], output_local[len],
                        dataCopyParams);
        }
+#endif
         output_queue.FreeTensor(output_local);
     }
 
@@ -150,6 +178,7 @@ class GET_ROW_F16 {
     GlobalTensor<float> output_gm;
     TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
     TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
+    int64_t op_block_idx;
 };
 
 template <typename T>

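In copy_in the DataCopyPad path is now compiled out (#if 0) and the transfer length is instead rounded up to the next whole 32-byte block, deliberately over-reading global memory; the local buffer is allocated in whole blocks, so the extra elements are harmless. The round-up arithmetic as a standalone check (the mask trick requires elem_per_block to be a power of two, which 8 and 16 are):

    #include <cassert>
    #include <cstddef>

    static size_t round_up_to_block(size_t len, size_t elem_size) {
        const size_t elem_per_block = 32 / elem_size; // 16 for half, 8 for float
        size_t tail    = len % elem_per_block;
        size_t aligned = len & ~(elem_per_block - 1);
        return tail ? aligned + elem_per_block : aligned;
    }

    int main() {
        assert(round_up_to_block(100, 2) == 112); // f16: 100 -> 7 blocks of 16
        assert(round_up_to_block(100, 4) == 104); // f32: 100 -> 13 blocks of 8
        assert(round_up_to_block(96, 2)  == 96);  // already aligned: unchanged
        return 0;
    }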
ggml/src/ggml-cann/kernels/get_row_f32.cpp

Lines changed: 56 additions & 7 deletions

@@ -13,7 +13,7 @@ class GET_ROW_F32 {
                int64_t *indices_ne_ub, size_t *indices_nb_ub,
                int64_t *output_ne_ub, size_t *output_nb_ub) {
         int64_t op_block_num = GetBlockNum();
-        int64_t op_block_idx = GetBlockIdx();
+        op_block_idx = GetBlockIdx();
 
         for (int i = 0; i < 4; i++) {
             input_ne[i] = input_ne_ub[i];
@@ -51,36 +51,84 @@ class GET_ROW_F32 {
         // All data should asign to 32. It's ok because all data is align to 32.
         pipe.InitBuffer(input_queue, BUFFER_NUM, local_buffer_size);
         pipe.InitBuffer(output_queue, BUFFER_NUM, local_buffer_size);
+        // printf("f32 BLOCK_IDX:%d get_row: init: ir:%d, dr:%d, n_elements:%d.\n", op_block_idx, ir, dr, n_elements);
     }
 
     __aicore__ inline void copy_in(uint32_t offset, size_t len) {
+        size_t origin_len = len;
         LocalTensor<float> input_local = input_queue.AllocTensor<float>();
-        size_t tail = len % 32;
-        len = len & ~31;
-        DataCopy(input_local, input_gm[offset], len);
+        const size_t elem_per_block = 32 / sizeof(float);
+        size_t tail = len % elem_per_block;
+        len = len & ~(elem_per_block - 1);
+
+        //printf("f32 BLOCK_IDX:%d get_row: Copy_in: offset:%d, len:%d, origin_len:%d, tail:%d, elem_per_block:%d.\n", op_block_idx, offset, len, origin_len, tail, elem_per_block);
+        if (len > 0)
+            DataCopy(input_local, input_gm[offset], len);
+        //printf("f32 BLOCK_IDX:%d get_row: Copy_in executed: offset:%d, len:%d, origin_len:%d, tail:%d, elem_per_block:%d.\n", op_block_idx, offset, len, origin_len, tail, elem_per_block);
         if(tail != 0) {
+#if 1
+            /* //printf("f32 BLOCK_IDX:%d get_row: Copy_in ENTER tail != 0: offset:%d, len:%d, origin_len:%d, tail:%d, elem_per_block:%d.\n", op_block_idx, offset, len, origin_len, tail, elem_per_block);
+            for (int i = 0; i < elem_per_block; i++) {
+                printf("f32 BLOCK_IDX:%d get_row: Copy_in: get value idx:%d, origin input local val:%f.\n", op_block_idx, i, input_local[len + i].GetValue(0));
+            }
+            //DumpTensor(input_gm[offset + len], 5, elem_per_block);
+            for (int i = 0; i < tail; i++) {
+                printf("f32 BLOCK_IDX:%d get_row: Copy_in: get value idx:%d, input local val:%f, input_gm:%f.\n", op_block_idx, len + i, input_local[len + i].GetValue(0), input_gm[offset + len + i]);
+            } */
+            DataCopy(input_local[len], input_gm[offset + len], elem_per_block);
+            // clean
+            /* for (int i = tail; i < elem_per_block; i++) {
+                input_local[len + i].SetValue(0, 0);
+            }
+            for (int i = 0; i < elem_per_block; i++) {
+                printf("f32 BLOCK_IDX:%d get_row: Copy_in: get value idx:%d, after clean and copy, input local val:%f.\n", op_block_idx, i, input_local[len + i].GetValue(0));
+            } */
+#endif
+#if 0
             DataCopyExtParams dataCopyParams;
             dataCopyParams.blockCount = 1;
            dataCopyParams.blockLen = tail * sizeof(float);
            DataCopyPadExtParams<float> padParams;
            DataCopyPad(input_local[len], input_gm[offset + len],
                        dataCopyParams, padParams);
+#endif
         }
         input_queue.EnQue(input_local);
     }
 
     __aicore__ inline void copy_out(uint32_t offset, size_t len) {
         LocalTensor<float> output_local = output_queue.DeQue<float>();
-        size_t tail = len % 32;
-        len = len & ~31;
-        DataCopy(output_gm[offset], output_local, len);
+        const size_t elem_per_block = 32 / sizeof(float);
+        size_t tail = len % elem_per_block;
+        len = len & ~(elem_per_block - 1);
+        if (len > 0) {
+            DataCopy(output_gm[offset], output_local, len);
+        }
+
+#if 1
         if(tail != 0) {
+            for (size_t i = tail; i < elem_per_block; i++) {
+                output_local[len + i].SetValue(0, 0);
+            }
+            //printf("\nf32 BLOCK_IDX:%d get_row: Copy_Out AtomicAdd: offset:%d, len:%d, tail:%d, elem_per_block:%d.\n", op_block_idx, offset, len, tail, elem_per_block);
+            /* DumpTensor(output_gm[offset + len], 5, elem_per_block);
+            DumpTensor(output_local[len], 5, elem_per_block); */
+            SetAtomicAdd<float>();
+            DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
+            SetAtomicNone();
+            /* DumpTensor(output_gm[offset + len], 5, elem_per_block); */
+        }
+#endif
+#if 0
+        if(tail != 0) {
+
             DataCopyExtParams dataCopyParams;
             dataCopyParams.blockCount = 1;
             dataCopyParams.blockLen = tail * sizeof(float);
             DataCopyPad(output_gm[offset + len], output_local[len],
                         dataCopyParams);
         }
+#endif
         output_queue.FreeTensor(output_local);
     }
 
@@ -144,6 +192,7 @@ class GET_ROW_F32 {
     GlobalTensor<float> output_gm;
     TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
     TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
+    int64_t op_block_idx;
 };
 
 template <typename T>

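One edge case these kernels now handle: a row narrower than a single block leaves the aligned prefix empty (len rounds down to 0), so all of its data travels through the atomic-add tail block; that is what the new `if (len > 0)` guards before the prefix DataCopy are for. A quick table of the split for f32 (8 elements per 32-byte block):

    #include <cstddef>
    #include <cstdio>

    int main() {
        const size_t elem_per_block = 8; // f32 elements per 32-byte block
        const size_t widths[] = {5, 8, 13, 100};
        for (size_t n : widths) {
            size_t tail = n % elem_per_block;
            size_t len  = n & ~(elem_per_block - 1);
            printf("n=%3zu -> aligned len=%3zu, tail=%zu\n", n, len, tail);
        }
        // n=  5 -> len=  0, tail=5 (everything goes via the atomic tail)
        // n=  8 -> len=  8, tail=0 (no tail block at all)
        // n= 13 -> len=  8, tail=5
        // n=100 -> len= 96, tail=4
        return 0;
    }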
ggml/src/ggml-cann/kernels/get_row_q4_0.cpp

Lines changed: 4 additions & 1 deletion

@@ -110,9 +110,12 @@ class GET_ROW_Q4_0 {
         LocalTensor<float> output_local = output_queue.AllocTensor<float>();
 
         // TODO: cast more data to speed up.
+#ifdef ASCEND_310P
+        // TODO: 310P support quantification
+#else
         Cast(cast_local, input_local, RoundMode::CAST_NONE, QK4_0);
         Cast(output_local, cast_local, RoundMode::CAST_NONE, QK4_0);
-
+#endif
         // Only mul need compile by group.
         half scale = scale_gm.GetValue(scale_offset);

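On 310P the two Cast calls are compiled out, so the Q4_0 dequantize path is effectively disabled there until quantization support lands (the TODO above). What the enabled path computes, modeled on the host with the subsequent per-group scale multiply folded in for clarity (all names are illustrative; QK4_0 is the real group size of 32):

    #include <cstdint>

    // Widen one group of QK4_0 sign-extended int4 weights and apply the
    // group's scale. In the kernel the widening is Cast(int4 -> half)
    // followed by Cast(half -> float); the scale multiply happens separately.
    void dequant_group_model(const int8_t* q, float scale, float* out,
                             int qk = 32 /* QK4_0 */) {
        for (int i = 0; i < qk; i++) {
            out[i] = scale * (float)q[i];
        }
    }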