-
Notifications
You must be signed in to change notification settings - Fork 13.7k
CANN: optimize the rope ops #15335
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
CANN: optimize the rope ops #15335
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2154,87 +2154,128 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, | |
|
|
||
| GGML_TENSOR_BINARY_OP_LOCALS | ||
|
|
||
| // theta_scale arange, [0,1,...,ne00/2 - 1] | ||
| int64_t theta_scale_length = ne00 / 2; | ||
| ggml_cann_pool_alloc theta_scale_allocator(ctx.pool(), | ||
| theta_scale_length * sizeof(float_t)); | ||
| void* theta_scale_buffer = theta_scale_allocator.get(); | ||
| int64_t theta_scale_ne[] = {theta_scale_length, 1, 1, 1}; | ||
| size_t theta_scale_nb[] = {sizeof(float_t), sizeof(float_t), sizeof(float_t), | ||
| theta_scale_length * sizeof(float_t)}; | ||
|
|
||
| aclTensor* acl_theta_scale_tensor = | ||
| ggml_cann_create_tensor(theta_scale_buffer, ACL_FLOAT, sizeof(float_t), | ||
| theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS); | ||
| float start = 0; | ||
| float step = 1; | ||
| float stop = ne00 / 2; | ||
| float n_elements = ne00 / 2; | ||
| aclnn_arange(ctx, acl_theta_scale_tensor, start, stop, step, n_elements); | ||
|
|
||
| // power | ||
| aclScalar* acl_theta_scale = aclCreateScalar(&theta_scale, aclDataType::ACL_FLOAT); | ||
| GGML_CANN_CALL_ACLNN_OP(ctx, PowScalarTensor, acl_theta_scale, acl_theta_scale_tensor, | ||
| acl_theta_scale_tensor); | ||
|
|
||
| // freq_scale | ||
| if (freq_scale != 1) { | ||
| aclnn_muls(ctx, acl_theta_scale_tensor, freq_scale, nullptr, true); | ||
| } | ||
|
|
||
| // freq_factors | ||
| if (src2) { | ||
| aclTensor* acl_freq_factors_tensor = ggml_cann_create_tensor( | ||
| src2->data, ggml_cann_type_mapping(src2->type), | ||
| ggml_type_size(src2->type), theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS); | ||
| aclnn_div(ctx, acl_theta_scale_tensor, acl_freq_factors_tensor); | ||
| ggml_cann_release_resources(ctx, acl_freq_factors_tensor); | ||
| } | ||
|
|
||
| // position | ||
| GGML_ASSERT(src1->type == GGML_TYPE_I32); | ||
| int64_t position_length = src1->ne[0]; | ||
| int64_t position_ne[] = {1, 1, position_length, 1}; | ||
| size_t position_nb[] = {sizeof(int32_t), sizeof(int32_t), sizeof(int32_t), | ||
| sizeof(int32_t) * position_length}; | ||
| aclTensor* acl_position_tensor = ggml_cann_create_tensor( | ||
| src1->data, ggml_cann_type_mapping(src1->type), | ||
| ggml_type_size(src1->type), position_ne, position_nb, GGML_MAX_DIMS); | ||
|
|
||
| // power * position | ||
| int64_t theta_length = theta_scale_length * position_length; | ||
| ggml_cann_pool_alloc theta_allocator(ctx.pool(), | ||
| theta_length * sizeof(float_t)); | ||
| void* theta_buffer = theta_allocator.get(); | ||
|
|
||
| int64_t theta_ne[] = {theta_scale_length, 1, position_length, 1}; | ||
| size_t theta_nb[GGML_MAX_DIMS]; | ||
| theta_nb[0] = sizeof(float_t); | ||
| for (int i = 1; i < GGML_MAX_DIMS; i++) { | ||
| theta_nb[i] = theta_nb[i - 1] * theta_ne[i - 1]; | ||
| } | ||
| aclTensor* acl_theta_tensor = | ||
| ggml_cann_create_tensor(theta_buffer, ACL_FLOAT, sizeof(float_t), | ||
| theta_ne, theta_nb, GGML_MAX_DIMS); | ||
| aclnn_mul(ctx, acl_position_tensor, acl_theta_scale_tensor, | ||
| acl_theta_tensor); | ||
|
|
||
| // sin/cos | ||
| ggml_cann_pool_alloc sin_allocator(ctx.pool(), | ||
| theta_length * sizeof(float_t)); | ||
| void* sin_buffer = sin_allocator.get(); | ||
| aclTensor* acl_sin_tensor = ggml_cann_create_tensor( | ||
| sin_buffer, ACL_FLOAT, sizeof(float_t), theta_ne, theta_nb, | ||
| GGML_MAX_DIMS, ACL_FORMAT_ND); | ||
| aclnn_sin(ctx, acl_theta_tensor, acl_sin_tensor); | ||
|
|
||
| ggml_cann_pool_alloc cos_allocator(ctx.pool(), | ||
| theta_length * sizeof(float_t)); | ||
| void* cos_buffer = cos_allocator.get(); | ||
| aclTensor* acl_cos_tensor = ggml_cann_create_tensor( | ||
| cos_buffer, ACL_FLOAT, sizeof(float_t), theta_ne, theta_nb, | ||
| GGML_MAX_DIMS, ACL_FORMAT_ND); | ||
| aclnn_cos(ctx, acl_theta_tensor, acl_cos_tensor); | ||
| bool is_q = (std::strncmp(dst->name, "Qcur-", 5) == 0); | ||
| bool is_k = (std::strncmp(dst->name, "Kcur-", 5) == 0); | ||
| bool is_attention = is_q || is_k; | ||
|
|
||
| if(ctx.init_ptr == nullptr || !is_attention) { | ||
| // theta_scale arange, [0,1,...,ne00/2 - 1] | ||
| if(ctx.init_ptr != nullptr){ | ||
| ACL_CHECK(aclrtFree(ctx.init_ptr)); | ||
| } | ||
| ACL_CHECK(aclrtMalloc(&ctx.init_ptr,theta_scale_length * sizeof(float_t), ACL_MEM_MALLOC_HUGE_FIRST)); | ||
|
||
|
|
||
| aclTensor* acl_theta_scale_tensor = | ||
| ggml_cann_create_tensor(ctx.init_ptr, ACL_FLOAT, sizeof(float_t), | ||
| theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS); | ||
| float start = 0; | ||
| float step = 1; | ||
| float stop = ne00 / 2; | ||
| float n_elements = ne00 / 2; | ||
| aclnn_arange(ctx, acl_theta_scale_tensor, start, stop, step, n_elements); | ||
|
|
||
| // power | ||
| aclScalar* acl_theta_scale = aclCreateScalar(&theta_scale, aclDataType::ACL_FLOAT); | ||
| GGML_CANN_CALL_ACLNN_OP(ctx, PowScalarTensor, acl_theta_scale, acl_theta_scale_tensor, | ||
| acl_theta_scale_tensor); | ||
|
|
||
| // freq_scale | ||
| if (freq_scale != 1) { | ||
| aclnn_muls(ctx, acl_theta_scale_tensor, freq_scale, nullptr, true); | ||
| } | ||
|
|
||
| // freq_factors | ||
| if (src2) { | ||
| aclTensor* acl_freq_factors_tensor = ggml_cann_create_tensor( | ||
| src2->data, ggml_cann_type_mapping(src2->type), | ||
| ggml_type_size(src2->type), theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS); | ||
| aclnn_div(ctx, acl_theta_scale_tensor, acl_freq_factors_tensor); | ||
| ggml_cann_release_resources(ctx, acl_freq_factors_tensor); | ||
| } | ||
| // release | ||
| ggml_cann_release_resources(ctx, acl_theta_scale_tensor,acl_theta_scale); | ||
| } | ||
|
|
||
| if(ctx.sin_ptr == nullptr) { | ||
| int64_t theta_length = theta_scale_length * ctx.max_position_length; | ||
| ACL_CHECK(aclrtMalloc(&ctx.sin_ptr, theta_length * sizeof(float_t), ACL_MEM_MALLOC_HUGE_FIRST)); | ||
| ACL_CHECK(aclrtMalloc(&ctx.cos_ptr, theta_length * sizeof(float_t), ACL_MEM_MALLOC_HUGE_FIRST)); | ||
| } | ||
| if(position_length > ctx.max_position_length) { | ||
| ctx.max_position_length = position_length; | ||
| int64_t theta_length = theta_scale_length * ctx.max_position_length; | ||
| ACL_CHECK(aclrtFree(ctx.sin_ptr)); | ||
| ACL_CHECK(aclrtFree(ctx.cos_ptr)); | ||
| ACL_CHECK(aclrtMalloc(&ctx.sin_ptr, theta_length * sizeof(float_t), ACL_MEM_MALLOC_HUGE_FIRST)); | ||
| ACL_CHECK(aclrtMalloc(&ctx.cos_ptr, theta_length * sizeof(float_t), ACL_MEM_MALLOC_HUGE_FIRST)); | ||
| } | ||
|
|
||
| bool is_fisrt_layer = (std::strncmp(dst->name, "Qcur-0", GGML_MAX_NAME) == 0); | ||
|
|
||
| if(is_fisrt_layer || !is_attention) { | ||
|
|
||
| aclTensor* acl_theta_scale_tensor = | ||
| ggml_cann_create_tensor(ctx.init_ptr, ACL_FLOAT, sizeof(float_t), | ||
| theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS); | ||
|
|
||
| // position | ||
| aclTensor* acl_position_tensor = ggml_cann_create_tensor( | ||
| src1->data, ggml_cann_type_mapping(src1->type), | ||
| ggml_type_size(src1->type), position_ne, position_nb, GGML_MAX_DIMS); | ||
|
|
||
| // power * position | ||
| int64_t theta_length = theta_scale_length * position_length; | ||
| ggml_cann_pool_alloc theta_allocator(ctx.pool(), | ||
| theta_length * sizeof(float_t)); | ||
| void* theta_buffer = theta_allocator.get(); | ||
|
|
||
| aclTensor* acl_theta_tensor = | ||
| ggml_cann_create_tensor(theta_buffer, ACL_FLOAT, sizeof(float_t), | ||
| theta_ne, theta_nb, GGML_MAX_DIMS); | ||
| aclnn_mul(ctx, acl_position_tensor, acl_theta_scale_tensor, | ||
| acl_theta_tensor); | ||
|
|
||
| // sin/cos | ||
| aclTensor* acl_sin_tensor = ggml_cann_create_tensor( | ||
| ctx.sin_ptr, ACL_FLOAT, sizeof(float_t), theta_ne, theta_nb, | ||
| GGML_MAX_DIMS, ACL_FORMAT_ND); | ||
| aclnn_sin(ctx, acl_theta_tensor, acl_sin_tensor); | ||
|
|
||
| aclTensor* acl_cos_tensor = ggml_cann_create_tensor( | ||
| ctx.cos_ptr, ACL_FLOAT, sizeof(float_t), theta_ne, theta_nb, | ||
| GGML_MAX_DIMS, ACL_FORMAT_ND); | ||
| aclnn_cos(ctx, acl_theta_tensor, acl_cos_tensor); | ||
|
|
||
| // release | ||
| ggml_cann_release_resources(ctx, acl_theta_scale_tensor, acl_position_tensor, | ||
| acl_theta_tensor, acl_sin_tensor, acl_cos_tensor); | ||
| } | ||
|
|
||
| aclTensor* acl_sin_tensor = ggml_cann_create_tensor( | ||
| ctx.sin_ptr, ACL_FLOAT, sizeof(float_t), theta_ne, theta_nb, | ||
| GGML_MAX_DIMS, ACL_FORMAT_ND); | ||
| aclTensor* acl_cos_tensor = ggml_cann_create_tensor( | ||
| ctx.cos_ptr, ACL_FLOAT, sizeof(float_t), theta_ne, theta_nb, | ||
| GGML_MAX_DIMS, ACL_FORMAT_ND); | ||
|
|
||
| // attn_factor | ||
| if (attn_factor != 1) { | ||
| aclnn_muls(ctx, acl_sin_tensor, attn_factor, nullptr, true); | ||
|
|
@@ -2257,8 +2298,7 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, | |
| } | ||
|
|
||
| // release | ||
| ggml_cann_release_resources(ctx, acl_theta_scale_tensor, acl_position_tensor, | ||
| acl_theta_tensor, acl_sin_tensor, acl_cos_tensor, acl_theta_scale); | ||
| ggml_cann_release_resources(ctx, acl_sin_tensor, acl_cos_tensor); | ||
| } | ||
|
|
||
| #ifdef __cplusplus | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -368,6 +368,10 @@ struct ggml_backend_cann_context { | |
| std::string name; /**< Name of the device. */ | ||
| std::string description; /**< Description of the device. */ | ||
| aclrtEvent copy_event = nullptr; /**< Event for managing copy operations. */ | ||
| void* init_ptr = nullptr; | ||
| void* sin_ptr = nullptr; | ||
| void* cos_ptr = nullptr; | ||
| int64_t max_position_length = 200000; | ||
|
||
| #ifdef USE_ACL_GRAPH | ||
| /// Cached CANN ACL graph used for executing the current ggml computation graph. | ||
| std::unique_ptr<ggml_cann_graph> cann_graph; | ||
|
|
@@ -414,6 +418,15 @@ struct ggml_backend_cann_context { | |
| ACL_CHECK(aclrtDestroyStream(streams[i])); | ||
| } | ||
| } | ||
| if(init_ptr != nullptr) { | ||
| ACL_CHECK(aclrtFree(init_ptr)); | ||
| } | ||
| if(sin_ptr != nullptr) { | ||
| ACL_CHECK(aclrtFree(sin_ptr)); | ||
| } | ||
| if(cos_ptr != nullptr) { | ||
| ACL_CHECK(aclrtFree(cos_ptr)); | ||
| } | ||
| } | ||
|
|
||
| /** | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.