Skip to content

Commit 84d626b

Browse files
committed
CANN: support CONV_TRANSPOSE_1D with kernel_size > 255
1 parent e4ae383 commit 84d626b

File tree

2 files changed

+138
-13
lines changed

2 files changed

+138
-13
lines changed

ggml/src/ggml-cann/aclnn_ops.cpp

Lines changed: 137 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2988,32 +2988,156 @@ void ggml_cann_argmax(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
29882988
GGML_CANN_CALL_ACLNN_OP(ctx, ArgMax, acl_src.get(), 3, false, acl_dst.get());
29892989
}
29902990

2991-
/**
 * @brief Compute GGML_OP_CONV_TRANSPOSE_1D on the CANN backend.
 *
 * aclnnConvolution only supports kernels of up to 255 taps per call, so the
 * kernel is split into chunks of at most 255 elements along dim 0. Each chunk
 * is sliced out, run through a transposed convolution on its own, zero-padded
 * into its position in the full output, and accumulated into dst.
 *
 * dst->src[0]: kernel (ne[0] = kernel_size)
 * dst->src[1]: input  (ne[0] = input length)
 * dst->op_params[0]: stride s0
 */
void ggml_cann_conv_transpose_1d(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
    ggml_tensor * src0 = dst->src[0];  // kernel
    ggml_tensor * src1 = dst->src[1];  // input

    // stride
    int64_t s0 = ((const int32_t *) (dst->op_params))[0];

    acl_tensor_ptr acl_input  = ggml_cann_create_tensor(src1, src1->ne, src1->nb, 3, ACL_FORMAT_NCL);
    acl_tensor_ptr acl_weight = ggml_cann_create_tensor(src0, src0->ne, src0->nb, 3, ACL_FORMAT_NCL);
    acl_tensor_ptr acl_dst    = ggml_cann_create_tensor(dst, dst->ne, dst->nb, 3, ACL_FORMAT_NCL);

    // extents along the convolved dimension
    int64_t input_len   = src1->ne[0];
    int64_t dst_len     = dst->ne[0];
    int64_t kernel_size = src0->ne[0];

    // max kernel taps aclnnConvolution accepts per call; split into
    // ceil(kernel_size / max_kernel_size) parts
    const int64_t max_kernel_size = 255;
    const int64_t part_num        = (kernel_size + max_kernel_size - 1) / max_kernel_size;

    int64_t strideVal[]   = { s0 };
    int64_t paddingVal[]  = { 0 };
    int64_t dilationVal[] = { 1 };
    // RAII wrappers (as used elsewhere in this file) so the aclIntArray
    // objects are released instead of leaking
    acl_int_array_ptr stride   = ggml_cann_create_int_array(strideVal, 1);
    acl_int_array_ptr padding  = ggml_cann_create_int_array(paddingVal, 1);
    acl_int_array_ptr dilation = ggml_cann_create_int_array(dilationVal, 1);

    const bool    transposed   = true;
    const int64_t groups       = 1;
    int8_t        cubeMathType = 0;

#ifdef ASCEND_310P
    cubeMathType = 1;
#endif

    auto weight_type = ggml_cann_type_mapping(src0->type);
    auto dst_type    = ggml_cann_type_mapping(dst->type);

    // slice parameters: take [slice_start, slice_end) along the last dim
    const int64_t slice_dim  = -1;
    const int64_t slice_step = 1;

    // scale 1.0 for the final accumulation (dst += part result)
    float       alphaValue = 1.0f;
    aclScalar * alpha      = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);

    // constant 0 used by ConstantPadNd; hoisted out of the loop so it is
    // created once per call instead of once per part
    float       pad_valueVal = 0.0f;
    aclScalar * pad_value    = aclCreateScalar(&pad_valueVal, aclDataType::ACL_FLOAT);
    // NOTE(review): alpha/pad_value are never destroyed — confirm whether an
    // acl scalar RAII wrapper exists in this file and use it here.

    // dst accumulates the per-part results, so it must start from zero
    GGML_CANN_CALL_ACLNN_OP(ctx, InplaceZero, acl_dst);

    for (int64_t k = 0; k < part_num; k++) {
        // kernel taps [slice_start, slice_end) handled by this part
        const int64_t slice_start = max_kernel_size * k;
        const int64_t slice_end   = (k == part_num - 1) ? kernel_size : max_kernel_size * (k + 1);
        const int64_t interval    = slice_end - slice_start;

        // part kernel: same shape as src0 except ne[0] == interval
        int64_t part_ne[4];
        for (int i = 0; i < 4; i++) {
            part_ne[i] = src0->ne[i];
        }
        part_ne[0] = interval;

        size_t part_nb[4];
        // byte stride of one element; sizeof(weight_type) would be the size of
        // the aclDataType enum (4), not of the element — wrong for F16
        part_nb[0] = ggml_element_size(src0);
        for (int i = 1; i < 4; i++) {
            part_nb[i] = part_nb[i - 1] * part_ne[i - 1];
        }

        ggml_cann_pool_alloc part_kernel_allocator;
        part_kernel_allocator.alloc(ctx.pool(), part_nb[3]);
        void * part_kernel_buf = part_kernel_allocator.get();

        acl_tensor_ptr part_kernel = ggml_cann_create_tensor(part_kernel_buf, weight_type, ggml_element_size(src0),
                                                             part_ne, part_nb, 3, ACL_FORMAT_NCL);

        GGML_CANN_CALL_ACLNN_OP(ctx, Slice, acl_weight, slice_dim, slice_start, slice_end, slice_step, part_kernel);

        // partial conv-transpose output; length follows the standard
        // transposed-convolution output formula for this part's kernel
        int64_t part_dst_ne[4];
        for (int i = 0; i < 4; i++) {
            part_dst_ne[i] = dst->ne[i];
        }
        part_dst_ne[0] = (input_len - 1) * strideVal[0] - 2 * paddingVal[0] + dilationVal[0] * (part_ne[0] - 1) + 1;

        size_t part_dst_nb[4];
        part_dst_nb[0] = ggml_element_size(dst);
        for (int i = 1; i < 4; i++) {
            part_dst_nb[i] = part_dst_nb[i - 1] * part_dst_ne[i - 1];
        }
        ggml_cann_pool_alloc part_dst_allocator;
        part_dst_allocator.alloc(ctx.pool(), part_dst_nb[3]);
        void * part_dst_buf = part_dst_allocator.get();

        acl_tensor_ptr acl_part_dst = ggml_cann_create_tensor(part_dst_buf, dst_type, ggml_element_size(dst),
                                                              part_dst_ne, part_dst_nb, 3, ACL_FORMAT_NCL);
        GGML_CANN_CALL_ACLNN_OP(ctx, InplaceZero, acl_part_dst);

        // compute part conv transpose 1d
        GGML_CANN_CALL_ACLNN_OP(ctx, Convolution, acl_input, part_kernel, nullptr, stride.get(), padding.get(),
                                dilation.get(), transposed, padding.get(), groups, acl_part_dst, cubeMathType);

        // position of this partial result in the final output:
        // part k contributes to [slice_start, (input_len - 1) * s0 + slice_end)
        const int64_t global_start = slice_start;
        const int64_t global_end   = std::min((input_len - 1) * strideVal[0] + slice_end, dst_len);

        const int64_t left_pad_len  = global_start;
        const int64_t right_pad_len = dst_len - global_end;

        int64_t           padDataVal[] = { left_pad_len, right_pad_len };
        acl_int_array_ptr padData      = ggml_cann_create_int_array(padDataVal, 2);

        // full-length buffer the padded partial result is written into
        int64_t conv_result_ne[4];
        for (int i = 0; i < 4; i++) {
            conv_result_ne[i] = dst->ne[i];
        }

        size_t conv_result_nb[4];
        conv_result_nb[0] = ggml_element_size(dst);
        for (int i = 1; i < 4; i++) {
            conv_result_nb[i] = conv_result_nb[i - 1] * conv_result_ne[i - 1];
        }

        ggml_cann_pool_alloc conv_result_allocator;
        conv_result_allocator.alloc(ctx.pool(), conv_result_nb[3]);
        void * conv_result_buf = conv_result_allocator.get();

        acl_tensor_ptr conv_result = ggml_cann_create_tensor(conv_result_buf, dst_type, ggml_element_size(dst),
                                                             conv_result_ne, conv_result_nb, 3, ACL_FORMAT_NCL);

        GGML_CANN_CALL_ACLNN_OP(ctx, InplaceZero, conv_result);
        // shift the partial result to its global offset by zero padding
        GGML_CANN_CALL_ACLNN_OP(ctx, ConstantPadNd, acl_part_dst, padData.get(), pad_value, conv_result);
        // accumulate: dst += conv_result * alpha
        GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdd, acl_dst, conv_result, alpha);
    }
}
30183142

30193143
void ggml_cann_elu(ggml_backend_cann_context & ctx, ggml_tensor * dst) {

ggml/src/ggml-cann/aclnn_ops.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@
4747
#include <aclnnop/aclnn_sign.h>
4848
#include <aclnnop/aclnn_silu.h>
4949
#include <aclnnop/aclnn_sin.h>
50+
#include <aclnnop/aclnn_slice.h>
5051
#include <aclnnop/aclnn_sqrt.h>
5152
#include <aclnnop/aclnn_tanh.h>
5253

0 commit comments

Comments
 (0)