@@ -2988,32 +2988,156 @@ void ggml_cann_argmax(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
29882988 GGML_CANN_CALL_ACLNN_OP (ctx, ArgMax, acl_src.get (), 3 , false , acl_dst.get ());
29892989}
29902990
2991- void ggml_cann_conv_transpose_1d (ggml_backend_cann_context & ctx, ggml_tensor * dst) {
2991+ void ggml_cann_conv_transpose_1d (ggml_backend_cann_context& ctx, ggml_tensor* dst){
29922992 ggml_tensor * src0 = dst->src [0 ];
29932993 ggml_tensor * src1 = dst->src [1 ];
29942994
29952995 // stride
2996- int64_t s0 = ((const int32_t *) (dst->op_params ))[0 ];
2996+ int64_t s0 = ((const int32_t *) (dst->op_params ))[0 ];
29972997
2998- acl_tensor_ptr acl_input = ggml_cann_create_tensor (src1, src1->ne , src1->nb , 3 , ACL_FORMAT_NCL);
2998+ acl_tensor_ptr acl_input = ggml_cann_create_tensor (src1, src1->ne , src1->nb , 3 , ACL_FORMAT_NCL);
29992999 acl_tensor_ptr acl_weight = ggml_cann_create_tensor (src0, src0->ne , src0->nb , 3 , ACL_FORMAT_NCL);
3000- acl_tensor_ptr acl_dst = ggml_cann_create_tensor (dst, dst->ne , dst->nb , 3 , ACL_FORMAT_NCL);
3000+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor (dst, dst->ne , dst->nb , 3 , ACL_FORMAT_NCL);
3001+
3002+ // get base information of input and kernel
3003+ int64_t input_len = *(src1->ne );
3004+ int64_t dst_len = *(dst->ne );
3005+ int64_t kernel_size = *(src0->ne );
3006+
3007+ // set the max kernel size for each conv
3008+ int64_t max_kernel_size = 255 ;
3009+
3010+ // compute the partition of kernel
3011+ int64_t part_num = 1 ;
3012+ part_num = (kernel_size + max_kernel_size - 1 ) / max_kernel_size;
30013013
30023014 int64_t strideVal[1 ];
3003- strideVal[0 ] = s0;
3004- acl_int_array_ptr stride = ggml_cann_create_int_array (strideVal, 1 );
3005- int64_t paddingVal[] = { 0 };
3006- acl_int_array_ptr padding = ggml_cann_create_int_array (paddingVal, 1 );
3007- int64_t dilationVal[] = { 1 };
3008- acl_int_array_ptr dilation = ggml_cann_create_int_array (dilationVal, 1 );
3009- int8_t cubeMathType = 0 ;
3015+ strideVal[0 ] = s0;
3016+ aclIntArray *stride = aclCreateIntArray (strideVal, 1 );
3017+ int64_t paddingVal[] = {0 };
3018+ aclIntArray *padding = aclCreateIntArray (paddingVal, 1 );
3019+ int64_t dilationVal[] = {1 };
3020+ aclIntArray *dilation = aclCreateIntArray (dilationVal, 1 );
3021+ bool transposed = true ;
3022+ int64_t groups = 1 ;
3023+ int8_t cubeMathType = 0 ;
30103024
30113025#ifdef ASCEND_310P
30123026 cubeMathType = 1 ;
30133027#endif
30143028
3015- GGML_CANN_CALL_ACLNN_OP (ctx, Convolution, acl_input.get (), acl_weight.get (), nullptr , stride.get (), padding.get (),
3016- dilation.get (), true , padding.get (), 1 , acl_dst.get (), cubeMathType);
3029+ auto weight_type = ggml_cann_type_mapping (src0->type );
3030+ auto dst_type = ggml_cann_type_mapping (dst->type );
3031+
3032+ // slice the kernel to make each conv available
3033+ int64_t slice_dim = -1 ;
3034+ int64_t slice_start = 0 ;
3035+ int64_t slice_end = max_kernel_size;
3036+ int64_t slice_step = 1 ;
3037+ int64_t interval = max_kernel_size;
3038+
3039+ int64_t left_pad_len = dilationVal[0 ] * (max_kernel_size - 1 ) + 1 - 2 * paddingVal[0 ];
3040+ int64_t right_pad_len = 0 ;
3041+
3042+ aclScalar* alpha = nullptr ;
3043+ float alphaValue = 1.0 ;
3044+ alpha = aclCreateScalar (&alphaValue, aclDataType::ACL_FLOAT);
3045+
3046+ // set zero to destination
3047+ GGML_CANN_CALL_ACLNN_OP (ctx, InplaceZero, acl_dst);
3048+
3049+ for (int k = 0 ; k < part_num; k++){
3050+
3051+ // create part kernel tensor and slice from big kernel
3052+ slice_start = max_kernel_size * k;
3053+ if (k == part_num - 1 ){
3054+ slice_end = kernel_size;
3055+ interval = kernel_size - max_kernel_size * k;
3056+ }else {
3057+ slice_end = max_kernel_size * (k+1 );
3058+ }
3059+
3060+ int64_t part_ne[4 ];
3061+ for (int i = 0 ; i < 4 ; i++) {
3062+ part_ne[i] = *(src0->ne + i);
3063+ }
3064+ part_ne[0 ] = interval;
3065+
3066+ size_t part_nb[4 ];
3067+ part_nb[0 ] = sizeof (weight_type);
3068+ for (int i = 1 ; i < 4 ; i++) {
3069+ part_nb[i] = part_nb[i - 1 ] * part_ne[i - 1 ];
3070+ }
3071+
3072+ ggml_cann_pool_alloc part_kernel_allocator;
3073+ part_kernel_allocator.alloc (ctx.pool (), part_nb[3 ]);
3074+ void * part_kernel_buf = part_kernel_allocator.get ();
3075+
3076+ acl_tensor_ptr part_kernel = ggml_cann_create_tensor (part_kernel_buf, weight_type,
3077+ ggml_element_size (src0), part_ne, part_nb, 3 , ACL_FORMAT_NCL);
3078+
3079+ GGML_CANN_CALL_ACLNN_OP (ctx, Slice, acl_weight, slice_dim, slice_start, slice_end, slice_step, part_kernel);
3080+
3081+ // create the part conv result tensor
3082+ int64_t part_dst_ne[4 ];
3083+ for (int i = 0 ; i < 4 ; i++){
3084+ part_dst_ne[i] = *(dst->ne + i);
3085+ }
3086+ part_dst_ne[0 ] = (input_len - 1 ) * strideVal[0 ] - 2 * paddingVal[0 ] + dilationVal[0 ] * (part_ne[0 ] - 1 ) + 1 ;
3087+
3088+ size_t part_dst_nb[4 ];
3089+ part_dst_nb[0 ] = sizeof (weight_type);
3090+ for (int i = 1 ; i < 4 ; i++) {
3091+ part_dst_nb[i] = part_dst_nb[i - 1 ] * part_dst_ne[i - 1 ];
3092+ }
3093+ ggml_cann_pool_alloc part_dst_allocator;
3094+ part_dst_allocator.alloc (ctx.pool (), part_dst_nb[3 ]);
3095+ void * part_dst_buf = part_dst_allocator.get ();
3096+
3097+ acl_tensor_ptr acl_part_dst = ggml_cann_create_tensor (part_dst_buf, dst_type, ggml_element_size (dst),
3098+ part_dst_ne, part_dst_nb, 3 , ACL_FORMAT_NCL);
3099+ GGML_CANN_CALL_ACLNN_OP (ctx, InplaceZero, acl_part_dst);
3100+
3101+ // compute part conv transpose 1d
3102+ GGML_CANN_CALL_ACLNN_OP (ctx, Convolution, acl_input, part_kernel, nullptr , stride,
3103+ padding, dilation, transposed, padding, groups, acl_part_dst, cubeMathType);
3104+
3105+ // compute the position of part result in final result
3106+ int64_t global_start = slice_start;
3107+ int64_t global_end = std::min ((input_len - 1 ) * strideVal[0 ] + slice_end, dst_len);
3108+
3109+ left_pad_len = global_start;
3110+ right_pad_len = dst_len - global_end;
3111+
3112+ std::vector<int64_t > padDataVal = {left_pad_len,right_pad_len};
3113+ aclIntArray *padData = aclCreateIntArray (padDataVal.data (), 2 );
3114+
3115+ aclScalar* pad_value = nullptr ;
3116+ float pad_valueVal = 0.0 ;
3117+ pad_value = aclCreateScalar (&pad_valueVal, aclDataType::ACL_FLOAT);
3118+
3119+ int64_t conv_result_ne[4 ];
3120+ for (int i = 0 ; i < 4 ; i++){
3121+ conv_result_ne[i] = *(dst->ne + i);
3122+ }
3123+
3124+ size_t conv_result_nb[4 ];
3125+ conv_result_nb[0 ] = sizeof (weight_type);
3126+ for (int i = 1 ; i < 4 ; i++) {
3127+ conv_result_nb[i] = conv_result_nb[i - 1 ] * conv_result_ne[i - 1 ];
3128+ }
3129+
3130+ ggml_cann_pool_alloc conv_result_allocator;
3131+ conv_result_allocator.alloc (ctx.pool (), conv_result_nb[3 ]);
3132+ void * conv_result_buf = conv_result_allocator.get ();
3133+
3134+ acl_tensor_ptr conv_result = ggml_cann_create_tensor (conv_result_buf, dst_type, ggml_element_size (dst),
3135+ conv_result_ne, conv_result_nb, 3 , ACL_FORMAT_NCL);
3136+
3137+ GGML_CANN_CALL_ACLNN_OP (ctx, InplaceZero, conv_result);
3138+ GGML_CANN_CALL_ACLNN_OP (ctx, ConstantPadNd, acl_part_dst, padData, pad_value, conv_result);
3139+ GGML_CANN_CALL_ACLNN_OP (ctx, InplaceAdd, acl_dst, conv_result, alpha);
3140+ }
30173141}
30183142
30193143void ggml_cann_elu (ggml_backend_cann_context & ctx, ggml_tensor * dst) {
0 commit comments