2424
2525#include < acl/acl.h>
2626#include < stdarg.h>
27+ #include < aclnnop/aclnn_trans_matmul_weight.h>
2728
2829#include < cmath>
2930#include < cstdio>
@@ -1115,6 +1116,95 @@ static enum ggml_status ggml_backend_cann_buffer_init_tensor(
11151116 return GGML_STATUS_SUCCESS;
11161117}
11171118
1119+ static bool is_matmul_weight (const ggml_tensor* tensor) {
1120+ std::string name = ggml_get_name (tensor);
1121+ static const std::unordered_set<std::string> weight_suffixes{
1122+ " output.weight" ,
1123+ " attn_q.weight" ,
1124+ " attn_k.weight" ,
1125+ " attn_v.weight" ,
1126+ " attn_output.weight" ,
1127+ " ffn_gate.weight" ,
1128+ " ffn_up.weight" ,
1129+ " ffn_down.weight"
1130+ };
1131+
1132+ for (const auto & suffix : weight_suffixes) {
1133+ if (name.find (suffix) != std::string::npos) {
1134+ return true ;
1135+ }
1136+ }
1137+ return false ;
1138+ }
1139+
1140+ static int CreateAclTensorWeight (const void *hostData, const std::vector<int64_t > &shape, void **deviceAddr,
1141+ aclDataType dataType, aclTensor **tensor)
1142+ {
1143+ uint64_t size = 1 ;
1144+ for (auto i : shape) {
1145+ size *= i;
1146+ }
1147+
1148+ const aclIntArray *mat2Size = aclCreateIntArray (shape.data (), shape.size ());
1149+ ACL_CHECK (aclnnCalculateMatmulWeightSizeV2 (mat2Size, dataType, &size));
1150+
1151+ size *= sizeof (int16_t );
1152+
1153+ ACL_CHECK (aclrtMalloc (deviceAddr, size, ACL_MEM_MALLOC_HUGE_FIRST));
1154+ aclrtMemcpy (*deviceAddr, size, hostData, size, ACL_MEMCPY_HOST_TO_DEVICE);
1155+
1156+ std::vector<int64_t > strides (shape.size (), 1 );
1157+ for (int64_t i = shape.size () - 2 ; i >= 0 ; i--) {
1158+ strides[i] = shape[i + 1 ] * strides[i + 1 ];
1159+ }
1160+
1161+ // std::vector<int64_t> storageShape;
1162+ // storageShape.push_back(size);
1163+ *tensor = aclCreateTensor (shape.data (), shape.size (), dataType, strides.data (), 0 , aclFormat::ACL_FORMAT_ND,
1164+ shape.data (), shape.size (), *deviceAddr);
1165+ return 0 ;
1166+ }
1167+
1168+ static void weight_format_to_nz (ggml_tensor *tensor, const void *data, size_t offset) {
1169+ aclrtStream stream;
1170+ ACL_CHECK (aclrtCreateStream (&stream));
1171+
1172+ std::vector<int64_t > weightShape = {tensor->ne [0 ], tensor->ne [1 ]};
1173+ std::vector<int64_t > weightTransposedShape = {tensor->ne [1 ], tensor->ne [0 ]};
1174+ void *weightDeviceAddr = nullptr ;
1175+ void *weightTransposedDeviceAddr = nullptr ;
1176+ aclTensor *weight = nullptr ;
1177+ aclTensor *weightTransposed = nullptr ;
1178+ CreateAclTensorWeight (data, weightShape, &weightDeviceAddr, ggml_cann_type_mapping (tensor->type ), &weight);
1179+ CreateAclTensorWeight (data, weightTransposedShape, &weightTransposedDeviceAddr,
1180+ ggml_cann_type_mapping (tensor->type ), &weightTransposed);
1181+
1182+ uint64_t workspaceSize = 0 ;
1183+ aclOpExecutor *executor;
1184+ void *workspaceAddr = nullptr ;
1185+
1186+ // TransMatmulWeight
1187+ ACL_CHECK (aclnnTransMatmulWeightGetWorkspaceSize (weightTransposed, &workspaceSize, &executor));
1188+ std::unique_ptr<void , aclError (*)(void *)> workspaceAddrPtrTrans (nullptr , aclrtFree);
1189+ if (workspaceSize > 0 ) {
1190+ ACL_CHECK (aclrtMalloc (&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST));
1191+ workspaceAddrPtrTrans.reset (workspaceAddr);
1192+ }
1193+ ACL_CHECK (aclnnTransMatmulWeight (workspaceAddr, workspaceSize, executor, stream));
1194+
1195+ size_t size = ggml_nelements (tensor) * ggml_element_size (tensor);
1196+
1197+ aclrtMemcpy ((char *)tensor->data + offset, size,
1198+ weightTransposedDeviceAddr, size, ACL_MEMCPY_HOST_TO_DEVICE);
1199+ ACL_CHECK (aclDestroyTensor (weight));
1200+ ACL_CHECK (aclDestroyTensor (weightTransposed));
1201+ aclrtFree (weightDeviceAddr);
1202+ aclrtFree (weightTransposedDeviceAddr);
1203+ if (workspaceSize > 0 ) {
1204+ aclrtFree (workspaceAddr);
1205+ }
1206+ }
1207+
11181208// TODO: need to handle tensors which have padding.
11191209/* *
11201210 * @brief Set tensor data in a CANN buffer.
@@ -1139,16 +1229,26 @@ static void ggml_backend_cann_buffer_set_tensor(
11391229 // For acl, synchronous functions use this default stream.
11401230 // Why aclrtSynchronizeDevice?
11411231
1232+ bool weightToNZ = false ;
1233+ #ifdef ASCEND_310P
1234+ weightToNZ = (getenv (" GGML_CANN_WEIGHT_NZ" ) != nullptr );
1235+ #endif
11421236 if (!need_transform (tensor->type )) {
11431237 ACL_CHECK (aclrtMemcpy ((char *)tensor->data + offset, size, data, size,
11441238 ACL_MEMCPY_HOST_TO_DEVICE));
1239+ if (weightToNZ && is_matmul_weight ((const ggml_tensor*)tensor)) {
1240+ weight_format_to_nz (tensor, data, offset);
1241+ }
11451242 } else {
11461243 void *transform_buffer = malloc (size);
11471244 ggml_backend_cann_transform (tensor, data, transform_buffer);
11481245
11491246 ACL_CHECK (aclrtMemcpy ((char *)tensor->data + offset, size,
11501247 transform_buffer, size,
11511248 ACL_MEMCPY_HOST_TO_DEVICE));
1249+ if (weightToNZ && is_matmul_weight ((const ggml_tensor*)tensor)) {
1250+ weight_format_to_nz (tensor, transform_buffer, offset);
1251+ }
11521252 free (transform_buffer);
11531253 }
11541254}
@@ -2044,8 +2144,8 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
20442144 switch (op->src [0 ]->type ) {
20452145 case GGML_TYPE_F16:
20462146 case GGML_TYPE_F32:
2047- return true ;
20482147 case GGML_TYPE_Q8_0:
2148+ return true ;
20492149 case GGML_TYPE_Q4_0:
20502150#ifdef ASCEND_310P
20512151 // Q4 && Q8 per group is not supported on 310p device
0 commit comments