2424
2525#include < acl/acl.h>
2626#include < stdarg.h>
27+ #include < aclnnop/aclnn_trans_matmul_weight.h>
2728
2829#include < cmath>
2930#include < cstdio>
@@ -1115,6 +1116,63 @@ static enum ggml_status ggml_backend_cann_buffer_init_tensor(
11151116 return GGML_STATUS_SUCCESS;
11161117}
11171118
1119+ static int CreateAclTensorWeight (const void *hostData, const std::vector<int64_t > &shape, void **deviceAddr,
1120+ aclDataType dataType, aclTensor **tensor)
1121+ {
1122+ uint64_t size = 1 ;
1123+ for (auto i : shape) {
1124+ size *= i;
1125+ }
1126+
1127+ const aclIntArray *mat2Size = aclCreateIntArray (shape.data (), shape.size ());
1128+ ACL_CHECK (aclnnCalculateMatmulWeightSizeV2 (mat2Size, dataType, &size));
1129+
1130+ size *= sizeof (int16_t );
1131+
1132+ ACL_CHECK (aclrtMalloc (deviceAddr, size, ACL_MEM_MALLOC_HUGE_FIRST));
1133+ aclrtMemcpy (*deviceAddr, size, hostData, size, ACL_MEMCPY_HOST_TO_DEVICE);
1134+
1135+ std::vector<int64_t > strides (shape.size (), 1 );
1136+ for (int64_t i = shape.size () - 2 ; i >= 0 ; i--) {
1137+ strides[i] = shape[i + 1 ] * strides[i + 1 ];
1138+ }
1139+
1140+ *tensor = aclCreateTensor (shape.data (), shape.size (), dataType, strides.data (), 0 , aclFormat::ACL_FORMAT_ND,
1141+ shape.data (), shape.size (), *deviceAddr);
1142+ return 0 ;
1143+ }
1144+
1145+ static void weight_format_to_nz (ggml_tensor *tensor, const void *data, size_t offset) {
1146+ aclrtStream stream;
1147+ ACL_CHECK (aclrtCreateStream (&stream));
1148+
1149+ std::vector<int64_t > weightTransposedShape = {tensor->ne [1 ], tensor->ne [0 ]};
1150+ void *weightTransposedDeviceAddr = nullptr ;
1151+ aclTensor *weightTransposed = nullptr ;
1152+ CreateAclTensorWeight (data, weightTransposedShape, &weightTransposedDeviceAddr,
1153+ ggml_cann_type_mapping (tensor->type ), &weightTransposed);
1154+
1155+ uint64_t workspaceSize = 0 ;
1156+ aclOpExecutor *executor;
1157+ void *workspaceAddr = nullptr ;
1158+
1159+ // TransMatmulWeight
1160+ ACL_CHECK (aclnnTransMatmulWeightGetWorkspaceSize (weightTransposed, &workspaceSize, &executor));
1161+ std::unique_ptr<void , aclError (*)(void *)> workspaceAddrPtrTrans (nullptr , aclrtFree);
1162+ if (workspaceSize > 0 ) {
1163+ ACL_CHECK (aclrtMalloc (&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST));
1164+ workspaceAddrPtrTrans.reset (workspaceAddr);
1165+ }
1166+ ACL_CHECK (aclnnTransMatmulWeight (workspaceAddr, workspaceSize, executor, stream));
1167+
1168+ size_t size = ggml_nelements (tensor) * ggml_element_size (tensor);
1169+
1170+ aclrtMemcpy ((char *)tensor->data + offset, size,
1171+ weightTransposedDeviceAddr, size, ACL_MEMCPY_HOST_TO_DEVICE);
1172+ ACL_CHECK (aclDestroyTensor (weightTransposed));
1173+ aclrtFree (weightTransposedDeviceAddr);
1174+ }
1175+
// TODO: handle tensors that have padding.
11191177/* *
11201178 * @brief Set tensor data in a CANN buffer.
@@ -1139,9 +1197,16 @@ static void ggml_backend_cann_buffer_set_tensor(
11391197 // For acl, synchronous functions use this default stream.
11401198 // Why aclrtSynchronizeDevice?
11411199
1200+ bool weightToNZ = false ;
1201+ #ifdef ASCEND_310P
1202+ weightToNZ = (getenv (" GGML_CANN_WEIGHT_NZ" ) != nullptr );
1203+ #endif
11421204 if (!need_transform (tensor->type )) {
11431205 ACL_CHECK (aclrtMemcpy ((char *)tensor->data + offset, size, data, size,
11441206 ACL_MEMCPY_HOST_TO_DEVICE));
1207+ if (weightToNZ && is_matmul_weight ((const ggml_tensor*)tensor)) {
1208+ weight_format_to_nz (tensor, data, offset);
1209+ }
11451210 } else {
11461211 void *transform_buffer = malloc (size);
11471212 ggml_backend_cann_transform (tensor, data, transform_buffer);
0 commit comments