@@ -1116,61 +1116,59 @@ static enum ggml_status ggml_backend_cann_buffer_init_tensor(
     return GGML_STATUS_SUCCESS;
 }
 
-static int CreateAclTensorWeight(const void *hostData, const std::vector<int64_t> &shape, void **deviceAddr,
-                                 aclDataType dataType, aclTensor **tensor)
-{
-    uint64_t size = 1;
-    for (auto i : shape) {
-        size *= i;
+// ND to NZ Workspace Cache Management. Thread-safety: Not guaranteed
+namespace {
+    void * g_nz_workspace = nullptr;
+    size_t g_nz_workspace_allocated = 0;
+
+    void release_nz_workspace() {
+        if (g_nz_workspace) {
+            aclrtFree(g_nz_workspace);
+            g_nz_workspace = nullptr;
+            g_nz_workspace_allocated = 0;
+        }
     }
 
-    const aclIntArray *mat2Size = aclCreateIntArray(shape.data(), shape.size());
-    ACL_CHECK(aclnnCalculateMatmulWeightSizeV2(mat2Size, dataType, &size));
-
-    size *= sizeof(int16_t);
-
-    ACL_CHECK(aclrtMalloc(deviceAddr, size, ACL_MEM_MALLOC_HUGE_FIRST));
-    aclrtMemcpy(*deviceAddr, size, hostData, size, ACL_MEMCPY_HOST_TO_DEVICE);
-
-    std::vector<int64_t> strides(shape.size(), 1);
-    for (int64_t i = shape.size() - 2; i >= 0; i--) {
-        strides[i] = shape[i + 1] * strides[i + 1];
+    void relloc_nz_workspace(size_t new_size) {
+        if (new_size > g_nz_workspace_allocated) {
+            if (g_nz_workspace) {
+                aclrtFree(g_nz_workspace);
+                g_nz_workspace = nullptr;
+            }
+            ACL_CHECK(aclrtMalloc(&g_nz_workspace, new_size, ACL_MEM_MALLOC_HUGE_FIRST));
+            g_nz_workspace_allocated = new_size;
+        }
     }
-
-    *tensor = aclCreateTensor(shape.data(), shape.size(), dataType, strides.data(), 0, aclFormat::ACL_FORMAT_ND,
-                              shape.data(), shape.size(), *deviceAddr);
-    return 0;
 }
 
+/**
+ * @brief Convert tensor weights to NZ format using Ascend CANN API.
+ *
+ * This function creates a transposed tensor descriptor and performs the
+ * TransMatmulWeight operation. Converting tensor formats can significantly
+ * improve performance on certain hardware.
+ *
+ * @param tensor Pointer to the input ggml_tensor containing the weights.
+ * @param data   Pointer to the raw data buffer for the tensor weights.
+ * @param offset Byte offset within the tensor data buffer where weights start.
+ *
+ * @note The workspace buffer used in this function is managed globally and reused
+ *       across calls. This reduces overhead from repeated memory allocation and deallocation.
+ */
 static void weight_format_to_nz(ggml_tensor *tensor, const void *data, size_t offset) {
-    aclrtStream stream;
-    ACL_CHECK(aclrtCreateStream(&stream));
-
-    std::vector<int64_t> weightTransposedShape = {tensor->ne[1], tensor->ne[0]};
-    void *weightTransposedDeviceAddr = nullptr;
-    aclTensor *weightTransposed = nullptr;
-    CreateAclTensorWeight(data, weightTransposedShape, &weightTransposedDeviceAddr,
-                          ggml_cann_type_mapping(tensor->type), &weightTransposed);
-
+    aclTensor* weightTransposed = ggml_cann_create_tensor(tensor, tensor->ne,
+                                      tensor->nb, 2, ACL_FORMAT_ND, offset);
     uint64_t workspaceSize = 0;
     aclOpExecutor *executor;
-    void *workspaceAddr = nullptr;
 
     // TransMatmulWeight
-    ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed, &workspaceSize, &executor));
-    std::unique_ptr<void, aclError (*)(void *)> workspaceAddrPtrTrans(nullptr, aclrtFree);
-    if (workspaceSize > 0) {
-        ACL_CHECK(aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST));
-        workspaceAddrPtrTrans.reset(workspaceAddr);
-    }
-    ACL_CHECK(aclnnTransMatmulWeight(workspaceAddr, workspaceSize, executor, stream));
+    ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed,
+                                                     &workspaceSize, &executor));
+    // Avoid frequent malloc/free of the workspace.
+    relloc_nz_workspace(workspaceSize);
 
-    size_t size = ggml_nelements(tensor) * ggml_element_size(tensor);
-
-    aclrtMemcpy((char *)tensor->data + offset, size,
-                weightTransposedDeviceAddr, size, ACL_MEMCPY_HOST_TO_DEVICE);
+    ACL_CHECK(aclnnTransMatmulWeight(g_nz_workspace, workspaceSize, executor, nullptr));
     ACL_CHECK(aclDestroyTensor(weightTransposed));
-    aclrtFree(weightTransposedDeviceAddr);
 }
 
 // TODO: need handle tensor which has paddings.
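
The two new helpers above replace per-call workspace allocation with one shared, grow-only scratch buffer: it is enlarged only when a conversion needs more space and freed once after all weights have been uploaded. A minimal stand-alone sketch of that pattern, single-threaded as in the patch, with std::malloc/std::free standing in for aclrtMalloc/aclrtFree and hypothetical names ws_reserve/ws_release:

#include <cstdio>
#include <cstdlib>

static void * g_ws           = nullptr;
static size_t g_ws_allocated = 0;

// Grow-only: reallocate only when the request exceeds the current capacity.
static void ws_reserve(size_t n) {
    if (n > g_ws_allocated) {
        std::free(g_ws);
        g_ws = std::malloc(n);
        g_ws_allocated = (g_ws != nullptr) ? n : 0;
        std::printf("grew workspace to %zu bytes\n", n);
    }
}

// Free once, after the conversion phase is over.
static void ws_release(void) {
    std::free(g_ws);
    g_ws = nullptr;
    g_ws_allocated = 0;
}

int main(void) {
    ws_reserve(1024);     // allocates
    ws_reserve(512);      // smaller request: reuses the buffer, no reallocation
    ws_reserve(1 << 20);  // larger request: grows
    ws_release();         // released once at the end
    return 0;
}
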
@@ -1197,14 +1195,14 @@ static void ggml_backend_cann_buffer_set_tensor(
     // For acl, synchronous functions use this default stream.
     // Why aclrtSynchronizeDevice?
 
-    bool weightToNZ = false;
-#ifdef ASCEND_310P
-    weightToNZ = (getenv("GGML_CANN_WEIGHT_NZ") != nullptr);
-#endif
+    // Only check env once.
+    static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
     if (!need_transform(tensor->type)) {
         ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size,
                               ACL_MEMCPY_HOST_TO_DEVICE));
-        if (weightToNZ && is_matmul_weight((const ggml_tensor*)tensor)) {
+        if (weight_to_nz && is_matmul_weight((const ggml_tensor*)tensor)) {
+            GGML_ASSERT(tensor->ne[2] == 1);
+            GGML_ASSERT(tensor->ne[3] == 1);
             weight_format_to_nz(tensor, data, offset);
         }
     } else {
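
The environment check uses the standard C++ "initialize once" idiom: the block-scope static is evaluated the first time control reaches it, so GGML_CANN_WEIGHT_NZ is read exactly once per process (and, unlike the removed code, no longer only under ASCEND_310P). A stand-alone sketch with local stand-ins for the backend's get_env/parse_bool helpers; the set of accepted truthy spellings below is an assumption, not the real parser:

#include <cstdlib>
#include <optional>
#include <string>

// Stand-in for the backend helper: wrap std::getenv in std::optional.
static std::optional<std::string> get_env(const char * name) {
    const char * v = std::getenv(name);
    return v ? std::optional<std::string>(v) : std::nullopt;
}

// Stand-in parser; accepted spellings are illustrative only.
static bool parse_bool(const std::string & s) {
    return s == "1" || s == "on" || s == "yes" || s == "true";
}

static bool weight_nz_enabled(void) {
    // Evaluated once on first call; initialization is thread-safe since C++11.
    static const bool enabled =
        parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
    return enabled;
}
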
@@ -1440,20 +1438,32 @@ static size_t ggml_backend_cann_buffer_type_get_alloc_size(
     size_t size = ggml_nbytes(tensor);
     int64_t ne0 = tensor->ne[0];
 
+    // Only check env once.
+    static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
+
     // last line must bigger than 32, because every single op deal at
     // least 32 bytes.
     // TODO: quantized type?
     // int64_t line_size = ne0 * ggml_element_size(tensor);
     // int64_t line_size_align_32 = (line_size + 31) & ~31;
     // size += (line_size_align_32 - line_size);
-
-    // TODO: not support quantized yet.
-    // TODO: consider un-continue tensor.
     if (ggml_is_quantized(tensor->type)) {
         if (ne0 % MATRIX_ROW_PADDING != 0) {
             size += ggml_row_size(
                 tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
         }
+    } else if (weight_to_nz && is_matmul_weight((const ggml_tensor*)tensor)) {
+        // NZ-format weights do not support quantized types yet.
+        // Transforming an ND tensor to NZ may change the required size.
+        int64_t shape[] = {tensor->ne[1], tensor->ne[0]};
+        GGML_ASSERT(tensor->ne[2] == 1);
+        GGML_ASSERT(tensor->ne[3] == 1);
+        const aclIntArray *acl_shape = aclCreateIntArray(shape, 2);
+        size_t new_size;
+        ACL_CHECK(aclnnCalculateMatmulWeightSizeV2(acl_shape,
+                      ggml_cann_type_mapping(tensor->type), &new_size));
+        ACL_CHECK(aclDestroyIntArray(acl_shape));
+        size = std::max(size, new_size);
     }
 
     return size;
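
Because the NZ layout may pad the weight, the buffer has to be large enough for whichever of the two layouts is bigger, hence the std::max above. The real NZ size comes from aclnnCalculateMatmulWeightSizeV2; the sketch below only illustrates the sizing rule, using a made-up 16x16 rounding as a stand-in for that query:

#include <algorithm>
#include <cstddef>
#include <cstdint>

static int64_t round_up(int64_t x, int64_t m) { return (x + m - 1) / m * m; }

// Hypothetical stand-in for the ACL query: pad both dims to a 16x16 fractal.
static size_t toy_nz_weight_size(int64_t ne0, int64_t ne1, size_t type_size) {
    return static_cast<size_t>(round_up(ne0, 16) * round_up(ne1, 16)) * type_size;
}

static size_t toy_alloc_size(int64_t ne0, int64_t ne1, size_t type_size, bool to_nz) {
    size_t size = static_cast<size_t>(ne0 * ne1) * type_size;  // plain ND bytes
    if (to_nz) {
        // NZ padding can exceed the ND size, so take the larger of the two.
        size = std::max(size, toy_nz_weight_size(ne0, ne1, type_size));
    }
    return size;
}
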
@@ -2080,6 +2090,8 @@ static enum ggml_status ggml_backend_cann_graph_compute(
         (ggml_backend_cann_context*)backend->context;
 
     ggml_cann_set_device(cann_ctx->device);
+    // Release the temporary NZ workspace created while setting tensors.
+    release_nz_workspace();
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor* node = cgraph->nodes[i];
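
Releasing the workspace unconditionally at the start of graph compute works because the release is a cheap no-op when nothing was converted (the null check in release_nz_workspace above). A tiny stand-alone sketch of that idempotence, again with std::malloc/std::free in place of the ACL allocator:

#include <cassert>
#include <cstdlib>

static void * buf = nullptr;
static size_t cap = 0;

// Mirrors release_nz_workspace(): safe to call whether or not anything was allocated.
static void release(void) {
    if (buf) {
        std::free(buf);
        buf = nullptr;
        cap = 0;
    }
}

int main(void) {
    release();                      // no workspace yet: harmless no-op
    cap = 256;
    buf = std::malloc(cap);
    release();                      // frees once
    release();                      // calling again is still safe
    assert(buf == nullptr && cap == 0);
    return 0;
}
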