
Commit 11490b3

CANN: Improve loading efficiency after converting weights to NZ format. (#14985)
* CANN: Improve loading efficiency after converting weights to NZ format.
* CANN: fix typo
1 parent 66625a5 commit 11490b3

3 files changed: 70 additions, 58 deletions

docs/backend/CANN.md

Lines changed: 4 additions & 2 deletions
@@ -310,5 +310,7 @@ Specifies the memory pool management strategy:
 
 Controls automatic cleanup of the memory pool. This option is only effective when using the prio or leg memory pool strategies.
 
-## TODO
-- Support more models and data types.
+### GGML_CANN_WEIGHT_NZ
+
+Converting the matmul weight format from ND to NZ can significantly improve performance on the 310I DUO NPU.
+

ggml/src/ggml-cann/aclnn_ops.cpp

Lines changed: 3 additions & 5 deletions
@@ -1913,11 +1913,9 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
                                  bcast_weight_nb[4], bcast_weight_nb[5]};
     aclTensor* acl_weight_tensor;
 
-    bool weightToNZ = false;
-#ifdef ASCEND_310P
-    weightToNZ = (getenv("GGML_CANN_WEIGHT_NZ") != nullptr);
-#endif
-    if (weightToNZ && is_matmul_weight(weight)) {
+    // Only check env once.
+    static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
+    if (weight_to_nz && is_matmul_weight(weight)) {
        int64_t acl_stride[2] = {1, transpose_ne[1]};
 
        // Reverse ne.
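The rewrite above reads the environment variable once through a function-local static instead of calling getenv() on every matmul. A minimal standalone sketch of that idiom follows; std::getenv stands in for ggml's get_env()/parse_bool() helpers, and the exact set of accepted values ("0"/"off" treated as disabled) is an assumption of the sketch, not taken from the commit:

```cpp
#include <cstdlib>
#include <cstring>

// Stand-in for parse_bool(get_env(...)); the real helpers live in ggml-cann
// and may accept more spellings than this sketch does.
static bool env_flag(const char* name) {
    const char* v = std::getenv(name);
    if (v == nullptr || *v == '\0') return false;
    return std::strcmp(v, "0") != 0 && std::strcmp(v, "off") != 0;
}

static bool weight_nz_enabled() {
    // A function-local static is initialized exactly once (thread-safe since
    // C++11), so the environment is not re-queried on every matmul call.
    static bool enabled = env_flag("GGML_CANN_WEIGHT_NZ");
    return enabled;
}
```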

ggml/src/ggml-cann/ggml-cann.cpp

Lines changed: 63 additions & 51 deletions
@@ -1116,61 +1116,59 @@ static enum ggml_status ggml_backend_cann_buffer_init_tensor(
     return GGML_STATUS_SUCCESS;
 }
 
-static int CreateAclTensorWeight(const void *hostData, const std::vector<int64_t> &shape, void **deviceAddr,
-                                 aclDataType dataType, aclTensor **tensor)
-{
-    uint64_t size = 1;
-    for (auto i : shape) {
-        size *= i;
+// ND to NZ Workspace Cache Management. Thread-safety: Not guaranteed
+namespace {
+    void* g_nz_workspace = nullptr;
+    size_t g_nz_workspace_allocated = 0;
+
+    void release_nz_workspace() {
+        if (g_nz_workspace) {
+            aclrtFree(g_nz_workspace);
+            g_nz_workspace = nullptr;
+            g_nz_workspace_allocated = 0;
+        }
     }
 
-    const aclIntArray *mat2Size = aclCreateIntArray(shape.data(), shape.size());
-    ACL_CHECK(aclnnCalculateMatmulWeightSizeV2(mat2Size, dataType, &size));
-
-    size *= sizeof(int16_t);
-
-    ACL_CHECK(aclrtMalloc(deviceAddr, size, ACL_MEM_MALLOC_HUGE_FIRST));
-    aclrtMemcpy(*deviceAddr, size, hostData, size, ACL_MEMCPY_HOST_TO_DEVICE);
-
-    std::vector<int64_t> strides(shape.size(), 1);
-    for (int64_t i = shape.size() - 2; i >= 0; i--) {
-        strides[i] = shape[i + 1] * strides[i + 1];
+    void relloc_nz_workspace(size_t new_size) {
+        if (new_size > g_nz_workspace_allocated) {
+            if (g_nz_workspace) {
+                aclrtFree(g_nz_workspace);
+                g_nz_workspace = nullptr;
+            }
+            ACL_CHECK(aclrtMalloc(&g_nz_workspace, new_size, ACL_MEM_MALLOC_HUGE_FIRST));
+            g_nz_workspace_allocated = new_size;
+        }
     }
-
-    *tensor = aclCreateTensor(shape.data(), shape.size(), dataType, strides.data(), 0, aclFormat::ACL_FORMAT_ND,
-                              shape.data(), shape.size(), *deviceAddr);
-    return 0;
 }
 
+/**
+ * @brief Convert tensor weights to NZ format using Ascend CANN API.
+ *
+ * This function creates a transposed tensor descriptor and performs the
+ * TransMatmulWeight operation. Converting tensor formats can significantly
+ * improve performance on certain hardware.
+ *
+ * @param tensor Pointer to the input ggml_tensor containing the weights.
+ * @param data   Pointer to the raw data buffer for the tensor weights.
+ * @param offset Byte offset within the tensor data buffer where weights start.
+ *
+ * @note The workspace buffer used in this function is managed globally and reused
+ *       across calls. This reduces overhead from repeated memory allocation and deallocation.
+ */
 static void weight_format_to_nz(ggml_tensor *tensor, const void *data, size_t offset) {
-    aclrtStream stream;
-    ACL_CHECK(aclrtCreateStream(&stream));
-
-    std::vector<int64_t> weightTransposedShape = {tensor->ne[1], tensor->ne[0]};
-    void *weightTransposedDeviceAddr = nullptr;
-    aclTensor *weightTransposed = nullptr;
-    CreateAclTensorWeight(data, weightTransposedShape, &weightTransposedDeviceAddr,
-                          ggml_cann_type_mapping(tensor->type), &weightTransposed);
-
+    aclTensor* weightTransposed = ggml_cann_create_tensor(tensor, tensor->ne,
+                                                          tensor->nb, 2, ACL_FORMAT_ND, offset);
     uint64_t workspaceSize = 0;
     aclOpExecutor *executor;
-    void *workspaceAddr = nullptr;
 
     // TransMatmulWeight
-    ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed, &workspaceSize, &executor));
-    std::unique_ptr<void, aclError (*)(void *)> workspaceAddrPtrTrans(nullptr, aclrtFree);
-    if (workspaceSize > 0) {
-        ACL_CHECK(aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST));
-        workspaceAddrPtrTrans.reset(workspaceAddr);
-    }
-    ACL_CHECK(aclnnTransMatmulWeight(workspaceAddr, workspaceSize, executor, stream));
+    ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed,
+                                                     &workspaceSize, &executor));
+    // Avoid frequent malloc/free of the workspace.
+    relloc_nz_workspace(workspaceSize);
 
-    size_t size = ggml_nelements(tensor) * ggml_element_size(tensor);
-
-    aclrtMemcpy((char *)tensor->data + offset, size,
-                weightTransposedDeviceAddr, size, ACL_MEMCPY_HOST_TO_DEVICE);
+    ACL_CHECK(aclnnTransMatmulWeight(g_nz_workspace, workspaceSize, executor, nullptr));
     ACL_CHECK(aclDestroyTensor(weightTransposed));
-    aclrtFree(weightTransposedDeviceAddr);
 }
 
 // TODO: need handle tensor which has paddings.
@@ -1197,14 +1195,14 @@ static void ggml_backend_cann_buffer_set_tensor(
     // For acl, synchronous functions use this default stream.
     // Why aclrtSynchronizeDevice?
 
-    bool weightToNZ = false;
-#ifdef ASCEND_310P
-    weightToNZ = (getenv("GGML_CANN_WEIGHT_NZ") != nullptr);
-#endif
+    // Only check env once.
+    static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
     if (!need_transform(tensor->type)) {
         ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size,
                               ACL_MEMCPY_HOST_TO_DEVICE));
-        if (weightToNZ && is_matmul_weight((const ggml_tensor*)tensor)) {
+        if (weight_to_nz && is_matmul_weight((const ggml_tensor*)tensor)) {
+            GGML_ASSERT(tensor->ne[2] == 1);
+            GGML_ASSERT(tensor->ne[3] == 1);
             weight_format_to_nz(tensor, data, offset);
         }
     } else {
@@ -1440,20 +1438,32 @@ static size_t ggml_backend_cann_buffer_type_get_alloc_size(
     size_t size = ggml_nbytes(tensor);
     int64_t ne0 = tensor->ne[0];
 
+    // Only check env once.
+    static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
+
     // last line must bigger than 32, because every single op deal at
     // least 32 bytes.
     // TODO: quantized type?
     // int64_t line_size = ne0 * ggml_element_size(tensor);
     // int64_t line_size_align_32 = (line_size + 31) & ~31;
     // size += (line_size_align_32 - line_size);
-
-    // TODO: not support quantized yet.
-    // TODO: consider un-continue tensor.
     if (ggml_is_quantized(tensor->type)) {
         if (ne0 % MATRIX_ROW_PADDING != 0) {
             size += ggml_row_size(
                 tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
         }
+    } else if (weight_to_nz && is_matmul_weight((const ggml_tensor*)tensor)) {
+        // NZ-format weights do not support quantized types yet.
+        // If an ND tensor is transformed to NZ, its size may change.
+        int64_t shape[] = {tensor->ne[1], tensor->ne[0]};
+        GGML_ASSERT(tensor->ne[2] == 1);
+        GGML_ASSERT(tensor->ne[3] == 1);
+        const aclIntArray *acl_shape = aclCreateIntArray(shape, 2);
+        size_t new_size;
+        ACL_CHECK(aclnnCalculateMatmulWeightSizeV2(acl_shape,
+                                                   ggml_cann_type_mapping(tensor->type), &new_size));
+        ACL_CHECK(aclDestroyIntArray(acl_shape));
+        size = std::max(size, new_size);
     }
 
     return size;
@@ -2080,6 +2090,8 @@ static enum ggml_status ggml_backend_cann_graph_compute(
         (ggml_backend_cann_context*)backend->context;
 
     ggml_cann_set_device(cann_ctx->device);
+    // Release the temp buffer created by set_tensor.
+    release_nz_workspace();
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor* node = cgraph->nodes[i];
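Taken together, the new helpers form a grow-only scratch-buffer cache: relloc_nz_workspace() enlarges the buffer only when a conversion needs more space, and release_nz_workspace() frees it once at the start of graph compute, after the weights have been uploaded. Below is a simplified, ACL-free restatement of that pattern; std::malloc/std::free stand in for aclrtMalloc/aclrtFree, the names are placeholders, and, as the comment in the diff notes, no thread-safety is provided:

```cpp
#include <cstdlib>
#include <cstddef>

namespace {
    void*  g_ws_ptr   = nullptr;
    size_t g_ws_bytes = 0;

    // Grow-only: reallocate only when a larger workspace is requested,
    // so consecutive conversions reuse the same buffer.
    void* acquire_workspace(size_t needed) {
        if (needed > g_ws_bytes) {
            std::free(g_ws_ptr);
            g_ws_ptr   = std::malloc(needed);
            g_ws_bytes = (g_ws_ptr != nullptr) ? needed : 0;
        }
        return g_ws_ptr;
    }

    // Called once at a known synchronization point (in the commit: the start
    // of graph compute), after all weight conversions are done.
    void release_workspace() {
        std::free(g_ws_ptr);
        g_ws_ptr   = nullptr;
        g_ws_bytes = 0;
    }
}
```

The trade-off is that the largest workspace requested during weight loading stays allocated until the first graph compute, in exchange for avoiding one device malloc/free pair per converted weight.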
