Skip to content

Commit 14c28df

Browse files
authored
CANN: weight format to NZ for Ascend310P3 (ggml-org#14407)
* weight format to NZ for 310P
* remove quant weight format to NZ
* clean code
* fix
* make the conditions for converting weights to NZ format consistent
* clean code
1 parent 8c988fa commit 14c28df

File tree

3 files changed

+118
-2
lines changed

3 files changed

+118
-2
lines changed

ggml/src/ggml-cann/aclnn_ops.cpp

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1785,8 +1785,27 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
17851785
size_t transpose_nb[] = {bcast_weight_nb[1], bcast_weight_nb[0],
17861786
bcast_weight_nb[2], bcast_weight_nb[3],
17871787
bcast_weight_nb[4], bcast_weight_nb[5]};
1788-
aclTensor* acl_weight_tensor =
1789-
ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims);
1788+
aclTensor* acl_weight_tensor;
1789+
1790+
bool weightToNZ = false;
1791+
#ifdef ASCEND_310P
1792+
weightToNZ = (getenv("GGML_CANN_WEIGHT_NZ") != nullptr);
1793+
#endif
1794+
if (weightToNZ && is_matmul_weight(weight)) {
1795+
int64_t acl_stride[2] = {1, transpose_ne[1]};
1796+
1797+
// Reverse ne.
1798+
std::reverse(transpose_ne, transpose_ne + n_dims);
1799+
1800+
std::vector<int64_t> storageDims = {transpose_ne[0], transpose_ne[1]};
1801+
1802+
acl_weight_tensor = aclCreateTensor(
1803+
transpose_ne, n_dims, ggml_cann_type_mapping(weight->type), acl_stride,
1804+
0, ACL_FORMAT_FRACTAL_NZ, storageDims.data(), 2, weight->data);
1805+
} else {
1806+
acl_weight_tensor =
1807+
ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims, ACL_FORMAT_ND);
1808+
}
17901809
aclTensor* acl_dst =
17911810
ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims);
17921811

ggml/src/ggml-cann/aclnn_ops.h

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#ifndef CANN_ACLNN_OPS
2424
#define CANN_ACLNN_OPS
2525

26+
#include <unordered_set>
2627
#include <functional>
2728
#include <aclnnop/aclnn_abs.h>
2829
#include <aclnnop/aclnn_neg.h>
@@ -1020,6 +1021,37 @@ inline void ggml_cann_async_memset(ggml_backend_cann_context & ctx, void * buffe
10201021
*/
10211022
void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst);
10221023

1024+
/**
1025+
* @brief Check whether a tensor is a weight tensor for matrix multiplication.
1026+
*
1027+
* @details Checks whether the given tensor serves as weight parameters in matrix multiplication operations,
1028+
* typically within neural network layers. The function maintains a static set of canonical weight
1029+
* naming suffixes from Transformer-based architectures. Uses substring matching to identify weight
1030+
* tensors even with hierarchical naming patterns.
1031+
*
1032+
* @param tensor Pointer to the target ggml_tensor object (const-qualified).
1033+
*/
1034+
static bool is_matmul_weight(const ggml_tensor* tensor) {
1035+
std::string name = ggml_get_name(tensor);
1036+
static const std::unordered_set<std::string> weight_suffixes{
1037+
"output.weight",
1038+
"attn_q.weight",
1039+
"attn_k.weight",
1040+
"attn_v.weight",
1041+
"attn_output.weight",
1042+
"ffn_gate.weight",
1043+
"ffn_up.weight",
1044+
"ffn_down.weight"
1045+
};
1046+
1047+
for (const auto& suffix : weight_suffixes) {
1048+
if (name.find(suffix) != std::string::npos) {
1049+
return true;
1050+
}
1051+
}
1052+
return false;
1053+
}
1054+
10231055
/**
10241056
* @brief Applies a element-wise operation to two input tensors using the CANN
10251057
* backend.

ggml/src/ggml-cann/ggml-cann.cpp

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424

2525
#include <acl/acl.h>
2626
#include <stdarg.h>
27+
#include <aclnnop/aclnn_trans_matmul_weight.h>
2728

2829
#include <cmath>
2930
#include <cstdio>
@@ -1115,6 +1116,63 @@ static enum ggml_status ggml_backend_cann_buffer_init_tensor(
11151116
return GGML_STATUS_SUCCESS;
11161117
}
11171118

1119+
static int CreateAclTensorWeight(const void *hostData, const std::vector<int64_t> &shape, void **deviceAddr,
1120+
aclDataType dataType, aclTensor **tensor)
1121+
{
1122+
uint64_t size = 1;
1123+
for (auto i : shape) {
1124+
size *= i;
1125+
}
1126+
1127+
const aclIntArray *mat2Size = aclCreateIntArray(shape.data(), shape.size());
1128+
ACL_CHECK(aclnnCalculateMatmulWeightSizeV2(mat2Size, dataType, &size));
1129+
1130+
size *= sizeof(int16_t);
1131+
1132+
ACL_CHECK(aclrtMalloc(deviceAddr, size, ACL_MEM_MALLOC_HUGE_FIRST));
1133+
aclrtMemcpy(*deviceAddr, size, hostData, size, ACL_MEMCPY_HOST_TO_DEVICE);
1134+
1135+
std::vector<int64_t> strides(shape.size(), 1);
1136+
for (int64_t i = shape.size() - 2; i >= 0; i--) {
1137+
strides[i] = shape[i + 1] * strides[i + 1];
1138+
}
1139+
1140+
*tensor = aclCreateTensor(shape.data(), shape.size(), dataType, strides.data(), 0, aclFormat::ACL_FORMAT_ND,
1141+
shape.data(), shape.size(), *deviceAddr);
1142+
return 0;
1143+
}
1144+
1145+
/**
 * @brief Convert a matmul weight tensor to Ascend FRACTAL_NZ format in place.
 *
 * @details Uploads the host weight data to a scratch device buffer, runs
 *          aclnnTransMatmulWeight on a private stream to transform it to NZ
 *          layout, then copies the transformed data into the tensor's own
 *          device buffer at the given offset.
 *
 * @param tensor Destination ggml tensor; its device buffer receives NZ data.
 * @param data   Host-side source weight data.
 * @param offset Byte offset into tensor->data at which to store the result.
 */
static void weight_format_to_nz(ggml_tensor *tensor, const void *data, size_t offset) {
    aclrtStream stream;
    ACL_CHECK(aclrtCreateStream(&stream));

    // Transposed logical shape {ne[1], ne[0]} as expected by TransMatmulWeight.
    std::vector<int64_t> weightTransposedShape = {tensor->ne[1], tensor->ne[0]};
    void *weightTransposedDeviceAddr = nullptr;
    aclTensor *weightTransposed = nullptr;
    CreateAclTensorWeight(data, weightTransposedShape, &weightTransposedDeviceAddr,
                          ggml_cann_type_mapping(tensor->type), &weightTransposed);

    uint64_t workspaceSize = 0;
    aclOpExecutor *executor;
    void *workspaceAddr = nullptr;

    // TransMatmulWeight: in-place ND -> NZ transformation of the scratch buffer.
    ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed, &workspaceSize, &executor));
    std::unique_ptr<void, aclError (*)(void *)> workspaceAddrPtrTrans(nullptr, aclrtFree);
    if (workspaceSize > 0) {
        ACL_CHECK(aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST));
        workspaceAddrPtrTrans.reset(workspaceAddr);
    }
    ACL_CHECK(aclnnTransMatmulWeight(workspaceAddr, workspaceSize, executor, stream));
    // fix: the op is queued on `stream`; without this sync the memcpy below
    // could read the scratch buffer before the transformation finished.
    ACL_CHECK(aclrtSynchronizeStream(stream));

    size_t size = ggml_nelements(tensor) * ggml_element_size(tensor);

    // fix: both addresses are device memory, so the copy kind is
    // DEVICE_TO_DEVICE (was mislabeled HOST_TO_DEVICE); also check the result.
    ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size,
                          weightTransposedDeviceAddr, size, ACL_MEMCPY_DEVICE_TO_DEVICE));
    ACL_CHECK(aclDestroyTensor(weightTransposed));
    ACL_CHECK(aclrtFree(weightTransposedDeviceAddr));
    // fix: the stream was previously leaked on every converted weight.
    ACL_CHECK(aclrtDestroyStream(stream));
}
1175+
11181176
// TODO: need handle tensor which has paddings.
11191177
/**
11201178
* @brief Set tensor data in a CANN buffer.
@@ -1139,9 +1197,16 @@ static void ggml_backend_cann_buffer_set_tensor(
11391197
// For acl, synchronous functions use this default stream.
11401198
// Why aclrtSynchronizeDevice?
11411199

1200+
bool weightToNZ = false;
1201+
#ifdef ASCEND_310P
1202+
weightToNZ = (getenv("GGML_CANN_WEIGHT_NZ") != nullptr);
1203+
#endif
11421204
if (!need_transform(tensor->type)) {
11431205
ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size,
11441206
ACL_MEMCPY_HOST_TO_DEVICE));
1207+
if (weightToNZ && is_matmul_weight((const ggml_tensor*)tensor)) {
1208+
weight_format_to_nz(tensor, data, offset);
1209+
}
11451210
} else {
11461211
void *transform_buffer = malloc(size);
11471212
ggml_backend_cann_transform(tensor, data, transform_buffer);

0 commit comments

Comments (0)