
Commit ccfbaec

weight format to nz for 310p

1 parent 8846aac commit ccfbaec

2 files changed: +153 −11 lines

ggml/src/ggml-cann/aclnn_ops.cpp

Lines changed: 52 additions & 10 deletions
@@ -1783,8 +1783,27 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
     size_t transpose_nb[] = {bcast_weight_nb[1], bcast_weight_nb[0],
                              bcast_weight_nb[2], bcast_weight_nb[3],
                              bcast_weight_nb[4], bcast_weight_nb[5]};
-    aclTensor* acl_weight_tensor =
-        ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims);
+    aclTensor* acl_weight_tensor;
+
+    bool weightToNZ = false;
+#ifdef ASCEND_310P
+    weightToNZ = (getenv("GGML_CANN_WEIGHT_NZ") != nullptr);
+#endif
+    if (weightToNZ && n_dims == 2) {
+        int64_t acl_stride[2] = {1, transpose_ne[1]};
+
+        // Reverse ne.
+        std::reverse(transpose_ne, transpose_ne + n_dims);
+
+        std::vector<int64_t> storageDims = {transpose_ne[0], transpose_ne[1]};
+
+        acl_weight_tensor = aclCreateTensor(
+            transpose_ne, n_dims, ggml_cann_type_mapping(weight->type), acl_stride,
+            0, ACL_FORMAT_FRACTAL_NZ, storageDims.data(), 2, weight->data);
+    } else {
+        acl_weight_tensor =
+            ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims, ACL_FORMAT_ND);
+    }
     aclTensor* acl_dst =
         ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims);
 
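The new path is strictly opt-in: it is compiled only for 310P builds (the ASCEND_310P macro) and taken only when the GGML_CANN_WEIGHT_NZ environment variable is set. The same gate reappears in the quantized matmul below and in the buffer upload path in ggml-cann.cpp; purely as an illustration (not part of this commit), the check factored into a helper would read:

#include <cstdlib>

// Illustrative only (not part of this commit): the gate that the patch
// repeats in ggml_cann_mat_mul_fp, ggml_cann_mul_mat_quant and
// ggml_backend_cann_buffer_set_tensor, written once.
static bool cann_weight_to_nz_enabled() {
    bool enabled = false;
#ifdef ASCEND_310P
    // NZ weights are opt-in on 310P via an environment variable.
    enabled = (std::getenv("GGML_CANN_WEIGHT_NZ") != nullptr);
#endif
    return enabled;
}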

@@ -1909,14 +1928,37 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
             int64_t output_ne_offset = 0;
             int64_t output_ne[2] = {weight_ne[0], dst->ne[1]};
 
-            aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
-                (char*)src0->data + batch0 * weight_stride,
-                ggml_cann_type_mapping(type), weight_elem_size, weight_ne,
-                weight_nb, 2, ACL_FORMAT_ND, weight_ne_offset);
-            aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
-                scale_offset + batch0 * scale_stride, ACL_FLOAT16,
-                scale_elem_size, scale_ne, scale_nb, 2, ACL_FORMAT_ND,
-                scale_ne_offset);
+            aclTensor* acl_weight_tensor;
+            aclTensor* acl_scale_tensor;
+
+            bool weightToNZ = false;
+#ifdef ASCEND_310P
+            weightToNZ = (getenv("GGML_CANN_WEIGHT_NZ") != nullptr);
+#endif
+            if (weightToNZ) {
+                int64_t acl_weight_stride[] = {weight_ne[1], 1};
+                std::vector<int64_t> storageDims = {weight_ne[0], weight_ne[1]};
+                acl_weight_tensor = aclCreateTensor(
+                    weight_ne, 2, ggml_cann_type_mapping(type), acl_weight_stride,
+                    weight_ne_offset / ggml_element_size(src0), ACL_FORMAT_FRACTAL_NZ, storageDims.data(), 2,
+                    src0->data);
+
+                int64_t acl_scale_stride[] = {scale_ne[1], 1};
+                std::vector<int64_t> scaleStorageDims = {scale_ne[0], scale_ne[1]};
+                acl_scale_tensor = aclCreateTensor(
+                    scale_ne, 2, ACL_FLOAT16, acl_scale_stride,
+                    scale_ne_offset, ACL_FORMAT_ND, scaleStorageDims.data(), 2,
+                    scale_offset + batch0 * scale_stride);
+            } else {
+                acl_weight_tensor = ggml_cann_create_tensor(
+                    (char*)src0->data + batch0 * weight_stride,
+                    ggml_cann_type_mapping(type), weight_elem_size, weight_ne,
+                    weight_nb, 2, ACL_FORMAT_ND, weight_ne_offset);
+                acl_scale_tensor = ggml_cann_create_tensor(
+                    scale_offset + batch0 * scale_stride, ACL_FLOAT16,
+                    scale_elem_size, scale_ne, scale_nb, 2, ACL_FORMAT_ND,
+                    scale_ne_offset);
+            }
             aclTensor* acl_output_tensor = ggml_cann_create_tensor(
                 (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
                 output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND,
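
In both branches above the weight keeps its existing device buffer; what changes is only the aclTensor view handed to the matmul. The quantized branch, for instance, describes the weight as an ordinary 2-D view with row-major strides and tags the storage as ACL_FORMAT_FRACTAL_NZ, so the kernels treat the underlying memory as already blocked into the NZ layout. A minimal sketch of that call pattern, assuming it sits next to the code above in aclnn_ops.cpp where aclCreateTensor and the format enums are already in scope (the helper name and parameters are illustrative):

// Minimal sketch, not part of this commit: mirrors the quantized branch above.
// `dev_ptr` is assumed to already hold the weight in the blocked NZ layout,
// `rows`/`cols` are the logical 2-D extents; the ACL_FORMAT_FRACTAL_NZ tag is
// what tells the matmul kernels about the physical layout.
static aclTensor* make_nz_weight_view(void* dev_ptr, aclDataType dtype,
                                      int64_t rows, int64_t cols) {
    int64_t view_dims[2]    = {rows, cols};
    int64_t strides[2]      = {cols, 1};      // row-major logical strides
    int64_t storage_dims[2] = {rows, cols};
    return aclCreateTensor(view_dims, 2, dtype, strides,
                           /*offset=*/0, ACL_FORMAT_FRACTAL_NZ,
                           storage_dims, 2, dev_ptr);
}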

ggml/src/ggml-cann/ggml-cann.cpp

Lines changed: 101 additions & 1 deletion
@@ -24,6 +24,7 @@
 
 #include <acl/acl.h>
 #include <stdarg.h>
+#include <aclnnop/aclnn_trans_matmul_weight.h>
 
 #include <cmath>
 #include <cstdio>
@@ -1115,6 +1116,95 @@ static enum ggml_status ggml_backend_cann_buffer_init_tensor(
     return GGML_STATUS_SUCCESS;
 }
 
+static bool is_matmul_weight(const ggml_tensor* tensor) {
+    std::string name = ggml_get_name(tensor);
+    static const std::unordered_set<std::string> weight_suffixes{
+        "output.weight",
+        "attn_q.weight",
+        "attn_k.weight",
+        "attn_v.weight",
+        "attn_output.weight",
+        "ffn_gate.weight",
+        "ffn_up.weight",
+        "ffn_down.weight"
+    };
+
+    for (const auto& suffix : weight_suffixes) {
+        if (name.find(suffix) != std::string::npos) {
+            return true;
+        }
+    }
+    return false;
+}
+
+static int CreateAclTensorWeight(const void *hostData, const std::vector<int64_t> &shape, void **deviceAddr,
+                                 aclDataType dataType, aclTensor **tensor)
+{
+    uint64_t size = 1;
+    for (auto i : shape) {
+        size *= i;
+    }
+
+    const aclIntArray *mat2Size = aclCreateIntArray(shape.data(), shape.size());
+    ACL_CHECK(aclnnCalculateMatmulWeightSizeV2(mat2Size, dataType, &size));
+
+    size *= sizeof(int16_t);
+
+    ACL_CHECK(aclrtMalloc(deviceAddr, size, ACL_MEM_MALLOC_HUGE_FIRST));
+    aclrtMemcpy(*deviceAddr, size, hostData, size, ACL_MEMCPY_HOST_TO_DEVICE);
+
+    std::vector<int64_t> strides(shape.size(), 1);
+    for (int64_t i = shape.size() - 2; i >= 0; i--) {
+        strides[i] = shape[i + 1] * strides[i + 1];
+    }
+
+    // std::vector<int64_t> storageShape;
+    // storageShape.push_back(size);
+    *tensor = aclCreateTensor(shape.data(), shape.size(), dataType, strides.data(), 0, aclFormat::ACL_FORMAT_ND,
+                              shape.data(), shape.size(), *deviceAddr);
+    return 0;
+}
+
+static void weight_format_to_nz(ggml_tensor *tensor, const void *data, size_t offset) {
+    aclrtStream stream;
+    ACL_CHECK(aclrtCreateStream(&stream));
+
+    std::vector<int64_t> weightShape = {tensor->ne[0], tensor->ne[1]};
+    std::vector<int64_t> weightTransposedShape = {tensor->ne[1], tensor->ne[0]};
+    void *weightDeviceAddr = nullptr;
+    void *weightTransposedDeviceAddr = nullptr;
+    aclTensor *weight = nullptr;
+    aclTensor *weightTransposed = nullptr;
+    CreateAclTensorWeight(data, weightShape, &weightDeviceAddr, ggml_cann_type_mapping(tensor->type), &weight);
+    CreateAclTensorWeight(data, weightTransposedShape, &weightTransposedDeviceAddr,
+                          ggml_cann_type_mapping(tensor->type), &weightTransposed);
+
+    uint64_t workspaceSize = 0;
+    aclOpExecutor *executor;
+    void *workspaceAddr = nullptr;
+
+    // TransMatmulWeight
+    ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed, &workspaceSize, &executor));
+    std::unique_ptr<void, aclError (*)(void *)> workspaceAddrPtrTrans(nullptr, aclrtFree);
+    if (workspaceSize > 0) {
+        ACL_CHECK(aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST));
+        workspaceAddrPtrTrans.reset(workspaceAddr);
+    }
+    ACL_CHECK(aclnnTransMatmulWeight(workspaceAddr, workspaceSize, executor, stream));
+
+    size_t size = ggml_nelements(tensor) * ggml_element_size(tensor);
+
+    aclrtMemcpy((char *)tensor->data + offset, size,
+                weightTransposedDeviceAddr, size, ACL_MEMCPY_HOST_TO_DEVICE);
+    ACL_CHECK(aclDestroyTensor(weight));
+    ACL_CHECK(aclDestroyTensor(weightTransposed));
+    aclrtFree(weightDeviceAddr);
+    aclrtFree(weightTransposedDeviceAddr);
+    if (workspaceSize > 0) {
+        aclrtFree(workspaceAddr);
+    }
+}
+
 // TODO: need handle tensor which has paddings.
 /**
  * @brief Set tensor data in a CANN buffer.
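
llama.cpp names per-layer tensors in the form blk.<n>.attn_q.weight, blk.<n>.ffn_up.weight and so on, so the substring match in is_matmul_weight() picks up the attention and FFN projections of every layer plus the final output projection, while embeddings and norm weights stay in ND format. A standalone mirror of that check (illustrative only, no ggml_tensor needed):

#include <iostream>
#include <string>
#include <unordered_set>

// Illustrative only: mirrors the suffix matching of is_matmul_weight() so its
// behaviour can be checked without constructing a ggml_tensor.
static bool name_is_matmul_weight(const std::string& name) {
    static const std::unordered_set<std::string> weight_suffixes{
        "output.weight",   "attn_q.weight",   "attn_k.weight",
        "attn_v.weight",   "attn_output.weight",
        "ffn_gate.weight", "ffn_up.weight",   "ffn_down.weight"};
    for (const auto& suffix : weight_suffixes) {
        if (name.find(suffix) != std::string::npos) {
            return true;
        }
    }
    return false;
}

int main() {
    std::cout << name_is_matmul_weight("blk.0.attn_q.weight") << '\n';  // 1: converted to NZ
    std::cout << name_is_matmul_weight("token_embd.weight") << '\n';    // 0: left in ND
    return 0;
}
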
@@ -1139,16 +1229,26 @@ static void ggml_backend_cann_buffer_set_tensor(
     // For acl, synchronous functions use this default stream.
     // Why aclrtSynchronizeDevice?
 
+    bool weightToNZ = false;
+#ifdef ASCEND_310P
+    weightToNZ = (getenv("GGML_CANN_WEIGHT_NZ") != nullptr);
+#endif
     if (!need_transform(tensor->type)) {
         ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size,
                               ACL_MEMCPY_HOST_TO_DEVICE));
+        if (weightToNZ && is_matmul_weight((const ggml_tensor*)tensor)) {
+            weight_format_to_nz(tensor, data, offset);
+        }
     } else {
         void *transform_buffer = malloc(size);
         ggml_backend_cann_transform(tensor, data, transform_buffer);
 
         ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size,
                               transform_buffer, size,
                               ACL_MEMCPY_HOST_TO_DEVICE));
+        if (weightToNZ && is_matmul_weight((const ggml_tensor*)tensor)) {
+            weight_format_to_nz(tensor, transform_buffer, offset);
+        }
         free(transform_buffer);
     }
 }
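
Because the check runs at upload time inside ggml_backend_cann_buffer_set_tensor, GGML_CANN_WEIGHT_NZ has to be present in the environment before the weights are copied to the device: exporting it before launching the binary, or setting it programmatically first, is what actually triggers the conversion on a 310P build. A hypothetical host-side sketch (the setenv call is the only relevant part, everything else is elided):

#include <cstdlib>

// Hypothetical opt-in (not part of this commit): set the variable before the
// CANN backend uploads the matmul weights, otherwise the NZ conversion in
// ggml_backend_cann_buffer_set_tensor is skipped.
int main() {
    setenv("GGML_CANN_WEIGHT_NZ", "1", /*overwrite=*/1);
    // ... create the CANN backend and load the model as usual ...
    return 0;
}
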
@@ -2044,8 +2144,8 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
             switch (op->src[0]->type) {
                 case GGML_TYPE_F16:
                 case GGML_TYPE_F32:
-                    return true;
                 case GGML_TYPE_Q8_0:
+                    return true;
                 case GGML_TYPE_Q4_0:
 #ifdef ASCEND_310P
                     // Q4 && Q8 per group is not suppor on 310p device
