#include <acl/acl.h>
#include <stdarg.h>
+ #include <aclnnop/aclnn_trans_matmul_weight.h>

#include <cmath>
#include <cstdio>
@@ -1115,6 +1116,63 @@ static enum ggml_status ggml_backend_cann_buffer_init_tensor(
    return GGML_STATUS_SUCCESS;
}

+ static int CreateAclTensorWeight(const void *hostData, const std::vector<int64_t> &shape, void **deviceAddr,
+                                  aclDataType dataType, aclTensor **tensor)
+ {
+     uint64_t size = 1;
+     for (auto i : shape) {
+         size *= i;
+     }
+
+     // Query the element count the transformed matmul weight will occupy
+     // (may be padded relative to the plain ND shape product above).
+     const aclIntArray *mat2Size = aclCreateIntArray(shape.data(), shape.size());
+     ACL_CHECK(aclnnCalculateMatmulWeightSizeV2(mat2Size, dataType, &size));
+     ACL_CHECK(aclDestroyIntArray(mat2Size));
+
+     // Convert element count to bytes; weights here are 2-byte (fp16) elements.
+     size *= sizeof(int16_t);
+
+     ACL_CHECK(aclrtMalloc(deviceAddr, size, ACL_MEM_MALLOC_HUGE_FIRST));
+     ACL_CHECK(aclrtMemcpy(*deviceAddr, size, hostData, size, ACL_MEMCPY_HOST_TO_DEVICE));
+
+     // Contiguous row-major strides for the ND view of the weight.
+     std::vector<int64_t> strides(shape.size(), 1);
+     for (int64_t i = (int64_t) shape.size() - 2; i >= 0; i--) {
+         strides[i] = shape[i + 1] * strides[i + 1];
+     }
+
+     *tensor = aclCreateTensor(shape.data(), shape.size(), dataType, strides.data(), 0, aclFormat::ACL_FORMAT_ND,
+                               shape.data(), shape.size(), *deviceAddr);
+     return 0;
+ }
+
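A note on the helper above: aclnnCalculateMatmulWeightSizeV2 returns the element count the transformed weight will occupy, which may be padded upward relative to the raw shape product, so the device buffer is sized from that query rather than from the shape alone. The stride loop is just the usual contiguous row-major rule; a standalone sketch of that computation (the {4096, 11008} shape is an invented example value, not one from this patch):

    #include <cstdint>
    #include <vector>

    // Same contiguous row-major stride rule used by CreateAclTensorWeight:
    // the innermost dimension has stride 1, and each outer stride is the
    // product of all inner extents.
    static std::vector<int64_t> contiguous_strides(const std::vector<int64_t> &shape) {
        std::vector<int64_t> strides(shape.size(), 1);
        for (int64_t i = (int64_t) shape.size() - 2; i >= 0; i--) {
            strides[i] = shape[i + 1] * strides[i + 1];
        }
        return strides;
    }

    // contiguous_strides({4096, 11008}) -> {11008, 1}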
+ static void weight_format_to_nz(ggml_tensor *tensor, const void *data, size_t offset) {
+     aclrtStream stream;
+     ACL_CHECK(aclrtCreateStream(&stream));
+
+     // ACL shapes list dimensions outermost-first, so swap ggml's
+     // innermost-first ne[0]/ne[1] order for the staging tensor.
+     std::vector<int64_t> weightTransposedShape = {tensor->ne[1], tensor->ne[0]};
+     void *weightTransposedDeviceAddr = nullptr;
+     aclTensor *weightTransposed = nullptr;
+     CreateAclTensorWeight(data, weightTransposedShape, &weightTransposedDeviceAddr,
+                           ggml_cann_type_mapping(tensor->type), &weightTransposed);
+
+     uint64_t workspaceSize = 0;
+     aclOpExecutor *executor;
+     void *workspaceAddr = nullptr;
+
+     // TransMatmulWeight: in-place ND -> NZ conversion of the staging tensor.
+     ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed, &workspaceSize, &executor));
+     std::unique_ptr<void, aclError (*)(void *)> workspaceAddrPtrTrans(nullptr, aclrtFree);
+     if (workspaceSize > 0) {
+         ACL_CHECK(aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST));
+         workspaceAddrPtrTrans.reset(workspaceAddr);
+     }
+     ACL_CHECK(aclnnTransMatmulWeight(workspaceAddr, workspaceSize, executor, stream));
+     // Make sure the transform has finished before reading its output.
+     ACL_CHECK(aclrtSynchronizeStream(stream));
+
+     size_t size = ggml_nelements(tensor) * ggml_element_size(tensor);
+
+     // Both pointers live in device memory, so this is a device-to-device copy.
+     ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size,
+                           weightTransposedDeviceAddr, size, ACL_MEMCPY_DEVICE_TO_DEVICE));
+     ACL_CHECK(aclDestroyTensor(weightTransposed));
+     ACL_CHECK(aclrtFree(weightTransposedDeviceAddr));
+     ACL_CHECK(aclrtDestroyStream(stream));
+ }
+
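The set_tensor hunk below gates this conversion on is_matmul_weight, which is defined elsewhere in the patch and not shown in this excerpt. Purely as an illustration of what such a predicate could look like (a hypothetical helper with an invented suffix list, not the patch's actual definition):

    #include <string>
    #include <unordered_set>

    // Hypothetical sketch: classify a tensor as a matmul weight by its name.
    static bool is_matmul_weight_sketch(const ggml_tensor *tensor) {
        static const std::unordered_set<std::string> suffixes = {
            "attn_q.weight", "attn_k.weight", "attn_v.weight", "ffn_up.weight",
        };
        const std::string name = ggml_get_name(tensor);
        for (const auto &s : suffixes) {
            if (name.size() >= s.size() &&
                name.compare(name.size() - s.size(), s.size(), s) == 0) {
                return true;  // name ends with a known weight suffix
            }
        }
        return false;
    }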
// TODO: need to handle tensors which have padding.
/**
 * @brief Set tensor data in a CANN buffer.
@@ -1139,9 +1197,16 @@ static void ggml_backend_cann_buffer_set_tensor(
    // For acl, synchronous functions use this default stream.
    // Why aclrtSynchronizeDevice?

+     bool weightToNZ = false;
+ #ifdef ASCEND_310P
+     weightToNZ = (getenv("GGML_CANN_WEIGHT_NZ") != nullptr);
+ #endif
    if (!need_transform(tensor->type)) {
        ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size,
                              ACL_MEMCPY_HOST_TO_DEVICE));
+         // Optionally re-pack matmul weights into NZ format after the plain copy.
+         if (weightToNZ && is_matmul_weight((const ggml_tensor *)tensor)) {
+             weight_format_to_nz(tensor, data, offset);
+         }
    } else {
        void *transform_buffer = malloc(size);
        ggml_backend_cann_transform(tensor, data, transform_buffer);
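Taken together: on a build compiled with ASCEND_310P, exporting GGML_CANN_WEIGHT_NZ makes set_tensor re-pack eligible matmul weights into NZ format immediately after the plain host-to-device copy; without the variable, or on other targets, the copy path is unchanged. For example (binary and model names illustrative):

    GGML_CANN_WEIGHT_NZ=1 ./llama-cli -m model.gguf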