Merged
344 changes: 341 additions & 3 deletions ggml/src/ggml-cann/aclnn_ops.cpp
@@ -32,6 +32,8 @@
#include <aclnnop/aclnn_group_norm.h>
#include <aclnnop/aclnn_index_fill_tensor.h>
#include <aclnnop/aclnn_layer_norm.h>
#include <aclnnop/aclnn_mm.h>
#include <aclnnop/aclnn_batch_matmul.h>
#include <aclnnop/aclnn_matmul.h>
#include <aclnnop/aclnn_max_pool.h>
#include <aclnnop/aclnn_permute.h>
@@ -2425,7 +2427,6 @@ static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input,
aclTensor* acl_weight, aclTensor* acl_dst) {
int8_t cube_math_type = 1; // ALLOW_FP32_DOWN_PRECISION, when input is
// fp32, atlas a2 will transpose it to HFLOAT32.

uint64_t workspaceSize = 0;
aclOpExecutor* executor;
void* workspaceAddr = nullptr;
@@ -2443,6 +2444,80 @@ static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input,
aclnnMatmul(workspaceAddr, workspaceSize, executor, ctx.stream()));
}

/**
* @brief Performs matrix multiplication of two 2D tensors.
*
* This function computes the matrix multiplication of the input tensor
* `acl_input` and the weight tensor `acl_weight`, and stores the result in the
* destination tensor `acl_dst`.
* The operation is defined as:
* \f[
* \text{acl_dst} = \text{acl_input} @ \text{acl_weight}
* \f]
*
* @param ctx The context for the CANN backend operations.
* @param acl_input The input tensor for the matrix multiplication.
* @param acl_weight The weight tensor for the matrix multiplication.
* @param acl_dst The destination tensor where the result of the matrix
* multiplication will be stored.
*/
static void aclnn_mat_mul_2d(ggml_backend_cann_context& ctx, aclTensor* acl_input,
aclTensor* acl_weight, aclTensor* acl_dst) {
int8_t cube_math_type = 2;
uint64_t workspaceSize = 0;
aclOpExecutor* executor;
void* workspaceAddr = nullptr;

ACL_CHECK(aclnnMmGetWorkspaceSize(acl_input, acl_weight, acl_dst,
cube_math_type, &workspaceSize,
&executor));

if (workspaceSize > 0) {
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
workspaceAddr = workspace_allocator.get();
}

ACL_CHECK(
aclnnMm(workspaceAddr, workspaceSize, executor, ctx.stream()));
}

/**
* @brief Performs matrix multiplication of two 3D tensors.
*
* This function computes the matrix multiplication of the input tensor
* `acl_input` and the weight tensor `acl_weight`, and stores the result in the
* destination tensor `acl_dst`.
* The operation is defined as:
* \f[
* \text{acl_dst} = \text{acl_input} @ \text{acl_weight}
* \f]
*
* @param ctx The context for the CANN backend operations.
* @param acl_input The input tensor for the matrix multiplication.
* @param acl_weight The weight tensor for the matrix multiplication.
* @param acl_dst The destination tensor where the result of the matrix
* multiplication will be stored.
*/
static void aclnn_mat_mul_3d(ggml_backend_cann_context& ctx, aclTensor* acl_input,
aclTensor* acl_weight, aclTensor* acl_dst) {
int8_t cube_math_type = 2;
uint64_t workspaceSize = 0;
aclOpExecutor* executor;
void* workspaceAddr = nullptr;

ACL_CHECK(aclnnBatchMatMulGetWorkspaceSize(acl_input, acl_weight, acl_dst,
cube_math_type, &workspaceSize,
&executor));

if (workspaceSize > 0) {
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
workspaceAddr = workspace_allocator.get();
}

ACL_CHECK(
aclnnBatchMatMul(workspaceAddr, workspaceSize, executor, ctx.stream()));
}

/**
* @brief Performs matrix multiplication with floating-point precision on
* tensors using the CANN backend.
@@ -2484,6 +2559,70 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
ACL_CHECK(aclDestroyTensor(acl_dst));
}

/**
* @brief Performs matrix multiplication with floating-point precision on
* tensors using the CANN backend.
*
* This function performs matrix multiplication of the input tensor and the
* weight tensor, handling broadcasting and transposing as needed, and stores
* the result in the destination tensor `dst`.
*
* @param ctx The context for the CANN backend operations.
* @param dst The destination tensor where the result of the matrix
* multiplication will be stored.
*/
static void ggml_cann_mat_mul_fp2(ggml_backend_cann_context& ctx,
ggml_tensor* dst) {
ggml_tensor* weight = dst->src[0]; // weight
ggml_tensor* input = dst->src[1]; // input

// When weight ne2 or ne3 is 1, aclnnMatmulGetWorkspaceSize broadcasts
// automatically; when weight ne2 or ne3 is not 1, the weight must be repeated.
BCAST_MUL_MAT_SHAPE(input, weight, dst);

int64_t n_dims = bcast_dims;
if (bcast_input_ne[3] == bcast_weight_ne[3] && bcast_input_ne[3] == 1) {
if (bcast_input_ne[2] == 1 && bcast_weight_ne[2] == 1) {
n_dims = 2;
} else if (bcast_input_ne[2] == 1) {
n_dims = 3;
}
}

aclTensor* acl_input_tensor =
ggml_cann_create_tensor(input, bcast_input_ne, bcast_input_nb, n_dims);
int64_t transpose_ne[] = {
bcast_weight_ne[1], bcast_weight_ne[0],
bcast_weight_ne[2], bcast_weight_ne[3],
bcast_weight_ne[4], bcast_weight_ne[5]
};
size_t transpose_nb[] = {
bcast_weight_nb[1], bcast_weight_nb[0],
bcast_weight_nb[2], bcast_weight_nb[3],
bcast_weight_nb[4], bcast_weight_nb[5]
};
aclTensor* acl_weight_tensor =
ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims);
aclTensor* acl_dst =
ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims);

switch (n_dims) {
case 2:
aclnn_mat_mul_2d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
break;
case 3:
aclnn_mat_mul_3d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
break;
default:
aclnn_mat_mul(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
break;
}

ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
ACL_CHECK(aclDestroyTensor(acl_input_tensor));
ACL_CHECK(aclDestroyTensor(acl_dst));
}
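For reference, here is a minimal standalone sketch of the kernel-selection rule implemented above; `choose_n_dims` and the sample shapes are hypothetical, not part of the patch. It only restates the `n_dims` logic so the three dispatch cases are easy to trace:

```cpp
// Sketch (not part of the patch) of the dimension-collapse rule above.
// ne[2]/ne[3] are the batch dims after BCAST_MUL_MAT_SHAPE.
#include <cstdint>

static int64_t choose_n_dims(const int64_t input_ne[4],
                             const int64_t weight_ne[4], int64_t bcast_dims) {
    int64_t n_dims = bcast_dims;  // default: generic aclnnMatmul path
    if (input_ne[3] == weight_ne[3] && input_ne[3] == 1) {
        if (input_ne[2] == 1 && weight_ne[2] == 1) {
            n_dims = 2;  // no batch dims left: plain aclnnMm
        } else if (input_ne[2] == 1) {
            n_dims = 3;  // a single batch dim: aclnnBatchMatMul
        }
    }
    return n_dims;
}

int main() {
    const int64_t input_ne[4]  = {4096, 8, 1, 1};    // 8 rows, no batch
    const int64_t weight_ne[4] = {4096, 4096, 1, 1};
    // Collapses to the 2-D path, avoiding generic matmul overhead.
    return choose_n_dims(input_ne, weight_ne, 4) == 2 ? 0 : 1;
}
```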

/**
* @brief Performs matrix multiplication with quantized weights and
* floating-point inputs using the CANN backend.
@@ -2636,16 +2775,215 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
ACL_CHECK(aclDestroyTensor(acl_dst_tensor));
}

/**
* @brief Performs matrix multiplication with quantized weights and
* floating-point inputs using the CANN backend.
*
* This function performs matrix multiplication of the input tensor `src1` and
* the weight tensor `src0`, handling broadcasting, transposing, and
* quantization as needed, and stores the result in the destination tensor
* `dst`.
*
* @param ctx The context for the CANN backend operations.
* @param dst The destination tensor where the result of the matrix
* multiplication will be stored.
*/
static void ggml_cann_mul_mat_quant2(ggml_backend_cann_context& ctx,
ggml_tensor* dst,
const enum ggml_type type) {
ggml_tensor* src0 = dst->src[0]; // weight
ggml_tensor* src1 = dst->src[1]; // input

// The shape of the weight is NCHW.
// Matrix multiplication uses the HW dims.
// NC is regarded as the batch.
// The weight needs to be transposed.
float weight_elem_size;
if (type == GGML_TYPE_Q4_0) {
weight_elem_size = float(sizeof(uint8_t)) / 2;
} else if (type == GGML_TYPE_Q8_0) {
weight_elem_size = float(sizeof(uint8_t));
} else {
GGML_ABORT("Only support Q4_0 and Q8_0 MUL_MAT");
}
float weight_nb[] = {src0->ne[0] * weight_elem_size, weight_elem_size};
size_t weight_stride = src0->ne[1] * src0->ne[0] * weight_elem_size;
size_t weight_size = weight_stride * src0->ne[2] * src0->ne[3];

// Scales are stored at the end of the weight data.
// The scales need to be transposed.
size_t scale_elem_size = sizeof(uint16_t);
size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size, scale_elem_size};
size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size;
char* scale_offset = (char*)src0->data + weight_size;

// input
size_t input_elem_size = sizeof(uint16_t);
int64_t input_ne[] = {src1->ne[0], src1->ne[1]};
size_t input_nb[] = {input_elem_size, input_ne[0] * input_elem_size};
size_t input_stride = input_ne[0] * input_ne[1] * input_elem_size;
ggml_cann_pool_alloc input_alloctor(ctx.pool());
void* input_buffer = src1->data;

// cast input to FP16 if needed
if (src1->type != GGML_TYPE_F16) {
aclTensor* acl_src1_tensor = ggml_cann_create_tensor(src1);
input_buffer = input_alloctor.alloc(ggml_nelements(src1) * input_elem_size);

int64_t* input_cast_ne = src1->ne;
size_t input_cast_nb[GGML_MAX_DIMS];
input_cast_nb[0] = sizeof(uint16_t);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
input_cast_nb[i] = input_cast_nb[i - 1] * input_cast_ne[i - 1];
}

aclTensor* acl_input_tensor = ggml_cann_create_tensor(
input_buffer,
ACL_FLOAT16,
input_elem_size, input_cast_ne, input_cast_nb, GGML_MAX_DIMS);
aclnn_cast(ctx, acl_src1_tensor, acl_input_tensor, ACL_FLOAT16);

ACL_CHECK(aclDestroyTensor(acl_input_tensor));
ACL_CHECK(aclDestroyTensor(acl_src1_tensor));
}

// output
size_t output_elem_size = sizeof(uint16_t);
size_t output_nb[] = {output_elem_size, dst->ne[0] * output_elem_size};
ggml_cann_pool_alloc output_allocator(ctx.pool());
void* output_buffer = output_allocator.alloc(ggml_nelements(dst) * output_elem_size);
size_t output_stride = dst->ne[0] * dst->ne[1] * output_elem_size;

// aclnn
int64_t max_elem_size = 65535;
int64_t split_size = (src0->ne[1] / max_elem_size) + 1;
ggml_cann_pool_alloc workspace_allocator(ctx.pool());
aclOpExecutor* executor = nullptr;
uint64_t workspaceSize = 0;
Collaborator

It seems this part removes the limitation on the max line length (65536). Please check whether Qwen2-1.5B-Instruct Q8_0 is supported now.
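The reviewer's concern can be checked arithmetically. A minimal sketch, assuming Qwen2-1.5B-Instruct's vocabulary dimension of 151,936 as `src0->ne[1]` (that model dimension is an assumption for illustration; the clamping rule mirrors the split loop below):

```cpp
// Sketch (not part of the patch): how src0->ne[1] is covered by splits of at
// most 65535 rows, the per-call limit of aclnnWeightQuantBatchMatmulV2.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t max_elem_size = 65535;
    const int64_t ne1 = 151936;  // hypothetical src0->ne[1] (vocab size)
    const int64_t split_size = (ne1 / max_elem_size) + 1;  // 3 splits
    for (int64_t split = 0; split < split_size; split++) {
        // Same clamping as the loop below: the last split gets the remainder.
        int64_t rows = max_elem_size * (split + 1) > ne1
                           ? ne1 - max_elem_size * split
                           : max_elem_size;
        printf("split %lld: %lld rows\n", (long long)split, (long long)rows);
    }
    // Prints 65535, 65535, 20866 -- all 151936 rows are covered, so matrices
    // wider than 65535 no longer need to be rejected.
    return 0;
}
```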

void* workspaceAddr = nullptr;
for (int64_t n1 = 0; n1 < src1->ne[3]; n1++) {
for (int64_t c1 = 0; c1 < src1->ne[2]; c1++) {
int64_t n0 = n1 / (src1->ne[3] / src0->ne[3]);
int64_t c0 = c1 / (src1->ne[2] / src0->ne[2]);

int64_t batch1 = (n1 * src1->ne[2]) + c1;
int64_t batch0 = (n0 * src0->ne[2]) + c0;

aclTensor* acl_input_tensor = ggml_cann_create_tensor(
(char*)input_buffer + batch1 * input_stride,
ACL_FLOAT16,
input_elem_size, input_ne, input_nb, 2);

// first split
int64_t weight_ne_offset = 0;
int64_t weight_ne[2] = {max_elem_size > src0->ne[1] ? src0->ne[1] : max_elem_size, src0->ne[0]};
int64_t scale_ne_offset = 0;
int64_t scale_ne[2] = {weight_ne[0], weight_ne[1] / QK8_0};
int64_t output_ne_offset = 0;
int64_t output_ne[2] = {weight_ne[0], dst->ne[1]};

aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
(char*)src0->data + batch0 * weight_stride,
ggml_cann_type_mapping(type),
weight_elem_size, weight_ne, weight_nb, 2,
ACL_FORMAT_ND, weight_ne_offset);
aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
scale_offset + batch0 * scale_stride,
ACL_FLOAT16,
scale_elem_size, scale_ne, scale_nb, 2,
ACL_FORMAT_ND, scale_ne_offset);
aclTensor* acl_output_tensor = ggml_cann_create_tensor(
(char*)output_buffer + batch1 * output_stride,
ACL_FLOAT16,
output_elem_size, output_ne, output_nb, 2,
ACL_FORMAT_ND, output_ne_offset);

ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
acl_input_tensor, acl_weight_tensor, acl_scale_tensor,
nullptr, nullptr, nullptr, nullptr, QK8_0,
acl_output_tensor, &workspaceSize, &executor));
if (workspaceAddr == nullptr) {
workspaceAddr = workspace_allocator.alloc(workspaceSize);
}
ACL_CHECK(aclnnWeightQuantBatchMatmulV2(
workspaceAddr, workspaceSize, executor, ctx.stream()));

ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
ACL_CHECK(aclDestroyTensor(acl_scale_tensor));
ACL_CHECK(aclDestroyTensor(acl_output_tensor));

// other splits
for (int64_t split = 1; split < split_size; split++) {
weight_ne_offset += weight_elem_size * weight_ne[0] * weight_ne[1];
weight_ne[0] = max_elem_size * (split + 1) > src0->ne[1] ? src0->ne[1] - (max_elem_size * split) : max_elem_size;
scale_ne_offset += scale_elem_size * scale_ne[0] * scale_ne[1];
scale_ne[0] = weight_ne[0];
output_ne_offset += output_elem_size * output_ne[0] * output_ne[1];
output_ne[0] = weight_ne[0];

acl_weight_tensor = ggml_cann_create_tensor(
(char*)src0->data + batch0 * weight_stride,
ggml_cann_type_mapping(type),
weight_elem_size, weight_ne, weight_nb, 2,
ACL_FORMAT_ND, weight_ne_offset);
acl_scale_tensor = ggml_cann_create_tensor(
scale_offset + batch0 * scale_stride,
ACL_FLOAT16,
scale_elem_size, scale_ne, scale_nb, 2,
ACL_FORMAT_ND, scale_ne_offset);
acl_output_tensor = ggml_cann_create_tensor(
(char*)output_buffer + batch1 * output_stride,
ACL_FLOAT16,
output_elem_size, output_ne, output_nb, 2,
ACL_FORMAT_ND, output_ne_offset);

ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
acl_input_tensor, acl_weight_tensor, acl_scale_tensor,
nullptr, nullptr, nullptr, nullptr, QK8_0,
acl_output_tensor, &workspaceSize, &executor));
ACL_CHECK(aclnnWeightQuantBatchMatmulV2(
workspaceAddr, workspaceSize, executor, ctx.stream()));

ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
ACL_CHECK(aclDestroyTensor(acl_scale_tensor));
ACL_CHECK(aclDestroyTensor(acl_output_tensor));
}

ACL_CHECK(aclDestroyTensor(acl_input_tensor));
}
}

// cast out
if (dst->type != GGML_TYPE_F16) {
int64_t* output_cast_ne = dst->ne;
size_t output_cast_nb[GGML_MAX_DIMS];
output_cast_nb[0] = sizeof(uint16_t);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
output_cast_nb[i] = output_cast_nb[i - 1] * output_cast_ne[i - 1];
}

aclTensor* acl_output_tensor = ggml_cann_create_tensor(
output_buffer,
ACL_FLOAT16,
output_elem_size, output_cast_ne, output_cast_nb, GGML_MAX_DIMS);
aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst);
aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor, ggml_cann_type_mapping(dst->type));

ACL_CHECK(aclDestroyTensor(acl_output_tensor));
ACL_CHECK(aclDestroyTensor(acl_dst_tensor));
}
}
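As context for the layout comments in this function, here is a small standalone sketch of how the quantized data and the appended scales line up for a hypothetical Q8_0 weight; the tensor shape is illustrative, while the formulas mirror the code above:

```cpp
// Sketch (not part of the patch) of the Q8_0 memory layout assumed above:
// quantized bytes first, one FP16 scale per QK8_0-element block appended
// at the end of the weight data.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t QK8_0 = 32;            // elements per quantization block
    const int64_t ne0 = 4096, ne1 = 4096, ne2 = 1, ne3 = 1;  // hypothetical
    const float   weight_elem_size = 1.0f;  // 1 byte per Q8_0 element
                                            // (0.5f for Q4_0)
    const size_t  scale_elem_size  = sizeof(uint16_t);  // FP16 scale

    size_t weight_stride = (size_t)(ne1 * ne0 * weight_elem_size);
    size_t weight_size   = weight_stride * ne2 * ne3;  // all quantized bytes
    size_t scale_stride  = (size_t)(ne1 * ne0 / QK8_0) * scale_elem_size;

    // scale_offset in the code: scales begin right after the quantized data.
    printf("weight bytes: %zu, scales start at offset %zu, "
           "scale bytes per batch: %zu\n",
           weight_size, weight_size, scale_stride);
    return 0;
}
```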

void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
const enum ggml_type type = dst->src[0]->type;
switch (type) {
case GGML_TYPE_F32:
case GGML_TYPE_F16:
-            ggml_cann_mat_mul_fp(ctx, dst);
+            ggml_cann_mat_mul_fp2(ctx, dst);
Collaborator

If the old functions are not used anymore, it's better to delete them.

break;
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q8_0:
-            ggml_cann_mul_mat_quant(ctx, dst, type);
+            ggml_cann_mul_mat_quant2(ctx, dst, type);
break;
default:
GGML_ABORT("fatal error");
Expand Down