Merged
Changes from 6 commits
6 changes: 5 additions & 1 deletion Makefile
@@ -874,6 +874,10 @@ ggml/src/ggml-cuda/%.o: \
$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
endif # GGML_HIPBLAS

ifndef GGML_NO_CPU_AARCH64
MK_CPPFLAGS += -DGGML_USE_CPU_AARCH64
endif

ifdef GGML_METAL
MK_CPPFLAGS += -DGGML_USE_METAL
MK_LDFLAGS += -framework Foundation -framework Metal -framework MetalKit
@@ -883,7 +887,7 @@ ifdef GGML_METAL_NDEBUG
endif
ifdef GGML_METAL_EMBED_LIBRARY
MK_CPPFLAGS += -DGGML_METAL_EMBED_LIBRARY
OBJ_GGML += ggml/src/ggml-metal-embed.o
OBJ_GGML += ggml/src/ggml-metal-embed.o
endif
endif # GGML_METAL

1 change: 1 addition & 0 deletions ggml/CMakeLists.txt
@@ -92,6 +92,7 @@ else()
endif()

option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)

option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB})
5 changes: 5 additions & 0 deletions ggml/include/ggml-cpu.h
@@ -145,6 +145,11 @@ extern "C" {
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
#endif

GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void);
GGML_API bool ggml_backend_cpu_buft_is_aarch64(ggml_backend_buffer_type_t buft);



#ifdef __cplusplus
}
#endif
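The two declarations above are the public surface of the new buffer type. The sketch below is illustrative only, not part of the patch: it allocates a placeholder 4096x4096 Q4_0 weight in the AARCH64 buffer type and uploads its data, which is the path that exercises the repack logic added further down in ggml-backend.cpp (init_tensor records the target layout, set_tensor performs the conversion on upload). The allocation and upload helpers are existing ggml API from ggml.h, ggml-alloc.h and ggml-backend.h; the tensor shape and the upload_repacked_weight wrapper are made up for the example.

// Hedged usage sketch (not part of the patch): allocate a Q4_0 weight in the
// AARCH64 buffer type so that uploading its data triggers the runtime repack.
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "ggml-cpu.h"

static void upload_repacked_weight(const void * q4_0_bytes, size_t n_bytes) {
    struct ggml_init_params params = {
        /* .mem_size   = */ ggml_tensor_overhead(),
        /* .mem_buffer = */ NULL,
        /* .no_alloc   = */ true, // tensor data lives in the backend buffer below
    };
    struct ggml_context * ctx = ggml_init(params);
    struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 4096, 4096);

    ggml_backend_buffer_type_t buft = ggml_backend_cpu_aarch64_buffer_type();
    GGML_ASSERT(ggml_backend_cpu_buft_is_aarch64(buft));

    // allocation calls init_tensor(), which records the optimal repack type in w->extra ...
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);

    // ... and set_tensor() repacks the Q4_0 blocks into the Q4_0_x_x layout on upload
    GGML_ASSERT(n_bytes == ggml_nbytes(w));
    ggml_backend_tensor_set(w, q4_0_bytes, 0, n_bytes);

    ggml_backend_buffer_free(buf);
    ggml_free(ctx);
}

ggml_backend_cpu_buft_is_aarch64() is the predicate that the supports_op/supports_buft checks later in this diff use to gate which operations may touch the repacked weights.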
6 changes: 6 additions & 0 deletions ggml/src/CMakeLists.txt
@@ -880,6 +880,12 @@ if (GGML_CPU_HBM)
target_link_libraries(ggml PUBLIC memkind)
endif()

if (GGML_CPU_AARCH64)
message(STATUS "Using runtime weight conversion of Q4_0 to Q4_0_x_x to enable optimized GEMM/GEMV kernels")

add_compile_definitions(GGML_USE_CPU_AARCH64)
endif()

if (GGML_CANN)
if ("cann${CANN_INSTALL_DIR}" STREQUAL "cann" AND DEFINED ENV{ASCEND_TOOLKIT_HOME})
set(CANN_INSTALL_DIR $ENV{ASCEND_TOOLKIT_HOME})
108 changes: 108 additions & 0 deletions ggml/src/ggml-aarch64.c
@@ -3476,3 +3476,111 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
}
}
}

static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * restrict data, size_t data_size) {
GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
GGML_ASSERT(interleave_block == 4 || interleave_block == 8);

block_q4_0x4 * dst = (block_q4_0x4 *)t->data;
const block_q4_0 * src = (const block_q4_0 *)data;
block_q4_0 dst_tmp[4];
int nrow = t->ne[1]; // Number of rows
int nrows_interleaved = 4;
int nblocks = t->ne[0] / QK4_0;

GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));

if (nrow % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
return -1;
}

for (int b = 0; b < nrow; b += nrows_interleaved) {
for (int64_t x = 0; x < nblocks; x++) {
for (int i = 0; i < nrows_interleaved; i++) {
dst_tmp[i] = src[x + i * nblocks];
}
*dst++ = make_block_q4_0x4(dst_tmp, interleave_block, 0x88);
}
src += nrows_interleaved * nblocks;
}
return 0;

GGML_UNUSED(data_size);
}

static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor *t, int interleave_block, const void * restrict data, size_t data_size) {
GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
GGML_ASSERT(interleave_block == 8);

block_q4_0x8 * dst = (block_q4_0x8*)t->data;
const block_q4_0 * src = (const block_q4_0*) data;
block_q4_0 dst_tmp[8];
int nrow = t->ne[1]; // Number of rows
int nrows_interleaved = 8;
int nblocks = t->ne[0] / QK4_0;

GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));

if (nrow % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
return -1;
}

for (int b = 0; b < nrow; b += nrows_interleaved) {
for (int64_t x = 0; x < nblocks; x++) {
for (int i = 0; i < nrows_interleaved; i++ ) {
dst_tmp[i] = src[x + i * nblocks];
}
*dst++ = make_block_q4_0x8(dst_tmp, interleave_block, 0x88);
}
src += nrows_interleaved * nblocks;
}
return 0;

GGML_UNUSED(data_size);
}

// Prepare for optimized kernels if applicable
void ggml_aarch64_repack_tensor(struct ggml_tensor * cur, enum ggml_type repack_type, const void * restrict data, size_t data_size) {
int ret = -1;

if (cur->type == repack_type) {
memcpy(cur->data, data, data_size);
return;
}

GGML_ASSERT(cur->type == GGML_TYPE_Q4_0);

switch (repack_type) {
case GGML_TYPE_Q4_0_8_8:
ret = repack_q4_0_to_q4_0_8_bl(cur, 8, data, data_size);
break;
case GGML_TYPE_Q4_0_4_8:
ret = repack_q4_0_to_q4_0_4_bl(cur, 8, data, data_size);
break;
case GGML_TYPE_Q4_0_4_4:
ret = repack_q4_0_to_q4_0_4_bl(cur, 4, data, data_size);
break;
default:
GGML_ABORT("Unsupported type");
}
if (ret == -1) {
memcpy(cur->data, data, data_size);
}
}

enum ggml_type ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * cur) {
if (cur->type == GGML_TYPE_Q4_0) {
// TODO: enable for AVX2 - currently disabled due to bad gemv performance
if (/* ggml_cpu_has_avx2() || */ (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
return GGML_TYPE_Q4_0_8_8;
}
if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
return GGML_TYPE_Q4_0_4_8;
}
if (ggml_cpu_has_neon()) {
return GGML_TYPE_Q4_0_4_4;
}
}

return cur->type;
}
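Both repack routines above gather the same block index from nrows_interleaved consecutive rows into one interleaved group, then advance the source pointer to the next group of rows. The snippet below is a standalone illustration of that index arithmetic only, with plain ints standing in for block_q4_0; the row and block counts are chosen for the example.

// Illustrative sketch of the gather order used by repack_q4_0_to_q4_0_4_bl.
// Source blocks are row-major (NBLOCKS per row); each repacked x4 group takes
// the same block index from 4 consecutive rows, matching src[x + i * nblocks].
#include <stdio.h>

int main(void) {
    enum { NROW = 8, NBLOCKS = 3, NROWS_INTERLEAVED = 4 };

    int src_offset = 0; // mirrors the `src` pointer advancing per row group
    for (int b = 0; b < NROW; b += NROWS_INTERLEAVED) {
        for (int x = 0; x < NBLOCKS; x++) {
            printf("group (rows %d..%d, block %d):", b, b + NROWS_INTERLEAVED - 1, x);
            for (int i = 0; i < NROWS_INTERLEAVED; i++) {
                // same index math as the repack loop: src[x + i * nblocks]
                printf(" src[%d]", src_offset + x + i * NBLOCKS);
            }
            printf("\n");
        }
        src_offset += NROWS_INTERLEAVED * NBLOCKS; // src += nrows_interleaved * nblocks
    }
    return 0;
}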
3 changes: 3 additions & 0 deletions ggml/src/ggml-aarch64.h
@@ -33,6 +33,9 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);

void ggml_aarch64_repack_tensor(struct ggml_tensor * cur, enum ggml_type repack_type, const void * data, size_t data_size);
enum ggml_type ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * cur);

#ifdef __cplusplus
}
#endif
125 changes: 116 additions & 9 deletions ggml/src/ggml-backend.cpp
@@ -2239,15 +2239,104 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
}
#endif

// buffer type AARCH64

#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wpedantic"
#endif

#include "ggml-aarch64.h"

#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif

static void ggml_backend_cpu_aarch64_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
tensor->extra = (void *)ggml_aarch64_get_optimal_repack_type(tensor); // NOLINT

GGML_UNUSED(buffer);
}

static void ggml_backend_cpu_aarch64_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
GGML_ASSERT(offset == 0);
GGML_ASSERT(size == ggml_nbytes(tensor));

enum ggml_type repack_type = (enum ggml_type)(intptr_t)tensor->extra;

ggml_aarch64_repack_tensor(tensor, repack_type, data, size);

GGML_UNUSED(buffer);
}

static const struct ggml_backend_buffer_i ggml_backend_cpu_aarch64_buffer_i = {
/* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
/* .init_tensor = */ ggml_backend_cpu_aarch64_buffer_init_tensor,
/* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
/* .set_tensor = */ ggml_backend_cpu_aarch64_buffer_set_tensor,
/* .get_tensor = */ NULL,
Inline review comments on `.get_tensor`:

@slaren (Member) commented on Nov 14, 2024:

It is going to be necessary to implement the get_tensor function to return a copy of the tensor in the original Q4_0 format. This is because some backends such as CUDA and Vulkan will offload computation of large batches to the GPU, and to do that the tensor needs to be copied to VRAM in the Q4_0 format. But it may be better to disable this behavior entirely for repacked tensors by only doing so when the weight is stored in a host buffer, since that ensures that no conversions are required.

@chaxu01 (Collaborator, Author) replied on Nov 14, 2024:

> It is going to be necessary to implement the get_tensor function to return a copy of the tensor in the original Q4_0 format. This is because some backends such as CUDA and Vulkan will offload computation of large batches to the GPU, and to do that the tensor needs to be copied to VRAM in the Q4_0 format.

I understand the requirement to convert the repacked weight data back to the original Q4_0 format in get_tensor to ensure compatibility with the CUDA and Vulkan backends. That way, when these backends offload computation of large batches to the GPU, the tensor will be in the correct format for transfer to VRAM. I will proceed with implementing this conversion logic.

> But it may be better to disable this behavior entirely for repacked tensors by only doing so when the weight is stored in a host buffer, since that ensures that no conversions are required.

I would appreciate some additional clarification on this point. If I understand correctly, you are suggesting that we disable CUDA and Vulkan support for tensors that have been repacked, which would imply that repacked tensors should not be used with GPU backends so that no conversion is needed. Please confirm whether this interpretation is accurate or whether you have a different approach in mind.

@slaren (Member) replied:

You are correct in your interpretation. My concern is that if we don't implement the logic to convert the tensors back to Q4_0 in get_tensor, it will result in a performance loss when processing prompts with a GPU backend (even without layers offloaded, since the GPU is still used for prompt processing). However, in that case the best solution may be to disable the weight repacking entirely: repacking doesn't help generation performance anyway, and prompt processing would be done on the GPU, so there would be no reason to repack the weights at all. I think this will need to be handled in llama.cpp instead, with an exception added for this case.

Anyway, this is not an issue at the moment since it is only enabled for ARM currently. You don't need to do anything; I will resolve the merge conflicts and merge this.

/* .cpy_tensor = */ NULL,
/* .clear = */ ggml_backend_cpu_buffer_clear,
/* .reset = */ NULL,
};
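The review thread above discusses adding a get_tensor callback that hands the weights back in plain Q4_0; in this change the slot is left as NULL. What follows is a hedged sketch of one possible shape for such a callback, written as if it lived in ggml-backend.cpp next to the functions above. ggml_aarch64_unpack_tensor_to_q4_0() is a hypothetical helper that does not exist in this PR; it would have to invert make_block_q4_0x4/x8, that is, undo the block interleaving and the 0x88 XOR applied during repacking.

// Hedged sketch only: one possible shape for the get_tensor callback discussed
// in the review thread. ggml_aarch64_unpack_tensor_to_q4_0() is hypothetical.
static void ggml_backend_cpu_aarch64_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    GGML_ASSERT(offset == 0);
    GGML_ASSERT(size == ggml_nbytes(tensor));

    const enum ggml_type repack_type = (enum ggml_type)(intptr_t)tensor->extra;

    if (repack_type == tensor->type) {
        // nothing was repacked for this tensor; return the stored bytes as-is
        memcpy(data, tensor->data, size);
        return;
    }

    // hypothetical inverse of ggml_aarch64_repack_tensor(): rebuild the original
    // Q4_0 block order in the caller's buffer
    ggml_aarch64_unpack_tensor_to_q4_0(tensor, repack_type, data, size);

    GGML_UNUSED(buffer);
}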

static const char * ggml_backend_cpu_aarch64_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
return "CPU_AARCH64";

GGML_UNUSED(buft);
}

static ggml_backend_buffer_t ggml_backend_cpu_aarch64_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
void * data = ggml_aligned_malloc(size);

if (data == NULL) {
GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
return NULL;
}

return ggml_backend_buffer_init(buft, ggml_backend_cpu_aarch64_buffer_i, data, size);
}

ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void) {
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_aarch64 = {
/* .iface = */ {
/* .get_name = */ ggml_backend_cpu_aarch64_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_cpu_aarch64_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
/* .is_host = */ NULL,
},
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
/* .context = */ NULL,
};

return &ggml_backend_cpu_buffer_type_aarch64;
}

bool ggml_backend_cpu_buft_is_aarch64(ggml_backend_buffer_type_t buft) {
return buft == ggml_backend_cpu_aarch64_buffer_type();
}

 static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backend_dev_t device) {
-    static ggml_backend_buffer_type_t bufts[] = {
+    static std::vector<ggml_backend_buffer_type_t> bufts = []() {
+        std::vector<ggml_backend_buffer_type_t> bufts;
+
 #ifdef GGML_USE_CPU_HBM
-        ggml_backend_cpu_hbm_buffer_type(),
+        bufts.push_back(ggml_backend_cpu_hbm_buffer_type());
 #endif
-        NULL
-    };
 
-    return bufts;
+#ifdef GGML_USE_CPU_AARCH64
+        bufts.push_back(ggml_backend_cpu_aarch64_buffer_type());
+#endif
+
+        bufts.push_back(NULL);
+
+        return bufts;
+    }();
+
+    return bufts.data();
 
     GGML_UNUSED(device);
 }
@@ -2558,6 +2647,21 @@ static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_host_ptr(ggml_b
}

static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
const struct ggml_tensor * src0 = op->src[0];
const struct ggml_tensor * src1 = op->src[1];

if (src0 && src0->buffer && ggml_backend_cpu_buft_is_aarch64(src0->buffer->buft)) {
if (op->op != GGML_OP_MUL_MAT || src0->type != GGML_TYPE_Q4_0 || ggml_aarch64_get_optimal_repack_type(src0) == GGML_TYPE_Q4_0) {
return false;
}
}

for (int i = 1; i < GGML_MAX_SRC; i++) {
if (op->src[i] && op->src[i]->buffer && ggml_backend_cpu_buft_is_aarch64(op->src[i]->buffer->buft)) {
return false;
}
}

switch (op->op) {
case GGML_OP_CPY:
return
@@ -2566,13 +2670,13 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
op->type != GGML_TYPE_IQ1_S &&
op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
case GGML_OP_MUL_MAT:
return op->src[1]->type == GGML_TYPE_F32;// FIXME || op->src[1]->type == ggml_get_type_traits(op->src[0]->type)->vec_dot_type;
return src1->type == GGML_TYPE_F32 || src1->type == ggml_get_type_traits_cpu(src0->type)->vec_dot_type;
case GGML_OP_ROPE_BACK:
return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
case GGML_OP_IM2COL_BACK:
return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
return src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32;
case GGML_OP_OUT_PROD:
return (op->src[0]->type == GGML_TYPE_F32 || ggml_is_quantized(op->src[0]->type)) && op->src[1]->type == GGML_TYPE_F32;
return (src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type)) && src1->type == GGML_TYPE_F32;
default:
return true;
}
@@ -2581,7 +2685,7 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
}

static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
return ggml_backend_buft_is_host(buft);
return ggml_backend_buft_is_host(buft) || ggml_backend_cpu_buft_is_aarch64(buft);

GGML_UNUSED(dev);
}
@@ -2652,6 +2756,9 @@ static const struct ggml_backend_reg_i ggml_backend_cpu_reg_i = {
};

ggml_backend_reg_t ggml_backend_cpu_reg(void) {
// init CPU feature detection
ggml_cpu_init();

static struct ggml_backend_reg ggml_backend_cpu_reg = {
/* .iface = */ ggml_backend_cpu_reg_i,
/* .context = */ NULL,
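The repack selection in ggml_aarch64_get_optimal_repack_type() depends on runtime CPU feature detection, which is why ggml_backend_cpu_reg() now calls ggml_cpu_init() before returning the registry entry. The snippet below is an illustrative check of the same feature flags, under the assumptions that these query functions are exposed through ggml-cpu.h and that ggml_cpu_init() may be called directly before querying them; it is not part of the patch.

// Hedged sketch: print the CPU features that drive the Q4_0_x_x repack choice.
#include <stdio.h>
#include "ggml-cpu.h"

int main(void) {
    ggml_cpu_init(); // assumption: safe to call directly before the feature queries
    printf("neon:            %d\n", ggml_cpu_has_neon());
    printf("int8 mm (i8mm):  %d\n", ggml_cpu_has_matmul_int8());
    printf("sve:             %d (vector bytes = %d)\n", ggml_cpu_has_sve(), ggml_cpu_get_sve_cnt());
    return 0;
}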