
Commit b632bf0

refactor: add new buffer type for online flow
1 parent 647eb31 commit b632bf0

File tree

14 files changed: 328 additions, 912 deletions


Makefile

Lines changed: 5 additions & 0 deletions
@@ -874,6 +874,11 @@ ggml/src/ggml-cuda/%.o: \
 	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
 endif # GGML_HIPBLAS

+ifdef GGML_CPU_AARCH64
+	MK_CPPFLAGS += -DGGML_USE_CPU_AARCH64
+	MK_CFLAGS += -DGGML_USE_CPU_AARCH64
+endif
+
 ifdef GGML_METAL
 	MK_CPPFLAGS += -DGGML_USE_METAL
 	MK_LDFLAGS += -framework Foundation -framework Metal -framework MetalKit

common/arg.cpp

Lines changed: 0 additions & 7 deletions
@@ -2047,13 +2047,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             common_log_set_timestamps(common_log_main(), true);
         }
     ).set_env("LLAMA_LOG_TIMESTAMPS"));
-    add_opt(common_arg(
-        {"-rtrp", "--runtime-repack"},
-        string_format("Allow runtime requantization and repacking of Q4_0 to enable optimized GEMM and GEMV kernels (default: %d)", params.runtime_repack),
-        [](common_params & params) {
-            params.runtime_repack = true;
-        }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));

     return ctx_arg;
 }

common/common.cpp

Lines changed: 1 addition & 2 deletions
@@ -983,7 +983,7 @@ struct llama_model_params common_model_params_to_llama(const common_params & par
     mparams.main_gpu = params.main_gpu;
     mparams.split_mode = params.split_mode;
     mparams.tensor_split = params.tensor_split;
-    mparams.use_mmap = params.use_mmap && !params.runtime_repack;
+    mparams.use_mmap = params.use_mmap;
     mparams.use_mlock = params.use_mlock;
     mparams.check_tensors = params.check_tensors;
     if (params.kv_overrides.empty()) {
@@ -1056,7 +1056,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.offload_kqv = !params.no_kv_offload;
     cparams.flash_attn = params.flash_attn;
     cparams.no_perf = params.no_perf;
-    cparams.runtime_repack = params.runtime_repack;

     if (params.reranking) {
         cparams.embeddings = true;

common/common.h

Lines changed: 0 additions & 2 deletions
@@ -271,8 +271,6 @@ struct common_params {
     bool warmup = true; // warmup run
     bool check_tensors = false; // validate tensor data

-    bool runtime_repack = false; // runtime repack weight for optimized kernels
-
     std::string cache_type_k = "f16"; // KV cache data type for the K
     std::string cache_type_v = "f16"; // KV cache data type for the V

examples/llama-bench/llama-bench.cpp

Lines changed: 84 additions & 112 deletions
Large diffs are not rendered by default.

ggml/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -92,6 +92,7 @@ else()
 endif()

 option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
+option(GGML_CPU_AARCH64 "ggml: use runtime weight quantization to enable optimized GEMM/GEMV kernels for AARCH64 cpu" OFF)

 option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
 option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB})

ggml/include/ggml-backend.h

Lines changed: 1 addition & 13 deletions
@@ -305,19 +305,7 @@ extern "C" {
     GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
     GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);

-    //
-    // CPU backend
-    //
-
-    GGML_API ggml_backend_t ggml_backend_cpu_init(void);
-
-    GGML_API bool ggml_backend_is_cpu (ggml_backend_t backend);
-    GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
-    GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
-    GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
-    GGML_API void ggml_backend_cpu_set_runtime_repack(ggml_backend_t backend_cpu, bool runtime_repack);
-
-    // Create a backend buffer from an existing pointer
+    // CPU buffer types are always available
     GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
     GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);

ggml/include/ggml-cpu.h

Lines changed: 4 additions & 0 deletions
@@ -145,6 +145,10 @@ extern "C" {
     GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
 #endif

+#ifdef GGML_USE_CPU_AARCH64
+    GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void);
+#endif
+
 #ifdef __cplusplus
 }
 #endif
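
The header only exposes a new buffer type; how it gets chosen is not part of this diff. A minimal sketch follows, assuming the public headers above are on the include path: pick_weight_buft() is a hypothetical helper, not something this commit adds, and the fallback logic is an illustration of the intended use, with Q4_0 weight data written into such a buffer expected to be repacked into the interleaved layouts as it is set (the "online flow" of the commit message).

// Hedged sketch: prefer the AARCH64 online-repack buffer type when compiled in.
// pick_weight_buft() is hypothetical and not part of this commit.
#include "ggml.h"
#include "ggml-backend.h"
#include "ggml-cpu.h"

ggml_backend_buffer_type_t pick_weight_buft(void) {
#ifdef GGML_USE_CPU_AARCH64
    // buffer type whose tensors are repacked for the optimized AARCH64 kernels
    return ggml_backend_cpu_aarch64_buffer_type();
#else
    // plain CPU buffers when the feature is not compiled in
    return ggml_backend_cpu_buffer_type();
#endif
}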

ggml/src/CMakeLists.txt

Lines changed: 6 additions & 0 deletions
@@ -880,6 +880,12 @@ if (GGML_CPU_HBM)
     target_link_libraries(ggml PUBLIC memkind)
 endif()

+if (GGML_CPU_AARCH64)
+    message(STATUS "Using runtime weight quantization to enable optimized GEMM/GEMV kernels for AARCH64 cpu")
+
+    add_compile_definitions(GGML_USE_CPU_AARCH64)
+endif()
+
 if (GGML_CANN)
     if ("cann${CANN_INSTALL_DIR}" STREQUAL "cann" AND DEFINED ENV{ASCEND_TOOLKIT_HOME})
         set(CANN_INSTALL_DIR $ENV{ASCEND_TOOLKIT_HOME})
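
For context, the build plumbing above (the Makefile block and the two CMake changes) amounts to one thing: enabling GGML_CPU_AARCH64 (e.g. make GGML_CPU_AARCH64=1, or cmake -DGGML_CPU_AARCH64=ON) defines GGML_USE_CPU_AARCH64 for every translation unit, so source code can gate the new path on that macro. The tiny program below is illustrative only and not part of the commit.

/* Illustrative only: the compile-time gate the new build flag provides. */
#include <stdio.h>

int main(void) {
#ifdef GGML_USE_CPU_AARCH64
    printf("GGML_CPU_AARCH64 build: aarch64 online-repack buffer type is compiled in\n");
#else
    printf("default build: plain CPU buffer types only\n");
#endif
    return 0;
}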

ggml/src/ggml-aarch64.c

Lines changed: 49 additions & 63 deletions
@@ -3477,101 +3477,87 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
 }
 }

-static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor *t, int interleave_block, uint8_t **pmem, size_t *psize) {
+#ifdef GGML_USE_CPU_AARCH64
+static void repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * data, size_t data_size) {
     GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
     GGML_ASSERT(t->ne[0] % 8 == 0);
     GGML_ASSERT(interleave_block == 4 || interleave_block == 8);

-    // Do in-place transformation. Allocate scratch buffer
-    size_t size = sizeof(block_q4_0x4) * t->ne[0] / QK4_0;
-    if (size > *psize) {
-        uint8_t *new_mem = realloc(*pmem, size);
-        if (!new_mem) {
-            return -1;
-        }
-        *pmem = new_mem;
-        *psize = size;
-    }
-    block_q4_0x4 *dst = (block_q4_0x4*) *pmem;
-    block_q4_0 *src = (block_q4_0*) t->data;
+    block_q4_0x4 *dst = (block_q4_0x4 *)t->data;
+    const block_q4_0 *src = (const block_q4_0 *)data;
     block_q4_0 dst_tmp[4];
-    int n = t->ne[0];
     int nrow = t->ne[1]; // Number of rows
     int nrows_interleaved = 4;
     int nblocks = t->ne[0] / QK4_0;
-    for (int b = 0; b < (nrow * n); b += nrows_interleaved * n) {
-        int cnt = 0;
-        for (int64_t x = 0; x < nblocks; x++) {
-            for (int i = 0; i < nrows_interleaved; i++ ) {
+
+    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
+
+    for (int b = 0; b < nrow; b += nrows_interleaved) {
+        for (int64_t x = 0; x < nblocks; x++)
+        {
+            for (int i = 0; i < nrows_interleaved; i++) {
                 dst_tmp[i] = src[x + i * nblocks];
             }
-            dst[cnt++] = make_block_q4_0x4(dst_tmp, interleave_block, 0x88);
+            *dst++ = make_block_q4_0x4(dst_tmp, interleave_block, 0x88);
         }
-        memcpy(src, dst, size);
-        src += cnt * 4;
+        src += nrows_interleaved * nblocks;
     }
-    return 0;
+
+    GGML_UNUSED(data_size);
 }

-static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor *t, int interleave_block, uint8_t **pmem, size_t *psize) {
+static void repack_q4_0_to_q4_0_8_bl(struct ggml_tensor *t, int interleave_block, const void * data, size_t data_size) {
     GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
     GGML_ASSERT(t->ne[0] % 8 == 0);
     GGML_ASSERT(interleave_block == 8);

-    // Do in-place transformation. Allocate scratch buffer
-    size_t size = sizeof(block_q4_0x8) * t->ne[0] / QK4_0;
-    if (size > *psize) {
-        uint8_t *new_mem = realloc(*pmem, size);
-        if (!new_mem) {
-            return -1;
-        }
-        *pmem = new_mem;
-        *psize = size;
-    }
-    block_q4_0x8 *dst = (block_q4_0x8*) *pmem;
-    block_q4_0 *src = (block_q4_0*) t->data;
+    block_q4_0x8 *dst = (block_q4_0x8*)t->data;
+    const block_q4_0 *src = (const block_q4_0*) data;
     block_q4_0 dst_tmp[8];
-    int n = t->ne[0];
     int nrow = t->ne[1]; // Number of rows
     int nrows_interleaved = 8;
     int nblocks = t->ne[0] / QK4_0;
-    for (int b = 0; b < (nrow * n); b += nrows_interleaved * n) {
-        int cnt = 0;
+
+    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
+
+    for (int b = 0; b < nrow; b += nrows_interleaved) {
         for (int64_t x = 0; x < nblocks; x++) {
             for (int i = 0; i < nrows_interleaved; i++ ) {
                 dst_tmp[i] = src[x + i * nblocks];
             }
-            dst[cnt++] = make_block_q4_0x8(dst_tmp, interleave_block, 0x88);
+            *dst++ = make_block_q4_0x8(dst_tmp, interleave_block, 0x88);
         }
-        memcpy(src, dst, size);
-        src += cnt * 4;
+        src += nrows_interleaved * nblocks;
     }
-    return 0;
+
+    GGML_UNUSED(data_size);
 }

 // Prepare for optimized kernels if applicable
-void ggml_prepare_optimal_kernel(struct ggml_tensor *cur, uint8_t **pmem, size_t *psize) {
-    UNUSED(cur);
-    UNUSED(pmem);
-    UNUSED(psize);
-
+int ggml_prepare_optimal_kernel(struct ggml_tensor * cur, const void * data, size_t data_size) {
+    GGML_ASSERT(cur->type == GGML_TYPE_Q4_0);
+    int ret = -1;
 #if defined(__ARM_ARCH)
-    if (cur->type == GGML_TYPE_Q4_0) {
-        if (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0) {
-            if (repack_q4_0_to_q4_0_8_bl(cur, 8, pmem, psize) == 0) {
-                cur->type = GGML_TYPE_Q4_0_8_8;
-            }
-        }
-        else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
-            if (repack_q4_0_to_q4_0_4_bl(cur, 8, pmem, psize) == 0) {
-                cur->type = GGML_TYPE_Q4_0_4_8;
-            }
-        }
-        else if (ggml_cpu_has_neon()) {
-            if (repack_q4_0_to_q4_0_4_bl(cur, 4, pmem, psize) == 0) {
-                cur->type = GGML_TYPE_Q4_0_4_4;
-            }
-        }
+    if (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0) {
+        repack_q4_0_to_q4_0_8_bl(cur, 8, data, data_size);
+        cur->type = GGML_TYPE_Q4_0_8_8;
+        ret = 0;
+    }
+    else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
+        repack_q4_0_to_q4_0_4_bl(cur, 8, data, data_size);
+        cur->type = GGML_TYPE_Q4_0_4_8;
+        ret = 0;
+    }
+    else if (ggml_cpu_has_neon()) {
+        repack_q4_0_to_q4_0_4_bl(cur, 4, data, data_size);
+        cur->type = GGML_TYPE_Q4_0_4_4;
+        ret = 0;
     }
 #endif
+    return ret;
+
+    GGML_UNUSED(cur);
+    GGML_UNUSED(data);
+    GGML_UNUSED(data_size);
 }
+#endif
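
The refactor changes the repack contract: the tensor's own buffer is now the destination, the original Q4_0 bytes arrive through data/data_size, the scratch-buffer/realloc bookkeeping is gone, and ggml_prepare_optimal_kernel() returns 0 (after retagging cur->type) or -1 when no optimized layout applies, only being compiled when GGML_CPU_AARCH64 is enabled. A hedged sketch of a caller follows; the wrapper is hypothetical and the prototype is assumed to come from ggml-aarch64.h, none of which is shown in this diff.

// Hypothetical caller sketch (not part of the commit): try to repack a Q4_0
// weight while writing it into its destination buffer, fall back to a copy.
#include <string.h>
#include "ggml.h"
#include "ggml-aarch64.h" // assumed to declare ggml_prepare_optimal_kernel()

void set_q4_0_weight(struct ggml_tensor * t, const void * data, size_t size) {
    if (t->type == GGML_TYPE_Q4_0 && ggml_prepare_optimal_kernel(t, data, size) == 0) {
        // data was interleaved into t->data and t->type was retagged to
        // GGML_TYPE_Q4_0_4_4 / _4_8 / _8_8, matching the kernel that will run
        return;
    }
    memcpy(t->data, data, size); // no optimized layout available: plain copy
}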
