
Commit b632bf0

refactor: add new buffer type for online flow
1 parent 647eb31 commit b632bf0

File tree

14 files changed: 328 additions, 912 deletions


Makefile

Lines changed: 5 additions & 0 deletions
@@ -874,6 +874,11 @@ ggml/src/ggml-cuda/%.o: \
 	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
 endif # GGML_HIPBLAS

+ifdef GGML_CPU_AARCH64
+	MK_CPPFLAGS += -DGGML_USE_CPU_AARCH64
+	MK_CFLAGS += -DGGML_USE_CPU_AARCH64
+endif
+
 ifdef GGML_METAL
 	MK_CPPFLAGS += -DGGML_USE_METAL
 	MK_LDFLAGS += -framework Foundation -framework Metal -framework MetalKit

common/arg.cpp

Lines changed: 0 additions & 7 deletions
@@ -2047,13 +2047,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             common_log_set_timestamps(common_log_main(), true);
         }
     ).set_env("LLAMA_LOG_TIMESTAMPS"));
-    add_opt(common_arg(
-        {"-rtrp", "--runtime-repack"},
-        string_format("Allow runtime requantization and repacking of Q4_0 to enable optimized GEMM and GEMV kernels (default: %d)", params.runtime_repack),
-        [](common_params & params) {
-            params.runtime_repack = true;
-        }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));

     return ctx_arg;
 }

common/common.cpp

Lines changed: 1 addition & 2 deletions
@@ -983,7 +983,7 @@ struct llama_model_params common_model_params_to_llama(const common_params & par
     mparams.main_gpu = params.main_gpu;
     mparams.split_mode = params.split_mode;
     mparams.tensor_split = params.tensor_split;
-    mparams.use_mmap = params.use_mmap && !params.runtime_repack;
+    mparams.use_mmap = params.use_mmap;
     mparams.use_mlock = params.use_mlock;
     mparams.check_tensors = params.check_tensors;
     if (params.kv_overrides.empty()) {
@@ -1056,7 +1056,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.offload_kqv = !params.no_kv_offload;
     cparams.flash_attn = params.flash_attn;
     cparams.no_perf = params.no_perf;
-    cparams.runtime_repack = params.runtime_repack;

     if (params.reranking) {
         cparams.embeddings = true;

common/common.h

Lines changed: 0 additions & 2 deletions
@@ -271,8 +271,6 @@ struct common_params {
     bool warmup = true; // warmup run
     bool check_tensors = false; // validate tensor data

-    bool runtime_repack = false; // runtime repack weight for optimized kernels
-
     std::string cache_type_k = "f16"; // KV cache data type for the K
     std::string cache_type_v = "f16"; // KV cache data type for the V

examples/llama-bench/llama-bench.cpp

Lines changed: 84 additions & 112 deletions
Large diffs are not rendered by default.

ggml/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -92,6 +92,7 @@ else()
 endif()

 option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
+option(GGML_CPU_AARCH64 "ggml: use runtime weight quantization to enable optimized GEMM/GEMV kernels for AARCH64 cpu" OFF)

 option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
 option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB})

ggml/include/ggml-backend.h

Lines changed: 1 addition & 13 deletions
@@ -305,19 +305,7 @@ extern "C" {
     GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
     GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);

-    //
-    // CPU backend
-    //
-
-    GGML_API ggml_backend_t ggml_backend_cpu_init(void);
-
-    GGML_API bool ggml_backend_is_cpu (ggml_backend_t backend);
-    GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
-    GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
-    GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
-    GGML_API void ggml_backend_cpu_set_runtime_repack(ggml_backend_t backend_cpu, bool runtime_repack);
-
-    // Create a backend buffer from an existing pointer
+    // CPU buffer types are always available
     GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
     GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);

ggml/include/ggml-cpu.h

Lines changed: 4 additions & 0 deletions
@@ -145,6 +145,10 @@ extern "C" {
     GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
 #endif

+#ifdef GGML_USE_CPU_AARCH64
+    GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void);
+#endif
+
 #ifdef __cplusplus
 }
 #endif
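
The header only exposes a new buffer type; how it gets chosen is not part of this diff. A minimal sketch follows, assuming the public headers above are on the include path: pick_weight_buft() is a hypothetical helper, not something this commit adds, and the fallback logic is an illustration of the intended use, with Q4_0 weight data written into such a buffer expected to be repacked into the interleaved layouts as it is set (the "online flow" of the commit message).

// Hedged sketch: prefer the AARCH64 online-repack buffer type when compiled in.
// pick_weight_buft() is hypothetical and not part of this commit.
#include "ggml.h"
#include "ggml-backend.h"
#include "ggml-cpu.h"

ggml_backend_buffer_type_t pick_weight_buft(void) {
#ifdef GGML_USE_CPU_AARCH64
    // buffer type whose tensors are repacked for the optimized AARCH64 kernels
    return ggml_backend_cpu_aarch64_buffer_type();
#else
    // plain CPU buffers when the feature is not compiled in
    return ggml_backend_cpu_buffer_type();
#endif
}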

ggml/src/CMakeLists.txt

Lines changed: 6 additions & 0 deletions
@@ -880,6 +880,12 @@ if (GGML_CPU_HBM)
     target_link_libraries(ggml PUBLIC memkind)
 endif()

+if (GGML_CPU_AARCH64)
+    message(STATUS "Using runtime weight quantization to enable optimized GEMM/GEMV kernels for AARCH64 cpu")
+
+    add_compile_definitions(GGML_USE_CPU_AARCH64)
+endif()
+
 if (GGML_CANN)
     if ("cann${CANN_INSTALL_DIR}" STREQUAL "cann" AND DEFINED ENV{ASCEND_TOOLKIT_HOME})
         set(CANN_INSTALL_DIR $ENV{ASCEND_TOOLKIT_HOME})
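
For context, the build plumbing above (the Makefile block and the two CMake changes) amounts to one thing: enabling GGML_CPU_AARCH64 (e.g. make GGML_CPU_AARCH64=1, or cmake -DGGML_CPU_AARCH64=ON) defines GGML_USE_CPU_AARCH64 for every translation unit, so source code can gate the new path on that macro. The tiny program below is illustrative only and not part of the commit.

/* Illustrative only: the compile-time gate the new build flag provides. */
#include <stdio.h>

int main(void) {
#ifdef GGML_USE_CPU_AARCH64
    printf("GGML_CPU_AARCH64 build: aarch64 online-repack buffer type is compiled in\n");
#else
    printf("default build: plain CPU buffer types only\n");
#endif
    return 0;
}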

ggml/src/ggml-aarch64.c

Lines changed: 49 additions & 63 deletions
@@ -3477,101 +3477,87 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
 }
 }

-static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor *t, int interleave_block, uint8_t **pmem, size_t *psize) {
+#ifdef GGML_USE_CPU_AARCH64
+static void repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * data, size_t data_size) {
     GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
     GGML_ASSERT(t->ne[0] % 8 == 0);
     GGML_ASSERT(interleave_block == 4 || interleave_block == 8);

-    // Do in-place transformation. Allocate scratch buffer
-    size_t size = sizeof(block_q4_0x4) * t->ne[0] / QK4_0;
-    if (size > *psize) {
-        uint8_t *new_mem = realloc(*pmem, size);
-        if (!new_mem) {
-            return -1;
-        }
-        *pmem = new_mem;
-        *psize = size;
-    }
-    block_q4_0x4 *dst = (block_q4_0x4*) *pmem;
-    block_q4_0 *src = (block_q4_0*) t->data;
+    block_q4_0x4 *dst = (block_q4_0x4 *)t->data;
+    const block_q4_0 *src = (const block_q4_0 *)data;
     block_q4_0 dst_tmp[4];
-    int n = t->ne[0];
     int nrow = t->ne[1]; // Number of rows
     int nrows_interleaved = 4;
     int nblocks = t->ne[0] / QK4_0;
-    for (int b = 0; b < (nrow * n); b += nrows_interleaved * n) {
-        int cnt = 0;
-        for (int64_t x = 0; x < nblocks; x++) {
-            for (int i = 0; i < nrows_interleaved; i++ ) {
+
+    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
+
+    for (int b = 0; b < nrow; b += nrows_interleaved) {
+        for (int64_t x = 0; x < nblocks; x++)
+        {
+            for (int i = 0; i < nrows_interleaved; i++) {
                 dst_tmp[i] = src[x + i * nblocks];
             }
-            dst[cnt++] = make_block_q4_0x4(dst_tmp, interleave_block, 0x88);
+            *dst++ = make_block_q4_0x4(dst_tmp, interleave_block, 0x88);
         }
-        memcpy(src, dst, size);
-        src += cnt * 4;
+        src += nrows_interleaved * nblocks;
     }
-    return 0;
+
+    GGML_UNUSED(data_size);
 }

-static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor *t, int interleave_block, uint8_t **pmem, size_t *psize) {
+static void repack_q4_0_to_q4_0_8_bl(struct ggml_tensor *t, int interleave_block, const void * data, size_t data_size) {
     GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
     GGML_ASSERT(t->ne[0] % 8 == 0);
     GGML_ASSERT(interleave_block == 8);

-    // Do in-place transformation. Allocate scratch buffer
-    size_t size = sizeof(block_q4_0x8) * t->ne[0] / QK4_0;
-    if (size > *psize) {
-        uint8_t *new_mem = realloc(*pmem, size);
-        if (!new_mem) {
-            return -1;
-        }
-        *pmem = new_mem;
-        *psize = size;
-    }
-    block_q4_0x8 *dst = (block_q4_0x8*) *pmem;
-    block_q4_0 *src = (block_q4_0*) t->data;
+    block_q4_0x8 *dst = (block_q4_0x8*)t->data;
+    const block_q4_0 *src = (const block_q4_0*) data;
     block_q4_0 dst_tmp[8];
-    int n = t->ne[0];
     int nrow = t->ne[1]; // Number of rows
     int nrows_interleaved = 8;
     int nblocks = t->ne[0] / QK4_0;
-    for (int b = 0; b < (nrow * n); b += nrows_interleaved * n) {
-        int cnt = 0;
+
+    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
+
+    for (int b = 0; b < nrow; b += nrows_interleaved) {
         for (int64_t x = 0; x < nblocks; x++) {
             for (int i = 0; i < nrows_interleaved; i++ ) {
                 dst_tmp[i] = src[x + i * nblocks];
             }
-            dst[cnt++] = make_block_q4_0x8(dst_tmp, interleave_block, 0x88);
+            *dst++ = make_block_q4_0x8(dst_tmp, interleave_block, 0x88);
         }
-        memcpy(src, dst, size);
-        src += cnt * 4;
+        src += nrows_interleaved * nblocks;
     }
-    return 0;
+
+    GGML_UNUSED(data_size);
 }

 // Prepare for optimized kernels if applicable
-void ggml_prepare_optimal_kernel(struct ggml_tensor *cur, uint8_t **pmem, size_t *psize) {
-    UNUSED(cur);
-    UNUSED(pmem);
-    UNUSED(psize);
-
+int ggml_prepare_optimal_kernel(struct ggml_tensor * cur, const void * data, size_t data_size) {
+    GGML_ASSERT(cur->type == GGML_TYPE_Q4_0);
+    int ret = -1;
 #if defined(__ARM_ARCH)
-    if (cur->type == GGML_TYPE_Q4_0) {
-        if (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0) {
-            if (repack_q4_0_to_q4_0_8_bl(cur, 8, pmem, psize) == 0) {
-                cur->type = GGML_TYPE_Q4_0_8_8;
-            }
-        }
-        else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
-            if (repack_q4_0_to_q4_0_4_bl(cur, 8, pmem, psize) == 0) {
-                cur->type = GGML_TYPE_Q4_0_4_8;
-            }
-        }
-        else if (ggml_cpu_has_neon()) {
-            if (repack_q4_0_to_q4_0_4_bl(cur, 4, pmem, psize) == 0) {
-                cur->type = GGML_TYPE_Q4_0_4_4;
-            }
-        }
+    if (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0) {
+        repack_q4_0_to_q4_0_8_bl(cur, 8, data, data_size);
+        cur->type = GGML_TYPE_Q4_0_8_8;
+        ret = 0;
+    }
+    else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
+        repack_q4_0_to_q4_0_4_bl(cur, 8, data, data_size);
+        cur->type = GGML_TYPE_Q4_0_4_8;
+        ret = 0;
+    }
+    else if (ggml_cpu_has_neon()) {
+        repack_q4_0_to_q4_0_4_bl(cur, 4, data, data_size);
+        cur->type = GGML_TYPE_Q4_0_4_4;
+        ret = 0;
     }
 #endif
+    return ret;
+
+    GGML_UNUSED(cur);
+    GGML_UNUSED(data);
+    GGML_UNUSED(data_size);
 }
+#endif
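
The refactor changes the repack contract: the tensor's own buffer is now the destination, the original Q4_0 bytes arrive through data/data_size, the scratch-buffer/realloc bookkeeping is gone, and ggml_prepare_optimal_kernel() returns 0 (after retagging cur->type) or -1 when no optimized layout applies, only being compiled when GGML_CPU_AARCH64 is enabled. A hedged sketch of a caller follows; the wrapper is hypothetical and the prototype is assumed to come from ggml-aarch64.h, none of which is shown in this diff.

// Hypothetical caller sketch (not part of the commit): try to repack a Q4_0
// weight while writing it into its destination buffer, fall back to a copy.
#include <string.h>
#include "ggml.h"
#include "ggml-aarch64.h" // assumed to declare ggml_prepare_optimal_kernel()

void set_q4_0_weight(struct ggml_tensor * t, const void * data, size_t size) {
    if (t->type == GGML_TYPE_Q4_0 && ggml_prepare_optimal_kernel(t, data, size) == 0) {
        // data was interleaved into t->data and t->type was retagged to
        // GGML_TYPE_Q4_0_4_4 / _4_8 / _8_8, matching the kernel that will run
        return;
    }
    memcpy(t->data, data, size); // no optimized layout available: plain copy
}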
