From c95fa362b3587d1822558f7e28414521075f254f Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Mon, 24 Mar 2025 23:05:38 +0530 Subject: [PATCH 1/7] ci: [SYCL] ggml-ci Use main GPU and enable sysman (#12547) --- ci/run.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ci/run.sh b/ci/run.sh index edf5803531466..038190a1b05be 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -52,7 +52,10 @@ if [ ! -z ${GG_BUILD_SYCL} ]; then echo "source /opt/intel/oneapi/setvars.sh" exit 1 fi - + # Use only main GPU + export ONEAPI_DEVICE_SELECTOR="level_zero:0" + # Enable sysman for correct memory reporting + export ZES_ENABLE_SYSMAN=1 CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON" fi From 2d77d88e70d017cd82c3f1a4517e3102e2028ac4 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 25 Mar 2025 09:19:23 +0200 Subject: [PATCH 2/7] context : fix worst-case reserve outputs (#12545) ggml-ci --- src/llama-context.cpp | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 5bec63e2e79ff..aa363df6356ea 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -294,10 +294,7 @@ llama_context::llama_context( // TODO: something cleaner const auto n_outputs_save = n_outputs; - // max number of outputs - n_outputs = n_tokens; - - LLAMA_LOG_DEBUG("%s: n_tokens = %d, n_seqs = %d, n_outputs = %d\n", __func__, n_tokens, n_seqs, n_outputs); + LLAMA_LOG_DEBUG("%s: worst-case: n_tokens = %d, n_seqs = %d, n_outputs = %d\n", __func__, n_tokens, n_seqs, n_outputs); int n_splits_pp = -1; int n_nodes_pp = -1; @@ -313,8 +310,15 @@ llama_context::llama_context( // reserve pp graph first so that buffers are only allocated once { llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + + // max number of outputs + n_outputs = ubatch_pp.n_tokens; + + LLAMA_LOG_DEBUG("%s: reserving graph for n_tokens = %d, n_seqs = %d\n", __func__, ubatch_pp.n_tokens, ubatch_pp.n_seqs); + auto * gf = graph_init(); graph_build(ctx_compute.get(), gf, ubatch_pp, LLM_GRAPH_TYPE_DEFAULT); + if (!ggml_backend_sched_reserve(sched.get(), gf)) { throw std::runtime_error("failed to allocate compute pp buffers"); } @@ -326,11 +330,18 @@ llama_context::llama_context( // reserve with tg graph to get the number of splits and nodes { llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + + n_outputs = ubatch_tg.n_tokens; + + LLAMA_LOG_DEBUG("%s: reserving graph for n_tokens = %d, n_seqs = %d\n", __func__, ubatch_tg.n_tokens, ubatch_tg.n_seqs); + auto * gf = graph_init(); graph_build(ctx_compute.get(), gf, ubatch_tg, LLM_GRAPH_TYPE_DEFAULT); + if (!ggml_backend_sched_reserve(sched.get(), gf)) { throw std::runtime_error("failed to allocate compute tg buffers"); } + n_splits_tg = ggml_backend_sched_get_n_splits(sched.get()); n_nodes_tg = ggml_graph_n_nodes(gf); } @@ -338,8 +349,14 @@ llama_context::llama_context( // reserve again with pp graph to avoid ggml-alloc reallocations during inference { llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + + n_outputs = ubatch_pp.n_tokens; + + LLAMA_LOG_DEBUG("%s: reserving graph for n_tokens = %d, n_seqs = %d\n", __func__, ubatch_pp.n_tokens, ubatch_pp.n_seqs); + auto * gf = graph_init(); graph_build(ctx_compute.get(), gf, ubatch_pp, LLM_GRAPH_TYPE_DEFAULT); + 
if (!ggml_backend_sched_reserve(sched.get(), gf)) { throw std::runtime_error("failed to allocate compute pp buffers"); } From 3cd3a395323fa9cdf6ecfa1fea290bf228d4e856 Mon Sep 17 00:00:00 2001 From: R0CKSTAR Date: Tue, 25 Mar 2025 15:45:08 +0800 Subject: [PATCH 3/7] ci: [MUSA] add CI and update doc (#12562) Signed-off-by: Xiaodong Ye --- ci/README.md | 39 +++++++++++++++++++++++++++++++++++++++ ci/run.sh | 11 ++++++++++- 2 files changed, 49 insertions(+), 1 deletion(-) diff --git a/ci/README.md b/ci/README.md index 8245c9df65db8..db4d9066816e8 100644 --- a/ci/README.md +++ b/ci/README.md @@ -26,4 +26,43 @@ GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt # with SYCL support source /opt/intel/oneapi/setvars.sh GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt + +# with MUSA support +GG_BUILD_MUSA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt +``` + +## Running MUSA CI in a Docker Container + +Assuming `$PWD` is the root of the `llama.cpp` repository, follow these steps to set up and run MUSA CI in a Docker container: + +### 1. Create a local directory to store cached models, configuration files and venv: + +```bash +mkdir -p $HOME/llama.cpp/ci-cache +``` + +### 2. Create a local directory to store CI run results: + +```bash +mkdir -p $HOME/llama.cpp/ci-results +``` + +### 3. Start a Docker container and run the CI: + +```bash +docker run --privileged -it \ + -v $HOME/llama.cpp/ci-cache:/ci-cache \ + -v $HOME/llama.cpp/ci-results:/ci-results \ + -v $PWD:/ws -w /ws \ + mthreads/musa:rc3.1.1-devel-ubuntu22.04 ``` + +Inside the container, execute the following commands: + +```bash +apt update -y && apt install -y cmake git python3.10-venv wget +git config --global --add safe.directory /ws +GG_BUILD_MUSA=1 bash ./ci/run.sh /ci-results /ci-cache +``` + +This setup ensures that the CI runs within an isolated Docker environment while maintaining cached files and results across runs. diff --git a/ci/run.sh b/ci/run.sh index 038190a1b05be..efc24391d2e7e 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -16,6 +16,9 @@ # # with VULKAN support # GG_BUILD_VULKAN=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt # +# # with MUSA support +# GG_BUILD_MUSA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt +# if [ -z "$2" ]; then echo "usage: $0 " @@ -62,6 +65,12 @@ fi if [ ! -z ${GG_BUILD_VULKAN} ]; then CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1" fi + +if [ ! 
-z ${GG_BUILD_MUSA} ]; then
+ # Use qy1 by default (MTT S80)
+ MUSA_ARCH=${MUSA_ARCH:-21}
+ CMAKE_EXTRA="-DGGML_MUSA=ON -DMUSA_ARCHITECTURES=${MUSA_ARCH}"
+fi
## helpers # download a file if it does not exist or if it is outdated
@@ -811,7 +820,7 @@ export LLAMA_LOG_PREFIX=1 export LLAMA_LOG_TIMESTAMPS=1 if [ -z ${GG_BUILD_LOW_PERF} ]; then
- # Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt
+ # Create symlink: ./llama.cpp/models-mnt -> $MNT/models
rm -rf ${SRC}/models-mnt mnt_models=${MNT}/models mkdir -p ${mnt_models}
From 36ee06dd2dcf8d3d1157ebe36366d7670770e75f Mon Sep 17 00:00:00 2001
From: Dan Johansson
Date: Tue, 25 Mar 2025 10:35:20 +0100
Subject: [PATCH 4/7] docs : add build instructions for KleidiAI (#12563)
Signed-off-by: Dan Johansson
---
docs/build.md | 20 ++++++++++++++++++++
1 file changed, 20 insertions(+)
diff --git a/docs/build.md b/docs/build.md
index 7b5503a1fee53..fbf12c7664d50 100644
--- a/docs/build.md
+++ b/docs/build.md
@@ -435,6 +435,26 @@ llama_new_context_with_model: CANN compute buffer size = 1260.81 MiB For detailed info, such as model/device supports, CANN install, please refer to [llama.cpp for CANN](./backend/CANN.md).
+## Arm® KleidiAI™
+KleidiAI is a library of optimized microkernels for AI workloads, specifically designed for Arm CPUs. These microkernels enhance performance and can be enabled for use by the CPU backend.
+
+To enable KleidiAI, go to the llama.cpp directory and build using CMake:
+```bash
+cmake -B build -DGGML_CPU_KLEIDIAI=ON
+cmake --build build --config Release
+```
+You can verify that KleidiAI is being used by running:
+```bash
+./build/bin/llama-cli -m PATH_TO_MODEL -p "What is a car?"
+```
+If KleidiAI is enabled, the output will contain a line similar to:
+```
+load_tensors: CPU_KLEIDIAI model buffer size = 3474.00 MiB
+```
+KleidiAI's microkernels implement optimized tensor operations using Arm CPU features such as dotprod, int8mm and SME. llama.cpp selects the most efficient kernel based on runtime CPU feature detection. However, on platforms that support SME, you must manually enable SME microkernels by setting the environment variable `GGML_KLEIDIAI_SME=1`.
+
+Depending on your build target, other higher priority backends may be enabled by default. To ensure the CPU backend is used, you must disable the higher priority backends either at compile time, e.g. `-DGGML_METAL=OFF`, or during run-time using the command line option `--device none`.
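For example, the two run-time switches mentioned above can be combined in a single invocation. This is an illustrative command, not part of the patch; the model path and prompt are the placeholders used in the snippet above.

```bash
# Illustrative only: enable the SME microkernels and force the CPU backend at run time.
GGML_KLEIDIAI_SME=1 ./build/bin/llama-cli -m PATH_TO_MODEL -p "What is a car?" --device none
```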
+ ## Android To read documentation for how to build on Android, [click here](./android.md) From e2f560175a195f63c3276972a3d1caec0bd13e05 Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Tue, 25 Mar 2025 16:10:18 +0530 Subject: [PATCH 5/7] SYCL: disable Q4_0 reorder optimization (#12560) ggml-ci --- ggml/src/ggml-sycl/ggml-sycl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index f4b68333e059b..9fa24b98098f2 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -191,7 +191,7 @@ static void ggml_check_sycl() try { if (!initialized) { g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0); - g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 0); + g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 1); g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1); GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n"); GGML_LOG_INFO("Running with Environment Variables:\n"); From 053b3f9aae63151732eccf6b7408c6418ba8746e Mon Sep 17 00:00:00 2001 From: Dan Johansson Date: Tue, 25 Mar 2025 12:10:18 +0100 Subject: [PATCH 6/7] ggml-cpu : update KleidiAI to v1.5.0 (#12568) ggml-cpu : bug fix related to KleidiAI LHS packing Signed-off-by: Dan Johansson --- ggml/src/ggml-cpu/CMakeLists.txt | 4 ++-- ggml/src/ggml-cpu/kleidiai/kernels.cpp | 9 ++------- ggml/src/ggml-cpu/kleidiai/kernels.h | 1 - ggml/src/ggml-cpu/kleidiai/kleidiai.cpp | 7 +++---- 4 files changed, 7 insertions(+), 14 deletions(-) diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index 6aa078a93ea8e..cb71e9b396089 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -359,9 +359,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name) # Fetch KleidiAI sources: include(FetchContent) - set(KLEIDIAI_COMMIT_TAG "v1.3.0") + set(KLEIDIAI_COMMIT_TAG "v1.5.0") set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz") - set(KLEIDIAI_ARCHIVE_MD5 "060bd2dc64642b091f461cc8dd7426d9") + set(KLEIDIAI_ARCHIVE_MD5 "ea22e1aefb800e9bc8c74d91633cc58e") if (POLICY CMP0135) cmake_policy(SET CMP0135 NEW) diff --git a/ggml/src/ggml-cpu/kleidiai/kernels.cpp b/ggml/src/ggml-cpu/kleidiai/kernels.cpp index a8a59a887cb51..aacc2bb5ee0f2 100644 --- a/ggml/src/ggml-cpu/kleidiai/kernels.cpp +++ b/ggml/src/ggml-cpu/kleidiai/kernels.cpp @@ -51,11 +51,10 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = { /* .run_kernel = */ kai_run_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot, }, /* .lhs_info = */ { - /* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32, - /* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32, + /* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32_neon, + /* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32_neon, /* .packed_size = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32_neon, /* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32_neon, - /* .require_aligned_m_idx = */ true, }, /* .rhs_info = */ { /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon, @@ -100,7 +99,6 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = { /* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32, /* .packed_size = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32, /* .pack_func = */ 
kai_run_lhs_quant_pack_qsi8d32p_f32, - /* .require_aligned_m_idx = */ false, }, /* .rhs_info = */ { /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0, @@ -144,7 +142,6 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = { /* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32, /* .packed_size = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32, /* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32, - /* .require_aligned_m_idx = */ false, }, /* .rhs_info = */ { /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0, @@ -189,7 +186,6 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = { /* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32, /* .packed_size = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32, /* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32, - /* .require_aligned_m_idx = */ false, }, /* .rhs_info = */ { /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0, @@ -233,7 +229,6 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = { /* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32, /* .packed_size = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32, /* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32, - /* .require_aligned_m_idx = */ false, }, /* .rhs_info = */ { /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0, diff --git a/ggml/src/ggml-cpu/kleidiai/kernels.h b/ggml/src/ggml-cpu/kleidiai/kernels.h index a0b0d149344ad..2ffe97eb42feb 100644 --- a/ggml/src/ggml-cpu/kleidiai/kernels.h +++ b/ggml/src/ggml-cpu/kleidiai/kernels.h @@ -40,7 +40,6 @@ struct lhs_packing_info { size_t (*packed_size)(size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr); void (*pack_func)(size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr, size_t m_idx_start, const float* lhs, size_t lhs_stride, void* lhs_packed); - bool require_aligned_m_idx; }; struct rhs_packing_info { diff --git a/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp b/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp index 4dff5c67ee9ad..4e89ca0faa2a7 100644 --- a/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +++ b/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp @@ -124,8 +124,7 @@ class tensor_traits : public ggml::cpu::tensor_traits { size_t sr = kernel->get_sr(); // Calculate number of columns to be processed per thread - const bool use_multithread = lhs_info->require_aligned_m_idx && m <= mr ? false : true; - const size_t num_m_per_thread = use_multithread ? 
kai_roundup(m, nth) / nth : m;
+ const size_t num_m_per_thread = kai_roundup(m, mr * nth) / nth;
const size_t m_start = ith * num_m_per_thread; size_t m_to_process = num_m_per_thread; if ((m_start + m_to_process) > m) {
@@ -135,11 +134,11 @@ class tensor_traits : public ggml::cpu::tensor_traits { if(m_start < m) { // Transform LHS const size_t src_stride = src1->nb[1];
- const float * src_ptr = reinterpret_cast<const float *>(lhs + lhs_info->get_offset(0, dst->src[1]->nb[1]));
+ const float * src_ptr = reinterpret_cast<const float *>(lhs + lhs_info->get_offset(m_start, dst->src[1]->nb[1]));
const size_t lhs_packed_offset = lhs_info->get_packed_offset(m_start, k, QK4_0, mr, kr, sr); void * lhs_packed_ptr = static_cast<void *>(lhs_packed + lhs_packed_offset);
- lhs_info->pack_func(m_to_process, k, QK4_0, mr, kr, sr, m_start, src_ptr, src_stride, lhs_packed_ptr);
+ lhs_info->pack_func(m_to_process, k, QK4_0, mr, kr, sr, 0, src_ptr, src_stride, lhs_packed_ptr);
} ggml_barrier(params->threadpool);
From ef19c71769681a0b3dde6bc90911728376e5d236 Mon Sep 17 00:00:00 2001
From: Eric Curtin
Date: Tue, 25 Mar 2025 17:46:11 +0000
Subject: [PATCH 7/7] run: de-duplicate fmt and format functions and optimize (#11596)
---
examples/run/run.cpp | 36 ++++++++++--------------------------
1 file changed, 10 insertions(+), 26 deletions(-)
diff --git a/examples/run/run.cpp b/examples/run/run.cpp
index 462a6d151933e..68e94b0b3c3f8 100644
--- a/examples/run/run.cpp
+++ b/examples/run/run.cpp
@@ -38,24 +38,6 @@ } #endif
-GGML_ATTRIBUTE_FORMAT(1, 2)
-static std::string fmt(const char * fmt, ...) {
- va_list ap;
- va_list ap2;
- va_start(ap, fmt);
- va_copy(ap2, ap);
- const int size = vsnprintf(NULL, 0, fmt, ap);
- GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
- std::string buf;
- buf.resize(size);
- const int size2 = vsnprintf(const_cast<char *>(buf.data()), buf.size() + 1, fmt, ap2);
- GGML_ASSERT(size2 == size);
- va_end(ap2);
- va_end(ap);
-
- return buf;
-}
-
GGML_ATTRIBUTE_FORMAT(1, 2) static int printe(const char * fmt, ...)
{ va_list args;
@@ -525,11 +507,11 @@ class HttpClient { int secs = static_cast<int>(seconds) % 60; if (hrs > 0) {
- return fmt("%dh %02dm %02ds", hrs, mins, secs);
+ return string_format("%dh %02dm %02ds", hrs, mins, secs);
} else if (mins > 0) {
- return fmt("%dm %02ds", mins, secs);
+ return string_format("%dm %02ds", mins, secs);
} else {
- return fmt("%ds", secs);
+ return string_format("%ds", secs);
} }
@@ -544,7 +526,7 @@ class HttpClient { } }
- return fmt("%.2f %s", dbl_size, suffix[i]);
+ return string_format("%.2f %s", dbl_size, suffix[i]);
} static int update_progress(void * ptr, curl_off_t total_to_download, curl_off_t now_downloaded, curl_off_t,
@@ -578,7 +560,9 @@ class HttpClient { return (now_downloaded_plus_file_size * 100) / total_to_download; }
- static std::string generate_progress_prefix(curl_off_t percentage) { return fmt("%3ld%% |", static_cast<long int>(percentage)); }
+ static std::string generate_progress_prefix(curl_off_t percentage) {
+ return string_format("%3ld%% |", static_cast<long int>(percentage));
+ }
static double calculate_speed(curl_off_t now_downloaded, const std::chrono::steady_clock::time_point & start_time) { const auto now = std::chrono::steady_clock::now();
@@ -589,9 +573,9 @@ class HttpClient { static std::string generate_progress_suffix(curl_off_t now_downloaded_plus_file_size, curl_off_t total_to_download, double speed, double estimated_time) { const int width = 10;
- return fmt("%*s/%*s%*s/s%*s", width, human_readable_size(now_downloaded_plus_file_size).c_str(), width,
- human_readable_size(total_to_download).c_str(), width, human_readable_size(speed).c_str(), width,
- human_readable_time(estimated_time).c_str());
+ return string_format("%*s/%*s%*s/s%*s", width, human_readable_size(now_downloaded_plus_file_size).c_str(),
+ width, human_readable_size(total_to_download).c_str(), width,
+ human_readable_size(speed).c_str(), width, human_readable_time(estimated_time).c_str());
} static int calculate_progress_bar_width(const std::string & progress_prefix, const std::string & progress_suffix) {