Skip to content

Commit 6adca19

Browse files
committed
ggml-cpu: Add CPU backend support for KleidiAI library
1 parent c07e87f commit 6adca19

19 files changed

+675
-22
lines changed

common/common.cpp

Lines changed: 2 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -1099,6 +1099,8 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
10991099
mparams.kv_overrides = params.kv_overrides.data();
11001100
}
11011101

1102+
mparams.n_threads = params.cpuparams.n_threads;
1103+
11021104
return mparams;
11031105
}
11041106

ggml/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -101,6 +101,7 @@ endif()
101101

102102
option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
103103
option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
104+
option(GGML_CPU_KLEIDIAI "ggml: use KleidiAI optimized kernels if applicable" OFF)
104105
option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
105106
option(GGML_AVX_VNNI "ggml: enable AVX-VNNI" OFF)
106107
option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB})

ggml/include/ggml-backend.h

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -189,7 +189,7 @@ extern "C" {
189189
// Set the number of threads for the backend
190190
typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
191191
// Get additional buffer types provided by the device (returns a NULL-terminated array)
192-
typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device);
192+
typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device, int n_threads);
193193
// Set the abort callback for the backend
194194
typedef void (*ggml_backend_set_abort_callback_t)(ggml_backend_t backend, ggml_abort_callback abort_callback, void * abort_callback_data);
195195
// Get a list of feature flags supported by the backend (returns a NULL-terminated array)

ggml/include/ggml-cpu.h

Lines changed: 1 addition & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -95,6 +95,7 @@ extern "C" {
9595
GGML_BACKEND_API int ggml_cpu_has_matmul_int8(void);
9696
GGML_BACKEND_API int ggml_cpu_has_sve (void);
9797
GGML_BACKEND_API int ggml_cpu_get_sve_cnt (void); // sve vector length in bytes
98+
GGML_BACKEND_API int ggml_cpu_has_sme (void);
9899
// other
99100
GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
100101
GGML_BACKEND_API int ggml_cpu_has_vsx (void);

ggml/src/ggml-cpu/CMakeLists.txt

Lines changed: 87 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -126,6 +126,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
126126
check_arm_feature(dotprod "#include <arm_neon.h>\nint main() { int8x16_t _a, _b; volatile int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }")
127127
check_arm_feature(i8mm "#include <arm_neon.h>\nint main() { int8x16_t _a, _b; volatile int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }")
128128
check_arm_feature(sve "#include <arm_sve.h>\nint main() { svfloat32_t _a, _b; volatile svfloat32_t _c = svadd_f32_z(svptrue_b8(), _a, _b); return 0; }")
129+
check_arm_feature(sme "#include <arm_sme.h>\n__arm_locally_streaming int main() { __asm__ volatile(\"smstart; smstop;\"); return 0; }")
129130

130131
list(APPEND ARCH_FLAGS "${ARM_MCPU_FLAG}${ARM_MCPU_FLAG_FIX}")
131132
else()
@@ -150,7 +151,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
150151
if (ARM_FEATURE_RESULT)
151152
message(WARNING "Failed to get ARM features")
152153
else()
153-
foreach(feature DOTPROD SVE MATMUL_INT8 FMA FP16_VECTOR_ARITHMETIC)
154+
foreach(feature DOTPROD SVE MATMUL_INT8 FMA FP16_VECTOR_ARITHMETIC SME)
154155
string(FIND "${ARM_FEATURE}" "__ARM_FEATURE_${feature} 1" feature_pos)
155156
if (NOT ${feature_pos} EQUAL -1)
156157
message(STATUS "ARM feature ${feature} enabled")
@@ -316,6 +317,91 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
316317
target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_AARCH64)
317318
endif()
318319

320+
if (GGML_CPU_KLEIDIAI)
321+
message(STATUS "Using KleidiAI optimized kernels if applicable")
322+
323+
# Disable the KleidiAI tests
324+
set(KLEIDIAI_BUILD_TESTS OFF)
325+
326+
# Fetch KleidiAI sources:
327+
include(FetchContent)
328+
set(KLEIDIAI_COMMIT_SHA "v1.2.0")
329+
set(KLEIDIAI_DOWNLOAD_URL "https://gitlab.arm.com/kleidi/kleidiai/-/archive/${KLEIDIAI_COMMIT_SHA}/kleidiai-${KLEIDIAI_COMMIT_SHA}.tar.gz")
330+
set(KLEIDIAI_ARCHIVE_MD5 "cebcb660079bf15626e7bdaecd18f49c")
331+
332+
if (POLICY CMP0135)
333+
cmake_policy(SET CMP0135 NEW)
334+
endif()
335+
336+
FetchContent_Declare(KleidiAI_Download
337+
URL ${KLEIDIAI_DOWNLOAD_URL}
338+
DOWNLOAD_EXTRACT_TIMESTAMP NEW
339+
URL_HASH MD5=${KLEIDIAI_ARCHIVE_MD5})
340+
341+
FetchContent_MakeAvailable(KleidiAI_Download)
342+
FetchContent_GetProperties(KleidiAI_Download
343+
SOURCE_DIR KLEIDIAI_SRC
344+
POPULATED KLEIDIAI_POPULATED)
345+
346+
if (NOT KLEIDIAI_POPULATED)
347+
message(FATAL_ERROR "KleidiAI source download failed.")
348+
endif()
349+
350+
add_compile_definitions(GGML_USE_CPU_KLEIDIAI)
351+
352+
# Remove kleidiai target after fetching it
353+
if (TARGET kleidiai)
354+
set_target_properties(kleidiai PROPERTIES EXCLUDE_FROM_ALL TRUE)
355+
endif()
356+
357+
list(APPEND GGML_CPU_SOURCES
358+
ggml-cpu/ggml-kleidiai/ggml-kleidiai.cpp
359+
ggml-cpu/ggml-kleidiai/kleidiai_kernels.cpp
360+
ggml-cpu/ggml-kleidiai/ggml-kleidiai.h
361+
ggml-cpu/ggml-kleidiai/kleidiai_kernels.h
362+
)
363+
364+
# KleidiAI
365+
include_directories(
366+
${KLEIDIAI_SRC}/
367+
${KLEIDIAI_SRC}/kai/
368+
${KLEIDIAI_SRC}/kai/ukernels/
369+
${KLEIDIAI_SRC}/kai/ukernels/matmul/
370+
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/
371+
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/)
372+
373+
string(FIND ${ARCH_FLAGS} "+dotprod" DOTPROD_ENABLED)
374+
string(FIND ${ARCH_FLAGS} "+i8mm" I8MM_ENABLED)
375+
string(FIND ${ARCH_FLAGS} "+sme" SME_ENABLED)
376+
377+
set(PRIVATE_ARCH_FLAGS ${ARCH_FLAGS})
378+
379+
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.c)
380+
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c)
381+
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c)
382+
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c)
383+
384+
if (NOT DOTPROD_ENABLED MATCHES -1)
385+
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c)
386+
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.c)
387+
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.c)
388+
endif()
389+
390+
if (NOT I8MM_ENABLED MATCHES -1)
391+
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm.c)
392+
endif()
393+
394+
if (NOT SME_ENABLED MATCHES -1)
395+
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c)
396+
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c)
397+
set(PRIVATE_ARCH_FLAGS "${PRIVATE_ARCH_FLAGS}+sve+sve2")
398+
endif()
399+
400+
list(APPEND GGML_CDEF_PUBLIC GGML_USE_CPU_KLEIDIAI)
401+
set_source_files_properties(${GGML_KLEIDIAI_SOURCES} PROPERTIES COMPILE_OPTIONS ${PRIVATE_ARCH_FLAGS})
402+
list(APPEND GGML_CPU_SOURCES ${GGML_KLEIDIAI_SOURCES})
403+
endif()
404+
319405
message(STATUS "Adding CPU backend variant ${GGML_CPU_NAME}: ${ARCH_FLAGS} ${ARCH_DEFINITIONS}")
320406
target_sources(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_SOURCES})
321407
target_compile_options(${GGML_CPU_NAME} PRIVATE ${ARCH_FLAGS})

ggml/src/ggml-cpu/ggml-cpu-traits.cpp

Lines changed: 2 additions & 2 deletions
Original file line number · Diff line number · Diff line change
@@ -10,7 +10,7 @@ extra_buffer_type::~extra_buffer_type() {}
1010
} // namespace ggml::cpu
1111

1212
bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) {
13-
for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
13+
for (auto extra : ggml_backend_cpu_get_extra_buffers_type(params->nth)) {
1414
if (extra && extra->context) {
1515
auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
1616
auto tensor_traits = buf_extra->get_tensor_traits(op);
@@ -23,7 +23,7 @@ bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct
2323
}
2424

2525
bool ggml_cpu_extra_work_size(int n_threads, const struct ggml_tensor * op, size_t * size) {
26-
for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
26+
for (auto extra : ggml_backend_cpu_get_extra_buffers_type(n_threads)) {
2727
if (extra && extra->context) {
2828
auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
2929
auto tensor_traits = buf_extra->get_tensor_traits(op);

ggml/src/ggml-cpu/ggml-cpu-traits.h

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -33,6 +33,6 @@ class extra_buffer_type {
3333
} // namespace ggml::cpu
3434

3535
// implemented in ggml-cpu.cpp.
36-
std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffers_type();
36+
std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffers_type(int n_threads);
3737

3838
#endif

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 29 additions & 4 deletions
Original file line number · Diff line number · Diff line change
@@ -114,7 +114,8 @@ struct ggml_arm_arch_features_type {
114114
int has_i8mm;
115115
int has_sve;
116116
int sve_cnt;
117-
} ggml_arm_arch_features = {-1, -1, -1, -1, 0};
117+
int has_sme;
118+
} ggml_arm_arch_features = {-1, -1, -1, -1, 0, -1};
118119
#endif
119120

120121

@@ -2389,15 +2390,20 @@ bool ggml_is_numa(void) {
23892390
#define HWCAP2_I8MM (1 << 13)
23902391
#endif
23912392

2393+
#if !defined(HWCAP2_SME)
2394+
#define HWCAP2_SME (1 << 23)
2395+
#endif
2396+
23922397
static void ggml_init_arm_arch_features(void) {
23932398
#if defined(__linux__) && defined(__aarch64__)
23942399
uint32_t hwcap = getauxval(AT_HWCAP);
23952400
uint32_t hwcap2 = getauxval(AT_HWCAP2);
23962401

2397-
ggml_arm_arch_features.has_neon = !!(hwcap & HWCAP_ASIMD);
2402+
ggml_arm_arch_features.has_neon = !!(hwcap & HWCAP_ASIMD);
23982403
ggml_arm_arch_features.has_dotprod = !!(hwcap & HWCAP_ASIMDDP);
2399-
ggml_arm_arch_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
2400-
ggml_arm_arch_features.has_sve = !!(hwcap & HWCAP_SVE);
2404+
ggml_arm_arch_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
2405+
ggml_arm_arch_features.has_sve = !!(hwcap & HWCAP_SVE);
2406+
ggml_arm_arch_features.has_sme = !!(hwcap2 & HWCAP2_SME);
24012407

24022408
#if defined(__ARM_FEATURE_SVE)
24032409
ggml_arm_arch_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
@@ -2420,6 +2426,11 @@ static void ggml_init_arm_arch_features(void) {
24202426
}
24212427
ggml_arm_arch_features.has_i8mm = oldp;
24222428

2429+
if (sysctlbyname("hw.optional.arm.FEAT_SME", &oldp, &size, NULL, 0) != 0) {
2430+
oldp = 0;
2431+
}
2432+
ggml_arm_arch_features.has_sme = oldp;
2433+
24232434
ggml_arm_arch_features.has_sve = 0;
24242435
ggml_arm_arch_features.sve_cnt = 0;
24252436
#else
@@ -2443,6 +2454,12 @@ static void ggml_init_arm_arch_features(void) {
24432454
ggml_arm_arch_features.has_sve = 0;
24442455
ggml_arm_arch_features.sve_cnt = 0;
24452456
#endif
2457+
2458+
#if defined(__ARM_FEATURE_SME) || defined(__ARM_FEATURE_SME2)
2459+
ggml_arm_arch_features.has_sme = 1;
2460+
#else
2461+
ggml_arm_arch_features.has_sme = 0;
2462+
#endif
24462463
#endif
24472464
}
24482465
#endif
@@ -14349,6 +14366,14 @@ int ggml_cpu_get_sve_cnt(void) {
1434914366
#endif
1435014367
}
1435114368

14369+
int ggml_cpu_has_sme(void) {
14370+
#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SME)
14371+
return ggml_arm_arch_features.has_sme;
14372+
#else
14373+
return 0;
14374+
#endif
14375+
}
14376+
1435214377
void ggml_cpu_init(void) {
1435314378
// needed to initialize f16 tables
1435414379
{

ggml/src/ggml-cpu/ggml-cpu.cpp

Lines changed: 24 additions & 6 deletions
Original file line number · Diff line number · Diff line change
@@ -14,6 +14,10 @@
1414
#include "ggml-cpu-hbm.h"
1515
#endif
1616

17+
#ifdef GGML_USE_CPU_KLEIDIAI
18+
#include "ggml-kleidiai/ggml-kleidiai.h"
19+
#endif
20+
1721
#if defined(__APPLE__)
1822
#include <sys/types.h>
1923
#include <sys/sysctl.h>
@@ -29,8 +33,8 @@
2933

3034
// ggml-backend interface
3135

32-
std::vector<ggml_backend_buffer_type_t>& ggml_backend_cpu_get_extra_buffers_type() {
33-
static std::vector<ggml_backend_buffer_type_t> bufts = []() {
36+
std::vector<ggml_backend_buffer_type_t>& ggml_backend_cpu_get_extra_buffers_type(int n_threads) {
37+
static std::vector<ggml_backend_buffer_type_t> bufts = [n_threads]() {
3438
std::vector<ggml_backend_buffer_type_t> bufts;
3539

3640
#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
@@ -39,6 +43,12 @@ std::vector<ggml_backend_buffer_type_t>& ggml_backend_cpu_get_extra_buffers_type
3943
}
4044
#endif
4145

46+
#ifdef GGML_USE_CPU_KLEIDIAI
47+
if (ggml_backend_cpu_kleidiai_buffer_type(n_threads)) {
48+
bufts.push_back(ggml_backend_cpu_kleidiai_buffer_type(n_threads));
49+
}
50+
#endif
51+
4252
#ifdef GGML_USE_CPU_AARCH64
4353
if (ggml_backend_cpu_aarch64_buffer_type()) {
4454
bufts.push_back(ggml_backend_cpu_aarch64_buffer_type());
@@ -48,19 +58,21 @@ std::vector<ggml_backend_buffer_type_t>& ggml_backend_cpu_get_extra_buffers_type
4858
bufts.push_back(NULL);
4959

5060
return bufts;
61+
62+
GGML_UNUSED(n_threads);
5163
}();
5264

5365
return bufts;
5466
}
5567

56-
static ggml_backend_buffer_type_t * ggml_backend_cpu_device_get_extra_buffers_type(ggml_backend_dev_t device) {
57-
return ggml_backend_cpu_get_extra_buffers_type().data();
68+
static ggml_backend_buffer_type_t * ggml_backend_cpu_device_get_extra_buffers_type(ggml_backend_dev_t device, int n_threads) {
69+
return ggml_backend_cpu_get_extra_buffers_type(n_threads).data();
5870

5971
GGML_UNUSED(device);
6072
}
6173

6274
static bool ggml_backend_cpu_is_extra_buffer_type(ggml_backend_buffer_type_t buft) {
63-
for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
75+
for (auto extra : ggml_backend_cpu_get_extra_buffers_type(-1)) {
6476
if (extra && extra == buft) return true;
6577
}
6678
return false;
@@ -375,7 +387,7 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
375387
}
376388

377389
// extra_buffer_op?
378-
for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
390+
for (auto extra : ggml_backend_cpu_get_extra_buffers_type(-1)) {
379391
if (extra) {
380392
auto buf_extra = (ggml::cpu::extra_buffer_type*) extra->context;
381393
if (buf_extra && buf_extra->supports_op(dev, op)) {
@@ -540,6 +552,9 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
540552
static std::string sve_cnt = std::to_string(ggml_cpu_get_sve_cnt());
541553
features.push_back({ "SVE_CNT", sve_cnt.c_str() });
542554
}
555+
if (ggml_cpu_has_sme()) {
556+
features.push_back({ "SME", "1" });
557+
}
543558
if (ggml_cpu_has_riscv_v()) {
544559
features.push_back({ "RISCV_V", "1" });
545560
}
@@ -561,6 +576,9 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
561576
#ifdef GGML_USE_OPENMP
562577
features.push_back({ "OPENMP", "1" });
563578
#endif
579+
#ifdef GGML_USE_CPU_KLEIDIAI
580+
features.push_back({ "KLEIDIAI_REPACK", "1" });
581+
#endif
564582
#ifdef GGML_USE_CPU_AARCH64
565583
features.push_back({ "AARCH64_REPACK", "1" });
566584
#endif

0 commit comments

Comments
 (0)