2 changes: 2 additions & 0 deletions common/common.cpp
@@ -1099,6 +1099,8 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
mparams.kv_overrides = params.kv_overrides.data();
}

mparams.n_threads = params.cpuparams.n_threads;

return mparams;
}
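This is the core of the llama.cpp-side change: the user-configured CPU thread count now travels into `llama_model_params`, so the backend can size load-time work (such as KleidiAI weight repacking) to the same pool used for inference. A minimal sketch of the flow, assuming only the helpers shown above and the `n_threads` field this diff adds:

```cpp
#include "common.h"

// Sketch: params.cpuparams.n_threads is the existing user-facing setting;
// mparams.n_threads is the field introduced by this change.
llama_model_params make_model_params() {
    common_params params;
    params.cpuparams.n_threads = 8;
    return common_model_params_to_llama(params); // carries n_threads = 8
}
```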

1 change: 1 addition & 0 deletions ggml/CMakeLists.txt
@@ -101,6 +101,7 @@ endif()

option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
option(GGML_CPU_KLEIDIAI "ggml: use KleidiAI optimized kernels if applicable" OFF)
option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
option(GGML_AVX_VNNI "ggml: enable AVX-VNNI" OFF)
option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB})
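The new `GGML_CPU_KLEIDIAI` option defaults to OFF, so existing builds are unaffected; opting in is a configure-time switch (e.g. `cmake -B build -DGGML_CPU_KLEIDIAI=ON`), and the fetched kernels are still gated at runtime on the CPU features detected by the backend.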
2 changes: 1 addition & 1 deletion ggml/include/ggml-backend.h
@@ -189,7 +189,7 @@ extern "C" {
// Set the number of threads for the backend
typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
// Get additional buffer types provided by the device (returns a NULL-terminated array)
typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device);
typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device, int n_threads);
// Set the abort callback for the backend
typedef void (*ggml_backend_set_abort_callback_t)(ggml_backend_t backend, ggml_abort_callback abort_callback, void * abort_callback_data);
// Get a list of feature flags supported by the backend (returns a NULL-terminated array)
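Note this widens a public backend-interface typedef: implementations of `ggml_backend_dev_get_extra_bufts_t` now receive the thread count at enumeration time. A hypothetical caller sketch; the proc-address key is an assumption based on ggml's usual naming convention:

```cpp
#include "ggml-backend.h"

// Sketch: enumerate a device's extra buffer types with an explicit thread count.
void list_extra_bufts(ggml_backend_reg_t reg, ggml_backend_dev_t dev, int n_threads) {
    auto fn = (ggml_backend_dev_get_extra_bufts_t)
        ggml_backend_reg_get_proc_address(reg, "ggml_backend_dev_get_extra_bufts");
    if (!fn) {
        return; // this backend provides no extra buffer types
    }
    // NULL-terminated array, as documented above
    for (ggml_backend_buffer_type_t * it = fn(dev, n_threads); it && *it; ++it) {
        // *it is one extra buffer type, e.g. the KleidiAI repack buffers
    }
}
```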
1 change: 1 addition & 0 deletions ggml/include/ggml-cpu.h
@@ -95,6 +95,7 @@ extern "C" {
GGML_BACKEND_API int ggml_cpu_has_matmul_int8(void);
GGML_BACKEND_API int ggml_cpu_has_sve (void);
GGML_BACKEND_API int ggml_cpu_get_sve_cnt (void); // sve vector length in bytes
GGML_BACKEND_API int ggml_cpu_has_sme (void);
// other
GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
GGML_BACKEND_API int ggml_cpu_has_vsx (void);
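The new probe follows the pattern of the neighbouring helpers; a trivial usage sketch:

```cpp
#include <cstdio>
#include "ggml-cpu.h"

// Sketch: report the ARM vector/matrix extensions detected at runtime.
void print_arm_features() {
    std::printf("SVE: %d (VL %d bytes)\n", ggml_cpu_has_sve(), ggml_cpu_get_sve_cnt());
    std::printf("SME: %d\n", ggml_cpu_has_sme());
}
```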
88 changes: 87 additions & 1 deletion ggml/src/ggml-cpu/CMakeLists.txt
@@ -126,6 +126,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
check_arm_feature(dotprod "#include <arm_neon.h>\nint main() { int8x16_t _a, _b; volatile int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }")
check_arm_feature(i8mm "#include <arm_neon.h>\nint main() { int8x16_t _a, _b; volatile int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }")
check_arm_feature(sve "#include <arm_sve.h>\nint main() { svfloat32_t _a, _b; volatile svfloat32_t _c = svadd_f32_z(svptrue_b8(), _a, _b); return 0; }")
check_arm_feature(sme "#include <arm_sme.h>\n__arm_locally_streaming int main() { __asm__ volatile(\"smstart; smstop;\"); return 0; }")

list(APPEND ARCH_FLAGS "${ARM_MCPU_FLAG}${ARM_MCPU_FLAG_FIX}")
else()
@@ -150,7 +151,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
if (ARM_FEATURE_RESULT)
message(WARNING "Failed to get ARM features")
else()
foreach(feature DOTPROD SVE MATMUL_INT8 FMA FP16_VECTOR_ARITHMETIC)
foreach(feature DOTPROD SVE MATMUL_INT8 FMA FP16_VECTOR_ARITHMETIC SME)
string(FIND "${ARM_FEATURE}" "__ARM_FEATURE_${feature} 1" feature_pos)
if (NOT ${feature_pos} EQUAL -1)
message(STATUS "ARM feature ${feature} enabled")
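The compile probe above (a `__arm_locally_streaming` function toggling `smstart`/`smstop`) and this scan of the compiler's predefined `__ARM_FEATURE_*` macros are alternative detection paths; adding `SME` to the `foreach` list keeps the two in agreement.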
@@ -316,6 +317,91 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_AARCH64)
endif()

if (GGML_CPU_KLEIDIAI)
message(STATUS "Using KleidiAI optimized kernels if applicable")

# Disable the KleidiAI tests
set(KLEIDIAI_BUILD_TESTS OFF)

# Fetch KleidiAI sources:
include(FetchContent)
set(KLEIDIAI_COMMIT_SHA "v1.2.0")
set(KLEIDIAI_DOWNLOAD_URL "https://gitlab.arm.com/kleidi/kleidiai/-/archive/${KLEIDIAI_COMMIT_SHA}/kleidiai-${KLEIDIAI_COMMIT_SHA}.tar.gz")
set(KLEIDIAI_ARCHIVE_MD5 "cebcb660079bf15626e7bdaecd18f49c")

if (POLICY CMP0135)
cmake_policy(SET CMP0135 NEW)
endif()

FetchContent_Declare(KleidiAI_Download
URL ${KLEIDIAI_DOWNLOAD_URL}
DOWNLOAD_EXTRACT_TIMESTAMP NEW
URL_HASH MD5=${KLEIDIAI_ARCHIVE_MD5})

FetchContent_MakeAvailable(KleidiAI_Download)
FetchContent_GetProperties(KleidiAI_Download
SOURCE_DIR KLEIDIAI_SRC
POPULATED KLEIDIAI_POPULATED)

if (NOT KLEIDIAI_POPULATED)
message(FATAL_ERROR "KleidiAI source download failed.")
endif()

add_compile_definitions(GGML_USE_CPU_KLEIDIAI)

# Remove kleidiai target after fetching it
if (TARGET kleidiai)
set_target_properties(kleidiai PROPERTIES EXCLUDE_FROM_ALL TRUE)
endif()

list(APPEND GGML_CPU_SOURCES
ggml-cpu/ggml-kleidiai/ggml-kleidiai.cpp
ggml-cpu/ggml-kleidiai/kleidiai_kernels.cpp
ggml-cpu/ggml-kleidiai/ggml-kleidiai.h
ggml-cpu/ggml-kleidiai/kleidiai_kernels.h
)

# KleidiAI
include_directories(
${KLEIDIAI_SRC}/
${KLEIDIAI_SRC}/kai/
${KLEIDIAI_SRC}/kai/ukernels/
${KLEIDIAI_SRC}/kai/ukernels/matmul/
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/)

string(FIND ${ARCH_FLAGS} "+dotprod" DOTPROD_ENABLED)
string(FIND ${ARCH_FLAGS} "+i8mm" I8MM_ENABLED)
string(FIND ${ARCH_FLAGS} "+sme" SME_ENABLED)

set(PRIVATE_ARCH_FLAGS ${ARCH_FLAGS})

list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.c)
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c)
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c)
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c)

if (NOT DOTPROD_ENABLED MATCHES -1)
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c)
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.c)
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.c)
endif()

if (NOT I8MM_ENABLED MATCHES -1)
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm.c)
endif()

if (NOT SME_ENABLED MATCHES -1)
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c)
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c)
set(PRIVATE_ARCH_FLAGS "${PRIVATE_ARCH_FLAGS}+sve+sve2")
endif()
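Appending `+sve+sve2` here is presumably needed because the SME2 micro-kernels also emit SVE/SVE2 instructions, so the per-file flags applied via `COMPILE_OPTIONS` below must enable those extensions even when the baseline `ARCH_FLAGS` do not.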

list(APPEND GGML_CDEF_PUBLIC GGML_USE_CPU_KLEIDIAI)
Review comment (Member): GGML_CDEF_PUBLIC has been removed, and this will have no effect. If it is important to add this definition, it should be done with `target_compile_definitions`.

set_source_files_properties(${GGML_KLEIDIAI_SOURCES} PROPERTIES COMPILE_OPTIONS ${PRIVATE_ARCH_FLAGS})
list(APPEND GGML_CPU_SOURCES ${GGML_KLEIDIAI_SOURCES})
endif()

message(STATUS "Adding CPU backend variant ${GGML_CPU_NAME}: ${ARCH_FLAGS} ${ARCH_DEFINITIONS}")
target_sources(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_SOURCES})
target_compile_options(${GGML_CPU_NAME} PRIVATE ${ARCH_FLAGS})
4 changes: 2 additions & 2 deletions ggml/src/ggml-cpu/ggml-cpu-traits.cpp
@@ -10,7 +10,7 @@ extra_buffer_type::~extra_buffer_type() {}
} // namespace ggml::cpu

bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) {
for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
for (auto extra : ggml_backend_cpu_get_extra_buffers_type(params->nth)) {
if (extra && extra->context) {
auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
auto tensor_traits = buf_extra->get_tensor_traits(op);
@@ -23,7 +23,7 @@ bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct
}

bool ggml_cpu_extra_work_size(int n_threads, const struct ggml_tensor * op, size_t * size) {
for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
for (auto extra : ggml_backend_cpu_get_extra_buffers_type(n_threads)) {
if (extra && extra->context) {
auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
auto tensor_traits = buf_extra->get_tensor_traits(op);
2 changes: 1 addition & 1 deletion ggml/src/ggml-cpu/ggml-cpu-traits.h
@@ -33,6 +33,6 @@ class extra_buffer_type {
} // namespace ggml::cpu

// implemented in ggml-cpu.cpp.
std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffers_type();
std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffers_type(int n_threads);

#endif
33 changes: 29 additions & 4 deletions ggml/src/ggml-cpu/ggml-cpu.c
@@ -114,7 +114,8 @@ struct ggml_arm_arch_features_type {
int has_i8mm;
int has_sve;
int sve_cnt;
} ggml_arm_arch_features = {-1, -1, -1, -1, 0};
int has_sme;
} ggml_arm_arch_features = {-1, -1, -1, -1, 0, -1};
#endif


@@ -2389,15 +2390,20 @@ bool ggml_is_numa(void) {
#define HWCAP2_I8MM (1 << 13)
#endif

#if !defined(HWCAP2_SME)
#define HWCAP2_SME (1 << 23)
#endif

static void ggml_init_arm_arch_features(void) {
#if defined(__linux__) && defined(__aarch64__)
uint32_t hwcap = getauxval(AT_HWCAP);
uint32_t hwcap2 = getauxval(AT_HWCAP2);

ggml_arm_arch_features.has_neon = !!(hwcap & HWCAP_ASIMD);
ggml_arm_arch_features.has_neon = !!(hwcap & HWCAP_ASIMD);
ggml_arm_arch_features.has_dotprod = !!(hwcap & HWCAP_ASIMDDP);
ggml_arm_arch_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
ggml_arm_arch_features.has_sve = !!(hwcap & HWCAP_SVE);
ggml_arm_arch_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
ggml_arm_arch_features.has_sve = !!(hwcap & HWCAP_SVE);
ggml_arm_arch_features.has_sme = !!(hwcap2 & HWCAP2_SME);

#if defined(__ARM_FEATURE_SVE)
ggml_arm_arch_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
@@ -2420,6 +2426,11 @@ static void ggml_init_arm_arch_features(void) {
}
ggml_arm_arch_features.has_i8mm = oldp;

if (sysctlbyname("hw.optional.arm.FEAT_SME", &oldp, &size, NULL, 0) != 0) {
oldp = 0;
}
ggml_arm_arch_features.has_sme = oldp;

ggml_arm_arch_features.has_sve = 0;
ggml_arm_arch_features.sve_cnt = 0;
#else
@@ -2443,6 +2454,12 @@ static void ggml_init_arm_arch_features(void) {
ggml_arm_arch_features.has_sve = 0;
ggml_arm_arch_features.sve_cnt = 0;
#endif

#if defined(__ARM_FEATURE_SME) || defined(__ARM_FEATURE_SME2)
ggml_arm_arch_features.has_sme = 1;
#else
ggml_arm_arch_features.has_sme = 0;
#endif
#endif
}
#endif
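For reference, a standalone probe equivalent to the Linux path above; the bit value matches the fallback `#define` in this diff, since older libc headers may not provide `HWCAP2_SME`:

```cpp
#include <cstdio>
#include <sys/auxv.h>   // getauxval, AT_HWCAP2 (Linux/aarch64)

#ifndef HWCAP2_SME
#define HWCAP2_SME (1 << 23)   // same fallback as in this diff
#endif

int main() {
    unsigned long hwcap2 = getauxval(AT_HWCAP2);
    std::printf("SME: %s\n", (hwcap2 & HWCAP2_SME) ? "yes" : "no");
    return 0;
}
```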
@@ -14349,6 +14366,14 @@ int ggml_cpu_get_sve_cnt(void) {
#endif
}

int ggml_cpu_has_sme(void) {
#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SME)
return ggml_arm_arch_features.has_sme;
#else
return 0;
#endif
}

void ggml_cpu_init(void) {
// needed to initialize f16 tables
{
30 changes: 24 additions & 6 deletions ggml/src/ggml-cpu/ggml-cpu.cpp
@@ -14,6 +14,10 @@
#include "ggml-cpu-hbm.h"
#endif

#ifdef GGML_USE_CPU_KLEIDIAI
#include "ggml-kleidiai/ggml-kleidiai.h"
#endif

#if defined(__APPLE__)
#include <sys/types.h>
#include <sys/sysctl.h>
@@ -29,8 +33,8 @@

// ggml-backend interface

std::vector<ggml_backend_buffer_type_t>& ggml_backend_cpu_get_extra_buffers_type() {
static std::vector<ggml_backend_buffer_type_t> bufts = []() {
std::vector<ggml_backend_buffer_type_t>& ggml_backend_cpu_get_extra_buffers_type(int n_threads) {
static std::vector<ggml_backend_buffer_type_t> bufts = [n_threads]() {
std::vector<ggml_backend_buffer_type_t> bufts;

#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
@@ -39,6 +43,12 @@ std::vector<ggml_backend_buffer_type_t>& ggml_backend_cpu_get_extra_buffers_type
}
#endif

#ifdef GGML_USE_CPU_KLEIDIAI
if (ggml_backend_cpu_kleidiai_buffer_type(n_threads)) {
bufts.push_back(ggml_backend_cpu_kleidiai_buffer_type(n_threads));
}
#endif

#ifdef GGML_USE_CPU_AARCH64
if (ggml_backend_cpu_aarch64_buffer_type()) {
bufts.push_back(ggml_backend_cpu_aarch64_buffer_type());
@@ -48,19 +58,21 @@ std::vector<ggml_backend_buffer_type_t>& ggml_backend_cpu_get_extra_buffers_type
bufts.push_back(NULL);

return bufts;

GGML_UNUSED(n_threads);
}();

return bufts;
}
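One subtlety: the list lives in a function-local static, so the lambda runs exactly once and the `n_threads` captured on the first call is the value the KleidiAI buffer type is created with; later calls with a different count (including the `-1` sentinel used by the query paths below) get the same cached vector.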

static ggml_backend_buffer_type_t * ggml_backend_cpu_device_get_extra_buffers_type(ggml_backend_dev_t device) {
return ggml_backend_cpu_get_extra_buffers_type().data();
static ggml_backend_buffer_type_t * ggml_backend_cpu_device_get_extra_buffers_type(ggml_backend_dev_t device, int n_threads) {
return ggml_backend_cpu_get_extra_buffers_type(n_threads).data();

GGML_UNUSED(device);
}

static bool ggml_backend_cpu_is_extra_buffer_type(ggml_backend_buffer_type_t buft) {
for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
for (auto extra : ggml_backend_cpu_get_extra_buffers_type(-1)) {
if (extra && extra == buft) return true;
}
return false;
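The `-1` here stands in for "no specific thread count": buffer-type identity checks (and the op-support query further down) must work before a thread pool is chosen, and given the one-time initialization noted above, the sentinel only takes effect if one of these query paths happens to be the very first caller.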
@@ -375,7 +387,7 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
}

// extra_buffer_op?
for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
for (auto extra : ggml_backend_cpu_get_extra_buffers_type(-1)) {
if (extra) {
auto buf_extra = (ggml::cpu::extra_buffer_type*) extra->context;
if (buf_extra && buf_extra->supports_op(dev, op)) {
@@ -540,6 +552,9 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
static std::string sve_cnt = std::to_string(ggml_cpu_get_sve_cnt());
features.push_back({ "SVE_CNT", sve_cnt.c_str() });
}
if (ggml_cpu_has_sme()) {
features.push_back({ "SME", "1" });
}
if (ggml_cpu_has_riscv_v()) {
features.push_back({ "RISCV_V", "1" });
}
@@ -561,6 +576,9 @@
#ifdef GGML_USE_OPENMP
features.push_back({ "OPENMP", "1" });
#endif
#ifdef GGML_USE_CPU_KLEIDIAI
features.push_back({ "KLEIDIAI_REPACK", "1" });
#endif
#ifdef GGML_USE_CPU_AARCH64
features.push_back({ "AARCH64_REPACK", "1" });
#endif