@@ -126,6 +126,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
126126 check_arm_feature(dotprod "#include <arm_neon.h>\n int main() { int8x16_t _a, _b; volatile int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" )
127127 check_arm_feature(i8mm "#include <arm_neon.h>\n int main() { int8x16_t _a, _b; volatile int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }" )
128128 check_arm_feature(sve "#include <arm_sve.h>\n int main() { svfloat32_t _a, _b; volatile svfloat32_t _c = svadd_f32_z(svptrue_b8(), _a, _b); return 0; }" )
129+ check_arm_feature(sme "#include <arm_sme.h>\n __arm_locally_streaming int main() { __asm__ volatile(\" smstart; smstop;\" ); return 0; }" )
129130
130131 list (APPEND ARCH_FLAGS "${ARM_MCPU_FLAG}${ARM_MCPU_FLAG_FIX} " )
131132 else ()
@@ -150,7 +151,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
150151 if (ARM_FEATURE_RESULT)
151152 message (WARNING "Failed to get ARM features" )
152153 else ()
153- foreach (feature DOTPROD SVE MATMUL_INT8 FMA FP16_VECTOR_ARITHMETIC)
154+ foreach (feature DOTPROD SVE MATMUL_INT8 FMA FP16_VECTOR_ARITHMETIC SME )
154155 string (FIND "${ARM_FEATURE} " "__ARM_FEATURE_${feature} 1" feature_pos)
155156 if (NOT ${feature_pos} EQUAL -1)
156157 message (STATUS "ARM feature ${feature} enabled" )
@@ -316,6 +317,91 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
316317 target_compile_definitions (${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_AARCH64)
317318 endif ()
318319
if (GGML_CPU_KLEIDIAI)
    # Optional KleidiAI integration: download a pinned KleidiAI release, add the
    # ggml-side glue sources plus the KleidiAI micro-kernels that match the
    # architecture features present in ARCH_FLAGS, and compile those kernels
    # with the (possibly extended) architecture flags.
    message(STATUS "Using KleidiAI optimized kernels if applicable")

    # Disable the KleidiAI tests
    set(KLEIDIAI_BUILD_TESTS OFF)

    # Fetch KleidiAI sources (the "SHA" variable currently holds a release tag):
    include(FetchContent)
    set(KLEIDIAI_COMMIT_SHA   "v1.2.0")
    set(KLEIDIAI_DOWNLOAD_URL "https://gitlab.arm.com/kleidi/kleidiai/-/archive/${KLEIDIAI_COMMIT_SHA}/kleidiai-${KLEIDIAI_COMMIT_SHA}.tar.gz")
    set(KLEIDIAI_ARCHIVE_MD5  "cebcb660079bf15626e7bdaecd18f49c")

    # CMP0135 only exists on newer CMake releases, hence the POLICY guard.
    if (POLICY CMP0135)
        cmake_policy(SET CMP0135 NEW)
    endif()

    # NOTE(review): DOWNLOAD_EXTRACT_TIMESTAMP is documented as taking a boolean;
    # confirm that "NEW" is the intended value here.
    FetchContent_Declare(KleidiAI_Download
        URL ${KLEIDIAI_DOWNLOAD_URL}
        DOWNLOAD_EXTRACT_TIMESTAMP NEW
        URL_HASH MD5=${KLEIDIAI_ARCHIVE_MD5})

    FetchContent_MakeAvailable(KleidiAI_Download)
    FetchContent_GetProperties(KleidiAI_Download
        SOURCE_DIR KLEIDIAI_SRC
        POPULATED  KLEIDIAI_POPULATED)

    if (NOT KLEIDIAI_POPULATED)
        message(FATAL_ERROR "KleidiAI source download failed.")
    endif()

    # Scope the define to the CPU backend target instead of the whole directory.
    target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_KLEIDIAI)

    # The kernels are compiled directly into the CPU backend below, so keep the
    # library target created by the fetched project out of the default build.
    if (TARGET kleidiai)
        set_target_properties(kleidiai PROPERTIES EXCLUDE_FROM_ALL TRUE)
    endif()

    list(APPEND GGML_CPU_SOURCES
        ggml-cpu/ggml-kleidiai/ggml-kleidiai.cpp
        ggml-cpu/ggml-kleidiai/kleidiai_kernels.cpp
        ggml-cpu/ggml-kleidiai/ggml-kleidiai.h
        ggml-cpu/ggml-kleidiai/kleidiai_kernels.h
        )

    # KleidiAI header search paths, limited to the CPU backend target.
    set(kleidiai_matmul_dir "${KLEIDIAI_SRC}/kai/ukernels/matmul")
    set(kleidiai_pack_dir   "${kleidiai_matmul_dir}/pack")
    set(kleidiai_qsi4_dir   "${kleidiai_matmul_dir}/matmul_clamp_f32_qsi8d32p_qsi4c32p")

    target_include_directories(${GGML_CPU_NAME} PRIVATE
        "${KLEIDIAI_SRC}"
        "${KLEIDIAI_SRC}/kai"
        "${KLEIDIAI_SRC}/kai/ukernels"
        "${kleidiai_matmul_dir}"
        "${kleidiai_qsi4_dir}"
        "${kleidiai_pack_dir}")

    # ARCH_FLAGS is a list and may be empty or hold several entries; quoting
    # guarantees string(FIND) always receives exactly one input string.
    # Each result is -1 when the feature suffix is absent.
    string(FIND "${ARCH_FLAGS}" "+dotprod" DOTPROD_ENABLED)
    string(FIND "${ARCH_FLAGS}" "+i8mm"    I8MM_ENABLED)
    string(FIND "${ARCH_FLAGS}" "+sme"     SME_ENABLED)

    set(PRIVATE_ARCH_FLAGS ${ARCH_FLAGS})

    # Packing routines are always built.
    list(APPEND GGML_KLEIDIAI_SOURCES
        "${kleidiai_pack_dir}/kai_lhs_quant_pack_qsi8d32p_f32.c"
        "${kleidiai_pack_dir}/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c"
        "${kleidiai_pack_dir}/kai_lhs_quant_pack_qsi8d32p_f32_neon.c"
        "${kleidiai_pack_dir}/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c")

    if (NOT DOTPROD_ENABLED EQUAL -1)
        list(APPEND GGML_KLEIDIAI_SOURCES
            "${kleidiai_qsi4_dir}/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c"
            "${kleidiai_qsi4_dir}/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.c"
            "${kleidiai_qsi4_dir}/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.c")
    endif()

    if (NOT I8MM_ENABLED EQUAL -1)
        list(APPEND GGML_KLEIDIAI_SOURCES
            "${kleidiai_qsi4_dir}/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm.c")
    endif()

    if (NOT SME_ENABLED EQUAL -1)
        list(APPEND GGML_KLEIDIAI_SOURCES
            "${kleidiai_qsi4_dir}/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c"
            "${kleidiai_qsi4_dir}/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c")
        # Extend the flag that carries "+sme" with the SVE suffixes, rather than
        # whichever entry happens to be last in the list.
        list(TRANSFORM PRIVATE_ARCH_FLAGS APPEND "+sve+sve2" REGEX "\\+sme")
    endif()

    list(APPEND GGML_CDEF_PUBLIC GGML_USE_CPU_KLEIDIAI)
    # Quoted so an empty or multi-entry flag list is still one property value.
    set_source_files_properties(${GGML_KLEIDIAI_SOURCES} PROPERTIES COMPILE_OPTIONS "${PRIVATE_ARCH_FLAGS}")
    list(APPEND GGML_CPU_SOURCES ${GGML_KLEIDIAI_SOURCES})
endif()
404+
319405 message (STATUS "Adding CPU backend variant ${GGML_CPU_NAME} : ${ARCH_FLAGS} ${ARCH_DEFINITIONS} " )
320406 target_sources (${GGML_CPU_NAME} PRIVATE ${GGML_CPU_SOURCES} )
321407 target_compile_options (${GGML_CPU_NAME} PRIVATE ${ARCH_FLAGS} )
0 commit comments