Skip to content

Commit 09436f4

Browse files
committed
Update cmake for SME build and add alignment for SME
1 parent 02315a8 commit 09436f4

File tree

4 files changed

+30
-23
lines changed

4 files changed

+30
-23
lines changed

ggml/src/ggml-cpu/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -403,8 +403,8 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
403403
endif()
404404

405405
list(APPEND GGML_CDEF_PUBLIC GGML_USE_CPU_KLEIDIAI)
406-
set_source_files_properties("${GGML_KLEIDIAI_SOURCES}" PROPERTIES COMPILE_OPTIONS "${PRIVATE_ARCH_FLAGS}")
407-
list(APPEND GGML_CPU_SOURCES "${GGML_KLEIDIAI_SOURCES}")
406+
set_source_files_properties(${GGML_KLEIDIAI_SOURCES} PROPERTIES COMPILE_OPTIONS "${PRIVATE_ARCH_FLAGS}")
407+
list(APPEND GGML_CPU_SOURCES ${GGML_KLEIDIAI_SOURCES})
408408
endif()
409409

410410
message(STATUS "Adding CPU backend variant ${GGML_CPU_NAME}: ${ARCH_FLAGS} ${ARCH_DEFINITIONS}")

ggml/src/ggml-cpu/kleidiai/kernels.cpp

Lines changed: 25 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -51,10 +51,11 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
5151
/* .run_kernel = */ kai_run_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot,
5252
},
5353
/* .lhs_info = */ {
54-
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
55-
/* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
56-
/* .packed_size = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32_neon,
57-
/* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32_neon,
54+
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
55+
/* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
56+
/* .packed_size = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32_neon,
57+
/* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32_neon,
58+
/* .require_aligned_m_idx = */ true,
5859
},
5960
/* .rhs_info = */ {
6061
/* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon,
@@ -95,10 +96,11 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
9596
/* .run_kernel = */ kai_run_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod,
9697
},
9798
/* .lhs_info = */ {
98-
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
99-
/* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
100-
/* .packed_size = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32,
101-
/* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
99+
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
100+
/* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
101+
/* .packed_size = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32,
102+
/* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
103+
/* .require_aligned_m_idx = */ false,
102104
},
103105
/* .rhs_info = */ {
104106
/* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
@@ -138,10 +140,11 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
138140
/* .run_kernel = */ kai_run_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod,
139141
},
140142
/* .lhs_info = */ {
141-
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
142-
/* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
143-
/* .packed_size = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32,
144-
/* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
143+
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
144+
/* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
145+
/* .packed_size = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32,
146+
/* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
147+
/* .require_aligned_m_idx = */ false,
145148
},
146149
/* .rhs_info = */ {
147150
/* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
@@ -182,10 +185,11 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
182185
/* .run_kernel = */ kai_run_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod,
183186
},
184187
/* .lhs_info = */ {
185-
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
186-
/* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
187-
/* .packed_size = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32,
188-
/* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
188+
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
189+
/* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
190+
/* .packed_size = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32,
191+
/* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
192+
/* .require_aligned_m_idx = */ false,
189193
},
190194
/* .rhs_info = */ {
191195
/* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
@@ -225,10 +229,11 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
225229
/* .run_kernel = */ kai_run_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod,
226230
},
227231
/* .lhs_info = */ {
228-
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
229-
/* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
230-
/* .packed_size = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32,
231-
/* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
232+
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
233+
/* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
234+
/* .packed_size = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32,
235+
/* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
236+
/* .require_aligned_m_idx = */ false,
232237
},
233238
/* .rhs_info = */ {
234239
/* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,

ggml/src/ggml-cpu/kleidiai/kernels.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -40,6 +40,7 @@ struct lhs_packing_info {
4040
size_t (*packed_size)(size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr);
4141
void (*pack_func)(size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr, size_t m_idx_start, const float* lhs,
4242
size_t lhs_stride, void* lhs_packed);
43+
bool require_aligned_m_idx;
4344
};
4445

4546
struct rhs_packing_info {

ggml/src/ggml-cpu/kleidiai/kleidiai.cpp

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -124,7 +124,8 @@ class tensor_traits : public ggml::cpu::tensor_traits {
124124
size_t sr = kernel->get_sr();
125125

126126
// Calculate number of columns to be processed per thread
127-
const size_t num_m_per_thread = kai_roundup(m, nth) / nth;
127+
const bool use_multithread = lhs_info->require_aligned_m_idx && m <= mr ? false : true;
128+
const size_t num_m_per_thread = use_multithread ? kai_roundup(m, nth) / nth : m;
128129
const size_t m_start = ith * num_m_per_thread;
129130
size_t m_to_process = num_m_per_thread;
130131
if ((m_start + m_to_process) > m) {

0 commit comments

Comments
 (0)