Skip to content

Commit 4d74393

Browse files
authored
ggml: update kleidiai to v1.13.0 (ggml-org#15663)
1 parent dd89255 commit 4d74393

File tree

5 files changed

+67
-20
lines changed

5 files changed

+67
-20
lines changed

ggml/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
2-
project("ggml" C CXX)
2+
project("ggml" C CXX ASM)
33
include(CheckIncludeFileCXX)
44

55
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

ggml/src/ggml-cpu/CMakeLists.txt

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -497,9 +497,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
497497

498498
# Fetch KleidiAI sources:
499499
include(FetchContent)
500-
set(KLEIDIAI_COMMIT_TAG "v1.11.0")
500+
set(KLEIDIAI_COMMIT_TAG "v1.13.0")
501501
set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
502-
set(KLEIDIAI_ARCHIVE_MD5 "3fe9e5ab964c375c53839296eb71eaa2")
502+
set(KLEIDIAI_ARCHIVE_MD5 "d82a8de939d9814621a5ba23907bdac1")
503503

504504
if (POLICY CMP0135)
505505
cmake_policy(SET CMP0135 NEW)
@@ -555,6 +555,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
555555

556556
list(APPEND GGML_KLEIDIAI_SOURCES
557557
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.c
558+
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p4x8sb_f32_neon.c
558559
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c
559560
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c
560561
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c)
@@ -576,7 +577,8 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
576577
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c
577578
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.c
578579
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c
579-
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme.c)
580+
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme.c
581+
${KLEIDIAI_SRC}/kai/kai_common_sme_asm.S)
580582
set(PRIVATE_ARCH_FLAGS "-fno-tree-vectorize;${PRIVATE_ARCH_FLAGS}+sve+sve2")
581583
endif()
582584

ggml/src/ggml-cpu/kleidiai/kernels.cpp

Lines changed: 43 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
#include "kai_lhs_pack_bf16p2vlx2_f32_sme.h"
1616
#include "kai_lhs_quant_pack_qsi8d32p_f32.h"
17+
#include "kai_lhs_quant_pack_qsi8d32p4x8sb_f32_neon.h"
1718
#include "kai_lhs_quant_pack_qsi8d32p_f32_neon.h"
1819

1920
#include "kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme.h"
@@ -127,6 +128,12 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
127128
/* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa,
128129
/* .run_kernel = */ kai_run_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa,
129130
},
131+
/* .gemm_lhs_info = */ {
132+
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32_neon,
133+
/* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32_neon,
134+
/* .packed_size = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32_neon,
135+
/* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32_neon,
136+
},
130137
/* SME GEMV */
131138
/* .kern_info = */ {
132139
/* .get_m_step = */ kai_get_m_step_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot,
@@ -141,7 +148,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
141148
/* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot,
142149
/* .run_kernel = */ kai_run_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot,
143150
},
144-
/* .lhs_info = */ {
151+
/* .gemv_lhs_info = */ {
145152
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32_neon,
146153
/* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32_neon,
147154
/* .packed_size = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32_neon,
@@ -173,6 +180,12 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
173180
/* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
174181
/* .run_kernel = */ kai_run_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
175182
},
183+
/* .gemm_lhs_info = */ {
184+
/* .get_offset = */ kai_get_lhs_offset_lhs_pack_bf16p2vlx2_f32_sme,
185+
/* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_pack_bf16p2vlx2_f32_sme,
186+
/* .packed_size = */ kai_get_lhs_packed_size_lhs_pack_bf16p2vlx2_f32_sme,
187+
/* .pack_func = */ kai_run_lhs_pack_bf16p2vlx2_f32_sme,
188+
},
176189
/* SME GEMV */
177190
/* .kern_info = */ {
178191
/* .get_m_step = */ kai_get_m_step_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
@@ -187,7 +200,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
187200
/* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
188201
/* .run_kernel = */ kai_run_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
189202
},
190-
/* .lhs_info = */ {
203+
/* .gemv_lhs_info = */ {
191204
/* .get_offset = */ kai_get_lhs_offset_lhs_pack_bf16p2vlx2_f32_sme,
192205
/* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_pack_bf16p2vlx2_f32_sme,
193206
/* .packed_size = */ kai_get_lhs_packed_size_lhs_pack_bf16p2vlx2_f32_sme,
@@ -222,6 +235,12 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
222235
/* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod,
223236
/* .run_kernel = */ kai_run_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod,
224237
},
238+
/* .gemm_lhs_info = */ {
239+
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
240+
/* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
241+
/* .packed_size = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32,
242+
/* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
243+
},
225244
/* DOTPROD GEMV */
226245
/* .kern_info = */ {
227246
/* .get_m_step = */ kai_get_m_step_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod,
@@ -236,7 +255,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
236255
/* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod,
237256
/* .run_kernel = */ kai_run_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod,
238257
},
239-
/* .lhs_info = */ {
258+
/* .gemv_lhs_info = */ {
240259
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
241260
/* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
242261
/* .packed_size = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32,
@@ -270,6 +289,12 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
270289
/* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm,
271290
/* .run_kernel = */ kai_run_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm,
272291
},
292+
/* .gemm_lhs_info = */ {
293+
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p4x8sb_f32_neon,
294+
/* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p4x8sb_f32_neon,
295+
/* .packed_size = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p4x8sb_f32_neon,
296+
/* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p4x8sb_f32_neon,
297+
},
273298
/* i8mm GEMV */
274299
/* .kern_info = */ {
275300
/* .get_m_step = */ kai_get_m_step_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod,
@@ -284,7 +309,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
284309
/* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod,
285310
/* .run_kernel = */ kai_run_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod,
286311
},
287-
/* .lhs_info = */ {
312+
/* .gemv_lhs_info = */ {
288313
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
289314
/* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
290315
/* .packed_size = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32,
@@ -319,6 +344,12 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
319344
/* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm,
320345
/* .run_kernel = */ kai_run_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm,
321346
},
347+
/* .gemm_lhs_info = */ {
348+
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p4x8sb_f32_neon,
349+
/* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p4x8sb_f32_neon,
350+
/* .packed_size = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p4x8sb_f32_neon,
351+
/* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p4x8sb_f32_neon,
352+
},
322353
/* i8mm GEMV */
323354
/* .kern_info = */ {
324355
/* .get_m_step = */ kai_get_m_step_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod,
@@ -333,7 +364,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
333364
/* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod,
334365
/* .run_kernel = */ kai_run_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod,
335366
},
336-
/* .lhs_info = */ {
367+
/* .gemv_lhs_info = */ {
337368
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
338369
/* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
339370
/* .packed_size = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32,
@@ -367,6 +398,12 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
367398
/* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod,
368399
/* .run_kernel = */ kai_run_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod,
369400
},
401+
/* .gemm_lhs_info = */ {
402+
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
403+
/* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
404+
/* .packed_size = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32,
405+
/* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
406+
},
370407
/* DOTPROD GEMV */
371408
/* .kern_info = */ {
372409
/* .get_m_step = */ kai_get_m_step_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod,
@@ -381,7 +418,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
381418
/* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod,
382419
/* .run_kernel = */ kai_run_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod,
383420
},
384-
/* .lhs_info = */ {
421+
/* .gemv_lhs_info = */ {
385422
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
386423
/* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
387424
/* .packed_size = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32,

ggml/src/ggml-cpu/kleidiai/kernels.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,8 +84,11 @@ struct rhs_packing_info {
8484

8585
struct ggml_kleidiai_kernels {
8686
kernel_info gemm;
87+
lhs_packing_info gemm_lhs_info;
88+
8789
kernel_info gemv;
88-
lhs_packing_info lhs_info;
90+
lhs_packing_info gemv_lhs_info;
91+
8992
rhs_packing_info rhs_info;
9093

9194
cpu_feature required_cpu;

ggml/src/ggml-cpu/kleidiai/kleidiai.cpp

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,9 @@ class tensor_traits : public ggml::cpu::tensor_traits {
123123
}
124124
ggml_kleidiai_kernels *kernels = ggml_kleidiai_select_kernels(ctx.features, op);
125125
GGML_ASSERT(kernels);
126-
kernel_info * kernel = op->src[1]->ne[1] == 1 ? &kernels->gemv : &kernels->gemm;
126+
bool is_gemv = op->src[1]->ne[1] == 1;
127+
kernel_info * kernel = is_gemv ? &kernels->gemv : &kernels->gemm;
128+
lhs_packing_info * lhs_info = is_gemv ? &kernels->gemv_lhs_info : &kernels->gemm_lhs_info;
127129

128130
size_t k = op->src[0]->ne[0];
129131
size_t n = op->src[0]->ne[1];
@@ -134,9 +136,9 @@ class tensor_traits : public ggml::cpu::tensor_traits {
134136
size_t sr = kernel->get_sr();
135137

136138
if (kernels->rhs_type == GGML_TYPE_Q4_0) {
137-
size = variant_call<size_t>(kernels->lhs_info.packed_size, m, k, QK4_0, mr, kr, sr);
139+
size = variant_call<size_t>(lhs_info->packed_size, m, k, QK4_0, mr, kr, sr);
138140
} else if (kernels->rhs_type == GGML_TYPE_F16) {
139-
size = variant_call<size_t>(kernels->lhs_info.packed_size, m, k, mr, kr, sr) +
141+
size = variant_call<size_t>(lhs_info->packed_size, m, k, mr, kr, sr) +
140142
variant_call<size_t>(kernels->rhs_info.packed_size, n, k) +
141143
k * n * sizeof(float) + n * sizeof(float);
142144
} else {
@@ -173,7 +175,9 @@ class tensor_traits : public ggml::cpu::tensor_traits {
173175
ggml_kleidiai_kernels *kernels = ggml_kleidiai_select_kernels(ctx.features, dst);
174176
GGML_ASSERT(kernels);
175177

176-
kernel_info * kernel = src1->ne[1] == 1 ? &kernels->gemv : &kernels->gemm;
178+
bool is_gemv = src1->ne[1] == 1;
179+
kernel_info * kernel = is_gemv ? &kernels->gemv : &kernels->gemm;
180+
lhs_packing_info * lhs_info = is_gemv ? &kernels->gemv_lhs_info : &kernels->gemm_lhs_info;
177181
GGML_ASSERT(kernel);
178182

179183
const int nth = params->nth;
@@ -198,7 +202,7 @@ class tensor_traits : public ggml::cpu::tensor_traits {
198202
const int64_t kr = static_cast<int64_t>(kernel->get_kr());
199203
const int64_t sr = static_cast<int64_t>(kernel->get_sr());
200204

201-
const size_t lhs_packed_size = variant_call<size_t>(kernels->lhs_info.packed_size, m, k, mr, kr, sr);
205+
const size_t lhs_packed_size = variant_call<size_t>(lhs_info->packed_size, m, k, mr, kr, sr);
202206
const size_t rhs_packed_size = variant_call<size_t>(kernels->rhs_info.packed_size, n, k);
203207
const size_t kxn_size = k * n * sizeof(float);
204208
const size_t bias_size = n * sizeof(float);
@@ -229,12 +233,12 @@ class tensor_traits : public ggml::cpu::tensor_traits {
229233
const int64_t num_m_per_thread = (ith == num_threads - 1) ? num_m_per_threadN_1 : num_m_per_thread0;
230234

231235
const size_t lhs_offset = variant_call<size_t>(kernels->gemm.get_lhs_offset, m_start, lhs_stride);
232-
const size_t lhs_packed_offset = variant_call<size_t>(kernels->lhs_info.get_packed_offset, m_start, k, mr, kr, sr);
236+
const size_t lhs_packed_offset = variant_call<size_t>(lhs_info->get_packed_offset, m_start, k, mr, kr, sr);
233237

234238
const void * src_ptr = static_cast<const uint8_t *>(lhs_batch) + lhs_offset;
235239
void * dst_ptr = static_cast<uint8_t *>(lhs_packed) + lhs_packed_offset;
236240

237-
variant_call<void>(kernels->lhs_info.pack_func, num_m_per_thread, k, mr, kr, sr, 0, src_ptr, lhs_stride, dst_ptr);
241+
variant_call<void>(lhs_info->pack_func, num_m_per_thread, k, mr, kr, sr, 0, src_ptr, lhs_stride, dst_ptr);
238242
}
239243
}
240244

@@ -306,8 +310,9 @@ class tensor_traits : public ggml::cpu::tensor_traits {
306310
ggml_kleidiai_kernels *kernels = ggml_kleidiai_select_kernels(ctx.features, dst);
307311
GGML_ASSERT(kernels);
308312

309-
kernel_info * kernel = src1->ne[1] == 1 ? &kernels->gemv : &kernels->gemm;
310-
lhs_packing_info * lhs_info = &kernels->lhs_info;
313+
bool is_gemv = src1->ne[1] == 1;
314+
kernel_info * kernel = is_gemv ? &kernels->gemv : &kernels->gemm;
315+
lhs_packing_info * lhs_info = is_gemv ? &kernels->gemv_lhs_info : &kernels->gemm_lhs_info;
311316

312317
GGML_ASSERT(kernel);
313318

0 commit comments

Comments
 (0)