
Commit 119d3bf

Add environmental variable GGML_KLEIDIAI_SME
1 parent 6adca19

14 files changed, +38 -48 lines changed

common/common.cpp

Lines changed: 0 additions & 2 deletions
@@ -1099,8 +1099,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
         mparams.kv_overrides = params.kv_overrides.data();
     }
 
-    mparams.n_threads = params.cpuparams.n_threads;
-
     return mparams;
 }

ggml/include/ggml-backend.h

Lines changed: 1 addition & 1 deletion
@@ -189,7 +189,7 @@ extern "C" {
     // Set the number of threads for the backend
     typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
     // Get additional buffer types provided by the device (returns a NULL-terminated array)
-    typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device, int n_threads);
+    typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device);
     // Set the abort callback for the backend
     typedef void (*ggml_backend_set_abort_callback_t)(ggml_backend_t backend, ggml_abort_callback abort_callback, void * abort_callback_data);
     // Get a list of feature flags supported by the backend (returns a NULL-terminated array)
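
For callers, the signature change means extra buffer types are now queried per device only; the thread count is no longer part of the lookup. A minimal caller-side sketch, assuming only the declarations above; list_extra_bufts is a hypothetical helper, and get_extra_bufts stands in for a function pointer of type ggml_backend_dev_get_extra_bufts_t obtained elsewhere:

#include "ggml-backend.h"

// Hypothetical helper: walk the NULL-terminated array of extra buffer
// types that a device reports.
static void list_extra_bufts(ggml_backend_dev_get_extra_bufts_t get_extra_bufts,
                             ggml_backend_dev_t device) {
    ggml_backend_buffer_type_t * bufts = get_extra_bufts(device);
    for (; bufts && *bufts != NULL; ++bufts) {
        // each *bufts is one extra buffer type provided by the device
    }
}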

ggml/src/ggml-cpu/CMakeLists.txt

Lines changed: 3 additions & 3 deletions
@@ -325,9 +325,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
 
         # Fetch KleidiAI sources:
         include(FetchContent)
-        set(KLEIDIAI_COMMIT_SHA "v1.2.0")
-        set(KLEIDIAI_DOWNLOAD_URL "https://gitlab.arm.com/kleidi/kleidiai/-/archive/${KLEIDIAI_COMMIT_SHA}/kleidiai-${KLEIDIAI_COMMIT_SHA}.tar.gz")
-        set(KLEIDIAI_ARCHIVE_MD5 "cebcb660079bf15626e7bdaecd18f49c")
+        set(KLEIDIAI_COMMIT_TAG "v1.2.0")
+        set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
+        set(KLEIDIAI_ARCHIVE_MD5 "6634fefce7357ecfee9eace2068bc68b")
 
         if (POLICY CMP0135)
             cmake_policy(SET CMP0135 NEW)

ggml/src/ggml-cpu/ggml-cpu-traits.cpp

Lines changed: 2 additions & 2 deletions
@@ -10,7 +10,7 @@ extra_buffer_type::~extra_buffer_type() {}
 } // namespace ggml::cpu
 
 bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) {
-    for (auto extra : ggml_backend_cpu_get_extra_buffers_type(params->nth)) {
+    for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
         if (extra && extra->context) {
             auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
             auto tensor_traits = buf_extra->get_tensor_traits(op);
@@ -23,7 +23,7 @@ bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct
 }
 
 bool ggml_cpu_extra_work_size(int n_threads, const struct ggml_tensor * op, size_t * size) {
-    for (auto extra : ggml_backend_cpu_get_extra_buffers_type(n_threads)) {
+    for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
         if (extra && extra->context) {
             auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
             auto tensor_traits = buf_extra->get_tensor_traits(op);

ggml/src/ggml-cpu/ggml-cpu-traits.h

Lines changed: 1 addition & 1 deletion
@@ -33,6 +33,6 @@ class extra_buffer_type {
 } // namespace ggml::cpu
 
 // implemented in ggml-cpu.cpp.
-std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffers_type(int n_threads);
+std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffers_type();
 
 #endif

ggml/src/ggml-cpu/ggml-cpu.cpp

Lines changed: 9 additions & 11 deletions
@@ -33,8 +33,8 @@
 
 // ggml-backend interface
 
-std::vector<ggml_backend_buffer_type_t>& ggml_backend_cpu_get_extra_buffers_type(int n_threads) {
-    static std::vector<ggml_backend_buffer_type_t> bufts = [n_threads]() {
+std::vector<ggml_backend_buffer_type_t>& ggml_backend_cpu_get_extra_buffers_type() {
+    static std::vector<ggml_backend_buffer_type_t> bufts = []() {
         std::vector<ggml_backend_buffer_type_t> bufts;
 
 #if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
@@ -44,8 +44,8 @@ std::vector<ggml_backend_buffer_type_t>& ggml_backend_cpu_get_extra_buffers_type
 #endif
 
 #ifdef GGML_USE_CPU_KLEIDIAI
-        if (ggml_backend_cpu_kleidiai_buffer_type(n_threads)) {
-            bufts.push_back(ggml_backend_cpu_kleidiai_buffer_type(n_threads));
+        if (ggml_backend_cpu_kleidiai_buffer_type()) {
+            bufts.push_back(ggml_backend_cpu_kleidiai_buffer_type());
         }
 #endif
 
@@ -58,21 +58,19 @@ std::vector<ggml_backend_buffer_type_t>& ggml_backend_cpu_get_extra_buffers_type
         bufts.push_back(NULL);
 
         return bufts;
-
-        GGML_UNUSED(n_threads);
     }();
 
     return bufts;
 }
 
-static ggml_backend_buffer_type_t * ggml_backend_cpu_device_get_extra_buffers_type(ggml_backend_dev_t device, int n_threads) {
-    return ggml_backend_cpu_get_extra_buffers_type(n_threads).data();
+static ggml_backend_buffer_type_t * ggml_backend_cpu_device_get_extra_buffers_type(ggml_backend_dev_t device) {
+    return ggml_backend_cpu_get_extra_buffers_type().data();
 
     GGML_UNUSED(device);
 }
 
 static bool ggml_backend_cpu_is_extra_buffer_type(ggml_backend_buffer_type_t buft) {
-    for (auto extra : ggml_backend_cpu_get_extra_buffers_type(-1)) {
+    for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
         if (extra && extra == buft) return true;
     }
     return false;
@@ -387,7 +385,7 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
     }
 
     // extra_buffer_op?
-    for (auto extra : ggml_backend_cpu_get_extra_buffers_type(-1)) {
+    for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
         if (extra) {
             auto buf_extra = (ggml::cpu::extra_buffer_type*) extra->context;
             if (buf_extra && buf_extra->supports_op(dev, op)) {
@@ -577,7 +575,7 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
         features.push_back({ "OPENMP", "1" });
     #endif
     #ifdef GGML_USE_CPU_KLEIDIAI
-        features.push_back({ "KLEIDIAI_REPACK", "1" });
+        features.push_back({ "KLEIDIAI", "1" });
     #endif
     #ifdef GGML_USE_CPU_AARCH64
         features.push_back({ "AARCH64_REPACK", "1" });

ggml/src/ggml-cpu/ggml-kleidiai/ggml-kleidiai.cpp

Lines changed: 14 additions & 12 deletions
@@ -34,25 +34,25 @@ struct ggml_kleidiai_context {
     ggml_kleidiai_kernels * kernels;
 } static ctx = { NULL };
 
-static void init_kleidiai_context(int n_threads) {
+static void init_kleidiai_context(void) {
     static bool initialized = false;
 
     if (!initialized) {
-        GGML_ASSERT(n_threads > 0);
-
         initialized = true;
+        const char *env_var = getenv("GGML_KLEIDIAI_SME");
+        int sme_enabled = 0;
 
         cpu_feature features = (ggml_cpu_has_dotprod() ? CPU_FEATURE_DOTPROD : CPU_FEATURE_NONE) |
                                (ggml_cpu_has_matmul_int8() ? CPU_FEATURE_I8MM : CPU_FEATURE_NONE) |
                                (ggml_cpu_has_sve() ? CPU_FEATURE_SVE : CPU_FEATURE_NONE);
 
-#if defined(__APPLE__)
-        if (n_threads == 1) {
+        if (env_var) {
+            sme_enabled = atoi(env_var);
+        }
+
+        if (sme_enabled != 0) {
             features |= ggml_cpu_has_sme() ? CPU_FEATURE_SME : CPU_FEATURE_NONE;
         }
-#else
-        features |= ggml_cpu_has_sme() ? CPU_FEATURE_SME : CPU_FEATURE_NONE;
-#endif
         ctx.kernels = ggml_kleidiai_select_kernels(features);
     }
 }
@@ -162,6 +162,8 @@ class tensor_traits : public ggml::cpu::tensor_traits {
         ctx.kernels->rhs_info.pack_func(1, n, k, nr, kr, sr, k_q4_0_block_size, (const uint8_t *)data, NULL, tensor->data, 0, &params);
 
         return 0;
+
+        GGML_UNUSED(data_size);
     }
 };
 
@@ -223,7 +225,7 @@ class extra_buffer_type : ggml::cpu::extra_buffer_type {
                 op->src[0]->type == GGML_TYPE_Q4_0 &&
                 op->src[0]->buffer &&
                 (ggml_n_dims(op->src[0]) == 2) &&
-                op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type(-1) && ctx.kernels
+                op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type() && ctx.kernels
                 ) {
             if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
                 return false;
@@ -237,7 +239,7 @@ class extra_buffer_type : ggml::cpu::extra_buffer_type {
 
     ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override {
         if (op->op == GGML_OP_MUL_MAT) {
-            if (op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type(-1)) {
+            if (op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type()) {
                 return (ggml::cpu::tensor_traits *) op->src[0]->extra;
             }
         }
@@ -246,7 +248,7 @@ class extra_buffer_type : ggml::cpu::extra_buffer_type {
 };
 } // namespace ggml::cpu::kleidiai
 
-ggml_backend_buffer_type_t ggml_backend_cpu_kleidiai_buffer_type(int n_threads) {
+ggml_backend_buffer_type_t ggml_backend_cpu_kleidiai_buffer_type(void) {
     static ggml::cpu::kleidiai::extra_buffer_type ctx;
     static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_kleidiai = {
         /* .iface = */ {
@@ -261,7 +263,7 @@ ggml_backend_buffer_type_t ggml_backend_cpu_kleidiai_buffer_type(int n_threads)
         /* .context = */ &ctx,
     };
 
-    init_kleidiai_context(n_threads);
+    init_kleidiai_context();
 
     return &ggml_backend_cpu_buffer_type_kleidiai;
 }
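
With this change, SME kernels are no longer auto-selected; they are opted into at runtime through the new GGML_KLEIDIAI_SME variable, and any non-zero integer value enables them, matching the atoi parse above. A self-contained sketch of the same parse; the setenv call (POSIX) only illustrates what launching with GGML_KLEIDIAI_SME=1 in the shell environment would do:

#include <stdio.h>
#include <stdlib.h>

int main() {
    // Illustrative only: equivalent to running the binary with
    // GGML_KLEIDIAI_SME=1 already set in the environment.
    setenv("GGML_KLEIDIAI_SME", "1", 1);

    // Same parse as init_kleidiai_context(): unset or "0" leaves SME off.
    const char * env_var = getenv("GGML_KLEIDIAI_SME");
    int sme_enabled = env_var ? atoi(env_var) : 0;

    printf("KleidiAI SME kernels: %s\n", sme_enabled != 0 ? "enabled" : "disabled");
    return 0;
}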

ggml/src/ggml-cpu/ggml-kleidiai/ggml-kleidiai.h

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@
 extern "C" {
 #endif
 
-ggml_backend_buffer_type_t ggml_backend_cpu_kleidiai_buffer_type(int n_threads);
+ggml_backend_buffer_type_t ggml_backend_cpu_kleidiai_buffer_type(void);
 
 #ifdef __cplusplus
 }

include/llama.h

Lines changed: 0 additions & 2 deletions
@@ -304,8 +304,6 @@ extern "C" {
         bool use_mmap;      // use mmap if possible
         bool use_mlock;     // force system to keep model in RAM
         bool check_tensors; // validate model tensor data
-
-        int n_threads;
     };
 
     // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations

src/llama-model-loader.cpp

Lines changed: 1 addition & 3 deletions
@@ -445,8 +445,7 @@ llama_model_loader::llama_model_loader(
         std::vector<std::string> & splits,
         bool use_mmap,
         bool check_tensors,
-        const struct llama_model_kv_override * param_overrides_p,
-        int n_threads) {
+        const struct llama_model_kv_override * param_overrides_p) {
     int trace = 0;
     if (getenv("LLAMA_TRACE")) {
         trace = atoi(getenv("LLAMA_TRACE"));
@@ -684,7 +683,6 @@ llama_model_loader::llama_model_loader(
 
     this->use_mmap = use_mmap;
     this->check_tensors = check_tensors;
-    this->n_threads = n_threads;
 }
 
 std::string llama_model_loader::get_arch_name() const {
