@@ -2288,24 +2288,56 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 }
             }
 
-            // avoid using a host buffer when using mmap
-            auto * buft_dev = ggml_backend_buft_get_device(buft);
-            if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
-                auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-                if (!cpu_dev) {
-                    throw std::runtime_error("no CPU backend found");
-                }
-                buft = ggml_backend_dev_buffer_type(cpu_dev);
+            // avoid using a host buffer when using mmap
+            auto * buft_dev = ggml_backend_buft_get_device(buft);
+            if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
+                auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+                if (!cpu_dev) {
+                    throw std::runtime_error("no CPU backend found");
+                }
+
+                // If enabled, prefer CPU "extra" (AMX) buffer types for weights on CPU; else use CPU default
+                ggml_backend_buffer_type_t cpu_default_buft = ggml_backend_dev_buffer_type(cpu_dev);
+                const bool prefer_cpu_extra = params.amx_enable_mmap;
+
+                if (!prefer_cpu_extra) {
+                    buft = cpu_default_buft;
+                } else {
+                    ggml_backend_buffer_type_t chosen = nullptr;
+
+                    // Iterate available buffer types, skipping device-host buffer types
+                    for (const auto & cur : *buft_list) {
+                        ggml_backend_dev_t cur_dev = cur.first;
+                        ggml_backend_buffer_type_t cur_buft = cur.second;
+
+                        if (cur_dev && cur_buft == ggml_backend_dev_host_buffer_type(cur_dev)) {
+                            continue;
             }
 
-            if (buft != buft_list->front().second) {
-                n_moved_tensors++;
-                if (!first_moved_tensor) {
-                    first_moved_tensor = t_meta;
-                    first_moved_from_buft = buft_list->front().second;
-                    first_moved_to_buft = buft;
+                        // Prefer CPU "extra" (non-default) if supported for this tensor/op
+                        if (cur_dev == cpu_dev && cur_buft != cpu_default_buft) {
+                            if (weight_buft_supported(hparams, t_meta, op, cur_buft, cur_dev)) {
+                                chosen = cur_buft;
+                                break;
                 }
             }
+                    }
+
+                    buft = chosen ? chosen : cpu_default_buft;
+                }
+            }
+
+
+            // moved-tensors accounting, unchanged from the original code
+            if (buft != buft_list->front().second) {
+                n_moved_tensors++;
+                if (!first_moved_tensor) {
+                    first_moved_tensor = t_meta;
+                    first_moved_from_buft = buft_list->front().second;
+                    first_moved_to_buft = buft;
+                }
+            }
+
 
             ggml_context * ctx = ctx_for_buft(buft);
 
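Taken on its own, the selection added above boils down to: under mmap, never keep a device host (pinned) buffer type; with `amx_enable_mmap` off, fall back to the plain CPU buffer type as before; with it on, take the first non-host CPU "extra" entry (e.g. the AMX buffer type) from the tensor's buffer-type list that passes the weight support check, otherwise fall back anyway. A condensed sketch of that decision, with `supported` standing in for `weight_buft_supported()` (hypothetical helper, not code from this patch):

    // Hypothetical helper mirroring the selection logic above; not part of the patch.
    // buft_list holds (device, buffer type) pairs, as in llama_model's per-tensor list.
    #include <functional>
    #include <utility>
    #include <vector>
    #include "ggml-backend.h"

    static ggml_backend_buffer_type_t pick_cpu_weight_buft(
            const std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>> & buft_list,
            ggml_backend_dev_t cpu_dev,
            bool prefer_cpu_extra,
            const std::function<bool(ggml_backend_buffer_type_t, ggml_backend_dev_t)> & supported) {
        ggml_backend_buffer_type_t cpu_default = ggml_backend_dev_buffer_type(cpu_dev);
        if (!prefer_cpu_extra) {
            return cpu_default; // previous behavior: plain CPU buffer type under mmap
        }
        for (const auto & entry : buft_list) {
            ggml_backend_dev_t         dev  = entry.first;
            ggml_backend_buffer_type_t buft = entry.second;
            // skip device host (pinned) buffer types while mmap is in use
            if (dev && buft == ggml_backend_dev_host_buffer_type(dev)) {
                continue;
            }
            // first CPU "extra" buffer type (e.g. AMX) that supports this weight wins
            if (dev == cpu_dev && buft != cpu_default && supported(buft, dev)) {
                return buft;
            }
        }
        return cpu_default; // nothing better found
    }

The final fallback corresponds to the `buft = chosen ? chosen : cpu_default_buft;` line in the hunk.
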
@@ -19642,6 +19674,7 @@ llama_model_params llama_model_default_params() {
         /*.use_mlock       =*/ false,
         /*.check_tensors   =*/ false,
         /*.use_extra_bufts =*/ true,
+        /*.amx_enable_mmap =*/ false,
     };
 
     return result;
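
For completeness, the new flag would be consumed from application code through `llama_model_params`; a minimal usage sketch, assuming the field is also exposed in `llama.h` (only the default value is shown in this hunk) and using a placeholder model path:

    #include "llama.h"
    #include <cstdio>

    int main() {
        llama_model_params mparams = llama_model_default_params();
        mparams.use_mmap        = true;   // keep mmap-based loading
        mparams.amx_enable_mmap = true;   // opt into CPU "extra" (AMX) buffer types for mmapped weights
        llama_model * model = llama_model_load_from_file("model.gguf", mparams);
        if (model == nullptr) {
            fprintf(stderr, "failed to load model\n");
            return 1;
        }
        llama_model_free(model);
        return 0;
    }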