@@ -2295,24 +2295,56 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             }
         }

-        // avoid using a host buffer when using mmap
-        auto * buft_dev = ggml_backend_buft_get_device(buft);
-        if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
-            auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-            if (!cpu_dev) {
-                throw std::runtime_error("no CPU backend found");
-            }
-            buft = ggml_backend_dev_buffer_type(cpu_dev);
+        // avoid using a host buffer when using mmap
+        auto * buft_dev = ggml_backend_buft_get_device(buft);
+        if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
+            auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+            if (!cpu_dev) {
+                throw std::runtime_error("no CPU backend found");
+            }
+
+            // If enabled, prefer CPU "extra" (AMX) buffer types for weights on CPU; else use CPU default
+            ggml_backend_buffer_type_t cpu_default_buft = ggml_backend_dev_buffer_type(cpu_dev);
+            const bool prefer_cpu_extra = params.amx_enable_mmap;
+
+            if (!prefer_cpu_extra) {
+                buft = cpu_default_buft;
+            } else {
+                ggml_backend_buffer_type_t chosen = nullptr;
+
+                // Iterate available buffer types, skipping device-host buffer types
+                for (const auto & cur : *buft_list) {
+                    ggml_backend_dev_t         cur_dev  = cur.first;
+                    ggml_backend_buffer_type_t cur_buft = cur.second;
+
+                    if (cur_dev && cur_buft == ggml_backend_dev_host_buffer_type(cur_dev)) {
+                        continue;
                     }

-        if (buft != buft_list->front().second) {
-            n_moved_tensors++;
-            if (!first_moved_tensor) {
-                first_moved_tensor = t_meta;
-                first_moved_from_buft = buft_list->front().second;
-                first_moved_to_buft = buft;
+                    // Prefer CPU "extra" (non-default) if supported for this tensor/op
+                    if (cur_dev == cpu_dev && cur_buft != cpu_default_buft) {
+                        if (weight_buft_supported(hparams, t_meta, op, cur_buft, cur_dev)) {
+                            chosen = cur_buft;
+                            break;
                         }
                     }
+                }
+
+                buft = chosen ? chosen : cpu_default_buft;
+            }
+        }
+
+
+        // moved-tensors accounting (unchanged)
+        if (buft != buft_list->front().second) {
+            n_moved_tensors++;
+            if (!first_moved_tensor) {
+                first_moved_tensor = t_meta;
+                first_moved_from_buft = buft_list->front().second;
+                first_moved_to_buft = buft;
+            }
+        }
+

         ggml_context * ctx = ctx_for_buft(buft);

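For readability, the selection order encoded by the hunk above can be written as a small standalone helper. This is an illustrative sketch only, not part of the patch: the function name `pick_cpu_weight_buft_for_mmap` is hypothetical, and it assumes the names used in the diff (`buft_list_t`, `weight_buft_supported`, the ggml-backend device/buffer-type API) are visible at this point in llama-model.cpp.

```cpp
// Sketch (not part of the patch): the priority the hunk above implements when
// mmap is enabled, reusing the helpers the diff itself calls.
static ggml_backend_buffer_type_t pick_cpu_weight_buft_for_mmap(
        const llama_hparams & hparams, ggml_tensor * t_meta, ggml_op op,
        const buft_list_t & buft_list, bool amx_enable_mmap) {
    ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
    if (!cpu_dev) {
        throw std::runtime_error("no CPU backend found");
    }
    ggml_backend_buffer_type_t cpu_default_buft = ggml_backend_dev_buffer_type(cpu_dev);

    // old behavior: with mmap, always fall back to the default CPU buffer type
    if (!amx_enable_mmap) {
        return cpu_default_buft;
    }

    // new behavior: prefer a CPU "extra" buffer type (e.g. AMX) that supports this
    // tensor/op, while still skipping device host (pinned) buffer types
    for (const auto & cur : buft_list) {
        ggml_backend_dev_t         cur_dev  = cur.first;
        ggml_backend_buffer_type_t cur_buft = cur.second;

        if (cur_dev && cur_buft == ggml_backend_dev_host_buffer_type(cur_dev)) {
            continue;
        }
        if (cur_dev == cpu_dev && cur_buft != cpu_default_buft &&
            weight_buft_supported(hparams, t_meta, op, cur_buft, cur_dev)) {
            return cur_buft;
        }
    }
    return cpu_default_buft;
}
```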
@@ -19649,6 +19681,7 @@ llama_model_params llama_model_default_params() {
         /*.use_mlock       =*/ false,
         /*.check_tensors   =*/ false,
         /*.use_extra_bufts =*/ true,
+        /*.amx_enable_mmap =*/ false,
     };

     return result;
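Assuming the rest of the patch also exposes `bool amx_enable_mmap` in `struct llama_model_params` in llama.h (only the default initializer is shown in this hunk), a caller could opt in as sketched below. The model path is a placeholder, and the loader calls use the current `llama_model_load_from_file` / `llama_model_free` API.

```cpp
// Hypothetical usage sketch: keep mmap enabled while allowing CPU "extra" (AMX)
// buffer types for weights, via the new flag this patch adds (default: false).
#include "llama.h"

int main() {
    llama_model_params mparams = llama_model_default_params();
    mparams.use_mmap        = true;  // mmap stays enabled
    mparams.amx_enable_mmap = true;  // new flag from this patch (assumed to be in llama.h)

    llama_model * model = llama_model_load_from_file("/path/to/model.gguf", mparams);
    if (!model) {
        return 1;
    }

    llama_model_free(model);
    return 0;
}
```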