Commit d141bd7

Merge branch 'ggml-org:master' into mradermacher
2 parents: b75d0cd + c9c64de


49 files changed (+2413, -957 lines)

.devops/intel.Dockerfile

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04
+ARG ONEAPI_VERSION=2025.1.1-0-devel-ubuntu24.04
 
 ## Build Image
.github/workflows/build.yml

Lines changed: 1 addition & 1 deletion
@@ -899,7 +899,7 @@ jobs:
         shell: bash
 
     env:
-      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b380d914-366b-4b77-a74a-05e3c38b3514/intel-oneapi-base-toolkit-2025.0.0.882_offline.exe
+      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/7cd9bba0-7aab-4e30-b3ae-2221006a4a05/intel-oneapi-base-toolkit-2025.1.1.34_offline.exe
       WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
       ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
     steps:

.github/workflows/release.yml

Lines changed: 1 addition & 1 deletion
@@ -448,7 +448,7 @@ jobs:
         shell: bash
 
     env:
-      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b380d914-366b-4b77-a74a-05e3c38b3514/intel-oneapi-base-toolkit-2025.0.0.882_offline.exe
+      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/7cd9bba0-7aab-4e30-b3ae-2221006a4a05/intel-oneapi-base-toolkit-2025.1.1.34_offline.exe
       WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
       ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
     steps:

common/arg.cpp

Lines changed: 18 additions & 0 deletions
@@ -1445,6 +1445,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.n_keep = value;
         }
     ));
+    add_opt(common_arg(
+        {"--swa-full"},
+        string_format("use full-size SWA cache (default: %s)\n"
+            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)", params.swa_full ? "true" : "false"),
+        [](common_params & params) {
+            params.swa_full = true;
+        }
+    ));
     add_opt(common_arg(
         {"--no-context-shift"},
         string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
@@ -2880,6 +2888,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.chat_template = read_file(value);
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
+    add_opt(common_arg(
+        {"--no-prefill-assistant"},
+        string_format(
+            "whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)\n"
+            "when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled\n"
+        ),
+        [](common_params & params) {
+            params.prefill_assistant = false;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_PREFILL_ASSISTANT"));
     add_opt(common_arg(
         {"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
         string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),

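Both new flags go through the standard libcommon parser and land in plain common_params fields, so example programs pick them up without extra plumbing. Below is a minimal, illustrative C++ sketch of reading them after parsing; it assumes the existing common_params_parse() helper from common/arg.h keeps its usual (argc, argv, params, example) signature, while the flag and field names themselves come from this diff:

// illustrative sketch: assumes the existing common_params_parse() helper
// from common/arg.h; the flags and fields below are the ones added by this commit
#include "arg.h"
#include "common.h"

#include <cstdio>

int main(int argc, char ** argv) {
    common_params params;

    // e.g. ./demo --swa-full --no-prefill-assistant
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SERVER)) {
        return 1;
    }

    // --swa-full             -> params.swa_full = true
    // --no-prefill-assistant -> params.prefill_assistant = false
    printf("swa_full          = %s\n", params.swa_full          ? "true" : "false");
    printf("prefill_assistant = %s\n", params.prefill_assistant ? "true" : "false");
    return 0;
}

Per the diff, --no-prefill-assistant is registered only for the server example and can also be set through the LLAMA_ARG_NO_PREFILL_ASSISTANT environment variable.
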
common/common.cpp

Lines changed: 4 additions & 0 deletions
@@ -1106,6 +1106,9 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
         mparams.tensor_buft_overrides = params.tensor_buft_overrides.data();
     }
 
+    mparams.progress_callback = params.load_progress_callback;
+    mparams.progress_callback_user_data = params.load_progress_callback_user_data;
+
     return mparams;
 }
 
@@ -1137,6 +1140,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.flash_attn = params.flash_attn;
     cparams.no_perf = params.no_perf;
     cparams.op_offload = !params.no_op_offload;
+    cparams.swa_full = params.swa_full;
 
     if (params.reranking) {
         cparams.embeddings = true;

common/common.h

Lines changed: 7 additions & 0 deletions
@@ -323,6 +323,7 @@ struct common_params {
     bool flash_attn = false; // flash attention
     bool no_perf = false; // disable performance metrics
     bool ctx_shift = true; // context shift on inifinite text generation
+    bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
 
     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
     bool use_mmap = true; // use mmap for faster loads
@@ -368,6 +369,7 @@ struct common_params {
     bool use_jinja = false; // NOLINT
     bool enable_chat_template = true;
     common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+    bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
 
     std::vector<std::string> api_keys;
 
@@ -427,6 +429,11 @@ struct common_params {
 
     // common params
     std::string out_file; // output filename for all example programs
+    // optional callback for model loading progress and cancellation:
+    // called with a progress value between 0.0 and 1.0.
+    // return false from callback to abort model loading or true to continue
+    llama_progress_callback load_progress_callback = NULL;
+    void * load_progress_callback_user_data = NULL;
 };
 
 // call once at the start of a program if it uses libcommon

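Combined with the common.cpp hunk above, these two fields let any libcommon-based program report or cancel model loading without touching llama_model_params directly. A minimal, illustrative sketch follows; it assumes llama_progress_callback keeps its bool(float progress, void * user_data) signature from llama.h and that the model is initialized through the existing common_init_from_params() helper (that helper and its common_init_result type are assumptions, not part of this diff):

// illustrative sketch: the load_progress_* fields are from this commit;
// common_init_from_params()/common_init_result are assumed common.h helpers
#include "common.h"

#include <cstdio>

// matches llama_progress_callback from llama.h: bool (*)(float, void *)
static bool on_load_progress(float progress, void * user_data) {
    const char * tag = static_cast<const char *>(user_data);
    fprintf(stderr, "\r[%s] loading: %3.0f%%", tag, progress * 100.0f);
    return true; // returning false here would cancel the load
}

int main() {
    common_params params;
    // ... set the model path and any other options as usual ...

    // new fields added to common_params by this commit
    params.load_progress_callback           = on_load_progress;
    params.load_progress_callback_user_data = (void *) "demo";

    // common_model_params_to_llama() now forwards both fields into
    // llama_model_params, so the callback fires while the model loads
    common_init_result llama_init = common_init_from_params(params);
    if (llama_init.model == nullptr) {
        fprintf(stderr, "\nmodel load failed or was cancelled\n");
        return 1;
    }

    fprintf(stderr, "\nmodel loaded\n");
    return 0;
}
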
convert_hf_to_gguf.py

Lines changed: 21 additions & 0 deletions
@@ -308,6 +308,7 @@ def prepare_tensors(self):
                 gguf.MODEL_TENSOR.TIME_MIX_LERP_FUSED,
                 gguf.MODEL_TENSOR.POSNET_NORM1,
                 gguf.MODEL_TENSOR.POSNET_NORM2,
+                gguf.MODEL_TENSOR.V_ENC_EMBD_POS,
             )
         )
         or not new_name.endswith(".weight")
@@ -2092,6 +2093,26 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
         return super().modify_tensors(data_torch, name, bid)
 
 
+@ModelBase.register("Llama4ForConditionalGeneration")
+class Llama4VisionModel(VisionModel):
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.LLAMA4)
+        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams["norm_eps"])
+        self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / self.hparams["pixel_shuffle_ratio"]))
+        assert self.hparams["hidden_act"] == "gelu"
+        self.gguf_writer.add_vision_use_gelu(True)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+        if "multi_modal_projector" in name or "vision_model" in name:
+            # process vision tensors
+            if "positional_embedding_vlm" in name and ".weight" not in name:
+                name += ".weight"
+            return [(self.map_tensor_name(name), data_torch)]
+        return []
+
+
 @ModelBase.register("Mistral3ForConditionalGeneration")
 class Mistral3Model(LlamaModel):
     model_arch = gguf.MODEL_ARCH.LLAMA

docs/backend/CANN.md

Lines changed: 74 additions & 52 deletions
@@ -56,60 +56,82 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi
 
 ## Model Supports
 
-| Model Name | FP16 | Q8_0 | Q4_0 |
+| Model Name | FP16 | Q4_0 | Q8_0 |
 |:----------------------------|:-----:|:----:|:----:|
-| AquilaChat2-7B | √ | √ | √ |
-| Baichuan-7b | √ | √ | √ |
-| Baichuan2-7B-Chat | √ | √ | √ |
-| bitnet_b1_58-large | √ | √ | √ |
-| bloom-560m | √ | x | √ |
-| bloomz-alpaca-560m | √ | x | √ |
-| c4ai-command-r-35B-v01 | x | x | x |
-| chatglm3-6B | x | x | x |
-| chinese-alpaca-2-1.3b | √ | √ | √ |
-| CodeShell-7B | √ | √ | √ |
-| deepseek-ai_deepseek-coder-1.3B-base | x | x | x |
-| deepseek-ai_DeepSeek-V2-Lite | x | x | x |
-| deepseek-coder-6.7B-instruct | x | x | x |
-| DeepSeek-V2-Lite-64x1.5B | x | x | x |
-| falcon-7b-instruct | √ | √ | √ |
-| flan-t5-large | √ | √ | √ |
-| gemma-2-9b-it | √ | √ | √ |
-| glm-4-9B | x | x | x |
-| gpt2 | √ | √ | √ |
-| Gpt2-163M | √ | √ | √ |
-| granite-3B-code-instruct | √ | √ | √ |
+| Llama-2 | √ | √ | √ |
+| Llama-3 | √ | √ | √ |
+| Mistral-7B | √ | √ | √ |
+| Mistral MOE | √ | √ | √ |
+| DBRX | - | - | - |
+| Falcon | √ | √ | √ |
+| Chinese LLaMA/Alpaca | √ | √ | √ |
+| Vigogne(French) | √ | √ | √ |
+| BERT | x | x | x |
+| Koala | √ | √ | √ |
+| Baichuan | √ | √ | √ |
+| Aquila 1 & 2 | √ | √ | √ |
+| Starcoder models | √ | √ | √ |
+| Refact | √ | √ | √ |
+| MPT | √ | √ | √ |
+| Bloom | √ | √ | √ |
+| Yi models | √ | √ | √ |
+| stablelm models | √ | √ | √ |
+| DeepSeek models | x | x | x |
+| Qwen models | √ | √ | √ |
+| PLaMo-13B | √ | √ | √ |
+| Phi models | √ | √ | √ |
+| PhiMoE | √ | √ | √ |
+| GPT-2 | √ | √ | √ |
+| Orion | √ | √ | √ |
+| InternlLM2 | √ | √ | √ |
+| CodeShell | √ | √ | √ |
+| Gemma | √ | √ | √ |
+| Mamba | √ | √ | √ |
+| Xverse | √ | √ | √ |
+| command-r models | √ | √ | √ |
+| Grok-1 | - | - | - |
+| SEA-LION | √ | √ | √ |
 | GritLM-7B | √ | √ | √ |
-| internlm2_5-7b-chat | √ | √ | √ |
-| koala-7B-HF | √ | √ | √ |
-| Llama-2-7b-chat-hf | √ | √ | √ |
-| Llama-3-Smaug-8B | √ | √ | √ |
-| Llama2-Chinese-7b-Chat | √ | √ | √ |
-| Llama3-8B | √ | √ | √ |
-| Llama3-8b-chinese | √ | √ | √ |
-| mamba-130m-hf | √ | √ | √ |
-| Mistral-7B-Instruct-v0.2 | √ | √ | √ |
-| Mixtral-8x7B-Instruct-v0.1 | x | √ | √ |
-| mpt-7B | √ | √ | √ |
-| OLMo-1B-hf | √ | √ | √ |
-| OpenELM-3B-Instruct | √ | √ | √ |
-| Orion-14b-base | √ | √ | √ |
-| phi1 | x | x | x |
-| phi2 | x | x | x |
-| Phi-3-mini-4k-instruct | √ | √ | √ |
-| plamo-13b | √ | √ | √ |
-| pythia-70M | x | x | x |
-| Qwen-7B | √ | √ | √ |
-| Qwen2-1.5B-Instruct | √ | x | √ |
-| Refact-1_6B-fim | √ | √ | √ |
-| SmolLM-135M | √ | √ | √ |
-| stablelm-zephyr | x | x | x |
-| stablelm-2-zephyr-1_6b | x | x | x |
-| starcoderbase-1b | √ | √ | √ |
-| starcoder2-3b | √ | √ | √ |
-| vigogne-7b-chat | √ | √ | √ |
-| xverse-7b-chat | √ | √ | √ |
-| Yi-6b-Chat | √ | √ | √ |
+| OLMo | √ | √ | √ |
+| OLMo 2 | √ | √ | √ |
+| OLMoE | √ | √ | √ |
+| Granite models | √ | √ | √ |
+| GPT-NeoX | √ | √ | √ |
+| Pythia | √ | √ | √ |
+| Snowflake-Arctic MoE | - | - | - |
+| Smaug | √ | √ | √ |
+| Poro 34B | √ | √ | √ |
+| Bitnet b1.58 models | √ | x | x |
+| Flan-T5 | √ | √ | √ |
+| Open Elm models | x | √ | √ |
+| chatGLM3-6B + ChatGLM4-9b + GLMEdge-1.5b + GLMEdge-4b | √ | √ | √ |
+| GLM-4-0414 | √ | √ | √ |
+| SmolLM | √ | √ | √ |
+| EXAONE-3.0-7.8B-Instruct | √ | √ | √ |
+| FalconMamba Models | √ | √ | √ |
+| Jais Models | - | x | x |
+| Bielik-11B-v2.3 | √ | √ | √ |
+| RWKV-6 | - | √ | √ |
+| QRWKV-6 | √ | √ | √ |
+| GigaChat-20B-A3B | x | x | x |
+| Trillion-7B-preview | √ | √ | √ |
+| Ling models | √ | √ | √ |
+
+
+**Multimodal**
+| Model Name | FP16 | Q4_0 | Q8_0 |
+|:----------------------------|:-----:|:----:|:----:|
+| LLaVA 1.5 models, LLaVA 1.6 models | x | x | x |
+| BakLLaVA | √ | √ | √ |
+| Obsidian | √ | - | - |
+| ShareGPT4V | x | - | - |
+| MobileVLM 1.7B/3B models | - | - | - |
+| Yi-VL | - | - | - |
+| Mini CPM | √ | √ | √ |
+| Moondream | √ | √ | √ |
+| Bunny | √ | - | - |
+| GLM-EDGE | √ | √ | √ |
+| Qwen2-VL | √ | √ | √ |
