
Commit 3152f17

Merge branch 'ggml-org:master' into mradermacher
2 parents 21f1897 + 9a390c4 commit 3152f17

File tree: 103 files changed (+4067 / -1598 lines)


.github/workflows/build.yml

Lines changed: 1 addition & 1 deletion
@@ -307,7 +307,7 @@ jobs:
        run: |
          cd build
          # This is using llvmpipe and runs slower than other backends
-         ctest -L main --verbose --timeout 2700
+         ctest -L main --verbose --timeout 3600

  ubuntu-22-cmake-hip:
    runs-on: ubuntu-22.04

.github/workflows/docker.yml

Lines changed: 1 addition & 2 deletions
@@ -42,8 +42,7 @@ jobs:
          - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
          - { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
          - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true }
-         # Note: the intel images are failing due to an out of disk space error
-         # - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
+         - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true }
          - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
          # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
          #- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: true }

README.md

Lines changed: 2 additions & 1 deletion
@@ -16,8 +16,9 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

 ## Hot topics

+- 🔥 Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
 - **GGML developer experience survey (organized and reviewed by NVIDIA):** [link](https://forms.gle/Gasw3cRgyhNEnrwK9)
-- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli`, `gemma3-cli` ([#13012](https://github.com/ggml-org/llama.cpp/pull/13012)) and `qwen2vl-cli` ([#13141]((https://github.com/ggml-org/llama.cpp/pull/13141))), `libllava` will be deprecated
+- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli`, `gemma3-cli` ([#13012](https://github.com/ggml-org/llama.cpp/pull/13012)) and `qwen2vl-cli` ([#13141](https://github.com/ggml-org/llama.cpp/pull/13141)), `libllava` will be deprecated
 - VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
 - Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639
 - Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim

common/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
@@ -119,8 +119,8 @@ if (LLAMA_LLGUIDANCE)

    ExternalProject_Add(llguidance_ext
        GIT_REPOSITORY https://github.com/guidance-ai/llguidance
-       # v0.7.10:
-       GIT_TAG 0309d2a6bf40abda35344a362edc71e06d5009f8
+       # v0.7.19 (+ fancy-regex build fix):
+       GIT_TAG b59f98f85269892a7de3d3641ad155366f13daa6
        PREFIX ${CMAKE_BINARY_DIR}/llguidance
        SOURCE_DIR ${LLGUIDANCE_SRC}
        BUILD_IN_SOURCE TRUE

common/arg.cpp

Lines changed: 21 additions & 6 deletions
@@ -40,7 +40,7 @@ using json = nlohmann::ordered_json;

 std::initializer_list<enum llama_example> mmproj_examples = {
     LLAMA_EXAMPLE_LLAVA,
-    // TODO: add LLAMA_EXAMPLE_SERVER when it's ready
+    LLAMA_EXAMPLE_SERVER,
 };

 static std::string read_file(const std::string & fname) {
@@ -2204,32 +2204,33 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
     add_opt(common_arg(
         {"--mmproj"}, "FILE",
-        "path to a multimodal projector file. see tools/mtmd/README.md",
+        "path to a multimodal projector file. see tools/mtmd/README.md\n"
+        "note: if -hf is used, this argument can be omitted",
         [](common_params & params, const std::string & value) {
             params.mmproj.path = value;
         }
-    ).set_examples(mmproj_examples));
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ"));
     add_opt(common_arg(
         {"--mmproj-url"}, "URL",
         "URL to a multimodal projector file. see tools/mtmd/README.md",
         [](common_params & params, const std::string & value) {
             params.mmproj.url = value;
         }
-    ).set_examples(mmproj_examples));
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_URL"));
     add_opt(common_arg(
         {"--no-mmproj"},
         "explicitly disable multimodal projector, useful when using -hf",
         [](common_params & params) {
             params.no_mmproj = true;
         }
-    ).set_examples(mmproj_examples));
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ"));
     add_opt(common_arg(
         {"--no-mmproj-offload"},
         "do not offload multimodal projector to GPU",
         [](common_params & params) {
             params.mmproj_use_gpu = false;
         }
-    ).set_examples(mmproj_examples));
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD"));
     add_opt(common_arg(
         {"--image"}, "FILE",
         "path to an image file. use with multimodal models. Specify multiple times for batching",
@@ -2436,6 +2437,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
         }
     ));
+    add_opt(common_arg(
+        {"--no-op-offload"},
+        string_format("disable offloading host tensor operations to device (default: %s)", params.no_op_offload ? "true" : "false"),
+        [](common_params & params) {
+            params.no_op_offload = true;
+        }
+    ));
     add_opt(common_arg(
         {"--lora"}, "FNAME",
         "path to LoRA adapter (can be repeated to use multiple adapters)",
@@ -2627,6 +2635,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.i_chunk = value;
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+    add_opt(common_arg(
+        {"--parse-special"},
+        string_format("prase special tokens (chat, tool, etc) (default: %s)", params.parse_special ? "true" : "false"),
+        [](common_params & params) {
+            params.parse_special = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
     add_opt(common_arg(
         {"-pps"},
         string_format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"),
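With `LLAMA_EXAMPLE_SERVER` added to `mmproj_examples` and the new `set_env` calls above, the multimodal projector options are now accepted by `llama-server` and can also be supplied through environment variables. A minimal sketch of launching the server that way from Python; the binary location and GGUF paths are placeholders, not part of this commit:

    import os
    import subprocess

    env = dict(os.environ)
    # Same effect as passing --mmproj on the command line (variable added in this commit).
    env["LLAMA_ARG_MMPROJ"] = "models/mmproj-model-f16.gguf"  # placeholder path

    # Placeholder binary/model paths; the server keeps running until terminated.
    server = subprocess.Popen(
        ["./build/bin/llama-server", "-m", "models/model.gguf"],
        env=env,
    )

The remaining variables introduced here (`LLAMA_ARG_MMPROJ_URL`, `LLAMA_ARG_NO_MMPROJ`, `LLAMA_ARG_NO_MMPROJ_OFFLOAD`) follow the same pattern as their flags.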

common/common.cpp

Lines changed: 1 addition & 0 deletions
@@ -1117,6 +1117,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.offload_kqv = !params.no_kv_offload;
     cparams.flash_attn = params.flash_attn;
     cparams.no_perf = params.no_perf;
+    cparams.op_offload = !params.no_op_offload;

     if (params.reranking) {
         cparams.embeddings = true;

common/common.h

Lines changed: 2 additions & 0 deletions
@@ -332,6 +332,7 @@ struct common_params {
     bool no_kv_offload = false; // disable KV offloading
     bool warmup = true; // warmup run
     bool check_tensors = false; // validate tensor data
+    bool no_op_offload = false; // globally disable offload host tensor operations to device

     bool single_turn = false; // single turn chat conversation

@@ -409,6 +410,7 @@ struct common_params {

     bool process_output = false; // collect data for the output tensor
     bool compute_ppl = true; // whether to compute perplexity
+    bool parse_special = false; // whether to parse special tokens during imatrix tokenization

     // cvector-generator params
     int n_pca_batch = 100;

common/llguidance.cpp

Lines changed: 1 addition & 0 deletions
@@ -189,6 +189,7 @@ static LlgTokenizer * llama_sampler_llg_new_tokenizer(const llama_vocab * vocab)
         /* .tokenize_fn = */ llama_sampler_llg_tokenize_fn,
         /* .use_approximate_greedy_tokenize_fn = */ false,
         /* .tokenize_user_data = */ vocab,
+        /* .slices = */ nullptr,
     };

     char error_buffer[1024];

convert_hf_to_gguf.py

Lines changed: 78 additions & 1 deletion
@@ -426,7 +426,11 @@ def load_hparams(dir_model: Path):
             logger.warning(f"Failed to load model config from {dir_model}: {e}")
             logger.warning("Trying to load config.json instead")
             with open(dir_model / "config.json", "r", encoding="utf-8") as f:
-                return json.load(f)
+                config = json.load(f)
+                if "llm_config" in config:
+                    # rename for InternVL
+                    config["text_config"] = config["llm_config"]
+                return config

     @classmethod
     def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
@@ -794,6 +798,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "0e9433cbbb161f89e264eb32e8e64bfe69e834973ffca5d41d3948a604a3e2a3":
             # ref: https://huggingface.co/mistral-community/pixtral-12b
             res = "pixtral"
+        if chkhsh == "d5f1dd6f980fec569fb218a81a7658ac45fc56b38c5a0adeb1c232fbe04ef5ec":
+            # ref: https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base
+            res = "seed-coder"

         if res is None:
             logger.warning("\n")
@@ -2606,6 +2613,11 @@ def set_gguf_parameters(self):
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         if self.hf_arch == "Qwen2Model":
             name = f"model.{name}" # map to Qwen2ForCausalLM tensors
+        if "language_model." in name:
+            name = name.replace("language_model.", "") # for InternVL
+        if name.startswith("mlp") or name.startswith("vision_model"):
+            # skip visual tensors
+            return []
         yield from super().modify_tensors(data_torch, name, bid)


@@ -2709,6 +2721,62 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         return [] # skip other tensors


+@ModelBase.register("InternVisionModel")
+class InternVisionModel(VisionModel):
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.INTERNVL)
+        self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"])
+        # hidden_act
+        if hparams["hidden_act"] == "silu":
+            self.gguf_writer.add_vision_use_silu(True)
+        elif hparams["hidden_act"] == "gelu":
+            self.gguf_writer.add_vision_use_gelu(True)
+        else:
+            raise ValueError(f"Unsupported hidden_act: {hparams['hidden_act']}")
+        # downsample_ratio
+        downsample_ratio = self.global_config.get("downsample_ratio")
+        assert downsample_ratio is not None
+        self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / downsample_ratio))
+
+    def tensor_force_quant(self, name, new_name, bid, n_dims):
+        del bid, name, n_dims # unused
+        if ".patch_embd." in new_name:
+            return gguf.GGMLQuantizationType.F16
+        if ".position_embd." in new_name:
+            return gguf.GGMLQuantizationType.F32
+        return False
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid # unused
+        if name.startswith("vision_model") or name.startswith("mlp"):
+            # process visual tensors
+            # correct name
+            if name.startswith("vision_model"):
+                name = "vision_tower." + name
+            if (".ls" in name or "position_embedding" in name) and not name.endswith(".weight"):
+                name += ".weight"
+            # split QKV tensors if needed
+            if ".qkv." in name:
+                if data_torch.ndim == 2: # weight
+                    c3, _ = data_torch.shape
+                else: # bias
+                    c3 = data_torch.shape[0]
+                assert c3 % 3 == 0
+                c = c3 // 3
+                wq = data_torch[:c]
+                wk = data_torch[c: c * 2]
+                wv = data_torch[c * 2:]
+                return [
+                    (self.map_tensor_name(name.replace("attn.qkv", "self_attn.q_proj")), wq),
+                    (self.map_tensor_name(name.replace("attn.qkv", "self_attn.k_proj")), wk),
+                    (self.map_tensor_name(name.replace("attn.qkv", "self_attn.v_proj")), wv),
+                ]
+            return [(self.map_tensor_name(name), data_torch)]
+        return [] # skip other tensors
+
+
 @ModelBase.register("WavTokenizerDec")
 class WavTokenizerDecModel(TextModel):
     model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC
@@ -3360,6 +3428,11 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         head_dim = n_embd // num_heads
         num_groups = num_heads // q_per_kv

+        name = name.replace("language_model.", "") # InternVL
+        if name.startswith("mlp") or name.startswith("vision_model"):
+            # skip visual tensors
+            return []
+
         if bid is not None and f"model.layers.{bid}.attention.wqkv" in name:
             qkv = data_torch

@@ -3433,6 +3506,10 @@ def set_gguf_parameters(self):
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         n_head = self.hparams["num_attention_heads"]
         n_kv_head = self.hparams.get("num_key_value_heads")
+        name = name.replace("language_model.", "") # InternVL
+        if name.startswith("mlp") or name.startswith("vision_model"):
+            # skip visual tensors
+            return []
         if name.endswith(("q_proj.weight", "q_proj.bias")):
             data_torch = LlamaModel.permute(data_torch, n_head, n_head)
         if name.endswith(("k_proj.weight", "k_proj.bias")):
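The QKV handling in `InternVisionModel.modify_tensors` relies on the fused projection being three equal blocks stacked along the first dimension, for both weights and biases. A standalone sketch of that slicing with toy dimensions (the sizes here are made up for illustration):

    import torch

    hidden_size = 8  # toy value; real InternVL vision towers are much larger
    qkv_weight = torch.randn(3 * hidden_size, hidden_size)  # q, k, v stacked row-wise

    c3 = qkv_weight.shape[0]
    assert c3 % 3 == 0
    c = c3 // 3
    wq, wk, wv = qkv_weight[:c], qkv_weight[c:c * 2], qkv_weight[c * 2:]

    # Each slice is one projection matrix of shape (hidden_size, hidden_size).
    assert wq.shape == wk.shape == wv.shape == (hidden_size, hidden_size)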

convert_hf_to_gguf_update.py

Lines changed: 1 addition & 0 deletions
@@ -116,6 +116,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "llama4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", },
     {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", },
     {"name": "pixtral", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistral-community/pixtral-12b", },
+    {"name": "seed-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", },
 ]
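This new `seed-coder` entry is what produces the `chkhsh` value checked in `get_vocab_base_pre()` above: the update script downloads each listed tokenizer, encodes a fixed probe text, and hashes the resulting token IDs. A rough sketch of that idea (the probe string here is shortened; the real script uses its own, much longer probe text):

    from hashlib import sha256
    from transformers import AutoTokenizer

    probe = "Hello World \n 123 🦙"  # simplified stand-in for the script's probe text
    tok = AutoTokenizer.from_pretrained("ByteDance-Seed/Seed-Coder-8B-Base")

    # Hash of the token-ID sequence identifies the pre-tokenizer configuration.
    chkhsh = sha256(str(tok.encode(probe)).encode()).hexdigest()
    print(chkhsh)  # compared against the values hard-coded in convert_hf_to_gguf.py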