
Commit 21e31e2

Merge branch 'upstream' into concedo_experimental
# Conflicts:
#	.github/workflows/build.yml
#	.github/workflows/docker.yml
#	README.md
#	build-xcframework.sh
#	common/CMakeLists.txt
#	examples/CMakeLists.txt
#	ggml/src/ggml-cpu/CMakeLists.txt
#	ggml/src/ggml-cuda/CMakeLists.txt
#	ggml/src/ggml-metal/ggml-metal.m
#	ggml/src/ggml-metal/ggml-metal.metal
#	ggml/src/ggml-sycl/CMakeLists.txt
#	ggml/src/ggml-sycl/backend.hpp
#	ggml/src/ggml-sycl/common.hpp
#	ggml/src/ggml-sycl/ggml-sycl.cpp
#	ggml/src/ggml-sycl/mmvq.cpp
#	ggml/src/ggml-sycl/vecdotq.hpp
#	scripts/compare-llama-bench.py
#	src/CMakeLists.txt
#	src/llama-model.cpp
#	src/llama.cpp
#	tests/test-backend-ops.cpp
#	tests/test-opt.cpp
#	tools/llama-bench/README.md
#	tools/llama-bench/llama-bench.cpp
#	tools/mtmd/CMakeLists.txt
#	tools/mtmd/README.md
#	tools/mtmd/clip.cpp
#	tools/rpc/rpc-server.cpp
#	tools/server/CMakeLists.txt
#	tools/server/README.md
2 parents 2819f78 + de4c07f commit 21e31e2

90 files changed: +4357 −1355 lines changed


common/arg.cpp

Lines changed: 14 additions & 6 deletions
@@ -41,7 +41,7 @@ using json = nlohmann::ordered_json;
 
 std::initializer_list<enum llama_example> mmproj_examples = {
     LLAMA_EXAMPLE_LLAVA,
-    // TODO: add LLAMA_EXAMPLE_SERVER when it's ready
+    LLAMA_EXAMPLE_SERVER,
 };
 
 static std::string read_file(const std::string & fname) {
@@ -2205,32 +2205,33 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
     add_opt(common_arg(
         {"--mmproj"}, "FILE",
-        "path to a multimodal projector file. see tools/mtmd/README.md",
+        "path to a multimodal projector file. see tools/mtmd/README.md\n"
+        "note: if -hf is used, this argument can be omitted",
         [](common_params & params, const std::string & value) {
             params.mmproj.path = value;
         }
-    ).set_examples(mmproj_examples));
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ"));
     add_opt(common_arg(
         {"--mmproj-url"}, "URL",
         "URL to a multimodal projector file. see tools/mtmd/README.md",
         [](common_params & params, const std::string & value) {
             params.mmproj.url = value;
         }
-    ).set_examples(mmproj_examples));
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_URL"));
     add_opt(common_arg(
         {"--no-mmproj"},
         "explicitly disable multimodal projector, useful when using -hf",
         [](common_params & params) {
             params.no_mmproj = true;
         }
-    ).set_examples(mmproj_examples));
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ"));
     add_opt(common_arg(
         {"--no-mmproj-offload"},
         "do not offload multimodal projector to GPU",
         [](common_params & params) {
             params.mmproj_use_gpu = false;
         }
-    ).set_examples(mmproj_examples));
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD"));
     add_opt(common_arg(
         {"--image"}, "FILE",
         "path to an image file. use with multimodal models. Specify multiple times for batching",
@@ -2437,6 +2438,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
         }
     ));
+    add_opt(common_arg(
+        {"--no-op-offload"},
+        string_format("disable offloading host tensor operations to device (default: %s)", params.no_op_offload ? "true" : "false"),
+        [](common_params & params) {
+            params.no_op_offload = true;
+        }
+    ));
     add_opt(common_arg(
         {"--lora"}, "FNAME",
         "path to LoRA adapter (can be repeated to use multiple adapters)",

common/common.cpp

Lines changed: 19 additions & 0 deletions
@@ -15,6 +15,7 @@
 #include "json-schema-to-grammar.cpp"
 #include "llama.h"
 #include "chat.cpp"
+#include "ggml/src/ggml-opt.cpp" //dear god pls
 
 #include <algorithm>
 #include <cinttypes>
@@ -1120,6 +1121,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.offload_kqv = !params.no_kv_offload;
     cparams.flash_attn  = params.flash_attn;
     cparams.no_perf     = params.no_perf;
+    cparams.op_offload  = !params.no_op_offload;
 
     if (params.reranking) {
         cparams.embeddings = true;
@@ -1571,3 +1573,20 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
 
     return result;
 }
+
+ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride) {
+    const int64_t ne_datapoint = llama_n_ctx(ctx);
+    const int64_t ndata        = (tokens.size() - ne_datapoint - 1) / stride;
+    ggml_opt_dataset_t result  = ggml_opt_dataset_init(
+        GGML_TYPE_I32, GGML_TYPE_I32, ne_datapoint, ne_datapoint, ndata, /*ndata_shard =*/ 1);
+
+    llama_token * data   = (llama_token *) ggml_opt_dataset_data(result)->data;
+    llama_token * labels = (llama_token *) ggml_opt_dataset_labels(result)->data;
+
+    for (int64_t idata = 0; idata < ndata; ++idata) {
+        memcpy(data   + idata*ne_datapoint, tokens.data() + idata*stride + 0, ne_datapoint*sizeof(llama_token));
+        memcpy(labels + idata*ne_datapoint, tokens.data() + idata*stride + 1, ne_datapoint*sizeof(llama_token));
+    }
+
+    return result;
+}
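
The new `common_opt_dataset_init` helper turns a flat token stream into a ggml-opt dataset: each datapoint is a window of `n_ctx` tokens and each label row is the same window shifted right by one token (next-token prediction). Below is a hedged usage sketch, not part of this commit: it assumes an already-initialized `llama_context`, relies on the existing `common_tokenize` helper from common.h, and the `build_dataset` wrapper name, file handling, and stride choice are illustrative only.

```cpp
// Hedged sketch, not from this commit: build a training dataset from a text
// file with the new common_opt_dataset_init helper.
#include "common.h"

#include <fstream>
#include <sstream>
#include <string>
#include <vector>

static ggml_opt_dataset_t build_dataset(llama_context * ctx, const std::string & path) {
    // read the whole file into a string (illustrative, no error handling)
    std::ifstream fin(path);
    std::stringstream ss;
    ss << fin.rdbuf();

    // tokenize, then window: each datapoint holds n_ctx tokens, each label row
    // holds the same window shifted right by one token
    std::vector<llama_token> tokens = common_tokenize(ctx, ss.str(), /*add_special*/ true);
    return common_opt_dataset_init(ctx, tokens, /*stride*/ llama_n_ctx(ctx));
}
```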

common/common.h

Lines changed: 7 additions & 0 deletions
@@ -328,6 +328,7 @@ struct common_params {
     bool no_kv_offload = false; // disable KV offloading
     bool warmup        = true;  // warmup run
     bool check_tensors = false; // validate tensor data
+    bool no_op_offload = false; // globally disable offload host tensor operations to device
 
     bool single_turn   = false; // single turn chat conversation
 
@@ -661,3 +662,9 @@ const char * const LLM_KV_SPLIT_COUNT = "split.count";
 const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
 
 }
+
+//
+// training utils
+//
+
+ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride);

common/llguidance.cpp

Lines changed: 1 addition & 0 deletions
@@ -189,6 +189,7 @@ static LlgTokenizer * llama_sampler_llg_new_tokenizer(const llama_vocab * vocab)
         /* .tokenize_fn = */ llama_sampler_llg_tokenize_fn,
         /* .use_approximate_greedy_tokenize_fn = */ false,
         /* .tokenize_user_data = */ vocab,
+        /* .slices = */ nullptr,
     };
 
     char error_buffer[1024];

convert_hf_to_gguf.py

Lines changed: 78 additions & 1 deletion
@@ -426,7 +426,11 @@ def load_hparams(dir_model: Path):
             logger.warning(f"Failed to load model config from {dir_model}: {e}")
             logger.warning("Trying to load config.json instead")
             with open(dir_model / "config.json", "r", encoding="utf-8") as f:
-                return json.load(f)
+                config = json.load(f)
+                if "llm_config" in config:
+                    # rename for InternVL
+                    config["text_config"] = config["llm_config"]
+                return config
 
     @classmethod
     def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
@@ -794,6 +798,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "0e9433cbbb161f89e264eb32e8e64bfe69e834973ffca5d41d3948a604a3e2a3":
             # ref: https://huggingface.co/mistral-community/pixtral-12b
             res = "pixtral"
+        if chkhsh == "d5f1dd6f980fec569fb218a81a7658ac45fc56b38c5a0adeb1c232fbe04ef5ec":
+            # ref: https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base
+            res = "seed-coder"
 
         if res is None:
             logger.warning("\n")
@@ -2606,6 +2613,11 @@ def set_gguf_parameters(self):
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         if self.hf_arch == "Qwen2Model":
             name = f"model.{name}"  # map to Qwen2ForCausalLM tensors
+        if "language_model." in name:
+            name = name.replace("language_model.", "") # for InternVL
+        if name.startswith("mlp") or name.startswith("vision_model"):
+            # skip visual tensors
+            return []
         yield from super().modify_tensors(data_torch, name, bid)
 
 
@@ -2709,6 +2721,62 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         return [] # skip other tensors
 
 
+@ModelBase.register("InternVisionModel")
+class InternVisionModel(VisionModel):
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.INTERNVL)
+        self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"])
+        # hidden_act
+        if hparams["hidden_act"] == "silu":
+            self.gguf_writer.add_vision_use_silu(True)
+        elif hparams["hidden_act"] == "gelu":
+            self.gguf_writer.add_vision_use_gelu(True)
+        else:
+            raise ValueError(f"Unsupported hidden_act: {hparams['hidden_act']}")
+        # downsample_ratio
+        downsample_ratio = self.global_config.get("downsample_ratio")
+        assert downsample_ratio is not None
+        self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / downsample_ratio))
+
+    def tensor_force_quant(self, name, new_name, bid, n_dims):
+        del bid, name, n_dims  # unused
+        if ".patch_embd." in new_name:
+            return gguf.GGMLQuantizationType.F16
+        if ".position_embd." in new_name:
+            return gguf.GGMLQuantizationType.F32
+        return False
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+        if name.startswith("vision_model") or name.startswith("mlp"):
+            # process visual tensors
+            # correct name
+            if name.startswith("vision_model"):
+                name = "vision_tower." + name
+            if (".ls" in name or "position_embedding" in name) and not name.endswith(".weight"):
+                name += ".weight"
+            # split QKV tensors if needed
+            if ".qkv." in name:
+                if data_torch.ndim == 2: # weight
+                    c3, _ = data_torch.shape
+                else: # bias
+                    c3 = data_torch.shape[0]
+                assert c3 % 3 == 0
+                c = c3 // 3
+                wq = data_torch[:c]
+                wk = data_torch[c: c * 2]
+                wv = data_torch[c * 2:]
+                return [
+                    (self.map_tensor_name(name.replace("attn.qkv", "self_attn.q_proj")), wq),
+                    (self.map_tensor_name(name.replace("attn.qkv", "self_attn.k_proj")), wk),
+                    (self.map_tensor_name(name.replace("attn.qkv", "self_attn.v_proj")), wv),
+                ]
+            return [(self.map_tensor_name(name), data_torch)]
+        return [] # skip other tensors
+
+
 @ModelBase.register("WavTokenizerDec")
 class WavTokenizerDecModel(TextModel):
     model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC
@@ -3360,6 +3428,11 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         head_dim = n_embd // num_heads
         num_groups = num_heads // q_per_kv
 
+        name = name.replace("language_model.", "") # InternVL
+        if name.startswith("mlp") or name.startswith("vision_model"):
+            # skip visual tensors
+            return []
+
         if bid is not None and f"model.layers.{bid}.attention.wqkv" in name:
             qkv = data_torch
 
@@ -3433,6 +3506,10 @@ def set_gguf_parameters(self):
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         n_head = self.hparams["num_attention_heads"]
         n_kv_head = self.hparams.get("num_key_value_heads")
+        name = name.replace("language_model.", "") # InternVL
+        if name.startswith("mlp") or name.startswith("vision_model"):
+            # skip visual tensors
+            return []
         if name.endswith(("q_proj.weight", "q_proj.bias")):
            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
         if name.endswith(("k_proj.weight", "k_proj.bias")):

convert_hf_to_gguf_update.py

Lines changed: 1 addition & 0 deletions
@@ -116,6 +116,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "llama4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", },
     {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", },
     {"name": "pixtral", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistral-community/pixtral-12b", },
+    {"name": "seed-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", },
 ]

docs/multimodal.md

Lines changed: 77 additions & 0 deletions
@@ -0,0 +1,77 @@
+# Multimodal
+
+llama.cpp supports multimodal input via `libmtmd`. Currently, 2 tools support this feature:
+- [llama-mtmd-cli](../tools/mtmd/README.md)
+- [llama-server](../tools/server/README.md) via OpenAI-compatible `/chat/completions` API
+
+To enable it, use one of the 2 methods below:
+
+- Use the `-hf` option with a supported model (see the list of pre-quantized models below)
+  - To load a model using `-hf` while disabling multimodal, use `--no-mmproj`
+  - To load a model using `-hf` while using a custom mmproj file, use `--mmproj local_file.gguf`
+- Use the `-m model.gguf` option with `--mmproj file.gguf` to specify the text model and the multimodal projector, respectively
+
+By default, the multimodal projector is offloaded to the GPU. To disable this, add `--no-mmproj-offload`
+
+For example:
+
+```sh
+# simple usage with CLI
+llama-mtmd-cli -hf ggml-org/gemma-3-4b-it-GGUF
+
+# simple usage with server
+llama-server -hf ggml-org/gemma-3-4b-it-GGUF
+
+# using local file
+llama-server -m gemma-3-4b-it-Q4_K_M.gguf --mmproj mmproj-gemma-3-4b-it-Q4_K_M.gguf
+
+# no GPU offload
+llama-server -hf ggml-org/gemma-3-4b-it-GGUF --no-mmproj-offload
+```
+
+## Pre-quantized models
+
+These are ready-to-use models; most of them come with `Q4_K_M` quantization by default.
+
+Replace `(tool_name)` with the name of the binary you want to use, for example `llama-mtmd-cli` or `llama-server`.
+
+NOTE: some models may require a large context window, for example: `-c 8192`
+
+```sh
+# Gemma 3
+(tool_name) -hf ggml-org/gemma-3-4b-it-GGUF
+(tool_name) -hf ggml-org/gemma-3-12b-it-GGUF
+(tool_name) -hf ggml-org/gemma-3-27b-it-GGUF
+
+# SmolVLM
+(tool_name) -hf ggml-org/SmolVLM-Instruct-GGUF
+(tool_name) -hf ggml-org/SmolVLM-256M-Instruct-GGUF
+(tool_name) -hf ggml-org/SmolVLM-500M-Instruct-GGUF
+(tool_name) -hf ggml-org/SmolVLM2-2.2B-Instruct-GGUF
+(tool_name) -hf ggml-org/SmolVLM2-256M-Video-Instruct-GGUF
+(tool_name) -hf ggml-org/SmolVLM2-500M-Video-Instruct-GGUF
+
+# Pixtral 12B
+(tool_name) -hf ggml-org/pixtral-12b-GGUF
+
+# Qwen 2 VL
+(tool_name) -hf ggml-org/Qwen2-VL-2B-Instruct-GGUF
+(tool_name) -hf ggml-org/Qwen2-VL-7B-Instruct-GGUF
+
+# Qwen 2.5 VL
+(tool_name) -hf ggml-org/Qwen2.5-VL-3B-Instruct-GGUF
+(tool_name) -hf ggml-org/Qwen2.5-VL-7B-Instruct-GGUF
+(tool_name) -hf ggml-org/Qwen2.5-VL-32B-Instruct-GGUF
+(tool_name) -hf ggml-org/Qwen2.5-VL-72B-Instruct-GGUF
+
+# Mistral Small 3.1 24B (IQ2_M quantization)
+(tool_name) -hf ggml-org/Mistral-Small-3.1-24B-Instruct-2503-GGUF
+
+# InternVL 2.5 and 3
+(tool_name) -hf ggml-org/InternVL2_5-1B-GGUF
+(tool_name) -hf ggml-org/InternVL2_5-4B-GGUF
+(tool_name) -hf ggml-org/InternVL3-1B-Instruct-GGUF
+(tool_name) -hf ggml-org/InternVL3-2B-Instruct-GGUF
+(tool_name) -hf ggml-org/InternVL3-8B-Instruct-GGUF
+(tool_name) -hf ggml-org/InternVL3-14B-Instruct-GGUF
+```

examples/training/CMakeLists.txt

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+set(TARGET llama-finetune)
+add_executable(${TARGET} finetune.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)

examples/training/README.md

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
+# llama.cpp/examples/training
+
+This directory contains examples related to language model training using llama.cpp/GGML.
+So far finetuning is technically functional (for FP32 models and limited hardware setups) but the code is very much WIP.
+Finetuning of Stories 260K and LLaMA 3.2 1b seems to work with 24 GB of memory.
+**For CPU training, compile llama.cpp without any additional backends such as CUDA.**
+**For CUDA training, use the maximum number of GPU layers.**
+
+Proof of concept:
+
+``` sh
+export model_name=llama_3.2-1b && export quantization=f32
+./build/bin/finetune --file wikitext-2-raw/wiki.test.raw -ngl 999 --model models/${model_name}-${quantization}.gguf -c 512 -b 512 -ub 512
+./build/bin/perplexity --file wikitext-2-raw/wiki.test.raw -ngl 999 --model finetuned-model.gguf
+```
+
+The perplexity value of the finetuned model should be lower after training on the test set for 2 epochs.
