Commit 3fd5df0

Merge branch 'ggml-org:master' into master
2 parents: a66dd1a + 1782cdf


47 files changed: 1303 additions, 269 deletions

.gitignore

Lines changed: 2 additions & 0 deletions
@@ -45,6 +45,8 @@ lcov-report/
 tags
 .build/
 build*
+release
+debug
 !build-info.cmake
 !build-info.cpp.in
 !build-info.sh

CONTRIBUTING.md

Lines changed: 1 addition & 1 deletion
@@ -39,7 +39,7 @@
 
 _(NOTE: this guideline is yet to be applied to the `llama.cpp` codebase. New code should follow this guideline.)_
 
-- Try to follow the existing patterns in the code (indentation, spaces, etc.). In case of doubt use `clang-format` to format the added code
+- Try to follow the existing patterns in the code (indentation, spaces, etc.). In case of doubt use `clang-format` (from clang-tools v15+) to format the added code
 - For anything not covered in the current guidelines, refer to the [C++ Core Guidelines](https://isocpp.github.io/CppCoreGuidelines/CppCoreGuidelines)
 - Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices
 - Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggml-org/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$
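As a quick numeric illustration of the multiplication convention in the last bullet above, the following sketch checks the stated identity with numpy; the shapes are arbitrary examples and the snippet does not touch the ggml API itself.

```python
import numpy as np

# Check of the convention stated above for C = ggml_mul_mat(ctx, A, B):
# C^T = A B^T, i.e. C = B A^T. Shapes below are arbitrary examples.
A = np.random.rand(4, 3)   # 4 rows, 3 columns
B = np.random.rand(5, 3)   # 5 rows, 3 columns (shares the inner dimension 3 with A)

C = B @ A.T                # 5 x 4: the result ggml_mul_mat conceptually produces
assert np.allclose(C.T, A @ B.T)
print(C.shape)             # (5, 4)
```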

common/arg.cpp

Lines changed: 8 additions & 3 deletions
@@ -813,13 +813,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_env("LLAMA_ARG_FLASH_ATTN"));
     add_opt(common_arg(
         {"-p", "--prompt"}, "PROMPT",
-        ex == LLAMA_EXAMPLE_MAIN
-            ? "prompt to start generation with\nif -cnv is set, this will be used as system prompt"
-            : "prompt to start generation with",
+        "prompt to start generation with; for system message, use -sys",
         [](common_params & params, const std::string & value) {
             params.prompt = value;
         }
     ).set_excludes({LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"-sys", "--system-prompt"}, "PROMPT",
+        "system prompt to use with model (if applicable, depending on chat template)",
+        [](common_params & params, const std::string & value) {
+            params.system_prompt = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(common_arg(
         {"--no-perf"},
         string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),

common/common.h

Lines changed: 1 addition & 0 deletions
@@ -261,6 +261,7 @@ struct common_params {
     std::string hf_repo = ""; // HF repo // NOLINT
     std::string hf_file = ""; // HF file // NOLINT
     std::string prompt = ""; // NOLINT
+    std::string system_prompt = ""; // NOLINT
     std::string prompt_file = ""; // store the external prompt file name // NOLINT
     std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
     std::string input_prefix = ""; // string to prefix user inputs with // NOLINT

convert_hf_to_gguf.py

Lines changed: 8 additions & 3 deletions
@@ -699,6 +699,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "b3f499bb4255f8ca19fccd664443283318f2fd2414d5e0b040fbdd0cc195d6c5":
             # ref: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
             res = "deepseek-r1-qwen"
+        if chkhsh == "ccc2ef013c104be7bae2965776d611e1d7a8a2a9c547dd93a682c9a9fc80352e":
+            # ref: https://huggingface.co/Xenova/gpt-4o
+            res = "gpt-4o"
 
         if res is None:
             logger.warning("\n")
@@ -2512,7 +2515,8 @@ def set_gguf_parameters(self):
         rms_eps = self.find_hparam(["rms_norm_eps"])
         max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
         orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
-        rope_dims = n_embd // n_head
+        rot_pct = self.hparams.get("partial_rotary_factor", 1.0)
+        rope_dims = int(rot_pct * n_embd) // n_head
 
         self.gguf_writer.add_context_length(max_pos_embds)
         self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds)
@@ -2536,7 +2540,8 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
         n_head = self.find_hparam(["num_attention_heads", "n_head"])
         max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
         orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
-        rope_dims = n_embd // n_head
+        rot_pct = self.hparams.get("partial_rotary_factor", 1.0)
+        rope_dims = int(rot_pct * n_embd) // n_head
 
         # write rope scaling for long context (128k) model
         rope_scaling = self.find_hparam(['rope_scaling'], True)
@@ -2565,7 +2570,7 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
             raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
 
         if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
-            raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
+            raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}. long_factors = {len(long_factors)}, short_factors = {len(short_factors)}.')
 
         yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
         yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
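To make the `partial_rotary_factor` change above concrete, here is a minimal standalone sketch of the new `rope_dims` arithmetic. The hyperparameter values are hypothetical and only illustrate how a partial rotary factor shrinks the rotated dimension count; real values come from the model's `config.json`.

```python
# Minimal sketch of the rope_dims computation introduced in the hunks above.
# The hparams dict is hypothetical; real values are read from the model config.
hparams = {
    "hidden_size": 3072,
    "num_attention_heads": 32,
    "partial_rotary_factor": 0.75,  # omit this key to recover the old behavior (factor 1.0)
}

n_embd = hparams["hidden_size"]
n_head = hparams["num_attention_heads"]

# Old computation: the full head dimension is rotated.
rope_dims_old = n_embd // n_head                 # 3072 // 32 = 96

# New computation: only the rotated fraction of the embedding counts.
rot_pct = hparams.get("partial_rotary_factor", 1.0)
rope_dims_new = int(rot_pct * n_embd) // n_head  # int(0.75 * 3072) // 32 = 72

print(rope_dims_old, rope_dims_new)  # 96 72
```

With these hypothetical numbers, the length check in the last hunk would then expect `rope_dims / 2 = 36` long and short RoPE factors.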

convert_hf_to_gguf_update.py

Lines changed: 5 additions & 0 deletions
@@ -109,6 +109,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "megrez", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Infinigence/Megrez-3B-Instruct"},
     {"name": "deepseek-v3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-V3"},
     {"name": "deepseek-r1-qwen", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"},
+    {"name": "gpt-4o", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Xenova/gpt-4o", },
 ]
 
 
@@ -131,6 +132,10 @@ def download_model(model):
 
     files = ["config.json", "tokenizer.json", "tokenizer_config.json"]
 
+    if name == "gpt-4o":
+        # Xenova/gpt-4o is tokenizer-only, it does not contain config.json
+        files = ["tokenizer.json", "tokenizer_config.json"]
+
     if tokt == TOKENIZER_TYPE.SPM:
         files.append("tokenizer.model")
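The new `gpt-4o` entry above is what produces the `chkhsh` branch added to `convert_hf_to_gguf.py` earlier in this commit. As a rough sketch of the idea, the update script tokenizes a fixed test string with each listed tokenizer and hashes the resulting token IDs; the test string and exact hashing recipe below are assumptions for illustration, not the script's literal code.

```python
# Hedged sketch: how a pre-tokenizer hash like the gpt-4o chkhsh above could be derived.
# Assumptions: the test string and the sha256-over-stringified-token-IDs recipe.
from hashlib import sha256

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Xenova/gpt-4o")  # tokenizer-only repo, no config.json

chktxt = "Hello world! \u00e9\u00e8\u00e0 1234 \U0001f600"  # stand-in test string
chktok = tokenizer.encode(chktxt)
chkhsh = sha256(str(chktok).encode()).hexdigest()

print(chkhsh)  # convert_hf_to_gguf.py compares hashes like this to pick res = "gpt-4o"
```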

examples/llava/README-granitevision.md

Lines changed: 34 additions & 27 deletions
@@ -3,8 +3,8 @@
 Download the model and point your `GRANITE_MODEL` environment variable to the path.
 
 ```bash
-$ git clone https://huggingface.co/ibm-granite/granite-vision-3.1-2b-preview
-$ export GRANITE_MODEL=./granite-vision-3.1-2b-preview
+$ git clone https://huggingface.co/ibm-granite/granite-vision-3.2-2b
+$ export GRANITE_MODEL=./granite-vision-3.2-2b
 ```
 
 
@@ -41,17 +41,26 @@ If you actually inspect the `.keys()` of the loaded tensors, you should see a lo
 
 
 ### 2. Creating the Visual Component GGUF
-To create the GGUF for the visual components, we need to write a config for the visual encoder; make sure the config contains the correct `image_grid_pinpoints`
+Next, create a new directory to hold the visual components, and copy the llava.clip/projector files, as shown below.
 
+```bash
+$ ENCODER_PATH=$PWD/visual_encoder
+$ mkdir $ENCODER_PATH
+
+$ cp $GRANITE_MODEL/llava.clip $ENCODER_PATH/pytorch_model.bin
+$ cp $GRANITE_MODEL/llava.projector $ENCODER_PATH/
+```
+
+Now, we need to write a config for the visual encoder. In order to convert the model, be sure to use the correct `image_grid_pinpoints`, as these may vary based on the model. You can find the `image_grid_pinpoints` in `$GRANITE_MODEL/config.json`.
 
-Note: we refer to this file as `$VISION_CONFIG` later on.
 ```json
 {
     "_name_or_path": "siglip-model",
     "architectures": [
         "SiglipVisionModel"
     ],
     "image_grid_pinpoints": [
+        [384,384],
         [384,768],
         [384,1152],
         [384,1536],
@@ -94,42 +103,32 @@ Note: we refer to this file as `$VISION_CONFIG` later on.
 }
 ```
 
-Create a new directory to hold the visual components, and copy the llava.clip/projector files, as well as the vision config into it.
-
-```bash
-$ ENCODER_PATH=$PWD/visual_encoder
-$ mkdir $ENCODER_PATH
-
-$ cp $GRANITE_MODEL/llava.clip $ENCODER_PATH/pytorch_model.bin
-$ cp $GRANITE_MODEL/llava.projector $ENCODER_PATH/
-$ cp $VISION_CONFIG $ENCODER_PATH/config.json
-```
-
-At which point you should have something like this:
+At this point you should have something like this:
 ```bash
 $ ls $ENCODER_PATH
 config.json llava.projector pytorch_model.bin
 ```
 
-Now convert the components to GGUF; Note that we also override the image mean/std dev to `[.5,.5,.5]` since we use the siglip visual encoder - in the transformers model, you can find these numbers in the [preprocessor_config.json](https://huggingface.co/ibm-granite/granite-vision-3.1-2b-preview/blob/main/preprocessor_config.json).
+Now convert the components to GGUF; Note that we also override the image mean/std dev to `[.5,.5,.5]` since we use the SigLIP visual encoder - in the transformers model, you can find these numbers in the `preprocessor_config.json`.
 ```bash
 $ python convert_image_encoder_to_gguf.py \
     -m $ENCODER_PATH \
     --llava-projector $ENCODER_PATH/llava.projector \
    --output-dir $ENCODER_PATH \
     --clip-model-is-vision \
     --clip-model-is-siglip \
-    --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5
+    --image-mean 0.5 0.5 0.5 \
+    --image-std 0.5 0.5 0.5
 ```
 
-this will create the first GGUF file at `$ENCODER_PATH/mmproj-model-f16.gguf`; we will refer to the abs path of this file as the `$VISUAL_GGUF_PATH.`
+This will create the first GGUF file at `$ENCODER_PATH/mmproj-model-f16.gguf`; we will refer to the absolute path of this file as the `$VISUAL_GGUF_PATH.`
 
 
 ### 3. Creating the LLM GGUF.
 The granite vision model contains a granite LLM as its language model. For now, the easiest way to get the GGUF for LLM is by loading the composite model in `transformers` and exporting the LLM so that it can be directly converted with the normal conversion path.
 
 First, set the `LLM_EXPORT_PATH` to the path to export the `transformers` LLM to.
-```
+```bash
 $ export LLM_EXPORT_PATH=$PWD/granite_vision_llm
 ```
 
@@ -142,7 +141,7 @@ if not MODEL_PATH:
     raise ValueError("env var GRANITE_MODEL is unset!")
 
 LLM_EXPORT_PATH = os.getenv("LLM_EXPORT_PATH")
-if not MODEL_PATH:
+if not LLM_EXPORT_PATH:
     raise ValueError("env var LLM_EXPORT_PATH is unset!")
 
 tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_PATH)
@@ -166,18 +165,26 @@ $ python convert_hf_to_gguf.py --outfile $LLM_GGUF_PATH $LLM_EXPORT_PATH
 ```
 
 
-### 4. Running the Model in Llama cpp
-Build llama cpp normally; you should have a target binary named `llama-llava-cli`, which you can pass two binaries to. Sample usage:
+### 4. Quantization
+If you want to quantize the LLM, you can do so with `llama-quantize` as you would any other LLM. For example:
+```bash
+$ ./build/bin/llama-quantize $LLM_EXPORT_PATH/granite_llm.gguf $LLM_EXPORT_PATH/granite_llm_q4_k_m.gguf Q4_K_M
+$ LLM_GGUF_PATH=$LLM_EXPORT_PATH/granite_llm_q4_k_m.gguf
+```
+
+Note that currently you cannot quantize the visual encoder because granite vision models use SigLIP as the visual encoder, which has tensor dimensions that are not divisible by 32.
+
 
-Note - the test image shown below can be found [here](https://github-production-user-asset-6210df.s3.amazonaws.com/10740300/415512792-d90d5562-8844-4f34-a0a5-77f62d5a58b5.jpg?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAVCODYLSA53PQK4ZA%2F20250221%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250221T054145Z&X-Amz-Expires=300&X-Amz-Signature=86c60be490aa49ef7d53f25d6c973580a8273904fed11ed2453d0a38240ee40a&X-Amz-SignedHeaders=host).
+### 5. Running the Model in Llama cpp
+Build llama cpp normally; you should have a target binary named `llama-llava-cli`, which you can pass two binaries to. As an example, we pass the llama.cpp banner.
 
 ```bash
 $ ./build/bin/llama-llava-cli -m $LLM_GGUF_PATH \
     --mmproj $VISUAL_GGUF_PATH \
-    --image cherry_blossom.jpg \
+    --image ./media/llama0-banner.png \
     -c 16384 \
-    -p "<|system|>\nA chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n<|user|>\n\<image>\nWhat type of flowers are in this picture?\n<|assistant|>\n" \
+    -p "<|system|>\nA chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n<|user|>\n\<image>\nWhat does the text in this image say?\n<|assistant|>\n" \
     --temp 0
 ```
 
-Sample response: `The flowers in the picture are cherry blossoms, which are known for their delicate pink petals and are often associated with the beauty of spring.`
+Sample output: `The text in the image reads "LLAMA C++ Can it run DOOM Llama?"`

examples/main/main.cpp

Lines changed: 7 additions & 3 deletions
@@ -219,6 +219,10 @@ int main(int argc, char ** argv) {
     // print chat template example in conversation mode
     if (params.conversation_mode) {
         if (params.enable_chat_template) {
+            if (!params.prompt.empty()) {
+                LOG_WRN("*** User-specified prompt in conversation mode will be ignored, did you mean to set --system-prompt (-sys) instead?\n");
+            }
+
             LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(chat_templates.get(), params.use_jinja).c_str());
         } else {
             LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
@@ -276,7 +280,7 @@
     {
         auto prompt = (params.conversation_mode && params.enable_chat_template)
             // format the system prompt in conversation mode (fallback to default if empty)
-            ? chat_add_and_format("system", params.prompt.empty() ? DEFAULT_SYSTEM_MESSAGE : params.prompt)
+            ? chat_add_and_format("system", params.system_prompt.empty() ? DEFAULT_SYSTEM_MESSAGE : params.system_prompt)
             // otherwise use the prompt as is
             : params.prompt;
         if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
@@ -476,8 +480,8 @@
         LOG_INF( " - Press Ctrl+C to interject at any time.\n");
 #endif
         LOG_INF( "%s", control_message);
-        if (params.conversation_mode && params.enable_chat_template && params.prompt.empty()) {
-            LOG_INF( " - Using default system message. To change it, set a different value via -p PROMPT or -f FILE argument.\n");
+        if (params.conversation_mode && params.enable_chat_template && params.system_prompt.empty()) {
+            LOG_INF( " - Not using system message. To change it, set a different value via -sys PROMPT\n");
         }
         LOG_INF("\n");

Binary file (7 Bytes) not shown.

examples/server/webui/src/components/SettingDialog.tsx

Lines changed: 7 additions & 7 deletions
@@ -148,13 +148,13 @@ const SETTING_SECTIONS: SettingSection[] = [
     fields: [
       {
         type: SettingInputType.CHECKBOX,
-        label: 'Expand though process by default for generating message',
+        label: 'Expand thought process by default when generating messages',
         key: 'showThoughtInProgress',
       },
       {
         type: SettingInputType.CHECKBOX,
         label:
-          'Exclude thought process when sending request to API (Recommended for DeepSeek-R1)',
+          'Exclude thought process when sending requests to API (Recommended for DeepSeek-R1)',
         key: 'excludeThoughtOnReq',
       },
     ],
@@ -247,7 +247,7 @@ const SETTING_SECTIONS: SettingSection[] = [
           This feature uses{' '}
           <OpenInNewTab href="https://pyodide.org">pyodide</OpenInNewTab>,
           downloaded from CDN. To use this feature, ask the LLM to generate
-          python code inside a markdown code block. You will see a "Run"
+          Python code inside a Markdown code block. You will see a "Run"
           button on the code block, near the "Copy" button.
         </small>
       </>
@@ -274,7 +274,7 @@ export default function SettingDialog({
   );
 
   const resetConfig = () => {
-    if (window.confirm('Are you sure to reset all settings?')) {
+    if (window.confirm('Are you sure you want to reset all settings?')) {
       setLocalConfig(CONFIG_DEFAULT);
     }
   };
@@ -296,9 +296,9 @@
         return;
       }
     } else if (mustBeNumeric) {
-      const trimedValue = value.toString().trim();
-      const numVal = Number(trimedValue);
-      if (isNaN(numVal) || !isNumeric(numVal) || trimedValue.length === 0) {
+      const trimmedValue = value.toString().trim();
+      const numVal = Number(trimmedValue);
+      if (isNaN(numVal) || !isNumeric(numVal) || trimmedValue.length === 0) {
         alert(`Value for ${key} must be numeric`);
         return;
       }
