3 changes: 2 additions & 1 deletion SECURITY.md
@@ -40,7 +40,8 @@ To protect sensitive data from potential leaks or unauthorized access, it is cru
### Untrusted environments or networks

If you can't run your models in a secure and isolated environment, or if they must be exposed to an untrusted network, make sure to take the following security precautions:
* Confirm the hash of any downloaded artifact (e.g. pre-trained model weights) matches a known-good value
* Do not use the RPC backend, [rpc-server](https://github.com/ggml-org/llama.cpp/tree/master/examples/rpc) and [llama-server](https://github.com/ggml-org/llama.cpp/tree/master/examples/server) functionality (see https://github.com/ggml-org/llama.cpp/pull/13061).
* Confirm the hash of any downloaded artifact (e.g. pre-trained model weights) matches a known-good value.
* Encrypt your data if sending it over the network.
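
For reference, verifying an artifact hash can be as simple as the following sketch (the file name and expected digest are placeholders, not values from this repository):

```python
import hashlib
from pathlib import Path

def sha256sum(path: Path, chunk_size: int = 1 << 20) -> str:
    """Hash the file in chunks so large model weights never sit fully in memory."""
    digest = hashlib.sha256()
    with path.open("rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

# Placeholder values: use the real artifact path and a digest published by a
# source you trust (e.g. the model card or release notes).
expected = "0" * 64
actual = sha256sum(Path("ggml-model-q4_k.gguf"))
if actual != expected:
    raise SystemExit(f"hash mismatch: got {actual}, expected {expected}")
```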

### Multi-Tenant environments
3 changes: 1 addition & 2 deletions common/arg.cpp
@@ -976,14 +976,13 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
"llama-gritlm",
"llama-imatrix",
"llama-infill",
"llama-llava-cli",
"llama-mtmd-cli",
"llama-llava-clip-quantize-cli",
"llama-lookahead",
"llama-lookup",
"llama-lookup-create",
"llama-lookup-merge",
"llama-lookup-stats",
"llama-minicpmv-cli",
"llama-parallel",
"llama-passkey",
"llama-perplexity",
100 changes: 81 additions & 19 deletions convert_hf_to_gguf.py
@@ -419,8 +419,12 @@ def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]
def load_hparams(dir_model: Path):
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)
architectures = hparams.get("architectures")
if "text_config" in hparams:
hparams = {**hparams, **hparams["text_config"]}
if architectures is not None:
# preserve "architectures" from root level config
hparams["architectures"] = architectures
return hparams

@classmethod
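
To illustrate why the added lines are needed: when a multimodal config carries a `text_config` block, the merge lets text-level keys win, so a root-level `architectures` entry has to be re-attached afterwards. A toy sketch with hypothetical config values:

```python
# Hypothetical multimodal config.json contents, for illustration only.
root_config = {
    "architectures": ["SmolVLMForConditionalGeneration"],
    "text_config": {"hidden_size": 2048, "architectures": ["LlamaForCausalLM"]},
}

architectures = root_config.get("architectures")
hparams = {**root_config, **root_config["text_config"]}  # text_config keys win on conflict
if architectures is not None:
    hparams["architectures"] = architectures  # restore the root-level value

assert hparams["architectures"] == ["SmolVLMForConditionalGeneration"]
assert hparams["hidden_size"] == 2048
```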
@@ -1061,6 +1065,8 @@ def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab
class VisionModel(ModelBase):
model_arch = gguf.MODEL_ARCH.CLIP_VISION
n_text_embd = 0
preprocessor_config: dict[str, Any]
global_config: dict[str, Any]

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
@@ -1075,24 +1081,33 @@ def __init__(self, *args, **kwargs):

if "vision_config" not in self.hparams:
raise ValueError("vision_config not found in hparams")
# move vision config to the top level
# move vision config to the top level, while preserving the original hparams in global_config
self.global_config = self.hparams
self.hparams = self.hparams["vision_config"]

# load preprocessor config
with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:
self.preprocessor_config = json.load(f)

def set_type(self):
self.gguf_writer.add_type(gguf.GGUFType.CLIP_VISION)

def set_gguf_parameters(self):
self.gguf_writer.add_file_type(self.ftype)
self.gguf_writer.add_uint32(gguf.Keys.ClipVision.PROJECTION_DIM, self.n_embd_text)
self.gguf_writer.add_bool(gguf.Keys.ClipVision.HAS_VISION_ENCODER, True)
self.gguf_writer.add_vision_projection_dim(self.n_embd_text)
self.gguf_writer.add_vision_has_vision_encoder(True)

# vision config
self.gguf_writer.add_uint32(gguf.Keys.ClipVision.IMAGE_SIZE, self.find_hparam(["image_size"]))
self.gguf_writer.add_uint32(gguf.Keys.ClipVision.PATCH_SIZE, self.find_hparam(["patch_size"]))
self.gguf_writer.add_uint32(gguf.Keys.ClipVision.EMBEDDING_LENGTH, self.find_hparam(["hidden_size"]))
self.gguf_writer.add_uint32(gguf.Keys.ClipVision.FEED_FORWARD_LENGTH, self.find_hparam(["intermediate_size"]))
self.gguf_writer.add_uint32(gguf.Keys.ClipVision.BLOCK_COUNT, self.find_hparam(["num_hidden_layers"]))
self.gguf_writer.add_uint32(gguf.Keys.ClipVision.Attention.HEAD_COUNT, self.find_hparam(["num_attention_heads"]))
self.gguf_writer.add_vision_image_size(self.find_hparam(["image_size"]))
self.gguf_writer.add_vision_patch_size(self.find_hparam(["patch_size"]))
self.gguf_writer.add_vision_embedding_length(self.find_hparam(["hidden_size"]))
self.gguf_writer.add_vision_feed_forward_length(self.find_hparam(["intermediate_size"]))
self.gguf_writer.add_vision_block_count(self.find_hparam(["num_hidden_layers"]))
self.gguf_writer.add_vision_head_count(self.find_hparam(["num_attention_heads"]))

# preprocessor config
self.gguf_writer.add_vision_image_mean(self.preprocessor_config["image_mean"])
self.gguf_writer.add_vision_image_std(self.preprocessor_config["image_std"])

def write_vocab(self):
raise ValueError("VisionModel does not support vocab writing")
@@ -1703,11 +1718,23 @@ def prepare_tensors(self):
raise ValueError(f"Unprocessed norms: {norms}")


@ModelBase.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
@ModelBase.register(
"LLaMAForCausalLM",
"LlamaForCausalLM",
"MistralForCausalLM",
"MixtralForCausalLM",
"Idefics3ForConditionalGeneration",
"SmolVLMForConditionalGeneration")
class LlamaModel(TextModel):
model_arch = gguf.MODEL_ARCH.LLAMA
undo_permute = True

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# fix for SmolVLM2, missing `num_attention_heads` in config.json
if self.hparams["architectures"][0] == "SmolVLMForConditionalGeneration":
self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)

def set_vocab(self):
try:
self._set_vocab_sentencepiece()
@@ -1770,6 +1797,12 @@ def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
n_head = self.hparams["num_attention_heads"]
n_kv_head = self.hparams.get("num_key_value_heads")
is_vision_tensor = "vision_tower" in name or "vision_model" in name or "model.connector" in name

if is_vision_tensor:
return [] # skip vision tensors
elif name.startswith("model.text_model"):
name = name.replace("text_model.", "") # for SmolVLM

if self.undo_permute:
if name.endswith(("q_proj.weight", "q_proj.bias")):
@@ -1852,6 +1885,41 @@ def prepare_tensors(self):
raise ValueError(f"Unprocessed experts: {experts}")


@ModelBase.register("Idefics3ForConditionalGeneration", "SmolVLMForConditionalGeneration")
class SmolVLMModel(VisionModel):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# fix for SmolVLM2, missing some keys in config.json
# default values are taken from transformers code
if self.hparams["model_type"] == "smolvlm_vision":
self.hparams["hidden_size"] = self.hparams.get("hidden_size", 1152)
self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 16)
self.hparams["intermediate_size"] = self.hparams.get("intermediate_size", 3072)
self.hparams["num_hidden_layers"] = self.hparams.get("num_hidden_layers", 12)

def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.IDEFICS3)
self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5))
self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("scale_factor", 2))
self.gguf_writer.add_vision_use_gelu(True)

def tensor_force_quant(self, name, new_name, bid, n_dims):
del bid, new_name, n_dims # unused
if ".embeddings." in name:
return gguf.GGMLQuantizationType.F32
return False

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused
is_vision_tensor = "vision_tower" in name or "vision_model" in name or "model.connector" in name

if is_vision_tensor:
return [(self.map_tensor_name(name), data_torch)]

return [] # skip other tensors
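
If the Idefics3/SmolVLM connector follows the usual pixel-shuffle design (an assumption here, not something stated in this diff), the `scale_factor` written above trades spatial resolution for fewer image tokens, roughly as follows:

```python
# Back-of-the-envelope sketch under the pixel-shuffle assumption; the numbers
# below are hypothetical, not read from any SmolVLM config.
image_size = 512
patch_size = 16
scale_factor = 2

patches = (image_size // patch_size) ** 2      # 32 * 32 = 1024 patch embeddings
image_tokens = patches // (scale_factor ** 2)  # folding 2x2 patches -> 256 tokens
print(patches, image_tokens)
```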


@ModelBase.register("Llama4ForConditionalGeneration")
class Llama4Model(LlamaModel):
model_arch = gguf.MODEL_ARCH.LLAMA4
@@ -3591,12 +3659,10 @@ class Gemma3VisionModel(VisionModel):
def set_gguf_parameters(self):
super().set_gguf_parameters()
hparams = self.hparams
self.gguf_writer.add_string(gguf.Keys.ClipVision.PROJECTOR_TYPE, "gemma3")
self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.GEMMA3)
# default values below are taken from HF transformers code
self.gguf_writer.add_float32(gguf.Keys.ClipVision.Attention.LAYERNORM_EPS, hparams.get("layer_norm_eps", 1e-6))
self.gguf_writer.add_array(gguf.Keys.ClipVision.IMAGE_MEAN, [0.5, 0.5, 0.5])
self.gguf_writer.add_array(gguf.Keys.ClipVision.IMAGE_STD, [0.5, 0.5, 0.5])
self.gguf_writer.add_bool (gguf.Keys.ClipVision.USE_GELU, True)
self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6))
self.gguf_writer.add_vision_use_gelu(True)

def tensor_force_quant(self, name, new_name, bid, n_dims):
del bid, new_name, n_dims # unused
@@ -3614,10 +3680,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
or name.startswith("multimodal_projector.") or name.startswith("vision_model."):
# process vision tensors
name = name.replace("_weight", ".weight")
if "fc1" in name:
name = name.replace("fc1", "fc2")
else:
name = name.replace("fc2", "fc1")

# correct norm value; only this "soft_emb_norm" needs to be corrected as it's part of the Gemma projector
# the other norm values are part of SigLIP model, and they are already correct
26 changes: 13 additions & 13 deletions examples/llava/MobileVLM-README.md → docs/multimodal/MobileVLM.md
@@ -9,15 +9,15 @@ The implementation is based on llava, and is compatible with llava and mobileVLM
Notice: The overall process of model inference for both **MobileVLM** and **MobileVLM_V2** models is the same, but the process of model conversion is a little different. Therefore, using **MobileVLM-1.7B** as an example, the different conversion step will be shown.

## Usage
Build with cmake or run `make llama-llava-cli` to build it.

After building, run: `./llama-llava-cli` to see the usage. For example:
Build the `llama-mtmd-cli` binary.

After building, run: `./llama-mtmd-cli` to see the usage. For example:

```sh
./llama-llava-cli -m MobileVLM-1.7B/ggml-model-q4_k.gguf \
./llama-mtmd-cli -m MobileVLM-1.7B/ggml-model-q4_k.gguf \
--mmproj MobileVLM-1.7B/mmproj-model-f16.gguf \
--image path/to/an/image.jpg \
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? Answer the question using a single word or phrase. ASSISTANT:"
--chat-template deepseek
```

## Model conversion
@@ -82,7 +82,7 @@ refer to `android/adb_run.sh`, modify resources' `name` and `path`
### case 1
**input**
```sh
/data/local/tmp/llama-llava-cli \
/data/local/tmp/llama-mtmd-cli \
-m /data/local/tmp/ggml-model-q4_k.gguf \
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
-t 4 \
@@ -102,7 +102,7 @@ llama_print_timings: total time = 34731.93 ms
### case 2
**input**
```sh
/data/local/tmp/llama-llava-cli \
/data/local/tmp/llama-mtmd-cli \
-m /data/local/tmp/ggml-model-q4_k.gguf \
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
-t 4 \
@@ -123,10 +123,10 @@ llama_print_timings: total time = 34570.79 ms

## Some result on Android with `Snapdragon 778G` chip
### MobileVLM-1.7B case
#### llava-cli release-b2005
#### mtmd-cli release-b2005
**input**
```sh
/data/local/tmp/llama-llava-cli \
/data/local/tmp/llama-mtmd-cli \
-m /data/local/tmp/ggml-model-q4_k.gguf \
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
-t 4 \
@@ -147,7 +147,7 @@ llama_print_timings: prompt eval time = 8119.49 ms / 191 tokens ( 42.51 m
llama_print_timings: eval time = 1005.75 ms / 14 runs ( 71.84 ms per token, 13.92 tokens per second)
llama_print_timings: total time = 28038.34 ms / 205 tokens
```
#### llava-cli latest-version
#### mtmd-cli latest-version
**input**

Just the same as above.
@@ -169,7 +169,7 @@ llama_print_timings: eval time = 43894.02 ms / 13 runs ( 3376.46 m
llama_print_timings: total time = 865441.76 ms / 204 tokens
```
### MobileVLM_V2-1.7B case
#### llava-cli release-2005b
#### mtmd-cli release-2005b
**input**

Just the same as above.
@@ -200,7 +200,7 @@ make GGML_CUDA=1 CUDA_DOCKER_ARCH=sm_87 GGML_CUDA_F16=1 -j 32
### case 1
**input**
```sh
./llama-llava-cli \
./llama-mtmd-cli \
-m /data/local/tmp/ggml-model-q4_k.gguf \
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
--image /data/local/tmp/demo.jpeg \
@@ -224,7 +224,7 @@ llama_print_timings: total time = 1352.63 ms / 252 tokens
### case 2
**input**
```sh
./llama-llava-cli \
./llama-mtmd-cli \
-m /data/local/tmp/ggml-model-q4_k.gguf \
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:" \
7 changes: 4 additions & 3 deletions examples/llava/README-gemma3.md → docs/multimodal/gemma3.md
@@ -26,11 +26,12 @@ llama-gemma3-cli -hf ggml-org/gemma-3-27b-it-GGUF

## How to get mmproj.gguf?

Simply add `--mmproj` when converting the model via `convert_hf_to_gguf.py`:

```bash
cd gemma-3-4b-it
python ../llama.cpp/examples/llava/gemma3_convert_encoder_to_gguf.py .

# output file is mmproj.gguf
python ../llama.cpp/convert_hf_to_gguf.py --outfile model.gguf --outtype f16 --mmproj .
# output file: mmproj-model.gguf
```

## How to run it?
@@ -3,12 +3,12 @@
Currently this implementation supports [glm-edge-v-2b](https://huggingface.co/THUDM/glm-edge-v-2b) and [glm-edge-v-5b](https://huggingface.co/THUDM/glm-edge-v-5b).

## Usage
Build with cmake or run `make llama-llava-cli` to build it.
Build the `llama-mtmd-cli` binary.

After building, run: `./llama-llava-cli` to see the usage. For example:
After building, run: `./llama-mtmd-cli` to see the usage. For example:

```sh
./llama-llava-cli -m model_path/ggml-model-f16.gguf --mmproj model_path/mmproj-model-f16.gguf --image img_path/image.jpg -p "<|system|>\n system prompt <image><|user|>\n prompt <|assistant|>\n"
./llama-mtmd-cli -m model_path/ggml-model-f16.gguf --mmproj model_path/mmproj-model-f16.gguf
```

**note**: A lower temperature like 0.1 is recommended for better quality. Add `--temp 0.1` to the command to do so.
@@ -176,15 +176,11 @@ Note that currently you cannot quantize the visual encoder because granite visio


### 5. Running the Model in Llama cpp
Build llama cpp normally; you should have a target binary named `llama-llava-cli`, which you can pass two binaries to. As an example, we pass the the llama.cpp banner.
Build llama.cpp normally; you should have a target binary named `llama-mtmd-cli`, which you can pass two binaries to. As an example, we pass the llama.cpp banner.

```bash
$ ./build/bin/llama-llava-cli -m $LLM_GGUF_PATH \
$ ./build/bin/llama-mtmd-cli -m $LLM_GGUF_PATH \
--mmproj $VISUAL_GGUF_PATH \
--image ./media/llama0-banner.png \
-c 16384 \
-p "<|system|>\nA chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n<|user|>\n\<image>\nWhat does the text in this image say?\n<|assistant|>\n" \
--temp 0
```

Sample output: `The text in the image reads "LLAMA C++ Can it run DOOM Llama?"`