Commit 3fd5df0

Merge branch 'ggml-org:master' into master
2 parents: a66dd1a + 1782cdf


47 files changed: 1303 additions, 269 deletions

.gitignore

Lines changed: 2 additions & 0 deletions
@@ -45,6 +45,8 @@ lcov-report/
 tags
 .build/
 build*
+release
+debug
 !build-info.cmake
 !build-info.cpp.in
 !build-info.sh

CONTRIBUTING.md

Lines changed: 1 addition & 1 deletion
@@ -39,7 +39,7 @@
 
 _(NOTE: this guideline is yet to be applied to the `llama.cpp` codebase. New code should follow this guideline.)_
 
-- Try to follow the existing patterns in the code (indentation, spaces, etc.). In case of doubt use `clang-format` to format the added code
+- Try to follow the existing patterns in the code (indentation, spaces, etc.). In case of doubt use `clang-format` (from clang-tools v15+) to format the added code
 - For anything not covered in the current guidelines, refer to the [C++ Core Guidelines](https://isocpp.github.io/CppCoreGuidelines/CppCoreGuidelines)
 - Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices
 - Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggml-org/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$
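As a quick numeric illustration of the multiplication convention in the last bullet above, the following sketch checks the stated identity with numpy; the shapes are arbitrary examples and the snippet does not touch the ggml API itself.

```python
import numpy as np

# Check of the convention stated above for C = ggml_mul_mat(ctx, A, B):
# C^T = A B^T, i.e. C = B A^T. Shapes below are arbitrary examples.
A = np.random.rand(4, 3)   # 4 rows, 3 columns
B = np.random.rand(5, 3)   # 5 rows, 3 columns (shares the inner dimension 3 with A)

C = B @ A.T                # 5 x 4: the result ggml_mul_mat conceptually produces
assert np.allclose(C.T, A @ B.T)
print(C.shape)             # (5, 4)
```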

common/arg.cpp

Lines changed: 8 additions & 3 deletions
@@ -813,13 +813,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_env("LLAMA_ARG_FLASH_ATTN"));
     add_opt(common_arg(
         {"-p", "--prompt"}, "PROMPT",
-        ex == LLAMA_EXAMPLE_MAIN
-            ? "prompt to start generation with\nif -cnv is set, this will be used as system prompt"
-            : "prompt to start generation with",
+        "prompt to start generation with; for system message, use -sys",
         [](common_params & params, const std::string & value) {
             params.prompt = value;
         }
     ).set_excludes({LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"-sys", "--system-prompt"}, "PROMPT",
+        "system prompt to use with model (if applicable, depending on chat template)",
+        [](common_params & params, const std::string & value) {
+            params.system_prompt = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(common_arg(
         {"--no-perf"},
         string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),

common/common.h

Lines changed: 1 addition & 0 deletions
@@ -261,6 +261,7 @@ struct common_params {
     std::string hf_repo = ""; // HF repo // NOLINT
     std::string hf_file = ""; // HF file // NOLINT
     std::string prompt = ""; // NOLINT
+    std::string system_prompt = ""; // NOLINT
     std::string prompt_file = ""; // store the external prompt file name // NOLINT
     std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
     std::string input_prefix = ""; // string to prefix user inputs with // NOLINT

convert_hf_to_gguf.py

Lines changed: 8 additions & 3 deletions
@@ -699,6 +699,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "b3f499bb4255f8ca19fccd664443283318f2fd2414d5e0b040fbdd0cc195d6c5":
             # ref: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
             res = "deepseek-r1-qwen"
+        if chkhsh == "ccc2ef013c104be7bae2965776d611e1d7a8a2a9c547dd93a682c9a9fc80352e":
+            # ref: https://huggingface.co/Xenova/gpt-4o
+            res = "gpt-4o"
 
         if res is None:
             logger.warning("\n")
@@ -2512,7 +2515,8 @@ def set_gguf_parameters(self):
         rms_eps = self.find_hparam(["rms_norm_eps"])
         max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
         orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
-        rope_dims = n_embd // n_head
+        rot_pct = self.hparams.get("partial_rotary_factor", 1.0)
+        rope_dims = int(rot_pct * n_embd) // n_head
 
         self.gguf_writer.add_context_length(max_pos_embds)
         self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds)
@@ -2536,7 +2540,8 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
         n_head = self.find_hparam(["num_attention_heads", "n_head"])
         max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
         orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
-        rope_dims = n_embd // n_head
+        rot_pct = self.hparams.get("partial_rotary_factor", 1.0)
+        rope_dims = int(rot_pct * n_embd) // n_head
 
         # write rope scaling for long context (128k) model
         rope_scaling = self.find_hparam(['rope_scaling'], True)
@@ -2565,7 +2570,7 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
             raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
 
         if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
-            raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
+            raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}. long_factors = {len(long_factors)}, short_factors = {len(short_factors)}.')
 
         yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
         yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
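To make the `partial_rotary_factor` change above concrete, here is a minimal standalone sketch of the new `rope_dims` arithmetic. The hyperparameter values are hypothetical and only illustrate how a partial rotary factor shrinks the rotated dimension count; real values come from the model's `config.json`.

```python
# Minimal sketch of the rope_dims computation introduced in the hunks above.
# The hparams dict is hypothetical; real values are read from the model config.
hparams = {
    "hidden_size": 3072,
    "num_attention_heads": 32,
    "partial_rotary_factor": 0.75,  # omit this key to recover the old behavior (factor 1.0)
}

n_embd = hparams["hidden_size"]
n_head = hparams["num_attention_heads"]

# Old computation: the full head dimension is rotated.
rope_dims_old = n_embd // n_head                 # 3072 // 32 = 96

# New computation: only the rotated fraction of the embedding counts.
rot_pct = hparams.get("partial_rotary_factor", 1.0)
rope_dims_new = int(rot_pct * n_embd) // n_head  # int(0.75 * 3072) // 32 = 72

print(rope_dims_old, rope_dims_new)  # 96 72
```

With these hypothetical numbers, the length check in the last hunk would then expect `rope_dims / 2 = 36` long and short RoPE factors.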

convert_hf_to_gguf_update.py

Lines changed: 5 additions & 0 deletions
@@ -109,6 +109,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "megrez", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Infinigence/Megrez-3B-Instruct"},
     {"name": "deepseek-v3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-V3"},
     {"name": "deepseek-r1-qwen", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"},
+    {"name": "gpt-4o", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Xenova/gpt-4o", },
 ]
 
 
@@ -131,6 +132,10 @@ def download_model(model):
 
     files = ["config.json", "tokenizer.json", "tokenizer_config.json"]
 
+    if name == "gpt-4o":
+        # Xenova/gpt-4o is tokenizer-only, it does not contain config.json
+        files = ["tokenizer.json", "tokenizer_config.json"]
+
     if tokt == TOKENIZER_TYPE.SPM:
         files.append("tokenizer.model")
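The new `gpt-4o` entry above is what produces the `chkhsh` branch added to `convert_hf_to_gguf.py` earlier in this commit. As a rough sketch of the idea, the update script tokenizes a fixed test string with each listed tokenizer and hashes the resulting token IDs; the test string and exact hashing recipe below are assumptions for illustration, not the script's literal code.

```python
# Hedged sketch: how a pre-tokenizer hash like the gpt-4o chkhsh above could be derived.
# Assumptions: the test string and the sha256-over-stringified-token-IDs recipe.
from hashlib import sha256

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Xenova/gpt-4o")  # tokenizer-only repo, no config.json

chktxt = "Hello world! \u00e9\u00e8\u00e0 1234 \U0001f600"  # stand-in test string
chktok = tokenizer.encode(chktxt)
chkhsh = sha256(str(chktok).encode()).hexdigest()

print(chkhsh)  # convert_hf_to_gguf.py compares hashes like this to pick res = "gpt-4o"
```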

examples/llava/README-granitevision.md

Lines changed: 34 additions & 27 deletions
@@ -3,8 +3,8 @@
 Download the model and point your `GRANITE_MODEL` environment variable to the path.
 
 ```bash
-$ git clone https://huggingface.co/ibm-granite/granite-vision-3.1-2b-preview
-$ export GRANITE_MODEL=./granite-vision-3.1-2b-preview
+$ git clone https://huggingface.co/ibm-granite/granite-vision-3.2-2b
+$ export GRANITE_MODEL=./granite-vision-3.2-2b
 ```
 
 
@@ -41,17 +41,26 @@ If you actually inspect the `.keys()` of the loaded tensors, you should see a lo
 
 
 ### 2. Creating the Visual Component GGUF
-To create the GGUF for the visual components, we need to write a config for the visual encoder; make sure the config contains the correct `image_grid_pinpoints`
+Next, create a new directory to hold the visual components, and copy the llava.clip/projector files, as shown below.
 
+```bash
+$ ENCODER_PATH=$PWD/visual_encoder
+$ mkdir $ENCODER_PATH
+
+$ cp $GRANITE_MODEL/llava.clip $ENCODER_PATH/pytorch_model.bin
+$ cp $GRANITE_MODEL/llava.projector $ENCODER_PATH/
+```
+
+Now, we need to write a config for the visual encoder. In order to convert the model, be sure to use the correct `image_grid_pinpoints`, as these may vary based on the model. You can find the `image_grid_pinpoints` in `$GRANITE_MODEL/config.json`.
 
-Note: we refer to this file as `$VISION_CONFIG` later on.
 ```json
 {
     "_name_or_path": "siglip-model",
     "architectures": [
         "SiglipVisionModel"
     ],
     "image_grid_pinpoints": [
+        [384,384],
         [384,768],
         [384,1152],
         [384,1536],
@@ -94,42 +103,32 @@ Note: we refer to this file as `$VISION_CONFIG` later on.
 }
 ```
 
-Create a new directory to hold the visual components, and copy the llava.clip/projector files, as well as the vision config into it.
-
-```bash
-$ ENCODER_PATH=$PWD/visual_encoder
-$ mkdir $ENCODER_PATH
-
-$ cp $GRANITE_MODEL/llava.clip $ENCODER_PATH/pytorch_model.bin
-$ cp $GRANITE_MODEL/llava.projector $ENCODER_PATH/
-$ cp $VISION_CONFIG $ENCODER_PATH/config.json
-```
-
-At which point you should have something like this:
+At this point you should have something like this:
 ```bash
 $ ls $ENCODER_PATH
 config.json llava.projector pytorch_model.bin
 ```
 
-Now convert the components to GGUF; Note that we also override the image mean/std dev to `[.5,.5,.5]` since we use the siglip visual encoder - in the transformers model, you can find these numbers in the [preprocessor_config.json](https://huggingface.co/ibm-granite/granite-vision-3.1-2b-preview/blob/main/preprocessor_config.json).
+Now convert the components to GGUF; Note that we also override the image mean/std dev to `[.5,.5,.5]` since we use the SigLIP visual encoder - in the transformers model, you can find these numbers in the `preprocessor_config.json`.
 ```bash
 $ python convert_image_encoder_to_gguf.py \
     -m $ENCODER_PATH \
     --llava-projector $ENCODER_PATH/llava.projector \
    --output-dir $ENCODER_PATH \
     --clip-model-is-vision \
     --clip-model-is-siglip \
-    --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5
+    --image-mean 0.5 0.5 0.5 \
+    --image-std 0.5 0.5 0.5
 ```
 
-this will create the first GGUF file at `$ENCODER_PATH/mmproj-model-f16.gguf`; we will refer to the abs path of this file as the `$VISUAL_GGUF_PATH.`
+This will create the first GGUF file at `$ENCODER_PATH/mmproj-model-f16.gguf`; we will refer to the absolute path of this file as the `$VISUAL_GGUF_PATH.`
 
 
 ### 3. Creating the LLM GGUF.
 The granite vision model contains a granite LLM as its language model. For now, the easiest way to get the GGUF for LLM is by loading the composite model in `transformers` and exporting the LLM so that it can be directly converted with the normal conversion path.
 
 First, set the `LLM_EXPORT_PATH` to the path to export the `transformers` LLM to.
-```
+```bash
 $ export LLM_EXPORT_PATH=$PWD/granite_vision_llm
 ```
 
@@ -142,7 +141,7 @@ if not MODEL_PATH:
     raise ValueError("env var GRANITE_MODEL is unset!")
 
 LLM_EXPORT_PATH = os.getenv("LLM_EXPORT_PATH")
-if not MODEL_PATH:
+if not LLM_EXPORT_PATH:
     raise ValueError("env var LLM_EXPORT_PATH is unset!")
 
 tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_PATH)
@@ -166,18 +165,26 @@ $ python convert_hf_to_gguf.py --outfile $LLM_GGUF_PATH $LLM_EXPORT_PATH
 ```
 
 
-### 4. Running the Model in Llama cpp
-Build llama cpp normally; you should have a target binary named `llama-llava-cli`, which you can pass two binaries to. Sample usage:
+### 4. Quantization
+If you want to quantize the LLM, you can do so with `llama-quantize` as you would any other LLM. For example:
+```bash
+$ ./build/bin/llama-quantize $LLM_EXPORT_PATH/granite_llm.gguf $LLM_EXPORT_PATH/granite_llm_q4_k_m.gguf Q4_K_M
+$ LLM_GGUF_PATH=$LLM_EXPORT_PATH/granite_llm_q4_k_m.gguf
+```
+
+Note that currently you cannot quantize the visual encoder because granite vision models use SigLIP as the visual encoder, which has tensor dimensions that are not divisible by 32.
+
 
-Note - the test image shown below can be found [here](https://github-production-user-asset-6210df.s3.amazonaws.com/10740300/415512792-d90d5562-8844-4f34-a0a5-77f62d5a58b5.jpg?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAVCODYLSA53PQK4ZA%2F20250221%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250221T054145Z&X-Amz-Expires=300&X-Amz-Signature=86c60be490aa49ef7d53f25d6c973580a8273904fed11ed2453d0a38240ee40a&X-Amz-SignedHeaders=host).
+### 5. Running the Model in Llama cpp
+Build llama cpp normally; you should have a target binary named `llama-llava-cli`, which you can pass two binaries to. As an example, we pass the llama.cpp banner.
 
 ```bash
 $ ./build/bin/llama-llava-cli -m $LLM_GGUF_PATH \
     --mmproj $VISUAL_GGUF_PATH \
-    --image cherry_blossom.jpg \
+    --image ./media/llama0-banner.png \
     -c 16384 \
-    -p "<|system|>\nA chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n<|user|>\n\<image>\nWhat type of flowers are in this picture?\n<|assistant|>\n" \
+    -p "<|system|>\nA chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n<|user|>\n\<image>\nWhat does the text in this image say?\n<|assistant|>\n" \
     --temp 0
 ```
 
-Sample response: `The flowers in the picture are cherry blossoms, which are known for their delicate pink petals and are often associated with the beauty of spring.`
+Sample output: `The text in the image reads "LLAMA C++ Can it run DOOM Llama?"`

examples/main/main.cpp

Lines changed: 7 additions & 3 deletions
@@ -219,6 +219,10 @@ int main(int argc, char ** argv) {
     // print chat template example in conversation mode
     if (params.conversation_mode) {
         if (params.enable_chat_template) {
+            if (!params.prompt.empty()) {
+                LOG_WRN("*** User-specified prompt in conversation mode will be ignored, did you mean to set --system-prompt (-sys) instead?\n");
+            }
+
             LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(chat_templates.get(), params.use_jinja).c_str());
         } else {
             LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
@@ -276,7 +280,7 @@
     {
         auto prompt = (params.conversation_mode && params.enable_chat_template)
             // format the system prompt in conversation mode (fallback to default if empty)
-            ? chat_add_and_format("system", params.prompt.empty() ? DEFAULT_SYSTEM_MESSAGE : params.prompt)
+            ? chat_add_and_format("system", params.system_prompt.empty() ? DEFAULT_SYSTEM_MESSAGE : params.system_prompt)
             // otherwise use the prompt as is
             : params.prompt;
         if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
@@ -476,8 +480,8 @@
         LOG_INF( " - Press Ctrl+C to interject at any time.\n");
 #endif
         LOG_INF( "%s", control_message);
-        if (params.conversation_mode && params.enable_chat_template && params.prompt.empty()) {
-            LOG_INF( " - Using default system message. To change it, set a different value via -p PROMPT or -f FILE argument.\n");
+        if (params.conversation_mode && params.enable_chat_template && params.system_prompt.empty()) {
+            LOG_INF( " - Not using system message. To change it, set a different value via -sys PROMPT\n");
         }
         LOG_INF("\n");

Binary file (7 Bytes) not shown.

examples/server/webui/src/components/SettingDialog.tsx

Lines changed: 7 additions & 7 deletions
@@ -148,13 +148,13 @@ const SETTING_SECTIONS: SettingSection[] = [
     fields: [
       {
         type: SettingInputType.CHECKBOX,
-        label: 'Expand though process by default for generating message',
+        label: 'Expand thought process by default when generating messages',
         key: 'showThoughtInProgress',
       },
       {
         type: SettingInputType.CHECKBOX,
         label:
-          'Exclude thought process when sending request to API (Recommended for DeepSeek-R1)',
+          'Exclude thought process when sending requests to API (Recommended for DeepSeek-R1)',
         key: 'excludeThoughtOnReq',
       },
     ],
@@ -247,7 +247,7 @@ const SETTING_SECTIONS: SettingSection[] = [
           This feature uses{' '}
           <OpenInNewTab href="https://pyodide.org">pyodide</OpenInNewTab>,
           downloaded from CDN. To use this feature, ask the LLM to generate
-          python code inside a markdown code block. You will see a "Run"
+          Python code inside a Markdown code block. You will see a "Run"
           button on the code block, near the "Copy" button.
         </small>
       </>
@@ -274,7 +274,7 @@ export default function SettingDialog({
   );
 
   const resetConfig = () => {
-    if (window.confirm('Are you sure to reset all settings?')) {
+    if (window.confirm('Are you sure you want to reset all settings?')) {
       setLocalConfig(CONFIG_DEFAULT);
     }
   };
@@ -296,9 +296,9 @@
         return;
       }
     } else if (mustBeNumeric) {
-      const trimedValue = value.toString().trim();
-      const numVal = Number(trimedValue);
-      if (isNaN(numVal) || !isNumeric(numVal) || trimedValue.length === 0) {
+      const trimmedValue = value.toString().trim();
+      const numVal = Number(trimmedValue);
+      if (isNaN(numVal) || !isNumeric(numVal) || trimmedValue.length === 0) {
         alert(`Value for ${key} must be numeric`);
         return;
       }
