Commit fe8c79b

Author: Olivier Chafik

Merge remote-tracking branch 'origin/master' into tool-bench-prod

2 parents: 3fe208a + 0b52745

105 files changed, +4868 -1064 lines


.github/workflows/build.yml

Lines changed: 12 additions & 4 deletions
@@ -173,7 +173,15 @@ jobs:
           name: llama-bin-macos-x64.zip

   ubuntu-cpu-cmake:
-    runs-on: ubuntu-22.04
+    strategy:
+      matrix:
+        include:
+          - build: 'x64'
+            os: ubuntu-22.04
+          - build: 'arm64'
+            os: ubuntu-22.04-arm
+
+    runs-on: ${{ matrix.os }}

     steps:
       - name: Clone
@@ -239,14 +247,14 @@ jobs:
         run: |
           cp LICENSE ./build/bin/
           cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
-          zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip ./build/bin/*
+          zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/*

       - name: Upload artifacts
         if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
         uses: actions/upload-artifact@v4
         with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip
-          name: llama-bin-ubuntu-x64.zip
+          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip
+          name: llama-bin-ubuntu-${{ matrix.build }}.zip

   ubuntu-latest-cmake-sanitizer:
     runs-on: ubuntu-latest

CONTRIBUTING.md

Lines changed: 2 additions & 0 deletions
@@ -1,10 +1,12 @@
 # Pull requests (for contributors)

+- llama.cpp uses the ggml tensor library for model evaluation. If you are unfamiliar with ggml, consider taking a look at the [examples in the ggml repository](https://github.com/ggml-org/ggml/tree/master/examples/). [simple](https://github.com/ggml-org/ggml/tree/master/examples/simple) shows the bare minimum for using ggml. [gpt-2](https://github.com/ggml-org/ggml/tree/master/examples/gpt-2) has minimal implementations for language model inference using GPT-2. [mnist](https://github.com/ggml-org/ggml/tree/master/examples/mnist) demonstrates how to train and evaluate a simple image classifier
 - Test your changes:
   - Execute [the full CI locally on your machine](ci/README.md) before publishing
   - Verify that the perplexity and the performance are not affected negatively by your changes (use `llama-perplexity` and `llama-bench`)
   - If you modified the `ggml` source, run the `test-backend-ops` tool to check whether different backend implementations of the `ggml` operators produce consistent results (this requires access to at least two different `ggml` backends)
   - If you modified a `ggml` operator or added a new one, add the corresponding test cases to `test-backend-ops`
+- Create separate PRs for each feature or fix. Avoid combining unrelated changes in a single PR
 - Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
 - If your PR becomes stale, don't hesitate to ping the maintainers in the comments

Makefile

Lines changed: 13 additions & 1 deletion
@@ -680,6 +680,10 @@ ifdef GGML_CUDA_CCBIN
   MK_NVCCFLAGS += -ccbin $(GGML_CUDA_CCBIN)
 endif # GGML_CUDA_CCBIN

+ifdef GGML_CUDA_NO_FA
+  MK_NVCCFLAGS += -DGGML_CUDA_NO_FA
+endif # GGML_CUDA_NO_FA
+
 ifdef GGML_CUDA_FA_ALL_QUANTS
   MK_NVCCFLAGS += -DGGML_CUDA_FA_ALL_QUANTS
 endif # GGML_CUDA_FA_ALL_QUANTS
@@ -800,6 +804,10 @@ ifdef GGML_CUDA_NO_PEER_COPY
   HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY
 endif # GGML_CUDA_NO_PEER_COPY

+ifdef GGML_CUDA_NO_FA
+  HIPFLAGS += -DGGML_CUDA_NO_FA
+endif # GGML_CUDA_NO_FA
+
 OBJ_GGML_EXT += ggml/src/ggml-cuda/ggml-cuda.o
 OBJ_GGML_EXT += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
 OBJ_GGML_EXT += $(OBJ_CUDA_TMPL)
@@ -847,7 +855,7 @@ ifdef GGML_MUSA
   CXX := $(MUSA_PATH)/bin/clang++
   MCC := $(CCACHE) $(MUSA_PATH)/bin/mcc

-  MUSAFLAGS = -x musa -mtgpu
+  MUSAFLAGS = -fsigned-char -x musa -mtgpu
   MUSAFLAGS += $(foreach arch,$(subst ;, ,$(MUSA_ARCHITECTURES)),--cuda-gpu-arch=mp_$(arch))

 ifdef GGML_CUDA_FORCE_MMQ
@@ -876,6 +884,10 @@ ifdef GGML_CUDA_NO_PEER_COPY
   MUSAFLAGS += -DGGML_CUDA_NO_PEER_COPY
 endif # GGML_CUDA_NO_PEER_COPY

+ifdef GGML_CUDA_NO_FA
+  MUSAFLAGS += -DGGML_CUDA_NO_FA
+endif # GGML_CUDA_NO_FA
+
 ifdef GGML_CUDA_FA_ALL_QUANTS
   MUSAFLAGS += -DGGML_CUDA_FA_ALL_QUANTS
 endif # GGML_CUDA_FA_ALL_QUANTS
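The three hunks wire the same switch into the NVCC, HIP and MUSA flag sets. `GGML_CUDA_NO_FA` is a plain compile-time feature flag: the Makefile only forwards a `-D` define, and the CUDA sources are expected to branch on it. A minimal, purely illustrative C++ sketch of that pattern (not the actual ggml-cuda source):

```cpp
// Illustrative only -- not the actual ggml-cuda source. It shows the kind of
// compile-time switch a -DGGML_CUDA_NO_FA define typically drives.
#ifdef GGML_CUDA_NO_FA
static constexpr bool fa_kernels_compiled = false; // FlashAttention paths compiled out
#else
static constexpr bool fa_kernels_compiled = true;  // FlashAttention paths available
#endif
```

Building with `make GGML_CUDA=1 GGML_CUDA_NO_FA=1` would then define the macro for the NVCC path, and analogously for the HIP and MUSA builds.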

common/arg.cpp

Lines changed: 48 additions & 0 deletions
@@ -2502,5 +2502,53 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));

+    add_opt(common_arg(
+        {"--fim-qwen-1.5b-default"},
+        string_format("use default Qwen 2.5 Coder 1.5B (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--fim-qwen-3b-default"},
+        string_format("use default Qwen 2.5 Coder 3B (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--fim-qwen-7b-default"},
+        string_format("use default Qwen 2.5 Coder 7B (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
     return ctx_arg;
 }
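Each preset simply pre-fills `common_params` (Hugging Face model to fetch, port, batch sizes, cache reuse) before the rest of the command line is parsed; starting the server with the 7B preset is then just `llama-server --fim-qwen-7b-default`. As a sketch of the same registration pattern, a hypothetical additional preset would look like this (the flag name and Hugging Face repo/file are invented for illustration and are not part of this commit):

```cpp
// Hypothetical preset, following the common_arg pattern added above.
// The flag name and hf_repo/hf_file values are invented for illustration.
add_opt(common_arg(
    {"--fim-mycoder-default"},
    string_format("use default MyCoder FIM model (note: can download weights from the internet)"),
    [](common_params & params) {
        params.hf_repo       = "my-org/MyCoder-Q8_0-GGUF"; // hypothetical repo
        params.hf_file       = "mycoder-q8_0.gguf";        // hypothetical file
        params.port          = 8012;
        params.n_gpu_layers  = 99;   // offload as many layers as fit
        params.flash_attn    = true;
        params.n_ubatch      = 1024;
        params.n_batch       = 1024;
        params.n_ctx         = 0;    // 0 = take the context size from the model
        params.n_cache_reuse = 256;
    }
).set_examples({LLAMA_EXAMPLE_SERVER}));
```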

common/common.h

Lines changed: 2 additions & 2 deletions
@@ -185,10 +185,10 @@ struct common_params_speculative {

     int32_t n_ctx        =     0; // draft context size
     int32_t n_max        =    16; // maximum number of tokens to draft during speculative decoding
-    int32_t n_min        =     5; // minimum number of draft tokens to use for speculative decoding
+    int32_t n_min        =     0; // minimum number of draft tokens to use for speculative decoding
     int32_t n_gpu_layers =    -1; // number of layers to store in VRAM for the draft model (-1 - use default)
     float   p_split      =  0.1f; // speculative decoding split probability
-    float   p_min        =  0.9f; // minimum speculative decoding probability (greedy)
+    float   p_min        = 0.75f; // minimum speculative decoding probability (greedy)

     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
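For orientation: `common_params_speculative` above carries the CLI-facing speculative-decoding settings, while `common_speculative_params` (in `common/speculative.h`, next) is what the drafting helper consumes. A minimal sketch of how a caller might map one onto the other; it assumes the `params.speculative` member used elsewhere in the tree and omits the surrounding server code:

```cpp
// Sketch only -- assumes a `common_params params` with the `speculative` member
// of the struct shown above; the surrounding server/example code is omitted.
common_speculative_params spec_params;
spec_params.n_draft = params.speculative.n_max; // upper bound on drafted tokens
spec_params.p_min   = params.speculative.p_min; // acceptance threshold, now 0.75f by default
```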

common/speculative.cpp

Lines changed: 5 additions & 5 deletions
@@ -252,11 +252,6 @@ llama_tokens common_speculative_gen_draft(
         // add drafted token for each sequence
         const llama_token id = cur_p->data[0].id;

-        // only collect very high-confidence draft tokens
-        if (cur_p->data[0].p < params.p_min) {
-            break;
-        }
-
         common_sampler_accept(smpl, id, true);

         result.push_back(id);
@@ -265,6 +260,11 @@ llama_tokens common_speculative_gen_draft(
             break;
         }

+        // only collect very high-confidence draft tokens
+        if (cur_p->data[0].p < params.p_min) {
+            break;
+        }
+
         common_batch_add(batch, id, n_past + i + 1, { 0 }, true);

         // evaluate the drafted tokens on the draft model
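Net effect of the two hunks: the confidence check against `p_min` now runs after the sampled token has been accepted and appended to the draft, so a single low-confidence token is still kept and only stops further drafting (previously it was dropped and drafting stopped). A self-contained toy model of the reordered loop, with the real sampling and batch handling replaced by a stand-in callback:

```cpp
#include <functional>
#include <vector>

// Toy model of the reordered drafting loop in common_speculative_gen_draft.
// `sample_next` stands in for sampling from the draft model; the real code
// also feeds each accepted token back through common_batch_add/llama_decode.
struct draft_candidate { int id; float p; };

std::vector<int> gen_draft_sketch(int n_draft, float p_min,
                                  const std::function<draft_candidate()> & sample_next) {
    std::vector<int> result;
    for (int i = 0; i < n_draft; ++i) {
        const draft_candidate cur = sample_next();

        result.push_back(cur.id);            // token is kept in the draft ...

        if ((int) result.size() >= n_draft) {
            break;                           // draft is full
        }
        if (cur.p < p_min) {
            break;                           // ... but low confidence stops further drafting
        }
    }
    return result;
}
```

Together with the lowered `p_min` default (0.75 instead of 0.9), drafts become somewhat less conservative.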

common/speculative.h

Lines changed: 1 addition & 1 deletion
@@ -9,7 +9,7 @@ struct common_speculative_params {
     int n_draft = 16;  // max drafted tokens
     int n_reuse = 256;

-    float p_min = 0.9f;  // min probability required to accept a token in the draft
+    float p_min = 0.75f; // min probability required to accept a token in the draft
 };

 struct common_speculative * common_speculative_init(struct llama_context * ctx_dft);

docs/backend/SYCL.md

Lines changed: 14 additions & 2 deletions
@@ -42,6 +42,16 @@ The following release is verified with good quality:

 ## News

+- 2025.2
+  - Optimize MUL_MAT Q4_0 on Intel GPU for all dGPUs and built-in GPUs since MTL. Increases the performance of LLM inference (llama-2-7b.Q4_0.gguf) by 21%-87% on Intel GPUs (MTL, ARL-H, Arc, Flex, PVC).
+    |GPU|Base tokens/s|Increased tokens/s|Percent|
+    |-|-|-|-|
+    |PVC 1550|39|73|+87%|
+    |Flex 170|39|50|+28%|
+    |Arc770|42|55|+30%|
+    |MTL|13|16|+23%|
+    |ARL-H|14|17|+21%|
+
 - 2024.11
   - Use syclcompat to improve the performance on some platforms. This requires oneAPI 2025.0 or newer.

@@ -97,8 +107,8 @@ SYCL backend supports Intel GPU Family:
 | Intel Data Center Max Series  | Support | Max 1550, 1100                                |
 | Intel Data Center Flex Series | Support | Flex 170                                      |
 | Intel Arc Series              | Support | Arc 770, 730M, Arc A750                       |
-| Intel built-in Arc GPU        | Support | built-in Arc GPU in Meteor Lake               |
-| Intel iGPU                    | Support | iGPU in 13700k, i5-1250P, i7-1260P, i7-1165G7 |
+| Intel built-in Arc GPU        | Support | built-in Arc GPU in Meteor Lake, Arrow Lake   |
+| Intel iGPU                    | Support | iGPU in 13700k, iGPU in 13400, i5-1250P, i7-1260P, i7-1165G7 |

 *Notes:*

@@ -660,8 +670,10 @@ use 1 SYCL GPUs: [0] with Max compute units:512
 | Name                  | Value            | Function                                                                   |
 |-----------------------|------------------|----------------------------------------------------------------------------|
 | GGML_SYCL_DEBUG       | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG                              |
+| GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimized features based on Intel GPU type, to compare the performance increase |
 | ZES_ENABLE_SYSMAN     | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |

+
 ## Known Issues

 - `Split-mode:[row]` is not supported.

docs/build.md

Lines changed: 8 additions & 0 deletions
@@ -206,6 +206,14 @@ This provides GPU acceleration using the MUSA cores of your Moore Threads MTT GP
   cmake --build build --config Release
   ```

+  For a static build:
+
+  ```bash
+  cmake -B build -DGGML_MUSA=ON \
+    -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+  cmake --build build --config Release
+  ```
+
 The environment variable [`MUSA_VISIBLE_DEVICES`](https://docs.mthreads.com/musa-sdk/musa-sdk-doc-online/programming_guide/Z%E9%99%84%E5%BD%95/) can be used to specify which GPU(s) will be used.

 The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted.

examples/llama.swiftui/llama.swiftui/UI/ContentView.swift

Lines changed: 18 additions & 7 deletions
@@ -124,15 +124,26 @@ struct ContentView: View {
                 }
             }
         }.sheet(isPresented: $showingHelp) { // Sheet for help modal
-            VStack(alignment: .leading) {
+            NavigationView {
                 VStack(alignment: .leading) {
-                Text("1. Make sure the model is in GGUF Format")
-                    .padding()
-                Text("2. Copy the download link of the quantized model")
-                    .padding()
+                    VStack(alignment: .leading) {
+                        Text("1. Make sure the model is in GGUF Format")
+                            .padding()
+                        Text("2. Copy the download link of the quantized model")
+                            .padding()
+                    }
+                    Spacer()
+                }
+                .navigationTitle("Help")
+                .navigationBarTitleDisplayMode(.inline)
+                .toolbar {
+                    ToolbarItem(placement: .navigationBarTrailing) {
+                        Button("Done") {
+                            showingHelp = false
+                        }
+                    }
                 }
-                Spacer()
-            }
+            }
         }
     }
 }
