Commit fe8c79b

Author: Olivier Chafik

Merge remote-tracking branch 'origin/master' into tool-bench-prod

2 parents: 3fe208a + 0b52745

105 files changed, +4868 -1064 lines


.github/workflows/build.yml

Lines changed: 12 additions & 4 deletions
@@ -173,7 +173,15 @@ jobs:
           name: llama-bin-macos-x64.zip

   ubuntu-cpu-cmake:
-    runs-on: ubuntu-22.04
+    strategy:
+      matrix:
+        include:
+          - build: 'x64'
+            os: ubuntu-22.04
+          - build: 'arm64'
+            os: ubuntu-22.04-arm
+
+    runs-on: ${{ matrix.os }}

     steps:
       - name: Clone
@@ -239,14 +247,14 @@ jobs:
         run: |
           cp LICENSE ./build/bin/
           cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
-          zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip ./build/bin/*
+          zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/*

       - name: Upload artifacts
         if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
         uses: actions/upload-artifact@v4
         with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip
-          name: llama-bin-ubuntu-x64.zip
+          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip
+          name: llama-bin-ubuntu-${{ matrix.build }}.zip

   ubuntu-latest-cmake-sanitizer:
     runs-on: ubuntu-latest

CONTRIBUTING.md

Lines changed: 2 additions & 0 deletions
@@ -1,10 +1,12 @@
 # Pull requests (for contributors)

+- llama.cpp uses the ggml tensor library for model evaluation. If you are unfamiliar with ggml, consider taking a look at the [examples in the ggml repository](https://github.com/ggml-org/ggml/tree/master/examples/). [simple](https://github.com/ggml-org/ggml/tree/master/examples/simple) shows the bare minimum for using ggml. [gpt-2](https://github.com/ggml-org/ggml/tree/master/examples/gpt-2) has minimal implementations for language model inference using GPT-2. [mnist](https://github.com/ggml-org/ggml/tree/master/examples/mnist) demonstrates how to train and evaluate a simple image classifier
 - Test your changes:
   - Execute [the full CI locally on your machine](ci/README.md) before publishing
   - Verify that the perplexity and the performance are not affected negatively by your changes (use `llama-perplexity` and `llama-bench`)
   - If you modified the `ggml` source, run the `test-backend-ops` tool to check whether different backend implementations of the `ggml` operators produce consistent results (this requires access to at least two different `ggml` backends)
   - If you modified a `ggml` operator or added a new one, add the corresponding test cases to `test-backend-ops`
+- Create separate PRs for each feature or fix. Avoid combining unrelated changes in a single PR
 - Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
 - If your PR becomes stale, don't hesitate to ping the maintainers in the comments

Makefile

Lines changed: 13 additions & 1 deletion
@@ -680,6 +680,10 @@ ifdef GGML_CUDA_CCBIN
   MK_NVCCFLAGS += -ccbin $(GGML_CUDA_CCBIN)
 endif # GGML_CUDA_CCBIN

+ifdef GGML_CUDA_NO_FA
+  MK_NVCCFLAGS += -DGGML_CUDA_NO_FA
+endif # GGML_CUDA_NO_FA
+
 ifdef GGML_CUDA_FA_ALL_QUANTS
   MK_NVCCFLAGS += -DGGML_CUDA_FA_ALL_QUANTS
 endif # GGML_CUDA_FA_ALL_QUANTS
@@ -800,6 +804,10 @@ ifdef GGML_CUDA_NO_PEER_COPY
   HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY
 endif # GGML_CUDA_NO_PEER_COPY

+ifdef GGML_CUDA_NO_FA
+  HIPFLAGS += -DGGML_CUDA_NO_FA
+endif # GGML_CUDA_NO_FA
+
 OBJ_GGML_EXT += ggml/src/ggml-cuda/ggml-cuda.o
 OBJ_GGML_EXT += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
 OBJ_GGML_EXT += $(OBJ_CUDA_TMPL)
@@ -847,7 +855,7 @@ ifdef GGML_MUSA
   CXX := $(MUSA_PATH)/bin/clang++
   MCC := $(CCACHE) $(MUSA_PATH)/bin/mcc

-  MUSAFLAGS = -x musa -mtgpu
+  MUSAFLAGS = -fsigned-char -x musa -mtgpu
   MUSAFLAGS += $(foreach arch,$(subst ;, ,$(MUSA_ARCHITECTURES)),--cuda-gpu-arch=mp_$(arch))

 ifdef GGML_CUDA_FORCE_MMQ
@@ -876,6 +884,10 @@ ifdef GGML_CUDA_NO_PEER_COPY
   MUSAFLAGS += -DGGML_CUDA_NO_PEER_COPY
 endif # GGML_CUDA_NO_PEER_COPY

+ifdef GGML_CUDA_NO_FA
+  MUSAFLAGS += -DGGML_CUDA_NO_FA
+endif # GGML_CUDA_NO_FA
+
 ifdef GGML_CUDA_FA_ALL_QUANTS
   MUSAFLAGS += -DGGML_CUDA_FA_ALL_QUANTS
 endif # GGML_CUDA_FA_ALL_QUANTS
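The three hunks wire the same switch into the NVCC, HIP and MUSA flag sets. `GGML_CUDA_NO_FA` is a plain compile-time feature flag: the Makefile only forwards a `-D` define, and the CUDA sources are expected to branch on it. A minimal, purely illustrative C++ sketch of that pattern (not the actual ggml-cuda source):

```cpp
// Illustrative only -- not the actual ggml-cuda source. It shows the kind of
// compile-time switch a -DGGML_CUDA_NO_FA define typically drives.
#ifdef GGML_CUDA_NO_FA
static constexpr bool fa_kernels_compiled = false; // FlashAttention paths compiled out
#else
static constexpr bool fa_kernels_compiled = true;  // FlashAttention paths available
#endif
```

Building with `make GGML_CUDA=1 GGML_CUDA_NO_FA=1` would then define the macro for the NVCC path, and analogously for the HIP and MUSA builds.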

common/arg.cpp

Lines changed: 48 additions & 0 deletions
@@ -2502,5 +2502,53 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));

+    add_opt(common_arg(
+        {"--fim-qwen-1.5b-default"},
+        string_format("use default Qwen 2.5 Coder 1.5B (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--fim-qwen-3b-default"},
+        string_format("use default Qwen 2.5 Coder 3B (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--fim-qwen-7b-default"},
+        string_format("use default Qwen 2.5 Coder 7B (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
     return ctx_arg;
 }
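Each preset simply pre-fills `common_params` (Hugging Face model to fetch, port, batch sizes, cache reuse) before the rest of the command line is parsed; starting the server with the 7B preset is then just `llama-server --fim-qwen-7b-default`. As a sketch of the same registration pattern, a hypothetical additional preset would look like this (the flag name and Hugging Face repo/file are invented for illustration and are not part of this commit):

```cpp
// Hypothetical preset, following the common_arg pattern added above.
// The flag name and hf_repo/hf_file values are invented for illustration.
add_opt(common_arg(
    {"--fim-mycoder-default"},
    string_format("use default MyCoder FIM model (note: can download weights from the internet)"),
    [](common_params & params) {
        params.hf_repo       = "my-org/MyCoder-Q8_0-GGUF"; // hypothetical repo
        params.hf_file       = "mycoder-q8_0.gguf";        // hypothetical file
        params.port          = 8012;
        params.n_gpu_layers  = 99;   // offload as many layers as fit
        params.flash_attn    = true;
        params.n_ubatch      = 1024;
        params.n_batch       = 1024;
        params.n_ctx         = 0;    // 0 = take the context size from the model
        params.n_cache_reuse = 256;
    }
).set_examples({LLAMA_EXAMPLE_SERVER}));
```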

common/common.h

Lines changed: 2 additions & 2 deletions
@@ -185,10 +185,10 @@ struct common_params_speculative {

     int32_t n_ctx        =     0; // draft context size
     int32_t n_max        =    16; // maximum number of tokens to draft during speculative decoding
-    int32_t n_min        =     5; // minimum number of draft tokens to use for speculative decoding
+    int32_t n_min        =     0; // minimum number of draft tokens to use for speculative decoding
     int32_t n_gpu_layers =    -1; // number of layers to store in VRAM for the draft model (-1 - use default)
     float   p_split      =  0.1f; // speculative decoding split probability
-    float   p_min        =  0.9f; // minimum speculative decoding probability (greedy)
+    float   p_min        = 0.75f; // minimum speculative decoding probability (greedy)

     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
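For orientation: `common_params_speculative` above carries the CLI-facing speculative-decoding settings, while `common_speculative_params` (in `common/speculative.h`, next) is what the drafting helper consumes. A minimal sketch of how a caller might map one onto the other; it assumes the `params.speculative` member used elsewhere in the tree and omits the surrounding server code:

```cpp
// Sketch only -- assumes a `common_params params` with the `speculative` member
// of the struct shown above; the surrounding server/example code is omitted.
common_speculative_params spec_params;
spec_params.n_draft = params.speculative.n_max; // upper bound on drafted tokens
spec_params.p_min   = params.speculative.p_min; // acceptance threshold, now 0.75f by default
```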

common/speculative.cpp

Lines changed: 5 additions & 5 deletions
@@ -252,11 +252,6 @@ llama_tokens common_speculative_gen_draft(
         // add drafted token for each sequence
         const llama_token id = cur_p->data[0].id;

-        // only collect very high-confidence draft tokens
-        if (cur_p->data[0].p < params.p_min) {
-            break;
-        }
-
         common_sampler_accept(smpl, id, true);

         result.push_back(id);
@@ -265,6 +260,11 @@ llama_tokens common_speculative_gen_draft(
             break;
         }

+        // only collect very high-confidence draft tokens
+        if (cur_p->data[0].p < params.p_min) {
+            break;
+        }
+
         common_batch_add(batch, id, n_past + i + 1, { 0 }, true);

         // evaluate the drafted tokens on the draft model
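Net effect of the two hunks: the confidence check against `p_min` now runs after the sampled token has been accepted and appended to the draft, so a single low-confidence token is still kept and only stops further drafting (previously it was dropped and drafting stopped). A self-contained toy model of the reordered loop, with the real sampling and batch handling replaced by a stand-in callback:

```cpp
#include <functional>
#include <vector>

// Toy model of the reordered drafting loop in common_speculative_gen_draft.
// `sample_next` stands in for sampling from the draft model; the real code
// also feeds each accepted token back through common_batch_add/llama_decode.
struct draft_candidate { int id; float p; };

std::vector<int> gen_draft_sketch(int n_draft, float p_min,
                                  const std::function<draft_candidate()> & sample_next) {
    std::vector<int> result;
    for (int i = 0; i < n_draft; ++i) {
        const draft_candidate cur = sample_next();

        result.push_back(cur.id);            // token is kept in the draft ...

        if ((int) result.size() >= n_draft) {
            break;                           // draft is full
        }
        if (cur.p < p_min) {
            break;                           // ... but low confidence stops further drafting
        }
    }
    return result;
}
```

Together with the lowered `p_min` default (0.75 instead of 0.9), drafts become somewhat less conservative.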

common/speculative.h

Lines changed: 1 addition & 1 deletion
@@ -9,7 +9,7 @@ struct common_speculative_params {
     int n_draft = 16;  // max drafted tokens
     int n_reuse = 256;

-    float p_min = 0.9f;  // min probability required to accept a token in the draft
+    float p_min = 0.75f; // min probability required to accept a token in the draft
 };

 struct common_speculative * common_speculative_init(struct llama_context * ctx_dft);

docs/backend/SYCL.md

Lines changed: 14 additions & 2 deletions
@@ -42,6 +42,16 @@ The following release is verified with good quality:

 ## News

+- 2025.2
+  - Optimize MUL_MAT Q4_0 on Intel GPU for all dGPUs and built-in GPUs since MTL. Increases the performance of LLM inference (llama-2-7b.Q4_0.gguf) by 21%-87% on Intel GPUs (MTL, ARL-H, Arc, Flex, PVC).
+    |GPU|Base tokens/s|Increased tokens/s|Percent|
+    |-|-|-|-|
+    |PVC 1550|39|73|+87%|
+    |Flex 170|39|50|+28%|
+    |Arc770|42|55|+30%|
+    |MTL|13|16|+23%|
+    |ARL-H|14|17|+21%|
+
 - 2024.11
   - Use syclcompat to improve the performance on some platforms. This requires oneAPI 2025.0 or newer.

@@ -97,8 +107,8 @@ SYCL backend supports Intel GPU Family:
 | Intel Data Center Max Series  | Support | Max 1550, 1100                                |
 | Intel Data Center Flex Series | Support | Flex 170                                      |
 | Intel Arc Series              | Support | Arc 770, 730M, Arc A750                       |
-| Intel built-in Arc GPU        | Support | built-in Arc GPU in Meteor Lake               |
-| Intel iGPU                    | Support | iGPU in 13700k, i5-1250P, i7-1260P, i7-1165G7 |
+| Intel built-in Arc GPU        | Support | built-in Arc GPU in Meteor Lake, Arrow Lake   |
+| Intel iGPU                    | Support | iGPU in 13700k, iGPU in 13400, i5-1250P, i7-1260P, i7-1165G7 |

 *Notes:*

@@ -660,8 +670,10 @@ use 1 SYCL GPUs: [0] with Max compute units:512
 | Name                  | Value            | Function                                                                   |
 |-----------------------|------------------|----------------------------------------------------------------------------|
 | GGML_SYCL_DEBUG       | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG                              |
+| GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimized features based on Intel GPU type, to compare the performance increase |
 | ZES_ENABLE_SYSMAN     | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |

+
 ## Known Issues

 - `Split-mode:[row]` is not supported.

docs/build.md

Lines changed: 8 additions & 0 deletions
@@ -206,6 +206,14 @@ This provides GPU acceleration using the MUSA cores of your Moore Threads MTT GP
   cmake --build build --config Release
   ```

+  For a static build:
+
+  ```bash
+  cmake -B build -DGGML_MUSA=ON \
+    -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+  cmake --build build --config Release
+  ```
+
 The environment variable [`MUSA_VISIBLE_DEVICES`](https://docs.mthreads.com/musa-sdk/musa-sdk-doc-online/programming_guide/Z%E9%99%84%E5%BD%95/) can be used to specify which GPU(s) will be used.

 The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted.

examples/llama.swiftui/llama.swiftui/UI/ContentView.swift

Lines changed: 18 additions & 7 deletions
@@ -124,15 +124,26 @@ struct ContentView: View {
                 }
             }
         }.sheet(isPresented: $showingHelp) { // Sheet for help modal
-            VStack(alignment: .leading) {
+            NavigationView {
                 VStack(alignment: .leading) {
-                Text("1. Make sure the model is in GGUF Format")
-                    .padding()
-                Text("2. Copy the download link of the quantized model")
-                    .padding()
+                    VStack(alignment: .leading) {
+                        Text("1. Make sure the model is in GGUF Format")
+                            .padding()
+                        Text("2. Copy the download link of the quantized model")
+                            .padding()
+                    }
+                    Spacer()
+                }
+                .navigationTitle("Help")
+                .navigationBarTitleDisplayMode(.inline)
+                .toolbar {
+                    ToolbarItem(placement: .navigationBarTrailing) {
+                        Button("Done") {
+                            showingHelp = false
+                        }
+                    }
                 }
-                Spacer()
-            }
+            }
         }
     }
 }
