
Commit 9011787

Merge branch 'master' of https://github.com/ggerganov/llama.cpp
2 parents: 14965bb + 36c258e


86 files changed, +3977 -1661 lines


.github/workflows/build.yml

Lines changed: 12 additions & 4 deletions
@@ -173,7 +173,15 @@ jobs:
           name: llama-bin-macos-x64.zip
 
   ubuntu-cpu-cmake:
-    runs-on: ubuntu-22.04
+    strategy:
+      matrix:
+        include:
+          - build: 'x64'
+            os: ubuntu-22.04
+          - build: 'arm64'
+            os: ubuntu-22.04-arm
+
+    runs-on: ${{ matrix.os }}
 
     steps:
       - name: Clone
@@ -239,14 +247,14 @@ jobs:
         run: |
           cp LICENSE ./build/bin/
           cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
-          zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip ./build/bin/*
+          zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/*
 
       - name: Upload artifacts
         if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
         uses: actions/upload-artifact@v4
         with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip
-          name: llama-bin-ubuntu-x64.zip
+          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip
+          name: llama-bin-ubuntu-${{ matrix.build }}.zip
 
   ubuntu-latest-cmake-sanitizer:
     runs-on: ubuntu-latest
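The matrix added above turns the single x64 job into an x64/arm64 pair: GitHub Actions instantiates ubuntu-cpu-cmake once per include entry, and matrix.build parameterizes the artifact names so the two runs do not collide. A rough sketch of the effective expansion (illustrative, not part of the commit):

    ubuntu-cpu-cmake (x64):   runs-on ubuntu-22.04,     artifact llama-<tag>-bin-ubuntu-x64.zip
    ubuntu-cpu-cmake (arm64): runs-on ubuntu-22.04-arm, artifact llama-<tag>-bin-ubuntu-arm64.zip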

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -98,6 +98,7 @@ examples/server/*.css.hpp
 examples/server/*.html.hpp
 examples/server/*.js.hpp
 examples/server/*.mjs.hpp
+examples/server/*.gz.hpp
 !build_64.sh
 !examples/*.bat
 !examples/*/*.kts

CONTRIBUTING.md

Lines changed: 2 additions & 0 deletions
@@ -1,10 +1,12 @@
 # Pull requests (for contributors)
 
+- llama.cpp uses the ggml tensor library for model evaluation. If you are unfamiliar with ggml, consider taking a look at the [examples in the ggml repository](https://github.com/ggml-org/ggml/tree/master/examples/). [simple](https://github.com/ggml-org/ggml/tree/master/examples/simple) shows the bare minimum for using ggml. [gpt-2](https://github.com/ggml-org/ggml/tree/master/examples/gpt-2) has minimal implementations for language model inference using GPT-2. [mnist](https://github.com/ggml-org/ggml/tree/master/examples/mnist) demonstrates how to train and evaluate a simple image classifier
 - Test your changes:
   - Execute [the full CI locally on your machine](ci/README.md) before publishing
   - Verify that the perplexity and the performance are not affected negatively by your changes (use `llama-perplexity` and `llama-bench`)
   - If you modified the `ggml` source, run the `test-backend-ops` tool to check whether different backend implementations of the `ggml` operators produce consistent results (this requires access to at least two different `ggml` backends)
   - If you modified a `ggml` operator or added a new one, add the corresponding test cases to `test-backend-ops`
+- Create separate PRs for each feature or fix. Avoid combining unrelated changes in a single PR
 - Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
 - If your PR becomes stale, don't hesitate to ping the maintainers in the comments

Makefile

Lines changed: 2 additions & 2 deletions
@@ -847,7 +847,7 @@ ifdef GGML_MUSA
     CXX := $(MUSA_PATH)/bin/clang++
     MCC := $(CCACHE) $(MUSA_PATH)/bin/mcc
 
-    MUSAFLAGS = -x musa -mtgpu
+    MUSAFLAGS = -fsigned-char -x musa -mtgpu
     MUSAFLAGS += $(foreach arch,$(subst ;, ,$(MUSA_ARCHITECTURES)),--cuda-gpu-arch=mp_$(arch))
 
 ifdef GGML_CUDA_FORCE_MMQ
@@ -1364,7 +1364,7 @@ llama-server: \
 	examples/server/index.html.hpp \
 	examples/server/loading.html.hpp \
 	common/chat.cpp \
-	common/chat.hpp \
+	common/chat.h \
 	common/chat-template.hpp \
 	common/json.hpp \
 	common/minja.hpp \
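The new -fsigned-char flag forces plain char to be a signed type. Signedness of char is implementation-defined in C and C++ and varies by target (it is typically unsigned on ARM, for instance), so pinning it keeps code that stores negative values in a plain char behaving consistently under the MUSA toolchain; this motivation is our reading, not stated in the diff. A minimal illustration of the difference:

    #include <cstdio>

    int main() {
        char c = 0xFF;           // -1 if char is signed, 255 if unsigned
        printf("%d\n", (int)c);  // prints -1 when built with -fsigned-char
        return 0;
    }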

common/CMakeLists.txt

Lines changed: 3 additions & 3 deletions
@@ -57,8 +57,7 @@ add_library(${TARGET} STATIC
     arg.h
     base64.hpp
     chat.cpp
-    chat.hpp
-    chat-template.hpp
+    chat.h
     common.cpp
     common.h
     console.cpp
@@ -68,7 +67,8 @@ add_library(${TARGET} STATIC
     llguidance.cpp
     log.cpp
     log.h
-    minja.hpp
+    minja/chat-template.hpp
+    minja/minja.hpp
     ngram-cache.cpp
     ngram-cache.h
     sampling.cpp

common/arg.cpp

Lines changed: 49 additions & 0 deletions
@@ -2,6 +2,7 @@
 
 #include "log.h"
 #include "sampling.h"
+#include "chat.h"
 
 #include <algorithm>
 #include <climits>
@@ -2501,5 +2502,53 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
 
+    add_opt(common_arg(
+        {"--fim-qwen-1.5b-default"},
+        string_format("use default Qwen 2.5 Coder 1.5B (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--fim-qwen-3b-default"},
+        string_format("use default Qwen 2.5 Coder 3B (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--fim-qwen-7b-default"},
+        string_format("use default Qwen 2.5 Coder 7B (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
     return ctx_arg;
 }
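All three presets are registered with .set_examples({LLAMA_EXAMPLE_SERVER}), so they are llama-server flags. For example (assuming network access for the initial weight download):

    llama-server --fim-qwen-1.5b-default

fetches the Qwen 2.5 Coder 1.5B Q8_0 GGUF from the ggml-org Hugging Face repo and serves it on port 8012 with settings suited to FIM (fill-in-the-middle) completion; n_ctx = 0 presumably means "use the model's trained context length" here.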
