Commit e5be53a

Merge branch 'ggml-org:master' into master
2 parents: aa26baf + b3c9a65

99 files changed: +16188 / -12853 lines

.github/workflows/build.yml

Lines changed: 2 additions & 2 deletions
@@ -774,7 +774,7 @@ jobs:
     env:
       OPENBLAS_VERSION: 0.3.23
       SDE_VERSION: 9.33.0-2024-01-07
-      VULKAN_VERSION: 1.3.261.1
+      VULKAN_VERSION: 1.4.304.1

     strategy:
       matrix:

@@ -1379,7 +1379,7 @@ jobs:
       id: pack_artifacts
       if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
       run: |
-        zip -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
+        zip --symlinks -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework

     - name: Upload artifacts
       if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}

AUTHORS

Lines changed: 60 additions & 1 deletion
Large diff not rendered.

Makefile

Lines changed: 1 addition & 1 deletion
@@ -836,7 +836,7 @@ ifdef GGML_MUSA
 else
     MUSA_PATH ?= /opt/musa
 endif
-MUSA_ARCHITECTURES ?= 21;22
+MUSA_ARCHITECTURES ?= 21;22;31

 MK_CPPFLAGS += -DGGML_USE_MUSA -DGGML_USE_CUDA
 MK_LDFLAGS += -L$(MUSA_PATH)/lib -Wl,-rpath=$(MUSA_PATH)/lib

README.md

Lines changed: 1 addition & 0 deletions
@@ -172,6 +172,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
 - [iohub/collama](https://github.com/iohub/coLLaMA) (Apache-2.0)
 - [janhq/jan](https://github.com/janhq/jan) (AGPL)
+- [johnbean393/Sidekick](https://github.com/johnbean393/Sidekick) (MIT)
 - [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file) (Apache-2.0)
 - [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
 - [llama.vim](https://github.com/ggml-org/llama.vim) (MIT)

common/arg.cpp

Lines changed: 59 additions & 10 deletions
@@ -764,7 +764,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_env("LLAMA_ARG_CTX_SIZE"));
     add_opt(common_arg(
         {"-n", "--predict", "--n-predict"}, "N",
-        string_format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict),
+        string_format(
+            ex == LLAMA_EXAMPLE_MAIN || ex == LLAMA_EXAMPLE_INFILL
+                ? "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)"
+                : "number of tokens to predict (default: %d, -1 = infinity)",
+            params.n_predict),
         [](common_params & params, int value) {
             params.n_predict = value;
         }
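The help string now varies by example: the -2 sentinel (generate until the context is filled) is advertised only for the main and infill examples. A minimal standalone sketch of the same pattern, selecting the format string with a ternary before formatting (the enum and names below are stand-ins, not llama.cpp's actual types):

#include <cstdio>

// Stand-in for llama.cpp's llama_example enum (assumed, simplified).
enum example_kind { EXAMPLE_MAIN, EXAMPLE_INFILL, EXAMPLE_SERVER };

int main() {
    example_kind ex = EXAMPLE_SERVER;
    int n_predict = -1;
    // Select the help text first, then format it -- the same shape as the diff.
    const char * fmt = (ex == EXAMPLE_MAIN || ex == EXAMPLE_INFILL)
        ? "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n"
        : "number of tokens to predict (default: %d, -1 = infinity)\n";
    std::printf(fmt, n_predict);
    return 0;
}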
@@ -849,6 +853,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
         }
     ).set_excludes({LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"-sysf", "--system-prompt-file"}, "FNAME",
+        "a file containing the system prompt (default: none)",
+        [](common_params & params, const std::string & value) {
+            std::ifstream file(value);
+            if (!file) {
+                throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
+            }
+            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.system_prompt));
+            if (!params.system_prompt.empty() && params.system_prompt.back() == '\n') {
+                params.system_prompt.pop_back();
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(common_arg(
         {"--in-file"}, "FNAME",
         "an input file (repeat to specify multiple files)",
@@ -1867,18 +1885,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_PASSKEY}));
     add_opt(common_arg(
         {"-o", "--output", "--output-file"}, "FNAME",
-        string_format("output file (default: '%s')",
-            ex == LLAMA_EXAMPLE_EXPORT_LORA
-                ? params.lora_outfile.c_str()
-                : ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR
-                ? params.cvector_outfile.c_str()
-                : params.out_file.c_str()),
+        string_format("output file (default: '%s')", params.out_file.c_str()),
         [](common_params & params, const std::string & value) {
             params.out_file = value;
-            params.cvector_outfile = value;
-            params.lora_outfile = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA}));
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS}));
     add_opt(common_arg(
         {"-ofreq", "--output-frequency"}, "N",
         string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
@@ -2571,5 +2582,43 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}));

+    add_opt(common_arg(
+        {"--fim-qwen-7b-spec"},
+        string_format("use Qwen 2.5 Coder 7B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
+            params.speculative.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
+            params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
+            params.speculative.n_gpu_layers = 99;
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--fim-qwen-14b-spec"},
+        string_format("use Qwen 2.5 Coder 14B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
+            params.speculative.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
+            params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
+            params.speculative.n_gpu_layers = 99;
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
     return ctx_arg;
 }
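The two --fim-qwen-*-spec presets differ only in the main model they select; both pair it with the same 0.5B draft model and identical server tuning. A hypothetical sketch of that shared shape (the commit itself registers two independent lambdas; the stand-in structs below cover only the fields the presets touch, with placeholder defaults):

#include <string>

// Assumed stand-ins -- the real common_params in llama.cpp has many more fields.
struct speculative_params {
    std::string hf_repo, hf_file;
    int n_gpu_layers = 0;
};

struct preset_params {
    std::string hf_repo, hf_file;
    speculative_params speculative;
    int  port = 0, n_gpu_layers = 0, n_ubatch = 0, n_batch = 0, n_ctx = 0, n_cache_reuse = 0;
    bool flash_attn = false;
};

// Shared body of --fim-qwen-7b-spec and --fim-qwen-14b-spec: only main_repo
// and main_file differ between the two presets.
static void apply_fim_qwen_spec(preset_params & p,
                                const std::string & main_repo,
                                const std::string & main_file) {
    p.hf_repo = main_repo;
    p.hf_file = main_file;
    p.speculative.hf_repo      = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
    p.speculative.hf_file      = "qwen2.5-coder-0.5b-q8_0.gguf";
    p.speculative.n_gpu_layers = 99;   // offload the whole draft model
    p.port          = 8012;
    p.n_gpu_layers  = 99;              // offload the whole main model
    p.flash_attn    = true;
    p.n_ubatch      = 1024;
    p.n_batch       = 1024;
    p.n_ctx         = 0;               // 0 = take the context size from the model
    p.n_cache_reuse = 256;
}

For example, the 7B preset corresponds to apply_fim_qwen_spec(params, "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF", "qwen2.5-coder-7b-q8_0.gguf").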
