
Commit 1147172

Merge branch 'ggml-org:master' into fix_grammar_file_in_server
2 parents: f73ddae + 96e1280

29 files changed: +644 −527 lines

Makefile

Lines changed: 1 addition & 1 deletion
@@ -836,7 +836,7 @@ ifdef GGML_MUSA
 else
 MUSA_PATH ?= /opt/musa
 endif
-MUSA_ARCHITECTURES ?= 21;22
+MUSA_ARCHITECTURES ?= 21;22;31

 MK_CPPFLAGS += -DGGML_USE_MUSA -DGGML_USE_CUDA
 MK_LDFLAGS += -L$(MUSA_PATH)/lib -Wl,-rpath=$(MUSA_PATH)/lib
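
Since `MUSA_ARCHITECTURES` is assigned with `?=`, the new default of `21;22;31` can still be narrowed at build time. A minimal sketch, assuming the standard Make-based llama.cpp build and restricting codegen to the newly added capability only:

```bash
# Build with MUSA enabled; override the arch list to target only the
# newly added compute capability 3.1 (hypothetical narrowing).
make GGML_MUSA=1 MUSA_ARCHITECTURES="31" -j
```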

README.md

Lines changed: 1 addition & 0 deletions
@@ -172,6 +172,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
 - [iohub/collama](https://github.com/iohub/coLLaMA) (Apache-2.0)
 - [janhq/jan](https://github.com/janhq/jan) (AGPL)
+- [johnbean393/Sidekick](https://github.com/johnbean393/Sidekick) (MIT)
 - [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file) (Apache-2.0)
 - [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
 - [llama.vim](https://github.com/ggml-org/llama.vim) (MIT)

common/arg.cpp

Lines changed: 39 additions & 8 deletions
@@ -1867,16 +1867,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_PASSKEY}));
     add_opt(common_arg(
         {"-o", "--output", "--output-file"}, "FNAME",
-        string_format("output file (default: '%s')",
-            ex == LLAMA_EXAMPLE_EXPORT_LORA
-                ? params.lora_outfile.c_str()
-                : ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR
-                    ? params.cvector_outfile.c_str()
-                    : params.out_file.c_str()),
+        string_format("output file (default: '%s')", params.out_file.c_str()),
         [](common_params & params, const std::string & value) {
             params.out_file = value;
-            params.cvector_outfile = value;
-            params.lora_outfile = value;
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA}));
     add_opt(common_arg(
@@ -2571,5 +2564,43 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}));

+    add_opt(common_arg(
+        {"--fim-qwen-7b-spec"},
+        string_format("use Qwen 2.5 Coder 7B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
+            params.speculative.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
+            params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
+            params.speculative.n_gpu_layers = 99;
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--fim-qwen-14b-spec"},
+        string_format("use Qwen 2.5 Coder 14B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
+            params.speculative.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
+            params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
+            params.speculative.n_gpu_layers = 99;
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
     return ctx_arg;
 }
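
A hedged usage sketch for the new presets: each flag points llama-server at a Qwen 2.5 Coder main model plus the 0.5B draft for speculative fill-in-the-middle decoding, fetching weights from Hugging Face on first use and listening on the preset port 8012.

```bash
# Launch the server with the 7B FIM preset; weights are pulled from the
# ggml-org Hugging Face repos if not already cached locally.
./build/bin/llama-server --fim-qwen-7b-spec

# The preset sets params.port = 8012, so a quick liveness check would be:
curl http://localhost:8012/health
```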

common/chat.cpp

Lines changed: 163 additions & 146 deletions
Large diffs are not rendered by default.

common/chat.h

Lines changed: 1 addition & 0 deletions
@@ -53,6 +53,7 @@ enum common_chat_format {
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
     COMMON_CHAT_FORMAT_HERMES_2_PRO,
+    COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING,
     COMMON_CHAT_FORMAT_COMMAND_R7B,
     COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING,

common/common.h

Lines changed: 3 additions & 5 deletions
@@ -407,8 +407,6 @@ struct common_params {
     int32_t i_pos = -1; // position of the passkey in the junk text

     // imatrix params
-    std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file
-
     int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
     int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
     int32_t i_chunk = 0; // start processing from this chunk
@@ -420,16 +418,16 @@ struct common_params {
     int n_pca_batch = 100;
     int n_pca_iterations = 1000;
     dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
-    std::string cvector_outfile = "control_vector.gguf";
     std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
     std::string cvector_negative_file = "examples/cvector-generator/negative.txt";

     bool spm_infill = false; // suffix/prefix/middle pattern for infill

-    std::string lora_outfile = "ggml-lora-merged-f16.gguf";
-
     // batched-bench params
     bool batched_bench_output_jsonl = false;
+
+    // common params
+    std::string out_file; // output filename for all example programs
 };

 // call once at the start of a program if it uses libcommon

docs/build.md

Lines changed: 35 additions & 11 deletions
@@ -197,28 +197,52 @@ The following compilation options are also available to tweak performance:

 ## MUSA

-This provides GPU acceleration using the MUSA cores of your Moore Threads MTT GPU. Make sure to have the MUSA SDK installed. You can download it from here: [MUSA SDK](https://developer.mthreads.com/sdk/download/musa).
+This provides GPU acceleration using a Moore Threads GPU. Make sure to have the [MUSA SDK](https://developer.mthreads.com/musa/musa-sdk) installed.

-- Using `CMake`:
+#### Download directly from Moore Threads

-  ```bash
-  cmake -B build -DGGML_MUSA=ON
-  cmake --build build --config Release
-  ```
+You may find the official downloads here: [Moore Threads developer site](https://developer.mthreads.com/sdk/download/musa).

-  For static build:
+### Compilation

-  ```bash
+```bash
+cmake -B build -DGGML_MUSA=ON
+cmake --build build --config Release
+```
+
+#### Override Compute Capability Specifications
+
+By default, all supported compute capabilities are enabled. To customize this behavior, you can specify the `MUSA_ARCHITECTURES` option in the CMake command:
+
+```bash
+cmake -B build -DGGML_MUSA=ON -DMUSA_ARCHITECTURES="21"
+```
+
+This configuration enables only compute capability `2.1` (MTT S80) during compilation, which can help reduce compilation time.
+
+#### Compilation options
+
+Most of the compilation options available for CUDA should also be available for MUSA, though they haven't been thoroughly tested yet.
+
+- For static builds, add `-DBUILD_SHARED_LIBS=OFF` and `-DCMAKE_POSITION_INDEPENDENT_CODE=ON`:
+  ```
   cmake -B build -DGGML_MUSA=ON \
     -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON
   cmake --build build --config Release
   ```

-The environment variable [`MUSA_VISIBLE_DEVICES`](https://docs.mthreads.com/musa-sdk/musa-sdk-doc-online/programming_guide/Z%E9%99%84%E5%BD%95/) can be used to specify which GPU(s) will be used.
+### Runtime MUSA environmental variables

-The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted.
+You may set the [musa environmental variables](https://docs.mthreads.com/musa-sdk/musa-sdk-doc-online/programming_guide/Z%E9%99%84%E5%BD%95/) at runtime.

-Most of the compilation options available for CUDA should also be available for MUSA, though they haven't been thoroughly tested yet.
+```bash
+# Use `MUSA_VISIBLE_DEVICES` to hide the first compute device.
+MUSA_VISIBLE_DEVICES="-0" ./build/bin/llama-server --model /srv/models/llama.gguf
+```
+
+### Unified Memory
+
+The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted.

 ## HIP
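
The unified-memory toggle described in the new docs takes the same form as the `MUSA_VISIBLE_DEVICES` example; a sketch assuming a Linux host and the doc's own `/srv/models/llama.gguf` placeholder path:

```bash
# Allow spilling to system RAM instead of aborting when VRAM is exhausted (Linux only).
GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 ./build/bin/llama-cli -m /srv/models/llama.gguf -p "Hello"
```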

examples/cvector-generator/cvector-generator.cpp

Lines changed: 3 additions & 1 deletion
@@ -394,6 +394,8 @@ static int prepare_entries(common_params & params, train_context & ctx_train) {
 int main(int argc, char ** argv) {
     common_params params;

+    params.out_file = "control_vector.gguf";
+
     if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) {
         return 1;
     }
@@ -498,7 +500,7 @@ int main(int argc, char ** argv) {
     }

     // write output vectors to gguf
-    export_gguf(ctx_train.v_final, params.cvector_outfile, model_hint);
+    export_gguf(ctx_train.v_final, params.out_file, model_hint);

     llama_backend_free();
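
With this refactor, the `control_vector.gguf` default moves from `common_params` into the example's `main()`, and `-o` writes through `params.out_file`. A hedged invocation sketch (the model path is a placeholder):

```bash
# Default output is control_vector.gguf; -o overrides it via params.out_file.
./build/bin/llama-cvector-generator -m model-f16.gguf -o my-control-vector.gguf
```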

examples/export-lora/export-lora.cpp

Lines changed: 4 additions & 2 deletions
@@ -413,20 +413,22 @@ static void print_usage(int, char ** argv) {
 int main(int argc, char ** argv) {
     common_params params;

+    params.out_file = "ggml-lora-merged-f16.gguf";
+
     if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) {
         return 1;
     }

     g_verbose = (params.verbosity > 1);
     try {
-        lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.cpuparams.n_threads);
+        lora_merge_ctx ctx(params.model, params.lora_adapters, params.out_file, params.cpuparams.n_threads);
         ctx.run_merge();
     } catch (const std::exception & err) {
         fprintf(stderr, "%s\n", err.what());
         exit(EXIT_FAILURE);
     }

-    printf("done, output file is %s\n", params.lora_outfile.c_str());
+    printf("done, output file is %s\n", params.out_file.c_str());

     return 0;
 }
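
The same pattern applies here: the `ggml-lora-merged-f16.gguf` default is now set in `main()` before argument parsing, so `-o` transparently overrides it. A sketch with placeholder file names:

```bash
# Merge a LoRA adapter into a base model; omitting -o keeps the
# default ggml-lora-merged-f16.gguf assigned in main().
./build/bin/llama-export-lora -m base-f16.gguf --lora adapter-f16.gguf -o merged-f16.gguf
```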

examples/imatrix/imatrix.cpp

Lines changed: 2 additions & 3 deletions
@@ -206,9 +206,6 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *

 void IMatrixCollector::save_imatrix(int ncall) const {
     auto fname = m_params.out_file;
-    if (fname.empty()) {
-        fname = "imatrix.dat";
-    }

     if (ncall > 0) {
         fname += ".at_";
@@ -583,6 +580,8 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
 int main(int argc, char ** argv) {
     common_params params;

+    params.out_file = "imatrix.dat";
+
     params.n_ctx = 512;
     params.logits_all = true;
     params.escape = false;
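
And likewise for llama-imatrix, which also drops its `fname.empty()` fallback since `main()` now guarantees a default. A sketch (paths are placeholders):

```bash
# Default output is imatrix.dat, assigned in main(); -o overrides it.
./build/bin/llama-imatrix -m model-f16.gguf -f calibration.txt -o custom.imatrix
```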
