Skip to content

Commit 1d5726a

Browse files
committed
Merge remote-tracking branch 'upstream/master' into fixes
2 parents cddda7e + daf2dd7 commit 1d5726a

File tree

144 files changed

+71030
-32784
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

144 files changed

+71030
-32784
lines changed

.devops/musa.Dockerfile

Lines changed: 3 additions & 3 deletions
Original file line number · Diff line number · Diff line change
@@ -1,10 +1,10 @@
11
ARG UBUNTU_VERSION=22.04
22
# This needs to generally match the container host's environment.
3-
ARG MUSA_VERSION=rc4.0.1
3+
ARG MUSA_VERSION=rc4.2.0
44
# Target the MUSA build image
5-
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-devel-ubuntu${UBUNTU_VERSION}
5+
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64
66

7-
ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-runtime-ubuntu${UBUNTU_VERSION}
7+
ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64
88

99
FROM ${BASE_MUSA_DEV_CONTAINER} AS build
1010

.devops/rocm.Dockerfile

Lines changed: 2 additions & 2 deletions
Original file line number · Diff line number · Diff line change
@@ -1,8 +1,8 @@
11
ARG UBUNTU_VERSION=24.04
22

33
# This needs to generally match the container host's environment.
4-
ARG ROCM_VERSION=6.3
5-
ARG AMDGPU_VERSION=6.3
4+
ARG ROCM_VERSION=6.4
5+
ARG AMDGPU_VERSION=6.4
66

77
# Target the CUDA build image
88
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete

.github/workflows/build.yml

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -515,7 +515,7 @@ jobs:
515515
516516
ubuntu-22-cmake-musa:
517517
runs-on: ubuntu-22.04
518-
container: mthreads/musa:rc4.0.1-mudnn-devel-ubuntu22.04
518+
container: mthreads/musa:rc4.2.0-devel-ubuntu22.04-amd64
519519

520520
steps:
521521
- name: Clone

.github/workflows/close-issue.yml

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -17,7 +17,7 @@ jobs:
1717
steps:
1818
- uses: actions/stale@v5
1919
with:
20-
exempt-issue-labels: "refactor,help wanted,good first issue,research,bug,roadmap"
20+
exempt-issue-labels: "refactoring,help wanted,good first issue,research,bug,roadmap"
2121
days-before-issue-stale: 30
2222
days-before-issue-close: 14
2323
stale-issue-label: "stale"

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -82,6 +82,7 @@ models/*
8282
models-mnt
8383
!models/.editorconfig
8484
!models/ggml-vocab-*.gguf*
85+
!models/templates
8586

8687
# Zig
8788
zig-out/

ci/README.md

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -54,7 +54,7 @@ docker run --privileged -it \
5454
-v $HOME/llama.cpp/ci-cache:/ci-cache \
5555
-v $HOME/llama.cpp/ci-results:/ci-results \
5656
-v $PWD:/ws -w /ws \
57-
mthreads/musa:rc4.0.1-mudnn-devel-ubuntu22.04
57+
mthreads/musa:rc4.2.0-devel-ubuntu22.04-amd64
5858
```
5959

6060
Inside the container, execute the following commands:

common/arg.cpp

Lines changed: 51 additions & 7 deletions
Original file line number · Diff line number · Diff line change
@@ -977,6 +977,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
977977
for (auto & seq_breaker : params.sampling.dry_sequence_breakers) {
978978
string_process_escapes(seq_breaker);
979979
}
980+
for (auto & pair : params.speculative.replacements) {
981+
string_process_escapes(pair.first);
982+
string_process_escapes(pair.second);
983+
}
980984
}
981985

982986
if (!params.kv_overrides.empty()) {
@@ -2091,6 +2095,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
20912095
params.no_kv_offload = true;
20922096
}
20932097
).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
2098+
add_opt(common_arg(
2099+
{"-nr", "--no-repack"},
2100+
"disable weight repacking",
2101+
[](common_params & params) {
2102+
params.no_extra_bufts = true;
2103+
}
2104+
).set_env("LLAMA_ARG_NO_REPACK"));
20942105
add_opt(common_arg(
20952106
{"-ctk", "--cache-type-k"}, "TYPE",
20962107
string_format(
@@ -2369,6 +2380,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
23692380
}
23702381
}
23712382
));
2383+
add_opt(common_arg(
2384+
{"--cpu-moe"},
2385+
"use CPU for Mixture of Experts (MoE) weights",
2386+
[](common_params & params) {
2387+
params.tensor_buft_overrides.push_back({"\\.ffn_up_exps\\.weight$", ggml_backend_cpu_buffer_type()});
2388+
params.tensor_buft_overrides.push_back({"\\.ffn_down_exps\\.weight$", ggml_backend_cpu_buffer_type()});
2389+
params.tensor_buft_overrides.push_back({"\\.ffn_gate_exps\\.weight$", ggml_backend_cpu_buffer_type()});
2390+
}
2391+
).set_env("LLAMA_ARG_CPU_MOE"));
23722392
add_opt(common_arg(
23732393
{"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
23742394
"number of layers to store in VRAM",
@@ -3249,6 +3269,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
32493269
params.speculative.model.path = value;
32503270
}
32513271
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
3272+
add_opt(common_arg(
3273+
{"--spec-replace"}, "TARGET", "DRAFT",
3274+
"translate the string in TARGET into DRAFT if the draft model and main model are not compatible",
3275+
[](common_params & params, const std::string & tgt, const std::string & dft) {
3276+
params.speculative.replacements.push_back({ tgt, dft });
3277+
}
3278+
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
32523279
add_opt(common_arg(
32533280
{"-ctkd", "--cache-type-k-draft"}, "TYPE",
32543281
string_format(
@@ -3438,34 +3465,51 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
34383465
}
34393466
).set_examples({LLAMA_EXAMPLE_SERVER}));
34403467

3441-
// diffusion parameters
34423468
add_opt(common_arg(
34433469
{ "--diffusion-steps" }, "N",
34443470
string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
34453471
[](common_params & params, int value) { params.diffusion.steps = value; }
34463472
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
3473+
add_opt(common_arg(
3474+
{ "--diffusion-visual" },
3475+
string_format("enable visual diffusion mode (show progressive generation) (default: %s)",
3476+
params.diffusion.visual_mode ? "true" : "false"),
3477+
[](common_params & params) { params.diffusion.visual_mode = true; }
3478+
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
3479+
34473480
add_opt(common_arg(
34483481
{ "--diffusion-eps" }, "F",
34493482
string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
34503483
[](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
34513484
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
34523485
add_opt(common_arg(
34533486
{ "--diffusion-algorithm" }, "N",
3454-
string_format("diffusion algorithm: 0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY (default: %d)",
3487+
string_format("diffusion algorithm: 0=ORIGIN, 1=ENTROPY_BASED, 2=MARGIN_BASED, 3=RANDOM, 4=LOW_CONFIDENCE (default: %d)",
34553488
params.diffusion.algorithm),
34563489
[](common_params & params, int value) { params.diffusion.algorithm = value; }
34573490
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
34583491
add_opt(common_arg(
34593492
{ "--diffusion-alg-temp" }, "F",
3460-
string_format("algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
3493+
string_format("dream algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
34613494
[](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
34623495
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
3496+
34633497
add_opt(common_arg(
3464-
{ "--diffusion-visual" },
3465-
string_format("enable visual diffusion mode (show progressive generation) (default: %s)",
3466-
params.diffusion.visual_mode ? "true" : "false"),
3467-
[](common_params & params) { params.diffusion.visual_mode = true; }
3498+
{ "--diffusion-block-length" }, "N",
3499+
string_format("llada block length for generation (default: %d)", params.diffusion.block_length),
3500+
[](common_params & params, int value) { params.diffusion.block_length = value; }
3501+
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
3502+
add_opt(common_arg(
3503+
{ "--diffusion-cfg-scale" }, "F",
3504+
string_format("llada classifier-free guidance scale (default: %.3f)", (double) params.diffusion.cfg_scale),
3505+
[](common_params & params, const std::string & value) { params.diffusion.cfg_scale = std::stof(value); }
34683506
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
3507+
add_opt(common_arg(
3508+
{ "--diffusion-add-gumbel-noise" }, "F",
3509+
string_format("add gumbel noise to the logits if temp > 0.0 (default: %s)", params.diffusion.add_gumbel_noise ? "true" : "false"),
3510+
[](common_params & params, const std::string & value) { params.diffusion.add_gumbel_noise = std::stof(value); }
3511+
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
3512+
34693513

34703514
return ctx_arg;
34713515
}

common/chat.cpp

Lines changed: 3 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -1944,6 +1944,8 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, co
19441944
}
19451945
}
19461946
auto msg = builder.result();
1947-
LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat<json>({msg}).at(0).dump().c_str());
1947+
if (!is_partial) {
1948+
LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat<json>({msg}).at(0).dump().c_str());
1949+
}
19481950
return msg;
19491951
}

common/common.cpp

Lines changed: 1 addition & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -1122,6 +1122,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
11221122
mparams.use_mmap = params.use_mmap;
11231123
mparams.use_mlock = params.use_mlock;
11241124
mparams.check_tensors = params.check_tensors;
1125+
mparams.use_extra_bufts = !params.no_extra_bufts;
11251126

11261127
if (params.kv_overrides.empty()) {
11271128
mparams.kv_overrides = NULL;

common/common.h

Lines changed: 13 additions & 5 deletions
Original file line number · Diff line number · Diff line change
@@ -201,6 +201,7 @@ struct common_params_speculative {
201201
int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
202202
float p_split = 0.1f; // speculative decoding split probability
203203
float p_min = 0.75f; // minimum speculative decoding probability (greedy)
204+
std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
204205

205206
ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
206207
ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
@@ -220,11 +221,17 @@ struct common_params_vocoder {
220221
};
221222

222223
struct common_params_diffusion {
223-
int32_t steps = 64; // number of diffusion steps
224-
float eps = 1e-3f; // epsilon for timesteps
225-
int32_t algorithm = 0; // diffusion algorithm (0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY)
226-
float alg_temp = 0.0f; // algorithm temperature
227-
bool visual_mode = false; // show progressive diffusion on screen
224+
int32_t steps = 128;
225+
bool visual_mode = false;
226+
227+
float eps = 0; // epsilon for timesteps
228+
int32_t block_length = 0; // block length for generation
229+
230+
int32_t algorithm = 4; // default algorithm: low-confidence
231+
float alg_temp = 0.0f; // algorithm temperature
232+
233+
float cfg_scale = 0; // classifier-free guidance scale
234+
bool add_gumbel_noise = false; // add gumbel noise to the logits if temp > 0.0
228235
};
229236

230237
enum common_reasoning_format {
@@ -352,6 +359,7 @@ struct common_params {
352359
bool warmup = true; // warmup run
353360
bool check_tensors = false; // validate tensor data
354361
bool no_op_offload = false; // globally disable offload host tensor operations to device
362+
bool no_extra_bufts = false; // disable extra buffer types (used for weight repacking)
355363

356364
bool single_turn = false; // single turn chat conversation
357365

0 commit comments

Comments (0)