Skip to content

Commit 3e72aaf

Browse files
committed
Merge commit '8f8f2274ee3601fecf6e2d57b52f701c81bede21' into concedo_experimental
# Conflicts:
#   .devops/rocm.Dockerfile
#   .github/workflows/build.yml
#   .github/workflows/release.yml
#   CMakeLists.txt
#   examples/simple/simple.cpp
#   ggml/src/ggml-cann/common.h
#   ggml/src/ggml-cann/ggml-cann.cpp
#   ggml/src/ggml-opencl/kernels/tsembd.cl
#   ggml/src/ggml-sycl/binbcast.cpp
#   ggml/src/ggml-sycl/binbcast.hpp
#   ggml/src/ggml-sycl/ggml-sycl.cpp
#   ggml/src/ggml-sycl/tsembd.cpp
#   ggml/src/ggml-zdnn/ggml-zdnn.cpp
#   src/llama-model.cpp
#   tools/batched-bench/CMakeLists.txt
#   tools/cvector-generator/CMakeLists.txt
#   tools/export-lora/CMakeLists.txt
#   tools/gguf-split/CMakeLists.txt
#   tools/imatrix/CMakeLists.txt
#   tools/llama-bench/CMakeLists.txt
#   tools/llama-bench/llama-bench.cpp
#   tools/main/CMakeLists.txt
#   tools/main/README.md
#   tools/mtmd/CMakeLists.txt
#   tools/perplexity/CMakeLists.txt
#   tools/perplexity/perplexity.cpp
#   tools/quantize/CMakeLists.txt
#   tools/rpc/rpc-server.cpp
#   tools/run/CMakeLists.txt
#   tools/run/run.cpp
#   tools/tokenize/CMakeLists.txt
#   tools/tts/CMakeLists.txt
2 parents 04498a3 + 8f8f227 commit 3e72aaf

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

41 files changed

+1986
-1451
lines changed

.clang-format

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,13 @@ AllowShortIfStatementsOnASingleLine: Never
2222
AllowShortLambdasOnASingleLine: Inline
2323
AllowShortLoopsOnASingleLine: false
2424
AlwaysBreakBeforeMultilineStrings: true
25+
# Treat CUDA keywords/attributes as "attribute macros" and avoid breaking lines inside them
26+
AttributeMacros:
27+
- __host__
28+
- __device__
29+
- __global__
30+
- __forceinline__
31+
- __launch_bounds__
2532
BinPackArguments: true
2633
BinPackParameters: false # OnePerLine
2734
BitFieldColonSpacing: Both

common/arg.cpp

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1706,7 +1706,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
17061706
[](common_params & params, const std::string & value) {
17071707
params.system_prompt = value;
17081708
}
1709-
).set_examples({LLAMA_EXAMPLE_MAIN}));
1709+
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION}));
17101710
add_opt(common_arg(
17111711
{"--no-perf"},
17121712
string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
@@ -2550,7 +2550,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
25502550
{"--cpu-moe", "-cmoe"},
25512551
"keep all Mixture of Experts (MoE) weights in the CPU",
25522552
[](common_params & params) {
2553-
params.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
2553+
params.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
25542554
}
25552555
).set_env("LLAMA_ARG_CPU_MOE"));
25562556
add_opt(common_arg(
@@ -2563,7 +2563,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
25632563
for (int i = 0; i < value; ++i) {
25642564
// keep strings alive and avoid leaking memory by storing them in a static list (std::list never relocates its elements, so the c_str() pointers stay valid)
25652565
static std::list<std::string> buft_overrides;
2566-
buft_overrides.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
2566+
buft_overrides.push_back(llm_ffn_exps_block_regex(i));
25672567
params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), ggml_backend_cpu_buffer_type()});
25682568
}
25692569
}
@@ -2572,7 +2572,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
25722572
{"--cpu-moe-draft", "-cmoed"},
25732573
"keep all Mixture of Experts (MoE) weights in the CPU for the draft model",
25742574
[](common_params & params) {
2575-
params.speculative.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
2575+
params.speculative.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
25762576
}
25772577
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
25782578
add_opt(common_arg(
@@ -2584,7 +2584,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
25842584
}
25852585
for (int i = 0; i < value; ++i) {
25862586
static std::list<std::string> buft_overrides_draft;
2587-
buft_overrides_draft.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
2587+
buft_overrides_draft.push_back(llm_ffn_exps_block_regex(i));
25882588
params.speculative.tensor_buft_overrides.push_back({buft_overrides_draft.back().c_str(), ggml_backend_cpu_buffer_type()});
25892589
}
25902590
}

common/common.h

Lines changed: 17 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -284,9 +284,9 @@ struct common_params {
284284
float rope_freq_base = 0.0f; // RoPE base frequency
285285
float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
286286
float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
287-
float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
288-
float yarn_beta_fast = 32.0f; // YaRN low correction dim
289-
float yarn_beta_slow = 1.0f; // YaRN high correction dim
287+
float yarn_attn_factor = -1.0f; // YaRN magnitude scaling factor
288+
float yarn_beta_fast = -1.0f; // YaRN low correction dim
289+
float yarn_beta_slow = -1.0f; // YaRN high correction dim
290290
int32_t yarn_orig_ctx = 0; // YaRN original context length
291291

292292
// offload params
@@ -730,6 +730,20 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
730730

731731
}
732732

733+
//
734+
// MoE utils
735+
//
736+
737+
const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_exps";
738+
739+
static std::string llm_ffn_exps_block_regex(int idx) {
740+
return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX);
741+
}
742+
743+
static llama_model_tensor_buft_override llm_ffn_exps_cpu_override() {
744+
return { LLM_FFN_EXPS_REGEX, ggml_backend_cpu_buffer_type() };
745+
}
746+
733747
//
734748
// training utils
735749
//

common/json-schema-to-grammar.cpp

Lines changed: 7 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -257,12 +257,13 @@ std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
257257
};
258258

259259
static bool is_reserved_name(const std::string & name) {
260-
static std::unordered_set<std::string> RESERVED_NAMES;
261-
if (RESERVED_NAMES.empty()) {
262-
RESERVED_NAMES.insert("root");
263-
for (const auto &p : PRIMITIVE_RULES) RESERVED_NAMES.insert(p.first);
264-
for (const auto &p : STRING_FORMAT_RULES) RESERVED_NAMES.insert(p.first);
265-
}
260+
static const std::unordered_set<std::string> RESERVED_NAMES = [] {
261+
std::unordered_set<std::string> s;
262+
s.insert("root");
263+
for (const auto & p : PRIMITIVE_RULES) s.insert(p.first);
264+
for (const auto & p : STRING_FORMAT_RULES) s.insert(p.first);
265+
return s;
266+
}();
266267
return RESERVED_NAMES.find(name) != RESERVED_NAMES.end();
267268
}
268269

0 commit comments

Comments
 (0)