LostRuins
diff --git a/.clang-format b/.clang-format
Lines changed: 7 additions & 0 deletions
diff --git a/common/arg.cpp b/common/arg.cpp
Lines changed: 5 additions & 5 deletions
diff --git a/common/common.h b/common/common.h
Lines changed: 17 additions & 3 deletions
diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
Lines changed: 7 additions & 6 deletions
@@ -22,6 +22,13 @@ AllowShortIfStatementsOnASingleLine: Never
 AllowShortLambdasOnASingleLine: Inline
 AllowShortLoopsOnASingleLine: false
 AlwaysBreakBeforeMultilineStrings: true
+# Treat CUDA keywords/attributes as "attribute macros" and avoid breaking lines inside them
+AttributeMacros:
+  - __host__
+  - __device__
+  - __global__
+  - __forceinline__
+  - __launch_bounds__
 BinPackArguments: true
 BinPackParameters: false # OnePerLine
 BitFieldColonSpacing: Both
 
@@ -1706,7 +1706,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.system_prompt = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION}));
     add_opt(common_arg(
         {"--no-perf"},
         string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
@@ -2550,7 +2550,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--cpu-moe", "-cmoe"},
         "keep all Mixture of Experts (MoE) weights in the CPU",
         [](common_params & params) {
-            params.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
+            params.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
         }
     ).set_env("LLAMA_ARG_CPU_MOE"));
     add_opt(common_arg(
@@ -2563,7 +2563,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             for (int i = 0; i < value; ++i) {
                 // keep strings alive and avoid leaking memory by storing them in a static vector
                 static std::list<std::string> buft_overrides;
-                buft_overrides.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
+                buft_overrides.push_back(llm_ffn_exps_block_regex(i));
                 params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), ggml_backend_cpu_buffer_type()});
             }
         }
@@ -2572,7 +2572,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--cpu-moe-draft", "-cmoed"},
         "keep all Mixture of Experts (MoE) weights in the CPU for the draft model",
         [](common_params & params) {
-            params.speculative.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
+            params.speculative.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
     add_opt(common_arg(
@@ -2584,7 +2584,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
             for (int i = 0; i < value; ++i) {
                 static std::list<std::string> buft_overrides_draft;
-                buft_overrides_draft.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
+                buft_overrides_draft.push_back(llm_ffn_exps_block_regex(i));
                 params.speculative.tensor_buft_overrides.push_back({buft_overrides_draft.back().c_str(), ggml_backend_cpu_buffer_type()});
             }
         }
 
@@ -284,9 +284,9 @@ struct common_params {
     float   rope_freq_base        =  0.0f; // RoPE base frequency
     float   rope_freq_scale       =  0.0f; // RoPE frequency scaling factor
     float   yarn_ext_factor       = -1.0f; // YaRN extrapolation mix factor
-    float   yarn_attn_factor      =  1.0f; // YaRN magnitude scaling factor
-    float   yarn_beta_fast        = 32.0f; // YaRN low correction dim
-    float   yarn_beta_slow        =  1.0f; // YaRN high correction dim
+    float   yarn_attn_factor      = -1.0f; // YaRN magnitude scaling factor
+    float   yarn_beta_fast        = -1.0f; // YaRN low correction dim
+    float   yarn_beta_slow        = -1.0f; // YaRN high correction dim
     int32_t yarn_orig_ctx         =     0; // YaRN original context length
 
     // offload params
@@ -730,6 +730,27 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
 
 }
 
+//
+// MoE utils
+//
+
+// regex fragment matching the FFN expert tensors (up/down/gate) of a MoE model
+const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_exps";
+
+// build the regex that matches the FFN expert tensors of block `idx` only,
+// e.g. idx = 3 -> "blk\.3\.ffn_(up|down|gate)_exps"
+// `static inline` (not plain `static`): this is a header, so a non-inline static
+// would trigger -Wunused-function in every TU that includes it without calling it
+static inline std::string llm_ffn_exps_block_regex(int idx) {
+    return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX);
+}
+
+// buffer-type override that pairs LLM_FFN_EXPS_REGEX with the CPU buffer type
+// note: the pattern is passed as a raw pointer to the constant above, no string is allocated here
+static inline llama_model_tensor_buft_override llm_ffn_exps_cpu_override() {
+    return { LLM_FFN_EXPS_REGEX, ggml_backend_cpu_buffer_type() };
+}
+
 //
 // training utils
 //
 
@@ -257,12 +257,19 @@ std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
 };
 
 static bool is_reserved_name(const std::string & name) {
-    static std::unordered_set<std::string> RESERVED_NAMES;
-    if (RESERVED_NAMES.empty()) {
-        RESERVED_NAMES.insert("root");
-        for (const auto &p : PRIMITIVE_RULES) RESERVED_NAMES.insert(p.first);
-        for (const auto &p : STRING_FORMAT_RULES) RESERVED_NAMES.insert(p.first);
-    }
+    // "root" plus the name of every primitive and string-format rule; the set is
+    // filled inside its own initializer, so it is built once and is const afterwards
+    static const std::unordered_set<std::string> RESERVED_NAMES = [] {
+        std::unordered_set<std::string> names;
+        names.insert("root");
+        for (const auto & rule : PRIMITIVE_RULES) {
+            names.insert(rule.first);
+        }
+        for (const auto & rule : STRING_FORMAT_RULES) {
+            names.insert(rule.first);
+        }
+        return names;
+    }();
     return RESERVED_NAMES.find(name) != RESERVED_NAMES.end();
 }
Original file line number	Diff line number	Diff line change
`@@ -1706,7 +1706,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex`
`1706`	`1706`	`[](common_params & params, const std::string & value) {`
`1707`	`1707`	`params.system_prompt = value;`
`1708`	`1708`	`}`
`1709`		`- ).set_examples({LLAMA_EXAMPLE_MAIN}));`
	`1709`	`+ ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION}));`
`1710`	`1710`	`add_opt(common_arg(`
`1711`	`1711`	`{"--no-perf"},`
`1712`	`1712`	`string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),`
`@@ -2550,7 +2550,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex`
`2550`	`2550`	`{"--cpu-moe", "-cmoe"},`
`2551`	`2551`	`"keep all Mixture of Experts (MoE) weights in the CPU",`
`2552`	`2552`	`[](common_params & params) {`
`2553`		`- params.tensor_buft_overrides.push_back({"\\.ffn_(up\|down\|gate)_exps", ggml_backend_cpu_buffer_type()});`
	`2553`	`+ params.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());`
`2554`	`2554`	`}`
`2555`	`2555`	`).set_env("LLAMA_ARG_CPU_MOE"));`
`2556`	`2556`	`add_opt(common_arg(`
`@@ -2563,7 +2563,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex`
`2563`	`2563`	`for (int i = 0; i < value; ++i) {`
`2564`	`2564`	`// keep strings alive and avoid leaking memory by storing them in a static vector`
`2565`	`2565`	`static std::list<std::string> buft_overrides;`
`2566`		`- buft_overrides.push_back(string_format("blk\\.%d\\.ffn_(up\|down\|gate)_exps", i));`
	`2566`	`+ buft_overrides.push_back(llm_ffn_exps_block_regex(i));`
`2567`	`2567`	`params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), ggml_backend_cpu_buffer_type()});`
`2568`	`2568`	`}`
`2569`	`2569`	`}`
`@@ -2572,7 +2572,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex`
`2572`	`2572`	`{"--cpu-moe-draft", "-cmoed"},`
`2573`	`2573`	`"keep all Mixture of Experts (MoE) weights in the CPU for the draft model",`
`2574`	`2574`	`[](common_params & params) {`
`2575`		`- params.speculative.tensor_buft_overrides.push_back({"\\.ffn_(up\|down\|gate)_exps", ggml_backend_cpu_buffer_type()});`
	`2575`	`+ params.speculative.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());`
`2576`	`2576`	`}`
`2577`	`2577`	`).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));`
`2578`	`2578`	`add_opt(common_arg(`
`@@ -2584,7 +2584,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex`
`2584`	`2584`	`}`
`2585`	`2585`	`for (int i = 0; i < value; ++i) {`
`2586`	`2586`	`static std::list<std::string> buft_overrides_draft;`
`2587`		`- buft_overrides_draft.push_back(string_format("blk\\.%d\\.ffn_(up\|down\|gate)_exps", i));`
	`2587`	`+ buft_overrides_draft.push_back(llm_ffn_exps_block_regex(i));`
`2588`	`2588`	`params.speculative.tensor_buft_overrides.push_back({buft_overrides_draft.back().c_str(), ggml_backend_cpu_buffer_type()});`
`2589`	`2589`	`}`
`2590`	`2590`	`}`