Commit 724763f

Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.devops/vulkan.Dockerfile
#	.github/workflows/build.yml
#	.github/workflows/server.yml
#	common/common.cpp
#	examples/batched/README.md
#	ggml/CMakeLists.txt
#	ggml/src/CMakeLists.txt
#	ggml/src/ggml-cann/ggml-cann.cpp
#	ggml/src/ggml-cpu/CMakeLists.txt
#	ggml/src/ggml-cpu/arch-fallback.h
#	ggml/src/ggml-opencl/ggml-opencl.cpp
#	scripts/sync-ggml.last
#	src/CMakeLists.txt
#	tests/test-backend-ops.cpp
#	tools/server/CMakeLists.txt

2 parents: df30473 + 877566d

55 files changed: +5,269 additions, −3,774 deletions. Large commits have some content hidden by default; only a subset of the changed files is shown below.

common/arg.cpp

Lines changed: 12 additions & 0 deletions
```diff
@@ -1234,6 +1234,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             const auto sampler_names = string_split<std::string>(value, ';');
             params.sampling.samplers = common_sampler_types_from_names(sampler_names, true);
+            params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS;
         }
     ).set_sparam());
     add_opt(common_arg(
@@ -1263,27 +1264,31 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.sampling.temp = std::stof(value);
             params.sampling.temp = std::max(params.sampling.temp, 0.0f);
+            params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TEMP;
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--top-k"}, "N",
         string_format("top-k sampling (default: %d, 0 = disabled)", params.sampling.top_k),
         [](common_params & params, int value) {
             params.sampling.top_k = value;
+            params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_K;
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--top-p"}, "N",
         string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sampling.top_p),
         [](common_params & params, const std::string & value) {
             params.sampling.top_p = std::stof(value);
+            params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_P;
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--min-p"}, "N",
         string_format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sampling.min_p),
         [](common_params & params, const std::string & value) {
             params.sampling.min_p = std::stof(value);
+            params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIN_P;
         }
     ).set_sparam());
     add_opt(common_arg(
@@ -1298,13 +1303,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability),
         [](common_params & params, const std::string & value) {
             params.sampling.xtc_probability = std::stof(value);
+            params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY;
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--xtc-threshold"}, "N",
         string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sampling.xtc_threshold),
         [](common_params & params, const std::string & value) {
             params.sampling.xtc_threshold = std::stof(value);
+            params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD;
         }
     ).set_sparam());
     add_opt(common_arg(
@@ -1323,13 +1330,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
             params.sampling.penalty_last_n = value;
             params.sampling.n_prev = std::max(params.sampling.n_prev, params.sampling.penalty_last_n);
+            params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_LAST_N;
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--repeat-penalty"}, "N",
         string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sampling.penalty_repeat),
         [](common_params & params, const std::string & value) {
             params.sampling.penalty_repeat = std::stof(value);
+            params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT;
         }
     ).set_sparam());
     add_opt(common_arg(
@@ -1427,20 +1436,23 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sampling.mirostat),
         [](common_params & params, int value) {
             params.sampling.mirostat = value;
+            params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT;
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--mirostat-lr"}, "N",
         string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sampling.mirostat_eta),
         [](common_params & params, const std::string & value) {
             params.sampling.mirostat_eta = std::stof(value);
+            params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA;
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--mirostat-ent"}, "N",
         string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sampling.mirostat_tau),
         [](common_params & params, const std::string & value) {
             params.sampling.mirostat_tau = std::stof(value);
+            params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU;
         }
     ).set_sparam());
     add_opt(common_arg(
```

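Each CLI handler above, besides storing its value, ORs one bit into `params.sampling.user_sampling_config` to record that the option was set explicitly by the user. Below is a minimal, self-contained sketch of that record-on-set pattern; the names are simplified stand-ins for the real enum in common/common.h, and the model-supplied value is hypothetical:

```cpp
#include <cstdint>
#include <cstdio>

// Simplified stand-ins for common_params_sampling_config (common/common.h).
enum sampling_config : uint64_t {
    CONFIG_TEMP  = 1 << 0,
    CONFIG_TOP_K = 1 << 1,
};

struct sampling_params {
    float    temp  = 0.8f;
    int32_t  top_k = 40;
    uint64_t user_sampling_config = 0; // bits set only by explicit CLI flags
};

int main() {
    sampling_params p;

    // What the --temp handler does: store the value and record the bit,
    // so a model-embedded default cannot overwrite it later.
    p.temp = 0.5f;
    p.user_sampling_config |= CONFIG_TEMP;

    // A model-supplied default is applied only while the bit is unset.
    if (!(p.user_sampling_config & CONFIG_TOP_K)) {
        p.top_k = 20; // hypothetical value read from model metadata
    }

    printf("temp=%.2f top_k=%d\n", p.temp, p.top_k); // temp=0.50 top_k=20
    return 0;
}
```
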
common/common.cpp

Lines changed: 55 additions & 0 deletions
```diff
@@ -14,6 +14,7 @@
 #include <nlohmann/json.hpp>
 #include "json-schema-to-grammar.cpp"
 #include "llama.h"
+#include "sampling.h"
 #include "chat.cpp"
 #include "ggml/src/ggml-opt.cpp" //dear god pls
 
@@ -957,6 +958,58 @@ std::vector<common_file_info> fs_list_files(const std::string & path) {
 // Model utils
 //
 
+static inline void common_init_sampler_from_model(
+        const llama_model * model,
+        common_params_sampling & sparams) {
+
+    const uint64_t config = sparams.user_sampling_config;
+
+    auto get_int32 = [&](const char * key, int32_t & dst, uint64_t user_config) {
+        if (config & user_config) return;
+
+        char buf[64] = {0};
+        if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
+            char * end = nullptr;
+            int32_t v = strtol(buf, &end, 10);
+            if (end && end != buf) dst = v;
+        }
+    };
+
+    auto get_float = [&](const char * key, float & dst, uint64_t user_config) {
+        if (config & user_config) return;
+
+        char buf[128] = {0};
+        if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
+            char * end = nullptr;
+            float v = strtof(buf, &end);
+            if (end && end != buf) dst = v;
+        }
+    };
+
+    // Sampling sequence
+    if (!(config & common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS)) {
+        char buf[512] = {0};
+        if (llama_model_meta_val_str(model, llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE), buf, sizeof(buf)) > 0) {
+            const std::vector<std::string> sampler_names = string_split<std::string>(std::string(buf), ';');
+            if (!sampler_names.empty()) {
+                sparams.samplers = common_sampler_types_from_names(sampler_names, true);
+            }
+        }
+    }
+
+    get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TOP_K), sparams.top_k, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_K);
+    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TOP_P), sparams.top_p, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_P);
+    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIN_P), sparams.min_p, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIN_P);
+    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY), sparams.xtc_probability, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY);
+    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD), sparams.xtc_threshold, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD);
+    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TEMP), sparams.temp, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TEMP);
+    get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N), sparams.penalty_last_n, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_LAST_N);
+    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT), sparams.penalty_repeat, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT);
+    get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT), sparams.mirostat, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT);
+    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU), sparams.mirostat_tau, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU);
+    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA), sparams.mirostat_eta, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA);
+}
+
 struct common_init_result common_init_from_params(common_params & params) {
     common_init_result iparams;
     auto mparams = common_model_params_to_llama(params);
@@ -968,6 +1021,8 @@ struct common_init_result common_init_from_params(common_params & params) {
         return iparams;
     }
 
+    common_init_sampler_from_model(model, params.sampling);
+
     const llama_vocab * vocab = llama_model_get_vocab(model);
 
     auto cparams = common_context_params_to_llama(params);
```

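The new `common_init_sampler_from_model` helper above reads sampler defaults from the model's GGUF metadata, but only for fields whose `user_sampling_config` bit is still clear, and only when the metadata string parses cleanly (the `end != buf` check). A reduced sketch of that parse-with-fallback behavior, with `fake_meta_val` as a stand-in for `llama_model_meta_val_str` and a made-up metadata key:

```cpp
#include <cstdio>
#include <cstdlib>
#include <cstring>

// fake_meta_val stands in for llama_model_meta_val_str: it copies the value
// for `key` into buf and returns its length, or a negative value if absent.
// "sampling.temp" is a made-up key; the real key strings come from
// llama_model_meta_key_str(...).
static int fake_meta_val(const char * key, char * buf, size_t n) {
    if (strcmp(key, "sampling.temp") == 0) {
        return snprintf(buf, n, "0.7");
    }
    return -1; // key not present in this model's metadata
}

// Same shape as the get_float lambda above: overwrite dst only when the
// metadata key exists and its value parses as a float.
static void get_float(const char * key, float & dst) {
    char buf[128] = {0};
    if (fake_meta_val(key, buf, sizeof(buf)) > 0) {
        char * end = nullptr;
        float v = strtof(buf, &end);
        if (end && end != buf) dst = v; // keep the default on a failed parse
    }
}

int main() {
    float temp  = 0.8f;  // compiled-in default
    float min_p = 0.05f; // compiled-in default

    get_float("sampling.temp",  temp);  // key found  -> temp becomes 0.7
    get_float("sampling.min_p", min_p); // key absent -> min_p stays 0.05

    printf("temp=%.2f min_p=%.2f\n", temp, min_p);
    return 0;
}
```

Because each getter returns early when the user bit is set (`if (config & user_config) return;`), CLI flags always take precedence over model-embedded defaults.
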
common/common.h

Lines changed: 18 additions & 0 deletions
```diff
@@ -136,6 +136,22 @@ struct common_grammar_trigger {
     llama_token token = LLAMA_TOKEN_NULL;
 };
 
+enum common_params_sampling_config : uint64_t {
+    COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS = 1 << 0,
+    COMMON_PARAMS_SAMPLING_CONFIG_TOP_K = 1 << 1,
+    COMMON_PARAMS_SAMPLING_CONFIG_TOP_P = 1 << 2,
+    COMMON_PARAMS_SAMPLING_CONFIG_MIN_P = 1 << 3,
+    COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY = 1 << 4,
+    COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD = 1 << 5,
+    COMMON_PARAMS_SAMPLING_CONFIG_TEMP = 1 << 6,
+    COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_LAST_N = 1 << 7,
+    COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT = 1 << 8,
+    COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT = 1 << 9,
+    COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU = 1 << 10,
+    COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA = 1 << 11,
+};
+
+
 // sampling parameters
 struct common_params_sampling {
     uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
@@ -168,6 +184,8 @@ struct common_params_sampling {
     bool no_perf = false; // disable performance metrics
     bool timing_per_token = false;
 
+    uint64_t user_sampling_config = 0; // bitfield to track user-specified samplers
+
     std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY
 
```

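The enum dedicates one bit per sampling option. One subtlety: the underlying type is `uint64_t`, but the shift constants are plain-int `1 << n`, which is safe for the 12 bits used here; flags past bit 30 would need `1ull << n`. A small sketch of how the bits combine and are tested (trimmed-down flag names, not the real enum):

```cpp
#include <cstdint>
#include <cstdio>

// Trimmed-down flag names, not the real enum.
enum sampling_config : uint64_t {
    CONFIG_SAMPLERS = 1 << 0,
    CONFIG_TOP_K    = 1 << 1,
    CONFIG_TOP_P    = 1 << 2,
    // ... in the real enum, one bit per option up to MIROSTAT_ETA = 1 << 11
};

int main() {
    uint64_t user = 0;

    user |= CONFIG_TOP_K; // user passed --top-k
    user |= CONFIG_TOP_P; // user passed --top-p

    // Model metadata may only fill in options whose bit is still clear.
    printf("top-k user-set:    %s\n", (user & CONFIG_TOP_K)    ? "yes" : "no"); // yes
    printf("samplers user-set: %s\n", (user & CONFIG_SAMPLERS) ? "yes" : "no"); // no
    return 0;
}
```
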
convert_hf_to_gguf.py

Lines changed: 16 additions & 1 deletion
```diff
@@ -565,7 +565,7 @@ def prepare_tensors(self):
                         gguf.MODEL_TENSOR.ALTUP_PREDICT_COEF,
                     )
                 )
-                or not new_name.endswith(".weight")
+                or new_name[-7:] not in (".weight", ".lora_a", ".lora_b")
             ):
                 data_qtype = gguf.GGMLQuantizationType.F32
 
@@ -4183,6 +4183,21 @@ def set_vocab(self):
         super().set_vocab()
 
 
+@ModelBase.register("RND1")
+class RND1Model(Qwen2MoeModel):
+    model_arch = gguf.MODEL_ARCH.RND1
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        # RND1 specific parameters
+        # RND1 uses bidirectional attention
+        self.gguf_writer.add_causal_attention(False)
+
+        if (mask_token_id := self.hparams.get("mask_token_id")) is not None:
+            self.gguf_writer.add_mask_token_id(mask_token_id)
+
+
 @ModelBase.register("Qwen3VLForConditionalGeneration", "Qwen3VLMoeForConditionalGeneration")
 class Qwen3VLVisionModel(MmprojModel):
     def __init__(self, *args, **kwargs):
```

convert_lora_to_gguf.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -242,7 +242,7 @@ def parse_args() -> argparse.Namespace:
         help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
     )
     parser.add_argument(
-        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
+        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f32",
         help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
     )
     parser.add_argument(
```

examples/diffusion/README.md

Lines changed: 48 additions & 2 deletions
````diff
@@ -6,8 +6,54 @@ More Info:
 - https://github.com/ggml-org/llama.cpp/pull/14644
 - https://github.com/ggml-org/llama.cpp/pull/14771
 
+## Parameters
+The diffusion CLI supports various parameters to control the generation process:
 
-Example of using Dream architechture: `llama-diffusion-cli -m dream7b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-eps 0.001 --diffusion-algorithm 3 --diffusion-steps 256 --diffusion-visual`
+### Core Diffusion Parameters
+- `--diffusion-steps`: Number of diffusion steps (default: 256)
+- `--diffusion-algorithm`: Algorithm for token selection
+  - `0`: ORIGIN - Tokens are generated in a purely random order (from https://arxiv.org/abs/2107.03006)
+  - `1`: ENTROPY_BASED - Entropy-based selection
+  - `2`: MARGIN_BASED - Margin-based selection
+  - `3`: RANDOM - Random selection
+  - `4`: CONFIDENCE_BASED - Confidence-based selection (default)
+  - More documentation: https://github.com/DreamLM/Dream
+- `--diffusion-visual`: Enable live visualization during generation
 
-Example of using LLaDA architechture: `llama-diffusion-cli -m llada-8b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-block-length 32 --diffusion-steps 256 --diffusion-visual`
+### Scheduling Parameters
+Choose one of the following scheduling methods:
 
+**Timestep-based scheduling:**
+- `--diffusion-eps`: Epsilon value for timestep scheduling (e.g., 0.001)
+
+**Block-based scheduling:**
+- `--diffusion-block-length`: Block size for block-based scheduling (e.g., 32)
+
+### Sampling Parameters
+- `--temp`: Temperature for sampling (0.0 = greedy/deterministic, higher = more random)
+- `--top-k`: Top-k filtering for sampling
+- `--top-p`: Top-p (nucleus) filtering for sampling
+- `--seed`: Random seed for reproducibility
+
+### Model Parameters
+- `-m`: Path to the GGUF model file
+- `-p`: Input prompt text
+- `-ub`: Maximum sequence length (ubatch size)
+- `-c`: Context size
+- `-b`: Batch size
+
+### Examples
+#### Dream architecture:
+```
+llama-diffusion-cli -m dream7b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-eps 0.001 --diffusion-algorithm 3 --diffusion-steps 256 --diffusion-visual
+```
+
+#### LLaDA architecture:
+```
+llama-diffusion-cli -m llada-8b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-block-length 32 --diffusion-steps 256 --diffusion-visual
+```
+
+#### RND1 architecture:
+```
+llama-diffusion-cli -m RND1-Base-0910.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-algorithm 1 --diffusion-steps 256 --diffusion-visual --temp 0.5 --diffusion-eps 0.001
+```
````

ggml/src/ggml-cpu/arch-fallback.h

Lines changed: 0 additions & 2 deletions
```diff
@@ -49,9 +49,7 @@
 #elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_M_ARM64)
 // repack.cpp
 #define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
-#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
-#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
 // repack.cpp
```
