Skip to content

Commit e5b4f3e

Browse files
Merge pull request #340 from janhq/update-dev-from-master-2025-11-26-00-36
Sync master with upstream release b7157
2 parents 0e68971 + 583cb83 commit e5b4f3e

35 files changed

+1162
-127
lines changed

CODEOWNERS

Lines changed: 8 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,8 @@
22
# multiple collaborators per item can be specified
33

44
/.devops/*.Dockerfile @ngxson
5-
/.github/actions/ @slaren @CISC
5+
/.github/actions/ @CISC
66
/.github/workflows/ @CISC
7-
/.github/workflows/release.yml @slaren
8-
/.github/workflows/winget.yml @slaren
97
/ci/ @ggerganov
108
/cmake/ @ggerganov
119
/common/CMakeLists.txt @ggerganov
@@ -40,41 +38,34 @@
4038
/examples/passkey/ @ggerganov
4139
/examples/retrieval/ @ggerganov
4240
/examples/save-load-state/ @ggerganov
43-
/examples/simple-chat/ @slaren
44-
/examples/simple/ @slaren
4541
/examples/speculative-simple/ @ggerganov
4642
/examples/speculative/ @ggerganov
4743
/ggml/cmake/ @ggerganov
48-
/ggml/include/ @ggerganov @slaren
49-
/ggml/src/ggml-alloc.c @slaren
50-
/ggml/src/ggml-backend* @slaren
51-
/ggml/src/ggml-blas/ @slaren
52-
/ggml/src/ggml-common.h @ggerganov @slaren
53-
/ggml/src/ggml-cpu/ @ggerganov @slaren
44+
/ggml/include/ @ggerganov
45+
/ggml/src/ggml-common.h @ggerganov
46+
/ggml/src/ggml-cpu/ @ggerganov
5447
/ggml/src/ggml-cpu/spacemit/ @alex-spacemit
55-
/ggml/src/ggml-cuda/common.cuh @slaren
5648
/ggml/src/ggml-cuda/fattn* @JohannesGaessler
57-
/ggml/src/ggml-cuda/ggml-cuda.cu @slaren
5849
/ggml/src/ggml-cuda/mmf.* @JohannesGaessler @am17an
5950
/ggml/src/ggml-cuda/mmq.* @JohannesGaessler
6051
/ggml/src/ggml-cuda/mmvf.* @JohannesGaessler
6152
/ggml/src/ggml-cuda/mmvq.* @JohannesGaessler
6253
/ggml/src/ggml-cuda/fattn-wmma* @IMbackK
6354
/ggml/src/ggml-hip/ @IMbackK
6455
/ggml/src/ggml-cuda/vendors/hip.h @IMbackK
65-
/ggml/src/ggml-impl.h @ggerganov @slaren
56+
/ggml/src/ggml-impl.h @ggerganov
6657
/ggml/src/ggml-metal/ @ggerganov
6758
/ggml/src/ggml-opencl/ @lhez @max-krasnyansky
6859
/ggml/src/ggml-hexagon/ @max-krasnyansky @lhez
6960
/ggml/src/ggml-opt.cpp @JohannesGaessler
7061
/ggml/src/ggml-quants.* @ggerganov
7162
/ggml/src/ggml-rpc/ @rgerganov
72-
/ggml/src/ggml-threading.* @ggerganov @slaren
63+
/ggml/src/ggml-threading.* @ggerganov
7364
/ggml/src/ggml-vulkan/ @0cc4m
7465
/ggml/src/ggml-webgpu/ @reeselevine
7566
/ggml/src/ggml-zdnn/ @taronaeo @Andreas-Krebbel @AlekseiNikiforovIBM
76-
/ggml/src/ggml.c @ggerganov @slaren
77-
/ggml/src/ggml.cpp @ggerganov @slaren
67+
/ggml/src/ggml.c @ggerganov
68+
/ggml/src/ggml.cpp @ggerganov
7869
/ggml/src/gguf.cpp @JohannesGaessler @Green-Sky
7970
/gguf-py/ @CISC
8071
/media/ @ggerganov
@@ -86,15 +77,11 @@
8677
/src/llama-arch.* @CISC
8778
/src/llama-chat.* @ngxson
8879
/src/llama-graph.* @CISC
89-
/src/llama-model-loader.* @slaren
9080
/src/llama-model.* @CISC
9181
/src/llama-vocab.* @CISC
9282
/src/models/ @CISC
9383
/tests/ @ggerganov
94-
/tests/test-backend-ops.cpp @slaren
95-
/tests/test-thread-safety.cpp @slaren
9684
/tools/batched-bench/ @ggerganov
97-
/tools/llama-bench/ @slaren
9885
/tools/main/ @ggerganov
9986
/tools/mtmd/ @ngxson
10087
/tools/perplexity/ @ggerganov
@@ -106,8 +93,6 @@
10693
/tools/tokenize/ @ggerganov
10794
/tools/tts/ @ggerganov
10895
/vendor/ @ggerganov
109-
/.clang-format @slaren
110-
/.clang-tidy @slaren
11196
/AUTHORS @ggerganov
11297
/CMakeLists.txt @ggerganov
11398
/CONTRIBUTING.md @ggerganov

common/arg.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1232,6 +1232,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
12321232
[](common_params & params, const std::string & value) {
12331233
const auto sampler_names = string_split<std::string>(value, ';');
12341234
params.sampling.samplers = common_sampler_types_from_names(sampler_names, true);
1235+
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS;
12351236
}
12361237
).set_sparam());
12371238
add_opt(common_arg(
@@ -1261,27 +1262,31 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
12611262
[](common_params & params, const std::string & value) {
12621263
params.sampling.temp = std::stof(value);
12631264
params.sampling.temp = std::max(params.sampling.temp, 0.0f);
1265+
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TEMP;
12641266
}
12651267
).set_sparam());
12661268
add_opt(common_arg(
12671269
{"--top-k"}, "N",
12681270
string_format("top-k sampling (default: %d, 0 = disabled)", params.sampling.top_k),
12691271
[](common_params & params, int value) {
12701272
params.sampling.top_k = value;
1273+
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_K;
12711274
}
12721275
).set_sparam());
12731276
add_opt(common_arg(
12741277
{"--top-p"}, "N",
12751278
string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sampling.top_p),
12761279
[](common_params & params, const std::string & value) {
12771280
params.sampling.top_p = std::stof(value);
1281+
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_P;
12781282
}
12791283
).set_sparam());
12801284
add_opt(common_arg(
12811285
{"--min-p"}, "N",
12821286
string_format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sampling.min_p),
12831287
[](common_params & params, const std::string & value) {
12841288
params.sampling.min_p = std::stof(value);
1289+
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIN_P;
12851290
}
12861291
).set_sparam());
12871292
add_opt(common_arg(
@@ -1296,13 +1301,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
12961301
string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability),
12971302
[](common_params & params, const std::string & value) {
12981303
params.sampling.xtc_probability = std::stof(value);
1304+
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY;
12991305
}
13001306
).set_sparam());
13011307
add_opt(common_arg(
13021308
{"--xtc-threshold"}, "N",
13031309
string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sampling.xtc_threshold),
13041310
[](common_params & params, const std::string & value) {
13051311
params.sampling.xtc_threshold = std::stof(value);
1312+
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD;
13061313
}
13071314
).set_sparam());
13081315
add_opt(common_arg(
@@ -1321,13 +1328,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
13211328
}
13221329
params.sampling.penalty_last_n = value;
13231330
params.sampling.n_prev = std::max(params.sampling.n_prev, params.sampling.penalty_last_n);
1331+
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_LAST_N;
13241332
}
13251333
).set_sparam());
13261334
add_opt(common_arg(
13271335
{"--repeat-penalty"}, "N",
13281336
string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sampling.penalty_repeat),
13291337
[](common_params & params, const std::string & value) {
13301338
params.sampling.penalty_repeat = std::stof(value);
1339+
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT;
13311340
}
13321341
).set_sparam());
13331342
add_opt(common_arg(
@@ -1425,20 +1434,23 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
14251434
"(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sampling.mirostat),
14261435
[](common_params & params, int value) {
14271436
params.sampling.mirostat = value;
1437+
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT;
14281438
}
14291439
).set_sparam());
14301440
add_opt(common_arg(
14311441
{"--mirostat-lr"}, "N",
14321442
string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sampling.mirostat_eta),
14331443
[](common_params & params, const std::string & value) {
14341444
params.sampling.mirostat_eta = std::stof(value);
1445+
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA;
14351446
}
14361447
).set_sparam());
14371448
add_opt(common_arg(
14381449
{"--mirostat-ent"}, "N",
14391450
string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sampling.mirostat_tau),
14401451
[](common_params & params, const std::string & value) {
14411452
params.sampling.mirostat_tau = std::stof(value);
1453+
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU;
14421454
}
14431455
).set_sparam());
14441456
add_opt(common_arg(

common/common.cpp

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
#include "common.h"
99
#include "log.h"
1010
#include "llama.h"
11+
#include "sampling.h"
1112

1213
#include <algorithm>
1314
#include <cinttypes>
@@ -949,6 +950,58 @@ std::vector<common_file_info> fs_list_files(const std::string & path) {
949950
// Model utils
950951
//
951952

953+
static inline void common_init_sampler_from_model(
954+
const llama_model * model,
955+
common_params_sampling & sparams) {
956+
957+
const uint64_t config = sparams.user_sampling_config;
958+
959+
auto get_int32 = [&](const char * key, int32_t & dst, uint64_t user_config) {
960+
if (config & user_config) return;
961+
962+
char buf[64] = {0};
963+
if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
964+
char * end = nullptr;
965+
int32_t v = strtol(buf, &end, 10);
966+
if (end && end != buf) dst = v;
967+
}
968+
};
969+
970+
auto get_float = [&](const char * key, float & dst, uint64_t user_config) {
971+
if (config & user_config) return;
972+
973+
char buf[128] = {0};
974+
if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
975+
char * end = nullptr;
976+
float v = strtof(buf, &end);
977+
if (end && end != buf) dst = v;
978+
}
979+
};
980+
981+
// Sampling sequence
982+
if (!(config & common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS)) {
983+
char buf[512] = {0};
984+
if (llama_model_meta_val_str(model, llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE), buf, sizeof(buf)) > 0) {
985+
const std::vector<std::string> sampler_names = string_split<std::string>(std::string(buf), ';');
986+
if (!sampler_names.empty()) {
987+
sparams.samplers = common_sampler_types_from_names(sampler_names, true);
988+
}
989+
}
990+
}
991+
992+
get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TOP_K), sparams.top_k, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_K);
993+
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TOP_P), sparams.top_p, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_P);
994+
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIN_P), sparams.min_p, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIN_P);
995+
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY), sparams.xtc_probability, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY);
996+
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD), sparams.xtc_threshold, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD);
997+
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TEMP), sparams.temp, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TEMP);
998+
get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N), sparams.penalty_last_n, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_LAST_N);
999+
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT), sparams.penalty_repeat, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT);
1000+
get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT), sparams.mirostat, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT);
1001+
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU), sparams.mirostat_tau, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU);
1002+
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA), sparams.mirostat_eta, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA);
1003+
}
1004+
9521005
struct common_init_result common_init_from_params(common_params & params) {
9531006
common_init_result iparams;
9541007
auto mparams = common_model_params_to_llama(params);
@@ -960,6 +1013,8 @@ struct common_init_result common_init_from_params(common_params & params) {
9601013
return iparams;
9611014
}
9621015

1016+
common_init_sampler_from_model(model, params.sampling);
1017+
9631018
const llama_vocab * vocab = llama_model_get_vocab(model);
9641019

9651020
auto cparams = common_context_params_to_llama(params);

common/common.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,22 @@ struct common_grammar_trigger {
140140
llama_token token = LLAMA_TOKEN_NULL;
141141
};
142142

143+
enum common_params_sampling_config : uint64_t {
144+
COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS = 1 << 0,
145+
COMMON_PARAMS_SAMPLING_CONFIG_TOP_K = 1 << 1,
146+
COMMON_PARAMS_SAMPLING_CONFIG_TOP_P = 1 << 2,
147+
COMMON_PARAMS_SAMPLING_CONFIG_MIN_P = 1 << 3,
148+
COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY = 1 << 4,
149+
COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD = 1 << 5,
150+
COMMON_PARAMS_SAMPLING_CONFIG_TEMP = 1 << 6,
151+
COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_LAST_N = 1 << 7,
152+
COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT = 1 << 8,
153+
COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT = 1 << 9,
154+
COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU = 1 << 10,
155+
COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA = 1 << 11,
156+
};
157+
158+
143159
// sampling parameters
144160
struct common_params_sampling {
145161
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
@@ -172,6 +188,8 @@ struct common_params_sampling {
172188
bool no_perf = false; // disable performance metrics
173189
bool timing_per_token = false;
174190

191+
uint64_t user_sampling_config = 0; // bitfield to track user-specified samplers
192+
175193
std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY
176194

177195

convert_hf_to_gguf.py

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10061,6 +10061,25 @@ class LazyTorchTensor(gguf.LazyBase):
1006110061
torch.uint8: np.uint8,
1006210062
}
1006310063

10064+
# only used when byteswapping data. Only correct size is needed
10065+
_dtype_byteswap_map: dict[torch.dtype, type] = {
10066+
torch.float64: np.float64,
10067+
torch.float32: np.float32,
10068+
torch.bfloat16: np.float16,
10069+
torch.float16: np.float16,
10070+
torch.int64: np.int64,
10071+
torch.uint64: np.uint64,
10072+
torch.int32: np.int32,
10073+
torch.uint32: np.uint32,
10074+
torch.int16: np.int16,
10075+
torch.uint16: np.uint16,
10076+
torch.int8: np.int8,
10077+
torch.uint8: np.uint8,
10078+
torch.bool: np.uint8,
10079+
torch.float8_e4m3fn: np.uint8,
10080+
torch.float8_e5m2: np.uint8,
10081+
}
10082+
1006410083
# used for safetensors slices
1006510084
# ref: https://github.com/huggingface/safetensors/blob/079781fd0dc455ba0fe851e2b4507c33d0c0d407/bindings/python/src/lib.rs#L1046
1006610085
# TODO: uncomment U64, U32, and U16, ref: https://github.com/pytorch/pytorch/issues/58734
@@ -10104,19 +10123,31 @@ def from_safetensors_slice(cls, st_slice: Any) -> Tensor:
1010410123
@classmethod
1010510124
def from_local_tensor(cls, t: gguf.utility.LocalTensor) -> Tensor:
1010610125
def load_tensor(tensor: gguf.utility.LocalTensor) -> Tensor:
10126+
def byteswap_tensor(tensor: np.ndarray, dtype: type) -> np.ndarray:
10127+
if sys.byteorder == 'big':
10128+
# switch data back to big endian
10129+
tensor = tensor.view(dtype).byteswap(inplace=False)
10130+
return tensor
1010710131
dtype = cls._dtype_str_map[tensor.dtype]
10108-
return torch.from_numpy(tensor.mmap_bytes()).view(dtype).reshape(tensor.shape)
10132+
numpy_dtype = cls._dtype_byteswap_map[dtype]
10133+
return torch.from_numpy(byteswap_tensor(tensor.mmap_bytes(), numpy_dtype)).view(dtype).reshape(tensor.shape)
1010910134
dtype = cls._dtype_str_map[t.dtype]
1011010135
shape = t.shape
1011110136
lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(t,), func=lambda r: load_tensor(r))
1011210137
return cast(torch.Tensor, lazy)
1011310138

1011410139
@classmethod
1011510140
def from_remote_tensor(cls, remote_tensor: gguf.utility.RemoteTensor):
10141+
def byteswap_tensor(tensor: np.ndarray, dtype: type) -> np.ndarray:
10142+
if sys.byteorder == 'big':
10143+
# switch data back to big endian
10144+
tensor = tensor.view(dtype).byteswap(inplace=False)
10145+
return tensor
1011610146
dtype = cls._dtype_str_map[remote_tensor.dtype]
10147+
numpy_dtype = cls._dtype_byteswap_map[dtype]
1011710148
shape = remote_tensor.shape
1011810149
meta = cls.meta_with_dtype_and_shape(dtype, shape)
10119-
lazy = cls(meta=meta, args=(remote_tensor,), func=lambda r: torch.frombuffer(r.data(), dtype=dtype).reshape(shape))
10150+
lazy = cls(meta=meta, args=(remote_tensor,), func=lambda r: torch.from_numpy(byteswap_tensor(np.frombuffer(r.data(), dtype=numpy_dtype), numpy_dtype)).view(dtype).reshape(shape))
1012010151
return cast(torch.Tensor, lazy)
1012110152

1012210153
@classmethod

ggml/include/ggml.h

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -530,6 +530,7 @@ extern "C" {
530530
GGML_OP_ARANGE,
531531
GGML_OP_TIMESTEP_EMBEDDING,
532532
GGML_OP_ARGSORT,
533+
GGML_OP_TOP_K,
533534
GGML_OP_LEAKY_RELU,
534535
GGML_OP_TRI,
535536
GGML_OP_FILL,
@@ -2258,18 +2259,25 @@ extern "C" {
22582259
struct ggml_tensor * a,
22592260
enum ggml_sort_order order);
22602261

2261-
GGML_API struct ggml_tensor * ggml_arange(
2262+
// similar to ggml_top_k but implemented as `argsort` + `view`
2263+
GGML_API struct ggml_tensor * ggml_argsort_top_k(
22622264
struct ggml_context * ctx,
2263-
float start,
2264-
float stop,
2265-
float step);
2265+
struct ggml_tensor * a,
2266+
int k);
22662267

22672268
// top k elements per row
2269+
// note: the resulting top k indices are in no particular order
22682270
GGML_API struct ggml_tensor * ggml_top_k(
22692271
struct ggml_context * ctx,
22702272
struct ggml_tensor * a,
22712273
int k);
22722274

2275+
GGML_API struct ggml_tensor * ggml_arange(
2276+
struct ggml_context * ctx,
2277+
float start,
2278+
float stop,
2279+
float step);
2280+
22732281
#define GGML_KQ_MASK_PAD 64
22742282

22752283
// q: [n_embd_k, n_batch, n_head, ne3 ]

0 commit comments

Comments
 (0)