Skip to content

Commit 2b4c792

Browse files
committed
Merge remote-tracking branch 'upstream/master' into backend-sampling
2 parents a02adf4 + 877566d commit 2b4c792

35 files changed

+4405
-3036
lines changed

common/arg.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1232,6 +1232,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
12321232
[](common_params & params, const std::string & value) {
12331233
const auto sampler_names = string_split<std::string>(value, ';');
12341234
params.sampling.samplers = common_sampler_types_from_names(sampler_names, true);
1235+
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS;
12351236
}
12361237
).set_sparam());
12371238
add_opt(common_arg(
@@ -1261,27 +1262,31 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
12611262
[](common_params & params, const std::string & value) {
12621263
params.sampling.temp = std::stof(value);
12631264
params.sampling.temp = std::max(params.sampling.temp, 0.0f);
1265+
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TEMP;
12641266
}
12651267
).set_sparam());
12661268
add_opt(common_arg(
12671269
{"--top-k"}, "N",
12681270
string_format("top-k sampling (default: %d, 0 = disabled)", params.sampling.top_k),
12691271
[](common_params & params, int value) {
12701272
params.sampling.top_k = value;
1273+
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_K;
12711274
}
12721275
).set_sparam());
12731276
add_opt(common_arg(
12741277
{"--top-p"}, "N",
12751278
string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sampling.top_p),
12761279
[](common_params & params, const std::string & value) {
12771280
params.sampling.top_p = std::stof(value);
1281+
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_P;
12781282
}
12791283
).set_sparam());
12801284
add_opt(common_arg(
12811285
{"--min-p"}, "N",
12821286
string_format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sampling.min_p),
12831287
[](common_params & params, const std::string & value) {
12841288
params.sampling.min_p = std::stof(value);
1289+
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIN_P;
12851290
}
12861291
).set_sparam());
12871292
add_opt(common_arg(
@@ -1296,13 +1301,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
12961301
string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability),
12971302
[](common_params & params, const std::string & value) {
12981303
params.sampling.xtc_probability = std::stof(value);
1304+
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY;
12991305
}
13001306
).set_sparam());
13011307
add_opt(common_arg(
13021308
{"--xtc-threshold"}, "N",
13031309
string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sampling.xtc_threshold),
13041310
[](common_params & params, const std::string & value) {
13051311
params.sampling.xtc_threshold = std::stof(value);
1312+
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD;
13061313
}
13071314
).set_sparam());
13081315
add_opt(common_arg(
@@ -1321,13 +1328,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
13211328
}
13221329
params.sampling.penalty_last_n = value;
13231330
params.sampling.n_prev = std::max(params.sampling.n_prev, params.sampling.penalty_last_n);
1331+
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_LAST_N;
13241332
}
13251333
).set_sparam());
13261334
add_opt(common_arg(
13271335
{"--repeat-penalty"}, "N",
13281336
string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sampling.penalty_repeat),
13291337
[](common_params & params, const std::string & value) {
13301338
params.sampling.penalty_repeat = std::stof(value);
1339+
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT;
13311340
}
13321341
).set_sparam());
13331342
add_opt(common_arg(
@@ -1425,20 +1434,23 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
14251434
"(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sampling.mirostat),
14261435
[](common_params & params, int value) {
14271436
params.sampling.mirostat = value;
1437+
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT;
14281438
}
14291439
).set_sparam());
14301440
add_opt(common_arg(
14311441
{"--mirostat-lr"}, "N",
14321442
string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sampling.mirostat_eta),
14331443
[](common_params & params, const std::string & value) {
14341444
params.sampling.mirostat_eta = std::stof(value);
1445+
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA;
14351446
}
14361447
).set_sparam());
14371448
add_opt(common_arg(
14381449
{"--mirostat-ent"}, "N",
14391450
string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sampling.mirostat_tau),
14401451
[](common_params & params, const std::string & value) {
14411452
params.sampling.mirostat_tau = std::stof(value);
1453+
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU;
14421454
}
14431455
).set_sparam());
14441456
add_opt(common_arg(

common/common.cpp

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -950,6 +950,58 @@ std::vector<common_file_info> fs_list_files(const std::string & path) {
950950
// Model utils
951951
//
952952

953+
static inline void common_init_sampler_from_model(
954+
const llama_model * model,
955+
common_params_sampling & sparams) {
956+
957+
const uint64_t config = sparams.user_sampling_config;
958+
959+
auto get_int32 = [&](const char * key, int32_t & dst, uint64_t user_config) {
960+
if (config & user_config) return;
961+
962+
char buf[64] = {0};
963+
if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
964+
char * end = nullptr;
965+
int32_t v = strtol(buf, &end, 10);
966+
if (end && end != buf) dst = v;
967+
}
968+
};
969+
970+
auto get_float = [&](const char * key, float & dst, uint64_t user_config) {
971+
if (config & user_config) return;
972+
973+
char buf[128] = {0};
974+
if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
975+
char * end = nullptr;
976+
float v = strtof(buf, &end);
977+
if (end && end != buf) dst = v;
978+
}
979+
};
980+
981+
// Sampling sequence
982+
if (!(config & common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS)) {
983+
char buf[512] = {0};
984+
if (llama_model_meta_val_str(model, llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE), buf, sizeof(buf)) > 0) {
985+
const std::vector<std::string> sampler_names = string_split<std::string>(std::string(buf), ';');
986+
if (!sampler_names.empty()) {
987+
sparams.samplers = common_sampler_types_from_names(sampler_names, true);
988+
}
989+
}
990+
}
991+
992+
get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TOP_K), sparams.top_k, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_K);
993+
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TOP_P), sparams.top_p, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_P);
994+
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIN_P), sparams.min_p, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIN_P);
995+
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY), sparams.xtc_probability, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY);
996+
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD), sparams.xtc_threshold, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD);
997+
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TEMP), sparams.temp, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TEMP);
998+
get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N), sparams.penalty_last_n, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_LAST_N);
999+
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT), sparams.penalty_repeat, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT);
1000+
get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT), sparams.mirostat, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT);
1001+
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU), sparams.mirostat_tau, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU);
1002+
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA), sparams.mirostat_eta, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA);
1003+
}
1004+
9531005
struct common_init_result common_init_from_params(common_params & params) {
9541006
common_init_result iparams;
9551007
auto mparams = common_model_params_to_llama(params);
@@ -961,6 +1013,8 @@ struct common_init_result common_init_from_params(common_params & params) {
9611013
return iparams;
9621014
}
9631015

1016+
common_init_sampler_from_model(model, params.sampling);
1017+
9641018
const llama_vocab * vocab = llama_model_get_vocab(model);
9651019

9661020
auto cparams = common_context_params_to_llama(params);

common/common.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,22 @@ struct common_grammar_trigger {
140140
llama_token token = LLAMA_TOKEN_NULL;
141141
};
142142

143+
// Bit flags identifying individual sampling settings. They are OR-ed together
// into a uint64_t mask (common_params_sampling::user_sampling_config) to record
// which settings were given explicitly by the user.
//
// The shifts are performed on a 64-bit literal so that the full width of the
// underlying type stays usable if more flags are added later; `1 << n` would be
// evaluated as `int` and could not express bit 31 and above.
enum common_params_sampling_config : uint64_t {
    COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS        = 1ULL << 0,
    COMMON_PARAMS_SAMPLING_CONFIG_TOP_K           = 1ULL << 1,
    COMMON_PARAMS_SAMPLING_CONFIG_TOP_P           = 1ULL << 2,
    COMMON_PARAMS_SAMPLING_CONFIG_MIN_P           = 1ULL << 3,
    COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY = 1ULL << 4,
    COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD   = 1ULL << 5,
    COMMON_PARAMS_SAMPLING_CONFIG_TEMP            = 1ULL << 6,
    COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_LAST_N  = 1ULL << 7,
    COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT  = 1ULL << 8,
    COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT        = 1ULL << 9,
    COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU    = 1ULL << 10,
    COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA    = 1ULL << 11,
};
157+
158+
143159
// sampling parameters
144160
struct common_params_sampling {
145161
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
@@ -172,6 +188,8 @@ struct common_params_sampling {
172188
bool no_perf = false; // disable performance metrics
173189
bool timing_per_token = false;
174190

191+
uint64_t user_sampling_config = 0; // bitfield to track user-specified samplers
192+
175193
std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY
176194

177195

convert_hf_to_gguf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -565,7 +565,7 @@ def prepare_tensors(self):
565565
gguf.MODEL_TENSOR.ALTUP_PREDICT_COEF,
566566
)
567567
)
568-
or not new_name.endswith(".weight")
568+
or new_name[-7:] not in (".weight", ".lora_a", ".lora_b")
569569
):
570570
data_qtype = gguf.GGMLQuantizationType.F32
571571

convert_lora_to_gguf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -242,7 +242,7 @@ def parse_args() -> argparse.Namespace:
242242
help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
243243
)
244244
parser.add_argument(
245-
"--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
245+
"--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f32",
246246
help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
247247
)
248248
parser.add_argument(

examples/batched/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
The example demonstrates batched generation from a given prompt
44

55
```bash
6-
./llama-batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4
6+
./llama-batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4 --kv-unified
77

88
...
99

ggml/CMakeLists.txt

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,16 +25,17 @@ if(GIT_EXE)
2525
)
2626
endif()
2727

28-
# Build the version string with optional dirty flag
2928
set(GGML_VERSION "${GGML_VERSION_BASE}")
30-
if(GGML_GIT_DIRTY AND NOT GGML_GIT_DIRTY EQUAL 0)
31-
set(GGML_VERSION "${GGML_VERSION}-dirty")
32-
endif()
3329

3430
if(NOT GGML_BUILD_COMMIT)
3531
set(GGML_BUILD_COMMIT "unknown")
3632
endif()
3733

34+
# Build the commit string with optional dirty flag
35+
if(DEFINED GGML_GIT_DIRTY AND GGML_GIT_DIRTY EQUAL 1)
36+
set(GGML_BUILD_COMMIT "${GGML_BUILD_COMMIT}-dirty")
37+
endif()
38+
3839
include(CheckIncludeFileCXX)
3940

4041
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

ggml/src/CMakeLists.txt

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -328,6 +328,14 @@ function(ggml_add_cpu_backend_variant tag_name)
328328
set(GGML_INTERNAL_${feat} OFF)
329329
endforeach()
330330

331+
foreach (feat ${ARGN})
332+
set(GGML_INTERNAL_${feat} ON)
333+
endforeach()
334+
elseif (GGML_SYSTEM_ARCH STREQUAL "riscv64")
335+
foreach (feat RVV)
336+
set(GGML_INTERNAL_${feat} OFF)
337+
endforeach()
338+
331339
foreach (feat ${ARGN})
332340
set(GGML_INTERNAL_${feat} ON)
333341
endforeach()
@@ -402,6 +410,13 @@ if (GGML_CPU_ALL_VARIANTS)
402410
else()
403411
message(FATAL_ERROR "Unsupported s390x target OS: ${CMAKE_SYSTEM_NAME}")
404412
endif()
413+
elseif (GGML_SYSTEM_ARCH STREQUAL "riscv64")
414+
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
415+
ggml_add_cpu_backend_variant(riscv64_0)
416+
ggml_add_cpu_backend_variant(riscv64_v RVV)
417+
else()
418+
message(FATAL_ERROR "Unsupported RISC-V target OS: ${CMAKE_SYSTEM_NAME}")
419+
endif()
405420
else()
406421
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS not yet supported with ${GGML_SYSTEM_ARCH} on ${CMAKE_SYSTEM_NAME}")
407422
endif()

ggml/src/ggml-cpu/CMakeLists.txt

Lines changed: 27 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -452,22 +452,35 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
452452
ggml-cpu/spacemit/ime_kernels.h
453453
)
454454
endif()
455-
set(MARCH_STR "rv64gc")
456-
if (GGML_RV_ZFH)
457-
string(APPEND MARCH_STR "_zfh")
458-
endif()
459-
if (GGML_XTHEADVECTOR)
460-
string(APPEND MARCH_STR "_xtheadvector")
461-
elseif (GGML_RVV)
462-
string(APPEND MARCH_STR "_v")
463-
if (GGML_RV_ZVFH)
464-
string(APPEND MARCH_STR "_zvfh")
455+
if(NOT GGML_CPU_ALL_VARIANTS)
456+
set(MARCH_STR "rv64gc")
457+
if (GGML_RV_ZFH)
458+
string(APPEND MARCH_STR "_zfh")
465459
endif()
460+
if (GGML_XTHEADVECTOR)
461+
string(APPEND MARCH_STR "_xtheadvector")
462+
elseif (GGML_RVV)
463+
string(APPEND MARCH_STR "_v")
464+
if (GGML_RV_ZVFH)
465+
string(APPEND MARCH_STR "_zvfh")
466+
endif()
467+
endif()
468+
if (GGML_RV_ZICBOP)
469+
string(APPEND MARCH_STR "_zicbop")
470+
endif()
471+
list(APPEND ARCH_FLAGS "-march=${MARCH_STR}" -mabi=lp64d)
472+
else()
473+
# Begin with the lowest baseline
474+
set(ARCH_DEFINITIONS "")
475+
476+
if (GGML_INTERNAL_RVV)
477+
message(STATUS "RVV enabled")
478+
list(APPEND ARCH_DEFINITIONS GGML_USE_RVV)
479+
list(APPEND ARCH_FLAGS -march=rv64gc_v -mabi=lp64d)
480+
endif()
481+
482+
ggml_add_cpu_backend_features(${GGML_CPU_NAME} riscv ${ARCH_DEFINITIONS})
466483
endif()
467-
if (GGML_RV_ZICBOP)
468-
string(APPEND MARCH_STR "_zicbop")
469-
endif()
470-
list(APPEND ARCH_FLAGS "-march=${MARCH_STR}" -mabi=lp64d)
471484
elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
472485
message(STATUS "s390x detected")
473486
list(APPEND GGML_CPU_SOURCES

ggml/src/ggml-cpu/arch-fallback.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,10 +51,8 @@
5151
#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_M_ARM64)
5252
// repack.cpp
5353
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
54-
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
5554
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
5655
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
57-
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
5856
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
5957
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
6058
#elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)

0 commit comments

Comments
 (0)