Skip to content

Commit e50c7f9

Browse files
committed
Merge branch 'concedo_experimental' into crokeso
2 parents 5942bf9 + 45b7d72 commit e50c7f9

File tree

124 files changed

+8997
-30235
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

124 files changed

+8997
-30235
lines changed

.clang-format

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,8 @@ AllowShortIfStatementsOnASingleLine: Never
2222
AllowShortLambdasOnASingleLine: Inline
2323
AllowShortLoopsOnASingleLine: false
2424
AlwaysBreakBeforeMultilineStrings: true
25-
BinPackArguments: true
26-
BinPackParameters: true # OnePerLine
25+
BinPackArguments: false
26+
BinPackParameters: false # OnePerLine
2727
BitFieldColonSpacing: Both
2828
BreakBeforeBraces: Custom # Attach
2929
BraceWrapping:
@@ -70,15 +70,18 @@ ExperimentalAutoDetectBinPacking: false
7070
FixNamespaceComments: true
7171
IncludeBlocks: Regroup
7272
IncludeCategories:
73-
- Regex: '^<.*\.h>'
73+
- Regex: '".*"'
7474
Priority: 1
7575
SortPriority: 0
76-
- Regex: '^<.*'
76+
- Regex: '^<.*\.h>'
7777
Priority: 2
7878
SortPriority: 0
79-
- Regex: '.*'
79+
- Regex: '^<.*'
8080
Priority: 3
8181
SortPriority: 0
82+
- Regex: '.*'
83+
Priority: 4
84+
SortPriority: 0
8285
IncludeIsMainRegex: '([-_](test|unittest))?$'
8386
IncludeIsMainSourceRegex: ''
8487
IndentAccessModifiers: false

.github/workflows/kcpp-build-release-macos.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ env:
1313

1414
jobs:
1515
osx:
16-
runs-on: macos-latest
16+
runs-on: macos-14
1717
steps:
1818
- name: Clone
1919
id: checkout
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
name: AutoGuess Tests
2+
3+
on:
4+
pull_request:
5+
branches:
6+
- concedo_experimental
7+
paths:
8+
- 'kcpp_adapters/AutoGuess.json'
9+
10+
jobs:
11+
test-autoguess:
12+
runs-on: ubuntu-latest
13+
14+
steps:
15+
- name: Checkout code
16+
uses: actions/checkout@v4
17+
18+
- name: Set up Python
19+
uses: actions/setup-python@v4
20+
with:
21+
python-version: '3.x' # Adjust to your preferred Python version
22+
23+
- name: Install dependencies
24+
run: |
25+
python -m pip install --upgrade pip
26+
pip install requests transformers jinja2 tiktoken protobuf blobfile sentencepiece
27+
git clone https://github.com/kallewoof/gated-tokenizers.git tests/gated-tokenizers
28+
29+
- name: Run AutoGuess tests
30+
run: python tests/test_autoguess.py

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,9 @@ rocblas.dll
142142
hipblas.dll
143143
koboldcpp_hipblas.so
144144
koboldcpp_hipblas.dll
145+
146+
.tokenizer_configs
147+
145148
/*.dll
146149

147150
bin/

common/arg.cpp

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1467,6 +1467,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
14671467
params.swa_full = true;
14681468
}
14691469
).set_env("LLAMA_ARG_SWA_FULL"));
1470+
add_opt(common_arg(
1471+
{"--kv-unified", "-kvu"},
1472+
string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
1473+
"[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)", params.kv_unified ? "true" : "false"),
1474+
[](common_params & params) {
1475+
params.kv_unified = true;
1476+
}
1477+
).set_env("LLAMA_ARG_KV_SPLIT"));
14701478
add_opt(common_arg(
14711479
{"--no-context-shift"},
14721480
string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
@@ -1607,7 +1615,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
16071615
[](common_params & params, const std::string & value) {
16081616
params.antiprompt.emplace_back(value);
16091617
}
1610-
).set_examples({LLAMA_EXAMPLE_MAIN}));
1618+
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
16111619
add_opt(common_arg(
16121620
{"-sp", "--special"},
16131621
string_format("special tokens output enabled (default: %s)", params.special ? "true" : "false"),
@@ -2650,6 +2658,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
26502658
params.i_chunk = value;
26512659
}
26522660
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
2661+
add_opt(common_arg(
2662+
{"--show-statistics"},
2663+
string_format("show imatrix statistics and then exit (default: %s)", params.show_statistics ? "true" : "false"),
2664+
[](common_params & params) {
2665+
params.show_statistics = true;
2666+
}
2667+
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
26532668
add_opt(common_arg(
26542669
{"--parse-special"},
26552670
string_format("parse special tokens (chat, tool, etc) (default: %s)", params.parse_special ? "true" : "false"),
@@ -3426,5 +3441,34 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
34263441
}
34273442
).set_examples({LLAMA_EXAMPLE_SERVER}));
34283443

3444+
// diffusion parameters
3445+
add_opt(common_arg(
3446+
{ "--diffusion-steps" }, "N",
3447+
string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
3448+
[](common_params & params, int value) { params.diffusion.steps = value; }
3449+
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
3450+
add_opt(common_arg(
3451+
{ "--diffusion-eps" }, "F",
3452+
string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
3453+
[](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
3454+
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
3455+
add_opt(common_arg(
3456+
{ "--diffusion-algorithm" }, "N",
3457+
string_format("diffusion algorithm: 0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY (default: %d)",
3458+
params.diffusion.algorithm),
3459+
[](common_params & params, int value) { params.diffusion.algorithm = value; }
3460+
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
3461+
add_opt(common_arg(
3462+
{ "--diffusion-alg-temp" }, "F",
3463+
string_format("algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
3464+
[](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
3465+
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
3466+
add_opt(common_arg(
3467+
{ "--diffusion-visual" },
3468+
string_format("enable visual diffusion mode (show progressive generation) (default: %s)",
3469+
params.diffusion.visual_mode ? "true" : "false"),
3470+
[](common_params & params) { params.diffusion.visual_mode = true; }
3471+
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
3472+
34293473
return ctx_arg;
34303474
}

common/common.cpp

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -456,6 +456,15 @@ void string_replace_all(std::string & s, const std::string & search, const std::
456456
bool string_ends_with(const std::string_view & str, const std::string_view & suffix) {
457457
return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
458458
}
459+
460+
bool string_remove_suffix(std::string & str, const std::string_view & suffix) {
461+
bool has_suffix = string_ends_with(str, suffix);
462+
if (has_suffix) {
463+
str = str.substr(0, str.size() - suffix.size());
464+
}
465+
return has_suffix;
466+
}
467+
459468
size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop) {
460469
if (!str.empty() && !stop.empty()) {
461470
const char text_last_char = str.back();
@@ -1013,15 +1022,21 @@ struct common_init_result common_init_from_params(common_params & params) {
10131022
params.sampling.ignore_eos = false;
10141023
}
10151024

1016-
if (params.sampling.ignore_eos) {
1017-
for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
1018-
if (llama_vocab_is_eog(vocab, i)) {
1019-
LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
1020-
params.sampling.logit_bias.push_back({i, -INFINITY});
1021-
}
1025+
// initialize once
1026+
for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
1027+
if (llama_vocab_is_eog(vocab, i)) {
1028+
LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
1029+
params.sampling.logit_bias_eog.push_back({i, -INFINITY});
10221030
}
10231031
}
10241032

1033+
if (params.sampling.ignore_eos) {
1034+
// add EOG biases to the active set of logit biases
1035+
params.sampling.logit_bias.insert(
1036+
params.sampling.logit_bias.end(),
1037+
params.sampling.logit_bias_eog.begin(), params.sampling.logit_bias_eog.end());
1038+
}
1039+
10251040
if (params.sampling.penalty_last_n == -1) {
10261041
LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
10271042
params.sampling.penalty_last_n = llama_n_ctx(lctx);
@@ -1166,6 +1181,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
11661181
cparams.no_perf = params.no_perf;
11671182
cparams.op_offload = !params.no_op_offload;
11681183
cparams.swa_full = params.swa_full;
1184+
cparams.kv_unified = params.kv_unified;
11691185

11701186
cparams.type_k = params.cache_type_k;
11711187
cparams.type_v = params.cache_type_v;

common/common.h

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ enum llama_example {
7777
LLAMA_EXAMPLE_LOOKUP,
7878
LLAMA_EXAMPLE_PARALLEL,
7979
LLAMA_EXAMPLE_TTS,
80+
LLAMA_EXAMPLE_DIFFUSION,
8081

8182
LLAMA_EXAMPLE_COUNT,
8283
};
@@ -173,7 +174,8 @@ struct common_params_sampling {
173174
std::vector<common_grammar_trigger> grammar_triggers; // optional triggers (for lazy grammars)
174175
std::set<llama_token> preserved_tokens;
175176

176-
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
177+
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
178+
std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens
177179

178180
// print the parameters into a string
179181
std::string print() const;
@@ -213,6 +215,14 @@ struct common_params_vocoder {
213215
bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
214216
};
215217

218+
struct common_params_diffusion {
219+
int32_t steps = 64; // number of diffusion steps
220+
float eps = 1e-3f; // epsilon for timesteps
221+
int32_t algorithm = 0; // diffusion algorithm (0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY)
222+
float alg_temp = 0.0f; // algorithm temperature
223+
bool visual_mode = false; // show progressive diffusion on screen
224+
};
225+
216226
enum common_reasoning_format {
217227
COMMON_REASONING_FORMAT_NONE,
218228
COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
@@ -264,6 +274,7 @@ struct common_params {
264274
struct common_params_sampling sampling;
265275
struct common_params_speculative speculative;
266276
struct common_params_vocoder vocoder;
277+
struct common_params_diffusion diffusion;
267278

268279
struct common_params_model model;
269280

@@ -326,6 +337,7 @@ struct common_params {
326337
bool no_perf = false; // disable performance metrics
327338
bool ctx_shift = true; // context shift on inifinite text generation
328339
bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
340+
bool kv_unified = false; // enable unified KV cache
329341

330342
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
331343
bool use_mmap = true; // use mmap for faster loads
@@ -416,9 +428,10 @@ struct common_params {
416428
int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
417429
int32_t i_chunk = 0; // start processing from this chunk
418430

419-
bool process_output = false; // collect data for the output tensor
420-
bool compute_ppl = true; // whether to compute perplexity
421-
bool parse_special = false; // whether to parse special tokens during imatrix tokenization
431+
bool process_output = false; // collect data for the output tensor
432+
bool compute_ppl = true; // whether to compute perplexity
433+
bool show_statistics = false; // show imatrix statistics per tensor
434+
bool parse_special = false; // whether to parse special tokens during imatrix tokenization
422435

423436
// cvector-generator params
424437
int n_pca_batch = 100;
@@ -518,6 +531,7 @@ static bool string_starts_with(const std::string & str,
518531

519532
// While we wait for C++20's std::string::ends_with...
520533
bool string_ends_with(const std::string_view & str, const std::string_view & suffix);
534+
bool string_remove_suffix(std::string & str, const std::string_view & suffix);
521535
size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop);
522536

523537
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);

0 commit comments

Comments (0)