
Commit aa7ec94

Merge pull request #96 from menloresearch/update-dev-from-master-2025-05-21-00-09
Sync master with upstream release b5438
2 parents f0ba011 + fb1cab2 commit aa7ec94

29 files changed: +1604 -1092 lines changed

common/arg.cpp

Lines changed: 8 additions & 7 deletions
@@ -1445,6 +1445,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.n_keep = value;
         }
     ));
+    add_opt(common_arg(
+        {"--swa-full"},
+        string_format("use full-size SWA cache (default: %s)\n"
+                      "[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)", params.swa_full ? "true" : "false"),
+        [](common_params & params) {
+            params.swa_full = true;
+        }
+    ).set_env("LLAMA_ARG_SWA_FULL"));
     add_opt(common_arg(
         {"--no-context-shift"},
         string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
@@ -2057,13 +2065,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.grp_attn_w = value;
         }
     ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_MAIN}));
-    add_opt(common_arg(
-        {"-dkvc", "--dump-kv-cache"},
-        "verbose print of the KV cache",
-        [](common_params & params) {
-            params.dump_kv_cache = true;
-        }
-    ));
     add_opt(common_arg(
         {"-nkvo", "--no-kv-offload"},
         "disable KV offload",

common/common.cpp

Lines changed: 1 addition & 75 deletions
@@ -1136,6 +1136,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.flash_attn   = params.flash_attn;
     cparams.no_perf      = params.no_perf;
     cparams.op_offload   = !params.no_op_offload;
+    cparams.swa_full     = params.swa_full;
 
     if (params.reranking) {
         cparams.embeddings = true;
@@ -1328,81 +1329,6 @@ std::string common_detokenize(const struct llama_vocab * vocab, const std::vecto
     return text;
 }
 
-//
-// KV cache utils
-//
-
-void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
-    static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";
-
-    printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
-        view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
-
-    llama_kv_cache_view_cell * c_curr = view.cells;
-    llama_seq_id * cs_curr = view.cells_sequences;
-
-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
-        if (i % row_size == 0) {
-            printf("\n%5d: ", i);
-        }
-        int seq_count = 0;
-        for (int j = 0; j < view.n_seq_max; j++) {
-            if (cs_curr[j] >= 0) { seq_count++; }
-        }
-        putchar(slot_chars[std::min(sizeof(slot_chars) - 2, size_t(seq_count))]);
-    }
-
-    printf("\n=== Done dumping\n");
-}
-
-void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
-    static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
-
-    printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
-        view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
-
-    std::unordered_map<llama_seq_id, size_t> seqs;
-    llama_kv_cache_view_cell * c_curr = view.cells;
-    llama_seq_id * cs_curr = view.cells_sequences;
-
-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
-        for (int j = 0; j < view.n_seq_max; j++) {
-            if (cs_curr[j] < 0) { continue; }
-            if (seqs.find(cs_curr[j]) == seqs.end()) {
-                if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
-                const size_t sz = seqs.size();
-                seqs[cs_curr[j]] = sz;
-            }
-        }
-        if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
-    }
-
-    printf("=== Sequence legend: ");
-    for (const auto & it : seqs) {
-        printf("%zu=%d, ", it.second, it.first);
-    }
-    printf("'+'=other sequence ids");
-
-    c_curr = view.cells;
-    cs_curr = view.cells_sequences;
-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
-        if (i % row_size == 0) {
-            printf("\n%5d: ", i);
-        }
-        for (int j = 0; j < view.n_seq_max; j++) {
-            if (cs_curr[j] >= 0) {
-                const auto & it = seqs.find(cs_curr[j]);
-                putchar(it != seqs.end() ? int(slot_chars[it->second]) : '+');
-            } else {
-                putchar('.');
-            }
-        }
-        putchar(' ');
-    }
-
-    printf("\n=== Done dumping\n");
-}
-
 //
 // Embedding utils
 //
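The only functional addition in this file is the one-line propagation of `swa_full` into the context parameters; the rest of the hunk deletes the now-unused KV-cache dump helpers. A minimal sketch of the resulting flow, assuming only the `common_context_params_to_llama()` signature shown in the hunk header (the surrounding setup is illustrative, not from this commit):

```cpp
// Sketch: programmatic equivalent of passing --swa-full, based on the hunk above.
#include "common.h"
#include "llama.h"

int main() {
    common_params params;
    params.swa_full = true;  // what --swa-full (or LLAMA_ARG_SWA_FULL) sets

    // The new line in this hunk copies the flag through to the context parameters,
    // so anything that later creates a llama_context from cparams sees it.
    llama_context_params cparams = common_context_params_to_llama(params);

    return cparams.swa_full ? 0 : 1;
}
```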

common/common.h

Lines changed: 1 addition & 11 deletions
@@ -323,13 +323,13 @@ struct common_params {
     bool flash_attn        = false; // flash attention
     bool no_perf           = false; // disable performance metrics
     bool ctx_shift         = true;  // context shift on inifinite text generation
+    bool swa_full          = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
 
     bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix
     bool use_mmap          = true;  // use mmap for faster loads
     bool use_mlock         = false; // use mlock to keep model in memory
     bool verbose_prompt    = false; // print prompt tokens before generation
     bool display_prompt    = true;  // print prompt before generation
-    bool dump_kv_cache     = false; // dump the KV cache contents for debugging purposes
     bool no_kv_offload     = false; // disable KV offloading
     bool warmup            = true;  // warmup run
     bool check_tensors     = false; // validate tensor data
@@ -621,16 +621,6 @@ std::string common_detokenize(
         const std::vector<llama_token> & tokens,
         bool special = true);
 
-//
-// KV cache utils
-//
-
-// Dump the KV cache view with the number of sequences per cell.
-void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
-
-// Dump the KV cache view showing individual sequences in each cell (long output).
-void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
-
 //
 // Embedding utils
 //

docs/backend/CANN.md

Lines changed: 74 additions & 52 deletions
@@ -56,60 +56,82 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi
 
 ## Model Supports
 
-| Model Name                  | FP16 | Q8_0 | Q4_0 |
+| Model Name                  | FP16 | Q4_0 | Q8_0 |
 |:----------------------------|:-----:|:----:|:----:|
-| AquilaChat2-7B              | √ | √ | √ |
-| Baichuan-7b                 | √ | √ | √ |
-| Baichuan2-7B-Chat           | √ | √ | √ |
-| bitnet_b1_58-large          | √ | √ | √ |
-| bloom-560m                  | √ | x | √ |
-| bloomz-alpaca-560m          | √ | x | √ |
-| c4ai-command-r-35B-v01      | x | x | x |
-| chatglm3-6B                 | x | x | x |
-| chinese-alpaca-2-1.3b       | √ | √ | √ |
-| CodeShell-7B                | √ | √ | √ |
-| deepseek-ai_deepseek-coder-1.3B-base | x | x | x |
-| deepseek-ai_DeepSeek-V2-Lite | x | x | x |
-| deepseek-coder-6.7B-instruct | x | x | x |
-| DeepSeek-V2-Lite-64x1.5B    | x | x | x |
-| falcon-7b-instruct          | √ | √ | √ |
-| flan-t5-large               | √ | √ | √ |
-| gemma-2-9b-it               | √ | √ | √ |
-| glm-4-9B                    | x | x | x |
-| gpt2                        | √ | √ | √ |
-| Gpt2-163M                   | √ | √ | √ |
-| granite-3B-code-instruct    | √ | √ | √ |
+| Llama-2                     | √ | √ | √ |
+| Llama-3                     | √ | √ | √ |
+| Mistral-7B                  | √ | √ | √ |
+| Mistral MOE                 | √ | √ | √ |
+| DBRX                        | - | - | - |
+| Falcon                      | √ | √ | √ |
+| Chinese LLaMA/Alpaca        | √ | √ | √ |
+| Vigogne(French)             | √ | √ | √ |
+| BERT                        | x | x | x |
+| Koala                       | √ | √ | √ |
+| Baichuan                    | √ | √ | √ |
+| Aquila 1 & 2                | √ | √ | √ |
+| Starcoder models            | √ | √ | √ |
+| Refact                      | √ | √ | √ |
+| MPT                         | √ | √ | √ |
+| Bloom                       | √ | √ | √ |
+| Yi models                   | √ | √ | √ |
+| stablelm models             | √ | √ | √ |
+| DeepSeek models             | x | x | x |
+| Qwen models                 | √ | √ | √ |
+| PLaMo-13B                   | √ | √ | √ |
+| Phi models                  | √ | √ | √ |
+| PhiMoE                      | √ | √ | √ |
+| GPT-2                       | √ | √ | √ |
+| Orion                       | √ | √ | √ |
+| InternlLM2                  | √ | √ | √ |
+| CodeShell                   | √ | √ | √ |
+| Gemma                       | √ | √ | √ |
+| Mamba                       | √ | √ | √ |
+| Xverse                      | √ | √ | √ |
+| command-r models            | √ | √ | √ |
+| Grok-1                      | - | - | - |
+| SEA-LION                    | √ | √ | √ |
 | GritLM-7B                   | √ | √ | √ |
-| internlm2_5-7b-chat         | √ | √ | √ |
-| koala-7B-HF                 | √ | √ | √ |
-| Llama-2-7b-chat-hf          | √ | √ | √ |
-| Llama-3-Smaug-8B            | √ | √ | √ |
-| Llama2-Chinese-7b-Chat      | √ | √ | √ |
-| Llama3-8B                   | √ | √ | √ |
-| Llama3-8b-chinese           | √ | √ | √ |
-| mamba-130m-hf               | √ | √ | √ |
-| Mistral-7B-Instruct-v0.2    | √ | √ | √ |
-| Mixtral-8x7B-Instruct-v0.1  | x | √ | √ |
-| mpt-7B                      | √ | √ | √ |
-| OLMo-1B-hf                  | √ | √ | √ |
-| OpenELM-3B-Instruct         | √ | √ | √ |
-| Orion-14b-base              | √ | √ | √ |
-| phi1                        | x | x | x |
-| phi2                        | x | x | x |
-| Phi-3-mini-4k-instruct      | √ | √ | √ |
-| plamo-13b                   | √ | √ | √ |
-| pythia-70M                  | x | x | x |
-| Qwen-7B                     | √ | √ | √ |
-| Qwen2-1.5B-Instruct         | √ | x | √ |
-| Refact-1_6B-fim             | √ | √ | √ |
-| SmolLM-135M                 | √ | √ | √ |
-| stablelm-zephyr             | x | x | x |
-| stablelm-2-zephyr-1_6b      | x | x | x |
-| starcoderbase-1b            | √ | √ | √ |
-| starcoder2-3b               | √ | √ | √ |
-| vigogne-7b-chat             | √ | √ | √ |
-| xverse-7b-chat              | √ | √ | √ |
-| Yi-6b-Chat                  | √ | √ | √ |
+| OLMo                        | √ | √ | √ |
+| OLMo 2                      | √ | √ | √ |
+| OLMoE                       | √ | √ | √ |
+| Granite models              | √ | √ | √ |
+| GPT-NeoX                    | √ | √ | √ |
+| Pythia                      | √ | √ | √ |
+| Snowflake-Arctic MoE        | - | - | - |
+| Smaug                       | √ | √ | √ |
+| Poro 34B                    | √ | √ | √ |
+| Bitnet b1.58 models         | √ | x | x |
+| Flan-T5                     | √ | √ | √ |
+| Open Elm models             | x | √ | √ |
+| chatGLM3-6B + ChatGLM4-9b + GLMEdge-1.5b + GLMEdge-4b | √ | √ | √ |
+| GLM-4-0414                  | √ | √ | √ |
+| SmolLM                      | √ | √ | √ |
+| EXAONE-3.0-7.8B-Instruct    | √ | √ | √ |
+| FalconMamba Models          | √ | √ | √ |
+| Jais Models                 | - | x | x |
+| Bielik-11B-v2.3             | √ | √ | √ |
+| RWKV-6                      | - | √ | √ |
+| QRWKV-6                     | √ | √ | √ |
+| GigaChat-20B-A3B            | x | x | x |
+| Trillion-7B-preview         | √ | √ | √ |
+| Ling models                 | √ | √ | √ |
+
+
+**Multimodal**
+| Model Name                  | FP16 | Q4_0 | Q8_0 |
+|:----------------------------|:-----:|:----:|:----:|
+| LLaVA 1.5 models, LLaVA 1.6 models | x | x | x |
+| BakLLaVA                    | √ | √ | √ |
+| Obsidian                    | √ | - | - |
+| ShareGPT4V                  | x | - | - |
+| MobileVLM 1.7B/3B models    | - | - | - |
+| Yi-VL                       | - | - | - |
+| Mini CPM                    | √ | √ | √ |
+| Moondream                   | √ | √ | √ |
+| Bunny                       | √ | - | - |
+| GLM-EDGE                    | √ | √ | √ |
+| Qwen2-VL                    | √ | √ | √ |
 
 
 
examples/lookahead/lookahead.cpp

Lines changed: 0 additions & 13 deletions
@@ -50,8 +50,6 @@ int main(int argc, char ** argv) {
     const int N = 5; // n-gram size
     const int G = 15; // max verification n-grams
 
-    const bool dump_kv_cache = params.dump_kv_cache;
-
     // init llama.cpp
     llama_backend_init();
     llama_numa_init(params.numa);
@@ -152,9 +150,6 @@
     // here we keep adding new n-grams as we go
     ngram_container ngrams_observed(llama_vocab_n_tokens(vocab), N, G);
 
-    // debug
-    struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, W + G + 1);
-
     const auto t_dec_start = ggml_time_us();
 
     // sample first token
@@ -172,12 +167,6 @@
     }
 
     while (true) {
-        // debug
-        if (dump_kv_cache) {
-            llama_kv_cache_view_update(ctx, &kvc_view);
-            common_kv_cache_dump_view_seqs(kvc_view, 40);
-        }
-
         // build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/
         //
         // Example for W = 5, N = 4, G = 2:
@@ -473,8 +462,6 @@ int main(int argc, char ** argv) {
 
     common_sampler_free(smpl);
 
-    llama_kv_cache_view_free(&kvc_view);
-
     llama_batch_free(batch);
 
     llama_backend_free();

examples/lookup/lookup.cpp

Lines changed: 0 additions & 11 deletions
@@ -24,8 +24,6 @@ int main(int argc, char ** argv){
     // max. number of additional tokens to draft if match is found
     const int n_draft = params.speculative.n_max;
 
-    const bool dump_kv_cache = params.dump_kv_cache;
-
     // init llama.cpp
     llama_backend_init();
     llama_numa_init(params.numa);
@@ -110,18 +108,9 @@
 
     llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, 1);
 
-    // debug
-    struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, 1);
-
     const auto t_dec_start = ggml_time_us();
 
     while (true) {
-        // debug
-        if (dump_kv_cache) {
-            llama_kv_cache_view_update(ctx, &kvc_view);
-            common_kv_cache_dump_view_seqs(kvc_view, 40);
-        }
-
         // print current draft sequence
         LOG_DBG("drafted %s\n", string_from(ctx, draft).c_str());
 
examples/parallel/parallel.cpp

Lines changed: 0 additions & 9 deletions
@@ -178,8 +178,6 @@ int main(int argc, char ** argv) {
     // insert new requests as soon as the previous one is done
     const bool cont_batching = params.cont_batching;
 
-    const bool dump_kv_cache = params.dump_kv_cache;
-
     // is the system prompt shared in the cache
     const bool is_sp_shared = params.is_pp_shared;
 
@@ -241,8 +239,6 @@
     int32_t n_total_gen  = 0;
     int32_t n_cache_miss = 0;
 
-    struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, n_clients);
-
     const auto t_main_start = ggml_time_us();
 
     LOG_INF("%s: Simulating parallel requests from clients:\n", __func__);
@@ -272,11 +268,6 @@
     LOG_INF("Processing requests ...\n\n");
 
     while (true) {
-        if (dump_kv_cache) {
-            llama_kv_cache_view_update(ctx, &kvc_view);
-            common_kv_cache_dump_view_seqs(kvc_view, 40);
-        }
-
         common_batch_clear(batch);
 
         // decode any currently ongoing sequences
