Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 8 additions & 7 deletions common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1445,6 +1445,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.n_keep = value;
}
));
// Register the "--swa-full" command-line switch.
// The help text embeds the current value of params.swa_full as the default, and the
// handler sets params.swa_full to true when the switch is given (it takes no value).
// NOTE(review): set_env presumably lets the LLAMA_ARG_SWA_FULL environment variable
// enable the same option — confirm against common_arg::set_env.
add_opt(common_arg(
{"--swa-full"},
string_format("use full-size SWA cache (default: %s)\n"
"[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)", params.swa_full ? "true" : "false"),
[](common_params & params) {
params.swa_full = true;
}
).set_env("LLAMA_ARG_SWA_FULL"));
add_opt(common_arg(
{"--no-context-shift"},
string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
Expand Down Expand Up @@ -2057,13 +2065,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.grp_attn_w = value;
}
).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_MAIN}));
// Register the "-dkvc" / "--dump-kv-cache" switch.
// The handler takes no value and simply sets params.dump_kv_cache to true.
add_opt(common_arg(
{"-dkvc", "--dump-kv-cache"},
"verbose print of the KV cache",
[](common_params & params) {
params.dump_kv_cache = true;
}
));
add_opt(common_arg(
{"-nkvo", "--no-kv-offload"},
"disable KV offload",
Expand Down
76 changes: 1 addition & 75 deletions common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1136,6 +1136,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
cparams.flash_attn = params.flash_attn;
cparams.no_perf = params.no_perf;
cparams.op_offload = !params.no_op_offload;
cparams.swa_full = params.swa_full;

if (params.reranking) {
cparams.embeddings = true;
Expand Down Expand Up @@ -1328,81 +1329,6 @@ std::string common_detokenize(const struct llama_vocab * vocab, const std::vecto
return text;
}

//
// KV cache utils
//

void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";

printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);

llama_kv_cache_view_cell * c_curr = view.cells;
llama_seq_id * cs_curr = view.cells_sequences;

for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
if (i % row_size == 0) {
printf("\n%5d: ", i);
}
int seq_count = 0;
for (int j = 0; j < view.n_seq_max; j++) {
if (cs_curr[j] >= 0) { seq_count++; }
}
putchar(slot_chars[std::min(sizeof(slot_chars) - 2, size_t(seq_count))]);
}

printf("\n=== Done dumping\n");
}

void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";

printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);

std::unordered_map<llama_seq_id, size_t> seqs;
llama_kv_cache_view_cell * c_curr = view.cells;
llama_seq_id * cs_curr = view.cells_sequences;

for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
for (int j = 0; j < view.n_seq_max; j++) {
if (cs_curr[j] < 0) { continue; }
if (seqs.find(cs_curr[j]) == seqs.end()) {
if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
const size_t sz = seqs.size();
seqs[cs_curr[j]] = sz;
}
}
if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
}

printf("=== Sequence legend: ");
for (const auto & it : seqs) {
printf("%zu=%d, ", it.second, it.first);
}
printf("'+'=other sequence ids");

c_curr = view.cells;
cs_curr = view.cells_sequences;
for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
if (i % row_size == 0) {
printf("\n%5d: ", i);
}
for (int j = 0; j < view.n_seq_max; j++) {
if (cs_curr[j] >= 0) {
const auto & it = seqs.find(cs_curr[j]);
putchar(it != seqs.end() ? int(slot_chars[it->second]) : '+');
} else {
putchar('.');
}
}
putchar(' ');
}

printf("\n=== Done dumping\n");
}

//
// Embedding utils
//
Expand Down
12 changes: 1 addition & 11 deletions common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -323,13 +323,13 @@ struct common_params {
bool flash_attn = false; // flash attention
bool no_perf = false; // disable performance metrics
bool ctx_shift = true; // context shift on infinite text generation
bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)

bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool use_mmap = true; // use mmap for faster loads
bool use_mlock = false; // use mlock to keep model in memory
bool verbose_prompt = false; // print prompt tokens before generation
bool display_prompt = true; // print prompt before generation
bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
bool no_kv_offload = false; // disable KV offloading
bool warmup = true; // warmup run
bool check_tensors = false; // validate tensor data
Expand Down Expand Up @@ -621,16 +621,6 @@ std::string common_detokenize(
const std::vector<llama_token> & tokens,
bool special = true);

//
// KV cache utils
//

// Dump the KV cache view with the number of sequences per cell.
void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);

// Dump the KV cache view showing individual sequences in each cell (long output).
void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);

//
// Embedding utils
//
Expand Down
126 changes: 74 additions & 52 deletions docs/backend/CANN.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,60 +56,82 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi

## Model Supports

| Model Name | FP16 | Q8_0 | Q4_0 |
| Model Name | FP16 | Q4_0 | Q8_0 |
|:----------------------------|:-----:|:----:|:----:|
| AquilaChat2-7B | √ | √ | √ |
| Baichuan-7b | √ | √ | √ |
| Baichuan2-7B-Chat | √ | √ | √ |
| bitnet_b1_58-large | √ | √ | √ |
| bloom-560m | √ | x | √ |
| bloomz-alpaca-560m | √ | x | √ |
| c4ai-command-r-35B-v01 | x | x | x |
| chatglm3-6B | x | x | x |
| chinese-alpaca-2-1.3b | √ | √ | √ |
| CodeShell-7B | √ | √ | √ |
| deepseek-ai_deepseek-coder-1.3B-base | x | x | x |
| deepseek-ai_DeepSeek-V2-Lite | x | x | x |
| deepseek-coder-6.7B-instruct | x | x | x |
| DeepSeek-V2-Lite-64x1.5B | x | x | x |
| falcon-7b-instruct | √ | √ | √ |
| flan-t5-large | √ | √ | √ |
| gemma-2-9b-it | √ | √ | √ |
| glm-4-9B | x | x | x |
| gpt2 | √ | √ | √ |
| Gpt2-163M | √ | √ | √ |
| granite-3B-code-instruct | √ | √ | √ |
| Llama-2 | √ | √ | √ |
| Llama-3 | √ | √ | √ |
| Mistral-7B | √ | √ | √ |
| Mistral MOE | √ | √ | √ |
| DBRX | - | - | - |
| Falcon | √ | √ | √ |
| Chinese LLaMA/Alpaca | √ | √ | √ |
| Vigogne(French) | √ | √ | √ |
| BERT | x | x | x |
| Koala | √ | √ | √ |
| Baichuan | √ | √ | √ |
| Aquila 1 & 2 | √ | √ | √ |
| Starcoder models | √ | √ | √ |
| Refact | √ | √ | √ |
| MPT | √ | √ | √ |
| Bloom | √ | √ | √ |
| Yi models | √ | √ | √ |
| stablelm models | √ | √ | √ |
| DeepSeek models | x | x | x |
| Qwen models | √ | √ | √ |
| PLaMo-13B | √ | √ | √ |
| Phi models | √ | √ | √ |
| PhiMoE | √ | √ | √ |
| GPT-2 | √ | √ | √ |
| Orion | √ | √ | √ |
| InternLM2 | √ | √ | √ |
| CodeShell | √ | √ | √ |
| Gemma | √ | √ | √ |
| Mamba | √ | √ | √ |
| Xverse | √ | √ | √ |
| command-r models | √ | √ | √ |
| Grok-1 | - | - | - |
| SEA-LION | √ | √ | √ |
| GritLM-7B | √ | √ | √ |
| internlm2_5-7b-chat | √ | √ | √ |
| koala-7B-HF | √ | √ | √ |
| Llama-2-7b-chat-hf | √ | √ | √ |
| Llama-3-Smaug-8B | √ | √ | √ |
| Llama2-Chinese-7b-Chat | √ | √ | √ |
| Llama3-8B | √ | √ | √ |
| Llama3-8b-chinese | √ | √ | √ |
| mamba-130m-hf | √ | √ | √ |
| Mistral-7B-Instruct-v0.2 | √ | √ | √ |
| Mixtral-8x7B-Instruct-v0.1 | x | √ | √ |
| mpt-7B | √ | √ | √ |
| OLMo-1B-hf | √ | √ | √ |
| OpenELM-3B-Instruct | √ | √ | √ |
| Orion-14b-base | √ | √ | √ |
| phi1 | x | x | x |
| phi2 | x | x | x |
| Phi-3-mini-4k-instruct | √ | √ | √ |
| plamo-13b | √ | √ | √ |
| pythia-70M | x | x | x |
| Qwen-7B | √ | √ | √ |
| Qwen2-1.5B-Instruct | √ | x | √ |
| Refact-1_6B-fim | √ | √ | √ |
| SmolLM-135M | √ | √ | √ |
| stablelm-zephyr | x | x | x |
| stablelm-2-zephyr-1_6b | x | x | x |
| starcoderbase-1b | √ | √ | √ |
| starcoder2-3b | √ | √ | √ |
| vigogne-7b-chat | √ | √ | √ |
| xverse-7b-chat | √ | √ | √ |
| Yi-6b-Chat | √ | √ | √ |
| OLMo | √ | √ | √ |
| OLMo 2 | √ | √ | √ |
| OLMoE | √ | √ | √ |
| Granite models | √ | √ | √ |
| GPT-NeoX | √ | √ | √ |
| Pythia | √ | √ | √ |
| Snowflake-Arctic MoE | - | - | - |
| Smaug | √ | √ | √ |
| Poro 34B | √ | √ | √ |
| Bitnet b1.58 models | √ | x | x |
| Flan-T5 | √ | √ | √ |
| Open Elm models | x | √ | √ |
| chatGLM3-6B + ChatGLM4-9b + GLMEdge-1.5b + GLMEdge-4b | √ | √ | √ |
| GLM-4-0414 | √ | √ | √ |
| SmolLM | √ | √ | √ |
| EXAONE-3.0-7.8B-Instruct | √ | √ | √ |
| FalconMamba Models | √ | √ | √ |
| Jais Models | - | x | x |
| Bielik-11B-v2.3 | √ | √ | √ |
| RWKV-6 | - | √ | √ |
| QRWKV-6 | √ | √ | √ |
| GigaChat-20B-A3B | x | x | x |
| Trillion-7B-preview | √ | √ | √ |
| Ling models | √ | √ | √ |


**Multimodal**
| Model Name | FP16 | Q4_0 | Q8_0 |
|:----------------------------|:-----:|:----:|:----:|
| LLaVA 1.5 models, LLaVA 1.6 models | x | x | x |
| BakLLaVA | √ | √ | √ |
| Obsidian | √ | - | - |
| ShareGPT4V | x | - | - |
| MobileVLM 1.7B/3B models | - | - | - |
| Yi-VL | - | - | - |
| Mini CPM | √ | √ | √ |
| Moondream | √ | √ | √ |
| Bunny | √ | - | - |
| GLM-EDGE | √ | √ | √ |
| Qwen2-VL | √ | √ | √ |



Expand Down
13 changes: 0 additions & 13 deletions examples/lookahead/lookahead.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,6 @@ int main(int argc, char ** argv) {
const int N = 5; // n-gram size
const int G = 15; // max verification n-grams

const bool dump_kv_cache = params.dump_kv_cache;

// init llama.cpp
llama_backend_init();
llama_numa_init(params.numa);
Expand Down Expand Up @@ -152,9 +150,6 @@ int main(int argc, char ** argv) {
// here we keep adding new n-grams as we go
ngram_container ngrams_observed(llama_vocab_n_tokens(vocab), N, G);

// debug
struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, W + G + 1);

const auto t_dec_start = ggml_time_us();

// sample first token
Expand All @@ -172,12 +167,6 @@ int main(int argc, char ** argv) {
}

while (true) {
// debug
if (dump_kv_cache) {
llama_kv_cache_view_update(ctx, &kvc_view);
common_kv_cache_dump_view_seqs(kvc_view, 40);
}

// build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/
//
// Example for W = 5, N = 4, G = 2:
Expand Down Expand Up @@ -473,8 +462,6 @@ int main(int argc, char ** argv) {

common_sampler_free(smpl);

llama_kv_cache_view_free(&kvc_view);

llama_batch_free(batch);

llama_backend_free();
Expand Down
11 changes: 0 additions & 11 deletions examples/lookup/lookup.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,6 @@ int main(int argc, char ** argv){
// max. number of additional tokens to draft if match is found
const int n_draft = params.speculative.n_max;

const bool dump_kv_cache = params.dump_kv_cache;

// init llama.cpp
llama_backend_init();
llama_numa_init(params.numa);
Expand Down Expand Up @@ -110,18 +108,9 @@ int main(int argc, char ** argv){

llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, 1);

// debug
struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, 1);

const auto t_dec_start = ggml_time_us();

while (true) {
// debug
if (dump_kv_cache) {
llama_kv_cache_view_update(ctx, &kvc_view);
common_kv_cache_dump_view_seqs(kvc_view, 40);
}

// print current draft sequence
LOG_DBG("drafted %s\n", string_from(ctx, draft).c_str());

Expand Down
9 changes: 0 additions & 9 deletions examples/parallel/parallel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -178,8 +178,6 @@ int main(int argc, char ** argv) {
// insert new requests as soon as the previous one is done
const bool cont_batching = params.cont_batching;

const bool dump_kv_cache = params.dump_kv_cache;

// is the system prompt shared in the cache
const bool is_sp_shared = params.is_pp_shared;

Expand Down Expand Up @@ -241,8 +239,6 @@ int main(int argc, char ** argv) {
int32_t n_total_gen = 0;
int32_t n_cache_miss = 0;

struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, n_clients);

const auto t_main_start = ggml_time_us();

LOG_INF("%s: Simulating parallel requests from clients:\n", __func__);
Expand Down Expand Up @@ -272,11 +268,6 @@ int main(int argc, char ** argv) {
LOG_INF("Processing requests ...\n\n");

while (true) {
if (dump_kv_cache) {
llama_kv_cache_view_update(ctx, &kvc_view);
common_kv_cache_dump_view_seqs(kvc_view, 40);
}

common_batch_clear(batch);

// decode any currently ongoing sequences
Expand Down
Loading