
Commit f890560 (parent: a14c89a)

talk-llama : sync llama.cpp

ggml-ci

25 files changed: +2851 / −1129 lines

examples/talk-llama/CMakeLists.txt
Lines changed: 1 addition & 0 deletions

@@ -20,6 +20,7 @@ if (WHISPER_SDL2)
         llama-memory.cpp
         llama-mmap.cpp
         llama-model-loader.cpp
+        llama-model-saver.cpp
         llama-model.cpp
         llama-quant.cpp
         llama-sampling.cpp

examples/talk-llama/llama-adapter.cpp
Lines changed: 6 additions & 0 deletions

@@ -253,6 +253,9 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
     std::vector<ggml_backend_buffer_type_t> buft_extra;
     {
         auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+        if (!cpu_dev) {
+            throw std::runtime_error(format("%s: no CPU backend found", __func__));
+        }
         auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);

         auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
@@ -291,6 +294,9 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
                 LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));

                 auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+                if (!cpu_dev) {
+                    throw std::runtime_error(format("%s: no CPU backend found", __func__));
+                }
                 buft = ggml_backend_dev_buffer_type(cpu_dev);

                 break;
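
Both hunks add the same guard: ggml_backend_dev_by_type() returns a null device when no CPU backend is registered, and the previous code passed that pointer on unchecked. Below is a minimal standalone sketch of the pattern, assuming a ggml build is available on the include/link path; require_cpu_dev() is a hypothetical helper for illustration, not part of llama.cpp.

// Minimal sketch of the CPU-backend guard added above (illustrative only).
#include "ggml-backend.h"

#include <cstdio>
#include <stdexcept>

static ggml_backend_dev_t require_cpu_dev() {
    ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
    if (!cpu_dev) {
        // same failure the patch now reports, instead of dereferencing a null device later
        throw std::runtime_error("no CPU backend found");
    }
    return cpu_dev;
}

int main() {
    try {
        ggml_backend_dev_t dev = require_cpu_dev();
        std::printf("CPU backend device: %s\n", ggml_backend_dev_name(dev));
    } catch (const std::exception & e) {
        std::fprintf(stderr, "error: %s\n", e.what());
        return 1;
    }
    return 0;
}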

examples/talk-llama/llama-batch.cpp
Lines changed: 5 additions & 1 deletion

@@ -189,7 +189,7 @@ llama_ubatch llama_sbatch::split_seq(size_t n_ubatch) {
     return ubatch;
 }

-void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) {
+llama_sbatch::llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) {
     GGML_ASSERT(batch.n_tokens >= 0);
     this->batch = &batch;
     this->n_embd = n_embd;
@@ -203,6 +203,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
     for (size_t i = 0; i < n_tokens; ++i) {
         ids[i] = i;
     }
+
     if (simple_split) {
         seq.resize(1);
         llama_sbatch_seq & s = seq[0];
@@ -212,6 +213,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
         s.length = n_tokens;
         return;
     }
+
     std::sort(ids.begin(), ids.end(),
         [&batch](size_t a, size_t b) {
             int32_t n_seq_a = batch.n_seq_id ? batch.n_seq_id[a] : 1;
@@ -239,6 +241,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
             return n_seq_a > n_seq_b;
         }
     );
+
     // init seq
     llama_sbatch_seq * last_seq = nullptr;

@@ -262,6 +265,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
         seq.push_back(new_seq);
         last_seq = &seq.back();
     }
+
     // keep shared prompts first at the end, then sort by length descending.
     std::sort(seq.begin(), seq.end(),
         [](llama_sbatch_seq & a, llama_sbatch_seq & b) {

examples/talk-llama/llama-batch.h
Lines changed: 2 additions & 1 deletion

@@ -70,7 +70,8 @@ struct llama_sbatch {
     // sequence-wise split
     llama_ubatch split_seq(size_t n_ubatch);

-    void from_batch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false);
+    llama_sbatch() = default;
+    llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false);
 };

 // temporary allocate memory for the input batch if needed
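
Together with the llama-batch.cpp hunks above, this turns llama_sbatch::from_batch() into a constructor, while the explicit defaulted constructor keeps the type default-constructible for members that are rebuilt later. A hedged sketch of the call-site change, assuming the internal llama-batch.h header is included and batch / n_embd are already in scope:

// Sketch of the call-site change implied by this header diff (illustrative only).
#include "llama-batch.h"

void example_usage(const llama_batch & batch, size_t n_embd) {
    // before: two-step initialization
    //   llama_sbatch sbatch;
    //   sbatch.from_batch(batch, n_embd, /*simple_split=*/true, /*logits_all=*/false);

    // after: construct directly from the batch
    llama_sbatch sbatch(batch, n_embd, /*simple_split=*/true, /*logits_all=*/false);

    // the defaulted constructor remains for objects that are filled in later
    llama_sbatch pending;

    (void) sbatch;
    (void) pending;
}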

examples/talk-llama/llama-chat.cpp
Lines changed: 17 additions & 7 deletions

@@ -35,6 +35,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "mistral-v3",         LLM_CHAT_TEMPLATE_MISTRAL_V3 },
     { "mistral-v3-tekken",  LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
     { "mistral-v7",         LLM_CHAT_TEMPLATE_MISTRAL_V7 },
+    { "mistral-v7-tekken",  LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN },
     { "phi3",               LLM_CHAT_TEMPLATE_PHI_3 },
     { "phi4",               LLM_CHAT_TEMPLATE_PHI_4 },
     { "falcon3",            LLM_CHAT_TEMPLATE_FALCON_3 },
@@ -202,19 +203,20 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|im_start|>assistant\n";
         }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7 || tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN) {
         // Official mistral 'v7' template
         // See: https://huggingface.co/mistralai/Mistral-Large-Instruct-2411#basic-instruct-template-v7
+        // https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503#basic-instruct-template-v7-tekken
+        const char * trailing_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7 ? " " : "";
         for (auto message : chat) {
             std::string role(message->role);
             std::string content(message->content);
             if (role == "system") {
-                ss << "[SYSTEM_PROMPT] " << content << "[/SYSTEM_PROMPT]";
+                ss << "[SYSTEM_PROMPT]" << trailing_space << content << "[/SYSTEM_PROMPT]";
             } else if (role == "user") {
-                ss << "[INST] " << content << "[/INST]";
-            }
-            else {
-                ss << " " << content << "</s>";
+                ss << "[INST]" << trailing_space << content << "[/INST]";
+            } else {
+                ss << trailing_space << content << "</s>";
             }
         }
     } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1
@@ -447,8 +449,16 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|assistant|>";
         }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4 || tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4) {
         ss << "[gMASK]" << "<sop>";
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|" << role << "|>" << "\n" << message->content;
+        }
+        if (add_ass) {
+            ss << "<|assistant|>\n";
+        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
         for (auto message : chat) {
             std::string role(message->role);
             ss << "<|" << role << "|>" << "\n" << message->content;

examples/talk-llama/llama-chat.h
Lines changed: 1 addition & 0 deletions

@@ -14,6 +14,7 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_MISTRAL_V3,
     LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
     LLM_CHAT_TEMPLATE_MISTRAL_V7,
+    LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN,
     LLM_CHAT_TEMPLATE_PHI_3,
     LLM_CHAT_TEMPLATE_PHI_4,
     LLM_CHAT_TEMPLATE_FALCON_3,
