Merged
21 commits
0c4a229
sycl: addressing non-contiguous src1 mul_mats (nc and batched) (llama…
Alcpz May 8, 2025
19d8d9a
vulkan: Allow up to 4096 elements for mul_mat_id row_ids (llama/13326)
jeffbolznv May 9, 2025
00c8056
rpc : add rpc_msg_set_tensor_hash_req (llama/13353)
rgerganov May 9, 2025
f8c75dc
CUDA: fix crash on large batch size for MoE models (llama/13384)
JohannesGaessler May 9, 2025
aef59f4
CUDA: FA support for Deepseek (Ampere or newer) (llama/13306)
JohannesGaessler May 9, 2025
b493e03
sycl : implementation of reordered Q4_0 MMVQ for Intel GPUs (llama/12…
Alcpz May 9, 2025
22f4997
vulkan: scalar flash attention implementation (llama/13324)
jeffbolznv May 10, 2025
0444566
CUDA: fix FlashAttention on Turing (llama/13415)
JohannesGaessler May 10, 2025
86dece9
CUDA: fix race conditions FlashAttention kernels (llama/13438)
JohannesGaessler May 10, 2025
0b1962a
Add `--no-op-offload` to improve `-ot` pp perf in MoE models like lla…
hjc4869 May 11, 2025
c426829
CUDA: fix crash with partial offloading of MoE (llama/13439)
JohannesGaessler May 11, 2025
882d975
enable dpcpp nightly builds with libraries (llama/13406)
AD2605 May 12, 2025
8264872
CUDA: fix misaligned synchronization in FA (llama/13469)
JohannesGaessler May 12, 2025
cb90cb0
ggml-cpu: Integrate fp32=bf16xbf16 SME KleidiAI kernel (llama/13053)
eddnjjn May 12, 2025
fe0d52b
llama/ggml: add LLM training support (llama/10544)
JohannesGaessler May 12, 2025
43a59ec
opencl: remove unnecessary assert for `add` (llama/13257)
lhez May 12, 2025
926e06d
metal : optimize MoE for large batches (llama/13388)
ggerganov May 13, 2025
79fb43e
ggml : add mrope kernel for metal (llama/13457)
ngxson May 13, 2025
89970b9
sync : ggml
ggerganov May 13, 2025
6975380
whisper : update to ggml-backend changes (#0)
ggerganov May 13, 2025
bff8dc2
talk-llama : sync llama.cpp
ggerganov May 13, 2025
1 change: 1 addition & 0 deletions examples/talk-llama/CMakeLists.txt
@@ -20,6 +20,7 @@ if (WHISPER_SDL2)
llama-memory.cpp
llama-mmap.cpp
llama-model-loader.cpp
llama-model-saver.cpp
llama-model.cpp
llama-quant.cpp
llama-sampling.cpp
6 changes: 6 additions & 0 deletions examples/talk-llama/llama-adapter.cpp
@@ -253,6 +253,9 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
std::vector<ggml_backend_buffer_type_t> buft_extra;
{
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
if (!cpu_dev) {
throw std::runtime_error(format("%s: no CPU backend found", __func__));
}
auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);

auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
@@ -291,6 +294,9 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));

auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
if (!cpu_dev) {
throw std::runtime_error(format("%s: no CPU backend found", __func__));
}
buft = ggml_backend_dev_buffer_type(cpu_dev);

break;
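The two hunks above apply the same defensive pattern in two places: look up the CPU backend device and fail with a clear error if ggml was built without a CPU backend, instead of dereferencing a null device later. A minimal sketch of that pattern using the same ggml-backend calls as the diff; the helper name cpu_fallback_buft and the plain error string are illustrative, not code from the PR:

#include <stdexcept>

#include "ggml-backend.h"

// Resolve the CPU device once and turn a missing CPU backend into an
// exception rather than a crash at the first dereference.
static ggml_backend_buffer_type_t cpu_fallback_buft() {
    ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
    if (!cpu_dev) {
        throw std::runtime_error("no CPU backend found");
    }
    // buffer type used as the CPU fallback for LoRA tensors
    return ggml_backend_dev_buffer_type(cpu_dev);
}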
6 changes: 5 additions & 1 deletion examples/talk-llama/llama-batch.cpp
@@ -189,7 +189,7 @@ llama_ubatch llama_sbatch::split_seq(size_t n_ubatch) {
return ubatch;
}

void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) {
llama_sbatch::llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) {
GGML_ASSERT(batch.n_tokens >= 0);
this->batch = &batch;
this->n_embd = n_embd;
@@ -203,6 +203,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
for (size_t i = 0; i < n_tokens; ++i) {
ids[i] = i;
}

if (simple_split) {
seq.resize(1);
llama_sbatch_seq & s = seq[0];
@@ -212,6 +213,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
s.length = n_tokens;
return;
}

std::sort(ids.begin(), ids.end(),
[&batch](size_t a, size_t b) {
int32_t n_seq_a = batch.n_seq_id ? batch.n_seq_id[a] : 1;
@@ -239,6 +241,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
return n_seq_a > n_seq_b;
}
);

// init seq
llama_sbatch_seq * last_seq = nullptr;

@@ -262,6 +265,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
seq.push_back(new_seq);
last_seq = &seq.back();
}

// keep shared prompts first at the end, then sort by length descending.
std::sort(seq.begin(), seq.end(),
[](llama_sbatch_seq & a, llama_sbatch_seq & b) {
3 changes: 2 additions & 1 deletion examples/talk-llama/llama-batch.h
@@ -70,7 +70,8 @@ struct llama_sbatch {
// sequence-wise split
llama_ubatch split_seq(size_t n_ubatch);

void from_batch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false);
llama_sbatch() = default;
llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false);
};

// temporary allocate memory for the input batch if needed
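Together with the llama-batch.cpp change above, the from_batch() member is replaced by a constructor with the same parameters, so splitting state is set up when the llama_sbatch is created. A sketch of how a call site changes, assuming the internal header llama-batch.h is on the include path; the function name, the loop, and the n_ubatch value are illustrative, not taken from the PR:

#include "llama-batch.h"

// Before this change, call sites used two steps:
//   llama_sbatch sbatch;
//   sbatch.from_batch(batch, n_embd, /*simple_split=*/true);
// After the change, the split is initialized at construction time.
void consume_batch(const llama_batch & batch, size_t n_embd) {
    llama_sbatch sbatch(batch, n_embd, /*simple_split=*/true, /*logits_all=*/false);

    while (sbatch.n_tokens > 0) {
        // take the next micro-batch of at most 512 tokens
        llama_ubatch ubatch = sbatch.split_simple(/*n_ubatch=*/512);
        // ... evaluate ubatch ...
        (void) ubatch;
    }
}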
24 changes: 17 additions & 7 deletions examples/talk-llama/llama-chat.cpp
@@ -35,6 +35,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
{ "mistral-v3", LLM_CHAT_TEMPLATE_MISTRAL_V3 },
{ "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
{ "mistral-v7", LLM_CHAT_TEMPLATE_MISTRAL_V7 },
{ "mistral-v7-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN },
{ "phi3", LLM_CHAT_TEMPLATE_PHI_3 },
{ "phi4", LLM_CHAT_TEMPLATE_PHI_4 },
{ "falcon3", LLM_CHAT_TEMPLATE_FALCON_3 },
@@ -202,19 +203,20 @@ int32_t llm_chat_apply_template(
if (add_ass) {
ss << "<|im_start|>assistant\n";
}
} else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7) {
} else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7 || tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN) {
// Official mistral 'v7' template
// See: https://huggingface.co/mistralai/Mistral-Large-Instruct-2411#basic-instruct-template-v7
// https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503#basic-instruct-template-v7-tekken
const char * trailing_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7 ? " " : "";
for (auto message : chat) {
std::string role(message->role);
std::string content(message->content);
if (role == "system") {
ss << "[SYSTEM_PROMPT] " << content << "[/SYSTEM_PROMPT]";
ss << "[SYSTEM_PROMPT]" << trailing_space << content << "[/SYSTEM_PROMPT]";
} else if (role == "user") {
ss << "[INST] " << content << "[/INST]";
}
else {
ss << " " << content << "</s>";
ss << "[INST]" << trailing_space << content << "[/INST]";
} else {
ss << trailing_space << content << "</s>";
}
}
} else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1
@@ -447,8 +449,16 @@ int32_t llm_chat_apply_template(
if (add_ass) {
ss << "<|assistant|>";
}
} else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4 || tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
} else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4) {
ss << "[gMASK]" << "<sop>";
for (auto message : chat) {
std::string role(message->role);
ss << "<|" << role << "|>" << "\n" << message->content;
}
if (add_ass) {
ss << "<|assistant|>\n";
}
} else if (tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
for (auto message : chat) {
std::string role(message->role);
ss << "<|" << role << "|>" << "\n" << message->content;
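The Mistral part of this diff routes both mistral-v7 and the new mistral-v7-tekken template through one branch that differs only in whether a space follows the opening tags; the ChatGLM part splits CHATGLM_4 (which keeps the [gMASK]<sop> prefix and now appends "<|assistant|>\n") from GLMEDGE. A standalone sketch, not code from the diff, that prints what the two Mistral variants render for a single system + user turn:

#include <iostream>
#include <string>

int main() {
    const std::string sys = "You are a helpful assistant.";
    const std::string usr = "Hello";

    // mistral-v7: trailing_space == " "
    std::cout << "[SYSTEM_PROMPT] " << sys << "[/SYSTEM_PROMPT]"
              << "[INST] "          << usr << "[/INST]" << "\n";

    // mistral-v7-tekken: trailing_space == ""
    std::cout << "[SYSTEM_PROMPT]" << sys << "[/SYSTEM_PROMPT]"
              << "[INST]"          << usr << "[/INST]" << "\n";

    return 0;
}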
1 change: 1 addition & 0 deletions examples/talk-llama/llama-chat.h
@@ -14,6 +14,7 @@ enum llm_chat_template {
LLM_CHAT_TEMPLATE_MISTRAL_V3,
LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
LLM_CHAT_TEMPLATE_MISTRAL_V7,
LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN,
LLM_CHAT_TEMPLATE_PHI_3,
LLM_CHAT_TEMPLATE_PHI_4,
LLM_CHAT_TEMPLATE_FALCON_3,
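The new enum value is reached through the name-to-template map shown in the llama-chat.cpp hunk above. A trimmed, self-contained sketch of that lookup; the enum and function names here (llm_chat_template_sketch, resolve_template) are stand-ins, not the identifiers llama.cpp uses:

#include <map>
#include <stdexcept>
#include <string>

enum llm_chat_template_sketch {
    TMPL_MISTRAL_V7,
    TMPL_MISTRAL_V7_TEKKEN,
};

static const std::map<std::string, llm_chat_template_sketch> TEMPLATES = {
    { "mistral-v7",        TMPL_MISTRAL_V7 },
    { "mistral-v7-tekken", TMPL_MISTRAL_V7_TEKKEN },
};

static llm_chat_template_sketch resolve_template(const std::string & name) {
    auto it = TEMPLATES.find(name);
    if (it == TEMPLATES.end()) {
        throw std::invalid_argument("unknown chat template: " + name);
    }
    return it->second;
}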