Merged

Changes from all commits (23 commits)
57549f6  ggml : remove ggml_graph_import and ggml_graph_export declarations (g… (rgerganov, May 30, 2025)
2e445b2  cmake : Fix broken CMake error messages (ggml/1252) (dg0yt, May 31, 2025)
59c6afa  vulkan : Remove unexpected ; (ggml/1253) (dg0yt, May 31, 2025)
ee61e88  ggml : add ggml_repeat_4d (llama/13824) (ngxson, May 27, 2025)
da9b2d3  SYCL: add gelu_erf kernel (llama/13749) (qnixsynapse, May 27, 2025)
52520d8  vulkan: use timestamp queries for GGML_VULKAN_PERF (llama/13817) (jeffbolznv, May 27, 2025)
355202e  opencl: mark `mul_mat` `f32f32` as supporting non-contiguous tensors … (lhez, May 27, 2025)
f22ba6e  opencl: add new ops - `argsort`, `div`, `sub`, `addrows`, `sigmoid`, … (lhez, May 27, 2025)
194a5a1  CANN: Add SOC TYPE printing in cmake configuration (llama/13837) (leo-pony, May 28, 2025)
395e809  CUDA: fix FA tg at long context for CC >= 8.9 (llama/13852) (JohannesGaessler, May 28, 2025)
2995878  ggml: aarch64: Implement SVE F32 kernels for vector functions (llama/… (vineelabhinav, May 29, 2025)
c0f50c4  ggml: aarch64: Implement SVE F32 kernels for Mamba Sequential Scan Al… (vineelabhinav, May 29, 2025)
9241a94  cmake: Factor out CPU architecture detection (llama/13883) (ckastner, May 29, 2025)
d311b3d  arm64: optimize q4_k_q8_k kernel with i8mm (llama/13886) (cyb70289, May 29, 2025)
5138044  cmake: Guard GGML_CPU_ALL_VARIANTS by architecture (llama/13890) (ckastner, May 29, 2025)
6ab025b  SYCL: Add mrope kernel (llama/13755) (qnixsynapse, May 30, 2025)
1c216c0  cuda : prevent using split buffers with 3d/4d matrices (llama/13919) (slaren, May 30, 2025)
085f43f  sched : avoid changing cur_copy when a graph is already allocated (ll… (slaren, May 30, 2025)
6a2f2c4  CUDA: fix typo in FlashAttention code (llama/13926) (JohannesGaessler, May 30, 2025)
45ac7b3  CUDA: add a prop in ggml_cuda_device_infor for distinguish iGPU or dG… (Yangxiaoz, May 31, 2025)
92c6df7  threading: support for GGML_SCHED_PRIO_LOW, update thread info on Win… (max-krasnyansky, May 31, 2025)
85ca186  sync : ggml (ggerganov, Jun 1, 2025)
95001c7  talk-llama : sync llama.cpp (ggerganov, Jun 1, 2025)
3 changes: 3 additions & 0 deletions examples/talk-llama/CMakeLists.txt
@@ -17,6 +17,9 @@ if (WHISPER_SDL2)
         llama-impl.cpp
         llama-io.cpp
         llama-kv-cache.cpp
+        llama-kv-cache-unified.cpp
+        llama-kv-cache-unified-iswa.cpp
+        llama-kv-cache-recurrent.cpp
         llama-memory.cpp
         llama-mmap.cpp
         llama-model-loader.cpp
3 changes: 3 additions & 0 deletions examples/talk-llama/llama-arch.cpp
@@ -174,6 +174,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_CONVNEXT_EMBEDDING_LENGTH, "%s.convnext.embedding_length" },
     { LLM_KV_CONVNEXT_BLOCK_COUNT, "%s.convnext.block_count" },

+    { LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" },
+
     { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
     { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
     { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
@@ -448,6 +450,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
     { LLM_TENSOR_TOKEN_TYPES, "token_types" },
     { LLM_TENSOR_POS_EMBD, "position_embd" },
     { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
+    { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
     { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
     { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
     { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
2 changes: 2 additions & 0 deletions examples/talk-llama/llama-arch.h
@@ -213,6 +213,8 @@ enum llm_kv {
     LLM_KV_CONVNEXT_EMBEDDING_LENGTH,
     LLM_KV_CONVNEXT_BLOCK_COUNT,

+    LLM_KV_CLASSIFIER_OUTPUT_LABELS,
+
     // deprecated:
     LLM_KV_TOKENIZER_PREFIX_ID,
     LLM_KV_TOKENIZER_SUFFIX_ID,
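For readers skimming the sync: the entries added above are printf-style name templates. In `LLM_KV_NAMES` the `%s` placeholder is filled with the architecture name to form a GGUF metadata key, and in `LLM_TENSOR_NAMES` the `%d` placeholder is filled with the layer index to form a tensor name. A minimal, self-contained sketch of that expansion (the `format_name` helper and the `"bert"` / layer-0 examples are illustrative only, not part of this diff; llama.cpp has its own helpers for this):

```cpp
// Illustrative only: mimics how "%s"/"%d" templated names like the ones added
// above expand into concrete GGUF keys and tensor names.
#include <cstdio>
#include <string>

template <typename T>
static std::string format_name(const char * fmt, T arg) {
    char buf[256];
    std::snprintf(buf, sizeof(buf), fmt, arg);
    return buf;
}

int main() {
    // KV key for a hypothetical "bert" classifier model:
    //   bert.classifier.output_labels
    std::printf("%s\n", format_name("%s.classifier.output_labels", "bert").c_str());

    // Tensor name for the fused QKV projection of layer 0:
    //   blk.0.attn_qkv
    std::printf("%s\n", format_name("blk.%d.attn_qkv", 0).c_str());
    return 0;
}
```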
31 changes: 19 additions & 12 deletions examples/talk-llama/llama-batch.cpp
@@ -15,24 +15,31 @@ llama_ubatch llama_sbatch::reserve_ubatch(size_t n_ubatch, bool has_embd) {
             break;
         }
     }
-    ubatch_token.resize(!has_embd ? n_ubatch : 0);
-    ubatch_embd.resize(has_embd ? n_embd * n_ubatch : 0);
-    ubatch_pos.resize(n_ubatch);
-    ubatch_n_seq_id.resize(n_ubatch);
-    ubatch_seq_id.resize(n_ubatch);
-    ubatch_output.resize(n_ubatch);
+
+    udatas.push_back({});
+
+    auto & udata = udatas.back();
+
+    udata.token.resize(!has_embd ? n_ubatch : 0);
+    udata.embd.resize(has_embd ? n_embd * n_ubatch : 0);
+    udata.pos.resize(n_ubatch);
+    udata.n_seq_id.resize(n_ubatch);
+    udata.seq_id.resize(n_ubatch);
+    udata.output.resize(n_ubatch);
+
     llama_ubatch ubatch = {
         /*equal_seqs   =*/ true,
         /*n_tokens     =*/ 0,
         /*n_seq_tokens =*/ 0,
         /*n_seqs       =*/ 0,
-        /*token        =*/ !has_embd ? ubatch_token.data() : nullptr,
-        /*embd         =*/ has_embd ? ubatch_embd.data() : nullptr,
-        /*pos          =*/ ubatch_pos.data(),
-        /*n_seq_id     =*/ ubatch_n_seq_id.data(),
-        /*seq_id       =*/ ubatch_seq_id.data(),
-        /*output       =*/ ubatch_output.data(),
+        /*token        =*/ !has_embd ? udata.token.data() : nullptr,
+        /*embd         =*/ has_embd ? udata.embd.data() : nullptr,
+        /*pos          =*/ udata.pos.data(),
+        /*n_seq_id     =*/ udata.n_seq_id.data(),
+        /*seq_id       =*/ udata.seq_id.data(),
+        /*output       =*/ udata.output.data(),
     };
+
     return ubatch;
 }
25 changes: 15 additions & 10 deletions examples/talk-llama/llama-batch.h
@@ -11,15 +11,15 @@ struct llama_ubatch {
     bool equal_seqs;
     // TODO: whole_seqs for embeddings?

-    uint32_t n_tokens; // total tokens (n_seq_tokens * n_seqs)
+    uint32_t n_tokens;     // total tokens (n_seq_tokens * n_seqs)
     uint32_t n_seq_tokens; // tokens per sequence
     uint32_t n_seqs;

     llama_token * token; // [n_tokens]
     float * embd; // [n_embd, n_tokens]
     llama_pos * pos; // [n_tokens]
-    int32_t * n_seq_id; // [n_seqs]
-    llama_seq_id ** seq_id; // [n_seqs]
+    int32_t * n_seq_id; // [n_seqs] // TODO: remove, should belong to only 1 sequence
+    llama_seq_id ** seq_id; // [n_seqs] // TODO: become llama_seq_id * seq_id;
     int8_t * output; // [n_tokens]
 };

@@ -49,13 +49,18 @@ struct llama_sbatch {

     const llama_batch * batch = nullptr;

-    // buffers for the ubatch
-    std::vector<llama_token> ubatch_token;
-    std::vector<float> ubatch_embd;
-    std::vector<llama_pos> ubatch_pos;
-    std::vector<int32_t> ubatch_n_seq_id;
-    std::vector<llama_seq_id *> ubatch_seq_id;
-    std::vector<int8_t> ubatch_output;
+    // buffers for the ubatches
+    // TODO: very hacky, this needs a complete rework
+    struct ubatch_data {
+        std::vector<llama_token> token;
+        std::vector<float> embd;
+        std::vector<llama_pos> pos;
+        std::vector<int32_t> n_seq_id;
+        std::vector<llama_seq_id *> seq_id;
+        std::vector<int8_t> output;
+    };
+
+    std::vector<ubatch_data> udatas;

     llama_ubatch reserve_ubatch(size_t n_ubatch, bool has_embd = false);
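The llama-batch change above replaces the single set of `ubatch_*` buffers in `llama_sbatch` with a vector of per-reservation `ubatch_data` structs: each `reserve_ubatch()` call now pushes its own backing storage into `udatas`, presumably so that the raw pointers handed out in a `llama_ubatch` are not clobbered when another ubatch is reserved later (the in-code TODO flags this as a stopgap). A standalone sketch of that ownership pattern, with simplified names that are not part of the diff:

```cpp
// Standalone sketch of the buffer-ownership pattern introduced above (names
// simplified; this is not the llama.cpp code). Each reserve() call gets its
// own backing storage, so views handed out earlier stay valid.
#include <cstdint>
#include <cstdio>
#include <vector>

struct ubatch_view {
    int32_t * token;  // [n_tokens], non-owning
    int8_t  * output; // [n_tokens], non-owning
};

struct batch_builder {
    struct ubatch_data {
        std::vector<int32_t> token;
        std::vector<int8_t>  output;
    };

    std::vector<ubatch_data> udatas; // owns the storage behind every view

    ubatch_view reserve(size_t n_tokens) {
        udatas.push_back({});
        auto & udata = udatas.back();

        udata.token.resize(n_tokens);
        udata.output.resize(n_tokens);

        // If `udatas` itself grows, the inner vectors are moved, not
        // reallocated, so these data() pointers remain valid.
        return { udata.token.data(), udata.output.data() };
    }
};

int main() {
    batch_builder bb;
    ubatch_view a = bb.reserve(4);
    ubatch_view b = bb.reserve(8); // does not invalidate the buffers behind `a`
    a.token[0]  = 42;
    b.output[7] = 1;
    std::printf("a.token[0] = %d\n", a.token[0]);
    return 0;
}
```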