
Commit 5091a28

Merge branch 'ggml-org:master' into master

2 parents: 0a81ff0 + 851553e

11 files changed: 81 additions, 58 deletions

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 11 additions & 0 deletions

@@ -50,6 +50,7 @@
 #include "ggml-cuda/upscale.cuh"
 #include "ggml-cuda/wkv.cuh"
 #include "ggml-cuda/gla.cuh"
+#include "ggml-cuda/set.cuh"
 #include "ggml-cuda/set-rows.cuh"
 #include "ggml-cuda/pad_reflect_1d.cuh"
 #include "ggml.h"

@@ -2416,6 +2417,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_SET_ROWS:
             ggml_cuda_op_set_rows(ctx, dst);
             break;
+        case GGML_OP_SET:
+            ggml_cuda_op_set(ctx, dst);
+            break;
         case GGML_OP_DUP:
             ggml_cuda_dup(ctx, dst);
             break;

@@ -3842,6 +3846,13 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
                 op->src[0]->type == GGML_TYPE_F32 &&
                 (op->src[1]->type == GGML_TYPE_I64 || op->src[1]->type == GGML_TYPE_I32);
             } break;
+        case GGML_OP_SET:
+            {
+                const ggml_type t = op->type;
+                return (t == GGML_TYPE_F32 || t == GGML_TYPE_I32) &&
+                       t == op->src[0]->type &&
+                       t == op->src[1]->type;
+            } break;
         case GGML_OP_CPY:
             {
                 ggml_type src0_type = op->src[0]->type;
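For orientation, a hedged caller-side sketch of how this support check is exercised: the scheduler asks each device whether it can run a given node, and after this change a CUDA device answers yes for a `GGML_OP_SET` node only when `dst`, `src[0]` and `src[1]` all share the same type and that type is F32 or I32. `ggml_backend_dev_supports_op` is the standard ggml-backend query; the wrapper name below is illustrative.

```cpp
// Hedged sketch: query op support before offloading a SET node to a device.
// The node itself would normally come out of a graph built with ggml_set().
#include "ggml.h"
#include "ggml-backend.h"

static bool device_can_run_set(ggml_backend_dev_t dev, const struct ggml_tensor * set_node) {
    // set_node->op == GGML_OP_SET; the CUDA backend now accepts it when
    // set_node->type == set_node->src[0]->type == set_node->src[1]->type
    // and that type is GGML_TYPE_F32 or GGML_TYPE_I32.
    return ggml_backend_dev_supports_op(dev, set_node);
}
```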

ggml/src/ggml-cuda/set.cu

Lines changed: 39 additions & 0 deletions

@@ -0,0 +1,39 @@
+#include "set.cuh"
+#include "cpy.cuh"
+
+void ggml_cuda_op_set(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_I32));
+    GGML_ASSERT(src1->type == src0->type);
+    GGML_ASSERT(dst ->type == src0->type);
+
+    GGML_ASSERT(ggml_is_contiguous(dst));
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    GGML_ASSERT(ggml_is_contiguous(src1));
+
+    const size_t nb1    = ((int32_t *) dst->op_params)[0];
+    const size_t nb2    = ((int32_t *) dst->op_params)[1];
+    const size_t nb3    = ((int32_t *) dst->op_params)[2];
+    const size_t offset = ((int32_t *) dst->op_params)[3];
+    const bool inplace  = (bool) ((int32_t *) dst->op_params)[4];
+
+    if (!inplace) {
+        ggml_cuda_cpy(ctx, src0, dst);
+    }
+
+    ggml_tensor dst_view = *dst;
+    dst_view.data  = (void *)((char *)dst->data + offset);
+    dst_view.ne[0] = src1->ne[0];
+    dst_view.ne[1] = src1->ne[1];
+    dst_view.ne[2] = src1->ne[2];
+    dst_view.ne[3] = src1->ne[3];
+
+    dst_view.nb[0] = ggml_element_size(dst);
+    dst_view.nb[1] = nb1;
+    dst_view.nb[2] = nb2;
+    dst_view.nb[3] = nb3;
+
+    ggml_cuda_cpy(ctx, src1, &dst_view);
+}
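For reference, a hedged host-side sketch of the kind of node this kernel services. `ggml_set()` (core ggml API) records the view strides, byte offset and an in-place flag in `op_params`, which the function above decodes as `{nb1, nb2, nb3, offset, inplace}`: unless in-place, `src0` is first copied wholesale into `dst`, then `src1` is copied into a strided view of `dst` at the given offset. The tensor shapes below are illustrative.

```cpp
// Minimal sketch, assuming the standard ggml graph API: write a small tensor b
// into a sub-range of a larger tensor a. The resulting node is what
// ggml_cuda_op_set() evaluates on the GPU. Context setup/teardown omitted.
#include "ggml.h"

static struct ggml_tensor * build_set_node(struct ggml_context * ctx) {
    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 16, 16); // destination
    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 16,  4); // rows to write

    // place b at row 4 of a: keep a's row/plane strides, offset by 4 rows (in bytes)
    return ggml_set(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], 4*a->nb[1]);
}
```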

ggml/src/ggml-cuda/set.cuh

Lines changed: 7 additions & 0 deletions

@@ -0,0 +1,7 @@
+#pragma once
+
+#include "common.cuh"
+
+#define CUDA_SET_BLOCK_SIZE 256
+
+void ggml_cuda_op_set(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

ggml/src/ggml-hexagon/ggml-hexagon.cpp

Lines changed: 6 additions & 3 deletions

@@ -211,7 +211,7 @@ static inline void hex_format_op_names(char * str, const struct ggml_tensor * t)
 // ** backend sessions

 struct ggml_hexagon_session {
-    ggml_hexagon_session(int dev_id) noexcept(false);
+    ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) noexcept(false);
     ~ggml_hexagon_session() noexcept(true);

     void allocate(int dev_id) noexcept(false);

@@ -1631,10 +1631,13 @@ void ggml_hexagon_session::release() noexcept(true) {
     }
 }

-ggml_hexagon_session::ggml_hexagon_session(int dev_id) noexcept(false) {
+ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) noexcept(false) {
     buffer_type.context        = nullptr;
     repack_buffer_type.context = nullptr;

+    buffer_type.device        = dev;
+    repack_buffer_type.device = dev;
+
     try {
         allocate(dev_id);

@@ -3628,7 +3631,7 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
         devices[i].iface = ggml_backend_hexagon_device_i;
         devices[i].reg   = reg;
         try {
-            devices[i].context = new ggml_hexagon_session(i);
+            devices[i].context = new ggml_hexagon_session(i, &devices[i]);
         } catch (std::exception const &exc) {
             GGML_LOG_ERROR("ggml-hex: failed to create device/session %zu\n", i);
             devices[i].context = nullptr;

src/llama-kv-cache.cpp

Lines changed: 5 additions & 6 deletions

@@ -961,10 +961,14 @@ bool llama_kv_cache::get_has_shift() const {
 uint32_t llama_kv_cache::get_n_kv(const slot_info & sinfo) const {
     uint32_t result = 0;

+    // pad the n_kv value so that the graph remains constant across batches and can be reused
+    // note: this also helps some backends with performance (f.ex https://github.com/ggml-org/llama.cpp/pull/16812#issuecomment-3455112220)
+    const uint32_t n_pad_cur = std::max(n_pad, 256u);
+
     for (uint32_t s = 0; s < sinfo.n_stream(); ++s) {
         const auto & cells = v_cells[sinfo.strm[s]];

-        result = std::max(std::min(cells.size(), std::max(n_pad, GGML_PAD(cells.used_max_p1(), n_pad))), result);
+        result = std::max(std::min(cells.size(), std::max(n_pad_cur, GGML_PAD(cells.used_max_p1(), n_pad_cur))), result);
     }

     return result;

@@ -2014,8 +2018,3 @@ void llama_kv_cache_context::set_input_kq_mask(ggml_tensor * dst, const llama_ub
 void llama_kv_cache_context::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const {
     kv->set_input_pos_bucket(dst, ubatch);
 }
-
-uint32_t llama_kv_cache::get_padding(const llama_cparams & cparams) {
-    // the FA kernels require padding to avoid extra runtime boundary checks
-    return cparams.flash_attn ? 256u : 32u;
-}
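A small worked sketch of the rounding introduced above, under the assumption that `n_pad_cur` is a power of two (256 by default, which is what `GGML_PAD` is used with here): the highest used cell index is rounded up to a multiple of `n_pad_cur`, floored at `n_pad_cur`, and clamped to the total cell count, so `n_kv` takes only a few distinct values across batches and the compute graph can be reused. The helper names below are illustrative.

```cpp
// Hedged sketch of the n_kv rounding above; pad_to() mirrors what GGML_PAD is
// used for in this code path (round up to a multiple of n).
#include <algorithm>
#include <cstdint>

static uint32_t pad_to(uint32_t x, uint32_t n) {
    return ((x + n - 1) / n) * n; // round x up to the next multiple of n
}

static uint32_t n_kv_for(uint32_t used_max_p1, uint32_t n_cells, uint32_t n_pad_cur) {
    return std::min(n_cells, std::max(n_pad_cur, pad_to(used_max_p1, n_pad_cur)));
}

// e.g. with n_pad_cur = 256 and 4096 cells:
//   n_kv_for( 100, 4096, 256) == 256
//   n_kv_for( 300, 4096, 256) == 512
//   n_kv_for(4090, 4096, 256) == 4096   (clamped to the cell count)
```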

src/llama-kv-cache.h

Lines changed: 0 additions & 2 deletions

@@ -19,8 +19,6 @@ struct llama_context;

 class llama_kv_cache : public llama_memory_i {
 public:
-    static uint32_t get_padding(const llama_cparams & cparams);
-
     struct stream_copy_info {
         bool empty() const {
             assert(ssrc.size() == sdst.size());

src/llama-model.cpp

Lines changed: 4 additions & 19 deletions

@@ -19641,7 +19641,7 @@ struct llm_build_apertus : public llm_graph_context {
     }
 };

-llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
+llama_memory_i * llama_model::create_memory(const llama_memory_params & params, const llama_cparams & cparams) const {
     llama_memory_i * res;

     switch (arch) {

@@ -19692,17 +19692,13 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                     };
                 }

-                const auto padding = llama_kv_cache::get_padding(cparams);
-
-                cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
-
                 res = new llama_memory_hybrid(
                     /* model             */ *this,
                     /* attn_type_k       */ params.type_k,
                     /* attn_type_v       */ params.type_v,
                     /* attn_v_trans      */ !cparams.flash_attn,
                     /* attn_kv_size      */ cparams.n_ctx,
-                    /* attn_n_pad        */ padding,
+                    /* attn_n_pad        */ 1,
                     /* attn_n_swa        */ hparams.n_swa,
                     /* attn_swa_type     */ hparams.swa_type,
                     /* recurrent_type_k  */ GGML_TYPE_F32,

@@ -19714,23 +19710,12 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                     /* filter_attn       */ std::move(filter_attn),
                     /* filter_recr       */ std::move(filter_recr));
             } else {
-                const auto padding = llama_kv_cache::get_padding(cparams);
-
                 uint32_t n_ctx_per_stream = cparams.n_ctx;

                 if (!cparams.kv_unified) {
                     n_ctx_per_stream = (cparams.n_ctx + cparams.n_seq_max - 1)/cparams.n_seq_max;
-                    n_ctx_per_stream = GGML_PAD(n_ctx_per_stream, padding);
-
-                    cparams.n_ctx = n_ctx_per_stream*cparams.n_seq_max;
-                } else {
-                    n_ctx_per_stream = GGML_PAD(n_ctx_per_stream, padding);
-
-                    cparams.n_ctx = n_ctx_per_stream;
                 }

-                LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
-
                 llama_memory_i::layer_reuse_cb reuse = nullptr;

                 if (arch == LLM_ARCH_GEMMA3N) {

@@ -19757,7 +19742,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                         n_ctx_per_stream,
                         cparams.n_seq_max,
                         cparams.n_ubatch,
-                        padding,
+                        1,
                         nullptr,
                         reuse);
                 } else {

@@ -19772,7 +19757,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                         cparams.kv_unified,
                         n_ctx_per_stream,
                         cparams.n_seq_max,
-                        padding,
+                        1,
                         hparams.n_swa,
                         hparams.swa_type,
                         nullptr,
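With the `GGML_PAD` calls removed here (the KV cache now pads `n_kv` internally, see `get_n_kv()` above), the only arithmetic left in this path is the ceiling division that splits the context across sequence streams. A hedged sketch with illustrative values:

```cpp
// Hedged sketch of the per-stream context split kept above: ceiling division of
// n_ctx by n_seq_max when the KV cache is not unified, otherwise the full n_ctx.
#include <cstdint>

static uint32_t ctx_per_stream(uint32_t n_ctx, uint32_t n_seq_max, bool kv_unified) {
    return kv_unified ? n_ctx : (n_ctx + n_seq_max - 1)/n_seq_max;
}

// e.g. ctx_per_stream(8192, 4, false) == 2048, ctx_per_stream(10000, 3, false) == 3334;
// rounding to an FA-friendly multiple now happens inside llama_kv_cache.
```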

src/llama-model.h

Lines changed: 1 addition & 2 deletions

@@ -500,9 +500,8 @@ struct llama_model {

     ggml_tensor * get_rope_factors(const llama_cparams & cparams, int il) const;

-    // note: can mutate `cparams`
     // TODO: move this to new llm_arch_model_i interface
-    llama_memory_i * create_memory(const llama_memory_params & params, llama_cparams & cparams) const;
+    llama_memory_i * create_memory(const llama_memory_params & params, const llama_cparams & cparams) const;

     // TODO: move this to new llm_arch_model_i interface
     ggml_cgraph * build_graph(const llm_graph_params & params) const;

tools/llama-bench/README.md

Lines changed: 4 additions & 1 deletion

@@ -82,6 +82,9 @@ Using the `-d <n>` option, each test can be run at a specified context depth, pr

 For a description of the other options, see the [main example](../main/README.md).

+> [!NOTE]
+> The measurements with `llama-bench` do not include the times for tokenization and for sampling.
+
 ## Examples

 ### Text generation with different models

@@ -131,7 +134,7 @@ $ ./llama-bench -n 0 -n 16 -p 64 -t 1,2,4,8,16,32
 | llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 16 | pp 64 | 33.52 ± 0.03 |
 | llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 16 | tg 16 | 15.32 ± 0.05 |
 | llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 32 | pp 64 | 59.00 ± 1.11 |
-| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 32 | tg 16 | 16.41 ± 0.79 ||
+| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 32 | tg 16 | 16.41 ± 0.79 |

 ### Different numbers of layers offloaded to the GPU

tools/server/server.cpp

Lines changed: 3 additions & 24 deletions

@@ -2866,10 +2866,12 @@ struct server_context {

         // if context shifting is disabled, make sure that we don't run out of context
         if (!params_base.ctx_shift && slot.n_past + 1 >= slot.n_ctx) {
+            slot.truncated      = true;
             slot.stop           = STOP_TYPE_LIMIT;
             slot.has_next_token = false;

-            SLT_DBG(slot, "stopped due to running out of context, n_past = %d, n_ctx = %d\n", slot.n_past, slot.n_ctx);
+            SLT_DBG(slot, "stopped due to running out of context capacity, n_past = %d, n_prompt_tokens = %d, n_decoded = %d, n_ctx = %d\n",
+                    slot.n_decoded, slot.n_prompt_tokens(), slot.n_past, slot.n_ctx);
         }

         // check the limits

@@ -2929,36 +2931,13 @@ struct server_context {
             }
         }

-        // if context shift is disabled, we stop when it reaches the context limit
-        if (slot.n_past >= slot.n_ctx) {
-            slot.truncated      = true;
-            slot.stop           = STOP_TYPE_LIMIT;
-            slot.has_next_token = false;
-
-            SLT_DBG(slot, "stopped due to running out of context capacity, n_past = %d, n_prompt_tokens = %d, n_decoded = %d, n_ctx = %d\n",
-                    slot.n_decoded, slot.n_prompt_tokens(), slot.n_past, slot.n_ctx);
-        }
-
         if (llama_vocab_is_eog(vocab, result.tok)) {
             slot.stop           = STOP_TYPE_EOS;
             slot.has_next_token = false;

             SLT_DBG(slot, "%s", "stopped by EOS\n");
         }

-        const auto n_ctx_train = llama_model_n_ctx_train(model);
-
-        if (slot.task->params.n_predict < 1 && slot.n_prompt_tokens() + slot.n_decoded >= n_ctx_train) {
-            slot.truncated      = true;
-            slot.stop           = STOP_TYPE_LIMIT;
-            slot.has_next_token = false; // stop prediction
-
-            SLT_WRN(slot,
-                    "n_predict (%d) is set for infinite generation. "
-                    "Limiting generated tokens to n_ctx_train (%d) to avoid EOS-less generation infinite loop\n",
-                    slot.task->params.n_predict, n_ctx_train);
-        }
-
         SLT_DBG(slot, "n_decoded = %d, n_remaining = %d, next token: %5d '%s'\n", slot.n_decoded, slot.n_remaining, result.tok, token_str.c_str());

         return slot.has_next_token; // continue
