Commit 7233ee8

Merge branch 'ggml-org:master' into seed_oss
2 parents: fa8da8e + 54a241f

8 files changed: 134 additions & 438 deletions (3 of the changed files are shown below)

common/arg.cpp

Lines changed: 1 addition & 1 deletion
@@ -1755,7 +1755,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.warmup = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
     add_opt(common_arg(
         {"--spm-infill"},
         string_format(
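
Note on this hunk: set_examples() controls which llama.cpp tools register a given option, so adding LLAMA_EXAMPLE_PERPLEXITY makes the warmup-disabling option (the handler sets params.warmup = false) available to the perplexity tool as well. A sketch of how the full registration plausibly reads in common/arg.cpp; the flag name and help text are not visible in this hunk and are assumptions:

add_opt(common_arg(
    {"--no-warmup"},                                // assumed flag name (the hunk only shows the handler)
    "skip warming up the model with an empty run",  // assumed help text
    [](common_params & params) {
        params.warmup = false;                      // shown in the hunk: the handler just clears the warmup flag
    }
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));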

ggml/src/ggml-backend.cpp

Lines changed: 20 additions & 7 deletions
@@ -1355,15 +1355,15 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
     std::vector<int32_t> ids;
     std::vector<ggml_bitset_t> used_ids;

-    for (int i = 0; i < sched->n_splits; i++) {
-        struct ggml_backend_sched_split * split = &splits[i];
+    for (int split_id = 0; split_id < sched->n_splits; split_id++) {
+        struct ggml_backend_sched_split * split = &splits[split_id];
         int split_backend_id = split->backend_id;
         ggml_backend_t split_backend = sched->backends[split_backend_id];

         // copy the input tensors to the split backend
-        for (int j = 0; j < split->n_inputs; j++) {
-            ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
-            struct ggml_tensor * input = split->inputs[j];
+        for (int input_id = 0; input_id < split->n_inputs; input_id++) {
+            ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[input_id]);
+            struct ggml_tensor * input = split->inputs[input_id];
             struct ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy);

             if (input->flags & GGML_TENSOR_FLAG_INPUT) {
@@ -1398,17 +1398,30 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s

                 // get the ids
                 ggml_tensor * ids_tensor = node->src[2];
+                ggml_backend_t ids_backend = split_backend;
+
+                // if the ids tensor is also an input of the split, it may not have been copied yet to the split backend
+                // in that case, we use the original ids tensor
+                for (int i = input_id + 1; i < split->n_inputs; i++) {
+                    if (ids_tensor == tensor_copy(split->inputs[i], split_backend_id, sched->cur_copy)) {
+                        ids_tensor = split->inputs[i];
+                        ids_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[i]);
+                        break;
+                    }
+                }
+
                 if (ids_tensor != prev_ids_tensor) {
                     ids.resize(ggml_nbytes(ids_tensor) / sizeof(int32_t));
-                    ggml_backend_tensor_get_async(split_backend, ids_tensor, ids.data(), 0, ggml_nbytes(ids_tensor));
-                    ggml_backend_synchronize(split_backend);
+                    ggml_backend_tensor_get_async(ids_backend, ids_tensor, ids.data(), 0, ggml_nbytes(ids_tensor));
+                    ggml_backend_synchronize(ids_backend);

                     // find the used experts
                     used_ids.clear();
                     used_ids.resize(ggml_bitset_size(n_expert));
                     for (int64_t i1 = 0; i1 < ids_tensor->ne[1]; i1++) {
                         for (int64_t i0 = 0; i0 < ids_tensor->ne[0]; i0++) {
                             int32_t id = ids[i1 * ids_tensor->nb[1]/sizeof(int32_t) + i0 * ids_tensor->nb[0]/sizeof(int32_t)];
+                            GGML_ASSERT(id >= 0 && id < n_expert);
                             ggml_bitset_set(used_ids.data(), id);
                         }
                     }
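
For context on the "find the used experts" loop that the new GGML_ASSERT hardens: the scheduler reads the ids tensor back to the host and sets one bit per expert id that actually occurs, so it can later skip copying expert weights that are never selected. A minimal, self-contained sketch of that bitset bookkeeping (the helper names below are illustrative stand-ins for ggml_bitset_size/ggml_bitset_set, not the ggml API):

#include <cstdint>
#include <vector>

static inline size_t bitset_size(int n)                     { return (n + 31) / 32; }        // like ggml_bitset_size()
static inline void   bitset_set (uint32_t * b, int i)       { b[i >> 5] |= 1u << (i & 31); } // like ggml_bitset_set()
static inline bool   bitset_get (const uint32_t * b, int i) { return (b[i >> 5] >> (i & 31)) & 1u; }

// ids: the expert indices read back from the ids tensor (one int32 per slot)
static std::vector<uint32_t> mark_used_experts(const std::vector<int32_t> & ids, int n_expert) {
    std::vector<uint32_t> used(bitset_size(n_expert), 0);
    for (int32_t id : ids) {
        // mirrors the new GGML_ASSERT(id >= 0 && id < n_expert) guard in the scheduler
        if (id >= 0 && id < n_expert) {
            bitset_set(used.data(), id);
        }
    }
    return used;
}

An out-of-range id (for example, one read from an ids tensor whose copy has not been populated yet) would otherwise write past the end of the bitset; reading from ids_backend and asserting the range makes that failure loud instead of silent.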

include/llama.h

Lines changed: 0 additions & 105 deletions
@@ -663,111 +663,6 @@ extern "C" {
     // Check if the memory supports shifting
     LLAMA_API bool llama_memory_can_shift(llama_memory_t mem);

-    //
-    // KV cache for self-attention (TODO: deprecate in favor of llama_memory)
-    //
-
-    // Returns the number of tokens in the KV cache (slow, use only for debug)
-    // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
-    DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx),
-        "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
-
-    // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
-    DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx),
-        "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
-
-    // Clear the KV cache - both cell info is erased and KV data is zeroed
-    DEPRECATED(LLAMA_API void llama_kv_self_clear(
-            struct llama_context * ctx),
-        "Use llama_memory_clear() instead");
-
-    // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
-    // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
-    // seq_id < 0 : match any sequence
-    // p0 < 0     : [0,  p1]
-    // p1 < 0     : [p0, inf)
-    DEPRECATED(LLAMA_API bool llama_kv_self_seq_rm(
-            struct llama_context * ctx,
-            llama_seq_id seq_id,
-            llama_pos p0,
-            llama_pos p1),
-        "Use llama_memory_seq_rm() instead");
-
-    // Copy all tokens that belong to the specified sequence to another sequence
-    // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
-    // p0 < 0 : [0,  p1]
-    // p1 < 0 : [p0, inf)
-    DEPRECATED(LLAMA_API void llama_kv_self_seq_cp(
-            struct llama_context * ctx,
-            llama_seq_id seq_id_src,
-            llama_seq_id seq_id_dst,
-            llama_pos p0,
-            llama_pos p1),
-        "Use llama_memory_seq_cp() instead");
-
-    // Removes all tokens that do not belong to the specified sequence
-    DEPRECATED(LLAMA_API void llama_kv_self_seq_keep(
-            struct llama_context * ctx,
-            llama_seq_id seq_id),
-        "Use llama_memory_seq_keep() instead");
-
-    // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
-    // If the KV cache is RoPEd, the KV data is updated accordingly:
-    //   - lazily on next llama_decode()
-    // p0 < 0 : [0,  p1]
-    // p1 < 0 : [p0, inf)
-    DEPRECATED(LLAMA_API void llama_kv_self_seq_add(
-            struct llama_context * ctx,
-            llama_seq_id seq_id,
-            llama_pos p0,
-            llama_pos p1,
-            llama_pos delta),
-        "Use llama_memory_seq_add() instead");
-
-    // Integer division of the positions by factor of `d > 1`
-    // If the KV cache is RoPEd, the KV data is updated accordingly:
-    //   - lazily on next llama_decode()
-    // p0 < 0 : [0,  p1]
-    // p1 < 0 : [p0, inf)
-    DEPRECATED(LLAMA_API void llama_kv_self_seq_div(
-            struct llama_context * ctx,
-            llama_seq_id seq_id,
-            llama_pos p0,
-            llama_pos p1,
-            int d),
-        "Use llama_memory_seq_div() instead");
-
-    // Returns the smallest position present in the KV cache for the specified sequence
-    // This is typically non-zero only for SWA caches
-    // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
-    // Return -1 if the sequence is empty
-    DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_min(
-            struct llama_context * ctx,
-            llama_seq_id seq_id),
-        "Use llama_memory_seq_pos_min() instead");
-
-    // Returns the largest position present in the KV cache for the specified sequence
-    // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
-    // Return -1 if the sequence is empty
-    DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_max(
-            struct llama_context * ctx,
-            llama_seq_id seq_id),
-        "Use llama_memory_seq_pos_max() instead");
-
-    // Defragment the KV cache
-    // This will be applied:
-    //   - lazily on next llama_decode()
-    DEPRECATED(LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx),
-        "simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'");
-
-    // Check if the context supports KV cache shifting
-    DEPRECATED(LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx),
-        "use llama_memory_can_shift() instead");
-
-    // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
-    DEPRECATED(LLAMA_API void llama_kv_self_update(struct llama_context * ctx),
-        "simply remove this call, updates are applied lazily on the next llama_decode()");
-
     //
     // State / sessions
     //
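
For callers still on the removed wrappers, the deprecation messages above already name the replacements. A hedged migration sketch: only the llama_memory_* names are confirmed by this diff, while llama_get_memory() as the accessor for the context's llama_memory_t handle is assumed from the current header.

#include "llama.h"

// assumes: ctx is a valid llama_context and seq is a sequence id present in its memory
static void trim_sequence(struct llama_context * ctx, llama_seq_id seq, llama_pos n_keep) {
    llama_memory_t mem = llama_get_memory(ctx);          // assumed accessor, not shown in this diff

    // old: llama_kv_self_seq_rm(ctx, seq, n_keep, -1);
    llama_memory_seq_rm(mem, seq, n_keep, -1);           // p1 < 0 means [n_keep, inf), as documented above

    // old: llama_pos pos_max = llama_kv_self_seq_pos_max(ctx, seq);
    llama_pos pos_max = llama_memory_seq_pos_max(mem, seq);  // -1 if the sequence is now empty
    (void) pos_max;

    // old: llama_kv_self_defrag(ctx); llama_kv_self_update(ctx);
    // new: no calls needed - defragmentation and updates are applied lazily on the next llama_decode()
}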
