
Commit 83350ec

Merge branch 'upstream' into concedo_experimental
# Conflicts:
#	.github/ISSUE_TEMPLATE/020-enhancement.yml
#	.github/ISSUE_TEMPLATE/030-research.yml
#	.github/ISSUE_TEMPLATE/040-refactor.yml
#	.github/workflows/build.yml
#	Makefile
#	common/CMakeLists.txt
#	examples/CMakeLists.txt
#	examples/infill/infill.cpp
#	examples/lookahead/lookahead.cpp
#	examples/lookup/lookup-stats.cpp
#	examples/lookup/lookup.cpp
#	examples/parallel/parallel.cpp
#	examples/retrieval/retrieval.cpp
#	examples/save-load-state/save-load-state.cpp
#	examples/speculative/speculative.cpp
#	flake.lock
#	ggml/src/ggml-cann/CMakeLists.txt
#	ggml/src/ggml-cann/aclnn_ops.cpp
#	ggml/src/ggml-cann/kernels/CMakeLists.txt
#	ggml/src/ggml-cann/kernels/dup.cpp
#	ggml/src/ggml-cann/kernels/get_row_f16.cpp
#	ggml/src/ggml-cann/kernels/get_row_f32.cpp
#	ggml/src/ggml-cann/kernels/get_row_q4_0.cpp
#	tests/test-arg-parser.cpp
#	tests/test-backend-ops.cpp
2 parents a7f161d + d9d54e4 commit 83350ec

21 files changed: 795 additions, 371 deletions


common/arg.cpp

Lines changed: 231 additions & 207 deletions
Large diffs are not rendered by default.

common/common.cpp

Lines changed: 68 additions & 8 deletions
@@ -538,12 +538,12 @@ std::string string_from(const struct llama_context * ctx, const struct llama_bat
                 [](const unsigned char c) { return !std::isprint(c); }),
             detokenized.end());
 
-        buf << "\n" << std::to_string(i)
-            << ":token '" << detokenized << "'"
-            << ":pos " << std::to_string(batch.pos[i])
-            << ":n_seq_id " << std::to_string(batch.n_seq_id[i])
-            << ":seq_id " << std::to_string(batch.seq_id[i][0])
-            << ":logits " << std::to_string(batch.logits[i]);
+        buf << "\n" << std::to_string(i)
+            << ", token '" << detokenized << "'"
+            << ", pos " << std::to_string(batch.pos[i])
+            << ", n_seq_id " << std::to_string(batch.n_seq_id[i])
+            << ", seq_id " << std::to_string(batch.seq_id[i][0])
+            << ", logits " << std::to_string(batch.logits[i]);
     }
 
     buf << " ]";
@@ -927,9 +927,9 @@ struct common_init_result common_init_from_params(common_params & params) {
         common_lora_adapters_apply(lctx, iparams.lora_adapters);
     }
 
-    if (params.sparams.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
+    if (params.sampling.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
         LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
-        params.sparams.ignore_eos = false;
+        params.sampling.ignore_eos = false;
     }
 
     if (params.warmup) {
@@ -1492,6 +1492,66 @@ void common_batch_add(
     batch.n_tokens++;
 }
 
+//
+// Token utils
+//
+
+size_t common_lcp(const llama_tokens & a, const llama_tokens & b) {
+    size_t i;
+    for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
+
+    return i;
+}
+
+size_t common_lcs(const llama_tokens & a, const llama_tokens & b) {
+    // check for empty sequences
+    if (a.empty() || b.empty()) {
+        return 0;
+    }
+
+    // get the lengths of the input sequences
+    size_t a_len = a.size();
+    size_t b_len = b.size();
+
+    // initialize the maximum length of the longest common subsequence (LCS)
+    size_t max_length = 0;
+
+    // use two rows instead of a 2D matrix to optimize space
+    std::vector<size_t> prev_row(b_len + 1, 0);
+    std::vector<size_t> curr_row(b_len + 1, 0);
+
+    // iterate through the elements of a
+    for (size_t i = 1; i <= a_len; i++) {
+        // iterate through the elements of b
+        for (size_t j = 1; j <= b_len; j++) {
+            // if elements at the current positions match
+            if (a[i - 1] == b[j - 1]) {
+                // if it's the first element of either sequence, set LCS length to 1
+                if (i == 1 || j == 1) {
+                    curr_row[j] = 1;
+                } else {
+                    // increment LCS length by 1 compared to the previous element
+                    curr_row[j] = prev_row[j - 1] + 1;
+                }
+
+                // update max_length if necessary
+                if (curr_row[j] > max_length) {
+                    max_length = curr_row[j];
+                }
+            } else {
+                // reset LCS length if elements don't match
+                curr_row[j] = 0;
+            }
+        }
+
+        // update the previous row for the next iteration
+        prev_row = curr_row;
+    }
+
+    // return the maximum length of the LCS
+    return max_length;
+}
+
 //
 // Vocab utils
 //
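The hunk above adds two token helpers: common_lcp returns the length of the longest common prefix of two token sequences, while common_lcs, as implemented here (the running count resets to zero on a mismatch), measures the longest contiguous run of tokens shared by both. Below is a minimal usage sketch; it is not part of the commit and assumes the caller includes the common.h from this commit and links against the common library:

    // illustrative only (not part of the commit): expected behavior of the new token utils
    #include "common.h"
    #include <cassert>

    static void token_utils_example() {
        const llama_tokens a = { 1, 2, 3, 9, 9 };
        const llama_tokens b = { 1, 2, 3, 4, 5 };
        const llama_tokens c = { 7, 2, 3, 4, 8 };

        // a and b agree on their first three tokens
        assert(common_lcp(a, b) == 3);

        // a and c share no prefix, but the run { 2, 3 } appears contiguously in both
        assert(common_lcp(a, c) == 0);
        assert(common_lcs(a, c) == 2);
    }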

common/common.h

Lines changed: 32 additions & 9 deletions
@@ -33,6 +33,8 @@ struct common_lora_adapter_container : common_lora_adapter_info {
     struct llama_lora_adapter * adapter;
 };
 
+using llama_tokens = std::vector<llama_token>;
+
 // build info
 
 struct common_control_vector_load_info;
@@ -97,8 +99,8 @@ enum dimre_method {
     DIMRE_METHOD_MEAN,
 };
 
-// sampler parameters
-struct common_sampler_params {
+// sampling parameters
+struct common_params_sampling {
     uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
 
     int32_t n_prev = 64; // number of previous tokens to remember
@@ -149,19 +151,30 @@ struct common_sampler_params {
     std::string print() const;
 };
 
+struct common_params_speculative {
+    int32_t n_ctx = 0; // draft context size
+    int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
+    int32_t n_min = 5; // minimum number of draft tokens to use for speculative decoding
+    int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
+    float p_split = 0.1f; // speculative decoding split probability
+    float p_min = 0.9f; // minimum speculative decoding probability (greedy)
+
+    struct cpu_params cpuparams;
+    struct cpu_params cpuparams_batch;
+
+    std::string model = ""; // draft model for speculative decoding // NOLINT
+};
+
 struct common_params {
     int32_t n_predict = -1; // new tokens to predict
     int32_t n_ctx = 4096; // context size
     int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep = 0; // number of tokens to keep from initial prompt
-    int32_t n_draft = 5; // number of tokens to draft during speculative decoding
     int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
     int32_t n_parallel = 1; // number of parallel sequences to decode
     int32_t n_sequences = 1; // number of sequences to decode
-    float p_split = 0.1f; // speculative decoding split probability
     int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
-    int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
     int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
     float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
     int32_t grp_attn_n = 1; // group-attention factor
@@ -178,8 +191,6 @@ struct common_params {
 
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
-    struct cpu_params draft_cpuparams;
-    struct cpu_params draft_cpuparams_batch;
 
     ggml_backend_sched_eval_callback cb_eval = nullptr;
     void * cb_eval_user_data = nullptr;
@@ -191,10 +202,10 @@ struct common_params {
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
     enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
 
-    struct common_sampler_params sparams;
+    struct common_params_sampling sampling;
+    struct common_params_speculative speculative;
 
     std::string model = ""; // model path // NOLINT
-    std::string model_draft = ""; // draft model for speculative decoding // NOLINT
     std::string model_alias = "unknown"; // model alias // NOLINT
     std::string model_url = ""; // model url to download // NOLINT
     std::string hf_token = ""; // HF token // NOLINT
@@ -457,7 +468,9 @@ struct llama_model * common_load_model_from_hf(const char * repo, const char * f
 // clear LoRA adapters from context, then apply new list of adapters
 void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);
 
+//
 // Batch utils
+//
 
 void common_batch_clear(struct llama_batch & batch);
 
@@ -468,6 +481,16 @@ void common_batch_add(
     const std::vector<llama_seq_id> & seq_ids,
                                bool   logits);
 
+//
+// Token utils
+//
+
+// longest common prefix
+size_t common_lcp(const llama_tokens & a, const llama_tokens & b);
+
+// longest common subsequence
+size_t common_lcs(const llama_tokens & a, const llama_tokens & b);
+
 //
 // Vocab utils
 //
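In common.h the draft-model settings move out of common_params into the new common_params_speculative struct, and common_sampler_params / sparams becomes common_params_sampling / sampling. The sketch below is a hypothetical caller-side migration, not taken from this commit: the mapping of the removed n_draft onto speculative.n_max follows the field comments but is an assumption, and the model path is invented.

    // illustrative only (not part of the commit): adapting caller code to the reshuffled params
    #include "common.h"

    static void configure_speculation(common_params & params) {
        // before: params.n_draft, params.model_draft, params.n_gpu_layers_draft, params.sparams.ignore_eos
        // after:  draft-model knobs live under params.speculative, sampler knobs under params.sampling
        params.speculative.n_max        = 16;            // was n_draft (assumed mapping)
        params.speculative.model        = "draft.gguf";  // was model_draft; path is made up
        params.speculative.n_gpu_layers = 99;            // was n_gpu_layers_draft
        params.sampling.ignore_eos      = true;          // was sparams.ignore_eos
    }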

common/sampling.cpp

Lines changed: 42 additions & 3 deletions
@@ -99,7 +99,7 @@ struct ring_buffer {
 };
 
 struct common_sampler {
-    common_sampler_params params;
+    common_params_sampling params;
 
     struct llama_sampler * grmr;
     struct llama_sampler * chain;
@@ -125,7 +125,7 @@ struct common_sampler {
     }
 };
 
-std::string common_sampler_params::print() const {
+std::string common_params_sampling::print() const {
     char result[1024];
 
     snprintf(result, sizeof(result),
@@ -141,7 +141,7 @@ std::string common_sampler_params::print() const {
     return std::string(result);
 }
 
-struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params) {
+struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params) {
     llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
 
     lparams.no_perf = params.no_perf;
@@ -320,6 +320,45 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
     return cur_p.data[cur_p.selected].id;
 }
 
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first) {
+    GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1");
+
+    std::vector<llama_token> result;
+    result.reserve(idxs.size());
+
+    size_t i = 0;
+    for (; i < draft.size(); i++) {
+        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
+
+        common_sampler_accept(gsmpl, id, true);
+
+        result.push_back(id);
+
+        if (draft[i] != id) {
+            break;
+        }
+    }
+
+    if (i == draft.size()) {
+        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
+
+        common_sampler_accept(gsmpl, id, true);
+
+        result.push_back(id);
+    }
+
+    return result;
+}
+
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first) {
+    std::vector<int> idxs(draft.size() + 1);
+    for (size_t i = 0; i < idxs.size(); ++i) {
+        idxs[i] = i;
+    }
+
+    return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft, grammar_first);
+}
+
 uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
     return llama_sampler_get_seed(gsmpl->chain);
 }

common/sampling.h

Lines changed: 22 additions & 1 deletion
@@ -36,7 +36,7 @@ struct common_sampler;
 
 // llama_sampler API overloads
 
-struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params);
+struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params);
 
 void common_sampler_free(struct common_sampler * gsmpl);
 
@@ -60,6 +60,27 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
 //
 llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
 
+// generalized version of common_sampler_sample
+//
+// will cross-reference the sampled tokens with a batch of draft tokens and accept those that match
+// if the sampler disagrees at some point, we stop and return the accepted tokens up to now
+//
+//    common_sampler_sample_n(gsmpl, ctx, { idx }, {});
+//
+// is equivalent to
+//
+//    common_sampler_sample(gsmpl, ctx, idx);
+//    common_sampler_accept(gsmpl, token, true);
+//
+// requires: idxs.size() == draft.size() + 1
+//
+// returns at least 1 token, up to idxs.size()
+//
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first = false);
+
+// assume idxs == [ 0, 1, 2, ..., draft.size() ]
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first = false);
+
 uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);
 
 // helpers
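The new common_sampler_sample_and_accept_n overloads let a caller verify an entire batch of draft tokens with a single call, as described in the header comment above. A rough caller-side sketch, not taken from this commit: it assumes ctx has just decoded a batch in which the draft tokens occupy output indices 0..draft.size(), that gsmpl was created with common_sampler_init, and that the verify_draft helper name is invented for illustration.

    // illustrative only (not part of the commit): batch-verifying draft tokens
    #include "sampling.h"

    static size_t verify_draft(common_sampler * gsmpl, llama_context * ctx, const llama_tokens & draft) {
        // the two-argument overload fills idxs = { 0, 1, ..., draft.size() }, samples at each index,
        // accepts every sampled token, and stops at the first token that disagrees with `draft`
        const std::vector<llama_token> ids = common_sampler_sample_and_accept_n(gsmpl, ctx, draft);

        // at least one token is always returned; all but possibly the last one matched the draft,
        // and with an empty draft this reduces to common_sampler_sample + common_sampler_accept at idx 0
        return ids.size();
    }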
