Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions common/sampling.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ struct common_sampler {
cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
}

cur_p = { cur.data(), cur.size(), -1, false };
cur_p = { cur.data(), cur.size(), false, -1, false };
}
};

Expand Down Expand Up @@ -360,7 +360,7 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
// check if the sampled token fits the grammar
{
llama_token_data single_token_data = { id, 1.0f, 0.0f };
llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };
llama_token_data_array single_token_data_array = { &single_token_data, 1, false, -1, false };

llama_sampler_apply(grmr, &single_token_data_array);

Expand Down
3 changes: 3 additions & 0 deletions examples/diffusion/diffusion-cli.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -404,6 +404,7 @@ static void diffusion_generate(llama_context * ctx,
llama_token_data_array cur_p = {
candidates.data(),
(size_t) n_vocab,
false, // normalized
-1,
false,
};
Expand All @@ -429,6 +430,7 @@ static void diffusion_generate(llama_context * ctx,
llama_token_data_array cur_p = {
candidates.data(),
candidates.size(),
false, // normalized
-1,
false,
};
Expand Down Expand Up @@ -472,6 +474,7 @@ static void diffusion_generate(llama_context * ctx,
llama_token_data_array conf_array = {
conf_candidates.data(),
conf_candidates.size(),
false,
-1,
false,
};
Expand Down
2 changes: 1 addition & 1 deletion examples/speculative/speculative.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,7 @@ int main(int argc, char ** argv) {

LOG_DBG("verifying sequence #%d at pos #%d from %d active sequence(s)\n", s, i_dft, (int) active_seqs.size());
float r = u_dist(rng);
llama_token_data_array dist_dft = { drafts[s].dists[i_dft].data() , drafts[s].dists[i_dft].size(), LLAMA_TOKEN_NULL, true };
llama_token_data_array dist_dft = { drafts[s].dists[i_dft].data() , drafts[s].dists[i_dft].size(), false, LLAMA_TOKEN_NULL, true };

//GGML_ASSERT(dist_tgt.size <= dist_dft.size);

Expand Down
1 change: 1 addition & 0 deletions include/llama.h
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,7 @@ extern "C" {
// NOTE: this pointer can be modified by the samplers
llama_token_data * data;
size_t size;
bool normalized; // true if the probabilities (llama_token_data.p) have been computed
int64_t selected; // this is the index in the data array (i.e. not the token id)
bool sorted; // note: do not assume the data is sorted - always check this flag
} llama_token_data_array;
Expand Down
1 change: 1 addition & 0 deletions src/llama-grammar.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1156,6 +1156,7 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
for (const auto & reject : rejects) {
cur_p->data[reject.index].logit = -INFINITY;
}
cur_p->normalized = false;
}

void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token) {
Expand Down
64 changes: 57 additions & 7 deletions src/llama-sampling.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,7 @@ static void llama_log_softmax(float * array, size_t size) {
*/

static void llama_sampler_temp_impl(llama_token_data_array * cur_p, float temp) {
cur_p->normalized = false;
if (temp <= 0.0f) {
// find the token with the highest logit and set the rest to -inf
size_t max_i = 0;
Expand Down Expand Up @@ -309,6 +310,7 @@ static void llama_sampler_softmax_impl(llama_token_data_array * cur_p, bool do_s
for (size_t i = 0; i < cur_p->size; ++i) {
cur_p->data[i].p /= cum_sum;
}
cur_p->normalized = true;
}

static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) {
Expand All @@ -328,6 +330,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
}

cur_p->size = k;
cur_p->normalized = false;
}

static uint32_t get_rng_seed(uint32_t seed) {
Expand Down Expand Up @@ -422,6 +425,7 @@ llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_conte
llama_token_data_array cur_p = {
/* .data = */ cur.data(),
/* .size = */ cur.size(),
/* .normalized = */ false,
/* .selected = */ -1,
/* .sorted = */ false,
};
Expand Down Expand Up @@ -614,6 +618,23 @@ static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_da

if (cur_p->size == 1) {
cur_p->data[0].p = 1.0f;
cur_p->normalized = true;
return;
}

if (cur_p->normalized) {
std::uniform_real_distribution<double> dist(0.0f, 1.0f);
const double rnd = dist(ctx->rng);
double sum_run = 0.0f;

for (size_t i = 0; i < cur_p->size; ++i) {
sum_run += cur_p->data[i].p;
if (sum_run >= rnd) {
cur_p->selected = i;
return;
}
}
cur_p->selected = cur_p->size - 1;
return;
}

Expand Down Expand Up @@ -663,13 +684,15 @@ static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_da
if (!found) {
cur_p->selected = cur_p->size - 1;
}
cur_p->normalized = true;
#else
// for clarity, this is the same as above but does one pass for normalization and one extra pass for sampling
for (size_t i = 0; i < cur_p->size; ++i) {
cur_p->data[i].p /= sum_cum;
}

cur_p->selected = llama_sample_dist(cur_p, ctx->rng);
cur_p->normalized = true;
#endif
}

Expand Down Expand Up @@ -780,7 +803,9 @@ static void llama_sampler_top_p_apply(struct llama_sampler * smpl, llama_token_d
return;
}

llama_sampler_softmax_impl(cur_p, false);
if (!cur_p->normalized) {
llama_sampler_softmax_impl(cur_p, false);
}

size_t k = cur_p->size;
auto * pdata = cur_p->data;
Expand Down Expand Up @@ -826,6 +851,7 @@ static void llama_sampler_top_p_apply(struct llama_sampler * smpl, llama_token_d
}

cur_p->size = last_idx;
cur_p->normalized = false;
}

static struct llama_sampler * llama_sampler_top_p_clone(const struct llama_sampler * smpl) {
Expand Down Expand Up @@ -897,6 +923,7 @@ static void llama_sampler_min_p_apply(struct llama_sampler * smpl, llama_token_d
if (!filtered_tokens.empty() && filtered_tokens.size() >= ctx->min_keep) {
std::copy(filtered_tokens.begin(), filtered_tokens.end(), cur_p->data);
cur_p->size = filtered_tokens.size();
cur_p->normalized = false;
min_p_applied = true;
}
}
Expand All @@ -919,6 +946,7 @@ static void llama_sampler_min_p_apply(struct llama_sampler * smpl, llama_token_d

// Resize the output vector to keep only the matching tokens
cur_p->size = i;
cur_p->normalized = false;
}
}

Expand Down Expand Up @@ -971,7 +999,9 @@ static void llama_sampler_typical_apply(struct llama_sampler * smpl, llama_token
}

// Compute the softmax of logits and calculate entropy
llama_sampler_softmax_impl(cur_p, true);
if (!cur_p->normalized) {
llama_sampler_softmax_impl(cur_p, true);
}

float entropy = 0.0f;
for (size_t i = 0; i < cur_p->size; ++i) {
Expand Down Expand Up @@ -1019,6 +1049,7 @@ static void llama_sampler_typical_apply(struct llama_sampler * smpl, llama_token
std::copy(cur_p_new.begin(), cur_p_new.end(), cur_p->data);
cur_p->size = cur_p_new.size();
cur_p->sorted = false;
cur_p->normalized = false;
}

static struct llama_sampler * llama_sampler_typical_clone(const struct llama_sampler * smpl) {
Expand Down Expand Up @@ -1120,7 +1151,9 @@ static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_toke
// Calculate maximum possible entropy
float max_entropy = -logf(1.0f / cur_p->size);

llama_sampler_softmax_impl(cur_p, true);
if (!cur_p->normalized) {
llama_sampler_softmax_impl(cur_p, true);
}

// Calculate entropy of the softmax probabilities
float entropy = 0.0f;
Expand Down Expand Up @@ -1162,6 +1195,7 @@ static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_toke
for (size_t i = 0; i < cur_p->size; ++i) {
cur_p->data[i].p /= cum_sum_double; // Re-normalize the probabilities
}
cur_p->normalized = true;

#ifdef DEBUG
// Print the updated top 25 probabilities after temperature scaling
Expand Down Expand Up @@ -1236,7 +1270,9 @@ static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data
return;
}

llama_sampler_softmax_impl(cur_p, true);
if (!cur_p->normalized) {
llama_sampler_softmax_impl(cur_p, true);
}

int pos_last = 0;

Expand All @@ -1251,6 +1287,7 @@ static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data
if (cur_p->size - pos_last >= ctx->min_keep && pos_last > 0) {
cur_p->data += pos_last;
cur_p->size -= pos_last;
cur_p->normalized = false;
}
}

Expand Down Expand Up @@ -1327,7 +1364,9 @@ static const char * llama_sampler_mirostat_name(const struct llama_sampler * /*s
static void llama_sampler_mirostat_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
auto * ctx = (llama_sampler_mirostat *) smpl->ctx;

llama_sampler_softmax_impl(cur_p, true);
if (!cur_p->normalized) {
llama_sampler_softmax_impl(cur_p, true);
}

// Estimate s_hat using the most probable m tokens
float s_hat = 0.0;
Expand Down Expand Up @@ -1433,7 +1472,9 @@ static const char * llama_sampler_mirostat_v2_name(const struct llama_sampler *
static void llama_sampler_mirostat_v2_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
auto * ctx = (llama_sampler_mirostat_v2 *) smpl->ctx;

llama_sampler_softmax_impl(cur_p, true);
if (!cur_p->normalized) {
llama_sampler_softmax_impl(cur_p, true);
}

// Truncate the words with surprise values greater than mu
cur_p->size = std::distance(cur_p->data, std::find_if(cur_p->data, cur_p->data + cur_p->size, [&](const llama_token_data & candidate) {
Expand Down Expand Up @@ -1775,6 +1816,7 @@ static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_tok
}

cur_p->sorted = false;
cur_p->normalized = false;
}

static void llama_sampler_penalties_reset(struct llama_sampler * smpl) {
Expand Down Expand Up @@ -2193,6 +2235,7 @@ static void llama_sampler_dry_apply(struct llama_sampler * smpl, llama_token_dat
}

cur_p->sorted = false;
cur_p->normalized = false;
}

static void llama_sampler_dry_reset(struct llama_sampler * smpl) {
Expand Down Expand Up @@ -2344,6 +2387,7 @@ static void llama_sampler_logit_bias_apply(struct llama_sampler * smpl, llama_to
}

if (ctx->to_search.empty()) {
cur_p->normalized = false;
return;
}

Expand All @@ -2356,6 +2400,7 @@ static void llama_sampler_logit_bias_apply(struct llama_sampler * smpl, llama_to
}
}
}
cur_p->normalized = false;
}

static struct llama_sampler * llama_sampler_logit_bias_clone(const struct llama_sampler * smpl) {
Expand Down Expand Up @@ -2408,7 +2453,9 @@ static const char * llama_sampler_infill_name(const struct llama_sampler * /*smp
static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
auto * ctx = (llama_sampler_infill *) smpl->ctx;

llama_sampler_softmax_impl(cur_p, true);
if (!cur_p->normalized) {
llama_sampler_softmax_impl(cur_p, true);
}

#if defined(GGML_DEBUG_SAMPLER_INFILL)
#define LOG_DBG_CUR LLAMA_LOG_DEBUG
Expand Down Expand Up @@ -2457,6 +2504,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
for (size_t i = 0; i < cur_p->size; ++i) {
cur_p->data[i].p /= p_sum;
}
cur_p->normalized = true;

return;
}
Expand Down Expand Up @@ -2542,6 +2590,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
cur_p->size = 1;
cur_p->data[0].id = ctx->vocab->token_eot();
cur_p->data[0].logit = 1.0f;
cur_p->normalized = true;

return;
}
Expand Down Expand Up @@ -2579,6 +2628,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_

LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
}
cur_p->normalized = true;

#undef LOG_DBG_CUR
}
Expand Down
5 changes: 3 additions & 2 deletions tests/test-grammar-llguidance.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ static bool match_string(const std::string & input, llama_sampler * grammar) {
for (llama_token token_id = 0; token_id < (llama_token) n_vocab; token_id++) {
cur.emplace_back(llama_token_data{ token_id, 0.0f, 0.0f });
}
auto tok_arr = llama_token_data_array{ cur.data(), cur.size(), -1, false };
auto tok_arr = llama_token_data_array{ cur.data(), cur.size(), false, -1, false };

for (const auto token : tokens) {
for (llama_token token_id = 0; token_id < (llama_token) n_vocab; token_id++) {
Expand Down Expand Up @@ -1096,6 +1096,7 @@ static void one_hot(llama_token_data_array & tok_arr, llama_token selected) {
}

tok_arr.data[selected].logit = 100.0f;
tok_arr.normalized = false;
}

static void test_sampler_chain(void) {
Expand All @@ -1119,7 +1120,7 @@ start: /[A-Z ]*/)";
for (llama_token token_id = 0; token_id < (llama_token) n_vocab; token_id++) {
cur.emplace_back(llama_token_data{ token_id, 0.0f, 0.0f });
}
auto tok_arr = llama_token_data_array{ cur.data(), cur.size(), -1, false };
auto tok_arr = llama_token_data_array{ cur.data(), cur.size(), false, -1, false };

for (const auto token : tokens) {
one_hot(tok_arr, token);
Expand Down
8 changes: 4 additions & 4 deletions tests/test-sampling.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ struct sampler_tester {
cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
}

cur_p = llama_token_data_array { cur.data(), cur.size(), -1, false };
cur_p = llama_token_data_array { cur.data(), cur.size(), false, -1, false };
}

sampler_tester(const std::vector<float> & probs, const std::vector<float> & probs_expected) : probs_expected(probs_expected) {
Expand All @@ -38,7 +38,7 @@ struct sampler_tester {
cur.emplace_back(llama_token_data{token_id, logit, probs[token_id]});
}

cur_p = llama_token_data_array { cur.data(), cur.size(), -1, false };
cur_p = llama_token_data_array { cur.data(), cur.size(), false, -1, false };
}

void apply(llama_sampler * sampler) {
Expand Down Expand Up @@ -270,13 +270,13 @@ static void test_sampler_queue(const size_t n_vocab, const std::string & sampler
static void bench(llama_sampler * cnstr, const char * cnstr_name, const std::vector<llama_token_data> & data, int n_iter) {
std::vector<llama_token_data> cur(data.size());
std::copy(data.begin(), data.end(), cur.begin());
llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
llama_token_data_array cur_p = { cur.data(), cur.size(), false, -1, false };
llama_sampler_apply(cnstr, &cur_p);
llama_sampler_reset(cnstr);
const int64_t t_start = ggml_time_us();
for (int i = 0; i < n_iter; i++) {
std::copy(data.begin(), data.end(), cur.begin());
llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
llama_token_data_array cur_p = { cur.data(), cur.size(), false, -1, false };
llama_sampler_apply(cnstr, &cur_p);
llama_sampler_reset(cnstr);
}
Expand Down
Loading