Commit b74690a

DRY: Fixing strict compile errors, cleaning up
1 parent 75920e6 commit b74690a

4 files changed, +42 -41 lines changed

common/common.h

Lines changed: 4 additions & 4 deletions
@@ -117,10 +117,10 @@ struct gpt_sampler_params {
     float penalty_repeat = 1.00f; // 1.0 = disabled
     float penalty_freq = 0.00f; // 0.0 = disabled
     float penalty_present = 0.00f; // 0.0 = disabled
-    float dry_multiplier = 0.0f; // 0.0f = disabled, recommended value: 0.8f
-    float dry_base = 1.75f;
-    int32_t dry_allowed_length = 2;
-    int32_t dry_penalty_last_n = -1; // DRY last n tokens to penalize (0 = disable penalty, -1 = context size)
+    float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition:
+    float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length)
+    int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty
+    int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
     int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
     float mirostat_tau = 5.00f; // target entropy
     float mirostat_eta = 0.10f; // learning rate
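
As a rough sketch of how these parameters combine (this is not the sampler's implementation, only an illustration of the formula in the comments above; the helper name and signature are hypothetical):

    #include <cmath>
    #include <cstdint>

    // Hypothetical helper: DRY penalty for a candidate token that would extend a repetition.
    // repeat_len is the length of the repeated sequence preceding the token.
    static float dry_penalty(float multiplier, float base, int32_t allowed_length, int32_t repeat_len) {
        if (multiplier == 0.0f || repeat_len < allowed_length) {
            return 0.0f; // sampler disabled, or the repetition is still within the allowed length
        }
        // multiplier * base ^ (length of sequence before token - allowed length)
        return multiplier * std::pow(base, (float) (repeat_len - allowed_length));
    }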

examples/server/README.md

Lines changed: 4 additions & 4 deletions
@@ -358,13 +358,13 @@ node index.js

 `frequency_penalty`: Repeat alpha frequency penalty. Default: `0.0`, which is disabled.

-`dry_multiplier`: Set the DRY (Don't Repeat Yourself) sampling multiplier. Default: `0.0`, which is disabled.
+`dry_multiplier`: Set the DRY (Don't Repeat Yourself) repetition penalty multiplier. Default: `0.0`, which is disabled.

-`dry_base`: Set the DRY sampling base value. Default: `1.75`
+`dry_base`: Set the DRY repetition penalty base value. Default: `1.75`

-`dry_allowed_length`: Set the allowed length for DRY sampling. Default: `2`
+`dry_allowed_length`: Tokens that extend repetition beyond this receive exponentially increasing penalty: multiplier * base ^ (length of repeating sequence before token - allowed length). Default: `2`

-`dry_penalty_last_n`: Set DRY penalty for the last n tokens. Default: `-1`, where `0` is disabled and `-1` is context size.
+`dry_penalty_last_n`: How many tokens to scan for repetitions. Default: `-1`, where `0` is disabled and `-1` is context size.

 `dry_sequence_breakers`: Specify an array of sequence breakers for DRY sampling. Can be provided as a JSON array of strings or as a JSON-encoded string representing an array of strings. Default: `["\n", ":", "\"", "*"]`

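
For example, a completion request that enables DRY might look like the following (illustrative values only; 0.8 is just a suggested starting multiplier, and the remaining fields follow the parameters documented above):

    {
      "prompt": "Once upon a time",
      "dry_multiplier": 0.8,
      "dry_base": 1.75,
      "dry_allowed_length": 2,
      "dry_penalty_last_n": -1,
      "dry_sequence_breakers": ["\n", ":", "\"", "*"]
    }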

examples/server/server.cpp

Lines changed: 0 additions & 2 deletions
@@ -1522,8 +1522,6 @@ struct server_context {
             return has_number;
         };

-        bool is_singleton = prompt.is_string() || (prompt.is_array() && is_valid_singleton_array(prompt));
-
         // if the prompt is a singleton (i.e. a string, a list of tokens, or a mixed array of strings and tokens), we only need to create a single task
         if (prompt.is_string() || (prompt.is_array() && is_valid_singleton_array(prompt))) {
             data["index"] = 0;
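
To make the "singleton" distinction in the comment above concrete (illustrative prompt shapes; the multi-prompt behaviour is inferred from the surrounding code, not shown in this hunk):

    "prompt": "Tell me a story"                    // string: one task
    "prompt": [1, 2, 3, 4]                         // list of tokens: one task
    "prompt": ["Tell me a story", 42, 7]           // mixed strings and tokens: one task
    "prompt": ["first prompt", "second prompt"]    // array of strings: one task per prompt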

src/llama-sampling.cpp

Lines changed: 34 additions & 31 deletions
@@ -1583,7 +1583,7 @@ struct llama_sampler_dry {
     ring_buffer<llama_token> last_tokens;
 };

-std::vector<llama_token> llama_tokenize(
+static std::vector<llama_token> llama_tokenize(
         const struct llama_model * model,
         const std::string & text,
         bool add_special,
@@ -1602,9 +1602,9 @@ std::vector<llama_token> llama_tokenize(
     return result;
 }

-std::string llama_detokenize(const struct llama_model * model, const std::vector<llama_token> & tokens, bool special) {
-    if (model == nullptr) {
-        return "??";
+static std::string llama_detokenize(const struct llama_model * model, const std::vector<llama_token> & tokens, bool special) {
+    if (model == nullptr) { // model is passed as nullptr in test-sampling.cpp
+        return "";
     }
     std::string text;
     text.resize(std::max(text.capacity(), tokens.size()));
@@ -1621,13 +1621,14 @@ std::string llama_detokenize(const struct llama_model * model, const std::vector
     return text;
 }

-std::string llama_detokenize_single(const struct llama_model * model, llama_token token, bool special) {
+static std::string llama_detokenize_single(const struct llama_model * model, llama_token token, bool special) {
     std::vector<llama_token> tokens = {token};
     return llama_detokenize(model, tokens, special);
 }

+#ifdef DEBUG
 // For DRY debugging
-std::string detokenize_for_display(const struct llama_model * model, llama_token token, bool special) {
+static std::string detokenize_for_display(const struct llama_model * model, llama_token token, bool special) {
     std::string token_text = llama_detokenize_single(model, token, special);
     size_t pos = 0;
     while ((pos = token_text.find('\n', pos)) != std::string::npos) {
@@ -1639,31 +1640,32 @@ std::string detokenize_for_display(const struct llama_model * model, llama_token
 }

 // For DRY debugging
-void dry_print_ring_buffer_debug(const llama_sampler_dry * ctx, int max_tokens_per_side = 100) {
+static void dry_print_ring_buffer_debug(const llama_sampler_dry * ctx, int max_tokens_per_side = 100) {
     const size_t total_tokens = ctx->last_tokens.size();
     size_t tokens_to_print = total_tokens;
+    size_t mps = (max_tokens_per_side >= 0) ? static_cast<size_t>(max_tokens_per_side) : 0;

-    if (max_tokens_per_side != -1) {
-        tokens_to_print = std::min(total_tokens, static_cast<size_t>(max_tokens_per_side) * 2);
+    if (max_tokens_per_side < 0) {
+        tokens_to_print = total_tokens;
     }

     std::vector<std::pair<int, std::string>> token_info;
     token_info.reserve(tokens_to_print);

     // Collect token information
-    if (max_tokens_per_side == -1 || total_tokens <= tokens_to_print) {
+    if (max_tokens_per_side < 0 || total_tokens <= tokens_to_print) {
         for (size_t i = 0; i < total_tokens; ++i) {
             llama_token token = ctx->last_tokens.rat(total_tokens - 1 - i);
             std::string token_text = detokenize_for_display(ctx->model, token, true);
             token_info.emplace_back(token, std::move(token_text));
         }
     } else {
-        for (size_t i = 0; i < max_tokens_per_side; ++i) {
+        for (size_t i = 0; i < mps; ++i) {
             llama_token token = ctx->last_tokens.rat(total_tokens - 1 - i);
             std::string token_text = detokenize_for_display(ctx->model, token, true);
             token_info.emplace_back(token, std::move(token_text));
         }
-        for (size_t i = total_tokens - max_tokens_per_side; i < total_tokens; ++i) {
+        for (size_t i = total_tokens - mps; i < total_tokens; ++i) {
             llama_token token = ctx->last_tokens.rat(total_tokens - 1 - i);
             std::string token_text = detokenize_for_display(ctx->model, token, true);
             token_info.emplace_back(token, std::move(token_text));
@@ -1686,14 +1688,14 @@ void dry_print_ring_buffer_debug(const llama_sampler_dry * ctx, int max_tokens_p

     // Print tokens
     for (size_t i = 0; i < tokens_to_print; ++i) {
-        size_t true_index = (max_tokens_per_side == -1 || total_tokens <= tokens_to_print) ? i :
-                            (i < max_tokens_per_side) ? i : (total_tokens - tokens_to_print + i);
+        size_t true_index = (max_tokens_per_side < 0 || total_tokens <= tokens_to_print) ? i :
+                            (i < mps) ? i : (total_tokens - tokens_to_print + i);
         LLAMA_LOG_INFO("%-*zu | %-*d | %-*s\n",
             (int)max_index_width, true_index,
             (int)max_token_width, token_info[i].first,
             (int)max_text_width, token_info[i].second.c_str());
         // Add a separator between oldest and newest tokens if applicable
-        if (max_tokens_per_side != -1 && total_tokens > tokens_to_print && i == max_tokens_per_side - 1) {
+        if (max_tokens_per_side > 0 && total_tokens > tokens_to_print && i == mps - 1) {
             LLAMA_LOG_INFO("%s\n", std::string(max_index_width + max_token_width + max_text_width + 6, '.').c_str());
         }
     }
@@ -1707,7 +1709,7 @@ struct CandidateInfo {
 };

 // For DRY debugging
-std::vector<CandidateInfo> get_top_n_candidates(const llama_token_data_array * cur_p, size_t n) {
+static std::vector<CandidateInfo> get_top_n_candidates(const llama_token_data_array * cur_p, size_t n) {
     std::vector<CandidateInfo> candidates;
     candidates.reserve(cur_p->size);

@@ -1721,6 +1723,7 @@ std::vector<CandidateInfo> get_top_n_candidates(const llama_token_data_array * c
     candidates.resize(std::min(n, candidates.size()));
     return candidates;
 }
+#endif // DEBUG

 static void GetOverlappingTokenSequences(const struct llama_model * model, const std::string& str, std::unordered_multimap<llama_token, std::vector<llama_token>>& token_sequences, int max_tail_len = -1) {
     const int n_vocab = llama_n_vocab(model);
@@ -1766,7 +1769,7 @@ static void GetOverlappingTokenSequences(const struct llama_model * model, const



-static const char * llama_sampler_dry_name(const struct llama_sampler * smpl) {
+static const char * llama_sampler_dry_name(const struct llama_sampler * /*smpl*/) {
     return "dry";
 }

@@ -1798,7 +1801,7 @@ static void llama_sampler_dry_apply(struct llama_sampler * smpl, llama_token_dat

     // Step 1: Look for restart sequences
     int rep_limit = last_n_repeat;
-    for (size_t i = 0; i < last_n_repeat; ++i) {
+    for (int i = 0; i < last_n_repeat; ++i) {
         llama_token token = ctx->last_tokens.rat(i);
         auto its = ctx->dry_processed_breakers.equal_range(token);
         if (its.first == ctx->dry_processed_breakers.end()) {
@@ -1809,7 +1812,7 @@ static void llama_sampler_dry_apply(struct llama_sampler * smpl, llama_token_dat
             int seq_len = (int)it->second.size();
             if (seq_len > longest_match && seq_len <= (int)i) {
                 bool match = true;
-                for (size_t offset = 0; offset < seq_len; ++offset) {
+                for (int offset = 0; offset < seq_len; ++offset) {
                     if (it->second[offset] != ctx->last_tokens.rat(i - offset - 1)) {
                         match = false;
                         break;
@@ -1868,7 +1871,7 @@ static void llama_sampler_dry_apply(struct llama_sampler * smpl, llama_token_dat
     }

     // Step 3: Find maximum repeat length for each token
-    for (size_t i = 0; i < last_n_repeat - 1; ++i) {
+    for (int i = 0; i < last_n_repeat - 1; ++i) {
         int repeat_len = ctx->dry_repeat_count[i];
         if (repeat_len >= ctx->dry_allowed_length) {
             llama_token token = ctx->last_tokens.rat(last_n_repeat - 2 - i);
@@ -1891,7 +1894,7 @@ static void llama_sampler_dry_apply(struct llama_sampler * smpl, llama_token_dat
     const size_t top_n = 10;
     std::vector<CandidateInfo> top_n_before = get_top_n_candidates(cur_p, top_n);

-#endif
+#endif // DEBUG

     for (size_t i = 0; i < cur_p->size; ++i) {
         const auto& af_kvp = ctx->dry_max_token_repeat.find(cur_p->data[i].id);
@@ -1910,7 +1913,7 @@ static void llama_sampler_dry_apply(struct llama_sampler * smpl, llama_token_dat
             std::string token_text = detokenize_for_display(ctx->model, cur_p->data[i].id, true);
             LLAMA_LOG_INFO(" Applied penalty %.4f to token %d (%s) (repeat length %d)\n",
                 penalty, cur_p->data[i].id, token_text.c_str(), af_kvp->second);
-#endif
+#endif // DEBUG
         }
     }

@@ -1996,16 +1999,16 @@ struct llama_sampler * llama_sampler_init_dry(const struct llama_model * model,
     return new llama_sampler {
         /* .iface = */ &llama_sampler_dry_i,
         /* .ctx = */ new llama_sampler_dry {
-            /* .model = */ model,
-            /* .total_context_size = */ context_size,
-            /* .dry_multiplier = */ dry_multiplier,
-            /* .dry_base = */ dry_base,
-            /* .dry_allowed_length = */ dry_allowed_length,
-            /* .dry_penalty_last_n = */ dry_penalty_last_n,
+            /* .model                   = */ model,
+            /* .total_context_size      = */ context_size,
+            /* .dry_multiplier          = */ dry_multiplier,
+            /* .dry_base                = */ dry_base,
+            /* .dry_allowed_length      = */ dry_allowed_length,
+            /* .dry_penalty_last_n      = */ dry_penalty_last_n,
             /* .dry_processed_breakers = */ {},
-            /* .dry_repeat_count = */ std::vector<int>(effective_dry_penalty_last_n, 0),
-            /* .dry_max_token_repeat = */ {},
-            /* .last_tokens = */ ring_buffer<llama_token>(effective_dry_penalty_last_n),
+            /* .dry_repeat_count        = */ std::vector<int>(effective_dry_penalty_last_n, 0),
+            /* .dry_max_token_repeat    = */ {},
+            /* .last_tokens             = */ ring_buffer<llama_token>(effective_dry_penalty_last_n),
         },
     };
 }
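
The loop-counter changes above (size_t to int) are what the commit message calls strict compile errors: comparing an unsigned counter against a signed bound such as last_n_repeat triggers -Wsign-compare, which fails the build under -Werror. A minimal sketch of the pattern being fixed (function and variable names are illustrative):

    #include <cstddef>

    void scan_repetitions(int last_n_repeat) {
        // Before: with -Werror=sign-compare this mixed signed/unsigned comparison fails to build,
        // and a negative bound would convert to a very large size_t, running the loop far too long.
        // for (size_t i = 0; i < last_n_repeat; ++i) { ... }

        // After: the counter matches the signedness of the bound.
        for (int i = 0; i < last_n_repeat; ++i) {
            (void) i; // loop body elided
        }
    }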
