
Commit 16ecbc9

Merge branch 'ggerganov:master' into master
2 parents: 40b9ba1 + 8a79c59

3 files changed: +64 −31 lines changed

convert.py

Lines changed: 7 additions & 3 deletions
@@ -515,10 +515,14 @@ def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
 
             # Yield token text, score, and type
             yield token_text, self.get_token_score(token_id), self.get_token_type(
-                token_id, self.special_ids  # Reuse already stored special IDs
+                token_id, token_text, self.special_ids  # Reuse already stored special IDs
             )
 
-    def get_token_type(self, token_id: int, special_ids: set[int]) -> gguf.TokenType:
+    def get_token_type(self, token_id: int, token_text: bytes, special_ids: set[int]) -> gguf.TokenType:
+        # Special case for byte tokens
+        if re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
+            return gguf.TokenType.BYTE
+
         # Determine token type based on whether it's a special token
         return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL
 
@@ -530,7 +534,7 @@ def get_token_score(self, token_id: int) -> float:
     def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        for text in self.added_tokens_list:
            if text in self.specials:
-               toktype = self.get_token_type(self.specials[text], self.special_ids)
+               toktype = self.get_token_type(self.specials[text], b'', self.special_ids)
                score = self.get_token_score(self.specials[text])
            else:
                toktype = gguf.TokenType.USER_DEFINED
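The new byte-token check is small enough to verify in isolation. Below is a minimal standalone sketch of the classification rule added to `get_token_type`; the string constants stand in for the real `gguf.TokenType` members, and the token ids are made up for the example.

```python
import re

# Stand-ins for gguf.TokenType.BYTE / CONTROL / NORMAL (illustrative only)
BYTE, CONTROL, NORMAL = "BYTE", "CONTROL", "NORMAL"

def classify_token(token_id: int, token_text: bytes, special_ids: set[int]) -> str:
    # SentencePiece byte-fallback tokens are spelled "<0xNN>" in the vocab
    if re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
        return BYTE
    # otherwise the type depends only on whether the id is a special token
    return CONTROL if token_id in special_ids else NORMAL

assert classify_token(13, b"<0x0A>", {1, 2}) == BYTE     # newline byte token
assert classify_token(1,  b"<s>",    {1, 2}) == CONTROL  # special (BOS-like) token
assert classify_token(42, b"hello",  {1, 2}) == NORMAL   # ordinary piece
```

Note that `added_tokens()` passes `b''` as the token text, so added special tokens cannot be misclassified as byte tokens by this check.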

examples/server/README.md

Lines changed: 4 additions & 0 deletions
@@ -137,6 +137,10 @@ node index.js
 
     `temperature`: Adjust the randomness of the generated text (default: 0.8).
 
+    `dynatemp_range`: Dynamic temperature range (default: 0.0, 0.0 = disabled).
+
+    `dynatemp_exponent`: Dynamic temperature exponent (default: 1.0).
+
     `top_k`: Limit the next token selection to the K most probable tokens (default: 40).
 
     `top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.95).
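As a usage sketch, the two new options are passed in the JSON body of a completion request like any other sampling parameter. The host, port, prompt, and values below are placeholders for a locally running server.

```python
import requests

# Placeholder URL: assumes a server listening on localhost:8080
resp = requests.post(
    "http://localhost:8080/completion",
    json={
        "prompt": "Building a website can be done in 10 simple steps:",
        "n_predict": 64,
        "temperature": 0.8,        # base temperature
        "dynatemp_range": 0.5,     # allow the temperature to vary around the base value; 0.0 disables
        "dynatemp_exponent": 1.0,  # exponent of the dynamic temperature mapping
    },
    timeout=120,
)
print(resp.json()["content"])
```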

examples/server/server.cpp

Lines changed: 53 additions & 28 deletions
@@ -432,6 +432,7 @@ struct llama_server_context
         }
 
         default_generation_settings_for_props = get_formated_generation(slots.front());
+        default_generation_settings_for_props["num_slots"] = params.n_parallel;
         default_generation_settings_for_props["seed"] = -1;
 
         batch = llama_batch_init(n_ctx, 0, params.n_parallel);
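If this settings map is the one the server reports as its default generation settings (the variable name suggests it backs the `/props` response), a client could read the new field to discover how many parallel slots the server was started with. This is only a sketch; the endpoint path and response layout are assumptions, not taken from this diff.

```python
import requests

# Assumption: the defaults are exposed under "default_generation_settings" in GET /props
props = requests.get("http://localhost:8080/props", timeout=10).json()
print(props["default_generation_settings"]["num_slots"])  # number of parallel slots (-np)
```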
@@ -524,27 +525,29 @@ struct llama_server_context
             slot->oaicompat_model = "";
         }
 
-        slot->params.stream           = json_value(data, "stream", false);
-        slot->params.cache_prompt     = json_value(data, "cache_prompt", false);
-        slot->params.n_predict        = json_value(data, "n_predict", default_params.n_predict);
-        slot->sparams.top_k           = json_value(data, "top_k", default_sparams.top_k);
-        slot->sparams.top_p           = json_value(data, "top_p", default_sparams.top_p);
-        slot->sparams.min_p           = json_value(data, "min_p", default_sparams.min_p);
-        slot->sparams.tfs_z           = json_value(data, "tfs_z", default_sparams.tfs_z);
-        slot->sparams.typical_p       = json_value(data, "typical_p", default_sparams.typical_p);
-        slot->sparams.temp            = json_value(data, "temperature", default_sparams.temp);
-        slot->sparams.penalty_last_n  = json_value(data, "repeat_last_n", default_sparams.penalty_last_n);
-        slot->sparams.penalty_repeat  = json_value(data, "repeat_penalty", default_sparams.penalty_repeat);
-        slot->sparams.penalty_freq    = json_value(data, "frequency_penalty", default_sparams.penalty_freq);
-        slot->sparams.penalty_present = json_value(data, "presence_penalty", default_sparams.penalty_present);
-        slot->sparams.mirostat        = json_value(data, "mirostat", default_sparams.mirostat);
-        slot->sparams.mirostat_tau    = json_value(data, "mirostat_tau", default_sparams.mirostat_tau);
-        slot->sparams.mirostat_eta    = json_value(data, "mirostat_eta", default_sparams.mirostat_eta);
-        slot->sparams.penalize_nl     = json_value(data, "penalize_nl", default_sparams.penalize_nl);
-        slot->params.n_keep           = json_value(data, "n_keep", slot->params.n_keep);
-        slot->params.seed             = json_value(data, "seed", default_params.seed);
-        slot->sparams.grammar         = json_value(data, "grammar", default_sparams.grammar);
-        slot->sparams.n_probs         = json_value(data, "n_probs", default_sparams.n_probs);
+        slot->params.stream             = json_value(data, "stream", false);
+        slot->params.cache_prompt       = json_value(data, "cache_prompt", false);
+        slot->params.n_predict          = json_value(data, "n_predict", default_params.n_predict);
+        slot->sparams.top_k             = json_value(data, "top_k", default_sparams.top_k);
+        slot->sparams.top_p             = json_value(data, "top_p", default_sparams.top_p);
+        slot->sparams.min_p             = json_value(data, "min_p", default_sparams.min_p);
+        slot->sparams.tfs_z             = json_value(data, "tfs_z", default_sparams.tfs_z);
+        slot->sparams.typical_p         = json_value(data, "typical_p", default_sparams.typical_p);
+        slot->sparams.temp              = json_value(data, "temperature", default_sparams.temp);
+        slot->sparams.dynatemp_range    = json_value(data, "dynatemp_range", default_sparams.dynatemp_range);
+        slot->sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent);
+        slot->sparams.penalty_last_n    = json_value(data, "repeat_last_n", default_sparams.penalty_last_n);
+        slot->sparams.penalty_repeat    = json_value(data, "repeat_penalty", default_sparams.penalty_repeat);
+        slot->sparams.penalty_freq      = json_value(data, "frequency_penalty", default_sparams.penalty_freq);
+        slot->sparams.penalty_present   = json_value(data, "presence_penalty", default_sparams.penalty_present);
+        slot->sparams.mirostat          = json_value(data, "mirostat", default_sparams.mirostat);
+        slot->sparams.mirostat_tau      = json_value(data, "mirostat_tau", default_sparams.mirostat_tau);
+        slot->sparams.mirostat_eta      = json_value(data, "mirostat_eta", default_sparams.mirostat_eta);
+        slot->sparams.penalize_nl       = json_value(data, "penalize_nl", default_sparams.penalize_nl);
+        slot->params.n_keep             = json_value(data, "n_keep", slot->params.n_keep);
+        slot->params.seed               = json_value(data, "seed", default_params.seed);
+        slot->sparams.grammar           = json_value(data, "grammar", default_sparams.grammar);
+        slot->sparams.n_probs           = json_value(data, "n_probs", default_sparams.n_probs);
 
         // infill
         if (data.count("input_prefix") != 0)
@@ -1002,6 +1005,8 @@ struct llama_server_context
             {"model",             params.model_alias},
             {"seed",              slot.params.seed},
             {"temperature",       slot.sparams.temp},
+            {"dynatemp_range",    slot.sparams.dynatemp_range},
+            {"dynatemp_exponent", slot.sparams.dynatemp_exponent},
             {"top_k",             slot.sparams.top_k},
             {"top_p",             slot.sparams.top_p},
             {"min_p",             slot.sparams.min_p},
@@ -1163,13 +1168,30 @@ struct llama_server_context
         task.multitask_id = multitask_id;
 
         // when a completion task's prompt array is not a singleton, we split it into multiple requests
-        if (task.data.count("prompt") && task.data.at("prompt").size() > 1)
-        {
-            split_multiprompt_task(task_id, task);
-        }
-
         // otherwise, it's a single-prompt task, we actually queue it
-        queue_tasks.post(task);
+        // if there's numbers in the prompt array it will be treated as an array of tokens
+        if (task.data.count("prompt") != 0 && task.data.at("prompt").size() > 1) {
+            bool numbers = false;
+            for (const auto& e : task.data.at("prompt")) {
+                if (e.is_number()) {
+                    numbers = true;
+                    break;
+                }
+            }
+
+            // NOTE: split_multiprompt_task() does not handle a mix of strings and numbers,
+            // it will completely stall the server. I don't know where the bug for this is.
+            //
+            // if there are numbers, it needs to be treated like a single prompt,
+            // queue_tasks handles a mix of strings and numbers just fine.
+            if (numbers) {
+                queue_tasks.post(task);
+            } else {
+                split_multiprompt_task(task_id, task);
+            }
+        } else {
+            queue_tasks.post(task);
+        }
     }
 
     // for multiple images processing
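To illustrate the distinction this check draws, the two client calls below hit the same endpoint with a prompt array: an all-string array is still split into one sub-request per prompt, while an array containing numbers is now queued as a single pre-tokenized prompt. The URL and token ids are placeholders.

```python
import requests

URL = "http://localhost:8080/completion"  # placeholder for a locally running server

# All strings: split_multiprompt_task() turns this into two completion requests
multi = requests.post(URL, json={"prompt": ["First prompt", "Second prompt"], "n_predict": 16}).json()

# Contains numbers: treated as one prompt given as token ids (placeholder values)
single = requests.post(URL, json={"prompt": [1, 15043, 3186], "n_predict": 16}).json()
```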
@@ -1251,7 +1273,10 @@ struct llama_server_context
     void split_multiprompt_task(int multitask_id, task_server& multiprompt_task)
     {
         int prompt_count = multiprompt_task.data.at("prompt").size();
-        assert(prompt_count > 1);
+        if (prompt_count <= 1) {
+            send_error(multiprompt_task, "error while handling multiple prompts");
+            return;
+        }
 
         // generate all the ID for subtask
         std::vector<int> subtask_ids(prompt_count);
