2 changes: 1 addition & 1 deletion common/common.h
@@ -253,7 +253,7 @@ struct gpt_params {
     bool use_jinja = false; // NOLINT
     std::string system_prompt = "";
     bool enable_chat_template = true;
-    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_AUTO;
     int reasoning_budget = -1;
     bool prefill_assistant = true;

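Changing the default from NONE to AUTO means OpenAI-compatible responses now carry parsed reasoning separately instead of leaving raw <think> tags in the message text. A rough TypeScript sketch of the streamed delta shape the webui handles later in this diff (field names taken from that code; the interface itself is illustrative, not part of the PR):

```ts
// Rough sketch of the streamed delta shape the webui handles later in this diff.
// With the old default reasoning_format = 'none', raw <think>...</think> text stays
// inside `content`; with the new default 'auto', it arrives as `reasoning_content`.
interface ChatCompletionDelta {
  content?: string;            // regular assistant text
  reasoning_content?: string;  // parsed reasoning, populated under 'auto'
}
```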
Binary file modified examples/server/public/index.html.gz
Binary file not shown.
39 changes: 22 additions & 17 deletions examples/server/server.cpp
@@ -173,6 +173,7 @@ struct server_task_result {
     std::vector<llama_token> tokens;

     bool stream;
+    bool include_usage;
     std::string prompt;
     //slot_params generation_params;

@@ -500,22 +501,22 @@ struct server_task_result {
             {"model", oaicompat_model},
             {"object", "chat.completion.chunk"},
         });
-
-        // OpenAI API spec for chat.completion.chunks specifies an empty `choices` array for the last chunk when including usage
-        // https://platform.openai.com/docs/api-reference/chat_streaming/streaming#chat_streaming/streaming-choices
-        deltas.push_back({
-            {"choices", json::array()},
-            {"created", t},
-            {"id", oaicompat_cmpl_id},
-            {"model", oaicompat_model},
-            {"object", "chat.completion.chunk"},
-            {"usage", json {
-                {"completion_tokens", n_decoded},
-                {"prompt_tokens", n_prompt_tokens},
-                {"total_tokens", n_decoded + n_prompt_tokens},
-            }},
-        });
-
+        if (include_usage) {
+            // OpenAI API spec for chat.completion.chunks specifies an empty `choices` array for the last chunk when including usage
+            // https://platform.openai.com/docs/api-reference/chat_streaming/streaming#chat_streaming/streaming-choices
+            deltas.push_back({
+                {"choices", json::array()},
+                {"created", t},
+                {"id", oaicompat_cmpl_id},
+                {"model", oaicompat_model},
+                {"object", "chat.completion.chunk"},
+                {"usage", json {
+                    {"completion_tokens", n_decoded},
+                    {"prompt_tokens", n_prompt_tokens},
+                    {"total_tokens", n_decoded + n_prompt_tokens},
+                }},
+            });
+        }
         if (timings.prompt_n >= 0) {
             deltas.back().push_back({ "timings", timings.to_json() });
         }
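With include_usage set, the extra final chunk has an empty choices array and carries the token counts, per the OpenAI streaming spec linked in the comment. A minimal client-side TypeScript sketch of picking that chunk out of a stream, assuming the chunks have already been parsed from SSE (the helper and types here are hypothetical, not part of the PR):

```ts
// Hypothetical client-side sketch: pull the usage totals out of a stream requested
// with stream_options.include_usage. `chunks` is any async iterable of parsed SSE chunks.
interface StreamChunk {
  choices: unknown[];
  usage?: {
    completion_tokens: number;
    prompt_tokens: number;
    total_tokens: number;
  };
}

async function readUsage(chunks: AsyncIterable<StreamChunk>): Promise<StreamChunk['usage']> {
  let usage: StreamChunk['usage'];
  for await (const chunk of chunks) {
    // Per the linked OpenAI spec, the usage-bearing final chunk has an empty `choices` array.
    if (chunk.choices.length === 0 && chunk.usage) {
      usage = chunk.usage;
    }
  }
  return usage;
}
```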
@@ -547,6 +548,7 @@ struct server_task_multi {

 struct slot_params {
     bool stream = true;
+    bool include_usage = false;
     bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt

     int32_t n_keep = 0; // number of tokens to keep from initial prompt
@@ -1359,7 +1361,7 @@ struct server_context {
         // thinking is enabled if:
         // 1. It's not explicitly disabled (reasoning_budget == 0)
         // 2. The chat template supports it
-        const bool enable_thinking = params.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get());
+        const bool enable_thinking = params.use_jinja && params.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get());
         //LLAMA_LOG_INFO("Enable thinking? %d\n", enable_thinking);

         oai_parser_opt = {
@@ -1514,6 +1516,8 @@ struct server_context {
         }
         slot.params.timings_per_token = json_value(data, "timings_per_token", false);
         slot.params.stream = json_value(data, "stream", false);
+        auto stream_opt = json_value(data, "stream_options", json::object());
+        slot.params.include_usage = json_value(stream_opt, "include_usage", false);
         slot.params.cache_prompt = json_value(data, "cache_prompt", true);
         slot.params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", default_params.n_predict));
         slot.sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
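On the request side, stream_options is read from the request body and include_usage defaults to false, so clients opt in the same way they would with the OpenAI API. A hedged TypeScript sketch of such a request; the URL and model name are placeholders, not values from this PR:

```ts
// Hypothetical request against a local llama-server; URL and model name are placeholders.
const response = await fetch('http://localhost:8080/v1/chat/completions', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({
    model: 'local-model',
    stream: true,
    // Opt in to the extra usage chunk; parsed above as stream_options.include_usage.
    stream_options: { include_usage: true },
    messages: [{ role: 'user', content: 'Hello!' }],
  }),
});
```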
@@ -2206,6 +2210,7 @@ struct server_context {
         res.error = false;
         res.stop = true; // to do: set value
         res.stream = slot.params.stream;
+        res.include_usage = slot.params.include_usage;
         res.content = slot.generated_text;
         res.oaicompat = slot.params.oaicompat;
         res.oaicompat_model = slot.params.oaicompat_model;
74 changes: 37 additions & 37 deletions examples/server/webui/dist/index.html

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions examples/server/webui/src/Config.ts
@@ -16,6 +16,7 @@ export const CONFIG_DEFAULT = {
   showTokensPerSecond: false,
   showThoughtInProgress: false,
   excludeThoughtOnReq: true,
+  reasoning_format: 'auto',
   // make sure these default values are in sync with `common.h`
   samplers: 'dkypmxnt',
   temperature: 0.8,
@@ -42,6 +43,7 @@ export const CONFIG_DEFAULT = {
   pyIntepreterEnabled: false,
 };
 export const CONFIG_INFO: Record<string, string> = {
+  reasoning_format: 'Specify how to parse reasoning content. none: reasoning is kept inline in the content block. auto: reasoning is returned separately as reasoning_content.',
   apiKey: 'Set the API Key if you are using --api-key option for the server.',
   systemMessage: 'The starting message that defines how model should behave.',
   samplers:
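The new reasoning_format setting is surfaced in the settings dialog below and sent with every completion request; an empty value falls back to 'auto'. A minimal sketch of that fallback, mirroring the ternary used in app.context.tsx (the helper name is illustrative):

```ts
// Minimal sketch of the fallback applied when building a request in app.context.tsx:
// an unset/empty setting is treated as 'auto', otherwise the stored value is sent as-is.
function resolveReasoningFormat(configValue: string): string {
  return configValue === '' ? 'auto' : configValue;
}

// e.g. resolveReasoningFormat('') === 'auto', resolveReasoningFormat('none') === 'none'
```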
1 change: 1 addition & 0 deletions examples/server/webui/src/components/SettingDialog.tsx
@@ -22,6 +22,7 @@ import toast from 'react-hot-toast'
 type SettKey = keyof typeof CONFIG_DEFAULT;

 const BASIC_KEYS: SettKey[] = [
+  'reasoning_format',
   'temperature',
   'top_k',
   'top_p',
27 changes: 24 additions & 3 deletions examples/server/webui/src/utils/app.context.tsx
@@ -215,7 +215,7 @@ export const AppContextProvider = ({
       messages,
       stream: true,
       cache_prompt: true,
-      reasoning_format: 'none',
+      reasoning_format: config.reasoning_format === '' ? 'auto' : config.reasoning_format,
       samplers: config.samplers,
       temperature: config.temperature,
       dynatemp_range: config.dynatemp_range,
@@ -226,7 +226,7 @@
       typical_p: config.typical_p,
       xtc_probability: config.xtc_probability,
       xtc_threshold: config.xtc_threshold,
-      top_n_sigma: config.top_n_sigma,
+      top_n_sigma: config.top_n_sigma,
       repeat_last_n: config.repeat_last_n,
       repeat_penalty: config.repeat_penalty,
       presence_penalty: config.presence_penalty,
@@ -257,14 +257,35 @@
        throw new Error(body?.error?.message || 'Unknown error');
      }
      const chunks = getSSEStreamAsync(fetchResponse);
+     let thinkingTagOpen = false;
      for await (const chunk of chunks) {
        // const stop = chunk.stop;
        if (chunk.error) {
          throw new Error(chunk.error?.message || 'Unknown error');
        }
+
+       const reasoningContent = chunk.choices?.[0]?.delta?.reasoning_content;
+       if (reasoningContent) {
+         if (pendingMsg.content === null || pendingMsg.content === '') {
+           thinkingTagOpen = true;
+           pendingMsg = {
+             ...pendingMsg,
+             content: '<think>' + reasoningContent,
+           };
+         } else {
+           pendingMsg = {
+             ...pendingMsg,
+             content: pendingMsg.content + reasoningContent,
+           };
+         }
+       }
        const addedContent = chunk.choices?.[0]?.delta?.content;
-       const lastContent = pendingMsg.content || '';
+       let lastContent = pendingMsg.content || '';
        if (addedContent) {
+         if (thinkingTagOpen) {
+           lastContent = lastContent + '</think>';
+           thinkingTagOpen = false;
+         }
          pendingMsg = {
            ...pendingMsg,
            content: lastContent + addedContent,
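The loop above folds reasoning_content deltas into the existing content string, opening a <think> tag on the first reasoning token and closing it as soon as regular content arrives, so the webui's existing thought rendering keeps working unchanged. A standalone TypeScript sketch of the same folding logic, with simplified types and a hypothetical helper name:

```ts
// Standalone sketch of the folding done in the streaming loop above
// (hypothetical helper, simplified delta type).
interface Delta {
  content?: string;
  reasoning_content?: string;
}

function foldDeltas(deltas: Delta[]): string {
  let content = '';
  let thinkingTagOpen = false;
  for (const delta of deltas) {
    if (delta.reasoning_content) {
      if (content === '') {
        content = '<think>' + delta.reasoning_content; // open the thought block once
        thinkingTagOpen = true;
      } else {
        content += delta.reasoning_content;
      }
    }
    if (delta.content) {
      if (thinkingTagOpen) {
        content += '</think>'; // close the thought block before normal text
        thinkingTagOpen = false;
      }
      content += delta.content;
    }
  }
  return content;
}

// foldDeltas([{ reasoning_content: 'step 1' }, { content: 'answer' }])
// === '<think>step 1</think>answer'
```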