
Commit 586a6e6

Merge branch 'ikawrakow:main' into main
2 parents: c7424c9 + 65763a2

297 files changed: +31412, -528 lines


.gitignore

Lines changed: 13 additions & 0 deletions
@@ -130,3 +130,16 @@ poetry.toml
 
 # Scripts
 !/scripts/install-oneapi.bat
+/examples/server/webui_llamacpp/.gitignore
+
+# Test models for lora adapters
+/lora-tests
+
+# Local scripts
+/run-vim.sh
+/run-chat.sh
+.ccache/
+
+# IDE
+*.code-workspace
+.windsurf/

common/chat-parser.cpp

Lines changed: 125 additions & 13 deletions
@@ -3,9 +3,12 @@
 #include "log.h"
 #include "regex-partial.h"
 
+#include <algorithm>
+#include <cctype>
 #include <optional>
 #include <stdexcept>
 #include <string>
+#include <string_view>
 #include <vector>
 
 using json = nlohmann::ordered_json;
@@ -137,6 +140,27 @@ void common_chat_msg_parser::consume_literal(const std::string & literal) {
 }
 
 bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think, const std::string & end_think) {
+    std::string pending_reasoning_prefix;
+
+    if (syntax_.reasoning_format == COMMON_REASONING_FORMAT_NONE) {
+        return false;
+    }
+
+    auto set_reasoning_prefix = [&](size_t prefix_pos) {
+        if (!syntax_.thinking_forced_open || syntax_.reasoning_in_content) {
+            return;
+        }
+        if (prefix_pos + start_think.size() > input_.size()) {
+            pending_reasoning_prefix.clear();
+            return;
+        }
+        // Capture the exact literal that opened the reasoning section so we can
+        // surface it back to callers. This ensures formats that force the
+        // reasoning tag open (e.g. DeepSeek R1) retain their original prefix
+        // instead of dropping it during parsing.
+        pending_reasoning_prefix = input_.substr(prefix_pos, start_think.size());
+    };
+
     auto handle_reasoning = [&](const std::string & reasoning, bool closed) {
         auto stripped_reasoning = string_strip(reasoning);
         if (stripped_reasoning.empty()) {
@@ -149,28 +173,116 @@ bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think
                 add_content(syntax_.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "</think>" : end_think);
             }
         } else {
+            if (!pending_reasoning_prefix.empty()) {
+                add_reasoning_content(pending_reasoning_prefix);
+                pending_reasoning_prefix.clear();
+            }
             add_reasoning_content(stripped_reasoning);
         }
     };
-    if (syntax_.reasoning_format != COMMON_REASONING_FORMAT_NONE) {
-        if (syntax_.thinking_forced_open || try_consume_literal(start_think)) {
-            if (auto res = try_find_literal(end_think)) {
-                handle_reasoning(res->prelude, /* closed */ true);
-                consume_spaces();
-                return true;
-            }
-            auto rest = consume_rest();
+
+    const size_t saved_pos = pos_;
+    const size_t saved_content_size = result_.content.size();
+    const size_t saved_reasoning_size = result_.reasoning_content.size();
+
+    auto restore_state = [&]() {
+        move_to(saved_pos);
+        result_.content.resize(saved_content_size);
+        result_.reasoning_content.resize(saved_reasoning_size);
+    };
+
+    // Allow leading whitespace to be preserved as content when reasoning is present at the start
+    size_t cursor = pos_;
+    size_t whitespace_end = cursor;
+    while (whitespace_end < input_.size() && std::isspace(static_cast<unsigned char>(input_[whitespace_end]))) {
+        ++whitespace_end;
+    }
+
+    if (whitespace_end >= input_.size()) {
+        restore_state();
+        if (syntax_.thinking_forced_open) {
+            auto rest = input_.substr(saved_pos);
             if (!rest.empty()) {
                 handle_reasoning(rest, /* closed */ !is_partial());
             }
-            // Allow unclosed thinking tags, for now (https://github.com/ggml-org/llama.cpp/issues/13812, https://github.com/ggml-org/llama.cpp/issues/13877)
-            // if (!syntax_.thinking_forced_open) {
-            //     throw common_chat_msg_partial_exception(end_think);
-            // }
+            move_to(input_.size());
             return true;
         }
+        return false;
+    }
+
+    cursor = whitespace_end;
+    const size_t remaining = input_.size() - cursor;
+    const size_t start_prefix = std::min(start_think.size(), remaining);
+    const bool has_start_tag = input_.compare(cursor, start_prefix, start_think, 0, start_prefix) == 0;
+
+    if (has_start_tag && start_prefix < start_think.size()) {
+        move_to(input_.size());
+        return true;
+    }
+
+    if (has_start_tag) {
+        if (whitespace_end > pos_) {
+            add_content(input_.substr(pos_, whitespace_end - pos_));
+        }
+        set_reasoning_prefix(cursor);
+        cursor += start_think.size();
+    } else if (syntax_.thinking_forced_open) {
+        cursor = whitespace_end;
+    } else {
+        restore_state();
+        return false;
+    }
+    while (true) {
+        if (cursor >= input_.size()) {
+            move_to(input_.size());
+            return true;
+        }
+
+        size_t end_pos = input_.find(end_think, cursor);
+        if (end_pos == std::string::npos) {
+            std::string_view remaining_view(input_.data() + cursor, input_.size() - cursor);
+            size_t partial_off = string_find_partial_stop(remaining_view, end_think);
+            size_t reasoning_end = partial_off == std::string::npos ? input_.size() : cursor + partial_off;
+            if (reasoning_end > cursor) {
+                handle_reasoning(input_.substr(cursor, reasoning_end - cursor), /* closed */ partial_off == std::string::npos && !is_partial());
+            }
+            move_to(input_.size());
+            return true;
+        }
+
+        if (end_pos > cursor) {
+            handle_reasoning(input_.substr(cursor, end_pos - cursor), /* closed */ true);
+        } else {
+            handle_reasoning("", /* closed */ true);
+        }
+
+        cursor = end_pos + end_think.size();
+
+        while (cursor < input_.size() && std::isspace(static_cast<unsigned char>(input_[cursor]))) {
+            ++cursor;
+        }
+
+        const size_t next_remaining = input_.size() - cursor;
+        if (next_remaining == 0) {
+            move_to(cursor);
+            return true;
+        }
+
+        const size_t next_prefix = std::min(start_think.size(), next_remaining);
+        if (input_.compare(cursor, next_prefix, start_think, 0, next_prefix) == 0) {
+            if (next_prefix < start_think.size()) {
+                move_to(input_.size());
+                return true;
+            }
+            set_reasoning_prefix(cursor);
+            cursor += start_think.size();
+            continue;
+        }
+
+        move_to(cursor);
+        return true;
     }
-    return false;
 }
 
 std::string common_chat_msg_parser::consume_rest() {
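
Illustration (not part of the commit): the streaming behaviour of the new loop relies on spotting a partial "</think>" at the end of the buffer via string_find_partial_stop, so reasoning text is never flushed past a half-received closing tag. A minimal standalone sketch of that idea, using a hypothetical find_partial_stop helper rather than the project's API:

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <string>
#include <string_view>

// Return the offset where a suffix of `text` matches a prefix of `stop`,
// or std::string::npos when no suffix of `text` could start the stop string.
static std::size_t find_partial_stop(std::string_view text, std::string_view stop) {
    for (std::size_t len = std::min(text.size(), stop.size()); len > 0; --len) {
        if (text.substr(text.size() - len) == stop.substr(0, len)) {
            return text.size() - len;
        }
    }
    return std::string::npos;
}

int main() {
    std::string chunk = "first outline the plan</thi";  // truncated stream
    std::size_t off = find_partial_stop(chunk, "</think>");
    if (off != std::string::npos) {
        // Only the text before the partial tag is safe to report as reasoning so far.
        printf("reasoning so far: '%s'\n", chunk.substr(0, off).c_str());
    }
    return 0;
}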

common/chat.cpp

Lines changed: 3 additions & 0 deletions
@@ -1207,6 +1207,8 @@ static common_chat_params common_chat_params_init_llama_3_x(const common_chat_te
     return data;
 }
 static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool with_builtin_tools = false) {
+    builder.try_parse_reasoning("<think>", "</think>");
+
     if (!builder.syntax().parse_tool_calls) {
         builder.add_content(builder.consume_rest());
         return;
@@ -2411,6 +2413,7 @@ common_chat_params common_chat_templates_apply(
 }
 
 static void common_chat_parse_content_only(common_chat_msg_parser & builder) {
+    builder.try_parse_reasoning("<think>", "</think>");
     builder.add_content(builder.consume_rest());
 }
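
Illustration (not part of the commit): the two one-line calls added above mean Llama-3.x and plain content-only replies now get their <think> block extracted the same way the DeepSeek-style parsers do. A simplified stand-in for the intended effect on a complete, non-streamed message (the real common_chat_msg_parser also handles streaming, forced-open tags and whitespace):

#include <cstdio>
#include <string>

// Simplified stand-in: pull the leading <think>...</think> block into `reasoning`
// and leave the remainder in `content`.
static void split_reasoning(const std::string & in, std::string & reasoning, std::string & content) {
    const std::string open = "<think>", close = "</think>";
    const auto b = in.find(open);
    const auto e = in.find(close);
    if (b == std::string::npos || e == std::string::npos || e < b) {
        reasoning.clear();
        content = in;
        return;
    }
    reasoning = in.substr(b + open.size(), e - (b + open.size()));
    content   = in.substr(e + close.size());
}

int main() {
    std::string reasoning, content;
    split_reasoning("<think>outline the answer</think>Hello!", reasoning, content);
    // expected: reasoning_content = "outline the answer", content = "Hello!"
    printf("reasoning_content: '%s'\ncontent: '%s'\n", reasoning.c_str(), content.c_str());
    return 0;
}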

common/common.cpp

Lines changed: 58 additions & 2 deletions
@@ -200,6 +200,23 @@ int32_t cpu_get_num_math() {
     return cpu_get_num_physical_cores();
 }
 
+//
+// Arg utils
+//
+common_webui common_webui_from_name(const std::string& format) {
+    if (format == "none") {
+        return COMMON_WEBUI_NONE;
+    }
+    else if (format == "auto") {
+        return COMMON_WEBUI_AUTO;
+    }
+    else if (format == "llamacpp") {
+        return COMMON_WEBUI_LLAMACPP;
+    }
+    else {
+        return COMMON_WEBUI_AUTO;
+    }
+}
 
 static std::string read_file(const std::string& fname) {
     std::ifstream file(fname);
@@ -210,6 +227,14 @@ static std::string read_file(const std::string& fname) {
     file.close();
     return content;
 }
+
+static std::string parse_device_list(const std::string& value) {
+    if (value == " " || value.find("-") != std::string::npos) {
+        throw std::invalid_argument("no devices specified");
+    }
+    return value;
+}
+
 //
 // CLI argument parsing
 //
@@ -1052,7 +1077,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         }
         return true;
     }
-    if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--gpu-layers-draft") {
+    if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--n-gpu-layers-draft") {
        CHECK_ARG
        params.n_gpu_layers_draft = std::stoi(argv[i]);
        if (!llama_supports_gpu_offload()) {
@@ -1199,6 +1224,18 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        else { invalid_param = true; }
        return true;
    }
+    if (arg == "-dev" || arg == "--device") {
+        CHECK_ARG
+        std::string value(argv[i]);
+        params.devices = parse_device_list(value);
+        return true;
+    }
+    if (arg == "-devd" || arg == "--device-draft") {
+        CHECK_ARG
+        std::string value(argv[i]);
+        params.devices_draft = parse_device_list(value);
+        return true;
+    }
    if (arg == "-v" || arg == "--verbose") {
        params.verbosity = 1;
        return true;
@@ -1417,6 +1454,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        params.public_path = argv[i];
        return true;
    }
+    if (arg == "--webui") {
+        CHECK_ARG
+        params.webui = common_webui_from_name(std::string(argv[i]));
+        return true;
+    }
    if (arg == "--api-key") {
        CHECK_ARG
        params.api_keys.push_back(argv[i]);
@@ -1888,6 +1930,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
        "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
        "- none: leaves thoughts unparsed in `message.content`\n"
        "- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
+        "- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`\n"
        "(default: none)", });
    options.push_back({ "main", " --chat-template-kwargs JSON", "sets additional params for the json template parser"});
    options.push_back({ "main", " --reasoning-budget N", "controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)" });
@@ -1982,6 +2025,12 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
        " - row: split rows across GPUs" });
    options.push_back({ "*", "-ts, --tensor-split SPLIT",
        "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1" });
+    options.push_back({ "*", "-dev, --device dev1,dev2",
+        "comma-separated list of devices to use for offloading (none = don't offload)\n"
+        "Example: CUDA0,CUDA1,RPC[192.168.0.1:8080]\n" });
+    options.push_back({ "*", "-devd, --device-draft dev1,dev2",
+        "comma-separated list of devices to use for offloading for the draft model (none = don't offload)\n"
+        "Example: CUDA0,CUDA1,RPC[192.168.0.1:8080]\n" });
    options.push_back({ "*", "-mg, --main-gpu i", "the GPU to use for the model (with split-mode = none),\n"
        "or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu });
    }
@@ -2046,6 +2095,12 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
    options.push_back({ "server", " --port PORT", "port to listen (default: %d)", params.port });
    options.push_back({ "server", " --path PATH", "path to serve static files from (default: %s)", params.public_path.c_str() });
    options.push_back({ "server", " --embedding(s)", "restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled" });
+    options.push_back({ "server", " --webui NAME",
+        "controls which webui to server:\n"
+        "- none: disable webui\n"
+        "- auto: default webui \n"
+        "- llamacpp: llamacpp webui \n"
+        "(default: auto)", });
    options.push_back({ "server", " --api-key KEY", "API key to use for authentication (default: none)" });
    options.push_back({ "server", " --api-key-file FNAME", "path to file containing API keys (default: none)" });
    options.push_back({ "server", " --ssl-key-file FNAME", "path to file a PEM-encoded SSL private key" });
@@ -2549,7 +2604,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
    } else {
        model = llama_load_model_from_file(params.model.c_str(), mparams);
    }
-    
+
    if (model == NULL) {
        fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
        return iparams;
@@ -2666,6 +2721,7 @@ void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lor
 
 struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
    auto mparams = llama_model_default_params();
+    mparams.devices = params.devices.c_str();
 
    if (params.n_gpu_layers != -1) {
        mparams.n_gpu_layers = params.n_gpu_layers;
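
Illustration (not part of the commit): the new --webui argument is mapped onto the common_webui enum with unknown names falling back to auto, while -dev/-devd store a comma-separated device string that later reaches the model parameters as mparams.devices = params.devices.c_str(). A self-contained copy of the mapping so it can be compiled and tried in isolation (the mini enum is repeated only to keep the snippet standalone):

#include <cstdio>
#include <string>

enum common_webui { COMMON_WEBUI_NONE, COMMON_WEBUI_AUTO, COMMON_WEBUI_LLAMACPP };

// Same mapping as the new helper in common.cpp: unrecognized names fall back to "auto".
static common_webui common_webui_from_name(const std::string & format) {
    if (format == "none")     { return COMMON_WEBUI_NONE; }
    if (format == "auto")     { return COMMON_WEBUI_AUTO; }
    if (format == "llamacpp") { return COMMON_WEBUI_LLAMACPP; }
    return COMMON_WEBUI_AUTO;
}

int main() {
    // --webui llamacpp  ->  COMMON_WEBUI_LLAMACPP
    printf("--webui llamacpp -> %d\n", (int) common_webui_from_name("llamacpp"));
    // --webui typo      ->  COMMON_WEBUI_AUTO (fallback)
    printf("--webui typo     -> %d\n", (int) common_webui_from_name("typo"));
    return 0;
}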

common/common.h

Lines changed: 16 additions & 3 deletions
@@ -109,6 +109,14 @@ enum common_reasoning_format {
     COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
 };
 
+enum common_webui {
+    COMMON_WEBUI_NONE,
+    COMMON_WEBUI_AUTO,
+    COMMON_WEBUI_LLAMACPP,
+};
+
+common_webui common_webui_from_name(const std::string& format);
+
 struct model_paths {
     std::string path = ""; // model local path // NOLINT
     std::string url = ""; // model url to download // NOLINT
@@ -118,6 +126,9 @@ struct model_paths {
 };
 
 struct gpt_params {
+    std::string devices;
+    std::string devices_draft;
+
     uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
 
     int32_t n_threads = cpu_get_num_math();
@@ -185,6 +196,7 @@ struct gpt_params {
     std::string logits_file = ""; // file for saving *all* logits
     std::string rpc_servers = ""; // comma separated list of RPC servers
 
+
     std::vector<std::string> in_files; // all input files
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;
@@ -288,7 +300,7 @@ struct gpt_params {
     bool use_jinja = false; // NOLINT
     std::string system_prompt = "";
     bool enable_chat_template = true;
-    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_AUTO;
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
     int reasoning_budget = -1;
     bool prefill_assistant = true;
 
@@ -300,8 +312,8 @@ struct gpt_params {
     std::map<std::string, std::string> default_template_kwargs;
 
     // "advanced" endpoints are disabled by default for better security
-    bool webui = true;
-    bool endpoint_slots = false;
+    common_webui webui = COMMON_WEBUI_AUTO;
+    bool endpoint_slots = true;
     bool endpoint_props = false; // only control POST requests, not GET
     bool endpoint_metrics = false;
 
@@ -432,6 +444,7 @@ bool fs_create_directory_with_parents(const std::string & path);
 std::string fs_get_cache_directory();
 std::string fs_get_cache_file(const std::string & filename);
 
+
 //
 // Model utils
 //
