
Commit b28a935

Merge pull request #283 from menloresearch/update-dev-from-master-2025-10-09-00-33
Sync master with upstream release b6715
2 parents 17c28b3 + 12bbc3f commit b28a935

24 files changed (+647, -522 lines)

common/arg.cpp

Lines changed: 2 additions & 1 deletion
@@ -3432,7 +3432,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--reasoning-format"}, "FORMAT",
         "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
         "- none: leaves thoughts unparsed in `message.content`\n"
-        "- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
+        "- deepseek: puts thoughts in `message.reasoning_content`\n"
+        "- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`\n"
         "(default: auto)",
         [](common_params & params, const std::string & value) {
             params.reasoning_format = common_reasoning_format_from_name(value);
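
For illustration, a minimal hand-rolled sketch of what the three documented modes mean for a response that opens with a single thought block. This is not the parser llama.cpp uses (that lives in common/chat-parser.cpp below); the enum, struct, and sample strings are made up for the example.

#include <cassert>
#include <string>

enum class reasoning_format { none, deepseek, deepseek_legacy };

struct parsed_msg {
    std::string content;           // stands in for message.content
    std::string reasoning_content; // stands in for message.reasoning_content
};

// Hand-rolled illustration of the documented modes for a response that starts
// with one <think>...</think> block; the real extraction handles many more
// cases (partial tags, forced-open reasoning, streaming).
static parsed_msg split_reasoning(const std::string & raw, reasoning_format fmt) {
    parsed_msg out;
    const std::string start = "<think>", end = "</think>";
    const size_t open  = raw.find(start);
    const size_t close = raw.find(end);
    if (fmt == reasoning_format::none || open != 0 || close == std::string::npos) {
        out.content = raw; // none: leave thoughts unparsed in message.content
        return out;
    }
    out.reasoning_content = raw.substr(start.size(), close - start.size());
    if (fmt == reasoning_format::deepseek_legacy) {
        out.content = raw; // deepseek-legacy: keep the <think> block in message.content too
    } else {
        out.content = raw.substr(close + end.size()); // deepseek: strip the block
    }
    return out;
}

int main() {
    const std::string raw = "<think>check the edge cases</think>The answer is 42.";
    assert(split_reasoning(raw, reasoning_format::none).reasoning_content.empty());
    assert(split_reasoning(raw, reasoning_format::deepseek).content == "The answer is 42.");
    assert(split_reasoning(raw, reasoning_format::deepseek_legacy).content == raw);
    return 0;
}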

common/chat-parser.cpp

Lines changed: 125 additions & 13 deletions
@@ -3,9 +3,12 @@
 #include "log.h"
 #include "regex-partial.h"
 
+#include <algorithm>
+#include <cctype>
 #include <optional>
 #include <stdexcept>
 #include <string>
+#include <string_view>
 #include <vector>
 
 using json = nlohmann::ordered_json;
@@ -166,6 +169,27 @@ void common_chat_msg_parser::consume_literal(const std::string & literal) {
 }
 
 bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think, const std::string & end_think) {
+    std::string pending_reasoning_prefix;
+
+    if (syntax_.reasoning_format == COMMON_REASONING_FORMAT_NONE) {
+        return false;
+    }
+
+    auto set_reasoning_prefix = [&](size_t prefix_pos) {
+        if (!syntax_.thinking_forced_open || syntax_.reasoning_in_content) {
+            return;
+        }
+        if (prefix_pos + start_think.size() > input_.size()) {
+            pending_reasoning_prefix.clear();
+            return;
+        }
+        // Capture the exact literal that opened the reasoning section so we can
+        // surface it back to callers. This ensures formats that force the
+        // reasoning tag open (e.g. DeepSeek R1) retain their original prefix
+        // instead of dropping it during parsing.
+        pending_reasoning_prefix = input_.substr(prefix_pos, start_think.size());
+    };
+
     auto handle_reasoning = [&](const std::string & reasoning, bool closed) {
         auto stripped_reasoning = string_strip(reasoning);
         if (stripped_reasoning.empty()) {
@@ -178,28 +202,116 @@ bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think
                 add_content(syntax_.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "</think>" : end_think);
             }
         } else {
+            if (!pending_reasoning_prefix.empty()) {
+                add_reasoning_content(pending_reasoning_prefix);
+                pending_reasoning_prefix.clear();
+            }
             add_reasoning_content(stripped_reasoning);
         }
     };
-    if (syntax_.reasoning_format != COMMON_REASONING_FORMAT_NONE) {
-        if (syntax_.thinking_forced_open || try_consume_literal(start_think)) {
-            if (auto res = try_find_literal(end_think)) {
-                handle_reasoning(res->prelude, /* closed */ true);
-                consume_spaces();
-                return true;
-            }
-            auto rest = consume_rest();
+
+    const size_t saved_pos = pos_;
+    const size_t saved_content_size = result_.content.size();
+    const size_t saved_reasoning_size = result_.reasoning_content.size();
+
+    auto restore_state = [&]() {
+        move_to(saved_pos);
+        result_.content.resize(saved_content_size);
+        result_.reasoning_content.resize(saved_reasoning_size);
+    };
+
+    // Allow leading whitespace to be preserved as content when reasoning is present at the start
+    size_t cursor = pos_;
+    size_t whitespace_end = cursor;
+    while (whitespace_end < input_.size() && std::isspace(static_cast<unsigned char>(input_[whitespace_end]))) {
+        ++whitespace_end;
+    }
+
+    if (whitespace_end >= input_.size()) {
+        restore_state();
+        if (syntax_.thinking_forced_open) {
+            auto rest = input_.substr(saved_pos);
             if (!rest.empty()) {
                 handle_reasoning(rest, /* closed */ !is_partial());
             }
-            // Allow unclosed thinking tags, for now (https://github.com/ggml-org/llama.cpp/issues/13812, https://github.com/ggml-org/llama.cpp/issues/13877)
-            // if (!syntax_.thinking_forced_open) {
-            //     throw common_chat_msg_partial_exception(end_think);
-            // }
+            move_to(input_.size());
             return true;
         }
+        return false;
+    }
+
+    cursor = whitespace_end;
+    const size_t remaining = input_.size() - cursor;
+    const size_t start_prefix = std::min(start_think.size(), remaining);
+    const bool has_start_tag = input_.compare(cursor, start_prefix, start_think, 0, start_prefix) == 0;
+
+    if (has_start_tag && start_prefix < start_think.size()) {
+        move_to(input_.size());
+        return true;
+    }
+
+    if (has_start_tag) {
+        if (whitespace_end > pos_) {
+            add_content(input_.substr(pos_, whitespace_end - pos_));
+        }
+        set_reasoning_prefix(cursor);
+        cursor += start_think.size();
+    } else if (syntax_.thinking_forced_open) {
+        cursor = whitespace_end;
+    } else {
+        restore_state();
+        return false;
+    }
+    while (true) {
+        if (cursor >= input_.size()) {
+            move_to(input_.size());
+            return true;
+        }
+
+        size_t end_pos = input_.find(end_think, cursor);
+        if (end_pos == std::string::npos) {
+            std::string_view remaining_view(input_.data() + cursor, input_.size() - cursor);
+            size_t partial_off = string_find_partial_stop(remaining_view, end_think);
+            size_t reasoning_end = partial_off == std::string::npos ? input_.size() : cursor + partial_off;
+            if (reasoning_end > cursor) {
+                handle_reasoning(input_.substr(cursor, reasoning_end - cursor), /* closed */ partial_off == std::string::npos && !is_partial());
+            }
+            move_to(input_.size());
+            return true;
+        }
+
+        if (end_pos > cursor) {
+            handle_reasoning(input_.substr(cursor, end_pos - cursor), /* closed */ true);
+        } else {
+            handle_reasoning("", /* closed */ true);
+        }
+
+        cursor = end_pos + end_think.size();
+
+        while (cursor < input_.size() && std::isspace(static_cast<unsigned char>(input_[cursor]))) {
+            ++cursor;
+        }
+
+        const size_t next_remaining = input_.size() - cursor;
+        if (next_remaining == 0) {
+            move_to(cursor);
+            return true;
+        }
+
+        const size_t next_prefix = std::min(start_think.size(), next_remaining);
+        if (input_.compare(cursor, next_prefix, start_think, 0, next_prefix) == 0) {
+            if (next_prefix < start_think.size()) {
+                move_to(input_.size());
+                return true;
+            }
            set_reasoning_prefix(cursor);
+            cursor += start_think.size();
+            continue;
+        }
+
+        move_to(cursor);
+        return true;
     }
-    return false;
 }
 
 std::string common_chat_msg_parser::consume_rest() {
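
The rewritten try_parse_reasoning above replaces the old single try_find_literal pass with a cursor-based scan. The standalone sketch below mirrors that control flow under simplifying assumptions: it ignores thinking_forced_open and state restoration and uses a naive stand-in for string_find_partial_stop, so it illustrates the scanning idea rather than reproducing the actual member function.

#include <algorithm>
#include <cctype>
#include <iostream>
#include <string>
#include <vector>

// Naive stand-in for llama.cpp's string_find_partial_stop: offset of the longest
// suffix of `text` that is a prefix of `stop`, or npos if there is none.
static size_t find_partial_stop(const std::string & text, const std::string & stop) {
    for (size_t len = std::min(text.size(), stop.size() - 1); len > 0; --len) {
        if (text.compare(text.size() - len, len, stop, 0, len) == 0) {
            return text.size() - len;
        }
    }
    return std::string::npos;
}

// Simplified sketch of the scan: leading whitespace is kept as content, reasoning
// sections are collected, and an unterminated section is still accepted.
static void scan_reasoning(const std::string & input, const std::string & start_think,
                           const std::string & end_think,
                           std::string & content, std::vector<std::string> & reasoning) {
    size_t cursor = 0;
    while (cursor < input.size() && std::isspace(static_cast<unsigned char>(input[cursor]))) {
        ++cursor;
    }
    if (input.compare(cursor, start_think.size(), start_think) != 0) {
        content = input; // no opening tag: everything stays plain content
        return;
    }
    content = input.substr(0, cursor); // preserve leading whitespace as content
    cursor += start_think.size();

    while (cursor < input.size()) {
        const size_t end_pos = input.find(end_think, cursor);
        if (end_pos == std::string::npos) {
            // unterminated block: keep everything up to a possible partial closing tag
            const size_t partial = find_partial_stop(input.substr(cursor), end_think);
            const size_t stop_at = partial == std::string::npos ? input.size() : cursor + partial;
            reasoning.push_back(input.substr(cursor, stop_at - cursor));
            return;
        }
        reasoning.push_back(input.substr(cursor, end_pos - cursor));
        cursor = end_pos + end_think.size();
        while (cursor < input.size() && std::isspace(static_cast<unsigned char>(input[cursor]))) {
            ++cursor;
        }
        if (input.compare(cursor, start_think.size(), start_think) == 0) {
            cursor += start_think.size(); // back-to-back reasoning sections
            continue;
        }
        content += input.substr(cursor); // the remainder is regular content
        return;
    }
}

int main() {
    std::string content;
    std::vector<std::string> reasoning;
    scan_reasoning("  <think>plan</think> <think>verify</think>done",
                   "<think>", "</think>", content, reasoning);
    std::cout << "content: '" << content << "'\n";  // prints: content: '  done'
    for (const auto & r : reasoning) {
        std::cout << "reasoning: '" << r << "'\n"; // prints: plan, then verify
    }
    return 0;
}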

common/chat.cpp

Lines changed: 3 additions & 0 deletions
@@ -1408,6 +1408,8 @@ static common_chat_params common_chat_params_init_apertus(const common_chat_temp
     return data;
 }
 static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool with_builtin_tools = false) {
+    builder.try_parse_reasoning("<think>", "</think>");
+
     if (!builder.syntax().parse_tool_calls) {
         builder.add_content(builder.consume_rest());
         return;
@@ -2862,6 +2864,7 @@ common_chat_params common_chat_templates_apply(
 }
 
 static void common_chat_parse_content_only(common_chat_msg_parser & builder) {
+    builder.try_parse_reasoning("<think>", "</think>");
     builder.add_content(builder.consume_rest());
 }
 

common/common.h

Lines changed: 1 addition & 1 deletion
@@ -433,7 +433,7 @@ struct common_params {
     std::string chat_template = ""; // NOLINT
     bool use_jinja = false; // NOLINT
     bool enable_chat_template = true;
-    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_AUTO;
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
     int reasoning_budget = -1;
     bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
 
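
A small check, assuming llama.cpp's common/ headers and library are available, of what the changed default means for downstream code: a freshly constructed common_params now asks for deepseek-style extraction unless --reasoning-format overrides it.

#include "common.h" // llama.cpp common/ headers assumed to be on the include path

int main() {
    common_params params;
    // The default used to be COMMON_REASONING_FORMAT_AUTO; any tool that never
    // touches --reasoning-format now gets deepseek-style reasoning extraction.
    return params.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? 0 : 1;
}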

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 1 addition & 1 deletion
@@ -231,7 +231,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
 
         info.default_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
-        info.devices[id].integrated = prop.integrated;
+        info.devices[id].integrated = false; // Temporarily disabled due to issues with corrupted output (e.g. #15034)
         info.devices[id].nsm = prop.multiProcessorCount;
         info.devices[id].smpb = prop.sharedMemPerBlock;
         info.devices[id].warp_size = prop.warpSize;

ggml/src/ggml-metal/ggml-metal-device.cpp

Lines changed: 47 additions & 1 deletion
@@ -959,7 +959,53 @@ ggml_metal_pipeline_t ggml_metal_library_get_pipeline_flash_attn_ext_pad(
     //ggml_metal_cv_set_int32(cv, ns20, FC_FLASH_ATTN_EXT_PAD + 21);
     //ggml_metal_cv_set_int32(cv, nsg, FC_FLASH_ATTN_EXT_PAD + 22);
     //ggml_metal_cv_set_int32(cv, nwg, FC_FLASH_ATTN_EXT_PAD + 23);
-    ggml_metal_cv_set_int32(cv, ncpsg, FC_FLASH_ATTN_EXT_PAD + 24);
+    //ggml_metal_cv_set_int32(cv, nqptg, FC_FLASH_ATTN_EXT_PAD + 24);
+    ggml_metal_cv_set_int32(cv, ncpsg, FC_FLASH_ATTN_EXT_PAD + 25);
+
+    res = ggml_metal_library_compile_pipeline(lib, base, name, cv);
+
+    ggml_metal_cv_free(cv);
+
+    return res;
+}
+
+ggml_metal_pipeline_t ggml_metal_library_get_pipeline_flash_attn_ext_blk(
+        ggml_metal_library_t lib,
+        const struct ggml_tensor * op,
+        int32_t nqptg,
+        int32_t ncpsg) {
+    assert(op->op == GGML_OP_FLASH_ATTN_EXT);
+    GGML_UNUSED(op);
+
+    char base[256];
+    char name[256];
+
+    snprintf(base, 256, "kernel_%s",
+            "flash_attn_ext_blk");
+
+    snprintf(name, 256, "%s_nqptg=%d_ncpsg=%d",
+            base,
+            nqptg,
+            ncpsg);
+
+    ggml_metal_pipeline_t res = ggml_metal_library_get_pipeline(lib, name);
+    if (res) {
+        return res;
+    }
+
+    ggml_metal_cv_t cv = ggml_metal_cv_init();
+
+    //ggml_metal_cv_set_bool(cv, has_mask, FC_FLASH_ATTN_EXT_BLK + 0);
+    //ggml_metal_cv_set_bool(cv, has_sinks, FC_FLASH_ATTN_EXT_BLK + 1);
+    //ggml_metal_cv_set_bool(cv, has_bias, FC_FLASH_ATTN_EXT_BLK + 2);
+    //ggml_metal_cv_set_bool(cv, has_scap, FC_FLASH_ATTN_EXT_BLK + 3);
+
+    //ggml_metal_cv_set_int32(cv, ns10, FC_FLASH_ATTN_EXT_BLK + 20);
+    //ggml_metal_cv_set_int32(cv, ns20, FC_FLASH_ATTN_EXT_BLK + 21);
+    //ggml_metal_cv_set_int32(cv, nsg, FC_FLASH_ATTN_EXT_BLK + 22);
+    //ggml_metal_cv_set_int32(cv, nwg, FC_FLASH_ATTN_EXT_BLK + 23);
+    ggml_metal_cv_set_int32(cv, nqptg, FC_FLASH_ATTN_EXT_BLK + 24);
+    ggml_metal_cv_set_int32(cv, ncpsg, FC_FLASH_ATTN_EXT_BLK + 25);
 
     res = ggml_metal_library_compile_pipeline(lib, base, name, cv);
 
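A minimal sketch of how a call site in the Metal backend might use the new getter. The wrapper function below is hypothetical, the OP_* tile sizes come from the ggml-metal-impl.h hunk further down, and nqptg/ncpsg are read here as queries-per-threadgroup and cache-positions-per-simdgroup.

#include "ggml.h"
#include "ggml-metal-device.h"
#include "ggml-metal-impl.h"

// Hypothetical helper (not part of this commit): fetch the flash-attention block
// pipeline specialized on the default tile sizes exposed by ggml-metal-impl.h.
static ggml_metal_pipeline_t fetch_flash_attn_ext_blk(
        ggml_metal_library_t       lib,
        const struct ggml_tensor * op) {
    const int32_t nqptg = OP_FLASH_ATTN_EXT_NQPTG; // 8  (assumed: queries per threadgroup)
    const int32_t ncpsg = OP_FLASH_ATTN_EXT_NCPSG; // 64 (assumed: cache positions per simdgroup)

    return ggml_metal_library_get_pipeline_flash_attn_ext_blk(lib, op, nqptg, ncpsg);
}
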
ggml/src/ggml-metal/ggml-metal-device.h

Lines changed: 6 additions & 0 deletions
@@ -141,6 +141,12 @@ ggml_metal_pipeline_t ggml_metal_library_get_pipeline_flash_attn_ext_pad(
         bool has_mask,
         int32_t ncpsg);
 
+ggml_metal_pipeline_t ggml_metal_library_get_pipeline_flash_attn_ext_blk(
+        ggml_metal_library_t lib,
+        const struct ggml_tensor * op,
+        int32_t nqptg,
+        int32_t ncpsg);
+
 ggml_metal_pipeline_t ggml_metal_library_get_pipeline_flash_attn_ext(
         ggml_metal_library_t lib,
         const struct ggml_tensor * op,

ggml/src/ggml-metal/ggml-metal-impl.h

Lines changed: 24 additions & 5 deletions
@@ -70,11 +70,19 @@
 
 // function constants offsets
 #define FC_FLASH_ATTN_EXT_PAD 100
-#define FC_FLASH_ATTN_EXT 200
-#define FC_FLASH_ATTN_EXT_VEC 300
-#define FC_FLASH_ATTN_EXT_VEC_REDUCE 400
-#define FC_MUL_MV 500
-#define FC_MUL_MM 600
+#define FC_FLASH_ATTN_EXT_BLK 200
+#define FC_FLASH_ATTN_EXT 300
+#define FC_FLASH_ATTN_EXT_VEC 400
+#define FC_FLASH_ATTN_EXT_VEC_REDUCE 500
+#define FC_MUL_MV 600
+#define FC_MUL_MM 700
+
+// op-specific constants
+#define OP_FLASH_ATTN_EXT_NQPTG 8
+#define OP_FLASH_ATTN_EXT_NCPSG 64
+
+#define OP_FLASH_ATTN_EXT_VEC_NQPTG 1
+#define OP_FLASH_ATTN_EXT_VEC_NCPSG 32
 
 // kernel argument structs
 //
@@ -263,6 +271,17 @@ typedef struct {
     uint64_t nb33;
 } ggml_metal_kargs_flash_attn_ext_pad;
 
+typedef struct {
+    int32_t ne01;
+    int32_t ne30;
+    int32_t ne31;
+    int32_t ne32;
+    int32_t ne33;
+    uint64_t nb31;
+    uint64_t nb32;
+    uint64_t nb33;
+} ggml_metal_kargs_flash_attn_ext_blk;
+
 typedef struct {
     int32_t ne01;
     int32_t ne02;
