
Commit 5b6ba02

Merge branch 'upstream' into concedo_experimental
# Conflicts:
#   .github/workflows/build.yml
#   ci/run.sh
#   examples/model-conversion/Makefile
#   examples/model-conversion/README.md
#   examples/model-conversion/logits.cpp
#   examples/model-conversion/requirements.txt
#   examples/model-conversion/scripts/embedding/convert-model.sh
#   examples/model-conversion/scripts/embedding/run-converted-model.sh
#   examples/model-conversion/scripts/embedding/run-original-model.py
#   examples/model-conversion/scripts/utils/semantic_check.py
#   ggml/src/ggml-cann/common.h
#   ggml/src/ggml-cann/ggml-cann.cpp
#   ggml/src/ggml-cpu/kleidiai/kernels.cpp
#   ggml/src/ggml-cpu/kleidiai/kernels.h
#   ggml/src/ggml-cpu/kleidiai/kleidiai.cpp
#   ggml/src/ggml-sycl/common.hpp
#   ggml/src/ggml-sycl/dpct/helper.hpp
#   ggml/src/ggml-sycl/ggml-sycl.cpp
#   ggml/src/ggml-sycl/softmax.cpp
#   ggml/src/ggml-sycl/softmax.hpp
#   requirements/requirements-all.txt
#   tests/test-chat-parser.cpp
#   tools/server/README.md
2 parents 5396e62 + 8328fd4 commit 5b6ba02

27 files changed (+410, -429 lines)

common/arg.cpp

Lines changed: 2 additions & 1 deletion
@@ -3455,7 +3455,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--reasoning-format"}, "FORMAT",
         "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
         "- none: leaves thoughts unparsed in `message.content`\n"
-        "- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
+        "- deepseek: puts thoughts in `message.reasoning_content`\n"
+        "- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`\n"
         "(default: auto)",
         [](common_params & params, const std::string & value) {
             params.reasoning_format = common_reasoning_format_from_name(value);

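To make the reworked modes concrete, here is a small illustration (an assumed raw completion and response shape, not part of this diff) of how a single model output such as "<think>plan the answer</think>Hello!" would roughly be surfaced under each --reasoning-format value described in the help text above:

    # none: thoughts are left unparsed in message.content
    {"content": "<think>plan the answer</think>Hello!"}

    # deepseek: thoughts are moved to message.reasoning_content
    {"content": "Hello!", "reasoning_content": "plan the answer"}

    # deepseek-legacy: <think> tags stay in message.content and reasoning_content is also populated
    {"content": "<think>plan the answer</think>Hello!", "reasoning_content": "plan the answer"}
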
common/chat-parser.cpp

Lines changed: 125 additions & 13 deletions
@@ -3,9 +3,12 @@
 #include "log.h"
 #include "regex-partial.h"
 
+#include <algorithm>
+#include <cctype>
 #include <optional>
 #include <stdexcept>
 #include <string>
+#include <string_view>
 #include <vector>
 
 using json = nlohmann::ordered_json;
@@ -166,6 +169,27 @@ void common_chat_msg_parser::consume_literal(const std::string & literal) {
 }
 
 bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think, const std::string & end_think) {
+    std::string pending_reasoning_prefix;
+
+    if (syntax_.reasoning_format == COMMON_REASONING_FORMAT_NONE) {
+        return false;
+    }
+
+    auto set_reasoning_prefix = [&](size_t prefix_pos) {
+        if (!syntax_.thinking_forced_open || syntax_.reasoning_in_content) {
+            return;
+        }
+        if (prefix_pos + start_think.size() > input_.size()) {
+            pending_reasoning_prefix.clear();
+            return;
+        }
+        // Capture the exact literal that opened the reasoning section so we can
+        // surface it back to callers. This ensures formats that force the
+        // reasoning tag open (e.g. DeepSeek R1) retain their original prefix
+        // instead of dropping it during parsing.
+        pending_reasoning_prefix = input_.substr(prefix_pos, start_think.size());
+    };
+
     auto handle_reasoning = [&](const std::string & reasoning, bool closed) {
         auto stripped_reasoning = string_strip(reasoning);
         if (stripped_reasoning.empty()) {
@@ -178,28 +202,116 @@ bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think
                 add_content(syntax_.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "</think>" : end_think);
             }
         } else {
+            if (!pending_reasoning_prefix.empty()) {
+                add_reasoning_content(pending_reasoning_prefix);
+                pending_reasoning_prefix.clear();
+            }
             add_reasoning_content(stripped_reasoning);
         }
     };
-    if (syntax_.reasoning_format != COMMON_REASONING_FORMAT_NONE) {
-        if (syntax_.thinking_forced_open || try_consume_literal(start_think)) {
-            if (auto res = try_find_literal(end_think)) {
-                handle_reasoning(res->prelude, /* closed */ true);
-                consume_spaces();
-                return true;
-            }
-            auto rest = consume_rest();
+
+    const size_t saved_pos = pos_;
+    const size_t saved_content_size = result_.content.size();
+    const size_t saved_reasoning_size = result_.reasoning_content.size();
+
+    auto restore_state = [&]() {
+        move_to(saved_pos);
+        result_.content.resize(saved_content_size);
+        result_.reasoning_content.resize(saved_reasoning_size);
+    };
+
+    // Allow leading whitespace to be preserved as content when reasoning is present at the start
+    size_t cursor = pos_;
+    size_t whitespace_end = cursor;
+    while (whitespace_end < input_.size() && std::isspace(static_cast<unsigned char>(input_[whitespace_end]))) {
+        ++whitespace_end;
+    }
+
+    if (whitespace_end >= input_.size()) {
+        restore_state();
+        if (syntax_.thinking_forced_open) {
+            auto rest = input_.substr(saved_pos);
             if (!rest.empty()) {
                 handle_reasoning(rest, /* closed */ !is_partial());
             }
-            // Allow unclosed thinking tags, for now (https://github.com/ggml-org/llama.cpp/issues/13812, https://github.com/ggml-org/llama.cpp/issues/13877)
-            // if (!syntax_.thinking_forced_open) {
-            //     throw common_chat_msg_partial_exception(end_think);
-            // }
+            move_to(input_.size());
             return true;
         }
+        return false;
+    }
+
+    cursor = whitespace_end;
+    const size_t remaining = input_.size() - cursor;
+    const size_t start_prefix = std::min(start_think.size(), remaining);
+    const bool has_start_tag = input_.compare(cursor, start_prefix, start_think, 0, start_prefix) == 0;
+
+    if (has_start_tag && start_prefix < start_think.size()) {
+        move_to(input_.size());
+        return true;
+    }
+
+    if (has_start_tag) {
+        if (whitespace_end > pos_) {
+            add_content(input_.substr(pos_, whitespace_end - pos_));
+        }
+        set_reasoning_prefix(cursor);
+        cursor += start_think.size();
+    } else if (syntax_.thinking_forced_open) {
+        cursor = whitespace_end;
+    } else {
+        restore_state();
+        return false;
+    }
+    while (true) {
+        if (cursor >= input_.size()) {
+            move_to(input_.size());
+            return true;
+        }
+
+        size_t end_pos = input_.find(end_think, cursor);
+        if (end_pos == std::string::npos) {
+            std::string_view remaining_view(input_.data() + cursor, input_.size() - cursor);
+            size_t partial_off = string_find_partial_stop(remaining_view, end_think);
+            size_t reasoning_end = partial_off == std::string::npos ? input_.size() : cursor + partial_off;
+            if (reasoning_end > cursor) {
+                handle_reasoning(input_.substr(cursor, reasoning_end - cursor), /* closed */ partial_off == std::string::npos && !is_partial());
+            }
+            move_to(input_.size());
+            return true;
+        }
+
+        if (end_pos > cursor) {
+            handle_reasoning(input_.substr(cursor, end_pos - cursor), /* closed */ true);
+        } else {
+            handle_reasoning("", /* closed */ true);
+        }
+
+        cursor = end_pos + end_think.size();
+
+        while (cursor < input_.size() && std::isspace(static_cast<unsigned char>(input_[cursor]))) {
+            ++cursor;
+        }
+
+        const size_t next_remaining = input_.size() - cursor;
+        if (next_remaining == 0) {
+            move_to(cursor);
+            return true;
+        }
+
+        const size_t next_prefix = std::min(start_think.size(), next_remaining);
+        if (input_.compare(cursor, next_prefix, start_think, 0, next_prefix) == 0) {
+            if (next_prefix < start_think.size()) {
+                move_to(input_.size());
+                return true;
+            }
+            set_reasoning_prefix(cursor);
+            cursor += start_think.size();
+            continue;
+        }
+
+        move_to(cursor);
+        return true;
     }
-    return false;
 }
 
 std::string common_chat_msg_parser::consume_rest() {
common/chat.cpp
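The rewritten try_parse_reasoning walks the input with an explicit cursor instead of consuming a single literal pair. As a rough behavioural sketch, the hand-written expectations below (not a test from this commit) assume reasoning_format=deepseek, thinking_forced_open=false and reasoning_in_content=false; "left_for_caller" stands for whatever text remains unconsumed for the surrounding parser:

    expected = {
        # the thought body is stripped and extracted; the trailing text is left for the caller
        "<think> plan </think>answer":        {"reasoning_content": "plan", "left_for_caller": "answer"},
        # consecutive think blocks are all consumed; their bodies are appended to reasoning_content
        "<think>a</think><think>b</think>ok": {"reasoning_content": "ab", "left_for_caller": "ok"},
        # an unclosed tag in a partial (streaming) message is still extracted, just not marked closed
        "<think>still thinking":              {"reasoning_content": "still thinking", "left_for_caller": ""},
    }
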

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1408,6 +1408,8 @@ static common_chat_params common_chat_params_init_apertus(const common_chat_temp
14081408
return data;
14091409
}
14101410
static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool with_builtin_tools = false) {
1411+
builder.try_parse_reasoning("<think>", "</think>");
1412+
14111413
if (!builder.syntax().parse_tool_calls) {
14121414
builder.add_content(builder.consume_rest());
14131415
return;
@@ -2862,6 +2864,7 @@ common_chat_params common_chat_templates_apply(
28622864
}
28632865

28642866
static void common_chat_parse_content_only(common_chat_msg_parser & builder) {
2867+
builder.try_parse_reasoning("<think>", "</think>");
28652868
builder.add_content(builder.consume_rest());
28662869
}
28672870

common/common.h

Lines changed: 1 addition & 1 deletion
@@ -429,7 +429,7 @@ struct common_params {
     std::string chat_template = ""; // NOLINT
     bool use_jinja = false; // NOLINT
     bool enable_chat_template = true;
-    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_AUTO;
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
     int reasoning_budget = -1;
     bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
 

convert_hf_to_gguf.py

Lines changed: 69 additions & 3 deletions
@@ -96,13 +96,15 @@ class ModelBase:
     # Mistral format specifics
     is_mistral_format: bool = False
     disable_mistral_community_chat_template: bool = False
+    sentence_transformers_dense_modules: bool = False
 
     def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, *, is_big_endian: bool = False,
                  use_temp_file: bool = False, eager: bool = False,
                  metadata_override: Path | None = None, model_name: str | None = None,
                  split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False,
                  small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None,
-                 disable_mistral_community_chat_template: bool = False):
+                 disable_mistral_community_chat_template: bool = False,
+                 sentence_transformers_dense_modules: bool = False):
         if type(self) is ModelBase or \
                 type(self) is TextModel or \
                 type(self) is MmprojModel:
@@ -117,6 +119,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
         self.lazy = not eager or (remote_hf_model_id is not None)
         self.dry_run = dry_run
         self.remote_hf_model_id = remote_hf_model_id
+        self.sentence_transformers_dense_modules = sentence_transformers_dense_modules
         if remote_hf_model_id is not None:
            self.is_safetensors = True
 
@@ -5274,6 +5277,53 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 @ModelBase.register("Gemma3TextModel")
 class EmbeddingGemma(Gemma3Model):
     model_arch = gguf.MODEL_ARCH.GEMMA_EMBEDDING
+    module_paths = []
+    dense_features_dims = {}
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        if self.sentence_transformers_dense_modules:
+            # read modules.json to determine if model has Dense layers
+            modules_file = self.dir_model / "modules.json"
+            if modules_file.is_file():
+                with open(modules_file, encoding="utf-8") as modules_json_file:
+                    mods = json.load(modules_json_file)
+                    for mod in mods:
+                        if mod["type"] == "sentence_transformers.models.Dense":
+                            mod_path = mod["path"]
+                            # check if model.safetensors file for Dense layer exists
+                            model_tensors_file = self.dir_model / mod_path / "model.safetensors"
+                            if model_tensors_file.is_file():
+                                self.module_paths.append(mod_path)
+                                # read config.json of the Dense layer to get in/out features
+                                mod_conf_file = self.dir_model / mod_path / "config.json"
+                                if mod_conf_file.is_file():
+                                    with open(mod_conf_file, encoding="utf-8") as mod_conf_json_file:
+                                        mod_conf = json.load(mod_conf_json_file)
+                                        # hparams dense_2_feat_out and dense_3_feat_in are required when loading model's dense weights
+                                        prefix = self._get_dense_prefix(mod_path)
+                                        if mod_conf["in_features"] is not None and mod_conf["out_features"] is not None:
+                                            self.dense_features_dims[prefix] = (mod_conf["in_features"], mod_conf["out_features"])
+
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        from safetensors.torch import load_file
+        module_paths = list(self.module_paths)
+        for i, module_path in enumerate(module_paths):
+            tensors_file = self.dir_model / module_path / "model.safetensors"
+            local_tensors = load_file(tensors_file)
+            tensor_name = self._get_dense_prefix(module_path)
+            for name, local_tensor in local_tensors.items():
+                if not name.endswith(".weight"):
+                    continue
+                orig_name = name.replace("linear", tensor_name)
+                name = self.map_tensor_name(orig_name)
+                yield name, local_tensor.clone()
+
+    @staticmethod
+    def _get_dense_prefix(module_path) -> str:
+        """Get the tensor name prefix for the Dense layer from module path."""
+        tensor_name = "dense_2" if module_path == "2_Dense" else "dense_3"
+        return tensor_name
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
@@ -5290,6 +5340,10 @@ def set_gguf_parameters(self):
             logger.info(f"Using original sliding_window from config: {orig_sliding_window} "
                         f"instead of {self.hparams['sliding_window']}")
             self.gguf_writer.add_sliding_window(orig_sliding_window)
+        if self.sentence_transformers_dense_modules:
+            for dense, dims in self.dense_features_dims.items():
+                logger.info(f"Setting dense layer {dense} in/out features to {dims}")
+                self.gguf_writer.add_dense_features_dims(dense, dims[0], dims[1])
 
         self._try_set_pooling_type()
 
@@ -9340,6 +9394,13 @@ def parse_args() -> argparse.Namespace:
         )
     )
 
+    parser.add_argument(
+        "--sentence-transformers-dense-modules", action="store_true",
+        help=("Whether to include sentence-transformers dense modules."
+              "It can be used for sentence-transformers models, like google/embeddinggemma-300m"
+              "Default these modules are not included.")
+    )
+
     args = parser.parse_args()
     if not args.print_supported_models and args.model is None:
         parser.error("the following arguments are required: model")
@@ -9402,9 +9463,13 @@ def main() -> None:
     if args.remote:
         hf_repo_id = args.model
         from huggingface_hub import snapshot_download
+        allowed_patterns = ["LICENSE", "*.json", "*.md", "*.txt", "tokenizer.model"]
+        if args.sentence_transformers_dense_modules:
+            # include sentence-transformers dense modules safetensors files
+            allowed_patterns.append("*.safetensors")
         local_dir = snapshot_download(
             repo_id=hf_repo_id,
-            allow_patterns=["LICENSE", "*.json", "*.md", "*.txt", "tokenizer.model"])
+            allow_patterns=allowed_patterns)
         dir_model = Path(local_dir)
         logger.info(f"Downloaded config and tokenizer to {local_dir}")
     else:
@@ -9472,7 +9537,8 @@ def main() -> None:
             split_max_tensors=args.split_max_tensors,
             split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run,
             small_first_shard=args.no_tensor_first_split,
-            remote_hf_model_id=hf_repo_id, disable_mistral_community_chat_template=disable_mistral_community_chat_template
+            remote_hf_model_id=hf_repo_id, disable_mistral_community_chat_template=disable_mistral_community_chat_template,
+            sentence_transformers_dense_modules=args.sentence_transformers_dense_modules
         )
 
         if args.vocab_only:

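For context, the new flag only does something when the checkpoint ships sentence-transformers Dense projection modules next to the base model (google/embeddinggemma-300m is the example named in the help text). A hypothetical layout sketch follows; only the lookup logic mirrors the code above, while the exact modules.json entries and the CLI invocation shape are assumptions:

    # modules.json entries the converter scans for (hypothetical example)
    modules_json = [
        {"path": "2_Dense", "type": "sentence_transformers.models.Dense"},
        {"path": "3_Dense", "type": "sentence_transformers.models.Dense"},
    ]
    # Each listed path is expected to contain model.safetensors plus a config.json with
    # "in_features"/"out_features"; the dims go into GGUF metadata via add_dense_features_dims()
    # and the weights are exported as dense_2/dense_3 tensors.
    #
    # Assumed invocation:
    #   python convert_hf_to_gguf.py google/embeddinggemma-300m --remote \
    #       --sentence-transformers-dense-modules
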
ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 1 addition & 1 deletion
@@ -234,7 +234,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
 
         info.default_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
-        info.devices[id].integrated = prop.integrated;
+        info.devices[id].integrated = false; // Temporarily disabled due to issues with corrupted output (e.g. #15034)
         info.devices[id].nsm = prop.multiProcessorCount;
         info.devices[id].smpb = prop.sharedMemPerBlock;
         info.devices[id].warp_size = prop.warpSize;

gguf-py/gguf/constants.py

Lines changed: 8 additions & 0 deletions
@@ -128,6 +128,8 @@ class LLM:
         ALTUP_ACTIVE_IDX = "{arch}.altup.active_idx"
         ALTUP_NUM_INPUTS = "{arch}.altup.num_inputs"
        EMBD_LENGTH_PER_LAYER_INP = "{arch}.embedding_length_per_layer_input"
+        DENSE_FEAT_IN_SIZE = "{arch}.{dense}_feat_in"
+        DENSE_FEAT_OUT_SIZE = "{arch}.{dense}_feat_out"
 
     class Attention:
         HEAD_COUNT = "{arch}.attention.head_count"
@@ -433,6 +435,8 @@ class MODEL_TENSOR(IntEnum):
     TOKEN_TYPES = auto()
     POS_EMBD = auto()
     OUTPUT = auto()
+    DENSE_2_OUT = auto() # embeddinggemma 2_Dense
+    DENSE_3_OUT = auto() # embeddinggemma 3_Dense
     OUTPUT_NORM = auto()
     ROPE_FREQS = auto()
     ROPE_FACTORS_LONG = auto()
@@ -777,6 +781,8 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.POS_EMBD: "position_embd",
     MODEL_TENSOR.OUTPUT_NORM: "output_norm",
     MODEL_TENSOR.OUTPUT: "output",
+    MODEL_TENSOR.DENSE_2_OUT: "dense_2", # embeddinggemma 2_Dense
+    MODEL_TENSOR.DENSE_3_OUT: "dense_3", # embeddinggemma 2_Dense
     MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
     MODEL_TENSOR.ROPE_FACTORS_LONG: "rope_factors_long",
     MODEL_TENSOR.ROPE_FACTORS_SHORT: "rope_factors_short",
@@ -1759,6 +1765,8 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.GEMMA_EMBEDDING: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.DENSE_2_OUT,
+        MODEL_TENSOR.DENSE_3_OUT,
         MODEL_TENSOR.OUTPUT_NORM,
         MODEL_TENSOR.ATTN_Q,
         MODEL_TENSOR.ATTN_Q_NORM,

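A small sketch of how the new key templates expand (the architecture string below is assumed for illustration; the real value comes from the writer's arch field):

    DENSE_FEAT_IN_SIZE = "{arch}.{dense}_feat_in"
    DENSE_FEAT_OUT_SIZE = "{arch}.{dense}_feat_out"

    DENSE_FEAT_IN_SIZE.format(arch="gemma-embedding", dense="dense_2")   # "gemma-embedding.dense_2_feat_in"
    DENSE_FEAT_OUT_SIZE.format(arch="gemma-embedding", dense="dense_3")  # "gemma-embedding.dense_3_feat_out"
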
gguf-py/gguf/gguf_writer.py

Lines changed: 4 additions & 0 deletions
@@ -730,6 +730,10 @@ def add_shared_kv_layers(self, value: int) -> None:
     def add_sliding_window_pattern(self, value: Sequence[bool]) -> None:
         self.add_array(Keys.Attention.SLIDING_WINDOW_PATTERN.format(arch=self.arch), value)
 
+    def add_dense_features_dims(self, dense:str, in_f:int, out_f:int) -> None:
+        self.add_uint32(Keys.LLM.DENSE_FEAT_IN_SIZE.format(arch=self.arch, dense=dense), in_f)
+        self.add_uint32(Keys.LLM.DENSE_FEAT_OUT_SIZE.format(arch=self.arch, dense=dense), out_f)
+
     def add_logit_scale(self, value: float) -> None:
         self.add_float32(Keys.LLM.LOGIT_SCALE.format(arch=self.arch), value)
 

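A minimal usage sketch for the new helper, mirroring how the EmbeddingGemma converter calls it; the writer construction, architecture name, and the 768/3072 dimensions are assumptions for illustration, not values from this commit:

    import gguf

    writer = gguf.GGUFWriter("model.gguf", "gemma-embedding")  # path and arch are assumed
    writer.add_dense_features_dims("dense_2", 768, 3072)  # writes <arch>.dense_2_feat_in / _feat_out
    writer.add_dense_features_dims("dense_3", 3072, 768)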