
Commit 1df584e

Merge branch 'ggml-org:master' into master
2 parents a25cb89 + dd62dcf commit 1df584e

File tree

9 files changed: +86, -35 lines changed

common/arg.cpp

Lines changed: 1 addition & 1 deletion
@@ -3435,7 +3435,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.use_jinja = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
     add_opt(common_arg(
         {"--reasoning-format"}, "FORMAT",
         "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"

convert_hf_to_gguf.py

Lines changed: 34 additions & 9 deletions
@@ -29,12 +29,29 @@
 sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf
 from gguf.vocab import MistralTokenizerType, MistralVocab
-from mistral_common.tokens.tokenizers.base import TokenizerVersion
-from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN, DATASET_STD
-from mistral_common.tokens.tokenizers.tekken import Tekkenizer
-from mistral_common.tokens.tokenizers.sentencepiece import (
-    SentencePieceTokenizer,
-)
+
+try:
+    from mistral_common.tokens.tokenizers.base import TokenizerVersion  # pyright: ignore[reportMissingImports]
+    from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN as _MISTRAL_COMMON_DATASET_MEAN, DATASET_STD as _MISTRAL_COMMON_DATASET_STD  # pyright: ignore[reportMissingImports]
+    from mistral_common.tokens.tokenizers.tekken import Tekkenizer  # pyright: ignore[reportMissingImports]
+    from mistral_common.tokens.tokenizers.sentencepiece import (  # pyright: ignore[reportMissingImports]
+        SentencePieceTokenizer,
+    )
+
+    _mistral_common_installed = True
+    _mistral_import_error_msg = ""
+except ImportError:
+    _MISTRAL_COMMON_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
+    _MISTRAL_COMMON_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
+
+    _mistral_common_installed = False
+    TokenizerVersion = None
+    Tekkenizer = None
+    SentencePieceTokenizer = None
+    _mistral_import_error_msg = (
+        "Mistral format requires `mistral-common` to be installed. Please run "
+        "`pip install mistral-common[image,audio]` to install it."
+    )
 
 
 logger = logging.getLogger("hf-to-gguf")
@@ -107,6 +124,9 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
                 type(self) is MmprojModel:
             raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
 
+        if self.is_mistral_format and not _mistral_common_installed:
+            raise ImportError(_mistral_import_error_msg)
+
         self.dir_model = dir_model
         self.ftype = ftype
         self.fname_out = fname_out
@@ -1363,8 +1383,8 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads"]))
 
         # preprocessor config
-        image_mean = DATASET_MEAN if self.is_mistral_format else self.preprocessor_config["image_mean"]
-        image_std = DATASET_STD if self.is_mistral_format else self.preprocessor_config["image_std"]
+        image_mean = _MISTRAL_COMMON_DATASET_MEAN if self.is_mistral_format else self.preprocessor_config["image_mean"]
+        image_std = _MISTRAL_COMMON_DATASET_STD if self.is_mistral_format else self.preprocessor_config["image_std"]
 
         self.gguf_writer.add_vision_image_mean(image_mean)
         self.gguf_writer.add_vision_image_std(image_std)
@@ -2033,6 +2053,9 @@ def __init__(self, *args, **kwargs):
         self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)
 
     def _set_vocab_mistral(self):
+        if not _mistral_common_installed:
+            raise ImportError(_mistral_import_error_msg)
+
         vocab = MistralVocab(self.dir_model)
         logger.info(
             f"Converting tokenizer {vocab.tokenizer_type} of size {vocab.vocab_size}."
@@ -9212,7 +9235,7 @@ class MistralModel(LlamaModel):
 
     @staticmethod
     def get_community_chat_template(vocab: MistralVocab, templates_dir: Path, is_mistral_format: bool):
-        assert TokenizerVersion is not None, "mistral_common is not installed"
+        assert TokenizerVersion is not None and Tekkenizer is not None and SentencePieceTokenizer is not None, _mistral_import_error_msg
         assert isinstance(vocab.tokenizer, (Tekkenizer, SentencePieceTokenizer)), (
             f"Expected Tekkenizer or SentencePieceTokenizer, got {type(vocab.tokenizer)}"
         )
@@ -9594,6 +9617,8 @@ def main() -> None:
         fname_out = ModelBase.add_prefix_to_filename(fname_out, "mmproj-")
 
     is_mistral_format = args.mistral_format
+    if is_mistral_format and not _mistral_common_installed:
+        raise ImportError(_mistral_import_error_msg)
     disable_mistral_community_chat_template = args.disable_mistral_community_chat_template
 
     with torch.inference_mode():
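
For context, here is a minimal standalone sketch of the deferred-import pattern this diff introduces. The module path, error message, and guard logic are taken from the patch; convert() and its is_mistral_format argument are hypothetical stand-ins for the script's real entry points, not the actual convert_hf_to_gguf.py API.

# Sketch only: mirrors the optional-dependency guard added above.
try:
    from mistral_common.tokens.tokenizers.tekken import Tekkenizer  # noqa: F401
    _mistral_common_installed = True
    _mistral_import_error_msg = ""
except ImportError:
    Tekkenizer = None
    _mistral_common_installed = False
    _mistral_import_error_msg = (
        "Mistral format requires `mistral-common` to be installed. Please run "
        "`pip install mistral-common[image,audio]` to install it."
    )

def convert(is_mistral_format: bool) -> None:
    # The ImportError is raised only when the Mistral code path is actually
    # taken, so plain HF conversions keep working without mistral-common.
    if is_mistral_format and not _mistral_common_installed:
        raise ImportError(_mistral_import_error_msg)
    ...  # proceed with conversion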

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 3 additions & 3 deletions
@@ -2826,7 +2826,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
         ggml_cuda_topk_moe_ops(/*with_norm=*/false, /*delayed_softmax=*/true);
 
     if (ops.size() == topk_moe_ops_with_norm.size() &&
-        ggml_can_fuse_subgraph(cgraph, node_idx, topk_moe_ops_with_norm, { node_idx + 3, node_idx + 8 })) {
+        ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 8 })) {
         ggml_tensor * softmax = cgraph->nodes[node_idx];
         ggml_tensor * weights = cgraph->nodes[node_idx+8];
 
@@ -2836,7 +2836,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
     }
 
     if (ops.size() == topk_moe_ops.size() &&
-        ggml_can_fuse_subgraph(cgraph, node_idx, topk_moe_ops, { node_idx + 3, node_idx + 4 })) {
+        ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 4 })) {
         ggml_tensor * softmax = cgraph->nodes[node_idx];
         ggml_tensor * weights = cgraph->nodes[node_idx+4];
         if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
@@ -2845,7 +2845,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
     }
 
     if (ops.size() == topk_moe_ops_delayed_softmax.size() &&
-        ggml_can_fuse_subgraph(cgraph, node_idx, topk_moe_ops_delayed_softmax, { node_idx + 2, node_idx + 5 })) {
+        ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 2, node_idx + 5 })) {
         ggml_tensor * softmax = cgraph->nodes[node_idx + 4];
         ggml_tensor * weights = cgraph->nodes[node_idx + 5];
 

gguf-py/gguf/vocab.py

Lines changed: 4 additions & 4 deletions
@@ -14,12 +14,12 @@
     SentencePieceProcessor = None
 
 try:
-    from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
-    from mistral_common.tokens.tokenizers.tekken import Tekkenizer
-    from mistral_common.tokens.tokenizers.utils import (
+    from mistral_common.tokens.tokenizers.mistral import MistralTokenizer  # pyright: ignore[reportMissingImports]
+    from mistral_common.tokens.tokenizers.tekken import Tekkenizer  # pyright: ignore[reportMissingImports]
+    from mistral_common.tokens.tokenizers.utils import (  # pyright: ignore[reportMissingImports]
         _filter_valid_tokenizer_files,
     )
-    from mistral_common.tokens.tokenizers.sentencepiece import (
+    from mistral_common.tokens.tokenizers.sentencepiece import (  # pyright: ignore[reportMissingImports]
         SentencePieceTokenizer,
     )
 except ImportError:

requirements/requirements-convert_hf_to_gguf.txt

Lines changed: 0 additions & 2 deletions
@@ -1,5 +1,3 @@
-mistral-common>=1.8.3
-
 -r ./requirements-convert_legacy_llama.txt
 --extra-index-url https://download.pytorch.org/whl/cpu
 

tools/imatrix/CMakeLists.txt

Lines changed: 5 additions & 0 deletions
@@ -6,3 +6,8 @@ target_compile_features(${TARGET} PRIVATE cxx_std_17)
 if(LLAMA_TOOLS_INSTALL)
     install(TARGETS ${TARGET} RUNTIME)
 endif()
+
+if (CMAKE_SYSTEM_NAME MATCHES "AIX")
+    # AIX's flock() function comes from libbsd.a
+    target_link_libraries(${TARGET} PRIVATE -lbsd)
+endif()

tools/mtmd/mtmd-cli.cpp

Lines changed: 32 additions & 15 deletions
@@ -76,9 +76,11 @@ struct mtmd_cli_context {
 
     mtmd::bitmaps bitmaps;
 
-    // note: we know that gemma3 template is "linear", meaning each turn is completely separated to another
-    // so here we don't need to keep track of chat history
+    // chat template
     common_chat_templates_ptr tmpls;
+    std::vector<common_chat_msg> chat_history;
+    bool use_jinja = false;
+    // TODO: support for --system-prompt with /clear command
 
     // support for legacy templates (models not having EOT token)
     llama_tokens antiprompt_tokens;
@@ -108,6 +110,8 @@ struct mtmd_cli_context {
         }
 
         tmpls = common_chat_templates_init(model, params.chat_template);
+        use_jinja = params.use_jinja;
+        chat_history.clear();
         LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(tmpls.get(), params.use_jinja, params.default_template_kwargs).c_str());
 
         init_vision_context(params);
@@ -193,19 +197,33 @@ static int generate_response(mtmd_cli_context & ctx, int n_predict) {
             return 1;
         }
     }
+
+    std::string generated_text = common_detokenize(ctx.lctx, generated_tokens);
+    common_chat_msg msg;
+    msg.role = "assistant";
+    msg.content = generated_text;
+    ctx.chat_history.push_back(std::move(msg));
+
     return 0;
 }
 
-static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, bool add_bos = false) {
-    common_chat_templates_inputs tmpl_inputs;
-    tmpl_inputs.messages = {msg};
-    tmpl_inputs.add_generation_prompt = true;
-    tmpl_inputs.use_jinja = false; // jinja is buggy here
-    auto formatted_chat = common_chat_templates_apply(ctx.tmpls.get(), tmpl_inputs);
-    LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.prompt.c_str());
+static std::string chat_add_and_format(mtmd_cli_context & ctx, common_chat_msg & new_msg) {
+    LOG_DBG("chat_add_and_format: new_msg.role='%s', new_msg.content='%s'\n",
+            new_msg.role.c_str(), new_msg.content.c_str());
+    auto formatted = common_chat_format_single(ctx.tmpls.get(), ctx.chat_history,
+                                               new_msg, new_msg.role == "user",
+                                               ctx.use_jinja);
+    ctx.chat_history.push_back(new_msg);
+    return formatted;
+}
+
+static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg) {
+    bool add_bos = ctx.chat_history.empty();
+    auto formatted_chat = chat_add_and_format(ctx, msg);
+    LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.c_str());
 
     mtmd_input_text text;
-    text.text = formatted_chat.prompt.c_str();
+    text.text = formatted_chat.c_str();
     text.add_special = add_bos;
     text.parse_special = true;
 
@@ -303,7 +321,7 @@ int main(int argc, char ** argv) {
                 return 1; // error is already printed by libmtmd
             }
         }
-        if (eval_message(ctx, msg, true)) {
+        if (eval_message(ctx, msg)) {
             return 1;
         }
         if (!g_is_interrupted && generate_response(ctx, n_predict)) {
@@ -322,7 +340,6 @@ int main(int argc, char ** argv) {
         LOG("\n /quit or /exit exit the program");
         LOG("\n");
 
-        bool is_first_msg = true;
         std::string content;
 
         while (!g_is_interrupted) {
@@ -342,7 +359,8 @@ int main(int argc, char ** argv) {
             }
             if (line == "/clear") {
                 ctx.n_past = 0;
-                llama_memory_seq_rm(llama_get_memory(ctx.lctx), 0, 1, -1); // keep BOS
+                ctx.chat_history.clear();
+                llama_memory_clear(llama_get_memory(ctx.lctx), true);
                 LOG("Chat history cleared\n\n");
                 continue;
             }
@@ -367,7 +385,7 @@ int main(int argc, char ** argv) {
             common_chat_msg msg;
             msg.role = "user";
             msg.content = content;
-            int ret = eval_message(ctx, msg, is_first_msg);
+            int ret = eval_message(ctx, msg);
             if (ret) {
                 return 1;
             }
@@ -376,7 +394,6 @@ int main(int argc, char ** argv) {
                 return 1;
            }
            content.clear();
-            is_first_msg = false;
        }
    }
    if (g_is_interrupted) LOG("\nInterrupted by user\n");
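
With this change the CLI keeps the full chat_history and formats each new turn with common_chat_format_single rather than rendering every message in isolation. Below is a hedged, Python-flavoured sketch of the general idea (render the history with and without the new message and feed only the suffix to the model); format_single and render here are hypothetical illustrations, not llama.cpp APIs, and the sketch assumes the template renders history + new message as an extension of the history alone.

# Sketch only: incremental chat formatting by diffing two full renders.
def format_single(render, history, new_msg, add_generation_prompt=True):
    before = render(history, add_generation_prompt=False)
    after = render(history + [new_msg], add_generation_prompt=add_generation_prompt)
    # Return only the newly added suffix; the caller appends new_msg to history.
    return after[len(before):] if after.startswith(before) else after

if __name__ == "__main__":
    # Toy "template" for demonstration purposes.
    def render(msgs, add_generation_prompt):
        out = "".join(f"<{m['role']}>{m['content']}</{m['role']}>\n" for m in msgs)
        return out + ("<assistant>" if add_generation_prompt else "")

    history = [{"role": "user", "content": "hi"},
               {"role": "assistant", "content": "hello"}]
    new_msg = {"role": "user", "content": "describe this image"}
    print(format_single(render, history, new_msg))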

tools/run/CMakeLists.txt

Lines changed: 6 additions & 0 deletions
@@ -13,5 +13,11 @@ endif ()
 if(LLAMA_TOOLS_INSTALL)
     install(TARGETS ${TARGET} RUNTIME)
 endif()
+
+if (CMAKE_SYSTEM_NAME MATCHES "AIX")
+    # AIX's flock() function comes from libbsd.a
+    target_link_libraries(${TARGET} PRIVATE -lbsd)
+endif()
+
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT} ${LLAMA_RUN_EXTRA_LIBS})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)

tools/server/server.cpp

Lines changed: 1 addition & 1 deletion
@@ -2839,7 +2839,7 @@ struct server_context {
                         slot.generated_text.begin() + pos + stop_pos,
                         slot.generated_text.end());
                     pos = std::min(slot.n_sent_text, slot.generated_text.size());
-                } else if (slot.has_next_token) {
+                } else if (slot.has_next_token && !llama_vocab_is_eog(vocab, result.tok) ) {
                     stop_pos = slot.find_stopping_strings(str_test, token_str.size(), false);
                     send_text = stop_pos == std::string::npos;
                 }
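
The added !llama_vocab_is_eog(...) condition skips the partial stop-string check once an end-of-generation token has been sampled, so the final chunk of text is not held back waiting for a stop string that can no longer be completed. A simplified Python sketch of that idea follows; is_partial_stop and should_hold_back are illustrative names, not the server's actual data structures or control flow.

# Sketch only: when the last token ends generation, flush pending text
# instead of withholding it on a partial stop-string match.
def is_partial_stop(text: str, stop: str) -> bool:
    # True if some suffix of `text` is a proper prefix of `stop`.
    return any(text.endswith(stop[:i]) for i in range(1, len(stop)))

def should_hold_back(pending: str, stop_strings: list[str], token_is_eog: bool) -> bool:
    if token_is_eog:
        return False  # generation is over; send whatever is pending
    return any(is_partial_stop(pending, s) for s in stop_strings)

# e.g. should_hold_back("... <|", ["<|im_end|>"], token_is_eog=False) -> True
#      should_hold_back("... <|", ["<|im_end|>"], token_is_eog=True)  -> False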
