Commit 86d2de5

Author: liyang (committed)
Merge branch 'master' into bugfix-server-vision-mtmd
2 parents: c469e8a + f8f071f

File tree: 10 files changed (+308, -254 lines)

common/arg.cpp

Lines changed: 1 addition & 1 deletion
@@ -3435,7 +3435,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params) {
            params.use_jinja = true;
        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
    add_opt(common_arg(
        {"--reasoning-format"}, "FORMAT",
        "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"

convert_hf_to_gguf.py

Lines changed: 254 additions & 221 deletions
Large diffs are not rendered by default.

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 3 additions & 3 deletions
@@ -2826,7 +2826,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
        ggml_cuda_topk_moe_ops(/*with_norm=*/false, /*delayed_softmax=*/true);

    if (ops.size() == topk_moe_ops_with_norm.size() &&
-       ggml_can_fuse_subgraph(cgraph, node_idx, topk_moe_ops_with_norm, { node_idx + 3, node_idx + 8 })) {
+       ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 8 })) {
        ggml_tensor * softmax = cgraph->nodes[node_idx];
        ggml_tensor * weights = cgraph->nodes[node_idx+8];

@@ -2836,7 +2836,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
    }

    if (ops.size() == topk_moe_ops.size() &&
-       ggml_can_fuse_subgraph(cgraph, node_idx, topk_moe_ops, { node_idx + 3, node_idx + 4 })) {
+       ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 4 })) {
        ggml_tensor * softmax = cgraph->nodes[node_idx];
        ggml_tensor * weights = cgraph->nodes[node_idx+4];
        if (ggml_cuda_should_use_topk_moe(softmax, weights)) {

@@ -2845,7 +2845,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
    }

    if (ops.size() == topk_moe_ops_delayed_softmax.size() &&
-       ggml_can_fuse_subgraph(cgraph, node_idx, topk_moe_ops_delayed_softmax, { node_idx + 2, node_idx + 5 })) {
+       ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 2, node_idx + 5 })) {
        ggml_tensor * softmax = cgraph->nodes[node_idx + 4];
        ggml_tensor * weights = cgraph->nodes[node_idx + 5];
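
All three hunks fix the same inconsistency: the size check compares against a fixed candidate list, but the subgraph match was then performed against that fixed list rather than the ops sequence selected earlier, so the two checks could presumably disagree when different candidate lists have the same length. Passing ops to ggml_can_fuse_subgraph keeps both checks on the same list. A toy C++ sketch of the pattern (the op names and matches() helper below are made up for illustration, not ggml's API):

// Toy illustration: two candidate patterns of equal length but different contents.
// Checking the size of `ops` while matching the graph against a fixed pattern can
// give a verdict that disagrees with the selected variant; matching against `ops`
// itself keeps the size check and the content check consistent.
#include <cstdio>
#include <vector>

enum op { SOFT_MAX, ARGSORT, VIEW, GET_ROWS };

static bool matches(const std::vector<op> & graph, size_t idx, const std::vector<op> & pattern) {
    if (idx + pattern.size() > graph.size()) return false;
    for (size_t i = 0; i < pattern.size(); ++i) {
        if (graph[idx + i] != pattern[i]) return false;
    }
    return true;
}

int main() {
    const std::vector<op> pattern_a = { SOFT_MAX, ARGSORT, VIEW };
    const std::vector<op> pattern_b = { SOFT_MAX, ARGSORT, GET_ROWS }; // same size as pattern_a
    const std::vector<op> graph     = { SOFT_MAX, ARGSORT, GET_ROWS };

    const std::vector<op> & ops = pattern_b; // the variant selected earlier in the function

    bool inconsistent = ops.size() == pattern_a.size() && matches(graph, 0, pattern_a); // false
    bool consistent   = ops.size() == pattern_a.size() && matches(graph, 0, ops);       // true
    printf("inconsistent=%d consistent=%d\n", inconsistent, consistent);
    return 0;
}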

gguf-py/gguf/vocab.py

Lines changed: 4 additions & 4 deletions
@@ -14,12 +14,12 @@
     SentencePieceProcessor = None

 try:
-    from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
-    from mistral_common.tokens.tokenizers.tekken import Tekkenizer
-    from mistral_common.tokens.tokenizers.utils import (
+    from mistral_common.tokens.tokenizers.mistral import MistralTokenizer  # pyright: ignore[reportMissingImports]
+    from mistral_common.tokens.tokenizers.tekken import Tekkenizer  # pyright: ignore[reportMissingImports]
+    from mistral_common.tokens.tokenizers.utils import (  # pyright: ignore[reportMissingImports]
         _filter_valid_tokenizer_files,
     )
-    from mistral_common.tokens.tokenizers.sentencepiece import (
+    from mistral_common.tokens.tokenizers.sentencepiece import (  # pyright: ignore[reportMissingImports]
         SentencePieceTokenizer,
     )
 except ImportError:

requirements/requirements-convert_hf_to_gguf.txt

Lines changed: 0 additions & 2 deletions
@@ -1,5 +1,3 @@
-mistral-common>=1.8.3
-
 -r ./requirements-convert_legacy_llama.txt
 --extra-index-url https://download.pytorch.org/whl/cpu

tools/imatrix/CMakeLists.txt

Lines changed: 5 additions & 0 deletions
@@ -6,3 +6,8 @@ target_compile_features(${TARGET} PRIVATE cxx_std_17)
 if(LLAMA_TOOLS_INSTALL)
     install(TARGETS ${TARGET} RUNTIME)
 endif()
+
+if (CMAKE_SYSTEM_NAME MATCHES "AIX")
+    # AIX's flock() function comes from libbsd.a
+    target_link_libraries(${TARGET} PRIVATE -lbsd)
+endif()
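
For context on why -lbsd is linked here: per the comment in the hunk, flock() lives in libbsd.a on AIX, so any code path in the tool that locks a file needs that library at link time. A minimal, self-contained flock() usage for illustration (the lock path below is made up; this is not the tool's actual call site):

// Illustrative flock() usage; on AIX the flock symbol is provided by libbsd.a.
#include <cstdio>
#include <fcntl.h>
#include <sys/file.h>
#include <unistd.h>

int main() {
    int fd = open("/tmp/example.lock", O_CREAT | O_RDWR, 0644); // hypothetical lock file
    if (fd < 0) { perror("open"); return 1; }
    if (flock(fd, LOCK_EX) != 0) {   // exclusive advisory lock
        perror("flock");
        close(fd);
        return 1;
    }
    // ... critical section protected by the lock ...
    flock(fd, LOCK_UN);
    close(fd);
    return 0;
}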

tools/mtmd/mtmd-cli.cpp

Lines changed: 32 additions & 15 deletions
@@ -76,9 +76,11 @@ struct mtmd_cli_context {

    mtmd::bitmaps bitmaps;

-   // note: we know that gemma3 template is "linear", meaning each turn is completely separated to another
-   // so here we don't need to keep track of chat history
+   // chat template
    common_chat_templates_ptr tmpls;
+   std::vector<common_chat_msg> chat_history;
+   bool use_jinja = false;
+   // TODO: support for --system-prompt with /clear command

    // support for legacy templates (models not having EOT token)
    llama_tokens antiprompt_tokens;

@@ -108,6 +110,8 @@ struct mtmd_cli_context {
        }

        tmpls = common_chat_templates_init(model, params.chat_template);
+       use_jinja = params.use_jinja;
+       chat_history.clear();
        LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(tmpls.get(), params.use_jinja, params.default_template_kwargs).c_str());

        init_vision_context(params);

@@ -193,19 +197,33 @@ static int generate_response(mtmd_cli_context & ctx, int n_predict) {
            return 1;
        }
    }
+
+   std::string generated_text = common_detokenize(ctx.lctx, generated_tokens);
+   common_chat_msg msg;
+   msg.role = "assistant";
+   msg.content = generated_text;
+   ctx.chat_history.push_back(std::move(msg));
+
    return 0;
}

-static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, bool add_bos = false) {
-   common_chat_templates_inputs tmpl_inputs;
-   tmpl_inputs.messages = {msg};
-   tmpl_inputs.add_generation_prompt = true;
-   tmpl_inputs.use_jinja = false; // jinja is buggy here
-   auto formatted_chat = common_chat_templates_apply(ctx.tmpls.get(), tmpl_inputs);
-   LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.prompt.c_str());
+static std::string chat_add_and_format(mtmd_cli_context & ctx, common_chat_msg & new_msg) {
+   LOG_DBG("chat_add_and_format: new_msg.role='%s', new_msg.content='%s'\n",
+           new_msg.role.c_str(), new_msg.content.c_str());
+   auto formatted = common_chat_format_single(ctx.tmpls.get(), ctx.chat_history,
+                                              new_msg, new_msg.role == "user",
+                                              ctx.use_jinja);
+   ctx.chat_history.push_back(new_msg);
+   return formatted;
+}
+
+static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg) {
+   bool add_bos = ctx.chat_history.empty();
+   auto formatted_chat = chat_add_and_format(ctx, msg);
+   LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.c_str());

    mtmd_input_text text;
-   text.text = formatted_chat.prompt.c_str();
+   text.text = formatted_chat.c_str();
    text.add_special = add_bos;
    text.parse_special = true;

@@ -303,7 +321,7 @@ int main(int argc, char ** argv) {
                return 1; // error is already printed by libmtmd
            }
        }
-       if (eval_message(ctx, msg, true)) {
+       if (eval_message(ctx, msg)) {
            return 1;
        }
        if (!g_is_interrupted && generate_response(ctx, n_predict)) {

@@ -322,7 +340,6 @@ int main(int argc, char ** argv) {
        LOG("\n /quit or /exit exit the program");
        LOG("\n");

-       bool is_first_msg = true;
        std::string content;

        while (!g_is_interrupted) {

@@ -342,7 +359,8 @@ int main(int argc, char ** argv) {
            }
            if (line == "/clear") {
                ctx.n_past = 0;
-               llama_memory_seq_rm(llama_get_memory(ctx.lctx), 0, 1, -1); // keep BOS
+               ctx.chat_history.clear();
+               llama_memory_clear(llama_get_memory(ctx.lctx), true);
                LOG("Chat history cleared\n\n");
                continue;
            }

@@ -367,7 +385,7 @@ int main(int argc, char ** argv) {
            common_chat_msg msg;
            msg.role = "user";
            msg.content = content;
-           int ret = eval_message(ctx, msg, is_first_msg);
+           int ret = eval_message(ctx, msg);
            if (ret) {
                return 1;
            }

@@ -376,7 +394,6 @@ int main(int argc, char ** argv) {
                return 1;
            }
            content.clear();
-           is_first_msg = false;
        }
    }
    if (g_is_interrupted) LOG("\nInterrupted by user\n");
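
The core of this change: mtmd-cli now keeps the full chat_history and, per turn, formats just the new message against that history via common_chat_format_single, instead of treating each turn as an isolated prompt. A self-contained sketch of the incremental-formatting idea (the toy template and helper names below are hypothetical, not the common_chat implementation):

// Incremental chat formatting: render the history with and without the new
// message and feed the model only the suffix, so earlier turns are never
// re-tokenized. apply_template() here is a toy stand-in for a real template.
#include <cstdio>
#include <string>
#include <vector>

struct chat_msg { std::string role, content; };

static std::string apply_template(const std::vector<chat_msg> & msgs, bool add_generation_prompt) {
    std::string out;
    for (const auto & m : msgs) out += "<|" + m.role + "|>" + m.content + "\n";
    if (add_generation_prompt) out += "<|assistant|>";
    return out;
}

// Returns only the delta introduced by new_msg, mirroring the idea behind
// common_chat_format_single in the diff above.
static std::string format_single(std::vector<chat_msg> & history, const chat_msg & new_msg) {
    const std::string before = apply_template(history, false);
    history.push_back(new_msg);
    const std::string after  = apply_template(history, new_msg.role == "user");
    return after.substr(before.size());
}

int main() {
    std::vector<chat_msg> history;
    printf("%s\n", format_single(history, {"user", "hello"}).c_str());       // first turn: full prompt
    history.push_back({"assistant", "hi there"});                            // record the reply
    printf("%s\n", format_single(history, {"user", "what's new?"}).c_str()); // only the new turn
    return 0;
}

One consequence visible in the hunks: /clear now resets both chat_history and the model memory (llama_memory_clear) so the next turn starts from an empty history.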

tools/run/CMakeLists.txt

Lines changed: 6 additions & 0 deletions
@@ -13,5 +13,11 @@ endif ()
 if(LLAMA_TOOLS_INSTALL)
     install(TARGETS ${TARGET} RUNTIME)
 endif()
+
+if (CMAKE_SYSTEM_NAME MATCHES "AIX")
+    # AIX's flock() function comes from libbsd.a
+    target_link_libraries(${TARGET} PRIVATE -lbsd)
+endif()
+
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT} ${LLAMA_RUN_EXTRA_LIBS})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)

tools/server/server.cpp

Lines changed: 2 additions & 1 deletion
@@ -2839,7 +2839,7 @@ struct server_context {
                        slot.generated_text.begin() + pos + stop_pos,
                        slot.generated_text.end());
                    pos = std::min(slot.n_sent_text, slot.generated_text.size());
-               } else if (slot.has_next_token) {
+               } else if (slot.has_next_token && !llama_vocab_is_eog(vocab, result.tok) ) {
                    stop_pos = slot.find_stopping_strings(str_test, token_str.size(), false);
                    send_text = stop_pos == std::string::npos;
                }

@@ -5714,6 +5714,7 @@ int main(int argc, char ** argv) {

    clean_up();
    t.join();
+   llama_memory_breakdown_print(ctx_server.ctx);

    return 0;
}
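
The first hunk makes the streaming path skip the partial stop-string check when the sampled token is an end-of-generation token; the second adds a memory-breakdown printout at shutdown. For context, a sketch of the partial-match idea behind that check (an assumption-based illustration, not the server's find_stopping_strings):

// If the tail of the streamed text could be the beginning of a stop string,
// report where, so the caller can hold that tail back until more tokens arrive.
#include <cstdio>
#include <string>
#include <vector>

static size_t find_partial_stop(const std::string & text, const std::vector<std::string> & stops) {
    for (size_t start = text.size(); start-- > 0; ) {
        const std::string tail = text.substr(start);
        for (const auto & s : stops) {
            // tail is a proper prefix of a stop string -> possible partial match
            if (s.size() > tail.size() && s.compare(0, tail.size(), tail) == 0) {
                return start;
            }
        }
    }
    return std::string::npos;
}

int main() {
    const std::vector<std::string> stops = { "</answer>" };

    size_t p1 = find_partial_stop("The result is 42 </ans", stops);
    printf("partial match at %zu -> hold back the tail\n", p1);              // 17

    size_t p2 = find_partial_stop("The result is 42.", stops);
    printf("%s\n", p2 == std::string::npos ? "no partial match -> send" : "hold back");
    return 0;
}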

tools/server/utils.hpp

Lines changed: 1 addition & 7 deletions
@@ -756,13 +756,7 @@ static json oaicompat_chat_params_parse(
    }

    llama_params["chat_format"] = static_cast<int>(chat_params.format);
-   if (!out_files.empty()) {
-       std::string prompt_mm = chat_params.prompt;
-       string_replace_all(prompt_mm, "<start_of_image><end_of_image>", mtmd_default_marker());
-       llama_params["prompt"] = std::move(prompt_mm);
-   } else {
-       llama_params["prompt"] = chat_params.prompt;
-   }
+   llama_params["prompt"] = chat_params.prompt;
    if (!chat_params.grammar.empty()) {
        llama_params["grammar"] = chat_params.grammar;
    }
