Merge branch 'master' into feature/jinaclip-v2-projector

liyang · liyang · commit 7f7f6836cb70 · 2025-10-24T10:03:49.000+08:00
diff --git a/common/arg.cpp b/common/arg.cpp
@@ -3435,7 +3435,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.use_jinja = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
     add_opt(common_arg(
         {"--reasoning-format"}, "FORMAT",
         "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2826,7 +2826,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
         ggml_cuda_topk_moe_ops(/*with_norm=*/false, /*delayed_softmax=*/true);
 
     if (ops.size() == topk_moe_ops_with_norm.size() &&
-        ggml_can_fuse_subgraph(cgraph, node_idx, topk_moe_ops_with_norm, { node_idx + 3, node_idx + 8 })) {
+        ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 8 })) {
         ggml_tensor * softmax = cgraph->nodes[node_idx];
         ggml_tensor * weights = cgraph->nodes[node_idx+8];
 
@@ -2836,7 +2836,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
     }
 
     if (ops.size() == topk_moe_ops.size() &&
-        ggml_can_fuse_subgraph(cgraph, node_idx, topk_moe_ops, { node_idx + 3, node_idx + 4 })) {
+        ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 4 })) {
         ggml_tensor * softmax = cgraph->nodes[node_idx];
         ggml_tensor * weights = cgraph->nodes[node_idx+4];
         if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
@@ -2845,7 +2845,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
     }
 
     if (ops.size() == topk_moe_ops_delayed_softmax.size() &&
-        ggml_can_fuse_subgraph(cgraph, node_idx, topk_moe_ops_delayed_softmax, { node_idx + 2, node_idx + 5 })) {
+        ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 2, node_idx + 5 })) {
         ggml_tensor * softmax = cgraph->nodes[node_idx + 4];
         ggml_tensor * weights = cgraph->nodes[node_idx + 5];
 
diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py
@@ -14,12 +14,12 @@
     SentencePieceProcessor = None
 
 try:
-    from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
-    from mistral_common.tokens.tokenizers.tekken import Tekkenizer
-    from mistral_common.tokens.tokenizers.utils import (
+    from mistral_common.tokens.tokenizers.mistral import MistralTokenizer # pyright: ignore[reportMissingImports]
+    from mistral_common.tokens.tokenizers.tekken import Tekkenizer # pyright: ignore[reportMissingImports]
+    from mistral_common.tokens.tokenizers.utils import ( # pyright: ignore[reportMissingImports]
         _filter_valid_tokenizer_files,
     )
-    from mistral_common.tokens.tokenizers.sentencepiece import (
+    from mistral_common.tokens.tokenizers.sentencepiece import ( # pyright: ignore[reportMissingImports]
         SentencePieceTokenizer,
     )
 except ImportError:
diff --git a/requirements/requirements-convert_hf_to_gguf.txt b/requirements/requirements-convert_hf_to_gguf.txt
@@ -1,5 +1,3 @@
-mistral-common>=1.8.3
-
 -r ./requirements-convert_legacy_llama.txt
 --extra-index-url https://download.pytorch.org/whl/cpu
 
diff --git a/tools/imatrix/CMakeLists.txt b/tools/imatrix/CMakeLists.txt
@@ -6,3 +6,8 @@ target_compile_features(${TARGET} PRIVATE cxx_std_17)
 if(LLAMA_TOOLS_INSTALL)
     install(TARGETS ${TARGET} RUNTIME)
 endif()
+
+if (CMAKE_SYSTEM_NAME MATCHES "AIX")
+    # AIX's flock() function comes from libbsd.a
+    target_link_libraries(${TARGET} PRIVATE -lbsd)
+endif()
diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp
@@ -82,9 +82,11 @@ struct mtmd_cli_context {
 
     mtmd::bitmaps bitmaps;
 
-    // note: we know that gemma3 template is "linear", meaning each turn is completely separated to another
-    // so here we don't need to keep track of chat history
+    // chat template
     common_chat_templates_ptr tmpls;
+    std::vector<common_chat_msg> chat_history;
+    bool use_jinja = false;
+    // TODO: support for --system-prompt with /clear command
 
     // support for legacy templates (models not having EOT token)
     llama_tokens antiprompt_tokens;
@@ -114,6 +116,8 @@ struct mtmd_cli_context {
         }
 
         tmpls = common_chat_templates_init(model, params.chat_template);
+        use_jinja = params.use_jinja;
+        chat_history.clear();
         LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(tmpls.get(), params.use_jinja, params.default_template_kwargs).c_str());
 
         init_vision_context(params);
@@ -324,19 +328,33 @@ static int generate_response(mtmd_cli_context & ctx, int n_predict) {
             return 1;
         }
     }
+
+    std::string generated_text = common_detokenize(ctx.lctx, generated_tokens);
+    common_chat_msg msg;
+    msg.role    = "assistant";
+    msg.content = generated_text;
+    ctx.chat_history.push_back(std::move(msg));
+
     return 0;
 }
 
-static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, bool add_bos = false) {
-    common_chat_templates_inputs tmpl_inputs;
-    tmpl_inputs.messages = {msg};
-    tmpl_inputs.add_generation_prompt = true;
-    tmpl_inputs.use_jinja = false; // jinja is buggy here
-    auto formatted_chat = common_chat_templates_apply(ctx.tmpls.get(), tmpl_inputs);
-    LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.prompt.c_str());
+static std::string chat_add_and_format(mtmd_cli_context & ctx, common_chat_msg & new_msg) { // formats new_msg against the stored history, then records it in ctx.chat_history
+    LOG_DBG("chat_add_and_format: new_msg.role='%s', new_msg.content='%s'\n",
+        new_msg.role.c_str(), new_msg.content.c_str());
+    auto formatted = common_chat_format_single(ctx.tmpls.get(), ctx.chat_history, // the history passed here does not yet contain new_msg
+        new_msg, new_msg.role == "user", // flag is true only for user messages; presumably the "add assistant prompt" switch - confirm against common_chat_format_single
+        ctx.use_jinja);
+    ctx.chat_history.push_back(new_msg); // appended (as a copy) only after formatting, so the call above saw just the earlier turns
+    return formatted; // the string produced by common_chat_format_single for this message
+}
+
+static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg) {
+    bool add_bos = ctx.chat_history.empty();
+    auto formatted_chat = chat_add_and_format(ctx, msg);
+    LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.c_str());
 
     mtmd_input_text text;
-    text.text          = formatted_chat.prompt.c_str();
+    text.text          = formatted_chat.c_str();
     text.add_special   = add_bos;
     text.parse_special = true;
 
@@ -446,7 +464,7 @@ int main(int argc, char ** argv) {
                 return 1; // error is already printed by libmtmd
             }
         }
-        if (eval_message(ctx, msg, true)) {
+        if (eval_message(ctx, msg)) {
             return 1;
         }
         if (!g_is_interrupted && generate_response(ctx, n_predict)) {
@@ -465,7 +483,6 @@ int main(int argc, char ** argv) {
         LOG("\n   /quit or /exit   exit the program");
         LOG("\n");
 
-        bool is_first_msg = true;
         std::string content;
 
         while (!g_is_interrupted) {
@@ -485,7 +502,8 @@ int main(int argc, char ** argv) {
             }
             if (line == "/clear") {
                 ctx.n_past = 0;
-                llama_memory_seq_rm(llama_get_memory(ctx.lctx), 0, 1, -1); // keep BOS
+                ctx.chat_history.clear();
+                llama_memory_clear(llama_get_memory(ctx.lctx), true);
                 LOG("Chat history cleared\n\n");
                 continue;
             }
@@ -510,7 +528,7 @@ int main(int argc, char ** argv) {
             common_chat_msg msg;
             msg.role = "user";
             msg.content = content;
-            int ret = eval_message(ctx, msg, is_first_msg);
+            int ret = eval_message(ctx, msg);
             if (ret) {
                 return 1;
             }
@@ -519,7 +537,6 @@ int main(int argc, char ** argv) {
                 return 1;
             }
             content.clear();
-            is_first_msg = false;
         }
     }
     if (g_is_interrupted) LOG("\nInterrupted by user\n");
diff --git a/tools/run/CMakeLists.txt b/tools/run/CMakeLists.txt
@@ -13,5 +13,11 @@ endif ()
 if(LLAMA_TOOLS_INSTALL)
     install(TARGETS ${TARGET} RUNTIME)
 endif()
+
+if (CMAKE_SYSTEM_NAME MATCHES "AIX")
+    # AIX's flock() function comes from libbsd.a
+    target_link_libraries(${TARGET} PRIVATE -lbsd)
+endif()
+
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT} ${LLAMA_RUN_EXTRA_LIBS})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
@@ -2839,7 +2839,7 @@ struct server_context {
                     slot.generated_text.begin() + pos + stop_pos,
                     slot.generated_text.end());
                 pos = std::min(slot.n_sent_text, slot.generated_text.size());
-            } else if (slot.has_next_token) {
+            } else if (slot.has_next_token && !llama_vocab_is_eog(vocab, result.tok) ) {
                 stop_pos = slot.find_stopping_strings(str_test, token_str.size(), false);
                 send_text = stop_pos == std::string::npos;
             }
@@ -5714,6 +5714,7 @@ int main(int argc, char ** argv) {
 
     clean_up();
     t.join();
+    llama_memory_breakdown_print(ctx_server.ctx);
 
     return 0;
 }

Original file line number	Diff line number	Diff line change
`@@ -3435,7 +3435,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex`
`3435`	`3435`	`[](common_params & params) {`
`3436`	`3436`	`params.use_jinja = true;`
`3437`	`3437`	`}`
`3438`		`- ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA"));`
	`3438`	`+ ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));`
`3439`	`3439`	`add_opt(common_arg(`
`3440`	`3440`	`{"--reasoning-format"}, "FORMAT",`
`3441`	`3441`	`"controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"`
Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,3 @@`
`1`		`-mistral-common>=1.8.3`
`2`		`-`
`3`	`1`	`-r ./requirements-convert_legacy_llama.txt`
`4`	`2`	`--extra-index-url https://download.pytorch.org/whl/cpu`
`5`	`3`
Original file line number	Diff line number	Diff line change
`@@ -2839,7 +2839,7 @@ struct server_context {`
`2839`	`2839`	`slot.generated_text.begin() + pos + stop_pos,`
`2840`	`2840`	`slot.generated_text.end());`
`2841`	`2841`	`pos = std::min(slot.n_sent_text, slot.generated_text.size());`
`2842`		`- } else if (slot.has_next_token) {`
	`2842`	`+ } else if (slot.has_next_token && !llama_vocab_is_eog(vocab, result.tok) ) {`
`2843`	`2843`	`stop_pos = slot.find_stopping_strings(str_test, token_str.size(), false);`
`2844`	`2844`	`send_text = stop_pos == std::string::npos;`
`2845`	`2845`	`}`
`@@ -5714,6 +5714,7 @@ int main(int argc, char ** argv) {`
`5714`	`5714`
`5715`	`5715`	`clean_up();`
`5716`	`5716`	`t.join();`
	`5717`	`+ llama_memory_breakdown_print(ctx_server.ctx);`
`5717`	`5718`
`5718`	`5719`	`return 0;`
`5719`	`5720`	`}`