
Commit 137631e

dkalinowski authored and przepeck committed
Mistral tool calling unary (#3567)
CVS-171565
1 parent 05a50cd commit 137631e

File tree

11 files changed: +348 −6 lines


.dockerignore

Lines changed: 1 addition & 0 deletions
@@ -4,3 +4,4 @@
 out
 demos/continuous_batching
 demos/embeddings
+demos/common/export_models/models

demos/common/export_models/export_model.py

Lines changed: 4 additions & 3 deletions
@@ -52,7 +52,7 @@ def add_common_arguments(parser):
                              'Not effective if target device is not NPU', dest='max_prompt_len')
     parser_text.add_argument('--prompt_lookup_decoding', action='store_true', help='Set pipeline to use prompt lookup decoding', dest='prompt_lookup_decoding')
     parser_text.add_argument('--reasoning_parser', choices=["qwen3"], help='Set the type of the reasoning parser for reasoning content extraction', dest='reasoning_parser')
-    parser_text.add_argument('--tool_parser', choices=["llama3","phi4","hermes3", "qwen3"], help='Set the type of the tool parser for tool calls extraction', dest='tool_parser')
+    parser_text.add_argument('--tool_parser', choices=["llama3","phi4","hermes3", "qwen3","mistral"], help='Set the type of the tool parser for tool calls extraction', dest='tool_parser')
     parser_text.add_argument('--enable_tool_guided_generation', action='store_true', help='Enables enforcing tool schema during generation. Requires setting tool_parser', dest='enable_tool_guided_generation')

     parser_embeddings = subparsers.add_parser('embeddings', help='[deprecated] export model for embeddings endpoint with models split into separate, versioned directories')
@@ -464,15 +464,16 @@ def export_text_generation_model(model_repository_path, source_model, model_name
         f.write(graph_content)
     print("Created graph {}".format(os.path.join(model_repository_path, model_name, 'graph.pbtxt')))

-    if template_parameters.get("tools_model_type") is not None:
+    if template_parameters.get("tool_parser") is not None:
         print("Adding tuned chat template")
         template_mapping = {
             "phi4": "tool_chat_template_phi4_mini.jinja",
             "llama3": "tool_chat_template_llama3.1_json.jinja",
             "hermes3": "tool_chat_template_hermes.jinja",
+            "mistral": "tool_chat_template_mistral_parallel.jinja",
             "qwen3": None
         }
-        template_name = template_mapping[task_parameters.get("tools_model_type")]
+        template_name = template_mapping[task_parameters.get("tool_parser")]
         if template_name is not None:
             template_path = os.path.join(model_repository_path, model_name, "template.jinja")
             import requests
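
For context, a minimal Python sketch (not part of the diff) of the corrected template selection: the lookup key now matches the argument's dest='tool_parser', so choosing --tool_parser mistral maps to tool_chat_template_mistral_parallel.jinja. The task_parameters dict below is a hypothetical stand-in for the parameter dictionary the script builds from the parsed CLI arguments.

# Hypothetical stand-in for the parameters dict built from the parsed CLI arguments.
task_parameters = {"tool_parser": "mistral"}

# Mapping copied from the diff above; "qwen3" deliberately maps to None (no tuned template is downloaded).
template_mapping = {
    "phi4": "tool_chat_template_phi4_mini.jinja",
    "llama3": "tool_chat_template_llama3.1_json.jinja",
    "hermes3": "tool_chat_template_hermes.jinja",
    "mistral": "tool_chat_template_mistral_parallel.jinja",
    "qwen3": None,
}

if task_parameters.get("tool_parser") is not None:
    template_name = template_mapping[task_parameters.get("tool_parser")]
    print(template_name)  # tool_chat_template_mistral_parallel.jinja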

prepare_llm_models.sh

Lines changed: 10 additions & 1 deletion
@@ -30,8 +30,9 @@ QWEN3_MODEL="Qwen/Qwen3-8B"
 LLAMA3_MODEL="meta-llama/Llama-3.1-8B-Instruct"
 HERMES3_MODEL="NousResearch/Hermes-3-Llama-3.1-8B"
 PHI4_MODEL="microsoft/Phi-4-mini-instruct"
+MISTRAL_MODEL="mistralai/Mistral-7B-Instruct-v0.3"

-MODELS=("$CB_MODEL" "$EMBEDDING_MODEL" "$RERANK_MODEL" "$VLM_MODEL" "$QWEN3_MODEL" "$LLAMA3_MODEL" "$HERMES3_MODEL" "$PHI4_MODEL" "$EMBEDDING_MODEL/ov" "$RERANK_MODEL/ov")
+MODELS=("$CB_MODEL" "$EMBEDDING_MODEL" "$RERANK_MODEL" "$VLM_MODEL" "$QWEN3_MODEL" "$LLAMA3_MODEL" "$HERMES3_MODEL" "$PHI4_MODEL" "$MISTRAL_MODEL" "$EMBEDDING_MODEL/ov" "$RERANK_MODEL/ov")

 all_exist=true
 for model in "${MODELS[@]}"; do
@@ -126,3 +127,11 @@ else
     mkdir -p $1/$PHI4_MODEL
     convert_tokenizer $PHI4_MODEL --with_detokenizer -o $1/$PHI4_MODEL
 fi
+
+if [ -d "$1/$MISTRAL_MODEL" ]; then
+    echo "Models directory $1/$MISTRAL_MODEL exists. Skipping downloading models."
+else
+    mkdir -p $1/$MISTRAL_MODEL
+    convert_tokenizer $MISTRAL_MODEL --with_detokenizer -o $1/$MISTRAL_MODEL
+fi
+

src/BUILD

Lines changed: 1 addition & 0 deletions
@@ -2511,6 +2511,7 @@ cc_test(
         "test/llm/output_parsers/qwen3_output_parser_test.cpp",
         "test/llm/output_parsers/hermes3_output_parser_test.cpp",
         "test/llm/output_parsers/phi4_output_parser_test.cpp",
+        "test/llm/output_parsers/mistral_output_parser_test.cpp",
         "test/llm/output_parsers/partial_json_builder_test.cpp",
     ],
     "//:disable_python" : [],

src/llm/BUILD

Lines changed: 2 additions & 0 deletions
@@ -114,6 +114,7 @@ cc_library(
         "io_processing/hermes3/tool_parser.hpp",
         "io_processing/llama3/tool_parser.hpp",
         "io_processing/phi4/tool_parser.hpp",
+        "io_processing/mistral/tool_parser.hpp",
         "io_processing/qwen3/reasoning_parser.hpp",
         "io_processing/output_parser.hpp",
         "io_processing/partial_json_builder.hpp",
@@ -122,6 +123,7 @@ cc_library(
         "io_processing/hermes3/tool_parser.cpp",
         "io_processing/llama3/tool_parser.cpp",
         "io_processing/phi4/tool_parser.cpp",
+        "io_processing/mistral/tool_parser.cpp",
         "io_processing/qwen3/reasoning_parser.cpp",
         "io_processing/output_parser.cpp",
         "io_processing/partial_json_builder.cpp",
src/llm/io_processing/mistral/tool_parser.cpp (new file)

Lines changed: 88 additions & 0 deletions

@@ -0,0 +1,88 @@
+//*****************************************************************************
+// Copyright 2025 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//*****************************************************************************
+
+#include <openvino/genai/tokenizer.hpp>
+#include <string>
+#include <vector>
+#include <regex>
+
+#pragma warning(push)
+#pragma warning(disable : 6313)
+#include <rapidjson/document.h>
+#include <rapidjson/stringbuffer.h>
+#include <rapidjson/writer.h>
+#pragma warning(pop)
+
+#include "../../../logging.hpp"
+#include "tool_parser.hpp"
+#include "../utils.hpp"
+
+namespace ovms {
+
+void MistralToolParser::parse(ParsedOutput& parsedOutput, const std::vector<int64_t>& generatedTokens) {
+    std::vector<std::string> tools;
+
+    if (parsedOutput.content.empty() || generatedTokens.size() <= 0) {
+        SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "No content to parse for tool calls");
+        return;
+    }
+
+    if (generatedTokens[0] != this->botTokenId) {
+        SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Failed to parse functools content or extract tools array");
+        return;
+    }
+
+    rapidjson::Document toolsDoc;
+    toolsDoc.Parse(parsedOutput.content.c_str());
+
+    if (!toolsDoc.HasParseError() && toolsDoc.IsArray()) {
+        for (auto& toolVal : toolsDoc.GetArray()) {
+            if (!toolVal.IsObject()) {
+                SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Tool call is not a valid JSON object");
+                continue;
+            }
+            ToolCall toolCall;
+            if (toolVal.HasMember("name") && toolVal["name"].IsString()) {
+                toolCall.name = toolVal["name"].GetString();
+            } else {
+                SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Tool call does not contain valid name field");
+                continue;
+            }
+
+            if (toolVal.HasMember("arguments") && toolVal["arguments"].IsObject()) {
+                rapidjson::StringBuffer sb;
+                rapidjson::Writer<rapidjson::StringBuffer> toolWriter(sb);
+                toolVal["arguments"].Accept(toolWriter);
+                toolCall.arguments = sb.GetString();
+            } else {
+                SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Tool call does not contain valid parameters object");
+                continue;
+            }
+            toolCall.id = generateRandomId();  // Generate a random ID for the tool call
+            parsedOutput.toolCalls.push_back(toolCall);
+        }
+        parsedOutput.content.clear();
+    } else {
+        SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Failed to parse functools content or extract tools array");
+    }
+}
+
+std::optional<rapidjson::Document> MistralToolParser::parseChunk(const std::string& chunk) {
+    // Not implemented
+    SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "MistralToolParser::parseChunk is not implemented");
+    return std::nullopt;
+}
+}  // namespace ovms
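
The parser above consumes Mistral's unary tool-call format. As an illustration (not part of the commit), here is a small standalone Python sketch of the same logic: when generation starts with the [TOOL_CALLS] special token (token id 5 for this tokenizer), the decoded content that follows is expected to be a JSON array of {"name": ..., "arguments": {...}} objects, and each "arguments" object is re-serialized into a compact JSON string, mirroring the rapidjson handling in parse(). The payload below is purely illustrative.

import json

# Illustrative decoded content following the [TOOL_CALLS] token.
decoded_content = '[{"name": "get_weather", "arguments": {"city": "Paris"}}]'

tool_calls = []
for entry in json.loads(decoded_content):
    # Skip entries that are not objects or that lack valid fields, as the C++ parser does.
    if not isinstance(entry, dict):
        continue
    name = entry.get("name")
    arguments = entry.get("arguments")
    if not isinstance(name, str) or not isinstance(arguments, dict):
        continue
    tool_calls.append({
        "name": name,
        # Arguments stay a serialized JSON string, matching toolCall.arguments above.
        "arguments": json.dumps(arguments, separators=(",", ":")),
    })

print(tool_calls)  # [{'name': 'get_weather', 'arguments': '{"city":"Paris"}'}]
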
src/llm/io_processing/mistral/tool_parser.hpp (new file)

Lines changed: 53 additions & 0 deletions

@@ -0,0 +1,53 @@
+//*****************************************************************************
+// Copyright 2025 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//*****************************************************************************
+#pragma once
+
+#include <openvino/genai/tokenizer.hpp>
+#include <string>
+#include <optional>
+#include <vector>
+
+#pragma warning(push)
+#pragma warning(disable : 6313)
+#include <rapidjson/document.h>
+#include <rapidjson/stringbuffer.h>
+#include <rapidjson/writer.h>
+#pragma warning(pop)
+
+#include "../base_output_parser.hpp"
+
+namespace ovms {
+class MistralToolParser : public BaseOutputParser {
+    const int64_t botTokenId = 5;  // [TOOL_CALLS]
+
+public:
+    MistralToolParser() = delete;
+    explicit MistralToolParser(ov::genai::Tokenizer& tokenizer) :
+        BaseOutputParser(tokenizer) {}
+
+    void parse(ParsedOutput& parsedOutput, const std::vector<int64_t>& generatedTokens) override;
+    std::optional<rapidjson::Document> parseChunk(const std::string& chunk) override;
+    const std::string& getParsingStartTag() const override {
+        static const std::string toolCallStartTag = "[TOOL_CALLS]";
+        return toolCallStartTag;
+    }
+    // Tool calls are expected to be the last part of the content, so we do not specify an end tag.
+    const std::string& getParsingEndTag() const override {
+        static const std::string toolCallEndTag = "";
+        return toolCallEndTag;
+    }
+};
+}  // namespace ovms

src/llm/io_processing/output_parser.cpp

Lines changed: 3 additions & 0 deletions
@@ -19,6 +19,7 @@
 #include "llama3/tool_parser.hpp"
 #include "hermes3/tool_parser.hpp"
 #include "phi4/tool_parser.hpp"
+#include "mistral/tool_parser.hpp"
 #include "qwen3/reasoning_parser.hpp"

 namespace ovms {
@@ -46,6 +47,8 @@ OutputParser::OutputParser(ov::genai::Tokenizer& tokenizer, const std::string to
         toolParser = std::make_unique<Hermes3ToolParser>(tokenizer);
     } else if (toolParserName == "phi4") {
         toolParser = std::make_unique<Phi4ToolParser>(tokenizer);
+    } else if (toolParserName == "mistral") {
+        toolParser = std::make_unique<MistralToolParser>(tokenizer);
     } else if (!toolParserName.empty()) {
         throw std::runtime_error("Unsupported tool parser: " + toolParserName);
     }

src/llm/servable_initializer.cpp

Lines changed: 8 additions & 1 deletion
@@ -60,6 +60,8 @@ void GenAiServableInitializer::loadPyTemplateProcessor(std::shared_ptr<GenAiServ
 global json
 import json
 from pathlib import Path
+global datetime
+import datetime

 global contextmanager
 from contextlib import contextmanager
@@ -73,6 +75,10 @@ void GenAiServableInitializer::loadPyTemplateProcessor(std::shared_ptr<GenAiServ
 def raise_exception(message):
     raise jinja2.exceptions.TemplateError(message)

+# Appears in some of mistral chat templates
+def strftime_now(format):
+    return datetime.datetime.now().strftime(format)
+
 # Following the logic from:
 # https://github.com/huggingface/transformers/blob/7188e2e28c6d663284634732564143b820a03f8b/src/transformers/utils/chat_template_utils.py#L398
 class AssistantTracker(Extension):
@@ -135,7 +141,8 @@ void GenAiServableInitializer::loadPyTemplateProcessor(std::shared_ptr<GenAiServ
 template_loader = jinja2.FileSystemLoader(searchpath=templates_directory)
 jinja_env = ImmutableSandboxedEnvironment(trim_blocks=True, lstrip_blocks=True, extensions=[AssistantTracker, jinja2.ext.loopcontrols], loader=template_loader)
 jinja_env.policies["json.dumps_kwargs"]["ensure_ascii"] = False
-jinja_env.globals["raise_exception"] = raise_exception
+jinja_env.globals["raise_exception"] = raise_exception
+jinja_env.globals["strftime_now"] = strftime_now
 if jinja_file.is_file():
     template = jinja_env.get_template("template.jinja")
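
As a standalone illustration of why this hook is needed (some Mistral chat templates call strftime_now to embed the current date), the sketch below registers the same helper on a sandboxed Jinja environment. It is not part of the commit; the inline template string is illustrative and it assumes jinja2 is installed.

import datetime
from jinja2.sandbox import ImmutableSandboxedEnvironment

def strftime_now(format):
    # Same helper as registered in loadPyTemplateProcessor above.
    return datetime.datetime.now().strftime(format)

jinja_env = ImmutableSandboxedEnvironment(trim_blocks=True, lstrip_blocks=True)
jinja_env.globals["strftime_now"] = strftime_now

# Illustrative template line; the server loads the real chat template from template.jinja.
template = jinja_env.from_string("Today is {{ strftime_now('%d %b %Y') }}.")
print(template.render())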
