Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@
out
demos/continuous_batching
demos/embeddings
demos/common/export_models/models
7 changes: 4 additions & 3 deletions demos/common/export_models/export_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def add_common_arguments(parser):
'Not effective if target device is not NPU', dest='max_prompt_len')
parser_text.add_argument('--prompt_lookup_decoding', action='store_true', help='Set pipeline to use prompt lookup decoding', dest='prompt_lookup_decoding')
parser_text.add_argument('--reasoning_parser', choices=["qwen3"], help='Set the type of the reasoning parser for reasoning content extraction', dest='reasoning_parser')
parser_text.add_argument('--tool_parser', choices=["llama3","phi4","hermes3", "qwen3"], help='Set the type of the tool parser for tool calls extraction', dest='tool_parser')
parser_text.add_argument('--tool_parser', choices=["llama3","phi4","hermes3", "qwen3","mistral"], help='Set the type of the tool parser for tool calls extraction', dest='tool_parser')
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

unrelated, but qwen3 should not be here

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is already being changed in your PR, right? I'm rebased

parser_text.add_argument('--enable_tool_guided_generation', action='store_true', help='Enables enforcing tool schema during generation. Requires setting tool_parser', dest='enable_tool_guided_generation')

parser_embeddings = subparsers.add_parser('embeddings', help='[deprecated] export model for embeddings endpoint with models split into separate, versioned directories')
Expand Down Expand Up @@ -464,15 +464,16 @@ def export_text_generation_model(model_repository_path, source_model, model_name
f.write(graph_content)
print("Created graph {}".format(os.path.join(model_repository_path, model_name, 'graph.pbtxt')))

if template_parameters.get("tools_model_type") is not None:
if template_parameters.get("tool_parser") is not None:
print("Adding tuned chat template")
template_mapping = {
"phi4": "tool_chat_template_phi4_mini.jinja",
"llama3": "tool_chat_template_llama3.1_json.jinja",
"hermes3": "tool_chat_template_hermes.jinja",
"mistral": "tool_chat_template_mistral_parallel.jinja",
"qwen3": None
}
template_name = template_mapping[task_parameters.get("tools_model_type")]
template_name = template_mapping[task_parameters.get("tool_parser")]
if template_name is not None:
template_path = os.path.join(model_repository_path, model_name, "template.jinja")
import requests
Expand Down
11 changes: 10 additions & 1 deletion prepare_llm_models.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,9 @@ QWEN3_MODEL="Qwen/Qwen3-8B"
LLAMA3_MODEL="meta-llama/Llama-3.1-8B-Instruct"
HERMES3_MODEL="NousResearch/Hermes-3-Llama-3.1-8B"
PHI4_MODEL="microsoft/Phi-4-mini-instruct"
MISTRAL_MODEL="mistralai/Mistral-7B-Instruct-v0.3"

MODELS=("$CB_MODEL" "$EMBEDDING_MODEL" "$RERANK_MODEL" "$VLM_MODEL" "$QWEN3_MODEL" "$LLAMA3_MODEL" "$HERMES3_MODEL" "$PHI4_MODEL" "$EMBEDDING_MODEL/ov" "$RERANK_MODEL/ov")
MODELS=("$CB_MODEL" "$EMBEDDING_MODEL" "$RERANK_MODEL" "$VLM_MODEL" "$QWEN3_MODEL" "$LLAMA3_MODEL" "$HERMES3_MODEL" "$PHI4_MODEL" "$MISTRAL_MODEL" "$EMBEDDING_MODEL/ov" "$RERANK_MODEL/ov")

all_exist=true
for model in "${MODELS[@]}"; do
Expand Down Expand Up @@ -126,3 +127,11 @@ else
mkdir -p $1/$PHI4_MODEL
convert_tokenizer $PHI4_MODEL --with_detokenizer -o $1/$PHI4_MODEL
fi

# Prepare the Mistral tokenizer model unless it has already been converted.
# $1 is the models root directory passed to the script.
if [ -d "$1/$MISTRAL_MODEL" ]; then
    echo "Models directory $1/$MISTRAL_MODEL exists. Skipping downloading models."
else
    # Quote expansions so paths containing spaces or glob characters do not break.
    mkdir -p "$1/$MISTRAL_MODEL"
    convert_tokenizer "$MISTRAL_MODEL" --with_detokenizer -o "$1/$MISTRAL_MODEL"
fi

1 change: 1 addition & 0 deletions src/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -2511,6 +2511,7 @@ cc_test(
"test/llm/output_parsers/qwen3_output_parser_test.cpp",
"test/llm/output_parsers/hermes3_output_parser_test.cpp",
"test/llm/output_parsers/phi4_output_parser_test.cpp",
"test/llm/output_parsers/mistral_output_parser_test.cpp",
"test/llm/output_parsers/partial_json_builder_test.cpp",
],
"//:disable_python" : [],
Expand Down
2 changes: 2 additions & 0 deletions src/llm/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ cc_library(
"io_processing/hermes3/tool_parser.hpp",
"io_processing/llama3/tool_parser.hpp",
"io_processing/phi4/tool_parser.hpp",
"io_processing/mistral/tool_parser.hpp",
"io_processing/qwen3/reasoning_parser.hpp",
"io_processing/output_parser.hpp",
"io_processing/partial_json_builder.hpp",
Expand All @@ -122,6 +123,7 @@ cc_library(
"io_processing/hermes3/tool_parser.cpp",
"io_processing/llama3/tool_parser.cpp",
"io_processing/phi4/tool_parser.cpp",
"io_processing/mistral/tool_parser.cpp",
"io_processing/qwen3/reasoning_parser.cpp",
"io_processing/output_parser.cpp",
"io_processing/partial_json_builder.cpp",
Expand Down
88 changes: 88 additions & 0 deletions src/llm/io_processing/mistral/tool_parser.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
//*****************************************************************************
// Copyright 2025 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************

#include <openvino/genai/tokenizer.hpp>
#include <string>
#include <vector>
#include <regex>

#pragma warning(push)
#pragma warning(disable : 6313)
#include <rapidjson/document.h>
#include <rapidjson/stringbuffer.h>
#include <rapidjson/writer.h>
#pragma warning(pop)

#include "../../../logging.hpp"
#include "tool_parser.hpp"
#include "../utils.hpp"

namespace ovms {

// Extracts Mistral-style tool calls from the generated output.
//
// Mistral models emit tool calls as a JSON array of {"name": ..., "arguments": {...}}
// objects immediately following the [TOOL_CALLS] special token. When such an array is
// successfully parsed, each valid entry is appended to parsedOutput.toolCalls and the
// textual content is cleared (tool calls replace regular content). On any failure the
// method returns early and leaves parsedOutput untouched.
//
// parsedOutput    - in/out; content is consumed and toolCalls populated on success
// generatedTokens - raw token ids of the generation; only the first token is inspected
//                   to detect the [TOOL_CALLS] marker
void MistralToolParser::parse(ParsedOutput& parsedOutput, const std::vector<int64_t>& generatedTokens) {
    if (parsedOutput.content.empty() || generatedTokens.empty()) {
        SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "No content to parse for tool calls");
        return;
    }

    // Tool calls are only expected when generation starts with the [TOOL_CALLS] token.
    if (generatedTokens[0] != this->botTokenId) {
        SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Generated output does not start with the [TOOL_CALLS] token, no tool calls to extract");
        return;
    }

    rapidjson::Document toolsDoc;
    toolsDoc.Parse(parsedOutput.content.c_str());

    if (toolsDoc.HasParseError() || !toolsDoc.IsArray()) {
        SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Failed to parse functools content or extract tools array");
        return;
    }

    for (auto& toolVal : toolsDoc.GetArray()) {
        if (!toolVal.IsObject()) {
            SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Tool call is not a valid JSON object");
            continue;
        }
        ToolCall toolCall;
        if (toolVal.HasMember("name") && toolVal["name"].IsString()) {
            toolCall.name = toolVal["name"].GetString();
        } else {
            SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Tool call does not contain valid name field");
            continue;
        }

        if (toolVal.HasMember("arguments") && toolVal["arguments"].IsObject()) {
            // Re-serialize the arguments object so toolCall.arguments holds a JSON string.
            rapidjson::StringBuffer sb;
            rapidjson::Writer<rapidjson::StringBuffer> toolWriter(sb);
            toolVal["arguments"].Accept(toolWriter);
            toolCall.arguments = sb.GetString();
        } else {
            SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Tool call does not contain valid parameters object");
            continue;
        }
        toolCall.id = generateRandomId();  // Generate a random ID for the tool call
        parsedOutput.toolCalls.push_back(toolCall);
    }
    // Tool calls replace regular content; clear it only after successful array parsing.
    parsedOutput.content.clear();
}

// Streaming (chunk-by-chunk) tool-call extraction is not supported for the
// Mistral parser; every chunk is ignored and no document is ever produced.
std::optional<rapidjson::Document> MistralToolParser::parseChunk(const std::string& chunk) {
    (void)chunk;  // unused until streaming support is implemented
    SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "MistralToolParser::parseChunk is not implemented");
    return std::nullopt;
}
} // namespace ovms
53 changes: 53 additions & 0 deletions src/llm/io_processing/mistral/tool_parser.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
//*****************************************************************************
// Copyright 2025 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once

#include <openvino/genai/tokenizer.hpp>
#include <string>
#include <optional>
#include <vector>

#pragma warning(push)
#pragma warning(disable : 6313)
#include <rapidjson/document.h>
#include <rapidjson/stringbuffer.h>
#include <rapidjson/writer.h>
#pragma warning(pop)

#include "../base_output_parser.hpp"

namespace ovms {
// Output parser for Mistral models' tool calling. Detects a generation that
// begins with the [TOOL_CALLS] special token and parses the JSON array of
// tool-call objects that follows it (see parse() in the .cpp).
class MistralToolParser : public BaseOutputParser {
    // Token id of the [TOOL_CALLS] special token.
    // NOTE(review): hardcoded to 5 — assumes the Mistral tokenizer maps
    // [TOOL_CALLS] to this id; verify against the model's tokenizer config.
    const int64_t botTokenId = 5;  // [TOOL_CALLS]

public:
    MistralToolParser() = delete;
    explicit MistralToolParser(ov::genai::Tokenizer& tokenizer) :
        BaseOutputParser(tokenizer) {}

    // Extracts tool calls from the full generation; see tool_parser.cpp.
    void parse(ParsedOutput& parsedOutput, const std::vector<int64_t>& generatedTokens) override;
    // Streaming extraction — not implemented; always returns std::nullopt.
    std::optional<rapidjson::Document> parseChunk(const std::string& chunk) override;
    // Marker that precedes tool-call content in the model output.
    const std::string& getParsingStartTag() const override {
        static const std::string toolCallStartTag = "[TOOL_CALLS]";
        return toolCallStartTag;
    }
    // Tools calls are expected to be the last part of the content, so we do not specify an end tag.
    const std::string& getParsingEndTag() const override {
        static const std::string toolCallEndTag = "";
        return toolCallEndTag;
    }
};
} // namespace ovms
3 changes: 3 additions & 0 deletions src/llm/io_processing/output_parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include "llama3/tool_parser.hpp"
#include "hermes3/tool_parser.hpp"
#include "phi4/tool_parser.hpp"
#include "mistral/tool_parser.hpp"
#include "qwen3/reasoning_parser.hpp"

namespace ovms {
Expand Down Expand Up @@ -46,6 +47,8 @@ OutputParser::OutputParser(ov::genai::Tokenizer& tokenizer, const std::string to
toolParser = std::make_unique<Hermes3ToolParser>(tokenizer);
} else if (toolParserName == "phi4") {
toolParser = std::make_unique<Phi4ToolParser>(tokenizer);
} else if (toolParserName == "mistral") {
toolParser = std::make_unique<MistralToolParser>(tokenizer);
} else if (!toolParserName.empty()) {
throw std::runtime_error("Unsupported tool parser: " + toolParserName);
}
Expand Down
9 changes: 8 additions & 1 deletion src/llm/servable_initializer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ void GenAiServableInitializer::loadPyTemplateProcessor(std::shared_ptr<GenAiServ
global json
import json
from pathlib import Path
global datetime
import datetime

global contextmanager
from contextlib import contextmanager
Expand All @@ -73,6 +75,10 @@ void GenAiServableInitializer::loadPyTemplateProcessor(std::shared_ptr<GenAiServ
def raise_exception(message):
raise jinja2.exceptions.TemplateError(message)

# Appears in some of mistral chat templates
def strftime_now(format):
return datetime.datetime.now().strftime(format)

# Following the logic from:
# https://github.com/huggingface/transformers/blob/7188e2e28c6d663284634732564143b820a03f8b/src/transformers/utils/chat_template_utils.py#L398
class AssistantTracker(Extension):
Expand Down Expand Up @@ -135,7 +141,8 @@ void GenAiServableInitializer::loadPyTemplateProcessor(std::shared_ptr<GenAiServ
template_loader = jinja2.FileSystemLoader(searchpath=templates_directory)
jinja_env = ImmutableSandboxedEnvironment(trim_blocks=True, lstrip_blocks=True, extensions=[AssistantTracker, jinja2.ext.loopcontrols], loader=template_loader)
jinja_env.policies["json.dumps_kwargs"]["ensure_ascii"] = False
jinja_env.globals["raise_exception"] = raise_exception
jinja_env.globals["raise_exception"] = raise_exception
jinja_env.globals["strftime_now"] = strftime_now
if jinja_file.is_file():
template = jinja_env.get_template("template.jinja")

Expand Down
Loading