Workaround support for qwen3 rerank (#3578)

michalkulakowski · web-flow · commit 2e65fae9b9f0 · 2025-08-21T20:21:15.000+02:00
diff --git a/demos/rerank/README.md b/demos/rerank/README.md
@@ -144,6 +144,57 @@ index 1, relevance_score 0.09138210117816925
 ```
 :::
 
+:::{dropdown} **Requesting rerank scores with a model that requires applying a template to the query and documents**
+
+tomaarsen/Qwen3-Reranker-0.6B-seq-cls is a copy of the Qwen3-Reranker-0.6B model (the original model is not supported in OVMS) converted into a sequence classification model. It requires a template to be applied to the input; here is an example client that does so:
+
+```bash
+pip3 install requests
+```
+```bash
+echo '
+import requests
+
+prefix = "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\".<|im_end|>\n<|im_start|>user\n"
+suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
+
+query_template = "{prefix}<Instruct>: {instruction}\n<Query>: {query}\n"
+document_template = "<Document>: {doc}{suffix}"
+
+instruction = (
+    "Given a web search query, retrieve relevant passages that answer the query"
+)
+
+query = "welcome"
+
+documents = [
+    "good morning",
+    "farewell",
+]
+
+query = query_template.format(prefix=prefix, instruction=instruction, query=query)
+
+documents = [
+    document_template.format(doc=doc, suffix=suffix) for doc in documents
+]
+
+response = requests.post("http://127.0.0.1:8125/v3/rerank",
+                         json={
+                             "model": "tomaarsen/Qwen3-Reranker-0.6B-seq-cls",
+                             "query": query,
+                             "documents": documents,
+                         }).json()
+
+print(response)' > rerank_client.py
+
+python rerank_client.py
+```
+It will return a response similar to:
+```
+{'results': [{'index': 0, 'relevance_score': 0.024518223479390144}, {'index': 1, 'relevance_score': 0.0026006349362432957}]}
+```
+:::
+
 ## Comparison with Hugging Faces
 
 ```bash
@@ -202,6 +253,7 @@ BAAI/bge-reranker-large
 BAAI/bge-reranker-v2-m3
 BAAI/bge-reranker-base
 cross-encoder/msmarco-MiniLM-L6-en-de-v1
+tomaarsen/Qwen3-Reranker-0.6B-seq-cls
 ```
 
 ## Integration with Langchain
diff --git a/src/mediapipe_internal/mediapipegraphdefinition.cpp b/src/mediapipe_internal/mediapipegraphdefinition.cpp
@@ -550,8 +550,8 @@ Status MediapipeGraphDefinition::initializeNodes() {
             }
             mediapipe::RerankCalculatorOVOptions nodeOptions;
             config.node(i).node_options(0).UnpackTo(&nodeOptions);
-            std::shared_ptr<SidepacketServable> servable = std::make_shared<SidepacketServable>(nodeOptions.models_path(), nodeOptions.target_device(), nodeOptions.plugin_config(), mgconfig.getBasePath());
-            rerankServableMap.insert(std::pair<std::string, std::shared_ptr<SidepacketServable>>(nodeName, std::move(servable)));
+            std::shared_ptr<RerankServable> servable = std::make_shared<RerankServable>(nodeOptions.models_path(), nodeOptions.target_device(), nodeOptions.plugin_config(), mgconfig.getBasePath());
+            rerankServableMap.insert(std::pair<std::string, std::shared_ptr<RerankServable>>(nodeName, std::move(servable)));
             rerankServablesCleaningGuard.disableCleaning();
         }
     }
diff --git a/src/mediapipe_internal/mediapipegraphdefinition.hpp b/src/mediapipe_internal/mediapipegraphdefinition.hpp
@@ -45,6 +45,7 @@
 
 #include "../sidepacket_servable.hpp"
 #include "../embeddings/embeddings_servable.hpp"
+#include "../rerank/rerank_servable.hpp"
 
 namespace ovms {
 class MediapipeGraphDefinitionUnloadGuard;
@@ -60,8 +61,8 @@ class GenAiServable;
 struct ImageGenerationPipelines;
 using PythonNodeResourcesMap = std::unordered_map<std::string, std::shared_ptr<PythonNodeResources>>;
 using GenAiServableMap = std::unordered_map<std::string, std::shared_ptr<GenAiServable>>;
+using RerankServableMap = std::unordered_map<std::string, std::shared_ptr<RerankServable>>;
 using EmbeddingsServableMap = std::unordered_map<std::string, std::shared_ptr<EmbeddingsServable>>;
-using RerankServableMap = std::unordered_map<std::string, std::shared_ptr<SidepacketServable>>;
 using ImageGenerationPipelinesMap = std::unordered_map<std::string, std::shared_ptr<ImageGenerationPipelines>>;
 
 struct GraphSidePackets {
diff --git a/src/rerank/BUILD b/src/rerank/BUILD
@@ -27,6 +27,14 @@ mediapipe_proto_library(
     ],
 )
 
+ovms_cc_library(
+    name = "rerank_servable",
+    hdrs = ["rerank_servable.hpp"],
+    deps = ["//src:sidepacket_servable",],
+    visibility = ["//visibility:public"],
+    alwayslink = 1,
+)
+
 mediapipe_proto_library(
     name = "rerank_calculator_ov_proto", # rerank_calculator_cc_proto - just mediapipe stuff with mediapipe_proto_library adding nonvisible target
     srcs = ["rerank_calculator_ov.proto"],
@@ -68,7 +76,7 @@ ovms_cc_library(
         "//src:libovmsprofiler",
         "rerank_calculator_ov_cc_proto",
         ":rerank_api_handler",
-        "//src:sidepacket_servable",
+        ":rerank_servable",
         "//src:model_metric_reporter",
         "//src:executingstreamidguard",
         "//src:libovms_execution_context",
diff --git a/src/rerank/rerank_calculator_ov.cc b/src/rerank/rerank_calculator_ov.cc
@@ -42,7 +42,7 @@
 #include "../profiler.hpp"
 #include "src/rerank/rerank_calculator_ov.pb.h"
 #include "src/rerank/rerank_utils.hpp"
-#include "../sidepacket_servable.hpp"
+#include "rerank_servable.hpp"
 #include "../model_metric_reporter.hpp"
 #include "../executingstreamidguard.hpp"
 
@@ -77,7 +77,7 @@ class RerankCalculatorOV : public CalculatorBase {
     size_t max_allowed_chunks{0};  // Read from options in ::Open()
 
 protected:
-    std::shared_ptr<ovms::SidepacketServable> rerank_session{nullptr};
+    std::shared_ptr<ovms::RerankServable> rerank_session{nullptr};
 
 public:
     static absl::Status GetContract(CalculatorContract* cc) {
@@ -127,7 +127,7 @@ class RerankCalculatorOV : public CalculatorBase {
         }
 
         // post-validation
-        if (this->max_position_embeddings <= 2 * NUMBER_OF_SPECIAL_TOKENS) {
+        if (rerank_session->addBosToken && (this->max_position_embeddings <= 2 * NUMBER_OF_SPECIAL_TOKENS)) {
             SPDLOG_LOGGER_ERROR(rerank_calculator_logger, "max_position_embeddings should be larger than 2 * NUMBER_OF_SPECIAL_TOKENS");
             return absl::InvalidArgumentError("max_position_embeddings should be larger than 2 * NUMBER_OF_SPECIAL_TOKENS");
         }
@@ -153,7 +153,25 @@ class RerankCalculatorOV : public CalculatorBase {
         // Validate batch size before tokenizing
         if (handler.getDocumentsList().size() > this->max_allowed_chunks)
             throw std::runtime_error("Number of documents exceeds max_allowed_chunks");
-
+        if (!rerank_session->addBosToken) {
+            auto batchSize = handler.getDocumentsList().size();
+            std::vector<std::string> data(batchSize);
+            for (int i = 0; i < batchSize; i++) {
+                data[i] += handler.getQuery() + handler.getDocumentsList()[i];
+            }
+            chunk_mapping.resize(batchSize);
+            std::iota(chunk_mapping.begin(), chunk_mapping.end(), 0);
+            auto tokens = rerank_session->getTokenizer().encode(data);
+            if (tokens.input_ids.get_shape().size() != 2) {
+                throw std::runtime_error("Tokens shape invalid.");  // should never happen
+            }
+            if (this->max_position_embeddings < tokens.input_ids.get_shape()[1]) {
+                std::ostringstream msg;
+                msg << "The requests length of " << tokens.input_ids.get_shape()[1] << " tokens exceeds the model context of " << max_position_embeddings;
+                throw std::runtime_error(msg.str());
+            }
+            return std::make_pair(tokens.input_ids, tokens.attention_mask);
+        }
         // Compute Query Tokens
         auto query_tokens = ComputeTokensForString(handler.getQuery());
 
@@ -289,8 +307,8 @@ class RerankCalculatorOV : public CalculatorBase {
                 typeIds = ov::Tensor{ov::element::i64, input_ids.get_shape()};
                 std::fill_n(typeIds->data<int64_t>(), input_ids.get_size(), 0);
             }
-            // Compute scores using rerank model
             size_t batch_size = handler.getDocumentsList().size();
+            // Compute scores using rerank model
             auto scores = ComputeScoresUsingRerankModel(
                 input_ids,
                 attention_mask,
diff --git a/src/rerank/rerank_servable.hpp b/src/rerank/rerank_servable.hpp
@@ -0,0 +1,55 @@
+//*****************************************************************************
+// Copyright 2025 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//*****************************************************************************
+#pragma once
+
+#include "../sidepacket_servable.hpp"
+#include "../filesystem.hpp"
+#include <rapidjson/istreamwrapper.h>
+#include <rapidjson/error/en.h>
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+namespace ovms {
+
+struct RerankServable : SidepacketServable {  // servable used by the rerank calculator; additionally records the tokenizer's add_bos_token setting
+    bool addBosToken = true;  // default; cleared only when tokenizer_config.json contains "add_bos_token": false
+    RerankServable(const std::string& modelDir, const std::string& targetDevice, const std::string& pluginConfig, const std::string& graphPath) :
+        SidepacketServable(modelDir, targetDevice, pluginConfig, graphPath) {  // all four arguments are forwarded unchanged to the base constructor
+        std::filesystem::path tokenizerConfigPath = (parsedModelsPath / "tokenizer_config.json");  // parsedModelsPath is not declared here - presumably set by the SidepacketServable base; TODO confirm
+        if (!std::filesystem::exists(tokenizerConfigPath)) {
+            return;  // no tokenizer_config.json: addBosToken keeps its default (true), nothing is logged
+        }
+        std::ifstream ifs(tokenizerConfigPath.string());
+        if (!ifs.is_open()) {
+            return;  // file exists but could not be opened: default kept, nothing is logged
+        }
+        rapidjson::Document tokenizerConfig;
+        rapidjson::IStreamWrapper isw(ifs);
+        rapidjson::ParseResult parseResult = tokenizerConfig.ParseStream(isw);
+        if (parseResult.Code()) {  // a non-zero code means the JSON failed to parse
+            SPDLOG_ERROR("Parsing tokenizer_config.json failed: {}", rapidjson::GetParseError_En(parseResult.Code()));
+            return;  // parse failure is logged but does not abort construction; default kept
+        }
+        if (tokenizerConfig.HasMember("add_bos_token") && tokenizerConfig["add_bos_token"].IsBool() && tokenizerConfig["add_bos_token"].IsFalse()) {  // only an explicit boolean false changes the flag; a missing key or non-bool value is ignored
+            SPDLOG_DEBUG("Rerank model add_bos_token set to false");
+            addBosToken = false;
+        }
+    }
+};
+
+using RerankServableMap = std::unordered_map<std::string, std::shared_ptr<RerankServable>>;
+}  // namespace ovms
diff --git a/src/sidepacket_servable.hpp b/src/sidepacket_servable.hpp
@@ -76,6 +76,4 @@ struct SidepacketServable {
         return compiledModel.inputs().size();
     }
 };
-
-using RerankServableMap = std::unordered_map<std::string, std::shared_ptr<SidepacketServable>>;
 }  // namespace ovms

Original file line number	Diff line number	Diff line change
`@@ -550,8 +550,8 @@ Status MediapipeGraphDefinition::initializeNodes() {`
`550`	`550`	`}`
`551`	`551`	`mediapipe::RerankCalculatorOVOptions nodeOptions;`
`552`	`552`	`config.node(i).node_options(0).UnpackTo(&nodeOptions);`
`553`		`- std::shared_ptr<SidepacketServable> servable = std::make_shared<SidepacketServable>(nodeOptions.models_path(), nodeOptions.target_device(), nodeOptions.plugin_config(), mgconfig.getBasePath());`
`554`		`- rerankServableMap.insert(std::pair<std::string, std::shared_ptr<SidepacketServable>>(nodeName, std::move(servable)));`
	`553`	`+ std::shared_ptr<RerankServable> servable = std::make_shared<RerankServable>(nodeOptions.models_path(), nodeOptions.target_device(), nodeOptions.plugin_config(), mgconfig.getBasePath());`
	`554`	`+ rerankServableMap.insert(std::pair<std::string, std::shared_ptr<RerankServable>>(nodeName, std::move(servable)));`
`555`	`555`	`rerankServablesCleaningGuard.disableCleaning();`
`556`	`556`	`}`
`557`	`557`	`}`
Original file line number	Diff line number	Diff line change
`@@ -76,6 +76,4 @@ struct SidepacketServable {`
`76`	`76`	`return compiledModel.inputs().size();`
`77`	`77`	`}`
`78`	`78`	`};`
`79`		`-`
`80`		`-using RerankServableMap = std::unordered_map<std::string, std::shared_ptr<SidepacketServable>>;`
`81`	`79`	`} // namespace ovms`