Skip to content

Commit b41197d

Browse files
michalkulakowski, dkalinowski, dtrawins
authored
qwen3 embeddings support (#3529)
* Add support for last token pooling mode Co-authored-by: Trawinski, Dariusz <[email protected]> --------- Co-authored-by: Damian Kalinowski <[email protected]> Co-authored-by: Trawinski, Dariusz <[email protected]>
1 parent 6dffee0 commit b41197d

21 files changed

+196
-49
lines changed

demos/common/export_models/export_model.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ def add_common_arguments(parser):
6565
parser_embeddings_ov = subparsers.add_parser('embeddings_ov', help='export model for embeddings endpoint with directory structure aligned with OpenVINO tools')
6666
add_common_arguments(parser_embeddings_ov)
6767
parser_embeddings_ov.add_argument('--skip_normalize', default=True, action='store_false', help='Skip normalize the embeddings.', dest='normalize')
68+
parser_embeddings_ov.add_argument('--pooling', default="CLS", choices=["CLS", "LAST"], help='Embeddings pooling mode', dest='pooling')
6869
parser_embeddings_ov.add_argument('--truncate', default=False, action='store_true', help='Truncate the prompts to fit to the embeddings model', dest='truncate')
6970
parser_embeddings_ov.add_argument('--num_streams', default=1,type=int, help='The number of parallel execution streams to use for the model. Use at least 2 on 2 socket CPU systems.', dest='num_streams')
7071

@@ -139,6 +140,8 @@ def add_common_arguments(parser):
139140
[type.googleapis.com / mediapipe.EmbeddingsCalculatorOVOptions]: {
140141
models_path: "{{model_path}}",
141142
normalize_embeddings: {% if not normalize %}false{% else %}true{% endif%},
143+
{%- if pooling %}
144+
pooling: {{pooling}},{% endif %}
142145
target_device: "{{target_device|default("CPU", true)}}"
143146
}
144147
}

demos/embeddings/README.md

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -57,15 +57,16 @@ models
5757
└── config.json
5858
5959
```
60-
> **Note** The actual models support version management and can be automatically swapped to newer version when new model is uploaded in newer version folder.
61-
> In case you trained the pytorch model it can be exported like below:
62-
> `python export_model.py embeddings_ov --source_model <pytorch model> --model_name Alibaba-NLP/gte-large-en-v1.5 --precision int8 --config_file_path models/config.json --version 2`
6360

64-
The default configuration of the `EmbeddingsCalculator` should work in most cases but the parameters can be tuned inside the `node_options` section in the `graph.pbtxt` file. Runtime configuration for both models can be tuned in `subconfig.json` file. They can be set automatically via export parameters in the `export_model.py` script.
61+
The default configuration of the `EmbeddingsCalculatorOV` should work in most cases but the parameters can be tuned inside the `node_options` section in the `graph.pbtxt` file. They can be set automatically via export parameters in the `export_model.py` script.
6562

6663
For example:
67-
`python export_model.py embeddings_ov --source_model Alibaba-NLP/gte-large-en-v1.5 --precision int8 --num_streams 2 --skip_normalize --config_file_path models/config.json`
64+
`python export_model.py embeddings_ov --source_model Alibaba-NLP/gte-large-en-v1.5 --weight-format int8 --skip_normalize --config_file_path models/config.json`
6865

66+
> **Note:** By default OVMS returns first token embeddings as sequence embeddings (called CLS pooling). It can be changed using `--pooling` option if needed by the model. Supported values are CLS and LAST. For example:
67+
```console
68+
python export_model.py embeddings_ov --source_model Qwen/Qwen3-Embedding-0.6B --weight-format fp16 --pooling LAST --config_file_path models/config.json
69+
```
6970

7071
## Tested models
7172
All models supported by [optimum-intel](https://github.com/huggingface/optimum-intel) should be compatible. In serving validation are included Hugging Face models:
@@ -240,7 +241,7 @@ The script [compare_results.py](./compare_results.py) can assist with such exper
240241
```bash
241242
popd
242243
cd model_server/demos/embeddings
243-
python compare_results.py --model Alibaba-NLP/gte-large-en-v1.5 --service_url http://localhost:8000/v3/embeddings --input "hello world" --input "goodbye world"
244+
python compare_results.py --model Alibaba-NLP/gte-large-en-v1.5 --service_url http://localhost:8000/v3/embeddings --pooling CLS --input "hello world" --input "goodbye world"
244245

245246
input ['hello world', 'goodbye world']
246247
HF Duration: 50.626 ms NewModel

demos/embeddings/compare_results.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@
2828
dest='model_name')
2929
parser.add_argument('--input', default=[], help='List of strings to query. default: []',
3030
dest='input', action='append')
31+
parser.add_argument('--pooling', default="CLS", choices=["CLS", "LAST"], help='Embeddings pooling mode', dest='pooling')
32+
3133
args = vars(parser.parse_args())
3234

3335
model_id = args['model_name']
@@ -43,8 +45,14 @@ def run_model():
4345
start_time = datetime.datetime.now()
4446
input = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
4547
model_output = model_pt(**input)
46-
embeddings = model_output.last_hidden_state[:, 0]
47-
embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
48+
if args['pooling'] == "LAST":
49+
sequence_lengths = input['attention_mask'].sum(dim=1) - 1
50+
batch_size = model_output.last_hidden_state.shape[0]
51+
embeddings = model_output.last_hidden_state[torch.arange(batch_size, device=model_output.last_hidden_state.device), sequence_lengths]
52+
embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
53+
else:
54+
embeddings = model_output.last_hidden_state[:, 0]
55+
embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
4856
end_time = datetime.datetime.now()
4957
duration = (end_time - start_time).total_seconds() * 1000
5058
print("HF Duration:", duration, "ms", type(model_pt).__name__)

src/capi_frontend/server_settings.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ struct EmbeddingsGraphSettingsImpl {
114114
std::string modelName = "";
115115
uint32_t numStreams = 1;
116116
std::string normalize = "true";
117-
std::string meanPooling = "false";
117+
std::string pooling = "CLS";
118118
};
119119

120120
struct RerankGraphSettingsImpl {

src/embeddings/BUILD

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,14 @@ ovms_cc_library(
2929
alwayslink = 1,
3030
)
3131

32+
ovms_cc_library(
33+
name = "embeddings_servable",
34+
hdrs = ["embeddings_servable.hpp"],
35+
deps = ["//src:sidepacket_servable",],
36+
visibility = ["//visibility:public"],
37+
alwayslink = 1,
38+
)
39+
3240
mediapipe_proto_library(
3341
name = "embeddings_calculator_proto", # embeddings_calculator_cc_proto - just mediapipe stuff with mediapipe_proto_library adding nonvisible target
3442
srcs = ["embeddings_calculator.proto"],
@@ -81,11 +89,12 @@ ovms_cc_library(
8189
"//src:libovmslogging",
8290
"//src:libovmsprofiler",
8391
"embeddings_calculator_ov_cc_proto",
84-
":embeddings_api",
92+
":embeddings_servable",
8593
"//src:sidepacket_servable",
8694
"//src:model_metric_reporter",
8795
"//src:executingstreamidguard",
8896
"//src:libovms_execution_context",
97+
":embeddings_api",
8998
],
9099
visibility = ["//visibility:public"],
91100
alwayslink = 1,

src/embeddings/embeddings_api.cpp

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,7 @@ void EmbeddingsHandler::setPromptTokensUsage(int promptTokens) {
162162

163163
#pragma warning(push)
164164
#pragma warning(disable : 4267)
165-
absl::Status EmbeddingsHandler::parseResponse(StringBuffer& buffer, const ov::Tensor& embeddingsTensor, const bool normalizeEmbeddings) {
165+
absl::Status EmbeddingsHandler::parseResponse(StringBuffer& buffer, const ov::Tensor& embeddingsTensor, const bool normalizeEmbeddings, const PoolingMode poolingMode, const std::optional<ov::Tensor>& attentionMask) {
166166
Writer<StringBuffer> writer(buffer);
167167
writer.StartObject();
168168

@@ -171,15 +171,42 @@ absl::Status EmbeddingsHandler::parseResponse(StringBuffer& buffer, const ov::Te
171171

172172
writer.String("data");
173173
writer.StartArray();
174-
// TODO: mean pooling
175174

176175
ov::Shape outputShape = embeddingsTensor.get_shape();
177176
if (outputShape.size() != 3) {
178177
return absl::InvalidArgumentError("Invalid embeddings tensor shape");
179178
}
180179
size_t batchSize = outputShape[0];
181180
for (size_t batchIterator = 0; batchIterator < batchSize; batchIterator++) {
182-
size_t stride = batchIterator * outputShape[1] * outputShape[2];
181+
size_t stride;
182+
if (poolingMode == PoolingMode::LAST) {
183+
size_t attendedTokens = 0;
184+
if (!attentionMask.has_value()) {
185+
return absl::InvalidArgumentError("Last token pooling mode requires attention mask");
186+
}
187+
auto maxNumberOfTokens = attentionMask->get_shape()[1];
188+
if (attentionMask->get_element_type() == ov::element::Type_t::i64) {
189+
for (int i = 0; i < maxNumberOfTokens; i++) {
190+
attendedTokens += reinterpret_cast<int64_t*>(attentionMask->data())[i + batchIterator * maxNumberOfTokens];
191+
}
192+
} else if (attentionMask->get_element_type() == ov::element::Type_t::i32) {
193+
for (int i = 0; i < maxNumberOfTokens; i++) {
194+
attendedTokens += reinterpret_cast<int32_t*>(attentionMask->data())[i + batchIterator * maxNumberOfTokens];
195+
}
196+
} else if (attentionMask->get_element_type() == ov::element::Type_t::i8) {
197+
for (int i = 0; i < maxNumberOfTokens; i++) {
198+
attendedTokens += reinterpret_cast<uint8_t*>(attentionMask->data())[i + batchIterator * maxNumberOfTokens];
199+
}
200+
} else {
201+
return absl::InternalError("Attention mask element type invalid.");
202+
}
203+
if (!(attendedTokens <= outputShape[1])) {
204+
return absl::InternalError("Embeddings output and attention mask shape mismatch");
205+
}
206+
stride = batchIterator * outputShape[1] * outputShape[2] + (attendedTokens - 1) * outputShape[2];
207+
} else {
208+
stride = batchIterator * outputShape[1] * outputShape[2];
209+
}
183210
size_t size = outputShape[2];
184211
float* dataPtr = reinterpret_cast<float*>(embeddingsTensor.data()) + stride;
185212
float* dataPtrEnd = dataPtr + size;

src/embeddings/embeddings_api.hpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,11 @@
3636

3737
namespace ovms {
3838

39+
enum class PoolingMode {
40+
CLS,
41+
LAST
42+
};
43+
3944
struct EmbeddingsRequest {
4045
enum class EncodingFormat {
4146
FLOAT,
@@ -60,7 +65,7 @@ class EmbeddingsHandler {
6065
EmbeddingsRequest::EncodingFormat getEncodingFormat() const;
6166

6267
absl::Status parseRequest();
63-
absl::Status parseResponse(rapidjson::StringBuffer& buffer, const ov::Tensor& embeddingsTensor, const bool normalizeEmbeddings);
68+
absl::Status parseResponse(rapidjson::StringBuffer& buffer, const ov::Tensor& embeddingsTensor, const bool normalizeEmbeddings, const PoolingMode poolingMode = PoolingMode::CLS, const std::optional<ov::Tensor>& attentionMask = std::nullopt);
6469
void setPromptTokensUsage(int promptTokens);
6570
};
6671
} // namespace ovms

src/embeddings/embeddings_calculator.proto

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,5 +26,4 @@ message EmbeddingsCalculatorOptions {
2626
optional EmbeddingsCalculatorOptions ext = 1134737;
2727
}
2828
optional bool normalize_embeddings = 1 [default = true];
29-
optional bool mean_pooling = 2 [default = false];
3029
}

src/embeddings/embeddings_calculator_ov.cc

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@
4040
#include "../model_metric_reporter.hpp"
4141
#include "embeddings_api.hpp"
4242
#include "src/embeddings/embeddings_calculator_ov.pb.h"
43-
#include "../sidepacket_servable.hpp"
43+
#include "embeddings_servable.hpp"
4444

4545
using namespace rapidjson;
4646
using namespace ovms;
@@ -63,7 +63,7 @@ class EmbeddingsCalculatorOV : public CalculatorBase {
6363
mediapipe::Timestamp timestamp{0};
6464

6565
protected:
66-
std::shared_ptr<ovms::SidepacketServable> embeddings_session{nullptr};
66+
std::shared_ptr<ovms::EmbeddingsServable> embeddings_session{nullptr};
6767

6868
public:
6969
static absl::Status GetContract(CalculatorContract* cc) {
@@ -148,10 +148,12 @@ class EmbeddingsCalculatorOV : public CalculatorBase {
148148
for (int i = 0; i < tokens.attention_mask.get_size(); i++) {
149149
attendedTokens += reinterpret_cast<int32_t*>(tokens.attention_mask.data())[i];
150150
}
151-
} else {
151+
} else if (tokens.attention_mask.get_element_type() == ov::element::Type_t::i8) {
152152
for (int i = 0; i < tokens.attention_mask.get_byte_size(); i++) {
153153
attendedTokens += reinterpret_cast<uint8_t*>(tokens.attention_mask.data())[i];
154154
}
155+
} else {
156+
return absl::InternalError("Attention mask element type invalid.");
155157
}
156158
handler.setPromptTokensUsage(attendedTokens);
157159
} else if (auto tokenized_documents = std::get_if<std::vector<std::vector<int64_t>>>(&input)) {
@@ -241,7 +243,13 @@ class EmbeddingsCalculatorOV : public CalculatorBase {
241243

242244
auto parseResponseStartTime = std::chrono::high_resolution_clock::now();
243245
StringBuffer buffer;
244-
status = handler.parseResponse(buffer, embeddingsTensor, cc->Options<EmbeddingsCalculatorOVOptions>().normalize_embeddings());
246+
PoolingMode mode;
247+
if (cc->Options<EmbeddingsCalculatorOVOptions>().pooling() == mediapipe::EmbeddingsCalculatorOVOptions::LAST) {
248+
mode = PoolingMode::LAST;
249+
} else {
250+
mode = PoolingMode::CLS;
251+
}
252+
status = handler.parseResponse(buffer, embeddingsTensor, cc->Options<EmbeddingsCalculatorOVOptions>().normalize_embeddings(), mode, tokens.attention_mask);
245253
if (!status.ok()) {
246254
return status;
247255
}

src/embeddings/embeddings_calculator_ov.proto

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,11 @@ message EmbeddingsCalculatorOVOptions {
2727
}
2828
required string models_path = 1;
2929
optional bool normalize_embeddings = 2 [default = true];
30-
optional bool mean_pooling = 3 [default = false];
31-
optional string target_device = 4 [default = "CPU"];
32-
optional string plugin_config = 5 [default = ""];
30+
optional string target_device = 3 [default = "CPU"];
31+
optional string plugin_config = 4 [default = ""];
32+
enum Pooling {
33+
CLS = 0;
34+
LAST = 1;
35+
}
36+
optional Pooling pooling = 5 [default = CLS];
3337
}

0 commit comments

Comments
 (0)