Commit a3786a9

Support 3 inputs models in rerank calculator (#3551)
1 parent b41197d commit a3786a9

6 files changed: 79 additions, 24 deletions

demos/embeddings/README.md
Lines changed: 1 addition & 0 deletions

@@ -76,6 +76,7 @@ All models supported by [optimum-intel](https://github.com/huggingface/optimum-i
 BAAI/bge-large-en-v1.5
 BAAI/bge-large-zh-v1.5
 thenlper/gte-small
+Qwen/Qwen3-Embedding-0.6B
 ```

 ## Server Deployment

demos/rerank/README.md
Lines changed: 1 addition & 0 deletions

@@ -201,6 +201,7 @@ Average document length: 92.248 tokens
 BAAI/bge-reranker-large
 BAAI/bge-reranker-v2-m3
 BAAI/bge-reranker-base
+cross-encoder/msmarco-MiniLM-L6-en-de-v1
 ```

 ## Integration with Langchain

src/embeddings/embeddings_calculator_ov.cc
Lines changed: 1 addition & 1 deletion

@@ -179,7 +179,7 @@ class EmbeddingsCalculatorOV : public CalculatorBase {
         try {
             for (size_t i = 0; i < received_batch_size; i++) {
                 int64_t* input_ids_start = reinterpret_cast<int64_t*>(tokens.input_ids.data()) + i * token_count_of_longest_document;
-                std::fill(input_ids_start, input_ids_start + token_count_of_longest_document, embeddings_session->getPadToken());
+                std::fill(input_ids_start, input_ids_start + token_count_of_longest_document, embeddings_session->getPadToken().value_or(0));
                 std::copy(tokenized_documents->at(i).data(), tokenized_documents->at(i).data() + tokenized_documents->at(i).size(), input_ids_start);

                 int64_t* attention_mask_start = reinterpret_cast<int64_t*>(tokens.attention_mask.data()) + i * token_count_of_longest_document;
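
The functional change is the `.value_or(0)`: `getPadToken()` now returns `std::optional<int64_t>` (see `src/sidepacket_servable.hpp` below), so the padding fill must supply an explicit default when no pad token is configured. A minimal standalone sketch of the same pattern, using a hypothetical `padRow` helper that is not part of this commit:

```cpp
#include <algorithm>
#include <cstdint>
#include <optional>
#include <vector>

// Fill a fixed-width row with the pad token (0 if none is known), then
// overwrite the prefix with the real tokens, mirroring the
// std::fill/std::copy pair in the calculator above.
std::vector<int64_t> padRow(const std::vector<int64_t>& doc_tokens,
                            size_t row_width,
                            std::optional<int64_t> pad_token) {
    std::vector<int64_t> row(row_width, pad_token.value_or(0));
    std::copy(doc_tokens.begin(), doc_tokens.end(), row.begin());
    return row;
}
```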

src/rerank/rerank_calculator_ov.cc
Lines changed: 22 additions & 8 deletions

@@ -59,6 +59,9 @@ using OutputDataType = std::string;
 class RerankCalculatorOV : public CalculatorBase {
     static const std::string INPUT_TAG_NAME;
     static const std::string OUTPUT_TAG_NAME;
+    static const std::string RERANK_MODEL_INPUT_IDS_NAME;
+    static const std::string RERANK_MODEL_ATTENTION_MASK_NAME;
+    static const std::string RERANK_MODEL_TOKEN_TYPE_IDS_NAME;
     static constexpr size_t NUMBER_OF_SPECIAL_TOKENS = 4;

     mediapipe::Timestamp timestamp{0};

@@ -106,10 +109,10 @@ class RerankCalculatorOV : public CalculatorBase {
         this->max_allowed_chunks = options.max_allowed_chunks();
         SPDLOG_LOGGER_DEBUG(rerank_calculator_logger, "Max allowed chunks: {}", this->max_allowed_chunks);

-        bos_token = rerank_session->getBosToken();
-        eos_token = rerank_session->getEosToken();
-        sep_token = rerank_session->getSepToken();
-        pad_token = rerank_session->getPadToken();
+        bos_token = rerank_session->getBosToken().value_or(0);
+        eos_token = rerank_session->getEosToken().value_or(0);
+        sep_token = rerank_session->getSepToken().value_or(0);
+        pad_token = rerank_session->getPadToken().value_or(0);

         // max_position_embeddings
         if (options.has_max_position_embeddings()) {

@@ -229,12 +232,15 @@
         return std::make_pair(input_ids, attention_mask);
     }

-    std::vector<float> ComputeScoresUsingRerankModel(ov::Tensor input_ids, ov::Tensor attention_mask, const std::vector<size_t>& chunkMapping, size_t actual_batch_size) const {
+    std::vector<float> ComputeScoresUsingRerankModel(ov::Tensor input_ids, ov::Tensor attention_mask, std::optional<ov::Tensor> typeIds, const std::vector<size_t>& chunkMapping, size_t actual_batch_size) const {
         ModelMetricReporter tmp(nullptr, nullptr, "example_pipeline_name", 1);
         auto executingStreamIdGuard = std::make_shared<ExecutingStreamIdGuard>(rerank_session->getInferRequestsQueue(), tmp);
         ov::InferRequest& inferRequest = executingStreamIdGuard->getInferRequest();
-        inferRequest.set_tensor("input_ids", input_ids);
-        inferRequest.set_tensor("attention_mask", attention_mask);
+        inferRequest.set_tensor(RERANK_MODEL_INPUT_IDS_NAME, input_ids);
+        inferRequest.set_tensor(RERANK_MODEL_ATTENTION_MASK_NAME, attention_mask);
+        if (typeIds.has_value()) {
+            inferRequest.set_tensor(RERANK_MODEL_TOKEN_TYPE_IDS_NAME, typeIds.value());
+        }
         inferRequest.start_async();
         inferRequest.wait();
         auto logits = inferRequest.get_tensor("logits");

@@ -278,12 +284,17 @@
         // Prepare inputs for rerank model
         std::vector<size_t> chunk_mapping;
         auto [input_ids, attention_mask] = PrepareInputsForRerankModel(handler, chunk_mapping);
-
+        std::optional<ov::Tensor> typeIds;
+        if (rerank_session->getNumberOfModelInputs() == 3) {
+            typeIds = ov::Tensor{ov::element::i64, input_ids.get_shape()};
+            std::fill_n(typeIds->data<int64_t>(), input_ids.get_size(), 0);
+        }
         // Compute scores using rerank model
         size_t batch_size = handler.getDocumentsList().size();
         auto scores = ComputeScoresUsingRerankModel(
             input_ids,
             attention_mask,
+            typeIds,
             chunk_mapping,
             batch_size);

@@ -309,6 +320,9 @@
 };
 const std::string RerankCalculatorOV::INPUT_TAG_NAME{"REQUEST_PAYLOAD"};
 const std::string RerankCalculatorOV::OUTPUT_TAG_NAME{"RESPONSE_PAYLOAD"};
+const std::string RerankCalculatorOV::RERANK_MODEL_INPUT_IDS_NAME{"input_ids"};
+const std::string RerankCalculatorOV::RERANK_MODEL_ATTENTION_MASK_NAME{"attention_mask"};
+const std::string RerankCalculatorOV::RERANK_MODEL_TOKEN_TYPE_IDS_NAME{"token_type_ids"};

 REGISTER_CALCULATOR(RerankCalculatorOV);
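
This is the core of the commit: when the loaded model exposes three inputs (BERT-style cross-encoders such as `cross-encoder/msmarco-MiniLM-L6-en-de-v1` expect `token_type_ids` alongside `input_ids` and `attention_mask`), an all-zero `token_type_ids` tensor shaped like `input_ids` is built and set on the request. A standalone sketch of that pattern against the plain OpenVINO API, assuming a hypothetical `runRerank` wrapper (the tensor names follow the diff):

```cpp
#include <algorithm>
#include <cstdint>
#include <optional>
#include <openvino/openvino.hpp>

// Set the two mandatory tensors, and token_type_ids only when the model
// declares a third input; single-segment rerank prompts use all-zero ids.
void runRerank(ov::InferRequest& request, size_t model_input_count,
               ov::Tensor input_ids, ov::Tensor attention_mask) {
    std::optional<ov::Tensor> type_ids;
    if (model_input_count == 3) {
        type_ids = ov::Tensor{ov::element::i64, input_ids.get_shape()};
        std::fill_n(type_ids->data<int64_t>(), input_ids.get_size(), 0);
    }
    request.set_tensor("input_ids", input_ids);
    request.set_tensor("attention_mask", attention_mask);
    if (type_ids.has_value()) {
        request.set_tensor("token_type_ids", type_ids.value());
    }
    request.start_async();
    request.wait();
}
```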

src/sidepacket_servable.cpp
Lines changed: 46 additions & 7 deletions

@@ -33,9 +33,20 @@

 namespace ovms {

-#define SET_TOKEN(token, token_id_name) \
-    if (modelConfig.HasMember(token_id_name) && modelConfig[token_id_name].IsInt64()) { \
-        token = modelConfig[token_id_name].GetInt64(); \
+#define SET_TOKEN_ID(token, token_id_name) \
+    if (modelConfig.HasMember(token_id_name) && modelConfig[token_id_name].IsInt64() && modelConfig[token_id_name].GetInt64() != 0) { \
+        token = modelConfig[token_id_name].GetInt64(); \
+    }
+
+#define SET_TOKEN(token) \
+    if (!token.has_value()) { \
+        if (tokenizerConfig.HasMember(#token) && tokenizerConfig[#token].IsString()) { \
+            auto tokenizedInputs = tokenizer->encode(tokenizerConfig[#token].GetString()); \
+            if (tokenizedInputs.input_ids.get_size() == 1 && tokenizedInputs.input_ids.get_element_type() == ov::element::i64) \
+                token = reinterpret_cast<int64_t*>(tokenizedInputs.input_ids.data())[0]; \
+            else \
+                SPDLOG_DEBUG("Parsing {} token from tokenizer_config.json failed", #token); \
+        } \
     }

 SidepacketServable::SidepacketServable(const std::string& modelDir, const std::string& targetDevice, const std::string& pluginConfig, const std::string& graphPath) {

@@ -62,9 +73,9 @@ SidepacketServable::SidepacketServable(const std::string& modelDir, const std::s
             break;
         }
     }
-    SET_TOKEN(pad_token, "pad_token_id");
-    SET_TOKEN(eos_token, "eos_token_id");
-    SET_TOKEN(bos_token, "bos_token_id");
+    SET_TOKEN_ID(pad_token, "pad_token_id");
+    SET_TOKEN_ID(eos_token, "eos_token_id");
+    SET_TOKEN_ID(bos_token, "bos_token_id");
     if (modelConfig.HasMember("sep_token_id") && modelConfig["sep_token_id"].IsInt64()) {
         sep_token = modelConfig["sep_token_id"].GetInt64();
     } else {

@@ -79,7 +90,35 @@ SidepacketServable::SidepacketServable(const std::string& modelDir, const std::s
     if (!status.ok()) {
         SPDLOG_ERROR("Error during embeddings node plugin_config option parsing to JSON: {}", pluginConfig);
     }
-    tokenizer = std::make_shared<ov::genai::Tokenizer>(parsedModelsPath);
+    ov::AnyMap tokenizerProperties = {{"add_special_tokens", false}};
+    tokenizer = std::make_shared<ov::genai::Tokenizer>(parsedModelsPath, tokenizerProperties);
+    std::filesystem::path tokenizerConfigPath = (std::filesystem::path(graphPath) / fsModelsPath / "tokenizer_config.json");
+    if (std::filesystem::exists(tokenizerConfigPath)) {
+        std::ifstream ifs(tokenizerConfigPath.string());
+        if (ifs.is_open()) {
+            rapidjson::Document tokenizerConfig;
+            rapidjson::IStreamWrapper isw(ifs);
+            rapidjson::ParseResult parseResult = tokenizerConfig.ParseStream(isw);
+            if (parseResult.Code()) {
+                SPDLOG_ERROR("Parsing tokenizer_config.json failed: {}", rapidjson::GetParseError_En(parseResult.Code()));
+            } else {
+                SET_TOKEN(pad_token);
+                SET_TOKEN(eos_token);
+                SET_TOKEN(bos_token);
+                if (!sep_token.has_value()) {
+                    if (tokenizerConfig.HasMember("sep_token") && tokenizerConfig["sep_token"].IsString()) {
+                        auto tokenizedInputs = tokenizer->encode(tokenizerConfig["sep_token"].GetString());
+                        if (tokenizedInputs.input_ids.get_size() == 1 && tokenizedInputs.input_ids.get_element_type() == ov::element::i64)
+                            sep_token = reinterpret_cast<int64_t*>(tokenizedInputs.input_ids.data())[0];
+                        else
+                            SPDLOG_DEBUG("Parsing sep token from tokenizer_config.json failed");
+                    } else if (eos_token.has_value()) {
+                        sep_token = eos_token;
+                    }
+                }
+            }
+        }
+    }

     ov::Core core;
     std::shared_ptr<ov::Model> m_model = core.read_model(parsedModelsPath / std::filesystem::path("openvino_model.xml"), {}, properties);
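
The net effect is a three-step resolution order for each special token: a nonzero `*_token_id` from `config.json` wins; otherwise the `*_token` string from `tokenizer_config.json` is encoded (with `add_special_tokens` disabled, so only the token itself comes back); finally, `sep_token` falls back to `eos_token`. A self-contained sketch of that precedence, assuming a hypothetical `encodeToSingleId` callback in place of `tokenizer->encode`:

```cpp
#include <cstdint>
#include <functional>
#include <optional>
#include <string>

using EncodeFn = std::function<std::optional<int64_t>(const std::string&)>;

// Resolve one special token id: config.json id first, then the
// tokenizer_config.json string, then an optional fallback
// (used for sep_token <- eos_token).
std::optional<int64_t> resolveToken(std::optional<int64_t> config_id,
                                    std::optional<std::string> token_string,
                                    const EncodeFn& encodeToSingleId,
                                    std::optional<int64_t> fallback = std::nullopt) {
    if (config_id.has_value() && *config_id != 0)
        return config_id;
    if (token_string.has_value())
        if (auto id = encodeToSingleId(*token_string))
            return id;
    return fallback;
}
```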

src/sidepacket_servable.hpp
Lines changed: 8 additions & 8 deletions

@@ -42,10 +42,10 @@ struct SidepacketServable {
     std::shared_ptr<ov::Model> model;
     ov::CompiledModel compiledModel;
     std::unique_ptr<OVInferRequestsQueue> inferRequestsQueue;
-    int64_t pad_token = 0;
-    int64_t eos_token = 0;
-    int64_t bos_token = 0;
-    int64_t sep_token = 0;
+    std::optional<int64_t> pad_token;
+    std::optional<int64_t> eos_token;
+    std::optional<int64_t> bos_token;
+    std::optional<int64_t> sep_token;
     std::optional<uint32_t> maxModelLength;
     std::filesystem::path parsedModelsPath;

@@ -57,16 +57,16 @@ struct SidepacketServable {
     ov::genai::Tokenizer& getTokenizer() {
         return *tokenizer;
     }
-    const int64_t getPadToken() {
+    const std::optional<int64_t> getPadToken() {
         return pad_token;
     }
-    const int64_t getEosToken() {
+    const std::optional<int64_t> getEosToken() {
         return eos_token;
     }
-    const int64_t getBosToken() {
+    const std::optional<int64_t> getBosToken() {
         return bos_token;
     }
-    const int64_t getSepToken() {
+    const std::optional<int64_t> getSepToken() {
         return sep_token;
     }
     const std::optional<uint32_t> getMaxModelLength() {
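
With the getters returning `std::optional<int64_t>`, callers must choose a policy for missing tokens: branch on presence, or take `value_or(0)` as the calculators above do. A minimal sketch of both styles (the `Servable` struct is a stand-in, not OVMS code):

```cpp
#include <cstdint>
#include <iostream>
#include <optional>

struct Servable {  // stand-in for SidepacketServable
    std::optional<int64_t> pad_token;
    std::optional<int64_t> getPadToken() const { return pad_token; }
};

int main() {
    Servable s;                                        // no pad token configured
    std::cout << s.getPadToken().value_or(0) << '\n';  // prints 0 (default)
    s.pad_token = 1;
    if (auto t = s.getPadToken())                      // branch on presence
        std::cout << *t << '\n';                       // prints 1
}
```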
