jd-opensource
diff --git a/‎xllm/api_service/CMakeLists.txt‎
Lines changed: 2 additions & 0 deletions b/‎xllm/api_service/CMakeLists.txt‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎xllm/api_service/api_service.cpp‎
Lines changed: 9 additions & 3 deletions b/‎xllm/api_service/api_service.cpp‎
Lines changed: 9 additions & 3 deletions
diff --git a/‎xllm/api_service/api_service.h‎
Lines changed: 1 addition & 0 deletions b/‎xllm/api_service/api_service.h‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎xllm/api_service/qwen3_rerank_service_impl.cpp‎
Lines changed: 98 additions & 0 deletions b/‎xllm/api_service/qwen3_rerank_service_impl.cpp‎
Lines changed: 98 additions & 0 deletions
diff --git a/‎xllm/api_service/qwen3_rerank_service_impl.h‎
Lines changed: 35 additions & 0 deletions b/‎xllm/api_service/qwen3_rerank_service_impl.h‎
Lines changed: 35 additions & 0 deletions
diff --git a/‎xllm/api_service/rerank_service_impl.cpp‎
Lines changed: 75 additions & 84 deletions b/‎xllm/api_service/rerank_service_impl.cpp‎
Lines changed: 75 additions & 84 deletions
@@ -12,6 +12,7 @@ cc_library(
     embedding_service_impl.h
     image_generation_service_impl.h
     rerank_service_impl.h
+    qwen3_rerank_service_impl.h
     non_stream_call.h
     service_impl_factory.h
     stream_call.h
@@ -25,6 +26,7 @@ cc_library(
     image_generation_service_impl.cpp
     models_service_impl.cpp
     rerank_service_impl.cpp
+    qwen3_rerank_service_impl.cpp
   DEPS
     :master
     :chat_template
 
@@ -51,9 +51,15 @@ APIService::APIService(Master* master,
     embedding_service_impl_ =
         ServiceImplFactory<EmbeddingServiceImpl>::create_service_impl(
             llm_master, model_names);
-    rerank_service_impl_ =
-        ServiceImplFactory<RerankServiceImpl>::create_service_impl(llm_master,
-                                                                   model_names);
+    if (FLAGS_enable_qwen3_reranker) {
+      rerank_service_impl_ =
+          ServiceImplFactory<Qwen3RerankServiceImpl>::create_service_impl(
+              llm_master, model_names);
+    } else {
+      rerank_service_impl_ =
+          ServiceImplFactory<RerankServiceImpl>::create_service_impl(
+              llm_master, model_names);
+    }
   } else if (FLAGS_backend == "vlm") {
     auto vlm_master = dynamic_cast<VLMMaster*>(master);
     mm_chat_service_impl_ =
 
@@ -20,6 +20,7 @@ limitations under the License.
 #include "embedding_service_impl.h"
 #include "image_generation_service_impl.h"
 #include "models_service_impl.h"
+#include "qwen3_rerank_service_impl.h"
 #include "rerank_service_impl.h"
 #include "xllm_service.pb.h"
 
 
@@ -0,0 +1,98 @@
+/* Copyright 2025 The xLLM Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://github.com/jd-opensource/xllm/blob/main/LICENSE
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "api_service/qwen3_rerank_service_impl.h"
+
+#include "util/blocking_counter.h"
+
+namespace xllm {
+
+Qwen3RerankServiceImpl::Qwen3RerankServiceImpl(
+    LLMMaster* master,
+    const std::vector<std::string>& models)
+    : RerankServiceImpl(master, models) {}
+
+// Scores each document by the first logprob returned for query + document.
+void Qwen3RerankServiceImpl::process_async_impl(
+    std::shared_ptr<RerankCall> call) {
+  const auto& rpc_request = call->request();
+  // check if model is supported
+  const auto& model = rpc_request.model();
+  if (!models_.contains(model)) {
+    call->finish_with_error(StatusCode::UNKNOWN, "Model not supported");
+    return;
+  }
+
+  // build one prompt per document: the query immediately followed by the text
+  const auto& query = rpc_request.query();
+  std::vector<std::string> documents(rpc_request.documents().begin(),
+                                     rpc_request.documents().end());
+  std::vector<std::string> reqs;
+  reqs.reserve(documents.size());
+  for (const auto& document : documents) {
+    reqs.emplace_back(query + document);
+  }
+
+  // create RequestParams for rerank request
+  RequestParams request_params(
+      rpc_request, call->get_x_request_id(), call->get_x_request_time());
+  std::vector<RequestParams> sps(documents.size(), request_params);
+  auto request_id = request_params.request_id;
+  auto created_time = absl::ToUnixSeconds(absl::Now());
+
+  // schedule the request
+  std::vector<RequestOutput> req_outputs;
+  req_outputs.resize(documents.size());
+  BlockingCounter counter(documents.size());
+
+  auto batch_callback = [&req_outputs, &counter](size_t index,
+                                                 RequestOutput output) -> bool {
+    req_outputs[index] = std::move(output);
+    counter.decrement_count();
+    return true;
+  };
+
+  master_->handle_batch_request(reqs, sps, batch_callback);
+
+  // Wait for all tasks to complete
+  counter.wait();
+
+  // get score; requests without an output or a logprob are left unscored
+  size_t doc_size = documents.size();
+  std::vector<RerankRequestOutput> rerank_outputs;
+  rerank_outputs.reserve(doc_size);
+  for (size_t i = 0; i < doc_size; ++i) {
+    const auto& outputs = req_outputs[i].outputs;
+    if (outputs.empty() || !outputs[0].logprobs.has_value()) continue;
+    const auto& logprobs = outputs[0].logprobs.value();
+    if (logprobs.empty()) continue;
+    rerank_outputs.emplace_back(i, documents[i], logprobs[0].logprob);
+  }
+
+  // send result to client; top_n is capped at the number of scored documents
+  int32_t top_n = static_cast<int32_t>(rerank_outputs.size());
+  if (rpc_request.has_top_n()) {
+    top_n = std::min(top_n, rpc_request.top_n());
+  }
+  send_result_to_client_brpc(call,
+                             request_id,
+                             created_time,
+                             model,
+                             top_n,
+                             rerank_outputs,
+                             req_outputs);
+}
+
+}  // namespace xllm
@@ -0,0 +1,35 @@
+/* Copyright 2025 The xLLM Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://github.com/jd-opensource/xllm/blob/main/LICENSE
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#pragma once
+
+#include "api_service/rerank_service_impl.h"
+
+namespace xllm {
+using RerankCall = NonStreamCall<proto::RerankRequest, proto::RerankResponse>;
+
+// RerankServiceImpl variant that overrides how rerank requests are processed
+class Qwen3RerankServiceImpl final : public RerankServiceImpl {
+ public:
+  Qwen3RerankServiceImpl(LLMMaster* master,
+                         const std::vector<std::string>& models);
+
+  // processes one rerank call; brpc call_data needs to use shared_ptr
+  void process_async_impl(std::shared_ptr<RerankCall> call) override;
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(Qwen3RerankServiceImpl);
+};
+}  // namespace xllm
@@ -15,7 +15,6 @@ limitations under the License.
 
 #include "rerank_service_impl.h"
 
-#include <glog/logging.h>
 #include <torch/torch.h>
 
 #include <string>
@@ -28,29 +27,54 @@ limitations under the License.
 #include "util/uuid.h"
 
 namespace xllm {
-namespace {
-
-struct RerankRequestOutput {
-  int32_t index = 0;
-  std::string document = "";
-  float score = 0.0f;
-
-  RerankRequestOutput(int32_t index, std::string document, float score)
-      : index(index), document(std::move(document)), score(score) {}
-};
-
-bool send_result_to_client_brpc(std::shared_ptr<RerankCall> call,
-                                const std::string& request_id,
-                                int64_t created_time,
-                                const std::string& model,
-                                const std::vector<std::string>& documents,
-                                int32_t top_n,
-                                const std::vector<RequestOutput>& req_outputs) {
-  auto& response = call->response();
-  response.set_id(request_id);
-  response.set_model(model);
+RerankServiceImpl::RerankServiceImpl(LLMMaster* master,
+                                     const std::vector<std::string>& models)
+    : APIServiceImpl(models), master_(master) {
+  CHECK(master_ != nullptr);  // master_ is dereferenced when handling requests
+}
+
+// rerank_async for brpc
+void RerankServiceImpl::process_async_impl(std::shared_ptr<RerankCall> call) {
+  const auto& rpc_request = call->request();
+  // check if model is supported
+  const auto& model = rpc_request.model();
+  if (!models_.contains(model)) {
+    call->finish_with_error(StatusCode::UNKNOWN, "Model not supported");
+    return;
+  }
+
+  std::vector<std::string> documents;
+  if (rpc_request.documents_size() > 0) {
+    documents = std::vector<std::string>(rpc_request.documents().begin(),
+                                         rpc_request.documents().end());
+  }
+  documents.emplace_back(rpc_request.query());
+
+  // create RequestParams for rerank request
+  RequestParams request_params(
+      rpc_request, call->get_x_request_id(), call->get_x_request_time());
+  std::vector<RequestParams> sps(documents.size(), request_params);
+  auto request_id = request_params.request_id;
+  auto created_time = absl::ToUnixSeconds(absl::Now());
+
+  // schedule the request
+  std::vector<RequestOutput> req_outputs;
+  req_outputs.resize(documents.size());
+  BlockingCounter counter(documents.size());
+
+  auto batch_callback = [&req_outputs, &counter](size_t index,
+                                                 RequestOutput output) -> bool {
+    req_outputs[index] = std::move(output);
+    counter.decrement_count();
+    return true;
+  };
+
+  master_->handle_batch_request(documents, sps, batch_callback);
+
+  // Wait for all tasks to complete
+  counter.wait();
 
-  // calculate cosine similarity
+  // calculate cosine similarity to get score
   size_t doc_size = documents.size() - 1;
   std::string query = documents[doc_size];
   auto query_embed = req_outputs[doc_size].outputs[0].embeddings.value();
@@ -70,16 +94,41 @@ bool send_result_to_client_brpc(std::shared_ptr<RerankCall> call,
     }
   }
 
+  // send result to client
+  int32_t top_n = documents.size() - 1;
+  if (rpc_request.has_top_n()) {
+    top_n = std::min(top_n, rpc_request.top_n());
+  }
+  send_result_to_client_brpc(call,
+                             request_id,
+                             created_time,
+                             model,
+                             top_n,
+                             rerank_outputs,
+                             req_outputs);
+}
+
+bool RerankServiceImpl::send_result_to_client_brpc(
+    std::shared_ptr<RerankCall> call,
+    const std::string& request_id,
+    int64_t created_time,
+    const std::string& model,
+    int32_t top_n,
+    std::vector<RerankRequestOutput>& rerank_outputs,
+    const std::vector<RequestOutput>& req_outputs) {
+  auto& response = call->response();
+  response.set_id(request_id);
+  response.set_model(model);
+
   std::sort(rerank_outputs.begin(),
             rerank_outputs.end(),
             [](const RerankRequestOutput& a, const RerankRequestOutput& b) {
               return a.score > b.score;
             });
 
   // add top_n results
-  int32_t valid_top_n = std::min(top_n, static_cast<int32_t>(doc_size));
-  response.mutable_results()->Reserve(valid_top_n);
-  for (int32_t i = 0; i < valid_top_n; ++i) {
+  response.mutable_results()->Reserve(top_n);
+  for (int32_t i = 0; i < top_n; ++i) {
     auto* result = response.add_results();
     result->set_index(rerank_outputs[i].index);
     auto* document = result->mutable_document();
@@ -109,62 +158,4 @@ bool send_result_to_client_brpc(std::shared_ptr<RerankCall> call,
   return call->write_and_finish(response);
 }
 
-}  // namespace
-
-RerankServiceImpl::RerankServiceImpl(LLMMaster* master,
-                                     const std::vector<std::string>& models)
-    : APIServiceImpl(models), master_(master) {
-  CHECK(master_ != nullptr);
-}
-
-// rerank_async for brpc
-void RerankServiceImpl::process_async_impl(std::shared_ptr<RerankCall> call) {
-  const auto& rpc_request = call->request();
-  // check if model is supported
-  const auto& model = rpc_request.model();
-  if (!models_.contains(model)) {
-    call->finish_with_error(StatusCode::UNKNOWN, "Model not supported");
-    return;
-  }
-
-  std::vector<std::string> documents;
-  if (rpc_request.documents_size() > 0) {
-    documents = std::vector<std::string>(rpc_request.documents().begin(),
-                                         rpc_request.documents().end());
-  }
-  documents.emplace_back(rpc_request.query());
-
-  // create RequestParams for rerank request
-  RequestParams request_params(
-      rpc_request, call->get_x_request_id(), call->get_x_request_time());
-  std::vector<RequestParams> sps(documents.size(), request_params);
-  auto request_id = request_params.request_id;
-  auto created_time = absl::ToUnixSeconds(absl::Now());
-
-  // schedule the request
-  std::vector<RequestOutput> req_outputs;
-  req_outputs.resize(documents.size());
-  BlockingCounter counter(documents.size());
-
-  auto batch_callback = [&req_outputs, &counter](size_t index,
-                                                 RequestOutput output) -> bool {
-    req_outputs[index] = std::move(output);
-    counter.decrement_count();
-    return true;
-  };
-
-  master_->handle_batch_request(documents, sps, batch_callback);
-
-  // Wait for all tasks to complete
-  counter.wait();
-
-  int32_t top_n = documents.size() - 1;
-  if (rpc_request.has_top_n()) {
-    top_n = rpc_request.top_n();
-  }
-
-  send_result_to_client_brpc(
-      call, request_id, created_time, model, documents, top_n, req_outputs);
-}
-
 }  // namespace xllm