feat: support embedding models for offline inference. (jd-opensource#318)

yq33victor · web-flow · commit 37186371b9bb · 2025-11-05T18:20:31.000+08:00
Signed-off-by: pengtao.156 <pengtao.156@jd.com>
diff --git a/examples/generate_embedding.py b/examples/generate_embedding.py
@@ -0,0 +1,31 @@
+# python examples/generate_embedding.py --model='/path/models/Qwen3-8B' --devices='npu:0'
+# python generate_embedding.py --model='/path/models/Qwen3-8B' --devices='npu:0,npu:1'
+
+from xllm import ArgumentParser, Embedding, RequestParams
+
+# Create an EmbeddingLM.
+parser = ArgumentParser()
+emb = Embedding(**vars(parser.parse_args()))
+
+# Create a reqeust params, include sampling params
+request_params = RequestParams()
+request_params.is_embeddings = True
+request_params.max_tokens = 1
+
+inputs = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+
+outputs = emb.embedding(inputs, request_params, True)
+
+# Print the outputs.
+for i, output in enumerate(outputs):
+    input_str = output.prompt
+    generated_embedding = output.outputs[0].embeddings
+    print(f"Input: {input_str!r}, Generated embedding: {generated_embedding!r}")
+
+emb.finish()
+
diff --git a/setup.py b/setup.py
@@ -611,7 +611,8 @@ def apply_patch():
         zip_safe=False,
         py_modules=["xllm/launch_xllm", "xllm/__init__",
                     "xllm/pybind/llm", "xllm/pybind/vlm",
-                    "xllm/pybind/util", "xllm/pybind/args"],
+                    "xllm/pybind/embedding", "xllm/pybind/util",
+                    "xllm/pybind/args"],
         entry_points={
             'console_scripts': [
                 'xllm = xllm.launch_xllm:launch_xllm'
diff --git a/xllm/__init__.py b/xllm/__init__.py
@@ -13,6 +13,7 @@
 spec = importlib.util.spec_from_file_location("xllm_export", export_so_path)
 xllm_export = importlib.util.module_from_spec(spec)
 
+from xllm.pybind.embedding import Embedding
 from xllm.pybind.llm import LLM
 from xllm.pybind.vlm import VLM
 from xllm.pybind.args import ArgumentParser
@@ -21,6 +22,7 @@
 
 __all__ = [
     "ArgumentParser",
+    "Embedding",
     "LLM",
     "LLMMaster",
     "VLM",
diff --git a/xllm/core/distributed_runtime/spawn_worker_server/spawn_worker_server.cpp b/xllm/core/distributed_runtime/spawn_worker_server/spawn_worker_server.cpp
@@ -40,7 +40,8 @@ SpawnWorkerServer::SpawnWorkerServer(const std::string& master_node_addr,
                                      int num_decoding_tokens,
                                      int block_size,
                                      bool enable_shm,
-                                     bool is_local) {
+                                     bool is_local,
+                                     const std::string& task_type) {
   // TODO: pass whole xllm::runtime::Options here from main process.
   xllm::runtime::Options runner_options;
   runner_options.block_size(block_size)
@@ -49,7 +50,8 @@ SpawnWorkerServer::SpawnWorkerServer(const std::string& master_node_addr,
       .enable_offline_inference(true)
       .master_node_addr(master_node_addr)
       .enable_shm(enable_shm)
-      .is_local(is_local);
+      .is_local(is_local)
+      .task_type(task_type);
   FLAGS_enable_schedule_overlap = false;
   FLAGS_master_node_addr = master_node_addr;
   FLAGS_block_size = block_size;
diff --git a/xllm/core/distributed_runtime/spawn_worker_server/spawn_worker_server.h b/xllm/core/distributed_runtime/spawn_worker_server/spawn_worker_server.h
@@ -29,7 +29,8 @@ class SpawnWorkerServer final {
                              int num_decoding_tokens,
                              int block_size,
                              bool enable_shm,
-                             bool is_local);
+                             bool is_local,
+                             const std::string& task_type);
 
   ~SpawnWorkerServer() = default;
 
diff --git a/xllm/core/distributed_runtime/spawn_worker_server/spawn_worker_server_process.cpp b/xllm/core/distributed_runtime/spawn_worker_server/spawn_worker_server_process.cpp
@@ -30,10 +30,11 @@ limitations under the License.
 // @block_size
 // @enable_shm
 // @is_local
+// @task_type
 int main(int argc, char* argv[]) {
-  if (argc < 9) {
+  if (argc < 10) {
     LOG(ERROR)
-        << "Spwan worker process receive wrong args. Need 9 args, receive "
+        << "Spwan worker process receive wrong args. Need 10 args, receive "
         << argc;
     return 1;
   }
@@ -54,16 +55,18 @@ int main(int argc, char* argv[]) {
   int block_size = atoi(argv[7]);
   int enable_shm = atoi(argv[8]);
   int is_local = atoi(argv[9]);
+  std::string task_type = std::string(argv[10]);
 
   LOG(INFO) << "Spwan worker: "
             << "master_node_addr = " << master_node_addr
-            << ", is_local = " << is_local << ", local_rank = " << local_rank
+            << ", local_rank = " << local_rank
             << ", world_size = " << world_size
             << ", device_idx = " << device_idx
             << ", num_decoding_tokens = " << num_decoding_tokens
             << ", block_size = " << block_size
             << ", enable_shm = " << (enable_shm > 0)
-            << ", enable_shm = " << (is_local > 0) << "\n";
+            << ", is_local = " << (is_local > 0)
+            << ", task_type = " << task_type << "\n";
 
   xllm::SpawnWorkerServer worker(master_node_addr,
                                  local_rank,
@@ -73,7 +76,8 @@ int main(int argc, char* argv[]) {
                                  num_decoding_tokens,
                                  block_size,
                                  enable_shm > 0,
-                                 is_local > 0);
+                                 is_local > 0,
+                                 task_type);
 
   worker.run();
 
diff --git a/xllm/core/distributed_runtime/worker_server.cpp b/xllm/core/distributed_runtime/worker_server.cpp
@@ -156,6 +156,7 @@ void WorkerServer::create_spawn_server(int local_rank,
                         block_size_ptr,
                         enable_shm_ptr,
                         is_local_ptr,
+                        options.task_type().c_str(),
                         nullptr};
   pid_t pid;
   posix_spawn_file_actions_init(&file_actions_);
diff --git a/xllm/pybind/CMakeLists.txt b/xllm/pybind/CMakeLists.txt
@@ -20,6 +20,9 @@ pybind_extension(
     gflags::gflags
     glog::glog
     Python::Module
+    torch_python
+    torch
+    c10
 )
 target_link_libraries(common PRIVATE leveldb::leveldb ZLIB::ZLIB OpenSSL::SSL OpenSSL::Crypto protobuf::libprotobuf)
 add_dependencies(common brpc-static)
diff --git a/xllm/pybind/bind.cpp b/xllm/pybind/bind.cpp
@@ -17,6 +17,7 @@ limitations under the License.
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 #include <pybind11/stl_bind.h>
+#include <torch/python.h>
 
 #include "api_service/call.h"
 #include "core/common/options.h"
diff --git a/xllm/pybind/embedding.py b/xllm/pybind/embedding.py
@@ -0,0 +1,128 @@
+import os
+import signal
+import time
+from . import util
+from typing import List, Optional, Union
+
+from xllm_export import (LLMMaster, Options, RequestOutput,
+                         RequestParams)
+
+class Embedding:
+    def __init__(
+        self,
+        model: str,
+        devices: str = 'auto',
+        block_size: int = 128,
+        max_cache_size: int = 0,
+        max_memory_utilization: float = 0.9,
+        disable_prefix_cache: bool = False,
+        max_tokens_per_batch: int = 20000,
+        max_seqs_per_batch: int = 256,
+        max_tokens_per_chunk_for_prefill: int = 512,
+        num_request_handling_threads: int = 4,
+        communication_backend: str = 'lccl',
+        rank_tablefile: str = '',
+        expert_parallel_degree: int = 0,
+        enable_mla: bool = False,
+        disable_chunked_prefill: bool = False,
+        instance_role: str = 'DEFAULT',
+        nnodes: int = 1,
+        node_rank: int = 0,
+        dp_size: int = 1,
+        ep_size: int = 1,
+        enable_shm: bool = False,
+        is_local: bool = True,
+        **kwargs,
+    ) -> None:
+        if not os.path.exists(model):
+            raise ValueError(f"model {model} not exists")
+
+        options = Options()
+        options.model_path = model
+        options.task_type = "embed"
+        options.devices = devices
+        options.draft_model_path = None
+        options.draft_devices = None
+        options.block_size = block_size
+        options.max_cache_size = max_cache_size
+        options.max_memory_utilization = max_memory_utilization
+        if disable_prefix_cache:
+            options.enable_prefix_cache = False
+        else:
+            options.enable_prefix_cache = True
+        options.max_tokens_per_batch = max_tokens_per_batch
+        options.max_seqs_per_batch = max_seqs_per_batch
+        options.max_tokens_per_chunk_for_prefill = max_tokens_per_chunk_for_prefill
+        options.num_request_handling_threads = num_request_handling_threads
+        options.communication_backend = communication_backend
+        options.rank_tablefile = rank_tablefile
+        options.expert_parallel_degree = expert_parallel_degree
+        options.enable_mla = enable_mla
+        if disable_chunked_prefill:
+            options.enable_chunked_prefill = False
+        else:
+            options.enable_chunked_prefill = True
+        free_port = util.get_free_port()
+        options.master_node_addr = "127.0.0.1:" + str(free_port)
+        options.nnodes = nnodes
+        options.node_rank = node_rank
+        options.dp_size = dp_size
+        options.ep_size = ep_size
+        options.enable_disagg_pd = False
+        options.enable_schedule_overlap = False
+        options.enable_offline_inference = True
+        options.spawn_worker_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
+        options.enable_shm = enable_shm
+        options.is_local = is_local
+        self.master = LLMMaster(options)
+
+    def finish(self):
+        try:
+            #os.kill(os.getpid(), signal.SIGTERM)
+            os.kill(os.getpid(), signal.SIGKILL)
+        except Exception as e:
+            pass
+
+    def embedding(
+        self,
+        inputs: Union[str, List[str]],
+        request_params: Optional[Union[RequestParams, List[RequestParams]]] = None,
+        wait_schedule_done: bool = True,
+    ) -> List[RequestOutput]:
+        if request_params is None:
+            request_params = RequestParams()
+        if isinstance(inputs, str):
+            inputs = [inputs]
+        if isinstance(request_params, RequestParams):
+            request_params.is_embeddings = True
+            request_params = [request_params]
+        else:
+            for i in range(len(request_params)):
+                request_params[i].is_embeddings = True
+
+        outputs = [None] * len(inputs)
+        def callback(index: int, output: RequestOutput) -> bool:
+            outputs[index] = output
+            return True
+
+        # schedule all requests
+        self.master.handle_batch_request(
+            inputs, request_params, callback
+        )
+
+        # TODO: add wait later
+        if wait_schedule_done:
+            pass
+
+        # generate
+        self.master.generate()
+
+        # wait async output
+        for i in range(len(outputs)):
+            while outputs[i] is None:
+                time.sleep(0.01)
+            if outputs[i].status is not None and not outputs[i].status.ok:
+                raise ValidationError(outputs[i].status.code, outputs[i].status.message)
+            outputs[i].prompt = inputs[i]
+
+        return outputs