Skip to content

Commit 7e25430

Browse files
authored
feat: update vlm offline interface. (jd-opensource#356)
1 parent a995889 commit 7e25430

File tree

5 files changed

+194
-120
lines changed

5 files changed

+194
-120
lines changed

examples/generate_vlm.py

Lines changed: 50 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,36 +1,62 @@
1-
# python examples/generate_vlm.py --model='/path/models/Qwen2.5-VL-7B' --devices='npu:0'
2-
# python generate_vlm.py --model='/path/models/Qwen2.5-VL-7B' --devices='npu:0,npu:1'
1+
# python generate_vlm.py --model /path/to/Qwen2.5-VL-7B-Instruct/ --disable_prefix_cache --disable_chunked_prefill --max_seqs_per_batch 4
32

43
import os
54
import signal
6-
from xllm import ArgumentParser, VLM, RequestParams, MMChatMessage, MMInputData
5+
6+
from xllm import ArgumentParser, VLM, RequestParams
7+
from xllm_export import MMType, MMData
8+
9+
from PIL import Image
10+
from transformers import AutoImageProcessor
711

812
# Create an VLM.
913
parser = ArgumentParser()
10-
vlm = VLM(**vars(parser.parse_args()))
14+
args = parser.parse_args()
15+
16+
vlm = VLM(**vars(args))
17+
processor = AutoImageProcessor.from_pretrained(args.model, trust_remote_code=True)
18+
19+
questions = ["简单介绍下图片"]
20+
prompts = [
21+
(
22+
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
23+
"<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
24+
f"{question}<|im_end|>\n"
25+
"<|im_start|>assistant\n"
26+
)
27+
for question in questions
28+
]
29+
30+
paths = ["00307664d4ce393b.png"]
31+
images = []
32+
for path in paths:
33+
images.append(Image.open(path).convert("RGB"))
34+
35+
multi_modal_datas = []
36+
for idx in range(len(images)):
37+
print(f"Processing image: {paths[idx]}")
38+
image = images[idx]
39+
40+
data = processor.preprocess([image], return_tensors="pt").data
41+
mm_data = {
42+
"pixel_values": data['pixel_values'],
43+
"image_grid_thw": data['image_grid_thw'],
44+
}
45+
multi_modal_datas.append(MMData(MMType.IMAGE, mm_data))
46+
1147

1248
# Create a request params object, including sampling params
1349
request_params = RequestParams()
14-
request_params.temperature = 0.8
15-
request_params.top_p = 0.95
16-
request_params.max_tokens = 100
17-
18-
# input_data
19-
mm_input_data1 = MMInputData()
20-
mm_input_data1.type = 'text'
21-
mm_input_data1.text = 'Please briefly introduce this picture.'
22-
mm_input_data2 = MMInputData()
23-
mm_input_data2.type = 'image_url'
24-
mm_input_data2.image_url = 'https://img2.baidu.com/it/u=2376489989,3127732063&fm=253&fmt=auto&app=138&f=JPEG?w=500&h=657'
25-
mm_chat_msg = MMChatMessage()
26-
mm_chat_msg.role = 'user'
27-
mm_chat_msg.content = [mm_input_data1, mm_input_data2]
28-
29-
output = vlm.generate(mm_chat_msg, request_params, True)
30-
31-
prompt = output.prompt
32-
generated_text = output.outputs[0].text
33-
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
50+
request_params.temperature = 0
51+
request_params.max_tokens = 1024
52+
53+
outputs = vlm.generate(prompts, multi_modal_datas, request_params, True)
54+
55+
for output in outputs:
56+
prompt = output.prompt
57+
generated_text = output.outputs[0].text
58+
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
3459

3560
vlm.finish()
3661

62+

xllm/core/runtime/vlm_master.cpp

Lines changed: 68 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -124,74 +124,6 @@ VLMMaster::~VLMMaster() {
124124
}
125125
}
126126

127-
void VLMMaster::handle_request(const std::vector<Message>& messages,
128-
const MMInput& mm_inputs,
129-
RequestParams sp,
130-
OutputCallback callback) {
131-
MMData mm_data;
132-
if (!mm_inputs.empty() && !image_processor_->process(mm_inputs, mm_data)) {
133-
LOG(ERROR) << " image processor process failed";
134-
}
135-
136-
this->handle_request(messages, mm_data, sp, callback);
137-
}
138-
139-
void VLMMaster::handle_batch_request(const std::vector<std::string>& prompts,
140-
const std::vector<MMData>& mm_datas,
141-
std::vector<RequestParams> sps,
142-
BatchOutputCallback callback) {
143-
CHECK(prompts.size() == sps.size() || sps.size() == 1)
144-
<< "Number of prompts and sampling parameters should be the same";
145-
146-
const size_t num_requests = prompts.size();
147-
for (size_t i = 0; i < num_requests; ++i) {
148-
handle_request(std::move(prompts[i]),
149-
std::move(mm_datas[i]),
150-
// the sampling parameter may be shared
151-
sps.size() == 1 ? sps[0] : std::move(sps[i]),
152-
[i, callback](const RequestOutput& output) {
153-
output.log_request_status();
154-
return callback(i, output);
155-
});
156-
}
157-
}
158-
159-
void VLMMaster::handle_batch_request(
160-
const std::vector<std::vector<Message>>& conversations,
161-
const std::vector<MMData>& mm_datas,
162-
std::vector<RequestParams> sps,
163-
BatchOutputCallback callback) {
164-
CHECK(conversations.size() == sps.size() || sps.size() == 1)
165-
<< "Number of conversations and sampling parameters should be the same";
166-
167-
const size_t num_requests = conversations.size();
168-
for (size_t i = 0; i < num_requests; ++i) {
169-
handle_request(std::move(conversations[i]),
170-
std::move(mm_datas[i]),
171-
// the sampling parameter may be shared
172-
sps.size() == 1 ? sps[0] : std::move(sps[i]),
173-
[i, callback](const RequestOutput& output) {
174-
output.log_request_status();
175-
return callback(i, output);
176-
});
177-
}
178-
}
179-
180-
void VLMMaster::handle_request(const std::vector<MMChatMessage>& raw_input_data,
181-
RequestParams sp,
182-
OutputCallback callback) {
183-
static MMInputHelper helper;
184-
std::vector<Message> messages;
185-
MMInput mm_inputs;
186-
187-
if (!helper.trans(raw_input_data, messages, mm_inputs.items_)) {
188-
LOG(ERROR) << "MMInputHelper trans failed, ignore this input.";
189-
return;
190-
}
191-
192-
handle_request(std::move(messages), std::move(mm_inputs), sp, callback);
193-
}
194-
195127
void VLMMaster::handle_request(const std::string& prompt,
196128
const MMData& mm_data,
197129
RequestParams sp,
@@ -232,6 +164,18 @@ void VLMMaster::handle_request(const std::string& prompt,
232164
});
233165
}
234166

167+
void VLMMaster::handle_request(const std::vector<Message>& messages,
168+
const MMInput& mm_inputs,
169+
RequestParams sp,
170+
OutputCallback callback) {
171+
MMData mm_data;
172+
if (!mm_inputs.empty() && !image_processor_->process(mm_inputs, mm_data)) {
173+
LOG(ERROR) << " image processor process failed";
174+
}
175+
176+
this->handle_request(messages, mm_data, sp, callback);
177+
}
178+
235179
void VLMMaster::handle_request(const std::vector<Message>& messages,
236180
const MMData& mm_data,
237181
RequestParams sp,
@@ -270,6 +214,62 @@ void VLMMaster::handle_request(const std::vector<Message>& messages,
270214
});
271215
}
272216

217+
void VLMMaster::handle_request(const std::vector<MMChatMessage>& raw_input_data,
218+
RequestParams sp,
219+
OutputCallback callback) {
220+
static MMInputHelper helper;
221+
std::vector<Message> messages;
222+
MMInput mm_inputs;
223+
224+
if (!helper.trans(raw_input_data, messages, mm_inputs.items_)) {
225+
LOG(ERROR) << "MMInputHelper trans failed, ignore this input.";
226+
return;
227+
}
228+
229+
handle_request(std::move(messages), std::move(mm_inputs), sp, callback);
230+
}
231+
232+
void VLMMaster::handle_batch_request(const std::vector<std::string>& prompts,
233+
const std::vector<MMData>& mm_datas,
234+
const std::vector<RequestParams>& sps,
235+
BatchOutputCallback callback) {
236+
CHECK(prompts.size() == sps.size() || sps.size() == 1)
237+
<< "Number of prompts and sampling parameters should be the same";
238+
239+
const size_t num_requests = prompts.size();
240+
for (size_t i = 0; i < num_requests; ++i) {
241+
handle_request(std::move(prompts[i]),
242+
std::move(mm_datas[i]),
243+
// the sampling parameter may be shared
244+
sps.size() == 1 ? sps[0] : std::move(sps[i]),
245+
[i, callback](const RequestOutput& output) {
246+
output.log_request_status();
247+
return callback(i, output);
248+
});
249+
}
250+
}
251+
252+
void VLMMaster::handle_batch_request(
253+
const std::vector<std::vector<Message>>& conversations,
254+
const std::vector<MMData>& mm_datas,
255+
const std::vector<RequestParams>& sps,
256+
BatchOutputCallback callback) {
257+
CHECK(conversations.size() == sps.size() || sps.size() == 1)
258+
<< "Number of conversations and sampling parameters should be the same";
259+
260+
const size_t num_requests = conversations.size();
261+
for (size_t i = 0; i < num_requests; ++i) {
262+
handle_request(std::move(conversations[i]),
263+
std::move(mm_datas[i]),
264+
// the sampling parameter may be shared
265+
sps.size() == 1 ? sps[0] : std::move(sps[i]),
266+
[i, callback](const RequestOutput& output) {
267+
output.log_request_status();
268+
return callback(i, output);
269+
});
270+
}
271+
}
272+
273273
void VLMMaster::run() {
274274
const bool already_running = running_.load(std::memory_order_relaxed);
275275
if (already_running) {

xllm/core/runtime/vlm_master.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ class VLMMaster : public Master {
4646
explicit VLMMaster(const Options& options);
4747
~VLMMaster();
4848

49+
// completion
4950
void handle_request(const std::string& prompt,
5051
const MMData& mm_data,
5152
RequestParams sp,
@@ -71,14 +72,14 @@ class VLMMaster : public Master {
7172
// batch completion
7273
void handle_batch_request(const std::vector<std::string>& prompts,
7374
const std::vector<MMData>& mm_datas,
74-
std::vector<RequestParams> sps,
75+
const std::vector<RequestParams>& sps,
7576
BatchOutputCallback callback);
7677

7778
// batch chat
7879
void handle_batch_request(
7980
const std::vector<std::vector<Message>>& conversations,
8081
const std::vector<MMData>& mm_datas,
81-
std::vector<RequestParams> sps,
82+
const std::vector<RequestParams>& sps,
8283
BatchOutputCallback callback);
8384

8485
// start the handling loop

xllm/pybind/bind.cpp

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ limitations under the License.
2222
#include "api_service/call.h"
2323
#include "core/common/options.h"
2424
#include "core/common/types.h"
25+
#include "core/framework/request/mm_data.h"
2526
#include "core/framework/request/request_output.h"
2627
#include "core/framework/request/request_params.h"
2728
#include "core/runtime/llm_master.h"
@@ -232,6 +233,43 @@ PYBIND11_MODULE(xllm_export, m) {
232233
.def_readwrite("role", &MMChatMessage::role)
233234
.def_readwrite("content", &MMChatMessage::content);
234235

236+
// 10. export MMType
237+
py::enum_<MMType::Value>(m, "MMType")
238+
.value("NONE", MMType::Value::NONE)
239+
.value("IMAGE", MMType::Value::IMAGE)
240+
.value("VIDEO", MMType::Value::VIDEO)
241+
.value("AUDIO", MMType::Value::AUDIO)
242+
.value("EMBEDDING", MMType::EMBEDDING)
243+
.export_values();
244+
245+
// 11. export MMData
246+
py::class_<MMData>(m, "MMData")
247+
.def(py::init<int, const MMDict&>(), py::arg("ty"), py::arg("data"))
248+
.def("get",
249+
[](const MMData& self, const MMKey& key) -> py::object {
250+
auto value = self.get<torch::Tensor>(key);
251+
if (value.has_value()) {
252+
return py::cast(value.value());
253+
}
254+
return py::none();
255+
})
256+
.def("get_list",
257+
[](const MMData& self, const MMKey& key) -> py::object {
258+
auto value = self.get<std::vector<torch::Tensor>>(key);
259+
if (value.has_value()) {
260+
return py::cast(value.value());
261+
}
262+
return py::none();
263+
})
264+
.def_readwrite("ty", &MMData::ty_)
265+
.def_readwrite("data", &MMData::data_)
266+
.def("__repr__", [](const MMData& self) {
267+
std::stringstream ss;
268+
ss << "MMData(" << static_cast<int>(self.ty_) << ": "
269+
<< self.data_.size() << " items)";
270+
return ss.str();
271+
});
272+
235273
// 10. export VLMMaster
236274
py::class_<VLMMaster>(m, "VLMMaster")
237275
.def(py::init<const Options&>(),
@@ -242,6 +280,13 @@ PYBIND11_MODULE(xllm_export, m) {
242280
RequestParams,
243281
OutputCallback>(&VLMMaster::handle_request),
244282
py::call_guard<py::gil_scoped_release>())
283+
.def("handle_batch_request",
284+
py::overload_cast<const std::vector<std::string>&,
285+
const std::vector<MMData>&,
286+
const std::vector<RequestParams>&,
287+
BatchOutputCallback>(
288+
&VLMMaster::handle_batch_request),
289+
py::call_guard<py::gil_scoped_release>())
245290
.def("generate",
246291
&VLMMaster::generate,
247292
py::call_guard<py::gil_scoped_release>())

0 commit comments

Comments
 (0)