Commit 0b6544e

feat: add vlm python interface to support offline inference. (jd-opensource#236)
Signed-off-by: pengtao.156 <[email protected]>
1 parent a9e34b1 commit 0b6544e

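In brief: this commit adds an offline-inference Python interface for vision-language models. It exposes a VLM class plus MMChatMessage/MMInputData bindings in the xllm package, adds the matching C++ structs and MMInputHelper/handler overloads, ships example scripts for both LLM and VLM usage, and reworks LLMMaster's pending-request accounting so a request counts as pending until its output is delivered rather than until it is scheduled.
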
File tree

14 files changed: +412 −30 lines changed

examples/__init__.py

Whitespace-only changes.

examples/generate.py

Lines changed: 33 additions & 0 deletions

# python examples/generate.py --model='/path/models/Qwen2-7B-Instruct' --devices='npu:0'

from xllm import ArgumentParser, LLM, RequestParams

# Create an LLM.
parser = ArgumentParser()
llm = LLM(**vars(parser.parse_args()))

# Create request params, including sampling params.
request_params = RequestParams()
request_params.temperature = 0.8
request_params.top_p = 0.95
request_params.max_tokens = 10

# Generate texts from the prompts. The output is a list of RequestOutput
# objects that contain the prompt, generated text, and other information.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]

outputs = llm.generate(prompts, request_params, True)

# Print the outputs.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

llm.finish()

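The flow above folds naturally into a small helper. The sketch below is hypothetical and uses only the calls shown in this example (it assumes the same imports); the trailing boolean passed to generate() is copied verbatim from the example, since its semantics are not documented in this diff.

# Hypothetical wrapper around the API shown above; run_prompts and its
# defaults are illustrative, not part of xllm.
def run_prompts(llm, prompts, temperature=0.8, top_p=0.95, max_tokens=10):
    params = RequestParams()
    params.temperature = temperature
    params.top_p = top_p
    params.max_tokens = max_tokens
    # The final boolean is passed through as in the example; its exact
    # meaning is not defined anywhere in this diff.
    outputs = llm.generate(prompts, params, True)
    return [output.outputs[0].text for output in outputs]
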
examples/generate_vlm.py

Lines changed: 35 additions & 0 deletions

# python examples/generate_vlm.py --model='/path/models/Qwen2.5-VL-7B' --devices='npu:0' --master_node_addr=127.0.0.1:8888

import os
import signal

from xllm import ArgumentParser, VLM, RequestParams, MMChatMessage, MMInputData

# Create a VLM.
parser = ArgumentParser()
vlm = VLM(**vars(parser.parse_args()))

# Create request params, including sampling params.
request_params = RequestParams()
request_params.temperature = 0.8
request_params.top_p = 0.95
request_params.max_tokens = 100

# Build the multimodal input: one text part and one image part.
mm_input_data1 = MMInputData()
mm_input_data1.type = 'text'
mm_input_data1.text = 'Please briefly introduce this picture.'
mm_input_data2 = MMInputData()
mm_input_data2.type = 'image_url'
mm_input_data2.image_url = 'https://img2.baidu.com/it/u=2376489989,3127732063&fm=253&fmt=auto&app=138&f=JPEG?w=500&h=657'
mm_chat_msg = MMChatMessage()
mm_chat_msg.role = 'user'
mm_chat_msg.content = [mm_input_data1, mm_input_data2]

output = vlm.generate(mm_chat_msg, request_params, True)

prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

vlm.finish()

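The MMInputData struct added in types.h below also carries video_url and audio_url fields. Assuming the Python binding exposes them the same way it exposes image_url (only the C++ side appears in this diff, and no video or audio handler is shown), a video request would be assembled the same way:

# Hypothetical: mirrors the image request above using the video_url field
# declared on the C++ MMInputData struct. Whether a handler is registered
# for the 'video_url' type is not shown in this diff.
text_part = MMInputData()
text_part.type = 'text'
text_part.text = 'Describe what happens in this clip.'

video_part = MMInputData()
video_part.type = 'video_url'
video_part.video_url = 'https://example.com/clip.mp4'

msg = MMChatMessage()
msg.role = 'user'
msg.content = [text_part, video_part]
output = vlm.generate(msg, request_params, True)
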
setup.py

Lines changed: 1 addition & 1 deletion

@@ -593,7 +593,7 @@ def apply_patch():
       },
       zip_safe=False,
       py_modules=["xllm/launch_xllm", "xllm/__init__",
-                  "xllm/pybind/llm", "xllm/pybind/args"],
+                  "xllm/pybind/llm", "xllm/pybind/vlm", "xllm/pybind/args"],
       entry_points={
           'console_scripts': [
               'xllm = xllm.launch_xllm:launch_xllm'

xllm/__init__.py

Lines changed: 4 additions & 2 deletions

@@ -14,19 +14,21 @@
     xllm_export = importlib.util.module_from_spec(spec)
 
 from xllm.pybind.llm import LLM
+from xllm.pybind.vlm import VLM
 from xllm.pybind.args import ArgumentParser
 from xllm_export import (LLMMaster, Options, RequestParams, RequestOutput,
-                         SequenceOutput, Status, StatusCode)
+                         SequenceOutput, Status, StatusCode, MMChatMessage, MMInputData)
 
 __all__ = [
     "ArgumentParser",
     "LLM",
     "LLMMaster",
+    "VLM",
+    "VLMMaster",
     "Options",
     "RequestParams",
     "RequestOutput",
     "SequenceOutput",
     "Status",
     "StatusCode",
 ]

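With these exports in place, the VLM example above needs only: from xllm import ArgumentParser, VLM, RequestParams, MMChatMessage, MMInputData. Note that VLMMaster is listed in __all__ without a matching import in this hunk, so it presumably comes from another module.
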
xllm/core/common/types.h

Lines changed: 13 additions & 0 deletions

@@ -274,4 +274,17 @@ struct EplbInfo {
   int32_t update_layer_id = -1;
 };
 
+struct MMInputData {
+  std::string type = "";
+  std::string text = "";
+  std::string image_url = "";
+  std::string video_url = "";
+  std::string audio_url = "";
+};
+
+struct MMChatMessage {
+  std::string role = "";
+  std::vector<MMInputData> content;
+};
+
 }  // namespace xllm

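These structs mirror the OpenAI-style chat payload: a message is a role plus a list of typed content parts, and for each part only the field matching its type string ('text', 'image_url', and so on) is meaningful. They are plain-C++ counterparts of the existing proto::MMInputData messages, which appears to let the pybind layer hand Python objects straight to MMInputHelper without a protobuf round trip.
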
xllm/core/framework/request/mm_input_helper.cpp

Lines changed: 96 additions & 0 deletions

@@ -147,7 +147,22 @@ class Handler {
     return true;
   }
 
+  bool process(const MMInputData& msg, MMInputItem& input) {
+    if (!this->load(msg, input)) {
+      LOG(ERROR) << " load mm data failed";
+      return false;
+    }
+
+    if (!this->decode(input)) {
+      LOG(ERROR) << " decode mm data failed";
+      return false;
+    }
+
+    return true;
+  }
+
   virtual bool load(const proto::MMInputData& msg, MMInputItem& input) = 0;
+  virtual bool load(const MMInputData& msg, MMInputItem& input) = 0;
   virtual bool decode(MMInputItem& input) = 0;
 
  protected:
@@ -197,6 +212,23 @@ class ImageHandler : public Handler {
     }
   }
 
+  virtual bool load(const MMInputData& msg, MMInputItem& input) {
+    input.clear();
+
+    const auto& url = msg.image_url;
+    if (url.compare(0, dataurl_prefix_.size(), dataurl_prefix_) == 0) {
+      // data url
+      input.type_ = MMType::IMAGE;
+      return this->load_from_dataurl(url, input.raw_data_);
+    } else if (url.compare(0, httpurl_prefix_.size(), httpurl_prefix_) == 0) {
+      // http url
+      input.type_ = MMType::IMAGE;
+      return this->load_from_http(url, input.raw_data_);
+    }
+    return false;  // neither a data URL nor an http(s) URL
+  }
+
   virtual bool decode(MMInputItem& input) {
     OpenCVImageDecoder decoder;
     return decoder.decode(input.raw_data_, input.decode_data_);
@@ -223,6 +255,18 @@ class MMHandlerSet {
     return handler->process(msg, input);
   }
 
+  bool process(const std::string& type,
+               const MMInputData& msg,
+               MMInputItem& input) {
+    auto itor = handlers_.find(type);
+    if (itor == handlers_.end()) {
+      return false;
+    }
+
+    auto& handler = itor->second;
+    return handler->process(msg, input);
+  }
+
  private:
   std::unordered_map<std::string, std::unique_ptr<Handler>> handlers_;
 };
@@ -259,6 +303,32 @@ bool MMInputHelper::trans(const MMChatMessageVec& vec,
   return true;
 }
 
+bool MMInputHelper::trans(const std::vector<MMChatMessage>& raw_input_data,
+                          std::vector<Message>& messages,
+                          MMInputItemVec& inputs) {
+  messages.clear();
+  inputs.clear();
+  messages.reserve(raw_input_data.size());
+  inputs.reserve(raw_input_data.size());
+
+  for (size_t idx = 0; idx < raw_input_data.size(); ++idx) {
+    const auto& chat = raw_input_data[idx];
+    const auto& role = chat.role;
+    const auto& content = chat.content;
+
+    Message::MMContentVec mmc;
+    MMInputItemVec ins;
+    if (!this->trans(content, mmc, ins)) {
+      return false;
+    }
+
+    messages.emplace_back(role, mmc);
+    inputs.insert(inputs.end(), ins.begin(), ins.end());
+  }
+
+  return true;
+}
+
 bool MMInputHelper::trans(const MMInputDataVec& vec,
                           Message::MMContentVec& mmc,
                           MMInputItemVec& inputs) {
@@ -285,4 +355,30 @@ bool MMInputHelper::trans(const MMInputDataVec& vec,
   return true;
 }
 
+bool MMInputHelper::trans(const std::vector<MMInputData>& vec,
+                          Message::MMContentVec& mmc,
+                          MMInputItemVec& inputs) {
+  mmc.clear();
+  inputs.clear();
+
+  for (size_t idx = 0; idx < vec.size(); ++idx) {
+    const auto& item = vec[idx];
+    const auto& type = item.type;
+
+    if (type == "text") {
+      mmc.emplace_back(type, item.text);
+    } else {
+      MMInputItem input;
+      if (!mm_handlers_->process(type, item, input)) {
+        return false;
+      }
+
+      mmc.emplace_back(type);
+      inputs.emplace_back(input);
+    }
+  }
+
+  return true;
+}
+
 }  // namespace xllm

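The new overloads retrace the existing proto-based path: trans() walks each MMChatMessage, copies 'text' parts directly into the message content, and routes every other part to the handler registered for its type string; ImageHandler then loads raw bytes from either a data: URL or an http(s) URL and decodes them with OpenCV. A toy Python rendering of that dispatch shape (the handler body is a stand-in, not the real loader):

# Illustrative sketch of the MMHandlerSet load-then-decode dispatch,
# restated in Python. handle_image stands in for ImageHandler.
from typing import Callable, Dict, Optional

def handle_image(url: str) -> Optional[str]:
    # Stand-in for ImageHandler::load (data/http URL) plus OpenCV decode.
    if url.startswith('data:') or url.startswith('http'):
        return 'decoded image bytes from ' + url
    return None  # mirrors the unrecognized-scheme failure path

HANDLERS: Dict[str, Callable[[str], Optional[str]]] = {
    'image_url': handle_image,
}

def process(part_type: str, url: str) -> Optional[str]:
    handler = HANDLERS.get(part_type)
    return handler(url) if handler else None  # unknown type fails, as in C++
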
xllm/core/framework/request/mm_input_helper.h

Lines changed: 9 additions & 0 deletions

@@ -18,6 +18,7 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "core/common/types.h"
 #include "core/framework/chat_template/jinja_chat_template.h"
 #include "mm_data.h"
 #include "multimodal.pb.h"
@@ -69,11 +70,19 @@ class MMInputHelper {
              std::vector<Message>& messages,
              MMInputItemVec& inputs);
 
+  bool trans(const std::vector<MMChatMessage>& raw_input_data,
+             std::vector<Message>& messages,
+             MMInputItemVec& inputs);
+
  private:
   bool trans(const MMInputDataVec& vec,
              Message::MMContentVec& mmc,
              MMInputItemVec& input);
 
+  bool trans(const std::vector<MMInputData>& vec,
+             Message::MMContentVec& mmc,
+             MMInputItemVec& input);
+
   std::unique_ptr<MMHandlerSet> mm_handlers_;
 };

xllm/core/runtime/llm_master.cpp

Lines changed: 6 additions & 9 deletions

@@ -131,7 +131,6 @@ void LLMMaster::handle_batch_request(std::vector<std::string> prompts,
       << "Number of prompts and sampling parameters should be the same";
 
   const size_t num_requests = prompts.size();
-  scheduler_->incr_pending_requests(num_requests);
   for (size_t i = 0; i < num_requests; ++i) {
     handle_request(std::move(prompts[i]),
                    std::nullopt,
@@ -153,7 +152,6 @@ void LLMMaster::handle_batch_request(
       << "Number of conversations and sampling parameters should be the same";
 
   const size_t num_requests = conversations.size();
-  scheduler_->incr_pending_requests(num_requests);
   for (size_t i = 0; i < num_requests; ++i) {
     handle_request(std::move(conversations[i]),
                    std::nullopt,
@@ -173,8 +171,10 @@ void LLMMaster::handle_request(std::string prompt,
                               std::optional<Call*> call,
                               OutputCallback callback) {
   scheduler_->incr_pending_requests(1);
-  auto cb = [callback = std::move(callback)](const RequestOutput& output) {
+  auto cb = [callback = std::move(callback),
+             scheduler = scheduler_.get()](const RequestOutput& output) {
     output.log_request_status();
+    scheduler->decr_pending_requests();
     return callback(output);
   };
   // add into the queue
@@ -186,9 +186,6 @@ void LLMMaster::handle_request(std::string prompt,
                  call]() mutable {
     AUTO_COUNTER(request_handling_latency_seconds_completion);
 
-    // remove the pending request after scheduling
-    SCOPE_GUARD([this] { scheduler_->decr_pending_requests(); });
-
     Timer timer;
     // verify the prompt
     if (!sp.verify_params(callback)) {
@@ -214,8 +211,10 @@ void LLMMaster::handle_request(std::vector<Message> messages,
                               std::optional<Call*> call,
                               OutputCallback callback) {
   scheduler_->incr_pending_requests(1);
-  auto cb = [callback = std::move(callback)](const RequestOutput& output) {
+  auto cb = [callback = std::move(callback),
+             scheduler = scheduler_.get()](const RequestOutput& output) {
     output.log_request_status();
+    scheduler->decr_pending_requests();
     return callback(output);
   };
   // add into the queue
@@ -226,8 +225,6 @@ void LLMMaster::handle_request(std::vector<Message> messages,
                  callback = std::move(cb),
                  call]() mutable {
     AUTO_COUNTER(request_handling_latency_seconds_chat);
-    // remove the pending request after scheduling
-    SCOPE_GUARD([this] { scheduler_->decr_pending_requests(); });
 
     // verify the prompt
     if (!sp.verify_params(callback)) {

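The accounting change here is behavioral, not cosmetic. Previously, handle_batch_request pre-incremented the pending counter for the whole batch and a SCOPE_GUARD decremented it as soon as each request had been scheduled. Now handle_request increments once per request and the decrement happens inside the output callback, so a request stays counted as pending until its output is actually delivered. The callback captures the raw Scheduler pointer instead of this, keeping the lambda independent of the LLMMaster instance.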