
Commit 9cc499b

XuZhang99 authored and yq33victor committed
feat: choose model backend automatically.
1 parent 47bcce5 commit 9cc499b

11 files changed (+57 −38 lines)


README.md

Lines changed: 0 additions & 1 deletion
@@ -167,7 +167,6 @@ Run the following command to start xLLM engine:
 ```bash
 ./build/xllm/core/server/xllm \ # launch xllm server
 --model=/path/to/your/llm \ # model path(to replace with your own path)
---backend=llm \ # indicate the LLM backend
 --port=9977 \ # set service port to 9977
 --max_memory_utilization 0.90 # set the maximal utilization of device memory
 ```

README_zh.md

Lines changed: 0 additions & 1 deletion
@@ -168,7 +168,6 @@ python setup.py bdist_wheel
 ```bash
 ./build/xllm/core/server/xllm \ # 启动 xllm 服务器程序
 --model=/path/to/your/llm \ # 指定模型路径(需替换为实际路径)
---backend=llm \ # 指定后端类型为 LLM
 --port=9977 \ # 设置服务端口为 9977
 --max_memory_utilization 0.90 # 设置最大内存利用率为 90
 ```

docs/en/getting_started/single_node.md

Lines changed: 0 additions & 1 deletion
@@ -4,7 +4,6 @@ Start the single-node `xllm` service directly:
 ```bash linenums="1"
 ./build/xllm/core/server/xllm \
 --model=/path/to/your/qwen2-7b \
---backend=llm \
 --port=9977 \
 --max_memory_utilization 0.90
 ```

docs/en/getting_started/start_vlm_service.md

Lines changed: 2 additions & 2 deletions
@@ -5,11 +5,11 @@ This document describes how to start a VLM model service based on the xLLM infer
 ## Single Device
 Start the service by executing the following command in the main directory of the `xllm` project:
 ```bash
-ASCEND_RT_VISIBLE_DEVICES=0 ./build/xllm/core/server/xllm --model=/path/to/Qwen2.5-VL-7B-Instruct --port=12345 --max_memory_utilization 0.90 --backend vlm
+ASCEND_RT_VISIBLE_DEVICES=0 ./build/xllm/core/server/xllm --model=/path/to/Qwen2.5-VL-7B-Instruct --port=12345 --max_memory_utilization 0.90
 ```

 ## Multiple Devices
 Start the service by executing the following command in the main directory of the `xllm` project:
 ```bash
-ASCEND_RT_VISIBLE_DEVICES=0,1 ./build/xllm/core/server/xllm --model=/path/to/Qwen2.5-VL-7B-Instruct --port=12345 --max_memory_utilization 0.90 --backend vlm
+ASCEND_RT_VISIBLE_DEVICES=0,1 ./build/xllm/core/server/xllm --model=/path/to/Qwen2.5-VL-7B-Instruct --port=12345 --max_memory_utilization 0.90
 ```

docs/zh/cli_reference.md

Lines changed: 0 additions & 1 deletion
@@ -26,7 +26,6 @@ xLLM使用gflags来管理服务启动参数,具体的参数含义如下:
 | `enable_prefix_cache` | bool | true | false | 是否开启prefix cache(DeepSeek暂不支持) | |
 | `communication_backend` | string | "hccl" | "lccl" | 通信操作采用的后端 | |
 | `block_size` | int32 | 128 | | KV Cache存储的block size大小 | |
-| `backend` | string | "llm" | "vlm" | 模型类型 | |
 | `task` | string | "generate" | "embed" | 服务类型,生成式或embedding | |
 | `max_cache_size` | int64 | 0 | | 可使用的KV Cache大小,单位byte | |

docs/zh/getting_started/single_node.md

Lines changed: 0 additions & 1 deletion
@@ -3,7 +3,6 @@
 ```bash linenums="1"
 ./build/xllm/core/server/xllm \
 --model=/path/to/your/qwen2-7b \
---backend=llm \
 --port=9977 \
 --max_memory_utilization 0.90
 ```

docs/zh/getting_started/start_vlm_service.md

Lines changed: 2 additions & 2 deletions
@@ -4,11 +4,11 @@
 ## 单卡
 启动服务,在`xllm`工程主目录中执行下面命令:
 ```bash
-ASCEND_RT_VISIBLE_DEVICES=0 ./build/xllm/core/server/xllm --model=/path/to/Qwen2.5-VL-7B-Instruct --port=12345 --max_memory_utilization 0.90 --backend vlm --devices auto
+ASCEND_RT_VISIBLE_DEVICES=0 ./build/xllm/core/server/xllm --model=/path/to/Qwen2.5-VL-7B-Instruct --port=12345 --max_memory_utilization 0.90 --devices auto
 ```

 ## 多卡
 启动服务,在`xllm`工程主目录中执行下面命令:
 ```bash
-ASCEND_RT_VISIBLE_DEVICES=0,1 ./build/xllm/core/server/xllm --model=/path/to/Qwen2.5-VL-7B-Instruct --port=12345 --max_memory_utilization 0.90 --backend vlm --devices auto
+ASCEND_RT_VISIBLE_DEVICES=0,1 ./build/xllm/core/server/xllm --model=/path/to/Qwen2.5-VL-7B-Instruct --port=12345 --max_memory_utilization 0.90 --devices auto
 ```

xllm/core/common/global_flags.cpp

Lines changed: 5 additions & 4 deletions
@@ -56,10 +56,11 @@ DEFINE_string(model_id, "", "hf model name.");
 
 DEFINE_string(model, "", "Name or path of the huggingface model to use.");
 
-DEFINE_string(backend,
-              "llm",
-              "Choose the backend model type. 'llm' for text-only, "
-              "'vlm' for multimodal (text and images).");
+DEFINE_string(
+    backend,
+    "",
+    "Choose the backend model type. 'llm' for text-only, "
+    "'vlm' for multimodal (text and images), 'dit' for diffusion models.");
 
 DEFINE_string(task,
               "generate",

xllm/models/model_registry.cpp

Lines changed: 9 additions & 0 deletions
@@ -71,6 +71,7 @@ void ModelRegistry::register_causallm_factory(const std::string& name,
                << " already registered.");
   } else {
     instance->model_registry_[name].causal_lm_factory = factory;
+    instance->model_backend_[name] = "llm";
   }
 }
 
@@ -83,6 +84,7 @@ void ModelRegistry::register_causalvlm_factory(const std::string& name,
                << " already registered.");
   } else {
     instance->model_registry_[name].causal_vlm_factory = factory;
+    instance->model_backend_[name] = "vlm";
   }
 }
 
@@ -95,6 +97,7 @@ void ModelRegistry::register_embeddinglm_factory(const std::string& name,
                << " already registered.");
   } else {
     instance->model_registry_[name].embedding_lm_factory = factory;
+    instance->model_backend_[name] = "llm";
   }
 }
 
@@ -107,6 +110,7 @@ void ModelRegistry::register_dit_model_factory(const std::string& name,
                << " already registered.");
   } else {
     instance->model_registry_[name].dit_model_factory = factory;
+    instance->model_backend_[name] = "dit";
   }
 }
 
@@ -229,6 +233,11 @@ TokenizerArgsLoader ModelRegistry::get_tokenizer_args_loader(
   return instance->model_registry_[name].tokenizer_args_loader;
 }
 
+std::string ModelRegistry::get_model_backend(const std::string& name) {
+  ModelRegistry* instance = get_instance();
+  return instance->model_backend_[name];
+}
+
 std::unique_ptr<CausalLM> create_llm_model(const ModelContext& context) {
   // get the factory function for the model type from model registry
   auto factory = ModelRegistry::get_causallm_factory(
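
A small usage sketch of the new accessor, for illustration only; the model names in the comment are hypothetical registration keys, not values taken from this diff.

```cpp
#include <iostream>
#include <string>

#include "xllm/models/model_registry.h"  // assumed include path

// Prints the backend recorded for a registered architecture name
// (e.g. a hypothetical "qwen2" -> "llm", "qwen2_5_vl" -> "vlm").
void print_backend(const std::string& model_type) {
  // Returns "llm", "vlm", or "dit" depending on which register_*_factory call
  // tagged the name; an unregistered name yields an empty string because the
  // lookup uses unordered_map::operator[].
  std::cout << model_type << " -> "
            << ModelRegistry::get_model_backend(model_type) << std::endl;
}
```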

xllm/models/model_registry.h

Lines changed: 3 additions & 0 deletions
@@ -126,8 +126,11 @@ class ModelRegistry {
   static ImageProcessorFactory get_image_processor_factory(
       const std::string& name);
 
+  static std::string get_model_backend(const std::string& name);
+
  private:
   std::unordered_map<std::string, ModelMeta> model_registry_;
+  std::unordered_map<std::string, std::string> model_backend_;
 };
 
 std::unique_ptr<CausalLM> create_llm_model(const ModelContext& context);
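
On the consumer side, a hedged sketch of how engine setup might branch on the recorded backend; only `create_llm_model` and `get_model_backend` are declared in this header, so the wrapper name and the omitted branches are assumptions.

```cpp
#include <memory>
#include <string>

#include "xllm/models/model_registry.h"  // assumed include path

// Hypothetical dispatch on the registered backend tag (sketch, not this commit's code).
std::unique_ptr<CausalLM> create_model_for(const ModelContext& context,
                                           const std::string& model_type) {
  const std::string backend = ModelRegistry::get_model_backend(model_type);
  if (backend == "llm") {
    return create_llm_model(context);  // declared in model_registry.h
  }
  // "vlm" and "dit" would route to their own creation paths; those factories
  // are not shown in this diff, so they are left out of the sketch.
  return nullptr;
}
```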
