docs/en/features/async_schedule.md (5 additions & 4 deletions)
@@ -13,16 +13,17 @@ In the overall architecture, stages 1 and 3 on the CPU side are handled by diffe
## Usage
- xLLM provides the gflags parameter enable_schedule_overlap, which defaults to false. To enable this feature, simply set it to true in xLLM's service startup script, as follows:
+ xLLM provides the gflags parameter enable_schedule_overlap, which defaults to true. To disable this feature, simply set it to false in xLLM's service startup script, as follows:
```shell
- --enable_schedule_overlap=true
+ --enable_schedule_overlap=false
```
## Performance
- - With asynchronous scheduling enabled, the device idle time between two steps is approximately 200μs, comparable to a single kernel launch duration.
+ - With asynchronous scheduling enabled, the device idle time between two steps is approximately 200us, comparable to a single kernel launch duration.
- On the DeepSeek-R1-Distill-Qwen-1.5B model with TPOT constrained to 50ms, this achieves a 17% throughput improvement.
## Notice
- The asynchronous scheduling feature requires the server to compute one additional step. For use cases involving limited output tokens (e.g., few-token generation) or single-output scenarios like embedding models, enabling this feature is not recommended as it may reduce server-side throughput.
+ The asynchronous scheduling feature requires the server to compute one additional step. For use cases involving limited output tokens (e.g., few-token generation) or single-output scenarios like embedding models, enabling this feature is not recommended as it may reduce server-side throughput; it is therefore hard-disabled internally for these cases.
+ VLM models are still being adapted, so this feature is temporarily disabled for them.
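
For readers applying this change, a minimal sketch of where the flag sits in a full startup command may help. Only `--enable_schedule_overlap` is taken from the document above; the binary name, model path, and port flag are illustrative assumptions, not from this PR.

```shell
# Hypothetical xLLM launch command: only --enable_schedule_overlap is
# documented above; the binary name and the other flags are assumptions.
./xllm \
  --model=/path/to/DeepSeek-R1-Distill-Qwen-1.5B \
  --port=8080 \
  --enable_schedule_overlap=false
```

Since this change flips the default from false to true, the explicit `=false` is only needed to opt out; per the Notice above, embedding-style workloads are hard-disabled internally regardless of the flag.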