Skip to content

Commit fefea34

Browse files
guojinrong-nnyq33victor
authored and committed
refactor: change default value of enable_schedule_overlap flag from false to true.
1 parent 0594936 commit fefea34

File tree

8 files changed

+24
-13
lines changed

8 files changed

+24
-13
lines changed

docs/en/features/async_schedule.md

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,16 +13,17 @@ In the overall architecture, stages 1 and 3 on the CPU side are handled by diffe
1313

1414
## Usage
1515

16-
xLLM provides the gflags parameter enable_schedule_overlap, which defaults to false. To enable this feature, simply set it to true in xLLMs service startup script, as
16+
xLLM provides the gflags parameter enable_schedule_overlap, which defaults to true. To disable this feature, simply set it to false in xLLM's service startup script, as follows:
1717
```shell
18-
--enable_schedule_overlap=true
18+
--enable_schedule_overlap=false
1919
```
2020

2121
## Performance
2222

23-
- With asynchronous scheduling enabled, the device idle time between two steps is approximately 200μs - comparable to a single kernel launch duration.
23+
- With asynchronous scheduling enabled, the device idle time between two steps is approximately 200us - comparable to a single kernel launch duration.
2424
- On the DeepSeek-R1-Distill-Qwen-1.5B model with TPOT constrained to 50ms, this achieves 17% throughput improvement.
2525

2626

2727
## Notice
28-
The asynchronous scheduling feature requires the server to compute one additional step. For use cases involving limited output tokens (e.g., few-token generation) or single-output scenarios like embedding models, enabling this feature is not recommended as it may reduce server-side throughput.
28+
The asynchronous scheduling feature requires the server to compute one additional step. For use cases involving limited output tokens (e.g., few-token generation) or single-output scenarios like embedding models, enabling this feature is not recommended as it may reduce server-side throughput; for embedding tasks, asynchronous scheduling is therefore force-disabled internally.
29+
VLM models are still being adapted, so asynchronous scheduling is temporarily force-disabled for them.

docs/zh/features/async_schedule.md

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,9 @@ xLLM在框架层支持了异步调度功能,在device执行 step-i 计算的
1717

1818
## 使用方式
1919

20-
xLLM中提供了gflags参数`enable_schedule_overlap`默认false,如需开启在xLLM的服务启动脚本中设置为true即可,示例如下:
20+
xLLM中提供了gflags参数`enable_schedule_overlap`,默认true,如需关闭,在xLLM的服务启动脚本中设置为false即可,示例如下:
2121
```shell
22-
--enable_schedule_overlap=true
22+
--enable_schedule_overlap=false
2323
```
2424

2525

@@ -29,4 +29,5 @@ xLLM中提供了gflags参数`enable_schedule_overlap`,默认false,如需开
2929

3030

3131
!!! warning "注意"
32-
- 异步调度功能会在服务端额外计算一个step,当使用场景中输出token数量较少,或是类似embedding模型只一次性输出的场景,不建议开启,会影响服务端吞吐。
32+
- 异步调度功能会在服务端额外计算一个step,当使用场景中输出token数量较少,或是类似embedding模型只一次性输出的场景,会影响服务端吞吐,所以强制关闭异步调度。
33+
- VLM模型正在适配中,暂时会强制关闭异步调度。

xllm/core/common/global_flags.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ DEFINE_int32(dp_size, 1, "Data parallel size for MLA attention.");
112112
DEFINE_int32(ep_size, 1, "Expert parallel size for MoE model.");
113113

114114
DEFINE_bool(enable_schedule_overlap,
115-
false,
115+
true,
116116
"Whether to enable schedule overlap.");
117117

118118
DEFINE_double(prefill_scheduling_memory_usage_threshold,

xllm/core/common/options.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ class Options {
109109

110110
PROPERTY(bool, enable_disagg_pd) = false;
111111

112-
PROPERTY(bool, enable_schedule_overlap) = false;
112+
PROPERTY(bool, enable_schedule_overlap) = true;
113113

114114
PROPERTY(InstanceRole, instance_role) = InstanceRole::DEFAULT;
115115

xllm/core/runtime/master.cpp

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,9 @@ Master::Master(const Options& options, EngineType type) : options_(options) {
9898
}
9999

100100
if (type == EngineType::VLM) {
101+
options_.enable_schedule_overlap(false);
102+
LOG(WARNING) << "Force to disable schedule overlap for VLM model, not "
103+
"supported yet.";
101104
runtime::Options eng_options;
102105
eng_options.model_path(options_.model_path())
103106
.devices(devices)
@@ -109,7 +112,8 @@ Master::Master(const Options& options, EngineType type) : options_(options) {
109112
.enable_chunked_prefill(options_.enable_chunked_prefill())
110113
.enable_disagg_pd(options_.enable_disagg_pd())
111114
.enable_service_routing(options_.enable_service_routing())
112-
.enable_cache_upload(options_.enable_cache_upload());
115+
.enable_cache_upload(options_.enable_cache_upload())
116+
.enable_schedule_overlap(options_.enable_schedule_overlap());
113117

114118
auto engine = std::make_unique<VLMEngine>(eng_options);
115119
engine_ = std::move(engine);
@@ -157,6 +161,11 @@ Master::Master(const Options& options, EngineType type) : options_(options) {
157161
auto spec_engine = std::make_unique<SpeculativeEngine>(spec_options);
158162
engine_ = std::move(spec_engine);
159163
} else if (type == EngineType::LLM) {
164+
if (options_.task_type() == "embed") {
165+
options_.enable_schedule_overlap(false);
166+
LOG(WARNING) << "Force to disable schedule overlap for embedding model, "
167+
"avoiding performance degradation.";
168+
}
160169
runtime::Options eng_options;
161170
eng_options.model_path(options_.model_path())
162171
.devices(devices)

xllm/core/runtime/options.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ struct Options {
8686
PROPERTY(int32_t, ep_size) = 1;
8787

8888
// enable enable_schedule_overlap to improve runtime execution efficiency.
89-
PROPERTY(bool, enable_schedule_overlap) = false;
89+
PROPERTY(bool, enable_schedule_overlap) = true;
9090

9191
// enable chunked prefill.
9292
PROPERTY(bool, enable_chunked_prefill) = true;

xllm/core/scheduler/continuous_scheduler.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ class ContinuousScheduler : public Scheduler {
8383
// default value is 1.
8484
PROPERTY(int32_t, max_reqs_p2d_once) = 1;
8585

86-
PROPERTY(bool, enable_schedule_overlap) = false;
86+
PROPERTY(bool, enable_schedule_overlap) = true;
8787

8888
PROPERTY(bool, enable_chunked_prefill) = true;
8989

xllm/models/model_registry.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ void ModelRegistry::register_model_args_loader(const std::string& name,
9393
ModelRegistry* instance = get_instance();
9494

9595
if (instance->model_registry_[name].model_args_loader != nullptr) {
96-
LOG(WARNING) << "model args loader for " << name << "already registered.";
96+
LOG(WARNING) << "model args loader for " << name << " already registered.";
9797
} else {
9898
instance->model_registry_[name].model_args_loader = loader;
9999
}

0 commit comments

Comments
 (0)