Skip to content

Commit f044309

Browse files
committed
feat: rename and delete some flags.
Signed-off-by: Tao Peng <[email protected]>
1 parent b3af2da commit f044309

32 files changed

+127
-136
lines changed

docs/zh/cli_reference.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ xLLM使用gflags来管理服务启动参数,具体的参数含义如下:
7070
|:---------:|:---------:|:---------:|:---------:|:---------:|:---------:|
7171
| `max_concurrent_requests` | int32 | 0 | 任意大于0的整数 | 限流用,限制实例中正在处理的总请求数 | |
7272
| `model_id` | string | "" | ip:port | 模型名称,非路径 | |
73-
| `num_handling_threads` | int32 | 4 | 任意大于0的整数 | 处理输入请求的线程池大小 | |
73+
| `num_request_handling_threads` | int32 | 4 | 任意大于0的整数 | 处理输入请求的线程池大小 | |
7474
| `num_response_handling_threads` | int32 | 4 | 任意大于0的整数 | 处理输出的线程池大小 | |
7575
| `prefill_scheduling_memory_usage_threshold` | double | 0.95 | 0-1之间的值 | 当kv cache使用量达到该阈值时,暂停prefill请求的调度 | |
7676
| `num_response_handling_threads` | int32 | 4 | 任意大于0的整数 | 处理输出的线程池大小 | |

xllm/core/common/global_flags.cpp

Lines changed: 28 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -27,16 +27,21 @@ DEFINE_string(host, "", "Host name for brpc server.");
2727

2828
DEFINE_int32(port, 8010, "Port for brpc server.");
2929

30-
DEFINE_int32(idle_timeout_s,
30+
DEFINE_int32(
31+
rpc_idle_timeout_s,
32+
-1,
33+
"Connection will be closed if there is no read/write operations "
34+
"during the last `rpc_idle_timeout_s`. -1 means wait indefinitely.");
35+
36+
DEFINE_int32(rpc_channel_timeout_ms,
3137
-1,
32-
"Connection will be closed if there is no read/write operations "
33-
"during the last `idle_timeout_s`. -1 means wait indefinitely.");
38+
"Max duration of bRPC Channel. -1 means wait indefinitely.");
3439

35-
DEFINE_int32(num_threads, 32, "Number of threads to process requests.");
40+
DEFINE_int32(max_reconnect_count,
41+
40,
42+
"The max count for worker try to connect to server.");
3643

37-
DEFINE_int32(max_concurrency,
38-
0,
39-
"Limit number of requests processed in parallel.");
44+
DEFINE_int32(num_threads, 32, "Number of threads to process requests.");
4045

4146
DEFINE_int32(
4247
max_concurrent_requests,
@@ -74,11 +79,13 @@ DEFINE_bool(enable_mla,
7479
false,
7580
"Whether to enable multi-head latent attention.");
7681

82+
// --- graph mode execution config ---
83+
7784
DEFINE_bool(enable_acl_graph,
7885
false,
7986
"Whether to enable ACL graph execution for decode phase.");
8087

81-
DEFINE_int32(max_tokens_per_seq,
88+
DEFINE_int32(max_seq_len_for_graph_mode,
8289
20480,
8390
"Maximum number of tokens per sequence for ACL graph execution.");
8491

@@ -91,11 +98,13 @@ DEFINE_int32(limit_image_per_prompt,
9198

9299
// --- threading config ---
93100

94-
DEFINE_int32(num_handling_threads, 4, "Number of handling threads.");
101+
DEFINE_int32(num_request_handling_threads,
102+
4,
103+
"Number of threads for handling input requests.");
95104

96105
DEFINE_int32(num_response_handling_threads,
97106
4,
98-
"Number of response handling threads.");
107+
"Number of threads for handling responses.");
99108

100109
// --- kvcache config ---
101110

@@ -141,9 +150,10 @@ DEFINE_bool(use_zero_evict,
141150
false,
142151
"Use ZeroEvictionScheduler but ContinuousScheduler.");
143152

144-
DEFINE_int32(max_decode_token_per_sequence,
145-
256,
146-
"Max decode token per sequence.");
153+
DEFINE_int32(
154+
max_decode_token_per_sequence,
155+
256,
156+
"Max decode token per sequence which used for ZeroEvictionScheduler.");
147157

148158
// --- parallel config ---
149159

@@ -168,10 +178,10 @@ DEFINE_int64(eplb_update_interval, 1000, "EPLB update rate.");
168178

169179
DEFINE_double(eplb_update_threshold, 0.8, "EPLB update threshold.");
170180

171-
DEFINE_string(rank_tablefile, "", "ATB HCCL rank table file.");
172-
173181
DEFINE_int32(expert_parallel_degree, 0, "Expert parallel degree.");
174182

183+
DEFINE_string(rank_tablefile, "", "ATB HCCL rank table file.");
184+
175185
// --- profile config ---
176186

177187
DEFINE_bool(enable_profile_step_time,
@@ -261,20 +271,8 @@ DEFINE_string(kv_cache_transfer_mode,
261271
"PUSH",
262272
"The mode of kv cache transfer(e.g. PUSH, PULL).");
263273

264-
DEFINE_string(device_ip, "", "The device ip.");
265-
266274
DEFINE_int32(transfer_listen_port, 26000, "The KVCacheTranfer listen port.");
267275

268-
// --- worker server config ---
269-
270-
DEFINE_int32(max_connect_count,
271-
40,
272-
"The max count for worker try to connect to server.");
273-
274-
DEFINE_int32(sleep_time_second,
275-
3,
276-
"The sleep time for worker try to connect to server next time.");
277-
278276
DEFINE_bool(enable_shm,
279277
true,
280278
"Whether to enable shared memory for executing model.");
@@ -311,10 +309,6 @@ DEFINE_double(heart_beat_interval, 0.5, "Heart beat interval.");
311309

312310
DEFINE_int32(etcd_ttl, 3, "Time to live for etcd.");
313311

314-
DEFINE_int32(timeout_ms,
315-
-1,
316-
"Max duration of bRPC Channel. -1 means wait indefinitely.");
317-
318312
// --- priority strategy config ---
319313

320314
DEFINE_string(priority_strategy,
@@ -354,7 +348,7 @@ DEFINE_bool(
354348
"Whether to enable computation communication parallel by two streams "
355349
"and two micro batches in prefill stage.");
356350

357-
DEFINE_int32(default_micro_batch_num,
351+
DEFINE_int32(micro_batch_num,
358352
2,
359353
"Default use two micro batches for multi-stream parallel.");
360354

@@ -368,7 +362,7 @@ DEFINE_bool(enable_continuous_kvcache,
368362
"Whether to enable continuous kv cache.");
369363

370364
DEFINE_int64(
371-
granularity_size,
365+
phy_page_granularity_size,
372366
2 * 1024 * 1024,
373367
"Granularity size for one physical page in bytes, default 2MB, when enable "
374368
"continuous kv cache.");
@@ -388,4 +382,4 @@ DEFINE_bool(enable_beam_search_kernel,
388382
"Whether to enable beam search kernel.");
389383

390384
// --- qwen3 reranker config ---
391-
DEFINE_bool(enable_qwen3_reranker, false, "Whether to enable qwen3 reranker.");
385+
DEFINE_bool(enable_qwen3_reranker, false, "Whether to enable qwen3 reranker.");

xllm/core/common/global_flags.h

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -25,12 +25,10 @@ DECLARE_int32(port);
2525

2626
DECLARE_int32(disagg_pd_port);
2727

28-
DECLARE_int32(idle_timeout_s);
28+
DECLARE_int32(rpc_idle_timeout_s);
2929

3030
DECLARE_int32(num_threads);
3131

32-
DECLARE_int32(max_concurrency);
33-
3432
DECLARE_string(model_id);
3533

3634
DECLARE_string(model);
@@ -67,7 +65,7 @@ DECLARE_int32(max_tokens_per_chunk_for_prefill);
6765

6866
DECLARE_int32(num_speculative_tokens);
6967

70-
DECLARE_int32(num_handling_threads);
68+
DECLARE_int32(num_request_handling_threads);
7169

7270
DECLARE_int32(num_response_handling_threads);
7371

@@ -87,7 +85,7 @@ DECLARE_bool(enable_mla);
8785

8886
DECLARE_bool(enable_acl_graph);
8987

90-
DECLARE_int32(max_tokens_per_seq);
88+
DECLARE_int32(max_seq_len_for_graph_mode);
9189

9290
DECLARE_bool(enable_chunked_prefill);
9391

@@ -125,9 +123,7 @@ DECLARE_double(prefill_scheduling_memory_usage_threshold);
125123

126124
DECLARE_int32(expert_parallel_degree);
127125

128-
DECLARE_int32(max_connect_count);
129-
130-
DECLARE_int32(sleep_time_second);
126+
DECLARE_int32(max_reconnect_count);
131127

132128
DECLARE_bool(enable_atb_comm_multiprocess);
133129

@@ -145,7 +141,7 @@ DECLARE_double(heart_beat_interval);
145141

146142
DECLARE_int32(etcd_ttl);
147143

148-
DECLARE_int32(timeout_ms);
144+
DECLARE_int32(rpc_channel_timeout_ms);
149145

150146
DECLARE_int32(chunked_match_frequency);
151147

@@ -169,7 +165,7 @@ DECLARE_string(store_metadata_connstring);
169165

170166
DECLARE_bool(enable_multi_stream_parallel);
171167

172-
DECLARE_int32(default_micro_batch_num);
168+
DECLARE_int32(micro_batch_num);
173169

174170
DECLARE_bool(enable_profile_step_time);
175171

@@ -193,7 +189,7 @@ DECLARE_int32(max_requests_per_batch);
193189

194190
DECLARE_bool(enable_continuous_kvcache);
195191

196-
DECLARE_int64(granularity_size);
192+
DECLARE_int64(phy_page_granularity_size);
197193

198194
DECLARE_int64(cache_size_per_token);
199195

xllm/core/common/options.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ std::string Options::to_string() const {
3333
<< ", max_tokens_per_chunk_for_prefill: "
3434
<< max_tokens_per_chunk_for_prefill()
3535
<< ", num_speculative_tokens: " << num_speculative_tokens()
36-
<< ", num_handling_threads: " << num_handling_threads()
36+
<< ", num_request_handling_threads: " << num_request_handling_threads()
3737
<< ", communication_backend: " << communication_backend().value_or("null")
3838
<< ", rank_tablefile: " << rank_tablefile().value_or("null")
3939
<< ", expert_parallel_degree: " << expert_parallel_degree().value_or(0)

xllm/core/common/options.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ class Options {
7373
PROPERTY(int32_t, num_speculative_tokens) = 0;
7474

7575
// thread num to handle requests
76-
PROPERTY(size_t, num_handling_threads) = 4;
76+
PROPERTY(size_t, num_request_handling_threads) = 4;
7777

7878
PROPERTY(std::optional<bool>, enable_eplb);
7979

xllm/core/distributed_runtime/remote_worker.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -46,20 +46,20 @@ RemoteWorker::RemoteWorker(int32_t global_rank,
4646
bool RemoteWorker::wait_for_server_ready(const std::string& server_address) {
4747
// Retry until the server has finished initializing and is ready
4848
int try_count = 0;
49-
while (try_count < FLAGS_max_connect_count) {
49+
const int sleep_time_second = 3;
50+
while (try_count < FLAGS_max_reconnect_count) {
5051
if (channel_->hello()) {
5152
LOG(INFO) << "RemoteWorker Hello connected, server_address: "
5253
<< server_address << ", global_rank_: " << global_rank_;
5354
break;
5455
} else {
55-
std::this_thread::sleep_for(
56-
std::chrono::seconds(FLAGS_sleep_time_second));
56+
std::this_thread::sleep_for(std::chrono::seconds(sleep_time_second));
5757
}
5858

5959
try_count++;
6060
}
6161

62-
if (try_count >= FLAGS_max_connect_count) {
62+
if (try_count >= FLAGS_max_reconnect_count) {
6363
LOG(ERROR) << "RemoteWorker Hello method failed, global_rank_ is "
6464
<< global_rank_;
6565
return false;

xllm/core/distributed_runtime/worker_server.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -250,15 +250,15 @@ bool WorkerServer::sync_master_node(const std::string& master_node_addr,
250250
// Retry until the master node is ready
251251
int try_count = 0;
252252
brpc::Controller cntl;
253-
while (try_count < FLAGS_max_connect_count) {
253+
const int sleep_time_second = 3;
254+
while (try_count < FLAGS_max_reconnect_count) {
254255
cntl.Reset();
255256
stub.Sync(&cntl, &addr_info, &uids, NULL);
256257
if (cntl.Failed()) {
257258
LOG(WARNING) << "Worker#" << addr_info.global_rank()
258259
<< " try connect to engine server error, try again."
259260
<< " Error message: " << cntl.ErrorText();
260-
std::this_thread::sleep_for(
261-
std::chrono::seconds(FLAGS_sleep_time_second));
261+
std::this_thread::sleep_for(std::chrono::seconds(sleep_time_second));
262262
} else {
263263
LOG(INFO) << "Worker#" << addr_info.global_rank() << " connect to "
264264
<< master_node_addr << " success.";
@@ -267,7 +267,7 @@ bool WorkerServer::sync_master_node(const std::string& master_node_addr,
267267
try_count++;
268268
}
269269

270-
if (try_count >= FLAGS_max_connect_count) {
270+
if (try_count >= FLAGS_max_reconnect_count) {
271271
LOG(ERROR) << "Worker#" << addr_info.global_rank() << " connect to "
272272
<< master_node_addr << " failed."
273273
<< " Error message: " << cntl.ErrorText();

xllm/core/framework/xtensor/multi_layer_xtensor.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ void MultiLayerXTensor::append_phy_pages(
4242

4343
void MultiLayerXTensor::free(int32_t seq_id) {
4444
size_t aligned_size =
45-
get_num_pages_per_layer(seq_id) * FLAGS_granularity_size;
45+
get_num_pages_per_layer(seq_id) * FLAGS_phy_page_granularity_size;
4646
for (size_t layer_idx = 0; layer_idx < num_layers_; layer_idx++) {
4747
VirPtr vir_ptr = get_vir_ptr(seq_id, layer_idx);
4848
vmm::unmap(vir_ptr, aligned_size);
@@ -60,4 +60,4 @@ void MultiLayerXTensor::deallocate_seq_id(int32_t seq_id) {
6060
free_seq_ids_[num_free_seq_ids_++] = seq_id;
6161
}
6262

63-
} // namespace xllm
63+
} // namespace xllm

xllm/core/framework/xtensor/phy_page_pool.cpp

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -99,15 +99,16 @@ void PhyPagePool::batch_map(VirPtr vir_ptr,
9999
int64_t layer_idx) const {
100100
size_t num_pages = page_ids.size();
101101

102-
size_t ptr_offset = (num_pages - num_new_pages) * FLAGS_granularity_size;
102+
size_t ptr_offset =
103+
(num_pages - num_new_pages) * FLAGS_phy_page_granularity_size;
103104

104-
VirPtr temp_vir_ptr = reinterpret_cast<VirPtr>((char*)vir_ptr + ptr_offset);
105+
VirPtr temp_vir_ptr = reinterpret_cast<VirPtr>(vir_ptr + ptr_offset);
105106

106107
for (size_t j = num_new_pages; j > 0; --j) {
107108
uint32_t page_id = page_ids[num_pages - j];
108109
map(temp_vir_ptr, page_id, layer_idx);
109-
temp_vir_ptr =
110-
reinterpret_cast<VirPtr>((char*)temp_vir_ptr + FLAGS_granularity_size);
110+
temp_vir_ptr = reinterpret_cast<VirPtr>(temp_vir_ptr +
111+
FLAGS_phy_page_granularity_size);
111112
}
112113
}
113114
} // namespace xllm

xllm/core/framework/xtensor/remote_xtensor_manager.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -56,12 +56,12 @@ bool RemoteXTensorManager::wait_for_server_ready(
5656
// Retry until the server has finished initializing and is ready
5757
int try_count = 0;
5858
brpc::Controller cntl;
59-
while (try_count < FLAGS_max_connect_count) {
59+
const int sleep_time_second = 3;
60+
while (try_count < FLAGS_max_reconnect_count) {
6061
cntl.Reset();
6162
stub_->Hello(&cntl, &req, &resp, nullptr);
6263
if (cntl.Failed() || !resp.ok()) {
63-
std::this_thread::sleep_for(
64-
std::chrono::seconds(FLAGS_sleep_time_second));
64+
std::this_thread::sleep_for(std::chrono::seconds(sleep_time_second));
6565
} else {
6666
LOG(INFO) << "RemoteXTensorManager Hello connected, server_address: "
6767
<< server_address << ", global_rank_: " << global_rank_;
@@ -71,7 +71,7 @@ bool RemoteXTensorManager::wait_for_server_ready(
7171
try_count++;
7272
}
7373

74-
if (try_count >= FLAGS_max_connect_count) {
74+
if (try_count >= FLAGS_max_reconnect_count) {
7575
LOG(ERROR) << "RemoteXTensorManager Hello method failed, global_rank_ is "
7676
<< global_rank_ << ", error: " << cntl.ErrorText();
7777
return false;

0 commit comments

Comments
 (0)