jd-opensource
diff --git a/docs/zh/cli_reference.md b/docs/zh/cli_reference.md
Lines changed: 1 addition & 1 deletion
diff --git a/xllm/core/common/global_flags.cpp b/xllm/core/common/global_flags.cpp
Lines changed: 28 additions & 34 deletions
diff --git a/xllm/core/common/global_flags.h b/xllm/core/common/global_flags.h
Lines changed: 7 additions & 11 deletions
diff --git a/xllm/core/common/options.cpp b/xllm/core/common/options.cpp
Lines changed: 1 addition & 1 deletion
diff --git a/xllm/core/common/options.h b/xllm/core/common/options.h
Lines changed: 1 addition & 1 deletion
diff --git a/xllm/core/distributed_runtime/remote_worker.cpp b/xllm/core/distributed_runtime/remote_worker.cpp
Lines changed: 4 additions & 4 deletions
diff --git a/xllm/core/distributed_runtime/worker_server.cpp b/xllm/core/distributed_runtime/worker_server.cpp
Lines changed: 4 additions & 4 deletions
diff --git a/xllm/core/framework/xtensor/multi_layer_xtensor.cpp b/xllm/core/framework/xtensor/multi_layer_xtensor.cpp
Lines changed: 2 additions & 2 deletions
diff --git a/xllm/core/framework/xtensor/phy_page_pool.cpp b/xllm/core/framework/xtensor/phy_page_pool.cpp
Lines changed: 5 additions & 4 deletions
diff --git a/xllm/core/framework/xtensor/remote_xtensor_manager.cpp b/xllm/core/framework/xtensor/remote_xtensor_manager.cpp
Lines changed: 4 additions & 4 deletions
@@ -70,7 +70,7 @@ xLLM使用gflags来管理服务启动参数，具体的参数含义如下：
 |:---------:|:---------:|:---------:|:---------:|:---------:|:---------:|
 | `max_concurrent_requests` | int32 | 0 | 任意大于0的整数 | 限流用，限制实例中正在处理的总请求数 |  |
 | `model_id` | string | "" | ip:port | 模型名称，非路径 |  |
-| `num_handling_threads` | int32 | 4 | 任意大于0的整数 | 处理输入请求的线程池大小 |  |
+| `num_request_handling_threads` | int32 | 4 | 任意大于0的整数 | 处理输入请求的线程池大小 |  |
 | `num_response_handling_threads` | int32 | 4 | 任意大于0的整数 | 处理输出的线程池大小 |  |
 | `prefill_scheduling_memory_usage_threshold` | double | 0.95 | 0-1之间的值 | 当kv cache使用量达到该阈值时，暂停prefill请求的调度 |  |
 | `num_response_handling_threads` | int32 | 4 | 任意大于0的整数 | 处理输出的线程池大小 |  |
@@ -27,16 +27,21 @@ DEFINE_string(host, "", "Host name for brpc server.");
 
 DEFINE_int32(port, 8010, "Port for brpc server.");
 
-DEFINE_int32(idle_timeout_s,
+DEFINE_int32(
+    rpc_idle_timeout_s,
+    -1,
+    "Connection will be closed if there is no read/write operations "
+    "during the last `rpc_idle_timeout_s`. -1 means wait indefinitely.");
+
+DEFINE_int32(rpc_channel_timeout_ms,
              -1,
-             "Connection will be closed if there is no read/write operations "
-             "during the last `idle_timeout_s`. -1 means wait indefinitely.");
+             "Max duration of bRPC Channel. -1 means wait indefinitely.");
 
-DEFINE_int32(num_threads, 32, "Number of threads to process requests.");
+DEFINE_int32(max_reconnect_count,
+             40,
+             "The max count for worker try to connect to server.");
 
-DEFINE_int32(max_concurrency,
-             0,
-             "Limit number of requests processed in parallel.");
+DEFINE_int32(num_threads, 32, "Number of threads to process requests.");
 
 DEFINE_int32(
     max_concurrent_requests,
@@ -74,11 +79,13 @@ DEFINE_bool(enable_mla,
             false,
             "Whether to enable multi-head latent attention.");
 
+// --- graph mode execution config ---
+
 DEFINE_bool(enable_acl_graph,
             false,
             "Whether to enable ACL graph execution for decode phase.");
 
-DEFINE_int32(max_tokens_per_seq,
+DEFINE_int32(max_seq_len_for_graph_mode,
              20480,
              "Maximum number of tokens per sequence for ACL graph execution.");
 
@@ -91,11 +98,13 @@ DEFINE_int32(limit_image_per_prompt,
 
 // --- threading config ---
 
-DEFINE_int32(num_handling_threads, 4, "Number of handling threads.");
+DEFINE_int32(num_request_handling_threads,
+             4,
+             "Number of threads for handling input requests.");
 
 DEFINE_int32(num_response_handling_threads,
              4,
-             "Number of response handling threads.");
+             "Number of threads for handling responses.");
 
 // --- kvcache config ---
 
@@ -141,9 +150,10 @@ DEFINE_bool(use_zero_evict,
             false,
             "Use ZeroEvictionScheduler but ContinuousScheduler.");
 
-DEFINE_int32(max_decode_token_per_sequence,
-             256,
-             "Max decode token per sequence.");
+DEFINE_int32(
+    max_decode_token_per_sequence,
+    256,
+    "Max decode token per sequence which used for ZeroEvictionScheduler.");
 
 // --- parallel config ---
 
@@ -168,10 +178,10 @@ DEFINE_int64(eplb_update_interval, 1000, "EPLB update rate.");
 
 DEFINE_double(eplb_update_threshold, 0.8, "EPLB update threshold.");
 
-DEFINE_string(rank_tablefile, "", "ATB HCCL rank table file.");
-
 DEFINE_int32(expert_parallel_degree, 0, "Expert parallel degree.");
 
+DEFINE_string(rank_tablefile, "", "ATB HCCL rank table file.");
+
 // --- profile config ---
 
 DEFINE_bool(enable_profile_step_time,
@@ -261,20 +271,8 @@ DEFINE_string(kv_cache_transfer_mode,
               "PUSH",
               "The mode of kv cache transfer(e.g. PUSH, PULL).");
 
-DEFINE_string(device_ip, "", "The device ip.");
-
 DEFINE_int32(transfer_listen_port, 26000, "The KVCacheTranfer listen port.");
 
-// --- worker server config ---
-
-DEFINE_int32(max_connect_count,
-             40,
-             "The max count for worker try to connect to server.");
-
-DEFINE_int32(sleep_time_second,
-             3,
-             "The sleep time for worker try to connect to server next time.");
-
 DEFINE_bool(enable_shm,
             true,
             "Whether to enable shared memory for executing model.");
@@ -311,10 +309,6 @@ DEFINE_double(heart_beat_interval, 0.5, "Heart beat interval.");
 
 DEFINE_int32(etcd_ttl, 3, "Time to live for etcd.");
 
-DEFINE_int32(timeout_ms,
-             -1,
-             "Max duration of bRPC Channel. -1 means wait indefinitely.");
-
 // --- priority strategy config ---
 
 DEFINE_string(priority_strategy,
@@ -354,7 +348,7 @@ DEFINE_bool(
     "Whether to enable computation communication parallel by two streams "
     "and two micro batches in prefill stage.");
 
-DEFINE_int32(default_micro_batch_num,
+DEFINE_int32(micro_batch_num,
              2,
              "Default use two micro batches for multi-stream parallel.");
 
@@ -368,7 +362,7 @@ DEFINE_bool(enable_continuous_kvcache,
             "Whether to enable continuous kv cache.");
 
 DEFINE_int64(
-    granularity_size,
+    phy_page_granularity_size,
     2 * 1024 * 1024,
     "Granularity size for one physical page in bytes, default 2MB, when enable "
     "continuous kv cache.");
@@ -388,4 +382,4 @@ DEFINE_bool(enable_beam_search_kernel,
             "Whether to enable beam search kernel.");
 
 // --- qwen3 reranker config
-DEFINE_bool(enable_qwen3_reranker, false, "Whether to enable qwen3 reranker.");
+DEFINE_bool(enable_qwen3_reranker, false, "Whether to enable qwen3 reranker.");
@@ -25,12 +25,10 @@ DECLARE_int32(port);
 
 DECLARE_int32(disagg_pd_port);
 
-DECLARE_int32(idle_timeout_s);
+DECLARE_int32(rpc_idle_timeout_s);
 
 DECLARE_int32(num_threads);
 
-DECLARE_int32(max_concurrency);
-
 DECLARE_string(model_id);
 
 DECLARE_string(model);
@@ -67,7 +65,7 @@ DECLARE_int32(max_tokens_per_chunk_for_prefill);
 
 DECLARE_int32(num_speculative_tokens);
 
-DECLARE_int32(num_handling_threads);
+DECLARE_int32(num_request_handling_threads);
 
 DECLARE_int32(num_response_handling_threads);
 
@@ -87,7 +85,7 @@ DECLARE_bool(enable_mla);
 
 DECLARE_bool(enable_acl_graph);
 
-DECLARE_int32(max_tokens_per_seq);
+DECLARE_int32(max_seq_len_for_graph_mode);
 
 DECLARE_bool(enable_chunked_prefill);
 
@@ -125,9 +123,7 @@ DECLARE_double(prefill_scheduling_memory_usage_threshold);
 
 DECLARE_int32(expert_parallel_degree);
 
-DECLARE_int32(max_connect_count);
-
-DECLARE_int32(sleep_time_second);
+DECLARE_int32(max_reconnect_count);
 
 DECLARE_bool(enable_atb_comm_multiprocess);
 
@@ -145,7 +141,7 @@ DECLARE_double(heart_beat_interval);
 
 DECLARE_int32(etcd_ttl);
 
-DECLARE_int32(timeout_ms);
+DECLARE_int32(rpc_channel_timeout_ms);
 
 DECLARE_int32(chunked_match_frequency);
 
@@ -169,7 +165,7 @@ DECLARE_string(store_metadata_connstring);
 
 DECLARE_bool(enable_multi_stream_parallel);
 
-DECLARE_int32(default_micro_batch_num);
+DECLARE_int32(micro_batch_num);
 
 DECLARE_bool(enable_profile_step_time);
 
@@ -193,7 +189,7 @@ DECLARE_int32(max_requests_per_batch);
 
 DECLARE_bool(enable_continuous_kvcache);
 
-DECLARE_int64(granularity_size);
+DECLARE_int64(phy_page_granularity_size);
 
 DECLARE_int64(cache_size_per_token);
 
 
@@ -33,7 +33,7 @@ std::string Options::to_string() const {
      << ", max_tokens_per_chunk_for_prefill: "
      << max_tokens_per_chunk_for_prefill()
      << ", num_speculative_tokens: " << num_speculative_tokens()
-     << ", num_handling_threads: " << num_handling_threads()
+     << ", num_request_handling_threads: " << num_request_handling_threads()
      << ", communication_backend: " << communication_backend().value_or("null")
      << ", rank_tablefile: " << rank_tablefile().value_or("null")
      << ", expert_parallel_degree: " << expert_parallel_degree().value_or(0)
 
@@ -73,7 +73,7 @@ class Options {
   PROPERTY(int32_t, num_speculative_tokens) = 0;
 
   // thread num to handle requests
-  PROPERTY(size_t, num_handling_threads) = 4;
+  PROPERTY(size_t, num_request_handling_threads) = 4;
 
   PROPERTY(std::optional<bool>, enable_eplb);
 
 
@@ -46,20 +46,20 @@ RemoteWorker::RemoteWorker(int32_t global_rank,
 bool RemoteWorker::wait_for_server_ready(const std::string& server_address) {
   // Retry until server initialize ready
   int try_count = 0;
-  while (try_count < FLAGS_max_connect_count) {
+  const int sleep_time_second = 3;
+  while (try_count < FLAGS_max_reconnect_count) {
     if (channel_->hello()) {
       LOG(INFO) << "RemoteWorker Hello connected, server_address: "
                 << server_address << ", global_rank_: " << global_rank_;
       break;
     } else {
-      std::this_thread::sleep_for(
-          std::chrono::seconds(FLAGS_sleep_time_second));
+      std::this_thread::sleep_for(std::chrono::seconds(sleep_time_second));
     }
 
     try_count++;
   }
 
-  if (try_count >= FLAGS_max_connect_count) {
+  if (try_count >= FLAGS_max_reconnect_count) {
     LOG(ERROR) << "RemoteWorker Hello method failed, global_rank_ is "
                << global_rank_;
     return false;
 
@@ -250,15 +250,15 @@ bool WorkerServer::sync_master_node(const std::string& master_node_addr,
   // Retry until master node ready
   int try_count = 0;
   brpc::Controller cntl;
-  while (try_count < FLAGS_max_connect_count) {
+  const int sleep_time_second = 3;
+  while (try_count < FLAGS_max_reconnect_count) {
     cntl.Reset();
     stub.Sync(&cntl, &addr_info, &uids, NULL);
     if (cntl.Failed()) {
       LOG(WARNING) << "Worker#" << addr_info.global_rank()
                    << " try connect to engine server error, try again."
                    << " Error message: " << cntl.ErrorText();
-      std::this_thread::sleep_for(
-          std::chrono::seconds(FLAGS_sleep_time_second));
+      std::this_thread::sleep_for(std::chrono::seconds(sleep_time_second));
     } else {
       LOG(INFO) << "Worker#" << addr_info.global_rank() << " connect to "
                 << master_node_addr << " success.";
@@ -267,7 +267,7 @@ bool WorkerServer::sync_master_node(const std::string& master_node_addr,
     try_count++;
   }
 
-  if (try_count >= FLAGS_max_connect_count) {
+  if (try_count >= FLAGS_max_reconnect_count) {
     LOG(ERROR) << "Worker#" << addr_info.global_rank() << " connect to "
                << master_node_addr << " failed."
                << " Error message: " << cntl.ErrorText();
 
@@ -42,7 +42,7 @@ void MultiLayerXTensor::append_phy_pages(
 
 void MultiLayerXTensor::free(int32_t seq_id) {
   size_t aligned_size =
-      get_num_pages_per_layer(seq_id) * FLAGS_granularity_size;
+      get_num_pages_per_layer(seq_id) * FLAGS_phy_page_granularity_size;
   for (size_t layer_idx = 0; layer_idx < num_layers_; layer_idx++) {
     VirPtr vir_ptr = get_vir_ptr(seq_id, layer_idx);
     vmm::unmap(vir_ptr, aligned_size);
@@ -60,4 +60,4 @@ void MultiLayerXTensor::deallocate_seq_id(int32_t seq_id) {
   free_seq_ids_[num_free_seq_ids_++] = seq_id;
 }
 
-}  // namespace xllm
+}  // namespace xllm
@@ -99,15 +99,16 @@ void PhyPagePool::batch_map(VirPtr vir_ptr,
                             int64_t layer_idx) const {
   size_t num_pages = page_ids.size();
 
-  size_t ptr_offset = (num_pages - num_new_pages) * FLAGS_granularity_size;
+  size_t ptr_offset =
+      (num_pages - num_new_pages) * FLAGS_phy_page_granularity_size;
 
-  VirPtr temp_vir_ptr = reinterpret_cast<VirPtr>((char*)vir_ptr + ptr_offset);
+  VirPtr temp_vir_ptr = reinterpret_cast<VirPtr>(vir_ptr + ptr_offset);
 
   for (size_t j = num_new_pages; j > 0; --j) {
     uint32_t page_id = page_ids[num_pages - j];
     map(temp_vir_ptr, page_id, layer_idx);
-    temp_vir_ptr =
-        reinterpret_cast<VirPtr>((char*)temp_vir_ptr + FLAGS_granularity_size);
+    temp_vir_ptr = reinterpret_cast<VirPtr>(temp_vir_ptr +
+                                            FLAGS_phy_page_granularity_size);
   }
 }
 }  // namespace xllm
@@ -56,12 +56,12 @@ bool RemoteXTensorManager::wait_for_server_ready(
   // Retry until server initialize ready
   int try_count = 0;
   brpc::Controller cntl;
-  while (try_count < FLAGS_max_connect_count) {
+  const int sleep_time_second = 3;
+  while (try_count < FLAGS_max_reconnect_count) {
     cntl.Reset();
     stub_->Hello(&cntl, &req, &resp, nullptr);
     if (cntl.Failed() || !resp.ok()) {
-      std::this_thread::sleep_for(
-          std::chrono::seconds(FLAGS_sleep_time_second));
+      std::this_thread::sleep_for(std::chrono::seconds(sleep_time_second));
     } else {
       LOG(INFO) << "RemoteXTensorManager Hello connected, server_address: "
                 << server_address << ", global_rank_: " << global_rank_;
@@ -71,7 +71,7 @@ bool RemoteXTensorManager::wait_for_server_ready(
     try_count++;
   }
 
-  if (try_count >= FLAGS_max_connect_count) {
+  if (try_count >= FLAGS_max_reconnect_count) {
     LOG(ERROR) << "RemoteXTensorManager Hello method failed, global_rank_ is "
                << global_rank_ << ", error: " << cntl.ErrorText();
     return false;