Skip to content

Commit db15be9

Browse files
JimHsiung and liutongxuan
authored and committed
feat: support profiling tpot in disaggregated pd mode.
1 parent 182a13c commit db15be9

File tree

5 files changed

+132
-7
lines changed

5 files changed

+132
-7
lines changed

xllm/core/common/types.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,9 @@ struct InstanceInfo {
199199
std::vector<int64_t> v_cache_ids;
200200
int32_t dp_size;
201201
// ttft profiling data
202-
std::vector<std::pair<int32_t, int64_t>> ttft_profiling_data;
202+
std::vector<std::pair<int32_t, double>> ttft_profiling_data;
203+
// tpot profiling data
204+
std::vector<std::tuple<int32_t, int32_t, double>> tpot_profiling_data;
203205

204206
nlohmann::json serialize_to_json() const {
205207
nlohmann::json json_val;
@@ -223,6 +225,7 @@ struct InstanceInfo {
223225
json_val["v_cache_ids"] = v_cache_ids;
224226
json_val["dp_size"] = dp_size;
225227
json_val["ttft_profiling_data"] = ttft_profiling_data;
228+
json_val["tpot_profiling_data"] = tpot_profiling_data;
226229
return json_val;
227230
}
228231
};

xllm/core/scheduler/disagg_pd_scheduler.cpp

Lines changed: 45 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -57,10 +57,11 @@ DisaggPDScheduler::DisaggPDScheduler(Engine* engine, const Options& options)
5757
initialize_rpc_server_and_client("DisaggPDServer");
5858
register_instance_info("DisaggPDServer", engine);
5959

60-
// Profile ttft and update instance info (for non-decode instances)
60+
// Profile ttft & tpot and update instance info (for mix instances)
6161
if (!options_.disable_ttft_profiling() &&
62-
options_.instance_role().value() != InstanceRole::DECODE) {
62+
options_.instance_role().value() == InstanceRole::MIX) {
6363
profile_ttft();
64+
profile_tpot();
6465
}
6566
}
6667
}
@@ -114,6 +115,7 @@ void DisaggPDScheduler::register_instance_info(const std::string& server_name,
114115
}
115116

116117
void DisaggPDScheduler::profile_ttft() {
118+
LOG(INFO) << "Start profiling TTFT.";
117119
// get the maximum prefill token length
118120
auto& model_args = engine_->model_args();
119121
int32_t max_context_len = model_args.max_position_embeddings();
@@ -125,16 +127,53 @@ void DisaggPDScheduler::profile_ttft() {
125127
// warm up
126128
profile_manager_->run_request(max_context_len, 0);
127129

128-
// get TTFT starting from max_context_len, dividing the token length by 2 in
129-
// each loop iteration
130+
// get TTFT starting from max_context_len
130131
for (int32_t token_length = max_context_len; token_length > 1;
131-
token_length >>= 1) {
132-
int64_t latency = profile_manager_->run_request(token_length, 0);
132+
token_length *= 0.9) {
133+
double latency = profile_manager_->run_request(token_length, 0);
133134
instance_info_.ttft_profiling_data.emplace_back(
134135
std::make_pair(token_length, latency));
135136
}
136137
}
137138

139+
void DisaggPDScheduler::profile_tpot() {
140+
LOG(INFO) << "Start profiling TPOT.";
141+
// get the maximum token length
142+
auto& model_args = engine_->model_args();
143+
int32_t max_context_len = model_args.max_position_embeddings();
144+
if (!options_.enable_chunked_prefill()) {
145+
max_context_len =
146+
std::min(max_context_len, options_.max_tokens_per_batch());
147+
}
148+
149+
int32_t num_blocks = kv_cache_manager_->num_blocks();
150+
int32_t block_size = kv_cache_manager_->block_size();
151+
int32_t max_seqs_per_batch = options_.max_seqs_per_batch();
152+
int32_t request_blocks = max_context_len / block_size + 1;
153+
int32_t max_batch_size = num_blocks / request_blocks;
154+
155+
// warm up
156+
profile_manager_->run_request(
157+
max_context_len, max_context_len - 1, max_batch_size);
158+
159+
// get TPOT starting from max_context_len, dividing the token length by 2 in
160+
// each loop iteration. Skip small token lengths to speed up profiling.
161+
for (int32_t token_length = max_context_len; token_length > 64;
162+
token_length >>= 1) {
163+
max_batch_size = num_blocks / (token_length / block_size + 1);
164+
int32_t current_max_batch_size = max_batch_size > max_seqs_per_batch
165+
? max_seqs_per_batch
166+
: max_batch_size;
167+
for (int32_t batch_size = current_max_batch_size; batch_size > 0;
168+
batch_size *= 0.9) {
169+
double latency = profile_manager_->profile_decode_step_time(
170+
token_length, batch_size, /*min_context_len=*/64, max_context_len);
171+
instance_info_.tpot_profiling_data.emplace_back(
172+
token_length, batch_size, latency);
173+
}
174+
}
175+
}
176+
138177
// TODO: maybe we should consider update info case even if info already exists
139178
// in local.
140179
bool DisaggPDScheduler::check_remote_instance_info(

xllm/core/scheduler/disagg_pd_scheduler.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,8 @@ class DisaggPDScheduler : public ContinuousScheduler {
101101
// corresponding TTFT for calculating the estimated TTFT of requests.
102102
void profile_ttft();
103103

104+
void profile_tpot();
105+
104106
// check remote instance info, if not exist, get from master service
105107
bool check_remote_instance_info(const std::string& instance_name);
106108

xllm/core/scheduler/profile/profile_manager.cpp

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -604,4 +604,69 @@ double ProfileManager::run_request(
604604
return latency;
605605
}
606606

607+
// Generate a batch of decode requests and execute it, then return the step
608+
// latency.
609+
double ProfileManager::profile_decode_step_time(int32_t token_length,
610+
int32_t batch_size,
611+
int32_t min_context_len,
612+
int32_t max_context_len) {
613+
double total_latency = 0;
614+
for (int32_t i = 0; i < profile_count_per_step_; ++i) {
615+
std::vector<int32_t> token_length_vec;
616+
std::vector<int32_t> prefix_length_vec;
617+
generate_random_decode_batch(batch_size * token_length,
618+
batch_size,
619+
min_context_len,
620+
max_context_len,
621+
token_length_vec,
622+
prefix_length_vec);
623+
double latency = run_request(token_length_vec, prefix_length_vec);
624+
total_latency += latency;
625+
}
626+
return total_latency / profile_count_per_step_;
627+
}
628+
629+
// Generate a batch of random decode requests with an average length of
630+
// token_length.
631+
void ProfileManager::generate_random_decode_batch(
632+
int32_t total_length,
633+
int32_t batch_size,
634+
int32_t min_context_len,
635+
int32_t max_context_len,
636+
std::vector<int32_t>& token_length_vec,
637+
std::vector<int32_t>& prefix_length_vec) {
638+
CHECK(total_length >= batch_size * min_context_len);
639+
640+
token_length_vec.resize(batch_size, min_context_len);
641+
prefix_length_vec.resize(batch_size, min_context_len - 1);
642+
int remain = total_length - batch_size * min_context_len;
643+
644+
std::random_device rd;
645+
std::mt19937_64 gen(rd());
646+
647+
for (int i = 0; i < batch_size; ++i) {
648+
if (remain == 0) break;
649+
650+
int max = remain > (max_context_len - min_context_len)
651+
? (max_context_len - min_context_len)
652+
: remain;
653+
654+
std::uniform_int_distribution<int> dis(0, max);
655+
int add = dis(gen);
656+
token_length_vec[i] += add;
657+
prefix_length_vec[i] += add;
658+
remain -= add;
659+
}
660+
661+
int idx = 0;
662+
while (remain > 0) {
663+
if (token_length_vec[idx % batch_size] < max_context_len) {
664+
token_length_vec[idx % batch_size] += 1;
665+
prefix_length_vec[idx % batch_size] += 1;
666+
--remain;
667+
}
668+
++idx;
669+
}
670+
}
671+
607672
} // namespace xllm

xllm/core/scheduler/profile/profile_manager.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,13 @@ class ProfileManager {
7979
double run_request(const std::vector<int32_t>& token_length_vec,
8080
const std::vector<int32_t>& prefix_length_vec);
8181

82+
// Generate a batch of decode requests and execute it, then return the step
83+
// latency.
84+
double profile_decode_step_time(int32_t token_length,
85+
int32_t batch_size,
86+
int32_t min_context_len,
87+
int32_t max_context_len);
88+
8289
void train_prefill_time_predictor(
8390
std::vector<std::tuple<int32_t, int32_t, double>> time_profiling_data);
8491

@@ -119,6 +126,15 @@ class ProfileManager {
119126
int32_t lower_bound,
120127
int32_t upper_bound);
121128

129+
// Generate a batch of random decode requests with an average length of
130+
// token_length.
131+
void generate_random_decode_batch(int32_t total_length,
132+
int32_t batch_size,
133+
int32_t min_context_len,
134+
int32_t max_context_len,
135+
std::vector<int32_t>& token_length_vec,
136+
std::vector<int32_t>& prefix_length_vec);
137+
122138
std::unique_ptr<TimePredictor> prefill_time_predictor_;
123139
std::unique_ptr<TimePredictor> decode_time_predictor_;
124140

0 commit comments

Comments (0)