Skip to content

Commit d1e93ce

Browse files
xinfei-shiLLLLKKKK
authored and committed
fix, modify block reserve logic
1 parent 9a20e23 commit d1e93ce

26 files changed

+116
-41
lines changed

docs/backend/pd_disaggregation.ipynb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,7 @@
123123
"| **LOAD_CACHE_TIMEOUT_MS** | Timeout for remote KVCache loading (milliseconds) | `5000` |\n",
124124
"| **DECODE_RETRY_TIMES** | Number of retries for decode process, 0 means retry disabled | `100` |\n",
125125
"| **DECODE_RETRY_TIMEOUT_MS** | Total timeout for decode process retries (milliseconds) | `100` |\n",
126+
"| **DECODE_RETRY_INTERVAL_MS** | Interval for decode process retries (milliseconds) | `1` |\n",
126127
"| **RDMA_CONNECT_RETRY_TIMES** | Number of retries for RDMA connection establishment | `5000` |\n",
127128
"| **DECODE_POLLING_KV_CACHE_STEP_MS** | Interval time for polling KV loading status (milliseconds) | `30` |\n",
128129
"| **DECODE_ENTRANCE** | Whether Decode serves as traffic entry point | `false` |"

rtp_llm/config/gpt_init_model_parameters.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,7 @@ class GptInitModelParameters:
240240
decode_polling_kv_cache_step_ms: int
241241
decode_retry_timeout_ms: int
242242
decode_retry_times: int
243+
decode_retry_interval_ms: int
243244
deepseek_mscale_all_dim: float
244245
deepseek_rope_mscale: float
245246
dp_rank: int
@@ -1102,6 +1103,10 @@ def update_common(
11021103
self.py_env_configs.pd_separation_config.decode_retry_timeout_ms
11031104
)
11041105
logging.info(f"decode_retry_timeout_ms: {self.decode_retry_timeout_ms}")
1106+
self.decode_retry_interval_ms = (
1107+
self.py_env_configs.pd_separation_config.decode_retry_interval_ms
1108+
)
1109+
logging.info(f"decode_retry_interval_ms: {self.decode_retry_interval_ms}")
11051110

11061111
self.rdma_connect_retry_times = (
11071112
self.py_env_configs.pd_separation_config.rdma_connect_retry_times
@@ -1133,6 +1138,7 @@ def update_common(
11331138
logging.info(
11341139
f"scheduler_reserve_resource_ratio: {self.scheduler_reserve_resource_ratio}"
11351140
)
1141+
11361142
self.reuse_cache = self.py_env_configs.py_kv_cache_config.reuse_cache
11371143
logging.info(f"reuse_cache: {self.reuse_cache}")
11381144
self.pre_allocate_op_mem = bool(int(os.environ.get("PRE_ALLOCATE_OP_MEM", 1)))

rtp_llm/config/py_config_modules.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -648,6 +648,7 @@ def __init__(self):
648648
# Decode related configuration
649649
self.decode_retry_times: int = 100
650650
self.decode_retry_timeout_ms: int = 100
651+
self.decode_retry_interval_ms: int = 1
651652
self.decode_polling_kv_cache_step_ms: int = 30
652653
self.decode_entrance: int = 0
653654

@@ -676,6 +677,9 @@ def update_from_env(self):
676677
self.decode_retry_timeout_ms = int(
677678
os.environ.get("DECODE_RETRY_TIMEOUT_MS", self.decode_retry_timeout_ms)
678679
)
680+
self.decode_retry_interval_ms = int(
681+
os.environ.get("DECODE_RETRY_INTERVAL_MS", self.decode_retry_interval_ms)
682+
)
679683
self.decode_polling_kv_cache_step_ms = int(
680684
os.environ.get(
681685
"DECODE_POLLING_KV_CACHE_STEP_MS", self.decode_polling_kv_cache_step_ms
@@ -700,6 +704,7 @@ def to_string(self):
700704
f"prefill_max_wait_timeout_ms: {self.prefill_max_wait_timeout_ms}\n"
701705
f"decode_retry_times: {self.decode_retry_times}\n"
702706
f"decode_retry_timeout_ms: {self.decode_retry_timeout_ms}\n"
707+
f"decode_retry_interval_ms: {self.decode_retry_interval_ms}\n"
703708
f"decode_polling_kv_cache_step_ms: {self.decode_polling_kv_cache_step_ms}\n"
704709
f"decode_entrance: {self.decode_entrance}\n"
705710
f"rdma_connect_retry_times: {self.rdma_connect_retry_times}\n"

rtp_llm/cpp/api_server/test/mock/MockEngineBase.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ class MockEngineBase: public EngineBase {
1919
std::vector<GenerateStreamPtr>(const std::vector<std::shared_ptr<GenerateInput>>& inputs));
2020
MOCK_METHOD0(stop, absl::Status());
2121
MOCK_METHOD2(preRun, absl::StatusOr<GenerateStreamPtr>(const std::shared_ptr<GenerateInput>&, preRunMode));
22-
MOCK_METHOD(KVCacheInfo, getCacheStatusInfo, (int64_t, bool), (const, override));
22+
MOCK_METHOD(KVCacheInfo, getCacheStatusInfo, (int64_t, bool), (override));
2323
};
2424

2525
} // namespace rtp_llm

rtp_llm/cpp/cache/CacheManager.cc

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -92,11 +92,11 @@ CacheManager::~CacheManager() {
9292
allocator_.reset();
9393
}
9494

95-
uint32_t CacheManager::totalBlocks() const {
95+
size_t CacheManager::totalBlocks() const {
9696
return allocator_->totalBlocks();
9797
}
9898

99-
uint32_t CacheManager::maxSeqLen() const {
99+
size_t CacheManager::maxSeqLen() const {
100100
return totalBlocks() * seq_size_per_block_;
101101
}
102102

@@ -110,7 +110,7 @@ void CacheManager::reportMetricsLoop() {
110110
{
111111
std::lock_guard<std::mutex> guard(mutex_);
112112
collector.kv_cache_item_num = block_cache_.size();
113-
auto available_blocks = availableBlockNums();
113+
auto available_blocks = availableBlockNumsWithoutLock();
114114
collector.kv_cache_left_seq = available_blocks * seq_size_per_block_;
115115
collector.kv_cache_available_blocks = available_blocks;
116116
collector.kv_cache_free_blocks = freeBlockNums();
@@ -156,11 +156,16 @@ size_t CacheManager::freeBlockNums() const {
156156
return allocator_->freeBlockNums();
157157
}
158158

159-
size_t CacheManager::availableBlockNums() const {
159+
size_t CacheManager::availableBlockNums() {
160+
std::lock_guard<std::mutex> guard(mutex_);
161+
return available_blocks_;
162+
}
163+
164+
size_t CacheManager::availableBlockNumsWithoutLock() {
160165
return available_blocks_;
161166
}
162167

163-
KVCacheInfo CacheManager::getKVCacheInfo(int64_t latest_version, bool need_cache_keys) const {
168+
KVCacheInfo CacheManager::getKVCacheInfo(int64_t latest_version, bool need_cache_keys) {
164169
auto snapshot = block_cache_.cacheSnapshot(latest_version);
165170
std::vector<int64_t> cachekeys;
166171
if (need_cache_keys) {

rtp_llm/cpp/cache/CacheManager.h

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -103,9 +103,10 @@ class CacheManager {
103103

104104
const CacheConfig& cacheConfig() const;
105105
size_t freeBlockNums() const;
106-
size_t availableBlockNums() const;
107-
KVCacheInfo getKVCacheInfo(int64_t latest_version, bool need_cache_keys) const;
108-
uint32_t maxSeqLen() const;
106+
size_t availableBlockNums();
107+
size_t totalBlocks() const;
108+
size_t maxSeqLen() const;
109+
KVCacheInfo getKVCacheInfo(int64_t latest_version, bool need_cache_keys);
109110
const KVCacheAllocator::KVCacheBuffer& kvCacheBuffer() const;
110111

111112
std::tuple<bool, KVCacheResource> malloc(const KVCacheAllocator::SimpleMallocInfo& malloc_info);
@@ -150,10 +151,10 @@ class CacheManager {
150151
protected:
151152
const BlockCache& blockCache() const;
152153
size_t cacheItemNum() const;
153-
uint32_t totalBlocks() const;
154154
void initFreeBlock();
155155
rtp_llm::BufferPtr tryAllocateMaxBuffer();
156156
void allocateAndSync();
157+
size_t availableBlockNumsWithoutLock();
157158

158159
MatchInfo matchImpl(const AdvancedMallocInfo& malloc_info);
159160
std::tuple<bool, std::vector<int>> mallocIndex(const KVCacheAllocator::SimpleMallocInfo& malloc_info);

rtp_llm/cpp/config/ConfigModules.h

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -198,11 +198,12 @@ struct BatchDecodeSchedulerConfig {
198198
};
199199

200200
struct FIFOSchedulerConfig {
201-
int64_t max_context_batch_size = 1;
202-
int scheduler_reserve_resource_ratio = 5;
203-
bool enable_fast_gen = false;
204-
bool enable_partial_fallback = false;
205-
int64_t fast_gen_context_budget = -1;
201+
int64_t max_context_batch_size = 1;
202+
int scheduler_reserve_resource_ratio = 5;
203+
bool enable_fast_gen = false;
204+
bool enable_partial_fallback = false;
205+
int64_t fast_gen_context_budget = -1;
206+
206207
std::string to_string() const;
207208
void update_from_env_for_test();
208209
};

rtp_llm/cpp/config/GptInitParameter.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -252,6 +252,7 @@ class GptInitParameter {
252252
int64_t prefill_max_wait_timeout_ms_ = 0;
253253
int64_t decode_retry_times_ = 0;
254254
int64_t decode_retry_timeout_ms_ = 0;
255+
int64_t decode_retry_interval_ms_ = 1;
255256
int64_t decode_polling_kv_cache_step_ms_ = 0;
256257
int64_t decode_polling_call_prefill_ms_ = 0;
257258
int64_t rdma_connect_retry_times_ = 0;

rtp_llm/cpp/engine_base/EngineBase.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ class EngineBase {
8383
virtual absl::StatusOr<GenerateStreamPtr> preRun(const std::shared_ptr<GenerateInput>& generate_input,
8484
preRunMode mode) = 0;
8585

86-
virtual KVCacheInfo getCacheStatusInfo(int64_t latest_version, bool need_cache_keys) const = 0;
86+
virtual KVCacheInfo getCacheStatusInfo(int64_t latest_version, bool need_cache_keys) = 0;
8787

8888
virtual const ResourceContext& resourceContext() const {
8989
return resource_context_;

rtp_llm/cpp/engine_base/schedulers/FIFOScheduler.cc

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,16 +17,18 @@ FIFOScheduler::FIFOScheduler(const rtp_llm::GptInitParameter& params,
1717
max_seq_len_(params.max_seq_len_),
1818
max_batch_tokens_size_(params.max_batch_tokens_size_),
1919
max_generate_batch_size_(params.max_generate_batch_size_),
20-
reserve_block_num_(params.scheduler_reserve_resource_ratio_ * cache_manager->availableBlockNums() / 100),
2120
// not support fallback when use pd_speration:use_cache_store
2221
enable_partial_fallback_(params.enable_partial_fallback_ && params.role_type_ == RoleType::PDFUSION),
2322
enable_whole_fallback_(params.role_type_ == RoleType::PDFUSION),
2423
enable_fast_gen_(params.enable_fast_gen_),
2524
need_fill_fake_stream_(params.dp_size_ > 1 && params.tp_rank_ == 0),
2625
fast_gen_max_context_len_(params.fast_gen_max_context_len_),
2726
metrics_reporter_(metrics_reporter) {
28-
RTP_LLM_LOG_INFO("max_generate_batch_size %d", max_generate_batch_size_);
29-
RTP_LLM_LOG_INFO("max_batch_tokens_size %d", max_batch_tokens_size_);
27+
reserve_block_num_ = params.scheduler_reserve_resource_ratio_ * cache_manager->availableBlockNums() / 100;
28+
RTP_LLM_LOG_INFO("max_generate_batch_size is [%d], max_batch_tokens_size is [%d], reserve_block_num is [%d]",
29+
max_generate_batch_size_,
30+
max_batch_tokens_size_,
31+
reserve_block_num_);
3032
}
3133

3234
FIFOScheduler::~FIFOScheduler() {
@@ -228,13 +230,27 @@ bool FIFOScheduler::evaluateNewStream(const list<GenerateStreamPtr>& streams,
228230
return false;
229231
}
230232

231-
auto result = new_stream->initKVBlock(token_capacity_, reserve_step);
233+
auto old_blocks = new_stream->maxBlockSize();
234+
auto result = new_stream->initKVBlock(token_capacity_, reserve_step);
232235
if (result.ok() && enable_fast_gen_) {
233236
token_capacity_ -= result.value();
234237
RTP_LLM_LOG_DEBUG(
235238
"after stream [%ld] acquireCapacity, token_capacity is %d", new_stream->streamId(), token_capacity_);
236239
}
237-
return result.ok() && cache_manager_->availableBlockNums() >= reserve_block_num_;
240+
if (result.ok()) {
241+
if (cache_manager_->availableBlockNums() >= reserve_block_num_) {
242+
return true;
243+
} else {
244+
RTP_LLM_LOG_INFO(
245+
"current availableBlockNums is [%ld], reserve_block_num is [%ld], so stream [%ld] malloc failed",
246+
cache_manager_->availableBlockNums(),
247+
reserve_block_num_,
248+
new_stream->streamId());
249+
new_stream->tryReleaseKVBlock(new_stream->maxBlockSize() - old_blocks);
250+
return false;
251+
}
252+
}
253+
return false;
238254
}
239255

240256
list<GenerateStreamPtr> FIFOScheduler::scheduleNew(size_t reserve_step) {

0 commit comments

Comments
 (0)