
Commit 112f0ee

free_radix_cache_to_get_enough_token instead of skip
1 parent 93f77de commit 112f0ee

File tree

1 file changed: 3 additions, 7 deletions

  • lightllm/server/router/model_infer/mode_backend/dp_backend/impl.py


lightllm/server/router/model_infer/mode_backend/dp_backend/impl.py

Lines changed: 3 additions & 7 deletions
@@ -29,7 +29,7 @@
 from lightllm.common.mem_manager import MemoryManager
 import torch.multiprocessing as mp
 
-min_trans_token_num = os.getenv("MIN_TRANS_TOKEN_NUM", 1)
+min_trans_token_num = os.getenv("MIN_TRANS_TOKEN_NUM", 128)
 
 
 class DPChunkedPrefillBackend(ModeBackend):
@@ -189,12 +189,8 @@ def _post_init_reqs(self, infer_reqs: List[InferReq], other_reqs: List[Tuple] =
         if alloc_size < self.min_trans_token_num:
             return
 
-        # Exit if alloc fails
-        try:
-            mem_indexes = self.model.mem_manager.alloc(alloc_size).cuda()
-        except Exception as e:
-            self.logger.error(f"dp_i {self.dp_rank_in_node} error alloc mem manager: {str(e)}")
-            return
+        g_infer_context.radix_cache.free_radix_cache_to_get_enough_token(alloc_size)
+        mem_indexes = self.model.mem_manager.alloc(alloc_size).cuda()
 
         move_token_indexes = torch.tensor(move_token_indexes, dtype=torch.int64, device="cuda")
         token_dp_indexes = torch.tensor(token_dp_indexes, dtype=torch.int32, device="cuda")
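For context on what the new call does: instead of skipping the token transfer when the memory manager cannot satisfy the allocation (the old try/except path), the backend now first evicts enough entries from the radix (prefix) cache and then allocates. Below is a minimal, hypothetical sketch of that eviction-then-allocate pattern. The SimpleMemManager and SimpleRadixCache classes and their internals are illustrative assumptions, not lightllm's actual MemoryManager or radix cache; only the call order mirrors the new code path.

# A minimal, hypothetical sketch of the eviction-then-allocate pattern
# adopted by this commit. Everything here except the call order and the
# free_radix_cache_to_get_enough_token / alloc method names is an
# illustrative assumption, not lightllm's real implementation.

class SimpleMemManager:
    def __init__(self, total_tokens: int):
        self.free_tokens = list(range(total_tokens))  # indexes of free KV-cache slots

    def can_use_mem_size(self) -> int:
        return len(self.free_tokens)

    def alloc(self, need: int):
        # Caller is expected to have freed enough cache beforehand.
        assert len(self.free_tokens) >= need, "not enough free tokens"
        out, self.free_tokens = self.free_tokens[:need], self.free_tokens[need:]
        return out

    def free(self, idxs):
        self.free_tokens.extend(idxs)


class SimpleRadixCache:
    def __init__(self, mem: SimpleMemManager):
        self.mem = mem
        self.evictable = []  # token-index lists held by unreferenced prefix-cache nodes

    def free_radix_cache_to_get_enough_token(self, need_token_num: int) -> None:
        # Evict cached prefixes until the memory manager can satisfy the
        # requested allocation (or nothing evictable remains).
        while self.mem.can_use_mem_size() < need_token_num and self.evictable:
            self.mem.free(self.evictable.pop(0))


# Usage mirroring the new code path: free the radix cache first, then alloc,
# instead of the old try/alloc/skip-on-failure approach.
mem = SimpleMemManager(total_tokens=8)
cache = SimpleRadixCache(mem)
cache.evictable.append(mem.alloc(6))  # the prefix cache currently pins 6 of 8 slots

alloc_size = 5
cache.free_radix_cache_to_get_enough_token(alloc_size)
mem_indexes = mem.alloc(alloc_size)  # now succeeds: eviction released the slots
print(mem_indexes)

The related change to min_trans_token_num (default 1 -> 128) presumably raises the minimum number of tokens worth transferring, so small allocations are still rejected early by the if alloc_size < self.min_trans_token_num check rather than triggering cache eviction.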
