
Commit 112f0ee

free_radix_cache_to_get_enough_token instead of skip
1 parent 93f77de commit 112f0ee

File tree

1 file changed: 3 additions, 7 deletions

  • lightllm/server/router/model_infer/mode_backend/dp_backend/impl.py


lightllm/server/router/model_infer/mode_backend/dp_backend/impl.py

Lines changed: 3 additions & 7 deletions
@@ -29,7 +29,7 @@
 from lightllm.common.mem_manager import MemoryManager
 import torch.multiprocessing as mp
 
-min_trans_token_num = os.getenv("MIN_TRANS_TOKEN_NUM", 1)
+min_trans_token_num = os.getenv("MIN_TRANS_TOKEN_NUM", 128)
 
 
 class DPChunkedPrefillBackend(ModeBackend):
@@ -189,12 +189,8 @@ def _post_init_reqs(self, infer_reqs: List[InferReq], other_reqs: List[Tuple] =
         if alloc_size < self.min_trans_token_num:
             return
 
-        # Exit if alloc fails
-        try:
-            mem_indexes = self.model.mem_manager.alloc(alloc_size).cuda()
-        except Exception as e:
-            self.logger.error(f"dp_i {self.dp_rank_in_node} error alloc mem manager: {str(e)}")
-            return
+        g_infer_context.radix_cache.free_radix_cache_to_get_enough_token(alloc_size)
+        mem_indexes = self.model.mem_manager.alloc(alloc_size).cuda()
 
         move_token_indexes = torch.tensor(move_token_indexes, dtype=torch.int64, device="cuda")
         token_dp_indexes = torch.tensor(token_dp_indexes, dtype=torch.int32, device="cuda")
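For context on what the new call does: instead of skipping the token transfer when the memory manager cannot satisfy the allocation (the old try/except path), the backend now first evicts enough entries from the radix (prefix) cache and then allocates. Below is a minimal, hypothetical sketch of that eviction-then-allocate pattern. The SimpleMemManager and SimpleRadixCache classes and their internals are illustrative assumptions, not lightllm's actual MemoryManager or radix cache; only the call order mirrors the new code path.

# A minimal, hypothetical sketch of the eviction-then-allocate pattern
# adopted by this commit. Everything here except the call order and the
# free_radix_cache_to_get_enough_token / alloc method names is an
# illustrative assumption, not lightllm's real implementation.

class SimpleMemManager:
    def __init__(self, total_tokens: int):
        self.free_tokens = list(range(total_tokens))  # indexes of free KV-cache slots

    def can_use_mem_size(self) -> int:
        return len(self.free_tokens)

    def alloc(self, need: int):
        # Caller is expected to have freed enough cache beforehand.
        assert len(self.free_tokens) >= need, "not enough free tokens"
        out, self.free_tokens = self.free_tokens[:need], self.free_tokens[need:]
        return out

    def free(self, idxs):
        self.free_tokens.extend(idxs)


class SimpleRadixCache:
    def __init__(self, mem: SimpleMemManager):
        self.mem = mem
        self.evictable = []  # token-index lists held by unreferenced prefix-cache nodes

    def free_radix_cache_to_get_enough_token(self, need_token_num: int) -> None:
        # Evict cached prefixes until the memory manager can satisfy the
        # requested allocation (or nothing evictable remains).
        while self.mem.can_use_mem_size() < need_token_num and self.evictable:
            self.mem.free(self.evictable.pop(0))


# Usage mirroring the new code path: free the radix cache first, then alloc,
# instead of the old try/alloc/skip-on-failure approach.
mem = SimpleMemManager(total_tokens=8)
cache = SimpleRadixCache(mem)
cache.evictable.append(mem.alloc(6))  # the prefix cache currently pins 6 of 8 slots

alloc_size = 5
cache.free_radix_cache_to_get_enough_token(alloc_size)
mem_indexes = mem.alloc(alloc_size)  # now succeeds: eviction released the slots
print(mem_indexes)

The related change to min_trans_token_num (default 1 -> 128) presumably raises the minimum number of tokens worth transferring, so small allocations are still rejected early by the if alloc_size < self.min_trans_token_num check rather than triggering cache eviction.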
