@@ -71,39 +71,31 @@ def get_cpu_kv_cache_stream(self) -> torch.cuda.Stream:
         self.cpu_kv_cache_stream = torch.cuda.Stream()
         return self.cpu_kv_cache_stream

-    def _maybe_alloc_and_copy_req_buffers(self, req_objs: List["InferReq"]) -> None:
-        """
-        For hybrid/linear-attention models (e.g. Qwen3-Next) we allocate a fixed-size buffer per request.
-        If radix cache hits and the matched node has a buffer, copy that buffer content to the newly
-        allocated buffer for this request.
-        """
-        if not self.use_buffer_manager or not req_objs:
-            return
-
+    def _alloc_and_copy_req_buffers(self, req_objs: List["InferReq"]) -> None:
+        # Allocate a buffer for each request; if shared_kv_node is not None, copy the buffer from the radix cache.
         if self.radix_cache is not None:
-            # Ensure enough buffer capacity by evicting radix cache buffers if needed.
             self.radix_cache.free_radix_cache_to_get_enough_buffer(len(req_objs))

-        req_idxs = np.array([r.req_idx for r in req_objs], dtype=np.int64)
-        request_indices_gpu = torch.from_numpy(req_idxs).to(device="cuda", dtype=torch.int64)
+        req_idxs = []
+        copy_indices = []
+        copy_buffers = []
+
+        for r in req_objs:
+            req_idxs.append(r.req_idx)
+            if r.shared_kv_node is not None:
+                copy_indices.append(r.req_idx)
+                copy_buffers.append(r.shared_kv_node.buffer_idx)
+
+        request_indices_gpu = torch.tensor(req_idxs, device="cuda", dtype=torch.int64)
         self.req_manager.alloc_buffer_for_req(request_indices_gpu)

         if self.radix_cache is None:
             return

-        # `shared_kv_node` may be None on cache miss; treat it as "no buffer to copy".
-        buffer_idxs = np.array(
-            [None if r.shared_kv_node is None else r.shared_kv_node.buffer_idx for r in req_objs], dtype=object
-        )
-        mask = buffer_idxs == None  # noqa: E711 (intentional elementwise comparison against None)
-        copy_indices = req_idxs[~mask].tolist()
-        if not copy_indices:
-            return
-
-        copy_buffers = buffer_idxs[~mask].tolist()
-        copy_indices_tensor = torch.tensor(copy_indices, device="cuda", dtype=torch.int64)
-        copy_buffers_tensor = torch.tensor(copy_buffers, device="cuda", dtype=torch.int64)
-        self.req_manager.copy_buffer_from_another_buffer(copy_buffers_tensor, copy_indices_tensor)
+        if copy_indices:
+            copy_indices_tensor = torch.tensor(copy_indices, device="cuda", dtype=torch.int64)
+            copy_buffers_tensor = torch.tensor(copy_buffers, device="cuda", dtype=torch.int64)
+            self.req_manager.copy_buffer_from_another_buffer(copy_buffers_tensor, copy_indices_tensor)

     def add_reqs(self, requests: List[Tuple[int, int, Any, int]], init_prefix_cache: bool = True) -> List["InferReq"]:
         req_objs = []
@@ -143,8 +135,8 @@ def add_reqs(self, requests: List[Tuple[int, int, Any, int]], init_prefix_cache:
             slave_req: InferReq = slave_req
             slave_req.related_master_req = master_req

-        # Hybrid/linear-attention models
-        self._maybe_alloc_and_copy_req_buffers(req_objs)
+        if self.use_buffer_manager and len(req_objs) > 0:
+            self._alloc_and_copy_req_buffers(req_objs)

         return req_objs

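Note on the two hunks above: the rewritten helper replaces the numpy object-array masking with a single pass over the requests that builds the full index list and the copy lists together, so only radix-cache hits trigger a buffer-to-buffer copy, and the caller now guards the call with `use_buffer_manager`. The snippet below is a minimal standalone sketch of that collection pattern only; `ReqStub` and `build_copy_batches` are illustrative placeholders (not part of the code base), and the tensors stay on CPU here, whereas the real helper builds them on `cuda` and hands them to `alloc_buffer_for_req` / `copy_buffer_from_another_buffer`.

```python
from dataclasses import dataclass
from typing import List, Optional, Tuple

import torch


@dataclass
class ReqStub:
    # Placeholder for InferReq: only the two fields the helper reads.
    req_idx: int
    shared_buffer_idx: Optional[int]  # None models a radix-cache miss


def build_copy_batches(reqs: List[ReqStub]) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    req_idxs: List[int] = []
    copy_indices: List[int] = []
    copy_buffers: List[int] = []
    for r in reqs:
        req_idxs.append(r.req_idx)
        if r.shared_buffer_idx is not None:  # only cache hits need a copy
            copy_indices.append(r.req_idx)
            copy_buffers.append(r.shared_buffer_idx)
    # One tensor per batch instead of per-request transfers.
    return (
        torch.tensor(req_idxs, dtype=torch.int64),
        torch.tensor(copy_indices, dtype=torch.int64),
        torch.tensor(copy_buffers, dtype=torch.int64),
    )


if __name__ == "__main__":
    batch = [ReqStub(0, 3), ReqStub(1, None), ReqStub(2, 7)]
    print(build_copy_batches(batch))
    # (tensor([0, 1, 2]), tensor([0, 2]), tensor([3, 7]))
```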
@@ -169,11 +161,11 @@ def free_a_req_mem(self, free_token_index: List, req: "InferReq", free_buffer_in
             if self.use_buffer_manager:
                 buffer_idx = self.req_manager.req_to_buffer_index[req.req_idx].item()
                 if node.buffer_idx is None:
-                    self.radix_cache.set_node_buffer_idx(node, buffer_idx)
+                    self.radix_cache.add_buffer_idx_to_node(node, buffer_idx)
                 else:
                     free_buffer_index.append(buffer_idx)

-            old_prefix_len = 0 if req.shared_kv_node is None else req.shared_kv_node.node_prefix_total_len
+            old_prefix_len = req.shm_req.prompt_cache_len
             free_token_index.append(self.req_manager.req_to_token_indexs[req.req_idx][old_prefix_len:prefix_len])
             if req.shared_kv_node is not None:
                 assert req.shared_kv_node.node_prefix_total_len <= prefix_len
@@ -218,7 +210,6 @@ def _filter(self, finished_request_ids: List[int]):
         self.req_manager.free(free_req_index, free_token_index)

         if self.use_buffer_manager and len(free_buffer_index) != 0:
-            free_buffer_index = torch.tensor(free_buffer_index, dtype=torch.int64, device="cpu")
             self.req_manager.free_buffer(free_buffer_index)

         finished_req_ids_set = set(finished_request_ids)
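The `_filter` hunk drops the caller-side conversion of `free_buffer_index` to a CPU tensor, so `free_buffer` now receives the plain Python list. The diff does not show the manager side of that change; the sketch below only illustrates the general pattern of normalizing the argument inside the pool with `torch.as_tensor`, under the assumption that the real `ReqManager.free_buffer` does something similar. `BufferPoolSketch` is a made-up name for illustration.

```python
from typing import List, Union

import torch


class BufferPoolSketch:
    """Hypothetical stand-in for the buffer-management side of ReqManager."""

    def __init__(self, size: int):
        self.free_mask = torch.ones(size, dtype=torch.bool)

    def free_buffer(self, indices: Union[List[int], torch.Tensor]) -> None:
        # Accept either a plain Python list or a prebuilt tensor and
        # normalize once here, instead of at every call site.
        idx = torch.as_tensor(indices, dtype=torch.int64)
        self.free_mask[idx] = True


if __name__ == "__main__":
    pool = BufferPoolSketch(8)
    pool.free_mask[[1, 4]] = False
    pool.free_buffer([1, 4])              # a plain list works
    pool.free_buffer(torch.tensor([2]))   # a tensor still works
    print(pool.free_mask.all().item())    # True
```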
@@ -278,6 +269,7 @@ def pause_reqs(self, pause_reqs: List["InferReq"], is_master_in_dp: bool):
     def recover_paused_reqs(self, paused_reqs: List["InferReq"], is_master_in_dp: bool, can_alloc_token_num: int):
         if paused_reqs:
             g_infer_state_lock.acquire()
+            recovered_reqs = []
             for req in paused_reqs:
                 prefill_need_token_num = req.get_cur_total_len()
                 if prefill_need_token_num > can_alloc_token_num:
@@ -288,8 +280,10 @@ def recover_paused_reqs(self, paused_reqs: List["InferReq"], is_master_in_dp: bo
                 if is_master_in_dp:
                     req.shm_req.is_paused = False
                 can_alloc_token_num -= prefill_need_token_num
+                recovered_reqs.append(req)

-            self._maybe_alloc_and_copy_req_buffers(paused_reqs)
+            self._alloc_and_copy_req_buffers(recovered_reqs)
+            g_infer_state_lock.release()
         return

     def get_can_alloc_token_num(self):
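In `recover_paused_reqs`, buffers are now allocated and copied only for the requests that were actually recovered within the token budget (collected in `recovered_reqs`), rather than for all paused requests, and the state lock is released before returning. The hunk does not show the branch taken when a request does not fit, so the sketch below simply assumes the loop stops at the first request that exceeds the remaining budget; the names are illustrative stand-ins, not the project's API.

```python
from typing import List, Tuple


def recover_within_budget(paused: List[Tuple[str, int]], budget: int) -> List[str]:
    # paused: (request id, tokens needed for its prefill), in queue order.
    recovered: List[str] = []
    for req_id, need_tokens in paused:
        if need_tokens > budget:
            break  # assumption: the remaining requests stay paused
        budget -= need_tokens
        recovered.append(req_id)
    # Only the recovered subset would get buffers allocated/copied afterwards.
    return recovered


if __name__ == "__main__":
    print(recover_within_budget([("a", 10), ("b", 20), ("c", 40)], budget=35))
    # ['a', 'b'] -- "c" stays paused because the loop stops once the budget is exceeded
```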
@@ -413,14 +407,13 @@ def __init__(
         self.nixl_pd_task_failed_num: int = 0
         self.nixl_trans_device_id: int = -1

+        # When radix cache is enabled, records the cache-hit (prefix match) length; used by the insertion algorithm.
+        self.mamba_model_match_len = 0
+
         # When enable_cpu_cache is enabled, the request's kv cache is offloaded to the cpu cache
         # after the request finishes; this flag tracks the status of that offload task.
         self.cpu_cache_task_status: "InferReq._CpuCacheTaskStatus" = InferReq._CpuCacheTaskStatus.NOT_STARTED

-        # Index of the fixed-size buffer managed over this request's whole lifetime; None means not allocated.
-        # Used by linear-attention models such as Qwen3-Next.
-        self.buffer_idx: int = None
-
         # mtp_step records how many tokens the draft model needs to generate per step for this request.
         # In normal mode this value is 0; in mtp mode it is the number of tokens the draft model generates per step.
         self.mtp_step: int = get_env_start_args().mtp_step
@@ -469,6 +462,7 @@ def _match_radix_cache(self):
             key = torch.tensor(input_token_ids, dtype=torch.int64, device="cpu")
             key = key[0 : len(key) - 1]  # the last token is not needed: an extra token is kept so prefill can output the next token's value
             share_node, kv_len, value_tensor = g_infer_context.radix_cache.match_prefix(key, update_refs=True)
+            self.mamba_model_match_len = kv_len
             if share_node is not None:
                 self.shared_kv_node = share_node
                 ready_cache_len = share_node.node_prefix_total_len