@@ -302,7 +302,6 @@ def _prefill(
         infer_state.mem_manager = self.mem_manager
         infer_state.req_manager = self.req_manager
 
-        infer_state.mem_is_contiguous = False
         infer_state.mem_index = mem_indexes
         infer_state.kv_buffer = torch.empty(
             (input_ids.shape[0], self.tp_k_head_num_ + self.tp_v_head_num_, self.head_dim_),
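
Note: with mem_is_contiguous gone, prefill always writes the freshly computed K/V into pool slots addressed through mem_index, which need not be contiguous. A minimal sketch of that indexed write; the pool layout and the write_kv helper are illustrative assumptions, not lightllm's actual API:

import torch

def write_kv(kv_pool, mem_index, kv_buffer):
    # kv_pool:   [pool_size, k_heads + v_heads, head_dim] global KV cache (assumed layout)
    # mem_index: [num_tokens] slots handed out by the mem_manager; may be scattered
    # kv_buffer: [num_tokens, k_heads + v_heads, head_dim] K/V computed for this step
    kv_pool.index_copy_(0, mem_index, kv_buffer)

pool = torch.zeros(1024, 16, 128)
idx = torch.tensor([7, 3, 512])   # deliberately non-contiguous slots
write_kv(pool, idx, torch.randn(3, 16, 128))
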
@@ -351,9 +350,6 @@ def _decode(
         infer_state.mem_manager = self.mem_manager
         infer_state.req_manager = self.req_manager
 
-        # When the cuda graph feature is used, every inference run must follow an identical flow,
-        # so the optimization from allocating contiguous mem is dropped to keep that flow consistent.
-        infer_state.mem_is_contiguous = False
         infer_state.mem_index = mem_indexes
         infer_state.kv_buffer = torch.empty(
             (batch_size, self.tp_k_head_num_ + self.tp_v_head_num_, self.head_dim_),
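
The deleted comment explains the motivation: CUDA graph replay re-issues a recorded kernel sequence verbatim, so any Python-level branch evaluated at capture time (such as one on mem_is_contiguous) freezes a single code path into the graph. A minimal capture/replay sketch, with a hypothetical decode_step standing in for the decode forward pass:

import torch

def decode_step(inp):
    return inp * 2 + 1  # stand-in for one decode forward pass

static_inp = torch.zeros(8, 128, device="cuda")

# Warm up on a side stream before capture, per the PyTorch CUDA graph recipe.
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s):
    decode_step(static_inp)
torch.cuda.current_stream().wait_stream(s)

g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
    static_out = decode_step(static_inp)

# On replay only the data in the static tensors may change; the op sequence cannot.
static_inp.copy_(torch.randn(8, 128, device="cuda"))
g.replay()  # static_out now holds results for the new input
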
@@ -398,9 +394,6 @@ def create_inferstate(cur_batch: DecodeMicroBatch, batch_index):
             infer_state.mem_manager = self.mem_manager
             infer_state.req_manager = self.req_manager
 
-            # When the cuda graph feature is used, every inference run must follow an identical flow,
-            # so the optimization from allocating contiguous mem is dropped to keep that flow consistent.
-            infer_state.mem_is_contiguous = False
             infer_state.mem_index = cur_batch.mem_indexes
             infer_state.kv_buffer = torch.empty(
                 (cur_batch.batch_size, self.tp_k_head_num_ + self.tp_v_head_num_, self.head_dim_),
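
Keeping infer_state.kv_buffer = torch.empty(...) inside the per-step path remains graph-safe: allocations made while a graph is being captured are served from the graph's private memory pool, so replays reuse the same addresses. A small sketch of that behavior (illustrative, not lightllm code):

import torch

x = torch.ones(4, 4, device="cuda")
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
    buf = torch.empty(4, 4, device="cuda")  # drawn from the graph's private pool
    buf.copy_(x * 3)
g.replay()  # buf is rewritten at the same address on every replay
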
@@ -475,9 +468,6 @@ def create_inferstate(cur_batch: PrefillMicroBatch, batch_index):
             infer_state.mem_manager = self.mem_manager
             infer_state.req_manager = self.req_manager
 
-            # When the cuda graph feature is used, every inference run must follow an identical flow,
-            # so the optimization from allocating contiguous mem is dropped to keep that flow consistent.
-            infer_state.mem_is_contiguous = False
             infer_state.mem_index = cur_batch.mem_indexes
             infer_state.kv_buffer = torch.empty(
                 (cur_batch.input_ids.shape[0], self.tp_k_head_num_ + self.tp_v_head_num_, self.head_dim_),