@@ -467,7 +467,11 @@ def update_cache_blocks(self, task, block_size, num_computed_tokens):
467
467
block_tables = task .block_tables
468
468
469
469
last_node , num_cached_tokens = self .cache_info [req_id ]
470
- input_ids = task .prompt_token_ids + task .output_token_ids
470
+ if isinstance (task .prompt_token_ids , np .ndarray ):
471
+ prompt_token_ids = task .prompt_token_ids .tolist ()
472
+ else :
473
+ prompt_token_ids = task .prompt_token_ids
474
+ input_ids = prompt_token_ids + task .output_token_ids
471
475
can_cache_computed_tokens = num_computed_tokens - num_computed_tokens % block_size
472
476
left_input_ids = input_ids [num_cached_tokens :can_cache_computed_tokens ]
473
477
gpu_extra_block_ids = block_tables [num_cached_tokens // block_size :]
@@ -517,7 +521,11 @@ def request_match_blocks(self, task, block_size, *args):
517
521
hit_info ["gpu_cache_blocks" ] = 0
518
522
hit_info ["cpu_cache_blocks" ] = 0
519
523
self .metrics .req_count += 1
520
- input_ids = task .prompt_token_ids + task .output_token_ids
524
+ if isinstance (task .prompt_token_ids , np .ndarray ):
525
+ prompt_token_ids = task .prompt_token_ids .tolist ()
526
+ else :
527
+ prompt_token_ids = task .prompt_token_ids
528
+ input_ids = prompt_token_ids + task .output_token_ids
521
529
req_id = task .request_id
522
530
logger .info (f"request_match_blocks: start to allocate blocks for req_id { req_id } " )
523
531
input_token_num = len (input_ids )
0 commit comments