File tree Expand file tree Collapse file tree 1 file changed +2
-3
lines changed
Expand file tree Collapse file tree 1 file changed +2
-3
lines changed Original file line number Diff line number Diff line change @@ -106,13 +106,12 @@ def __init__(self, max_request_num):
106106 "LIGHTLLM_ENABLE_GPU_BUFFER_FOR_OUT_TOKEN_ID_COUNTER"
107107 )
108108         self.vocab_size = get_vocab_size(get_env_start_args().model_dir)
109-         self.req_to_next_token_id = torch.zeros(max_request_num + 1, dtype=torch.int64, device="cuda")
110109         self.req_to_presence_penalty = torch.zeros(max_request_num + 1, dtype=torch.float32, device="cuda")
111110         self.req_to_frequency_penalty = torch.zeros(max_request_num + 1, dtype=torch.float32, device="cuda")
112111         self.req_to_repetition_penalty = torch.zeros(max_request_num + 1, dtype=torch.float32, device="cuda")
113112         self.req_to_next_token_ids = torch.zeros(
114113             (max_request_num + 1, 8),
115-             dtype=torch.int32,
114+             dtype=torch.int64,
116115             device="cuda",
117116         )
118117 self .req_to_exponential_decay_length_penalty = torch .zeros (
@@ -143,7 +142,7 @@ def init_req_sampling_params(self, req):
143142         req: InferReq = req
144143
145144         shm_param = req.sampling_param.shm_param
146-         self.req_to_next_token_id[req.req_idx].fill_(req.get_last_gen_token())
145+         self.req_to_next_token_ids[req.req_idx][0:1].fill_(req.get_last_gen_token())
147146         self.req_to_presence_penalty[req.req_idx].fill_(shm_param.presence_penalty)
148147         self.req_to_frequency_penalty[req.req_idx].fill_(shm_param.frequency_penalty)
149148         self.req_to_repetition_penalty[req.req_idx].fill_(shm_param.repetition_penalty)
You can’t perform that action at this time.
0 commit comments