Commit 7c1a597: improve pin mem manager
Parent: 4157ff4

File tree: 3 files changed, +39 / -20 lines

lightllm/server/router/model_infer/mode_backend/chunked_prefill/impl.py

Lines changed: 14 additions & 10 deletions
@@ -103,19 +103,23 @@ def prefill_normal(
         next_token_ids, next_token_logprobs = sample(logits, run_reqs, self.eos_id)
 
         scatter_token(
-            next_token_ids,
-            self.model.req_manager.req_sampling_params_manager.req_to_next_token_ids,
-            model_input.b_req_idx,
-            model_input.b_mtp_index,
+            next_token_ids=next_token_ids,
+            req_to_next_token_ids=self.model.req_manager.req_sampling_params_manager.req_to_next_token_ids,
+            b_req_idx=model_input.b_req_idx,
+            b_mtp_index=model_input.b_mtp_index,
+            b_has_out=g_pin_mem_manager.gen_from_list(
+                key="b_has_out", data=model_input.b_prefill_has_output_cpu, dtype=torch.bool
+            ).cuda(non_blocking=True),
         )
-        next_token_ids_cpu = g_pin_mem_manager.alloc_pin_tensor(
-            "next_token_ids", next_token_ids.shape[0], next_token_ids.dtype
+        next_token_ids_cpu = g_pin_mem_manager.async_copy_from_gpu_tensor(
+            key="next_token_ids",
+            gpu_tensor=next_token_ids,
         )
-        next_token_logprobs_cpu = g_pin_mem_manager.alloc_pin_tensor(
-            "next_token_logprobs", next_token_logprobs.shape[0], next_token_logprobs.dtype
+        next_token_logprobs_cpu = g_pin_mem_manager.async_copy_from_gpu_tensor(
+            key="next_token_logprobs",
+            gpu_tensor=next_token_logprobs,
         )
-        next_token_ids_cpu.copy_(next_token_ids, non_blocking=True)
-        next_token_logprobs_cpu.copy_(next_token_logprobs, non_blocking=True)
+
         sync_event = torch.cuda.Event()
         sync_event.record()
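The hunk does two things: scatter_token now receives a b_has_out mask staged in pinned memory via gen_from_list, and the manual alloc_pin_tensor plus copy_ pair for the sampled outputs is folded into async_copy_from_gpu_tensor. A minimal sketch of the pinned-upload half of that pattern, with illustrative names and data rather than lightllm code:

import torch

# Per-request output flags that have to reach the GPU (illustrative data, not lightllm's).
flags = [True, False, True]

# Stage the list in page-locked (pinned) host memory first ...
pinned = torch.empty(len(flags), dtype=torch.bool, pin_memory=True)
pinned.numpy()[:] = flags

# ... so the host-to-device copy can be issued asynchronously on the current stream.
b_has_out = pinned.cuda(non_blocking=True)

Pinning matters because an async host-to-device copy can only overlap with compute when the host buffer is page-locked; a non-blocking copy from ordinary pageable memory falls back to a synchronizing path.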

lightllm/server/router/model_infer/mode_backend/diverse_backend/impl.py

Lines changed: 12 additions & 9 deletions
@@ -63,13 +63,16 @@ def beam_prefill(self, event_pack: OverlapEventPack, prefill_reqs: List[InferReq
         b_req_idx = [req.req_idx for req in run_reqs]
         b_has_out = [model_input.b_prefill_has_output_cpu[i] for i in batch_idx]
 
-        batch_idx = torch.tensor(batch_idx, dtype=torch.int64, device="cpu", pin_memory=True).cuda(
+        batch_idx = g_pin_mem_manager.gen_from_list(key="batch_idx_", data=batch_idx, dtype=torch.int64).cuda(
             non_blocking=True
         )
-        b_req_idx = torch.tensor(b_req_idx, dtype=torch.int32, device="cpu", pin_memory=True).cuda(
+        b_req_idx = g_pin_mem_manager.gen_from_list(key="b_req_idx_", data=b_req_idx, dtype=torch.int32).cuda(
             non_blocking=True
         )
-        b_has_out = torch.tensor(b_has_out, dtype=torch.bool, device="cpu", pin_memory=True).cuda(non_blocking=True)
+        b_has_out = g_pin_mem_manager.gen_from_list(key="b_has_out_", data=b_has_out, dtype=torch.bool).cuda(
+            non_blocking=True
+        )
+
         logits = logits[batch_idx]
         b_mtp_index = model_input.b_mtp_index[batch_idx]
 
@@ -83,14 +86,14 @@ def beam_prefill(self, event_pack: OverlapEventPack, prefill_reqs: List[InferReq
             b_has_out=b_has_out,
         )
 
-        next_token_ids_cpu = g_pin_mem_manager.alloc_pin_tensor(
-            "next_token_ids", next_token_ids.shape[0], next_token_ids.dtype
+        next_token_ids_cpu = g_pin_mem_manager.async_copy_from_gpu_tensor(
+            key="next_token_ids",
+            gpu_tensor=next_token_ids,
         )
-        next_token_logprobs_cpu = g_pin_mem_manager.alloc_pin_tensor(
-            "next_token_logprobs", next_token_logprobs.shape[0], next_token_logprobs.dtype
+        next_token_logprobs_cpu = g_pin_mem_manager.async_copy_from_gpu_tensor(
+            key="next_token_logprobs",
+            gpu_tensor=next_token_logprobs,
         )
-        next_token_ids_cpu.copy_(next_token_ids, non_blocking=True)
-        next_token_logprobs_cpu.copy_(next_token_logprobs, non_blocking=True)
         sync_event = torch.cuda.Event()
         sync_event.record()
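The diverse backend gets the same treatment: the ad-hoc torch.tensor(..., pin_memory=True) staging and the trailing copy_(..., non_blocking=True) calls are routed through the manager. The contract on the download side is unchanged: the pinned CPU buffer becomes valid only after the recorded CUDA event completes. A small sketch of that pattern with assumed variable names, not lightllm code:

import torch

# Freshly sampled ids living on the GPU (illustrative).
next_token_ids = torch.randint(0, 32000, (8,), device="cuda")

# Device-to-host copy into a pinned buffer; copy_ returns before the data has landed.
cpu_buf = torch.empty(next_token_ids.shape, dtype=next_token_ids.dtype, pin_memory=True)
cpu_buf.copy_(next_token_ids, non_blocking=True)

# Record an event right after the copy is enqueued ...
sync_event = torch.cuda.Event()
sync_event.record()

# ... and wait on it before the host reads the buffer.
sync_event.synchronize()
token_ids = cpu_buf.tolist()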

lightllm/server/router/model_infer/pin_mem_manager.py

Lines changed: 13 additions & 1 deletion
@@ -11,7 +11,7 @@ def __init__(self):
         self.key_to_alloc_index: Dict[str, int] = {}
         self.buffer_size = 4
 
-    def alloc_pin_tensor(self, key: str, size: int, dtype: torch.dtype):
+    def alloc_pin_tensor(self, key: str, size: int, dtype: torch.dtype) -> torch.Tensor:
         """
         Use a cache of buffer_size pinned-memory buffers to speed up allocating and releasing pinned memory.
         """
@@ -34,5 +34,17 @@ def alloc_pin_tensor(self, key: str, size: int, dtype: torch.dtype):
         self.key_to_alloc_index[key] = (alloc_index + 1) % self.buffer_size
         return buff_tensor[0:size]
 
+    def gen_from_list(self, key: str, data: List, dtype: torch.dtype) -> torch.Tensor:
+        size = len(data)
+        pin_mem = self.alloc_pin_tensor(key, size=size, dtype=dtype)
+        pin_mem.numpy()[:] = data
+        return pin_mem
+
+    def async_copy_from_gpu_tensor(self, key: str, gpu_tensor: torch.Tensor) -> torch.Tensor:
+        size = gpu_tensor.numel()
+        pin_mem = self.alloc_pin_tensor(key, size=size, dtype=gpu_tensor.dtype)
+        pin_mem.copy_(gpu_tensor.view(-1), non_blocking=True)
+        return pin_mem.view(gpu_tensor.shape)
+
 
 g_pin_mem_manager = PinMemTensorManager()
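Together the two new helpers sit on top of the manager's per-key ring of reusable pinned buffers: gen_from_list stages a Python list in pinned memory for an asynchronous upload, and async_copy_from_gpu_tensor stages a GPU tensor in pinned memory for the host. A usage sketch; the event handling mirrors the backend code above and is an assumption, not part of this file:

import torch
from lightllm.server.router.model_infer.pin_mem_manager import g_pin_mem_manager

# Host list -> pinned CPU tensor -> asynchronous upload to the GPU.
b_has_out = g_pin_mem_manager.gen_from_list(
    key="b_has_out", data=[True, False, True], dtype=torch.bool
).cuda(non_blocking=True)

# GPU tensor -> pinned CPU tensor (asynchronous download).
next_token_ids = torch.randint(0, 32000, (3,), device="cuda")
next_token_ids_cpu = g_pin_mem_manager.async_copy_from_gpu_tensor(
    key="next_token_ids", gpu_tensor=next_token_ids
)

# The pinned result is only safe to read once the enqueued copy has finished.
sync_event = torch.cuda.Event()
sync_event.record()
sync_event.synchronize()
print(next_token_ids_cpu.tolist())

Because alloc_pin_tensor cycles through buffer_size (here 4) buffers per key in round-robin order, a returned pinned tensor has to be consumed before its key wraps back around to the same slot.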
