remove dep code

wangzaijun · wangzaijun · commit e12f8d699093 · 2025-12-19T02:35:03.000Z
diff --git a/lightllm/server/embed_cache/embed_cache_client.py b/lightllm/server/embed_cache/embed_cache_client.py
@@ -5,12 +5,8 @@
 from typing import Optional, List
 from lightllm.utils.envs_utils import get_env_start_args
 from lightllm.utils.log_utils import init_logger
-from lightllm.utils.embed_utils import (
-    calcu_embed_cache_meta,
-    create_shm_embed_cache_ptr,
-    attach_shm_kv_cache_ptr,
-    register_shm_ptr_to_pin,
-)
+from lightllm.utils.embed_utils import calcu_embed_cache_meta
+from lightllm.utils.kv_cache_utils import create_shm_kv_cache_ptr, attach_shm_kv_cache_ptr, register_shm_ptr_to_pin
 
 logger = init_logger(__name__)
 
@@ -52,7 +48,11 @@ def copy_to_cache(self, embed_tensor: torch.Tensor, start_index_in_cache: int):
         )
 
     def _create_shm_embed_kv_cache(self):
-        shm_ptr = create_shm_embed_cache_ptr()
+        shm_ptr = create_shm_kv_cache_ptr(
+            key=self.args.multi_modal_cache_shm_id, size=self.embed_cache_tensor_meta.calcu_size()
+        )
+        handle = register_shm_ptr_to_pin(shm_ptr=shm_ptr, size=self.embed_cache_tensor_meta.calcu_size())
+        handle.wait()
         numpy_array = np.frombuffer(
             memoryview((ctypes.c_uint8 * self.embed_cache_tensor_meta.calcu_size()).from_address(shm_ptr)),
             dtype=np.uint8,
@@ -69,7 +69,9 @@ def _create_shm_embed_kv_cache(self):
         return
 
     def _attach_shm_cpu_embed_cache(self):
-        shm_ptr = attach_shm_kv_cache_ptr()
+        shm_ptr = attach_shm_kv_cache_ptr(
+            key=self.args.multi_modal_cache_shm_id, size=self.embed_cache_tensor_meta.calcu_size()
+        )
         handle = register_shm_ptr_to_pin(shm_ptr=shm_ptr, size=self.embed_cache_tensor_meta.calcu_size())
         handle.wait()
         numpy_array = np.frombuffer(
diff --git a/lightllm/server/multi_level_kv_cache/cpu_cache_client.py b/lightllm/server/multi_level_kv_cache/cpu_cache_client.py
@@ -275,7 +275,9 @@ def _create_cpu_status_list(self, init_shm_data: bool):
         return
 
     def _create_shm_cpu_kv_cache(self):
-        shm_ptr = create_shm_kv_cache_ptr()
+        shm_ptr = create_shm_kv_cache_ptr(
+            key=self.args.cpu_kv_cache_shm_id, size=self.kv_cache_tensor_meta.calcu_size()
+        )
         numpy_array = np.frombuffer(
             memoryview((ctypes.c_uint8 * self.kv_cache_tensor_meta.calcu_size()).from_address(shm_ptr)), dtype=np.uint8
         )
@@ -293,7 +295,9 @@ def _create_shm_cpu_kv_cache(self):
         return
 
     def _attach_shm_cpu_kv_cache(self):
-        shm_ptr = attach_shm_kv_cache_ptr()
+        shm_ptr = attach_shm_kv_cache_ptr(
+            key=self.args.cpu_kv_cache_shm_id, size=self.kv_cache_tensor_meta.calcu_size()
+        )
         handle = register_shm_ptr_to_pin(shm_ptr=shm_ptr, size=self.kv_cache_tensor_meta.calcu_size())
         numpy_array = np.frombuffer(
             memoryview((ctypes.c_uint8 * self.kv_cache_tensor_meta.calcu_size()).from_address(shm_ptr)), dtype=np.uint8
diff --git a/lightllm/utils/embed_utils.py b/lightllm/utils/embed_utils.py
@@ -1,19 +1,9 @@
 import torch
-import ctypes
 import dataclasses
-import os
-import threading
-import time
-import numpy as np
-import triton
 from functools import lru_cache
-from lightllm.utils.envs_utils import get_env_start_args, enable_huge_page, get_llm_data_type
+from lightllm.utils.envs_utils import get_env_start_args, get_llm_data_type
 from lightllm.utils.log_utils import init_logger
 from lightllm.utils.config_utils import get_hidden_size
-from typing import List, Tuple, Optional
-from tqdm import tqdm
-from lightllm.utils.auto_shm_cleanup import register_sysv_shm_for_cleanup
-from lightllm.utils.dist_utils import get_current_device_id
 
 logger = init_logger(__name__)
 
@@ -64,190 +54,3 @@ def calcu_embed_cache_meta() -> "EmbedCacheMeta":
     logger.info(f"embed cache token num: {embed_cache_meta_data.token_num}")
 
     return embed_cache_meta_data
-
-
-@lru_cache(maxsize=None)
-def create_shm_embed_cache_ptr() -> int:
-    libc = ctypes.CDLL("/usr/lib/x86_64-linux-gnu/libc.so.6", use_errno=True)
-    libc.shmget.argtypes = (ctypes.c_long, ctypes.c_size_t, ctypes.c_int)
-    libc.shmget.restype = ctypes.c_int
-    libc.shmat.argtypes = (ctypes.c_int, ctypes.c_void_p, ctypes.c_int)
-    libc.shmat.restype = ctypes.c_void_p
-
-    args = get_env_start_args()
-    key = args.multi_modal_cache_shm_id
-    requested_size = calcu_embed_cache_meta().calcu_size()
-    use_hugetlb = enable_huge_page()
-
-    # 计算大页大小（默认从 /proc/meminfo 读取 Hugepagesize）
-    def _get_default_hugepage_size() -> int:
-        try:
-            with open("/proc/meminfo", "r") as f:
-                for line in f:
-                    if line.startswith("Hugepagesize:"):
-                        parts = line.split()
-                        if len(parts) >= 2:
-                            kb = int(parts[1])
-                            return kb * 1024
-        except Exception:
-            pass
-        return 2 * 1024 * 1024  # fallback 2MB
-
-    shmflg = 0o666 | 0o1000  # 权限和 IPC_CREAT 标志
-    if use_hugetlb:
-        # 向上对齐到大页大小
-        huge_sz = _get_default_hugepage_size()
-        size_to_alloc = triton.cdiv(requested_size, huge_sz) * huge_sz
-        SHM_HUGETLB = 0o4000
-        shmflg |= SHM_HUGETLB
-        logger.info(
-            f"Using SHM_HUGETLB, hugepage_size={huge_sz} bytes, requested={requested_size}, alloc={size_to_alloc}"
-        )
-    else:
-        size_to_alloc = requested_size
-        logger.info(f"Using regular pages, requested={requested_size}, alloc={size_to_alloc}")
-
-    shmid = libc.shmget(key, size_to_alloc, shmflg)
-    hugepages_num = (size_to_alloc + 1024 * 1024 * 1024 - 1) // (1024 * 1024 * 1024)
-    if shmid < 0:
-        err = ctypes.get_errno()
-        if use_hugetlb:
-            raise Exception(
-                f"shmget with SHM_HUGETLB failed (errno={err}). Falling back to regular pages."
-                f"You may need to configure hugepages manually, e.g.,"
-                f"sudo sed -i 's/^GRUB_CMDLINE_LINUX=\"/& default_hugepagesz=1G \
-                    hugepagesz=1G hugepages={hugepages_num}/' /etc/default/grub"
-                f"sudo update-grub"
-                f"sudo reboot"
-            )
-        else:
-            raise Exception(f"Error creating regular shared memory (errno={err})")
-
-    register_sysv_shm_for_cleanup(key, shmid)
-    logger.info(f"Shared memory ID: {shmid}")
-
-    # 附加共享内存
-    shm_addr = libc.shmat(shmid, ctypes.c_void_p(0), 0)
-    if shm_addr == ctypes.c_void_p(-1).value:
-        raise Exception("Error attaching shared memory")
-    logger.info(f"Shared cpu kv cache tensor memory at address: {shm_addr}")
-
-    # Best-effort memory prefaulting in background to speed up subsequent cudaHostRegister
-    def _pre_warm_memory():
-        page_size = _get_default_hugepage_size() if use_hugetlb else 4096
-        arr = np.ctypeslib.as_array(ctypes.cast(shm_addr, ctypes.POINTER(ctypes.c_uint8)), shape=(size_to_alloc,))
-        volatile_sum = int(arr[::page_size].sum())
-        logger.info(f"pre warmed shared memory pages successfully, checksum={volatile_sum})")
-
-    th = threading.Thread(target=_pre_warm_memory, name="cpu_cache_pre_warm", daemon=True)
-    th.start()
-
-    return shm_addr
-
-
-@lru_cache(maxsize=None)
-def register_shm_ptr_to_pin(shm_ptr: int, size: int) -> "AsyncRegistrationHandle":
-    """Start async cudaHostRegister on the given [shm_ptr, shm_ptr+size) and return a handle."""
-    chunk_bytes = 128 * 1024 * 1024  # 128M性能最好
-    tasks: list[tuple[int, int]] = []
-    offset = 0
-    while offset < size:
-        seg_len = min(chunk_bytes, size - offset)
-        tasks.append((offset, seg_len))
-        offset += seg_len
-
-    handle = AsyncRegistrationHandle(total_tasks=len(tasks))
-
-    def _worker():
-        cuda = ctypes.CDLL("/usr/local/cuda/targets/x86_64-linux/lib/libcudart.so")
-        cuda.cudaHostRegister.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_uint]
-        cuda.cudaHostRegister.restype = ctypes.c_int
-        cuda.cudaHostGetDevicePointer.argtypes = [ctypes.POINTER(ctypes.c_void_p), ctypes.c_void_p, ctypes.c_int]
-        cuda.cudaHostGetDevicePointer.restype = ctypes.c_int
-
-        cudaHostRegisterFlag = 3
-
-        torch.cuda.set_device(get_current_device_id())
-        # TODO 这个地方的分块注册是否具备合法性和合理性。
-        for offset, seg_len in tasks:
-            ptr = ctypes.c_void_p(shm_ptr + offset)
-            r = cuda.cudaHostRegister(ptr, ctypes.c_size_t(seg_len), cudaHostRegisterFlag)
-            if r != 0:
-                raise Exception(f"cudaHostRegister failed with error code {r}, prefer to use hugetlb")
-            handle.task_count += 1
-
-        device_ptr = ctypes.c_void_p()
-        host_ptr = ctypes.c_void_p(shm_ptr)
-        res = cuda.cudaHostGetDevicePointer(ctypes.byref(device_ptr), host_ptr, 0)
-        if res != 0:
-            raise Exception(f"cudaHostGetDevicePointer failed with error code {res}")
-        assert host_ptr.value == device_ptr.value
-        handle.tasks_finished.set()
-
-    th = threading.Thread(target=_worker, name="cpu_cache_register", daemon=True)
-    handle.thread = th
-    th.start()
-    return handle
-
-
-class AsyncRegistrationHandle:
-    """A handle for async host memory registration.
-
-    - wait(): blocks until registration finishes, prints tqdm progress, and returns device pointer (int).
-    """
-
-    def __init__(self, total_tasks: int):
-        self.total_tasks = total_tasks
-        self.task_count = 0
-        self.thread: Optional[threading.Thread] = None
-        self.tasks_finished = threading.Event()
-
-    def wait(self):
-        """Block until the async registration completes. Only here we print tqdm progress."""
-        last_count = 0
-        desc = f"pid {os.getpid()} Registering pinned host memory (async)"
-        with tqdm(total=self.total_tasks, desc=desc) as pbar:
-            while not self.tasks_finished.is_set():
-                cur = self.task_count
-                if cur > last_count:
-                    pbar.update(cur - last_count)
-                    last_count = cur
-                time.sleep(0.01)
-            # final update
-            cur = self.task_count
-            if cur > last_count:
-                pbar.update(cur - last_count)
-                last_count = cur
-
-        if self.thread is not None and self.thread.is_alive():
-            self.thread.join()
-
-        return
-
-
-@lru_cache(maxsize=None)
-def attach_shm_kv_cache_ptr() -> int:
-    libc = ctypes.CDLL("/usr/lib/x86_64-linux-gnu/libc.so.6", use_errno=True)
-    libc.shmget.argtypes = (ctypes.c_long, ctypes.c_size_t, ctypes.c_int)
-    libc.shmget.restype = ctypes.c_int
-    libc.shmat.argtypes = (ctypes.c_int, ctypes.c_void_p, ctypes.c_int)
-    libc.shmat.restype = ctypes.c_void_p
-
-    # Try to locate an existing SHM without creating a new one
-    args = get_env_start_args()
-    key = args.multi_modal_cache_shm_id
-    shmid = libc.shmget(key, 0, 0)
-    if shmid < 0:
-        size = calcu_embed_cache_meta().calcu_size()
-        shmid = libc.shmget(key, size, 0)
-    if shmid < 0:
-        err = ctypes.get_errno()
-        raise Exception(f"Error locating existing shared memory (errno={err})")
-
-    shm_addr = libc.shmat(shmid, ctypes.c_void_p(0), 0)
-    if shm_addr == ctypes.c_void_p(-1).value:
-        err = ctypes.get_errno()
-        raise Exception(f"Error attaching shared memory (errno={err})")
-
-    logger.info(f"Attached to SHM key={key}, shmid={shmid}, addr={shm_addr}")
-    return shm_addr
diff --git a/lightllm/utils/kv_cache_utils.py b/lightllm/utils/kv_cache_utils.py