Skip to content

Commit e074439

Browse files
author
niushengxiao
committed
feat: reduce startup time for cpu cache
1 parent aff4049 commit e074439

File tree

3 files changed

+45
-6
lines changed

3 files changed

+45
-6
lines changed

lightllm/server/multi_level_kv_cache/cpu_cache_client.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,7 @@ def __init__(self, only_create_meta_data: bool, init_shm_data: bool):
3131

3232
if not only_create_meta_data:
3333
if init_shm_data:
34-
self._create_shm_cpu_kv_cache()
35-
self.attach_shm_handle = None
34+
self.attach_shm_handle = self._create_shm_cpu_kv_cache()
3635
else:
3736
self.attach_shm_handle = self._attach_shm_cpu_kv_cache()
3837
return
@@ -275,7 +274,7 @@ def _create_cpu_status_list(self, init_shm_data: bool):
275274
return
276275

277276
def _create_shm_cpu_kv_cache(self):
278-
shm_ptr = create_shm_kv_cache_ptr()
277+
shm_ptr, prefault_handle = create_shm_kv_cache_ptr()
279278
numpy_array = np.frombuffer(
280279
memoryview((ctypes.c_uint8 * self.kv_cache_tensor_meta.calcu_size()).from_address(shm_ptr)), dtype=np.uint8
281280
)
@@ -290,7 +289,7 @@ def _create_shm_cpu_kv_cache(self):
290289
self.cpu_kv_cache_tensor = (
291290
torch.from_numpy(numpy_array).view(dtype=self.kv_cache_tensor_meta.data_type).view(shape)
292291
)
293-
return
292+
return prefault_handle
294293

295294
def _attach_shm_cpu_kv_cache(self):
296295
shm_ptr = attach_shm_kv_cache_ptr()

lightllm/server/multi_level_kv_cache/manager.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,9 @@ def __init__(
6161
return
6262

6363
def cpu_cache_hanle_loop(self):
64+
if self.cpu_cache_client.attach_shm_handle is not None:
65+
self.cpu_cache_client.attach_shm_handle.wait()
66+
6467
while True:
6568
try:
6669
current_group_req = self.recv_queue.get()

lightllm/utils/kv_cache_utils.py

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -124,12 +124,16 @@ def calcu_cpu_cache_meta() -> "CpuKVCacheMeta":
124124

125125

126126
@lru_cache(maxsize=None)
127-
def create_shm_kv_cache_ptr() -> int:
127+
def create_shm_kv_cache_ptr() -> tuple[int, "AsyncPrefaultHandle"]:
128128
libc = ctypes.CDLL("/usr/lib/x86_64-linux-gnu/libc.so.6", use_errno=True)
129129
libc.shmget.argtypes = (ctypes.c_long, ctypes.c_size_t, ctypes.c_int)
130130
libc.shmget.restype = ctypes.c_int
131131
libc.shmat.argtypes = (ctypes.c_int, ctypes.c_void_p, ctypes.c_int)
132132
libc.shmat.restype = ctypes.c_void_p
133+
libc.madvise.argtypes = (ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int)
134+
libc.madvise.restype = ctypes.c_int
135+
libc.memset.argtypes = (ctypes.c_void_p, ctypes.c_int, ctypes.c_size_t)
136+
libc.memset.restype = ctypes.c_void_p
133137

134138
args = get_env_start_args()
135139
key = args.cpu_kv_cache_shm_id
@@ -189,7 +193,28 @@ def _get_default_hugepage_size() -> int:
189193
raise Exception("Error attaching shared memory")
190194
logger.info(f"Shared cpu kv cache tensor memory at address: {shm_addr}")
191195

192-
return shm_addr
196+
# Best-effort memory prefaulting in background to speed up subsequent cudaHostRegister
197+
def _prefault_memory():
198+
if not use_hugetlb:
199+
# MADV_HUGEPAGE only makes sense for regular pages with THP, not for SHM_HUGETLB
200+
MADV_HUGEPAGE = 14
201+
ret = libc.madvise(ctypes.c_void_p(shm_addr), ctypes.c_size_t(size_to_alloc), MADV_HUGEPAGE)
202+
if ret != 0:
203+
err = ctypes.get_errno()
204+
logger.warning(f"madvise(MADV_HUGEPAGE) failed with errno {err}, continue without THP")
205+
else:
206+
logger.info("madvise(MADV_HUGEPAGE) succeeded (best-effort THP)")
207+
else:
208+
logger.debug("Skipping MADV_HUGEPAGE for SHM_HUGETLB mode (already using hugepages)")
209+
210+
# memset to pre-fault the pages (forces physical allocation up front)
211+
libc.memset(ctypes.c_void_p(shm_addr), 0, ctypes.c_size_t(size_to_alloc))
212+
logger.info("prefaulted shared memory pages successfully")
213+
214+
th = threading.Thread(target=_prefault_memory, name="cpu_cache_prefault", daemon=True)
215+
th.start()
216+
217+
return shm_addr, AsyncPrefaultHandle(thread=th)
193218

194219

195220
@dataclasses.dataclass
@@ -306,6 +331,18 @@ def wait(self):
306331
return
307332

308333

334+
class AsyncPrefaultHandle:
335+
"""A handle for async memory prefaulting."""
336+
337+
def __init__(self, thread: Optional[threading.Thread]):
338+
self.thread = thread
339+
340+
def wait(self):
341+
if self.thread is not None and self.thread.is_alive():
342+
self.thread.join()
343+
return
344+
345+
309346
@lru_cache(maxsize=None)
310347
def attach_shm_kv_cache_ptr() -> int:
311348
libc = ctypes.CDLL("/usr/lib/x86_64-linux-gnu/libc.so.6", use_errno=True)

0 commit comments

Comments
 (0)