@@ -124,12 +124,16 @@ def calcu_cpu_cache_meta() -> "CpuKVCacheMeta":
124124
125125
126126@lru_cache (maxsize = None )
127- def create_shm_kv_cache_ptr () -> int :
127+ def create_shm_kv_cache_ptr () -> tuple [ int , "AsyncPrefaultHandle" ] :
128128 libc = ctypes .CDLL ("/usr/lib/x86_64-linux-gnu/libc.so.6" , use_errno = True )
129129 libc .shmget .argtypes = (ctypes .c_long , ctypes .c_size_t , ctypes .c_int )
130130 libc .shmget .restype = ctypes .c_int
131131 libc .shmat .argtypes = (ctypes .c_int , ctypes .c_void_p , ctypes .c_int )
132132 libc .shmat .restype = ctypes .c_void_p
133+ libc .madvise .argtypes = (ctypes .c_void_p , ctypes .c_size_t , ctypes .c_int )
134+ libc .madvise .restype = ctypes .c_int
135+ libc .memset .argtypes = (ctypes .c_void_p , ctypes .c_int , ctypes .c_size_t )
136+ libc .memset .restype = ctypes .c_void_p
133137
134138 args = get_env_start_args ()
135139 key = args .cpu_kv_cache_shm_id
@@ -189,7 +193,28 @@ def _get_default_hugepage_size() -> int:
189193 raise Exception ("Error attaching shared memory" )
190194 logger .info (f"Shared cpu kv cache tensor memory at address: { shm_addr } " )
191195
192- return shm_addr
196+ # Best-effort memory prefaulting in background to speed up subsequent cudaHostRegister
197+ def _prefault_memory ():
198+ if not use_hugetlb :
199+ # MADV_HUGEPAGE only makes sense for regular pages with THP, not for SHM_HUGETLB
200+ MADV_HUGEPAGE = 14
201+ ret = libc .madvise (ctypes .c_void_p (shm_addr ), ctypes .c_size_t (size_to_alloc ), MADV_HUGEPAGE )
202+ if ret != 0 :
203+ err = ctypes .get_errno ()
204+ logger .warning (f"madvise(MADV_HUGEPAGE) failed with errno { err } , continue without THP" )
205+ else :
206+ logger .info ("madvise(MADV_HUGEPAGE) succeeded (best-effort THP)" )
207+ else :
208+ logger .debug ("Skipping MADV_HUGEPAGE for SHM_HUGETLB mode (already using hugepages)" )
209+
210+ # memset touches every page so the kernel allocates them ahead of time (prefault)
211+ libc .memset (ctypes .c_void_p (shm_addr ), 0 , ctypes .c_size_t (size_to_alloc ))
212+ logger .info ("prefaulted shared memory pages successfully" )
213+
214+ th = threading .Thread (target = _prefault_memory , name = "cpu_cache_prefault" , daemon = True )
215+ th .start ()
216+
217+ return shm_addr , AsyncPrefaultHandle (thread = th )
193218
194219
195220@dataclasses .dataclass
@@ -306,6 +331,18 @@ def wait(self):
306331 return
307332
308333
class AsyncPrefaultHandle:
    """Wraps the optional background thread that prefaults memory.

    The handle only stores the thread it is given; it never starts it.
    """

    def __init__(self, thread: Optional[threading.Thread]):
        # ``None`` means there is no background work to wait for.
        self.thread = thread

    def wait(self):
        """Block until the wrapped thread finishes, if it is currently running.

        Does nothing when no thread was supplied or when the thread is not
        alive (``is_alive()`` is false), so ``join()`` is only called on a
        running thread.
        """
        worker = self.thread
        if worker is None or not worker.is_alive():
            return
        worker.join()
344+
345+
309346@lru_cache (maxsize = None )
310347def attach_shm_kv_cache_ptr () -> int :
311348 libc = ctypes .CDLL ("/usr/lib/x86_64-linux-gnu/libc.so.6" , use_errno = True )
0 commit comments