Skip to content

Commit 4d251c7

Browse files
committed
add a more safe share memory create way.
1 parent f294e53 commit 4d251c7

File tree

1 file changed

+31
-25
lines changed

1 file changed

+31
-25
lines changed

lightllm/utils/shm_utils.py

Lines changed: 31 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,46 +1,52 @@
1-
import time
21
from multiprocessing import shared_memory
32
import logging
4-
import random
3+
import os
4+
import fcntl
55

66
logger = logging.getLogger(__name__)
77

8-
MAX_RETRIES = 25
9-
RETRY_DELAY = 0.2 # seconds
8+
# Path of the lock file that acquire_lock() opens (creating it if needed) and flock()s
# exclusively; create_or_link_shm() holds that lock while creating/attaching a segment.
LIGHTLLM_SHM_LOCK_FILE = "/tmp/lightllm_shm_creation.lock"
109

1110

12-
def create_or_link_shm(name: str, expected_size: int) -> shared_memory.SharedMemory:
13-
for _ in range(MAX_RETRIES):
14-
shm = None
11+
def acquire_lock() -> int:
    """Open the lock file and block until an exclusive ``flock`` is held on it.

    The lock file at ``LIGHTLLM_SHM_LOCK_FILE`` is created if it does not exist.

    Returns:
        The open file descriptor that holds the lock. Pass it to
        ``release_lock`` to unlock and close it.

    Raises:
        OSError: If the lock file cannot be opened or the lock cannot be taken.
    """
    lock_fd = os.open(LIGHTLLM_SHM_LOCK_FILE, os.O_CREAT | os.O_RDWR)
    try:
        fcntl.flock(lock_fd, fcntl.LOCK_EX)
    except BaseException:
        # flock() blocks, so it can fail or be interrupted (e.g. KeyboardInterrupt);
        # close the descriptor here because the caller never receives it.
        os.close(lock_fd)
        raise
    return lock_fd
15+
16+
17+
def release_lock(lock_fd: int) -> None:
    """Unlock and close a descriptor previously returned by ``acquire_lock``.

    Args:
        lock_fd: Open file descriptor currently holding the ``flock``.

    Raises:
        OSError: If unlocking or closing the descriptor fails.
    """
    try:
        fcntl.flock(lock_fd, fcntl.LOCK_UN)
    finally:
        # Always close, even if the explicit unlock failed, so the descriptor
        # is never leaked.
        os.close(lock_fd)
20+
21+
22+
def create_or_link_shm(name: str, expected_size: int) -> shared_memory.SharedMemory:
    """Create the shared memory segment ``name``, or attach to it if it exists.

    The whole operation runs between ``acquire_lock()`` and ``release_lock()``.
    If an existing segment's size differs from ``expected_size`` it is unlinked
    and recreated with the expected size.

    Args:
        name: Name of the shared memory segment.
        expected_size: Required size of the segment in bytes.

    Returns:
        A ``SharedMemory`` handle that was newly created, attached, or recreated.

    Raises:
        Exception: Any error raised while creating or attaching is logged once
            and re-raised unchanged.
    """
    lock_fd = acquire_lock()
    try:
        try:
            shm = shared_memory.SharedMemory(name=name, create=True, size=expected_size)
        except FileExistsError:
            pass  # Segment already exists; fall through and attach to it.
        else:
            logger.info(f"Created new shared memory: {name} ({expected_size=})")
            return shm

        try:
            shm = shared_memory.SharedMemory(name=name)
        except FileNotFoundError:
            # The segment was unlinked between the failed create and this attach.
            logger.warning(f"Shared memory {name} disappeared, retrying...")
            shm = shared_memory.SharedMemory(name=name, create=True, size=expected_size)
            logger.info(f"Created new shared memory: {name} ({expected_size=})")
            return shm

        # NOTE(review): the Python docs say some platforms round the segment size
        # up to a page multiple, in which case this exact comparison would always
        # mismatch -- confirm the target platforms report the requested size.
        if shm.size == expected_size:
            logger.info(f"Attached to existing shared memory: {name} ({expected_size=})")
            return shm

        logger.warning(f"Size mismatch: expected {expected_size}, got {shm.size}. Recreating {name}...")
        shm.close()
        try:
            shm.unlink()
        except FileNotFoundError:
            pass  # Already removed; recreating below is still correct.
        shm = shared_memory.SharedMemory(name=name, create=True, size=expected_size)
        logger.info(f"Recreated shared memory: {name} ({expected_size=})")
        return shm
    except Exception as e:
        logger.error(f"Unexpected error creating/attaching shared memory {name}: {e}")
        raise
    finally:
        release_lock(lock_fd)

0 commit comments

Comments
 (0)