[FIX]add shm lock (#905)

SangChengC · web-flow · commit bf7ae967ec95 · 2025-05-26T18:17:51.000+08:00
diff --git a/lightllm/server/core/objs/atomic_array_lock.py b/lightllm/server/core/objs/atomic_array_lock.py
@@ -1,3 +1,4 @@
+import asyncio
 import atomics
 from multiprocessing import shared_memory
 from lightllm.utils.log_utils import init_logger
@@ -41,18 +42,40 @@ class AtomicLockItem:
     def __init__(self, context: AtomicShmArrayLock, index: int):
         self.context = context
         self.index = index
+        self._buf = context.shm.buf[index * 4 : (index + 1) * 4]  # this index's 4-byte slot, sliced once
+
+    def try_acquire(self) -> bool:
+        with atomics.atomicview(self._buf, atype=atomics.INT) as a:
+            return a.cmpxchg_weak(0, 1)  # one 0 -> 1 swap attempt, no retry loop
+
+    def release(self):
+        with atomics.atomicview(self._buf, atype=atomics.INT) as a:
+            a.store(0)  # unconditional write of 0; the current value is not checked
 
     def __enter__(self):
-        with atomics.atomicview(
-            buffer=self.context.shm.buf[self.index * 4 : (self.index + 1) * 4], atype=atomics.INT
-        ) as a:
+        with atomics.atomicview(buffer=self._buf, atype=atomics.INT) as a:  # spins until 0 -> 1 succeeds
             while not a.cmpxchg_weak(0, 1):
                 pass
 
     def __exit__(self, exc_type, exc_val, exc_tb):
-        with atomics.atomicview(
-            buffer=self.context.shm.buf[self.index * 4 : (self.index + 1) * 4], atype=atomics.INT
-        ) as a:
+        with atomics.atomicview(buffer=self._buf, atype=atomics.INT) as a:  # spins until 1 -> 0 succeeds
             while not a.cmpxchg_weak(1, 0):
                 pass
         return False
+
+
+class AsyncLock:  # async context manager around a lock item's try_acquire()/release()
+    def __init__(self, lock_item, base_delay=0.01):
+        self._item = lock_item
+        self._base = base_delay  # passed to asyncio.sleep() between acquire attempts
+
+    async def __aenter__(self):
+        delay = self._base  # fixed poll interval; never changed inside the loop
+        while True:
+            if self._item.try_acquire():  # try to take the lock; return immediately on success
+                return
+            await asyncio.sleep(delay)  # yield to the event loop before the next attempt
+
+    async def __aexit__(self, exc_t, exc, tb):
+        self._item.release()
+        return False  # never suppress an exception raised inside the `async with` body
diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py
@@ -24,6 +24,7 @@
 from lightllm.server.core.objs import SamplingParams
 from lightllm.server.core.objs.io_objs import GroupReqObjs
 from lightllm.server.core.objs.shm_req_manager import ShmReqManager
+from lightllm.server.core.objs.atomic_array_lock import AtomicShmArrayLock, AsyncLock, AtomicLockItem
 from lightllm.server.router.dynamic_prompt.shared_arr import SharedInt
 from lightllm.utils.log_utils import init_logger
 from lightllm.server.metrics.manager import MetricClient
@@ -52,7 +53,8 @@ def __init__(
 
         self.multinode_req_manager = None
         self.nnodes = args.nnodes
-        self._resource_lock = asyncio.Lock()
+        self._shm_lock_pool = AtomicShmArrayLock("lightllm_resource_lock", 1)
+        self._resource_lock = AsyncLock(self._shm_lock_pool.get_lock_context(0))
         self.node_rank = args.node_rank
         self.transfer_lock = asyncio.Lock()  # the lock for transfer to next module in multi node mode.
         self.disable_abort = args.nnodes > 1 and args.dp == 1  # mulitnode dp=1 mode, disable abort