Skip to content

Commit bd2df3d

Browse files
committed
[FIX] Fix deadlock when allocating resources
1 parent 2862e16 commit bd2df3d

File tree

3 files changed: 3 additions and 15 deletions.

lightllm/server/core/objs/req.py

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -200,11 +200,9 @@ def can_release(self):
200200
can_released_mark = self.can_released_mark
201201

202202
if self.is_aborted and can_released_mark and ref_count_ok:
203-
print("because of aborted, can release")
204203
return True
205204

206205
if self.finish_status.is_finished() and can_released_mark and ref_count_ok and self.out_tokens_queue.is_empty():
207-
print("because of finished, can release")
208206
return True
209207

210208
return False

lightllm/server/embed_cache/impl/naive_memory_cache.py

Lines changed: 0 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -78,10 +78,6 @@ def _clear(self):
7878
t = time.time()
7979
for id, record in items:
8080
if record.ref <= 0 or t - record.visittime >= self.expired_secs:
81-
if record.ref <= 0:
82-
logger.info(f"id {id}'s record ref is 0")
83-
if t - record.visittime >= self.expired_secs:
84-
logger.info(f"id {id}'s record expired, because of time_expired")
8581
if record.data:
8682
free_shm(get_shm_name_data(id))
8783
if record.embed:
@@ -133,7 +129,6 @@ def alloc(self, md5sum: str, token_num: int) -> dict:
133129
return {"id": record.id, "token_id": record.token_id, "token_num": record.token_num}
134130

135131
def release(self, id: int) -> None:
136-
logger.info(f"Releasing id {id}")
137132
with self.lock:
138133
self._records[id].ref -= 1
139134

lightllm/server/httpserver/manager.py

Lines changed: 3 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -142,12 +142,9 @@ async def _alloc_resource(self, item: Union[ImageItem, AudioItem]):
142142
async def _alloc_multimodal_resources(self, multimodal_params: MultimodalParams, sampling_params: SamplingParams):
143143
# 只有 P 和 NORMAL 节点需要真的管理多模态资源
144144
if self.pd_mode.is_P_or_NORMAL():
145-
# Acquire the lock so that two concurrent requests cannot both
146-
# allocate more records than the cache_capacity.
147-
# For example, if cache_capacity is 10 and each request has 6 images,
148-
# without the lock one request might allocate 5 images,
149-
# then another request allocates 5 more images, filling cache_capacity,
150-
# and both wait for space to free, causing a deadlock.
145+
# 这里的锁是为了 防止多个含有多张图片的请求 同时申请的record数量 大于cache_capacity,从而造成死锁的问题。
146+
# 如果不加任何锁,假如请求1和请求2都有6张图片,而cache_capacity为10,
147+
# 那么如果某一时刻shm中存在请求1的5张图和请求2的5张图,将会资源竞争产生死锁。
151148
async with self._resource_lock:
152149
for img in multimodal_params.images:
153150
self.tokenizer.init_imageitem_extral_params(img, multimodal_params, sampling_params)
@@ -169,7 +166,6 @@ async def _release_multimodal_resources(self, multimodal_params: MultimodalParam
169166
if multimodal_params is not None:
170167
for img in multimodal_params.images:
171168
if img.uuid is not None:
172-
logger.info(f"Releasing id {img.uuid}")
173169
self.cache_client.root.release(img.uuid)
174170
# 将 uuid 等 赋值为 None, 防止因为abort等异常情况造成重复释放异常
175171
img.uuid = None
@@ -602,7 +598,6 @@ async def recycle_resource_loop(self):
602598
release_req_status: List[ReqStatus] = []
603599
for req_status in self.req_id_to_out_inf.values():
604600
if req_status.can_release():
605-
logger.info(f"req_status {req_status.group_req_objs.group_req_id} can release")
606601
release_req_status.append(req_status)
607602
for req_status in release_req_status:
608603
self.req_id_to_out_inf.pop(req_status.group_req_objs.group_req_id, None)

0 commit comments

Comments
 (0)