Skip to content

Commit aa1c586

Browse files
SangChengC and sangchengmeng
authored and committed
[fix]fix rpyc in multimodal process
1 parent 780a57f commit aa1c586

File tree

8 files changed

+62
-128
lines changed

8 files changed

+62
-128
lines changed

lightllm/models/whisper/whisper_audio.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -190,8 +190,14 @@ def encode(self, audio_items: List[AudioItem]):
190190
audio_lens_after_cnn = np.array(audio_lens_after_cnn, dtype=np.int32)
191191
audio_token_num = (audio_lens_after_cnn - 2) // 2 + 1
192192

193-
for i in range(len(uuids)):
194-
if not self.cache_client.root.get_item_embed(uuids[i]):
195-
cur_embed_bytes = tensor2bytes(audios[i][: audio_token_num[i]])
196-
create_shm(get_shm_name_embed(uuids[i]), cur_embed_bytes)
197-
self.cache_client.root.set_item_embed(uuids[i])
193+
ready_audio = self.cache_client.root.get_items_data(uuids)
194+
ids_to_set = []
195+
for i, ready in enumerate(ready_audio):
196+
if ready:
197+
continue
198+
uid = uuids[i]
199+
cur_embed_bytes = tensor2bytes(audios[i][: audio_token_num[i]])
200+
create_shm(get_shm_name_data(uid), cur_embed_bytes)
201+
ids_to_set.append(uid)
202+
if ids_to_set:
203+
self.cache_client.root.set_items_data(ids=ids_to_set)

lightllm/server/audioserver/manager.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -94,8 +94,11 @@ async def loop_for_fwd(self):
9494

9595
multimodal_params = group_req_indexes.multimodal_params
9696

97-
for audio in multimodal_params.audios:
98-
if not self.cache_client.root.get_item_embed(audio.uuid):
97+
audio_uuids = [audio.uuid for audio in multimodal_params.audios]
98+
ready_audio = self.cache_client.root.get_items_embed(audio_uuids)
99+
100+
for audio, ready in zip(multimodal_params.audios, ready_audio):
101+
if not ready:
99102
audios_need_infer.append(audio)
100103

101104
if len(audios_need_infer) == self.infer_batch_size:

lightllm/server/embed_cache/impl/naive_memory_cache.py

Lines changed: 12 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -89,52 +89,12 @@ def _clear(self):
8989
if deleted >= max_delete:
9090
break
9191

92-
# def alloc(self, md5sum: str, token_num: int) -> dict:
93-
# with self.lock:
94-
# t = time.time()
95-
# # add new record
96-
# if md5sum not in self._md5_to_record:
97-
98-
# # full, need to clear some unused items
99-
# if self.occupied >= self.capacity:
100-
# self._clear()
101-
# if self.occupied >= self.capacity:
102-
# return None
103-
104-
# id = uuid.uuid1()
105-
# id = id.int
106-
# self._check_and_set_new_id_range(token_num)
107-
# record = Record(
108-
# id=id,
109-
# md5sum=md5sum,
110-
# ref=1,
111-
# data=False,
112-
# embed=False,
113-
# createtime=t,
114-
# visittime=t,
115-
# token_id=self.token_id_range_start,
116-
# token_num=token_num,
117-
# )
118-
# self.token_id_range_start += token_num
119-
# self._records[id] = record
120-
# self._md5_to_record[md5sum] = record
121-
# self.occupied += 1
122-
123-
# # cache hit
124-
# else:
125-
# record = self._md5_to_record[md5sum]
126-
# record.visittime = t
127-
# record.ref += 1
128-
129-
# return {"id": record.id, "token_id": record.token_id, "token_num": record.token_num}
130-
13192
def alloc_batch(self, md5_list: list[str], token_num_list: list[int]) -> list[dict]:
13293
results = []
13394
with self.lock:
13495
for md5, tnum in zip(md5_list, token_num_list):
13596
t = time.time()
13697
if md5 not in self._md5_to_record:
137-
# 若不存在则分配新记录(与alloc逻辑相同)
13898
if self.occupied >= self.capacity:
13999
self._clear()
140100
if self.occupied >= self.capacity:
@@ -158,34 +118,27 @@ def alloc_batch(self, md5_list: list[str], token_num_list: list[int]) -> list[di
158118
self._md5_to_record[md5] = record
159119
self.occupied += 1
160120
else:
161-
# 缓存命中,更新引用计数和访问时间
162121
record = self._md5_to_record[md5]
163122
record.visittime = t
164123
record.ref += 1
165124
results.append({"id": record.id, "token_id": record.token_id, "token_num": record.token_num})
166125
return results
167126

168-
def release(self, id: int) -> None:
127+
def release(self, ids: list[int]) -> None:
169128
with self.lock:
170-
self._records[id].ref -= 1
171-
172-
# def set_item_data(self, id: int) -> None:
173-
# self._records[id].data = True
129+
for id in ids:
130+
self._records[id].ref -= 1
174131

175-
# def get_item_data(self, id: int) -> bool:
176-
# return self._records[id].data
132+
def set_items_data(self, ids: list[int]) -> None:
133+
for id in ids:
134+
self._records[id].data = True
177135

178136
def get_items_data(self, ids: list[int]) -> list[bool]:
179-
with self.lock:
180-
return [self._records.get(i).data if i in self._records else False for i in ids]
181-
182-
def set_items_data(self, ids: list[int]) -> None:
183-
with self.lock:
184-
for i in ids:
185-
self._records[i].data = True
137+
return [self._records.get(i).data if i in self._records else False for i in ids]
186138

187-
def set_item_embed(self, id: int) -> None:
188-
self._records[id].embed = True
139+
def set_items_embed(self, ids: list[int]) -> None:
140+
for id in ids:
141+
self._records[id].embed = True
189142

190-
def get_item_embed(self, id: int) -> bool:
191-
return self._records[id].embed
143+
def get_items_embed(self, ids: list[int]) -> list[bool]:
144+
return [self._records.get(i).embed if i in self._records else False for i in ids]

lightllm/server/embed_cache/interface.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ def __init__(self) -> None:
1010
def alloc_batch(self, md5sum_list: list[str], token_num_list: list[int]) -> list[dict]:
1111
pass
1212

13-
def release(self, id: int) -> None:
13+
def release(self, ids: list[int]) -> None:
1414
pass
1515

1616
def set_items_data(self, ids: list[int]) -> None:
@@ -19,10 +19,10 @@ def set_items_data(self, ids: list[int]) -> None:
1919
def get_items_data(self, ids: list[int]) -> list[bool]:
2020
pass
2121

22-
def set_item_embed(self, id: int) -> None:
22+
def set_items_embed(self, ids: list[int]) -> None:
2323
pass
2424

25-
def get_item_embed(self, id: int) -> bool:
25+
def get_items_embed(self, ids: list[int]) -> list[bool]:
2626
pass
2727

2828

lightllm/server/embed_cache/manager.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -25,12 +25,12 @@ def on_disconnect(self, conn):
2525
def exposed_alloc_batch(self, md5sum_list: list[str], token_num_list: list[int]) -> dict:
2626
md5sum_list = obtain(md5sum_list)
2727
token_num_list = obtain(token_num_list)
28-
record = self._impl.alloc(md5sum_list, token_num_list)
28+
record = self._impl.alloc_batch(md5sum_list, token_num_list)
2929
return record
3030

31-
def exposed_release(self, id: int) -> None:
32-
id = obtain(id)
33-
return self._impl.release(id)
31+
def exposed_release(self, ids: list[int]) -> None:
32+
ids = obtain(ids)
33+
return self._impl.release(ids)
3434

3535
def exposed_set_items_data(self, ids: list[int]) -> None:
3636
ids = obtain(ids)
@@ -40,13 +40,13 @@ def exposed_get_items_data(self, ids: list[int]) -> list[bool]:
4040
ids = obtain(ids)
4141
return self._impl.get_items_data(ids=ids)
4242

43-
def exposed_set_item_embed(self, id: int) -> None:
44-
id = obtain(id)
45-
return self._impl.set_item_embed(id=id)
43+
def exposed_set_items_embed(self, ids: list[int]) -> None:
44+
ids = obtain(ids)
45+
return self._impl.set_items_embed(ids=ids)
4646

47-
def exposed_get_item_embed(self, id: int) -> bool:
48-
id = obtain(id)
49-
return self._impl.get_item_embed(id=id)
47+
def exposed_get_items_embed(self, ids: list[int]) -> list[bool]:
48+
ids = obtain(ids)
49+
return self._impl.get_items_embed(ids=ids)
5050

5151

5252
def start_cache_manager(port: int, args, pipe_writer):

lightllm/server/httpserver/manager.py

Lines changed: 6 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ def __init__(
8282

8383
self.enable_multimodal = enable_multimodal
8484
if self.enable_multimodal:
85-
self.cache_client = rpyc.connect("localhost", cache_port, onfig={"allow_pickle": True})
85+
self.cache_client = rpyc.connect("localhost", cache_port, config={"allow_pickle": True})
8686
self.send_to_visual = context.socket(zmq.PUSH)
8787
self.send_to_visual.connect(f"{args.zmq_mode}127.0.0.1:{visual_port}")
8888

@@ -114,34 +114,6 @@ def __init__(
114114
self.latest_success_infer_time_mark.set_value(int(time.time()))
115115
return
116116

117-
# connect cache server, calculate md5, alloc resource, return uuid
118-
async def _alloc_resource(self, item: Union[ImageItem, AudioItem]):
119-
if isinstance(item, ImageItem):
120-
data = item.read()
121-
# must after init_imageitem_extral_params
122-
num_tokens = self.tokenizer.get_image_token_length(item)
123-
elif isinstance(item, AudioItem):
124-
data = item.read()
125-
num_tokens = self.tokenizer.get_audio_token_length(item)
126-
else:
127-
raise ValueError(f"unexpected item type {type(item)}")
128-
129-
md5sum = hashlib.md5(data).hexdigest() + "_" + str(hash(frozendict(item.extra_params)))
130-
wait_time = 1
131-
while True:
132-
record = self.cache_client.root.alloc(md5sum, num_tokens)
133-
# hit or new
134-
if record:
135-
uid = record["id"]
136-
if not self.cache_client.root.get_item_data(uid):
137-
create_shm(get_shm_name_data(uid), data)
138-
self.cache_client.root.set_item_data(uid)
139-
return record
140-
# cache full
141-
else:
142-
await asyncio.sleep(wait_time)
143-
wait_time = min(wait_time + 2, 9)
144-
145117
async def _alloc_multimodal_resources(self, multimodal_params: MultimodalParams, sampling_params: SamplingParams):
146118
# 只有 P 和 NORMAL 节点需要真的管理多模态资源
147119
if self.pd_mode.is_P_or_NORMAL():
@@ -160,9 +132,6 @@ async def _alloc_multimodal_resources(self, multimodal_params: MultimodalParams,
160132
token_nums.append(num_tokens)
161133
datas.append(data)
162134
items.append(img)
163-
# img.uuid = record["id"]
164-
# img.token_id = record["token_id"]
165-
# img.token_num = record["token_num"]
166135
for audio in multimodal_params.audios:
167136
self.tokenizer.init_audioitem_extral_params(audio, multimodal_params, sampling_params)
168137
data = audio.read()
@@ -172,9 +141,6 @@ async def _alloc_multimodal_resources(self, multimodal_params: MultimodalParams,
172141
token_nums.append(num_tokens)
173142
datas.append(data)
174143
items.append(audio)
175-
# audio.uuid = record["id"]
176-
# audio.token_id = record["token_id"]
177-
# audio.token_num = record["token_num"]
178144
wait_time = 1
179145
while True:
180146
records = self.cache_client.root.alloc_batch(md5s, token_nums)
@@ -194,7 +160,6 @@ async def _alloc_multimodal_resources(self, multimodal_params: MultimodalParams,
194160
item.token_num = record["token_num"]
195161
if not ready:
196162
create_shm(get_shm_name_data(item.uuid), data)
197-
self.cache_client.root.set_items_data(item.uuid)
198163
uids_to_write.append(item.uuid)
199164
if uids_to_write:
200165
self.cache_client.root.set_items_data(uids_to_write)
@@ -203,20 +168,23 @@ async def _release_multimodal_resources(self, multimodal_params: MultimodalParam
203168
# 只有 P 和 NORMAL 节点需要真的管理多模态资源
204169
if self.pd_mode.is_P_or_NORMAL():
205170
if multimodal_params is not None:
171+
ids_to_release = []
206172
for img in multimodal_params.images:
207173
if img.uuid is not None:
208-
self.cache_client.root.release(img.uuid)
174+
ids_to_release.append(img.uuid)
209175
# 将 uuid 等 赋值为 None, 防止因为abort等异常情况造成重复释放异常
210176
img.uuid = None
211177
img.token_id = None
212178
img.token_num = None
213179
for audio in multimodal_params.audios:
214180
if audio.uuid is not None:
215-
self.cache_client.root.release(audio.uuid)
181+
ids_to_release.append(audio.uuid)
216182
# 将 uuid 等 赋值为 None, 防止因为abort等异常情况造成重复释放异常
217183
audio.uuid = None
218184
audio.token_id = None
219185
audio.token_num = None
186+
if ids_to_release:
187+
self.cache_client.root.release(ids_to_release)
220188
return
221189

222190
def tokens(self, prompt, multimodal_params, samping_params: SamplingParams, kwargs=None):

lightllm/server/visualserver/manager.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ def __init__(
3535

3636
self.recv_from_httpserver = context.socket(zmq.PULL)
3737
self.recv_from_httpserver.bind(f"{args.zmq_mode}127.0.0.1:{visual_port}")
38-
self.cache_client = rpyc.connect("localhost", cache_port)
38+
self.cache_client = rpyc.connect("localhost", cache_port, config={"allow_pickle": True})
3939
self.cache_port = cache_port
4040
self.waiting_reqs: List[GroupReqIndexes] = []
4141
self.model_weightdir = args.model_dir
@@ -121,11 +121,9 @@ async def loop_for_fwd(self):
121121
multimodal_params = group_req_indexes.multimodal_params
122122

123123
img_uuids = [img.uuid for img in multimodal_params.images]
124-
ready_flags = []
125-
for uuid in img_uuids:
126-
ready_flags.append(self.cache_client.root.get_items_embed(uuid))
124+
ready_image = self.cache_client.root.get_items_embed(img_uuids)
127125

128-
for img, ready in zip(multimodal_params.images, ready_flags):
126+
for img, ready in zip(multimodal_params.images, ready_image):
129127
if not ready:
130128
images_need_infer.append(img)
131129

lightllm/server/visualserver/model_infer/model_rpc.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -94,14 +94,20 @@ def exposed_encode(self, images: List[ImageItem]):
9494
images = obtain(images)
9595
all_img_embeds, uuids, valid_ids = self.forward(images)
9696
all_img_embeds = all_img_embeds.to(torch.device("cpu"))
97+
9798
if self.tp_rank_id == 0:
98-
for i in range(len(uuids)):
99+
ready_flags = self.cache_client.root.get_items_embed(uuids)
100+
ids_to_set = []
101+
for i, ready in enumerate(ready_flags):
102+
if ready:
103+
continue
99104
uid = uuids[i]
100-
if not self.cache_client.root.get_item_embed(uid):
101-
start, end = valid_ids[i]
102-
cur_embed_bytes = tensor2bytes(all_img_embeds[start:end])
103-
create_shm(get_shm_name_embed(uuids[i]), cur_embed_bytes)
104-
self.cache_client.root.set_item_embed(uuids[i])
105+
start, end = valid_ids[i]
106+
cur_embed_bytes = tensor2bytes(all_img_embeds[start:end])
107+
create_shm(get_shm_name_embed(uid), cur_embed_bytes)
108+
ids_to_set.append(uid)
109+
if ids_to_set:
110+
self.cache_client.root.set_items_embed(ids_to_set)
105111
return
106112

107113

0 commit comments

Comments (0)