Skip to content

Commit 9704f2b

Browse files
committed
fix: img token replace in tokenizer
1 parent d292894 commit 9704f2b

File tree

2 files changed

+7
-14
lines changed

2 files changed

+7
-14
lines changed

lightllm/models/mineru2_qwen/model.py

Lines changed: 7 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -91,21 +91,22 @@ def get_audio_token_length(self, audio: AudioItem):
9191

9292
# only change the impl of the encode func:
9393
def encode(self, prompt, multimodal_params: MultimodalParams = None, add_special_tokens: bool = True):
94-
# TEXT<image>TEXT<image>TEXT --> TEXT<img></img>TEXT<img></img>TEXT
95-
image_tokens = IMG_START_TOKEN + IMG_END_TOKEN
9694
if multimodal_params is None:
9795
return self.tokenizer.encode(prompt, add_special_tokens=add_special_tokens)
96+
97+
# TEXT<image>TEXT<image>TEXT --> TEXT<img></img>TEXT<img></img>TEXT
98+
image_tokens = IMG_START_TOKEN + IMG_END_TOKEN
9899
image_count = len(multimodal_params.images)
99-
prompt = prompt.replace(IMG_TOKEN, image_tokens, image_count)
100+
prompt = prompt.replace(image_tokens, IMG_TOKEN, image_count)
100101

101102
origin_ids = self.tokenizer.encode(prompt, add_special_tokens=add_special_tokens)
102-
# <img></img> --> <img>id,id+1...id+num</img>
103+
# <image>
103104
input_ids = []
104105
image_id = 0
105106
start_idx = 0
106107
while True:
107108
try:
108-
start_idx = origin_ids.index(self.image_start_id, start_idx)
109+
start_idx = origin_ids.index(self.img_token_index, start_idx)
109110
if start_idx + 1 >= len(origin_ids):
110111
break
111112
if origin_ids[start_idx + 1] == self.image_end_id:
@@ -122,6 +123,7 @@ def encode(self, prompt, multimodal_params: MultimodalParams = None, add_special
122123
except ValueError:
123124
break
124125
input_ids.extend(origin_ids[start_idx:])
126+
print(f"[debug] mineru2_tokenizer input_ids={input_ids}")
125127
return input_ids
126128

127129

lightllm/server/visualserver/manager.py

Lines changed: 0 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -33,11 +33,9 @@ def __init__(
3333
visual_model_rpc_ports,
3434
):
3535
context = zmq.Context(2)
36-
# 向下一模块(路由服务器或音频服务器)发送数据的 PUSH socket
3736
self.send_to_next_module = context.socket(zmq.PUSH) # router or audio server (if --enable_multimodal_audio)
3837
self.send_to_next_module.connect(f"{args.zmq_mode}127.0.0.1:{next_module_port}")
3938

40-
# 从 HTTP 服务器接收数据的 PULL socket
4139
self.recv_from_httpserver = context.socket(zmq.PULL)
4240
self.recv_from_httpserver.bind(f"{args.zmq_mode}127.0.0.1:{visual_port}")
4341
self.cache_client = rpyc.connect("localhost", cache_port, config={"allow_pickle": True})
@@ -47,7 +45,6 @@ def __init__(
4745
self.tp_world_size = args.tp
4846
self.vit_dp = args.visual_dp
4947
self.vit_tp = args.visual_tp
50-
# visual server 的推理 batch size,默认为1
5148
self.infer_batch_size = args.visual_infer_batch_size
5249
self.trust_remote_code = args.trust_remote_code
5350
self.args = args
@@ -58,7 +55,6 @@ async def wait_to_model_ready(self):
5855

5956
self.model_rpcs: List[List[VisualModelRpcClient]] = [[] for _ in range(self.vit_dp)]
6057

61-
# 每个设备启动一个visual manager
6258
for dp_rank_id in range(self.vit_dp):
6359
tp_ports_each_dp = self.visual_model_rpc_ports[dp_rank_id]
6460
for tp_rank_id in range(self.vit_tp):
@@ -69,7 +65,6 @@ async def wait_to_model_ready(self):
6965
self.model_rpcs[dp_rank_id].append(rpc_model)
7066

7167
init_model_ret = []
72-
# 每个设备启动一个visual manager
7368
for dp_rank_id in range(self.vit_dp): # async init model process
7469
for tp_rank_id in range(self.vit_tp):
7570
kvargs = {
@@ -92,19 +87,15 @@ async def wait_to_model_ready(self):
9287
await asyncio.gather(*init_model_ret)
9388
return
9489

95-
# 对图片进行推理
9690
async def infer_imgs(self, images: List[ImageItem]):
9791
if len(images) == 0:
9892
return
9993

10094
tasks = []
101-
# 进行dp方式的推理
10295
for vit_dp_rank in range(self.vit_dp):
10396
assigned_images = [images[i] for i in range(vit_dp_rank, len(images), self.vit_dp)]
10497
if assigned_images:
105-
# 进行tp方式的推理
10698
for vit_tp_rank in range(self.vit_tp):
107-
# 调用encode函数进行相应的推理
10899
task = asyncio.create_task(self.model_rpcs[vit_dp_rank][vit_tp_rank].encode(assigned_images))
109100
tasks.append(task)
110101

0 commit comments

Comments
 (0)