
Commit ce02b13

Merge branch 'add-qwen2-vl' of https://github.com/ModelTC/lightllm into add-qwen3-vl
2 parents f5d1d60 + fa45ff9

File tree: 6 files changed, +24 -25 lines

lightllm/models/qwen2_vl/vision_process.py

Lines changed: 2 additions & 0 deletions
@@ -80,6 +80,7 @@ def resize_image(
 class Qwen2VLImageProcessor(BaseImageProcessorFast):
     def __init__(
         self,
+        size: dict = None,
         do_resize: bool = True,
         resample: PILImageResampling = PILImageResampling.BICUBIC,
         do_rescale: bool = True,
@@ -98,6 +99,7 @@ def __init__(
         **kwargs,
     ) -> None:
         super().__init__(**kwargs)
+        self.size = size
         self.do_resize = do_resize
         self.resample = resample
         self.do_rescale = do_rescale
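The new `size` parameter is stored on the instance so downstream preprocessing can consult an explicit size configuration instead of module-level defaults. A minimal construction sketch under that assumption (the dict keys follow the usual Qwen2-VL `shortest_edge`/`longest_edge` convention and are illustrative, not taken from this commit):

    # Hypothetical usage; the key names are an assumption, not from this diff.
    processor = Qwen2VLImageProcessor(
        size={"shortest_edge": 3136, "longest_edge": 1003520},
        do_resize=True,
    )
    assert processor.size["shortest_edge"] == 3136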

lightllm/models/qwen3_vl/infer_struct.py

Lines changed: 3 additions & 3 deletions
@@ -5,8 +5,8 @@ class Qwen3VLInferStateInfo(Qwen2VLInferStateInfo):
     def __init__(self):
         super().__init__()
         self.input_ids = None
+        self.image_num_need_deepstack = 0
         self.deepstack_features = []
-        self.deepstack_end_layer = None
         self.img_start_token_ids = []
-        self.img_token_lens = []
-        self.img_start_locs = []
+        self.img_token_lens = None
+        self.img_start_locs = None
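The state contract changes here: `img_token_lens` and `img_start_locs` start as `None` and are filled with device tensors during prefill, `image_num_need_deepstack` counts images whose deepstack features are still pending, and the now-unused `deepstack_end_layer` field is dropped. A short sketch of the intended lifecycle, inferred from this diff rather than taken from lightllm's call path:

    # Freshly constructed state: tensor fields unset, counter at zero.
    state = Qwen3VLInferStateInfo()
    assert state.img_token_lens is None and state.img_start_locs is None
    assert state.image_num_need_deepstack == 0
    # After prefill (see pre_layer_infer below), both fields hold torch.long
    # tensors on the model device, one entry per image.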

lightllm/models/qwen3_vl/layer_infer/pre_layer_infer.py

Lines changed: 9 additions & 8 deletions
@@ -27,8 +27,8 @@ def context_forward(self, input_ids, infer_state: Qwen3VLInferStateInfo, layer_w

         infer_state.input_ids = input_ids
         infer_state.img_start_token_ids = []
-        infer_state.img_token_lens = []
-        infer_state.img_start_locs = []
+        img_token_lens = []
+        img_start_locs = []

         device = layer_weight.wte_weight_.device
         dtype = layer_weight.wte_weight_.dtype
@@ -42,6 +42,7 @@ def context_forward(self, input_ids, infer_state: Qwen3VLInferStateInfo, layer_w
             if img["token_id"] in infer_state.img_start_token_ids or img["_prefill_"] is False:
                 continue

+            infer_state.image_num_need_deepstack += 1
             # the shape of all_img_embed_df is
             # image_embed(token_num, hidden_dim) + deepstack(token_num*layer_num, hidden_dim)
             all_img_embed_df = bytes2tensor(read_shm(get_shm_name_embed(img["uuid"])))
@@ -58,8 +59,8 @@ def context_forward(self, input_ids, infer_state: Qwen3VLInferStateInfo, layer_w

             infer_state.deepstack_features.append(per_image_deepstack)
             infer_state.img_start_token_ids.append(img["token_id"])
-            infer_state.img_token_lens.append(img["token_num"])
-            infer_state.img_start_locs.append(img_start_loc)
+            img_token_lens.append(img["token_num"])
+            img_start_locs.append(img_start_loc)
             img_start_loc += img["token_num"]
         out = torch.zeros((len(input_ids), hidden_size), dtype=dtype, device=device)

@@ -74,17 +75,17 @@ def context_forward(self, input_ids, infer_state: Qwen3VLInferStateInfo, layer_w
         # each tp will fill the img embeds, should divide by world_size
         img_weight = img_weight / self.tp_world_size_
         img_start_token_ids = torch.Tensor(infer_state.img_start_token_ids).to(device=device, dtype=torch.long)
-        img_token_lens = torch.Tensor(infer_state.img_token_lens).to(device=device, dtype=torch.long)
-        img_start_locs = torch.Tensor(infer_state.img_start_locs).to(device=device, dtype=torch.long)
+        infer_state.img_token_lens = torch.Tensor(img_token_lens).to(device=device, dtype=torch.long)
+        infer_state.img_start_locs = torch.Tensor(img_start_locs).to(device=device, dtype=torch.long)

         multimodal_emb(
             out,
             input_ids,
             layer_weight.wte_weight_,
             img_weight,
-            img_token_lens,
+            infer_state.img_token_lens,
             img_start_token_ids,
-            img_start_locs,
+            infer_state.img_start_locs,
             self.vob_start_id_,
             self.vob_end_id_,
         )
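The pattern in this change: accumulate plain Python lists while iterating over the images, convert them to device tensors exactly once, and cache the tensors on `infer_state` so the later deepstack layers reuse them instead of rebuilding host lists every layer. A standalone sketch of that convert-once idiom (names simplified; `torch.Tensor(...).to(...)` mirrors the diff, though `torch.as_tensor` with an explicit dtype is the more idiomatic spelling):

    import torch

    # Hypothetical values standing in for the per-image bookkeeping.
    img_token_lens = [64, 128]  # token count of each image
    img_start_locs = [0, 64]    # offset of each image in the sequence
    device = "cuda" if torch.cuda.is_available() else "cpu"

    class _State:  # stand-in for Qwen3VLInferStateInfo
        pass

    infer_state = _State()
    # One host-to-device conversion, cached for all later deepstack layers.
    infer_state.img_token_lens = torch.as_tensor(img_token_lens, dtype=torch.long, device=device)
    infer_state.img_start_locs = torch.as_tensor(img_start_locs, dtype=torch.long, device=device)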

lightllm/models/qwen3_vl/qwen3_visual.py

Lines changed: 0 additions & 1 deletion
@@ -402,7 +402,6 @@ def encode(self, images: List[ImageItem]):
                     max_pixels=self.processor.max_pixels,
                 )
                 pixel_values, image_grid_thw = self.processor.preprocess(image_data)
-                print(f"pixel_values is {pixel_values}")
                 img_tensors.append(pixel_values)
                 img_grids.append(image_grid_thw)
             else:

lightllm/models/qwen3_vl/triton_kernel/deepstack_multimodal_emb.py

Lines changed: 8 additions & 11 deletions
@@ -77,8 +77,6 @@ def add_deepstack_embs(
     img_start_token_ids: torch.Tensor,
     img_start_locs: torch.Tensor,
 ):
-    print(f"deepstack_embs is {deepstack_embs}")
-
     assert input_ids.dim() == 1
     assert out.dim() == 2
     assert deepstack_embs.dim() == 2
@@ -117,8 +115,9 @@ def clear_deepstack_state(
     total_layers = len(infer_state.deepstack_features[0])
     if layer_num == total_layers:
         infer_state.img_start_token_ids = []
-        infer_state.img_token_lens = []
-        infer_state.img_start_locs = []
+        infer_state.img_token_lens = None
+        infer_state.img_start_locs = None
+        infer_state.image_num_need_deepstack = 0
         infer_state.deepstack_features = []
         return

@@ -146,27 +145,25 @@ def apply_deepstack_features(
     device = input_embeddings.device
     dtype = input_embeddings.dtype

-    if len(infer_state.img_start_token_ids) == 0:
+    if infer_state.image_num_need_deepstack == 0:
         clear_deepstack_state(layer_num, infer_state)
         return

     per_img_deepstack_features = [
         infer_state.deepstack_features[i][layer_num].to(device=device, dtype=dtype, non_blocking=True)
-        for i in range(len(infer_state.img_start_token_ids))
+        for i in range(infer_state.image_num_need_deepstack)
     ]
     all_deepstack_features = torch.cat(per_img_deepstack_features, dim=0)

-    img_start_token_ids_t = torch.as_tensor(infer_state.img_start_token_ids, device=device, dtype=input_ids.dtype)
-    img_token_lens_t = torch.as_tensor(infer_state.img_token_lens, device=device, dtype=input_ids.dtype)
-    img_start_locs_t = torch.as_tensor(infer_state.img_start_locs, device=device, dtype=input_ids.dtype)
+    img_start_token_ids_t = torch.as_tensor(infer_state.img_start_token_ids, device=device, dtype=torch.long)

     add_deepstack_embs(
         out=input_embeddings,
         input_ids=input_ids,
         deepstack_embs=all_deepstack_features,
-        img_token_lens=img_token_lens_t,
+        img_token_lens=infer_state.img_token_lens,
         img_start_token_ids=img_start_token_ids_t,
-        img_start_locs=img_start_locs_t,
+        img_start_locs=infer_state.img_start_locs,
     )

     clear_deepstack_state(layer_num, infer_state)
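`apply_deepstack_features` now keys everything off `image_num_need_deepstack` and the cached tensors: for each pending image it picks layer `layer_num`'s feature slice, concatenates them, and adds the result into the token embeddings; once the last deepstack layer has consumed its slice, `clear_deepstack_state` resets the tensors to `None` and the counter to zero. A toy walk-through of the per-layer slicing (hypothetical shapes, not lightllm's real call path):

    import torch

    # Two pending images, three deepstack layers, hidden dim 8 (all made up).
    deepstack_features = [
        [torch.randn(4, 8) for _ in range(3)],  # image 0: 4 tokens per layer
        [torch.randn(6, 8) for _ in range(3)],  # image 1: 6 tokens per layer
    ]
    num_pending = len(deepstack_features)       # image_num_need_deepstack
    for layer_num in range(3):
        per_img = [deepstack_features[i][layer_num] for i in range(num_pending)]
        layer_slice = torch.cat(per_img, dim=0)  # (4 + 6, 8), added to embeddings
    # After the final layer, the cached tensors and counter are reset
    # for the next batch.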

test/acc/test_vlm_models.py

Lines changed: 2 additions & 2 deletions
@@ -21,7 +21,7 @@
     ),
 ]
 os.environ["OPENAI_API_KEY"] = "lightllm123"
-os.environ["OPENAI_API_BASE"] = "http://localhost:8000/v1"
+os.environ["OPENAI_API_BASE"] = "http://localhost:18009/v1"


 def run_mmmu_eval(
@@ -72,4 +72,4 @@ def run_mmmu_eval(
     )


-run_mmmu_eval("Qwen/Qwen2.5-VL-7B-Instruct", "./logs")
+run_mmmu_eval("/mtc/sangchengmeng/models/Qwen3-VL-8B-Instruct/", "./logs")
