Skip to content

Commit fa45ff9

Browse files
author
sangchengmeng
committed
add-qwen3-vl-1216
1 parent 0e7047d commit fa45ff9

File tree

4 files changed

+15
-10
lines changed

lightllm/models/qwen3_vl/infer_struct.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,6 @@ def __init__(self):
77
self.input_ids = None
88
self.image_num_need_deepstack = 0
99
self.deepstack_features = []
10-
self.img_start_token_ids = None
10+
self.img_start_token_ids = []
1111
self.img_token_lens = None
1212
self.img_start_locs = None

lightllm/models/qwen3_vl/layer_infer/pre_layer_infer.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ def context_forward(self, input_ids, infer_state: Qwen3VLInferStateInfo, layer_w
2626
img_start_loc = 0
2727

2828
infer_state.input_ids = input_ids
29-
img_start_token_ids = []
29+
infer_state.img_start_token_ids = []
3030
img_token_lens = []
3131
img_start_locs = []
3232

@@ -39,10 +39,10 @@ def context_forward(self, input_ids, infer_state: Qwen3VLInferStateInfo, layer_w
3939
for batch_id, p in enumerate(infer_state.multimodal_params):
4040
for img in p["images"] + p["audios"]:
4141
# skip the same image
42-
if img["token_id"] in img_start_token_ids or img["_prefill_"] is False:
42+
if img["token_id"] in infer_state.img_start_token_ids or img["_prefill_"] is False:
4343
continue
44-
infer_state.image_num_need_deepstack += 1
4544

45+
infer_state.image_num_need_deepstack += 1
4646
# all_img_embed_df的shape是
4747
# image_embed(token_num, hidden_dim) + deepstack(token_num*layer_num, hidden_dim)
4848
all_img_embed_df = bytes2tensor(read_shm(get_shm_name_embed(img["uuid"])))
@@ -58,7 +58,7 @@ def context_forward(self, input_ids, infer_state: Qwen3VLInferStateInfo, layer_w
5858
per_image_deepstack.append(all_img_embed_df[start:end])
5959

6060
infer_state.deepstack_features.append(per_image_deepstack)
61-
img_start_token_ids.append(img["token_id"])
61+
infer_state.img_start_token_ids.append(img["token_id"])
6262
img_token_lens.append(img["token_num"])
6363
img_start_locs.append(img_start_loc)
6464
img_start_loc += img["token_num"]
@@ -74,7 +74,7 @@ def context_forward(self, input_ids, infer_state: Qwen3VLInferStateInfo, layer_w
7474
)
7575
# each tp will fill the img embeds, should divide by world_size
7676
img_weight = img_weight / self.tp_world_size_
77-
infer_state.img_start_token_ids = torch.Tensor(img_start_token_ids).to(device=device, dtype=torch.long)
77+
img_start_token_ids = torch.Tensor(infer_state.img_start_token_ids).to(device=device, dtype=torch.long)
7878
infer_state.img_token_lens = torch.Tensor(img_token_lens).to(device=device, dtype=torch.long)
7979
infer_state.img_start_locs = torch.Tensor(img_start_locs).to(device=device, dtype=torch.long)
8080

@@ -84,7 +84,7 @@ def context_forward(self, input_ids, infer_state: Qwen3VLInferStateInfo, layer_w
8484
layer_weight.wte_weight_,
8585
img_weight,
8686
infer_state.img_token_lens,
87-
infer_state.img_start_token_ids,
87+
img_start_token_ids,
8888
infer_state.img_start_locs,
8989
self.vob_start_id_,
9090
self.vob_end_id_,

lightllm/models/qwen3_vl/triton_kernel/deepstack_multimodal_emb.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,9 @@ def clear_deepstack_state(
114114
if infer_state.deepstack_features:
115115
total_layers = len(infer_state.deepstack_features[0])
116116
if layer_num == total_layers:
117+
infer_state.img_start_token_ids = []
118+
infer_state.img_token_lens = None
119+
infer_state.img_start_locs = None
117120
infer_state.image_num_need_deepstack = 0
118121
infer_state.deepstack_features = []
119122
return
@@ -152,12 +155,14 @@ def apply_deepstack_features(
152155
]
153156
all_deepstack_features = torch.cat(per_img_deepstack_features, dim=0)
154157

158+
img_start_token_ids_t = torch.as_tensor(infer_state.img_start_token_ids, device=device, dtype=torch.long)
159+
155160
add_deepstack_embs(
156161
out=input_embeddings,
157162
input_ids=input_ids,
158163
deepstack_embs=all_deepstack_features,
159164
img_token_lens=infer_state.img_token_lens,
160-
img_start_token_ids=infer_state.img_start_token_ids,
165+
img_start_token_ids=img_start_token_ids_t,
161166
img_start_locs=infer_state.img_start_locs,
162167
)
163168

test/acc/test_vlm_models.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
),
2222
]
2323
os.environ["OPENAI_API_KEY"] = "lightllm123"
24-
os.environ["OPENAI_API_BASE"] = "http://localhost:8000/v1"
24+
os.environ["OPENAI_API_BASE"] = "http://localhost:18009/v1"
2525

2626

2727
def run_mmmu_eval(
@@ -72,4 +72,4 @@ def run_mmmu_eval(
7272
)
7373

7474

75-
run_mmmu_eval("Qwen/Qwen3-VL-8B-Instruct", "./logs")
75+
run_mmmu_eval("/mtc/sangchengmeng/models/Qwen3-VL-8B-Instruct/", "./logs")

0 commit comments

Comments (0)