Skip to content

Commit e7c0220

Browse files
authored
add Qwen2.5-VL UT (mindspore-lab#1011)
* add UT * fix UT with image inputs * clean code * support AutoModel * align to v4.50.0 and fix batch inference * add examples and fix batch inference
1 parent a9aa3e1 commit e7c0220

File tree

8 files changed

+566
-219
lines changed

8 files changed

+566
-219
lines changed
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
import os
2+
import ssl
3+
import urllib.request
4+
from typing import Optional
5+
6+
from PIL import Image
7+
from transformers import AutoProcessor
8+
9+
import mindspore as ms
10+
import mindspore.nn as nn
11+
12+
from mindone.transformers import Qwen2_5_VLForConditionalGeneration
13+
14+
MODEL_NAME = "Qwen/Qwen2.5-VL-7B-Instruct"
15+
16+
17+
def get_image(url: str, fname: Optional[str] = None) -> Image.Image:
    """Download *url* to a local file (if not already present) and open it.

    The file is only fetched when it does not exist locally, so repeated
    runs reuse the cached copy.

    Args:
        url: HTTP(S) address of the image to fetch.
        fname: Optional local path for the downloaded file; defaults to the
            basename of *url*.

    Returns:
        The image opened with PIL (``Image.open`` loads lazily).
    """
    if fname is None:
        fname = os.path.basename(url)

    if not os.path.isfile(fname):
        # SECURITY NOTE: certificate verification is intentionally skipped
        # (demo asset hosts may sit behind interception proxies). The
        # unverified context is scoped to this single request instead of
        # monkeypatching ssl._create_default_https_context, which would
        # silently disable TLS verification for the entire process.
        ctx = ssl._create_unverified_context()
        with urllib.request.urlopen(url, context=ctx) as resp, open(fname, "wb") as f:
            f.write(resp.read())
    image = Image.open(fname)
    return image
26+
27+
28+
def main():
    """Run single-image inference with Qwen2.5-VL and print the description.

    Loads the pretrained model in bfloat16 with flash attention, downloads a
    demo image, builds a chat-template prompt around it, and generates up to
    128 new tokens.
    """
    # Skip random weight initialization: every parameter is immediately
    # overwritten by the pretrained checkpoint, so init would be wasted work.
    with nn.no_init_parameters():
        model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
            MODEL_NAME, mindspore_dtype=ms.bfloat16, attn_implementation="flash_attention_2"
        )
    processor = AutoProcessor.from_pretrained(MODEL_NAME)

    # Cache the demo image locally so the chat message can reference it by path.
    get_image(
        "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg",
        "demo.jpeg",
    )
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "url": "demo.jpeg",
                },
                {"type": "text", "text": "Describe this image."},
            ],
        }
    ]

    # The HF processor cannot emit MindSpore tensors directly, so request
    # numpy arrays and convert each entry to ms.Tensor afterwards.
    inputs = processor.apply_chat_template(
        messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="np"
    )
    for k, v in inputs.items():
        inputs[k] = ms.Tensor(v)

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    # Drop the echoed prompt tokens so only the newly generated text is decoded.
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    print(output_text)


if __name__ == "__main__":
    main()
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
import os
2+
import ssl
3+
import urllib.request
4+
from typing import Optional
5+
6+
from PIL import Image
7+
from transformers import AutoProcessor
8+
9+
import mindspore as ms
10+
import mindspore.nn as nn
11+
12+
from mindone.transformers import Qwen2_5_VLForConditionalGeneration
13+
from mindone.transformers.models.qwen2_vl.qwen_vl_utils import process_vision_info
14+
15+
MODEL_NAME = "Qwen/Qwen2.5-VL-7B-Instruct"
16+
17+
18+
def get_image(url: str, fname: Optional[str] = None) -> Image.Image:
    """Download *url* to a local file (if not already present) and open it.

    The file is only fetched when it does not exist locally, so repeated
    runs reuse the cached copy.

    Args:
        url: HTTP(S) address of the image to fetch.
        fname: Optional local path for the downloaded file; defaults to the
            basename of *url*.

    Returns:
        The image opened with PIL (``Image.open`` loads lazily).
    """
    if fname is None:
        fname = os.path.basename(url)

    if not os.path.isfile(fname):
        # SECURITY NOTE: certificate verification is intentionally skipped
        # (demo asset hosts may sit behind interception proxies). The
        # unverified context is scoped to this single request instead of
        # monkeypatching ssl._create_default_https_context, which would
        # silently disable TLS verification for the entire process.
        ctx = ssl._create_unverified_context()
        with urllib.request.urlopen(url, context=ctx) as resp, open(fname, "wb") as f:
            f.write(resp.read())
    image = Image.open(fname)
    return image
27+
28+
29+
def main():
    """Run batched two-prompt inference with Qwen2.5-VL and print both answers.

    Demonstrates batch inference: two chat conversations over the same demo
    image are padded together and generated in a single ``model.generate``
    call.
    """
    # Skip random weight initialization: every parameter is immediately
    # overwritten by the pretrained checkpoint, so init would be wasted work.
    with nn.no_init_parameters():
        model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
            MODEL_NAME, mindspore_dtype=ms.bfloat16, attn_implementation="flash_attention_2"
        )
    # Left padding keeps each prompt flush with the sequence end, so batched
    # generation continues from the real prompt rather than from pad tokens.
    processor = AutoProcessor.from_pretrained(MODEL_NAME, padding_side="left")

    # Cache the demo image locally so the chat messages can reference it by path.
    get_image(
        "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg",
        "demo.jpeg",
    )
    messages1 = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": "demo.jpeg",
                },
                {"type": "text", "text": "Describe this image."},
            ],
        }
    ]

    messages2 = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": "demo.jpeg",
                },
                {"type": "text", "text": "Is this a AI generated image?"},
            ],
        }
    ]
    # Combine messages for batch processing
    messages = [messages1, messages2]

    # Render each conversation to a prompt string; the vision helper extracts
    # image/video inputs from the messages separately from the text.
    texts = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in messages]
    image_inputs, video_inputs = process_vision_info(messages)
    # The HF processor cannot emit MindSpore tensors directly, so request
    # numpy arrays and convert each entry to ms.Tensor afterwards.
    inputs = processor(
        text=texts,
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="np",
    )
    for k, v in inputs.items():
        inputs[k] = ms.Tensor(v)

    # Batch Inference
    generated_ids = model.generate(**inputs, max_new_tokens=128)
    # Drop the echoed prompt tokens so only the newly generated text is decoded.
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    output_texts = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    print(output_texts)


if __name__ == "__main__":
    main()

mindone/transformers/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,8 @@
209209
from .models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLModel, Qwen2_5_VLPreTrainedModel
210210
from .models.qwen2_audio import Qwen2AudioEncoder, Qwen2AudioForConditionalGeneration, Qwen2AudioPreTrainedModel
211211
from .models.qwen2_vl import Qwen2VLForConditionalGeneration, Qwen2VLModel, Qwen2VLPreTrainedModel
212-
from .models.qwen3 import Qwen3ForCausalLM, Qwen3Model, Qwen3PreTrainedModel
212+
213+
# from .models.qwen3 import Qwen3ForCausalLM, Qwen3Model, Qwen3PreTrainedModel
213214
from .models.siglip import SiglipModel, SiglipPreTrainedModel, SiglipTextModel, SiglipVisionModel
214215
from .models.speecht5 import (
215216
SpeechT5ForSpeechToSpeech,

mindone/transformers/generation/logits_process.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -395,8 +395,8 @@ def __call__(
395395
if isinstance(scores, ms.Tensor):
396396
filter_value = self.filter_value if self.filter_value is not None else dtype_to_min(scores.dtype)
397397

398-
sorted_logits, sorted_indices = ops.sort(scores, descending=False)
399-
cumulative_probs = sorted_logits.softmax(axis=-1).cumsum(axis=-1)
398+
sorted_logits, sorted_indices = mint.sort(scores, descending=False)
399+
cumulative_probs = sorted_logits.softmax(axis=-1).cumsum(dim=-1)
400400

401401
# Remove tokens with cumulative top_p above the threshold (token with 0 are kept)
402402
sorted_indices_to_remove = cumulative_probs <= (1 - self.top_p)

mindone/transformers/models/__init__.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from . import (
1+
from . import ( # qwen3,
22
albert,
33
auto,
44
bart,
@@ -31,7 +31,6 @@
3131
qwen2_5_vl,
3232
qwen2_audio,
3333
qwen2_vl,
34-
qwen3,
3534
speecht5,
3635
switch_transformers,
3736
t5,

0 commit comments

Comments
 (0)