
Commit 87568cb

first step for moe
1 parent d214c04 commit 87568cb

4 files changed: +256 -2 lines changed

_unittests/ut_tasks/try_tasks.py

Lines changed: 99 additions & 2 deletions
@@ -99,8 +99,8 @@ def test_text2text_generation(self):
         print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))

     @never_test()
-    def test_text_generation_phi4(self):
-        # clear&&NEVERTEST=1 python _unittests/ut_tasks/try_tasks.py -k phi4
+    def test_text_generation_phi4_mini(self):
+        # clear&&NEVERTEST=1 python _unittests/ut_tasks/try_tasks.py -k phi4_mini

         import torch
         from transformers import RobertaTokenizer, T5ForConditionalGeneration
@@ -124,6 +124,103 @@ def test_text_generation_phi4(self):
         )
         print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))

+    @never_test()
+    def test_text_generation_phi4_moe(self):
+        # clear&&NEVERTEST=1 python _unittests/ut_tasks/try_tasks.py -k phi4_moe
+
+        import requests
+        import io
+        from PIL import Image
+        import soundfile as sf
+        from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
+        from urllib.request import urlopen
+
+        # Define model path
+        model_path = "microsoft/Phi-4-multimodal-instruct"
+
+        # Load model and processor
+        processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            device_map="cuda",
+            torch_dtype="auto",
+            trust_remote_code=True,
+            # if you do not use Ampere or later GPUs, change attention to "eager"
+            # _attn_implementation='flash_attention_2',
+            _attn_implementation="eager",
+        ).cuda()
+
+        # Load generation config
+        generation_config = GenerationConfig.from_pretrained(model_path)
+
+        # Define prompt structure
+        user_prompt = "<|user|>"
+        assistant_prompt = "<|assistant|>"
+        prompt_suffix = "<|end|>"
+
+        # Part 1: Image Processing
+        print("\n--- IMAGE PROCESSING ---")
+        image_url = "https://www.ilankelman.org/stopsigns/australia.jpg"
+        prompt = (
+            f"{user_prompt}<|image_1|>What is shown in this image"
+            f"?{prompt_suffix}{assistant_prompt}"
+        )
+        print(f">>> Prompt\n{prompt}")
+
+        # Download and open image
+        image = Image.open(requests.get(image_url, stream=True).raw)
+        inputs = processor(text=prompt, images=image, return_tensors="pt").to("cuda:0")
+
+        # Generate response
+        print("--------- IMAGE PROCESSING ----------")
+        print()
+        with steal_forward(model):
+            generate_ids = model.generate(
+                **inputs,
+                max_new_tokens=1000,
+                generation_config=generation_config,
+            )
+        generate_ids = generate_ids[:, inputs["input_ids"].shape[1] :]
+        response = processor.batch_decode(
+            generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )[0]
+        print(f">>> Response\n{response}")
+
+        # Part 2: Audio Processing
+        print("\n--- AUDIO PROCESSING ---")
+        audio_url = (
+            "https://upload.wikimedia.org/wikipedia/commons/b/b0/"
+            "Barbara_Sahakian_BBC_Radio4_The_Life_Scientific_29_May_2012_b01j5j24.flac"
+        )
+        speech_prompt = (
+            "Transcribe the audio to text, and then translate the audio to French. "
+            "Use <sep> as a separator between the original transcript and the translation."
+        )
+        prompt = f"{user_prompt}<|audio_1|>{speech_prompt}{prompt_suffix}{assistant_prompt}"
+        print(f">>> Prompt\n{prompt}")
+
+        # Download and open audio file
+        audio, samplerate = sf.read(io.BytesIO(urlopen(audio_url).read()))
+
+        # Process with the model
+        inputs = processor(text=prompt, audios=[(audio, samplerate)], return_tensors="pt").to(
+            "cuda:0"
+        )
+
+        print("--------- AUDIO PROCESSING ----------")
+        print()
+        with steal_forward(model):
+            generate_ids = model.generate(
+                **inputs,
+                max_new_tokens=1000,
+                generation_config=generation_config,
+            )
+        generate_ids = generate_ids[:, inputs["input_ids"].shape[1] :]
+        response = processor.batch_decode(
+            generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )[0]
+        print(f">>> Response\n{response}")
+
     @never_test()
     def test_imagetext2text_generation(self):
         # clear&&NEVERTEST=1 python _unittests/ut_tasks/try_tasks.py -k etext2t

onnx_diagnostic/tasks/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -4,6 +4,7 @@
     fill_mask,
     image_classification,
     image_text_to_text,
+    mixture_of_expert,
     sentence_similarity,
     text_classification,
     text_generation,
@@ -16,6 +17,7 @@
     fill_mask,
     image_classification,
     image_text_to_text,
+    mixture_of_expert,
     sentence_similarity,
     text_classification,
     text_generation,
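
A minimal sketch (not part of the commit) of what the added import exposes; it assumes only the mixture_of_expert import above and the __TASK__ constant defined in the new module shown below.

from onnx_diagnostic.tasks import mixture_of_expert

# Task name the new module declares; "MoE" is the value set in the file added below.
assert mixture_of_expert.__TASK__ == "MoE"
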
onnx_diagnostic/tasks/mixture_of_expert.py

Lines changed: 154 additions & 0 deletions
@@ -0,0 +1,154 @@
+from typing import Any, Callable, Dict, Optional, Tuple
+import torch
+from ..helpers.cache_helper import make_dynamic_cache
+from ..helpers.config_helper import update_config, check_hasattr, _pick
+
+__TASK__ = "MoE"
+
+
+def reduce_model_config(config: Any) -> Dict[str, Any]:
+    """Reduces a model size."""
+    kwargs: Dict[str, Any] = {}
+    if hasattr(config, "num_hidden_layers"):
+        config.num_hidden_layers = min(config.num_hidden_layers, 2)
+    if hasattr(config, "vision_config") and hasattr(config.vision_config, "num_hidden_layers"):
+        config.vision_config.num_hidden_layers = min(config.vision_config.num_hidden_layers, 2)
+    if hasattr(config, "audio_processor") and hasattr(
+        config.audio_processor, "num_hidden_layers"
+    ):
+        config.audio_processor.num_hidden_layers = min(
+            config.audio_processor.num_hidden_layers, 2
+        )
+    if hasattr(config, "audio_processor") and hasattr(config.audio_processor, "attention_dim"):
+        config.audio_processor.attention_dim = min(config.audio_processor.attention_dim, 2)
+    update_config(config, kwargs)
+    return kwargs
+
+
+def get_inputs(
+    model: torch.nn.Module,
+    config: Optional[Any],
+    dummy_max_token_id: int,
+    num_key_value_heads: int,
+    num_hidden_layers: int,
+    head_dim: int,
+    width: int,
+    height: int,
+    num_channels: int,
+    batch_size: int = 2,
+    sequence_length: int = 30,
+    sequence_length2: int = 3,
+    n_images: int = 2,
+    dynamic_rope: bool = False,
+    **kwargs,  # unused
+):
+    """
+    Generates inputs for task ``MoE``.
+
+    :param model: model to get the missing information
+    :param config: configuration used to generate the model
+    :param head_dim: last dimension of the cache
+    :param dummy_max_token_id: dummy max token id
+    :param batch_size: batch size
+    :param sequence_length: sequence length
+    :param sequence_length2: new sequence length
+    :param n_images: number of images
+    :param width: width of the image
+    :param height: height of the image
+    :param num_channels: number of channels
+    :param dynamic_rope: use dynamic rope (see :class:`transformers.LlamaConfig`)
+    :return: dictionary
+    """
+    batch = torch.export.Dim("batch", min=1, max=1024)
+    seq_length = "seq_length"  # torch.export.Dim("seq_length", min=1, max=4096)
+    cache_length = "cache_length"  # torch.export.Dim("cache_length", min=1, max=4096)
+    images = "images"  # torch.export.Dim("images", min=1, max=4096)
+
+    shapes = {
+        "input_ids": {0: batch, 1: seq_length},
+        "attention_mask": {
+            0: batch,
+            1: "cache+seq",  # cache_length + seq_length
+        },
+        "position_ids": {
+            0: batch,
+            1: "cache+seq",  # cache_length + seq_length
+        },
+        "past_key_values": [
+            [{0: batch, 2: cache_length} for _ in range(num_hidden_layers)],
+            [{0: batch, 2: cache_length} for _ in range(num_hidden_layers)],
+        ],
+        "pixel_values": {0: batch, 1: images},
+        "image_attention_mask": {0: batch, 1: seq_length, 2: images},
+    }
+    inputs = dict(
+        input_ids=torch.randint(0, dummy_max_token_id, (batch_size, sequence_length2)).to(
+            torch.int64
+        ),
+        attention_mask=torch.ones((batch_size, sequence_length + sequence_length2)).to(
+            torch.int64
+        ),
+        position_ids=torch.arange(sequence_length, sequence_length + sequence_length2)
+        .to(torch.int64)
+        .expand((batch_size, -1)),
+        past_key_values=make_dynamic_cache(
+            [
+                (
+                    torch.randn(batch_size, num_key_value_heads, sequence_length, head_dim),
+                    torch.randn(batch_size, num_key_value_heads, sequence_length, head_dim),
+                )
+                for i in range(num_hidden_layers)
+            ]
+        ),
+        image_attention_mask=torch.ones((batch_size, sequence_length2, n_images)).to(
+            torch.int64
+        ),
+        pixel_values=torch.ones((batch_size, n_images, num_channels, width, height)).to(
+            torch.int64
+        ),
+    )
+    return dict(inputs=inputs, dynamic_shapes=shapes)
+
+
+def random_input_kwargs(config: Any) -> Tuple[Dict[str, Any], Callable]:
+    """
+    Inputs kwargs.
+
+    If the configuration is None, the function selects typical dimensions.
+    """
+    if config is not None:
+        check_hasattr(
+            config,
+            "vocab_size",
+            "hidden_size",
+            "num_attention_heads",
+            ("num_key_value_heads", "num_attention_heads"),
+            "intermediate_size",
+            "hidden_size",
+            "vision_config",
+            "audio_processor",
+        )
+        check_hasattr(config.vision_config, "image_size", "num_channels")
+    kwargs = dict(
+        batch_size=2,
+        sequence_length=30,
+        sequence_length2=3,
+        head_dim=(
+            16
+            if config is None
+            else getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        ),
+        dummy_max_token_id=31999 if config is None else config.vocab_size - 1,
+        num_hidden_layers=4 if config is None else config.num_hidden_layers,
+        num_key_value_heads=(
+            8
+            if config is None
+            else _pick(config, "num_key_value_heads", "num_attention_heads")
+        ),
+        intermediate_size=1024 if config is None else config.intermediate_size,
+        hidden_size=512 if config is None else config.hidden_size,
+        width=224 if config is None else config.vision_config.image_size,
+        height=224 if config is None else config.vision_config.image_size,
+        num_channels=3 if config is None else config.vision_config.num_channels,
+    )
+    return kwargs, get_inputs
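
A rough usage sketch (not part of the commit) of how the two helpers above appear intended to be chained: random_input_kwargs picks typical dimensions when no configuration is given, and get_inputs turns them into dummy tensors plus dynamic-shape annotations. It assumes the new file is onnx_diagnostic/tasks/mixture_of_expert.py; passing model=None and config=None is also an assumption, based on neither argument being read in the function body.

from onnx_diagnostic.tasks.mixture_of_expert import get_inputs, random_input_kwargs

# config=None selects the typical dimensions hard-coded above (batch_size=2, head_dim=16, ...).
kwargs, input_fn = random_input_kwargs(None)
assert input_fn is get_inputs

# model and config are not used inside get_inputs, so None is passed here (assumption).
data = input_fn(model=None, config=None, **kwargs)
inputs, dynamic_shapes = data["inputs"], data["dynamic_shapes"]

# Expected from the code above: input_ids (2, 3), pixel_values (2, 2, 3, 224, 224).
print(inputs["input_ids"].shape)
print(inputs["pixel_values"].shape)
print(list(dynamic_shapes))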

onnx_diagnostic/torch_models/hghub/hub_data.py

Lines changed: 1 addition & 0 deletions
@@ -76,6 +76,7 @@
 MobileNetV2Model,image-feature-extraction
 MobileViTForImageClassification,image-classification
 ModernBertForMaskedLM,fill-mask
+Phi4MMForCausalLM,MoE
 MoonshineForConditionalGeneration,automatic-speech-recognition
 MptForCausalLM,text-generation
 MusicgenForConditionalGeneration,text-to-audio
