@@ -99,8 +99,8 @@ def test_text2text_generation(self):
         print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))

     @never_test()
-    def test_text_generation_phi4(self):
-        # clear&&NEVERTEST=1 python _unittests/ut_tasks/try_tasks.py -k phi4
+    def test_text_generation_phi4_mini(self):
+        # clear&&NEVERTEST=1 python _unittests/ut_tasks/try_tasks.py -k phi4_mini

         import torch
         from transformers import RobertaTokenizer, T5ForConditionalGeneration
@@ -124,6 +124,107 @@ def test_text_generation_phi4(self):
         )
         print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))

+    @never_test()
+    @unittest.skip(
+        reason="AttributeError: 'Phi4MMModel' object has no attribute "
+        "'prepare_inputs_for_generation'"
+    )
+    def test_text_generation_phi4_moe(self):
+        # clear&&NEVERTEST=1 python _unittests/ut_tasks/try_tasks.py -k phi4_moe
+
+        import requests
+        import io
+        from PIL import Image
+        import soundfile as sf
+        from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
+        from urllib.request import urlopen
+
+        # Define model path
+        model_path = "microsoft/Phi-4-multimodal-instruct"
+
+        # Load model and processor
+        processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            device_map="cuda",
+            torch_dtype="auto",
+            trust_remote_code=True,
+            # if you do not use Ampere or later GPUs, change attention to "eager"
+            # _attn_implementation='flash_attention_2',
+            _attn_implementation="eager",
+        ).cuda()
+
+        # Load generation config
+        generation_config = GenerationConfig.from_pretrained(model_path)
+
+        # Define prompt structure
+        user_prompt = "<|user|>"
+        assistant_prompt = "<|assistant|>"
+        prompt_suffix = "<|end|>"
+
+        # Part 1: Image Processing
+        print("\n--- IMAGE PROCESSING ---")
+        image_url = "https://www.ilankelman.org/stopsigns/australia.jpg"
+        prompt = (
+            f"{user_prompt}<|image_1|>What is shown in this image"
+            f"?{prompt_suffix}{assistant_prompt}"
+        )
+        print(f">>> Prompt\n{prompt}")
+
+        # Download and open image
+        image = Image.open(requests.get(image_url, stream=True).raw)
+        inputs = processor(text=prompt, images=image, return_tensors="pt").to("cuda:0")
+
+        # Generate response
+        print("--------- IMAGE PROCESSING ----------")
+        print()
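+        # steal_forward (a test helper used throughout this file) is assumed to hook
+        # the model's forward calls so their inputs/outputs are dumped during generation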
+        with steal_forward(model):
+            generate_ids = model.generate(
+                **inputs,
+                max_new_tokens=1000,
+                generation_config=generation_config,
+            )
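+        # keep only the newly generated tokens, dropping the echoed prompt tokens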
+        generate_ids = generate_ids[:, inputs["input_ids"].shape[1] :]
+        response = processor.batch_decode(
+            generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )[0]
+        print(f">>> Response\n{response}")
+
+        # Part 2: Audio Processing
+        print("\n--- AUDIO PROCESSING ---")
+        audio_url = (
+            "https://upload.wikimedia.org/wikipedia/commons/b/b0/"
+            "Barbara_Sahakian_BBC_Radio4_The_Life_Scientific_29_May_2012_b01j5j24.flac"
+        )
+        speech_prompt = (
+            "Transcribe the audio to text, and then translate the audio to French. "
+            "Use <sep> as a separator between the original transcript and the translation."
+        )
+        prompt = f"{user_prompt}<|audio_1|>{speech_prompt}{prompt_suffix}{assistant_prompt}"
+        print(f">>> Prompt\n{prompt}")
+
+        # Download and open audio file
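+        # sf.read returns the waveform as a numpy array together with its sampling rate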
+        audio, samplerate = sf.read(io.BytesIO(urlopen(audio_url).read()))
+
+        # Process with the model
+        inputs = processor(text=prompt, audios=[(audio, samplerate)], return_tensors="pt").to(
+            "cuda:0"
+        )
+
+        print("--------- AUDIO PROCESSING ----------")
+        print()
+        with steal_forward(model):
+            generate_ids = model.generate(
+                **inputs,
+                max_new_tokens=1000,
+                generation_config=generation_config,
+            )
+        generate_ids = generate_ids[:, inputs["input_ids"].shape[1] :]
+        response = processor.batch_decode(
+            generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )[0]
+        print(f">>> Response\n{response}")
+
     @never_test()
     def test_imagetext2text_generation(self):
         # clear&&NEVERTEST=1 python _unittests/ut_tasks/try_tasks.py -k etext2t
@@ -237,6 +338,22 @@ def test_fill_mask(self):
         output = model(**encoded_input)
         print("-- outputs", string_type(output, with_shape=True, with_min_max=True))

+    @never_test()
+    def test_feature_extraction(self):
+        # clear&&NEVERTEST=1 python _unittests/ut_tasks/try_tasks.py -k feature_ex
+        # https://huggingface.co/facebook/bart-base
+
+        from transformers import BartTokenizer, BartModel
+
+        tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
+        model = BartModel.from_pretrained("facebook/bart-base")
+        text = "Replace me by any text you'd like."
+        encoded_input = tokenizer(text, return_tensors="pt")
+        print()
+        print("-- inputs", string_type(encoded_input, with_shape=True, with_min_max=True))
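+        # BartModel carries no task head, so the outputs are the raw encoder/decoder
+        # hidden states that serve as the extracted features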
+        output = model(**encoded_input)
+        print("-- outputs", string_type(output, with_shape=True, with_min_max=True))
+
     @never_test()
     def test_text_classification(self):
         # clear&&NEVERTEST=1 python _unittests/ut_tasks/try_tasks.py -k text_cl