
Commit ae00a21

improves text-image-to-text
1 parent c6e21b6 commit ae00a21

File tree

5 files changed: +127, -7 lines changed

  _unittests/ut_tasks/test_tasks_image_text_to_text.py
  _unittests/ut_tasks/try_tasks.py
  onnx_diagnostic/helpers/helper.py
  onnx_diagnostic/tasks/image_text_to_text.py
  onnx_diagnostic/torch_models/hghub/hub_data_cached_configs.py


_unittests/ut_tasks/test_tasks_image_text_to_text.py

Lines changed: 18 additions & 1 deletion
@@ -16,7 +16,7 @@ class TestTasksImageTextToText(ExtTestCase):
     @hide_stdout()
     @requires_transformers("4.53")
     @requires_torch("2.7.99")
-    def test_image_text_to_text(self):
+    def test_image_text_to_text_idefics(self):
         mid = "HuggingFaceM4/tiny-random-idefics"
         data = get_untrained_model_with_inputs(mid, verbose=1, add_second_input=True)
         self.assertEqual(data["task"], "image-text-to-text")
@@ -29,6 +29,23 @@ def test_image_text_to_text(self):
                 model, (), kwargs=inputs, dynamic_shapes=use_dyn_not_str(ds), strict=False
             )
 
+    @hide_stdout()
+    @requires_transformers("4.53")
+    @requires_torch("2.7.99")
+    def test_image_text_to_text_gemma3(self):
+        mid = "tiny-random/gemma-3"
+        data = get_untrained_model_with_inputs(mid, verbose=1, add_second_input=True)
+        self.assertEqual(data["task"], "image-text-to-text")
+        self.assertIn((data["size"], data["n_weights"]), [(34401152, 8600288)])
+        model, inputs, ds = data["model"], data["inputs"], data["dynamic_shapes"]
+        print("--", self.string_type(data["inputs"], with_shape=True))
+        model(**torch_deepcopy(inputs))
+        model(**data["inputs2"])
+        with torch_export_patches(patch_transformers=True, verbose=10):
+            torch.export.export(
+                model, (), kwargs=inputs, dynamic_shapes=use_dyn_not_str(ds), strict=False
+            )
+
 
 if __name__ == "__main__":
     unittest.main(verbosity=2)
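
The new test mirrors the existing idefics one: it builds an untrained tiny Gemma-3 from the cached configuration added later in this commit, runs a forward pass on the generated dummy inputs, and exports the model. A minimal sketch of that flow outside the test harness is shown below; the import paths are assumed from the onnx_diagnostic package layout, and the dynamic-shapes argument is omitted for brevity.

    # Hedged sketch (assumed import paths); mirrors what test_image_text_to_text_gemma3 exercises.
    import torch
    from onnx_diagnostic.torch_models.hghub import get_untrained_model_with_inputs
    from onnx_diagnostic.torch_export_patches import torch_export_patches

    data = get_untrained_model_with_inputs("tiny-random/gemma-3", add_second_input=True)
    model, inputs = data["model"], data["inputs"]
    with torch_export_patches(patch_transformers=True):
        # strict=False, as in the unit test; dynamic shapes left out of this sketch
        ep = torch.export.export(model, (), kwargs=inputs, strict=False)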

_unittests/ut_tasks/try_tasks.py

Lines changed: 42 additions & 1 deletion
@@ -257,7 +257,7 @@ def test_text_generation_phi4_moe(self):
         print(f">>> Response\n{response}")
 
     @never_test()
-    def test_imagetext2text_generation(self):
+    def test_imagetext2text_generation_idefics(self):
         # clear&&NEVERTEST=1 python _unittests/ut_tasks/try_tasks.py -k etext2t
         # https://huggingface.co/docs/transformers/main/en/tasks/idefics
 
@@ -287,6 +287,47 @@ def test_imagetext2text_generation(self):
 
         print(generated_text[0])
 
+    @never_test()
+    def test_imagetext2text_generation_gemma3(self):
+        import torch
+        from transformers import Gemma3ForConditionalGeneration, AutoProcessor
+
+        mid = "tiny-random/gemma-3"
+        processor = AutoProcessor.from_pretrained(mid)
+        model = Gemma3ForConditionalGeneration.from_pretrained(
+            mid, torch_dtype=torch.bfloat16, device_map="auto"
+        )
+
+        messages = [
+            {
+                "role": "system",
+                "content": [{"type": "text", "text": "You are a helpful assistant."}],
+            },
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image",
+                        "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg",
+                    },
+                    {"type": "text", "text": "Describe this image in detail."},
+                ],
+            },
+        ]
+        inputs = processor.apply_chat_template(
+            messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
+        ).to(model.device, dtype=torch.bfloat16)
+        print()
+        with steal_forward(model):
+            generated_ids = model.generate(**inputs, max_new_tokens=10)
+        decoded = processor.decode(generated_ids, skip_special_tokens=True)
+
+        print(decoded[0])
+
     @never_test()
     def test_automatic_speech_recognition(self):
         # clear&&NEVERTEST=1 python _unittests/ut_tasks/try_tasks.py -k automatic_speech
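
A note on the decoding step in the new manual test: model.generate returns a (batch, sequence) tensor, and per-sequence strings are more commonly obtained with batch_decode. A hedged alternative to the last two added lines (not part of this commit, just the usual transformers pattern) would be:

    # Assumption: processor.batch_decode follows the standard transformers API and
    # returns one decoded string per generated sequence.
    texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
    print(texts[0])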

onnx_diagnostic/helpers/helper.py

Lines changed: 6 additions & 1 deletion
@@ -558,7 +558,12 @@ def string_type(
         print(f"[string_type] CACHE1:{type(obj)}")
         return f"MambaCache(conv_states={c}, ssm_states={d})"
 
-    if obj.__class__.__name__ in {"DynamicCache", "SlidingWindowCache", "StaticCache"}:
+    if obj.__class__.__name__ in {
+        "DynamicCache",
+        "SlidingWindowCache",
+        "StaticCache",
+        "HybridCache",
+    }:
         kc = string_type(
             obj.key_cache,
             with_shape=with_shape,
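
The added set entry means transformers' HybridCache, which Gemma-3 produces because its text config sets cache_implementation to "hybrid" (see the cached configuration below), is now summarized with its per-layer key/value shapes like the other cache classes. A hedged usage sketch, assuming the public string_type import and a model/inputs pair coming from get_untrained_model_with_inputs:

    from onnx_diagnostic.helpers import string_type  # assumed public import path

    outputs = model(**inputs)  # Gemma-3 forward pass returning a HybridCache
    # With this change the cache is rendered with its key/value tensor shapes
    # instead of falling back to the generic object formatter.
    print(string_type(outputs.past_key_values, with_shape=True))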

onnx_diagnostic/tasks/image_text_to_text.py

Lines changed: 3 additions & 4 deletions
@@ -56,6 +56,7 @@ def get_inputs(
         "cls_cache" not in kwargs
     ), f"Not yet implemented for cls_cache={kwargs['cls_cache']!r}."
     batch = torch.export.Dim("batch", min=1, max=1024)
+    batch_img = torch.export.Dim("batch_img", min=1, max=1024)
     seq_length = "seq_length"  # torch.export.Dim("seq_length", min=1, max=4096)
     cache_length = "cache_length"  # torch.export.Dim("cache_length", min=1, max=4096)
     images = "images"  # torch.export.Dim("images", min=1, max=4096)
@@ -74,7 +75,7 @@ def get_inputs(
             [{0: batch, 2: cache_length} for _ in range(num_hidden_layers)],
             [{0: batch, 2: cache_length} for _ in range(num_hidden_layers)],
         ],
-        "pixel_values": {0: batch, 1: images},
+        "pixel_values": {0: batch_img},
         "image_attention_mask": {0: batch, 1: seq_length, 2: images},
     }
     inputs = dict(
@@ -96,9 +97,7 @@ def get_inputs(
                 for i in range(num_hidden_layers)
             ]
         ),
-        pixel_values=torch.ones((batch_size, n_images, num_channels, width, height)).to(
-            torch.int64
-        ),
+        pixel_values=torch.randn(n_images, num_channels, width, height).clamp(-1, 1),
         image_attention_mask=torch.ones((batch_size, sequence_length2, n_images)).to(
             torch.int64
         ),
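
The dummy pixel_values input changes in two ways: it becomes a float tensor shaped (n_images, num_channels, width, height) instead of an int64 tensor with a leading batch axis, and its first dimension gets its own batch_img dynamic dimension rather than reusing the text batch. A small sketch of the resulting pairing; the concrete sizes here are placeholders, not the values used by get_inputs:

    import torch

    batch_img = torch.export.Dim("batch_img", min=1, max=1024)
    # float image batch, clamped to a plausible normalized range
    pixel_values = torch.randn(2, 3, 224, 224).clamp(-1, 1)  # (n_images, C, W, H)
    dynamic_shapes = {"pixel_values": {0: batch_img}}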

onnx_diagnostic/torch_models/hghub/hub_data_cached_configs.py

Lines changed: 58 additions & 0 deletions
@@ -4330,3 +4330,61 @@ def _ccached_diffusers_tiny_torch_full_checker_unet():
         "up_block_types": ["CrossAttnUpBlock2D", "UpBlock2D"],
         "use_linear_projection": false,
     }
+
+
+def _ccached_riny_random_gemma_3():
+    "tiny-random/gemma-3"
+    return transformers.Gemma3Config(
+        **{
+            "architectures": ["Gemma3ForConditionalGeneration"],
+            "boi_token_index": 255999,
+            "eoi_token_index": 256000,
+            "eos_token_id": [1, 106],
+            "image_token_index": 262144,
+            "initializer_range": 0.02,
+            "mm_tokens_per_image": 256,
+            "model_type": "gemma3",
+            "text_config": {
+                "attention_bias": false,
+                "attention_dropout": 0.0,
+                "attn_logit_softcapping": null,
+                "cache_implementation": "hybrid",
+                "final_logit_softcapping": null,
+                "head_dim": 32,
+                "hidden_activation": "gelu_pytorch_tanh",
+                "hidden_size": 32,
+                "initializer_range": 0.02,
+                "intermediate_size": 128,
+                "max_position_embeddings": 131072,
+                "model_type": "gemma3_text",
+                "num_attention_heads": 1,
+                "num_hidden_layers": 2,
+                "num_key_value_heads": 1,
+                "query_pre_attn_scalar": 168,
+                "rms_norm_eps": 1e-06,
+                "rope_local_base_freq": 10000.0,
+                "rope_scaling": {"factor": 8.0, "rope_type": "linear"},
+                "rope_theta": 1000000.0,
+                "sliding_window": 1024,
+                "sliding_window_pattern": 2,
+                "use_cache": true,
+                "vocab_size": 262208,
+            },
+            "torch_dtype": "bfloat16",
+            "transformers_version": "4.50.0.dev0",
+            "vision_config": {
+                "attention_dropout": 0.0,
+                "hidden_act": "gelu_pytorch_tanh",
+                "hidden_size": 32,
+                "image_size": 896,
+                "intermediate_size": 128,
+                "layer_norm_eps": 1e-06,
+                "model_type": "siglip_vision_model",
+                "num_attention_heads": 1,
+                "num_channels": 3,
+                "num_hidden_layers": 2,
+                "patch_size": 14,
+                "vision_use_head": false,
+            },
+        }
+    )
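
This cached configuration appears to be what lets get_untrained_model_with_inputs("tiny-random/gemma-3") build the model without fetching anything from the hub. A hedged sketch of using it directly, assuming a transformers version that ships Gemma-3; the parameter count should line up with the (size, n_weights) pair asserted in test_image_text_to_text_gemma3, roughly 8.6M weights:

    import transformers

    config = _ccached_riny_random_gemma_3()  # function added above
    model = transformers.Gemma3ForConditionalGeneration(config)  # untrained weights
    print(sum(p.numel() for p in model.parameters()))  # expected around 8,600,288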
