
Commit ae00a21

improves text-image-to-text
1 parent c6e21b6 commit ae00a21

File tree

5 files changed: +127, -7 lines changed

  _unittests/ut_tasks/test_tasks_image_text_to_text.py
  _unittests/ut_tasks/try_tasks.py
  onnx_diagnostic/helpers/helper.py
  onnx_diagnostic/tasks/image_text_to_text.py
  onnx_diagnostic/torch_models/hghub/hub_data_cached_configs.py


_unittests/ut_tasks/test_tasks_image_text_to_text.py

Lines changed: 18 additions & 1 deletion
@@ -16,7 +16,7 @@ class TestTasksImageTextToText(ExtTestCase):
     @hide_stdout()
     @requires_transformers("4.53")
     @requires_torch("2.7.99")
-    def test_image_text_to_text(self):
+    def test_image_text_to_text_idefics(self):
         mid = "HuggingFaceM4/tiny-random-idefics"
         data = get_untrained_model_with_inputs(mid, verbose=1, add_second_input=True)
         self.assertEqual(data["task"], "image-text-to-text")
@@ -29,6 +29,23 @@ def test_image_text_to_text(self):
                 model, (), kwargs=inputs, dynamic_shapes=use_dyn_not_str(ds), strict=False
             )
 
+    @hide_stdout()
+    @requires_transformers("4.53")
+    @requires_torch("2.7.99")
+    def test_image_text_to_text_gemma3(self):
+        mid = "tiny-random/gemma-3"
+        data = get_untrained_model_with_inputs(mid, verbose=1, add_second_input=True)
+        self.assertEqual(data["task"], "image-text-to-text")
+        self.assertIn((data["size"], data["n_weights"]), [(34401152, 8600288)])
+        model, inputs, ds = data["model"], data["inputs"], data["dynamic_shapes"]
+        print("--", self.string_type(data["inputs"], with_shape=True))
+        model(**torch_deepcopy(inputs))
+        model(**data["inputs2"])
+        with torch_export_patches(patch_transformers=True, verbose=10):
+            torch.export.export(
+                model, (), kwargs=inputs, dynamic_shapes=use_dyn_not_str(ds), strict=False
+            )
+
 
 if __name__ == "__main__":
     unittest.main(verbosity=2)
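
The new test mirrors the existing idefics one: it builds an untrained tiny Gemma-3 from the cached configuration added later in this commit, runs a forward pass on the generated dummy inputs, and exports the model. A minimal sketch of that flow outside the test harness is shown below; the import paths are assumed from the onnx_diagnostic package layout, and the dynamic-shapes argument is omitted for brevity.

    # Hedged sketch (assumed import paths); mirrors what test_image_text_to_text_gemma3 exercises.
    import torch
    from onnx_diagnostic.torch_models.hghub import get_untrained_model_with_inputs
    from onnx_diagnostic.torch_export_patches import torch_export_patches

    data = get_untrained_model_with_inputs("tiny-random/gemma-3", add_second_input=True)
    model, inputs = data["model"], data["inputs"]
    with torch_export_patches(patch_transformers=True):
        # strict=False, as in the unit test; dynamic shapes left out of this sketch
        ep = torch.export.export(model, (), kwargs=inputs, strict=False)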

_unittests/ut_tasks/try_tasks.py

Lines changed: 42 additions & 1 deletion
@@ -257,7 +257,7 @@ def test_text_generation_phi4_moe(self):
         print(f">>> Response\n{response}")
 
     @never_test()
-    def test_imagetext2text_generation(self):
+    def test_imagetext2text_generation_idefics(self):
         # clear&&NEVERTEST=1 python _unittests/ut_tasks/try_tasks.py -k etext2t
         # https://huggingface.co/docs/transformers/main/en/tasks/idefics
 
@@ -287,6 +287,47 @@ def test_imagetext2text_generation(self):
 
         print(generated_text[0])
 
+    @never_test()
+    def test_imagetext2text_generation_gemma3(self):
+        import torch
+        from transformers import Gemma3ForConditionalGeneration, AutoProcessor
+
+        mid = "tiny-random/gemma-3"
+        processor = AutoProcessor.from_pretrained(mid)
+        model = Gemma3ForConditionalGeneration.from_pretrained(
+            mid, torch_dtype=torch.bfloat16, device_map="auto"
+        )
+
+        messages = [
+            {
+                "role": "system",
+                "content": [{"type": "text", "text": "You are a helpful assistant."}],
+            },
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image",
+                        "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg",
+                    },
+                    {"type": "text", "text": "Describe this image in detail."},
+                ],
+            },
+        ]
+        inputs = processor.apply_chat_template(
+            messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
+        ).to(model.device, dtype=torch.bfloat16)
+        print()
+        with steal_forward(model):
+            generated_ids = model.generate(**inputs, max_new_tokens=10)
+        decoded = processor.decode(generated_ids, skip_special_tokens=True)
+
+        print(decoded[0])
+
     @never_test()
     def test_automatic_speech_recognition(self):
         # clear&&NEVERTEST=1 python _unittests/ut_tasks/try_tasks.py -k automatic_speech
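
A note on the decoding step in the new manual test: model.generate returns a (batch, sequence) tensor, and per-sequence strings are more commonly obtained with batch_decode. A hedged alternative to the last two added lines (not part of this commit, just the usual transformers pattern) would be:

    # Assumption: processor.batch_decode follows the standard transformers API and
    # returns one decoded string per generated sequence.
    texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
    print(texts[0])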

onnx_diagnostic/helpers/helper.py

Lines changed: 6 additions & 1 deletion
@@ -558,7 +558,12 @@ def string_type(
         print(f"[string_type] CACHE1:{type(obj)}")
         return f"MambaCache(conv_states={c}, ssm_states={d})"
 
-    if obj.__class__.__name__ in {"DynamicCache", "SlidingWindowCache", "StaticCache"}:
+    if obj.__class__.__name__ in {
+        "DynamicCache",
+        "SlidingWindowCache",
+        "StaticCache",
+        "HybridCache",
+    }:
         kc = string_type(
             obj.key_cache,
             with_shape=with_shape,
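
The added set entry means transformers' HybridCache, which Gemma-3 produces because its text config sets cache_implementation to "hybrid" (see the cached configuration below), is now summarized with its per-layer key/value shapes like the other cache classes. A hedged usage sketch, assuming the public string_type import and a model/inputs pair coming from get_untrained_model_with_inputs:

    from onnx_diagnostic.helpers import string_type  # assumed public import path

    outputs = model(**inputs)  # Gemma-3 forward pass returning a HybridCache
    # With this change the cache is rendered with its key/value tensor shapes
    # instead of falling back to the generic object formatter.
    print(string_type(outputs.past_key_values, with_shape=True))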

onnx_diagnostic/tasks/image_text_to_text.py

Lines changed: 3 additions & 4 deletions
@@ -56,6 +56,7 @@ def get_inputs(
         "cls_cache" not in kwargs
     ), f"Not yet implemented for cls_cache={kwargs['cls_cache']!r}."
     batch = torch.export.Dim("batch", min=1, max=1024)
+    batch_img = torch.export.Dim("batch_img", min=1, max=1024)
     seq_length = "seq_length"  # torch.export.Dim("seq_length", min=1, max=4096)
     cache_length = "cache_length"  # torch.export.Dim("cache_length", min=1, max=4096)
     images = "images"  # torch.export.Dim("images", min=1, max=4096)
@@ -74,7 +75,7 @@ def get_inputs(
             [{0: batch, 2: cache_length} for _ in range(num_hidden_layers)],
             [{0: batch, 2: cache_length} for _ in range(num_hidden_layers)],
         ],
-        "pixel_values": {0: batch, 1: images},
+        "pixel_values": {0: batch_img},
         "image_attention_mask": {0: batch, 1: seq_length, 2: images},
     }
     inputs = dict(
@@ -96,9 +97,7 @@ def get_inputs(
                 for i in range(num_hidden_layers)
             ]
         ),
-        pixel_values=torch.ones((batch_size, n_images, num_channels, width, height)).to(
-            torch.int64
-        ),
+        pixel_values=torch.randn(n_images, num_channels, width, height).clamp(-1, 1),
         image_attention_mask=torch.ones((batch_size, sequence_length2, n_images)).to(
             torch.int64
         ),
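
The dummy pixel_values input changes in two ways: it becomes a float tensor shaped (n_images, num_channels, width, height) instead of an int64 tensor with a leading batch axis, and its first dimension gets its own batch_img dynamic dimension rather than reusing the text batch. A small sketch of the resulting pairing; the concrete sizes here are placeholders, not the values used by get_inputs:

    import torch

    batch_img = torch.export.Dim("batch_img", min=1, max=1024)
    # float image batch, clamped to a plausible normalized range
    pixel_values = torch.randn(2, 3, 224, 224).clamp(-1, 1)  # (n_images, C, W, H)
    dynamic_shapes = {"pixel_values": {0: batch_img}}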

onnx_diagnostic/torch_models/hghub/hub_data_cached_configs.py

Lines changed: 58 additions & 0 deletions
@@ -4330,3 +4330,61 @@ def _ccached_diffusers_tiny_torch_full_checker_unet():
         "up_block_types": ["CrossAttnUpBlock2D", "UpBlock2D"],
         "use_linear_projection": false,
     }
+
+
+def _ccached_riny_random_gemma_3():
+    "tiny-random/gemma-3"
+    return transformers.Gemma3Config(
+        **{
+            "architectures": ["Gemma3ForConditionalGeneration"],
+            "boi_token_index": 255999,
+            "eoi_token_index": 256000,
+            "eos_token_id": [1, 106],
+            "image_token_index": 262144,
+            "initializer_range": 0.02,
+            "mm_tokens_per_image": 256,
+            "model_type": "gemma3",
+            "text_config": {
+                "attention_bias": false,
+                "attention_dropout": 0.0,
+                "attn_logit_softcapping": null,
+                "cache_implementation": "hybrid",
+                "final_logit_softcapping": null,
+                "head_dim": 32,
+                "hidden_activation": "gelu_pytorch_tanh",
+                "hidden_size": 32,
+                "initializer_range": 0.02,
+                "intermediate_size": 128,
+                "max_position_embeddings": 131072,
+                "model_type": "gemma3_text",
+                "num_attention_heads": 1,
+                "num_hidden_layers": 2,
+                "num_key_value_heads": 1,
+                "query_pre_attn_scalar": 168,
+                "rms_norm_eps": 1e-06,
+                "rope_local_base_freq": 10000.0,
+                "rope_scaling": {"factor": 8.0, "rope_type": "linear"},
+                "rope_theta": 1000000.0,
+                "sliding_window": 1024,
+                "sliding_window_pattern": 2,
+                "use_cache": true,
+                "vocab_size": 262208,
+            },
+            "torch_dtype": "bfloat16",
+            "transformers_version": "4.50.0.dev0",
+            "vision_config": {
+                "attention_dropout": 0.0,
+                "hidden_act": "gelu_pytorch_tanh",
+                "hidden_size": 32,
+                "image_size": 896,
+                "intermediate_size": 128,
+                "layer_norm_eps": 1e-06,
+                "model_type": "siglip_vision_model",
+                "num_attention_heads": 1,
+                "num_channels": 3,
+                "num_hidden_layers": 2,
+                "patch_size": 14,
+                "vision_use_head": false,
+            },
+        }
+    )
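
This cached configuration appears to be what lets get_untrained_model_with_inputs("tiny-random/gemma-3") build the model without fetching anything from the hub. A hedged sketch of using it directly, assuming a transformers version that ships Gemma-3; the parameter count should line up with the (size, n_weights) pair asserted in test_image_text_to_text_gemma3, roughly 8.6M weights:

    import transformers

    config = _ccached_riny_random_gemma_3()  # function added above
    model = transformers.Gemma3ForConditionalGeneration(config)  # untrained weights
    print(sum(p.numel() for p in model.parameters()))  # expected around 8,600,288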
