@@ -1840,3 +1840,55 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
             batch_size, sequence_length, hidden_dim
         )
         return final_hidden_states, router_logits
+
+
+try:
+    import transformers.models.gemma3.modeling_gemma3
+
+    patch_gemma3 = True
+except ImportError:
+    patch_gemma3 = False
+
+
+if patch_gemma3:
+
+    class patched_Gemma3Model(torch.nn.Module):
+        _PATCHES_ = ["get_placeholder_mask"]
+        _PATCHED_CLASS_ = transformers.models.gemma3.modeling_gemma3.Gemma3Model
+
+        def get_placeholder_mask(
+            self,
+            input_ids: torch.LongTensor,
+            inputs_embeds: torch.FloatTensor,
+            image_features: torch.FloatTensor,
+        ):
+            if input_ids is None:
+                special_image_mask = inputs_embeds == self.get_input_embeddings()(
+                    torch.tensor(
+                        self.config.image_token_id,
+                        dtype=torch.long,
+                        device=inputs_embeds.device,
+                    )
+                )
+                special_image_mask = special_image_mask.all(-1)
+            else:
+                special_image_mask = input_ids == self.config.image_token_id
+
+            n_image_tokens = special_image_mask.sum()
+            special_image_mask = (
+                special_image_mask.unsqueeze(-1)
+                .expand_as(inputs_embeds)
+                .to(inputs_embeds.device)
+            )
+            n_image_features = image_features.shape[0] * image_features.shape[1]
+            # PATCHED: torch._check
+            # if inputs_embeds[special_image_mask].numel() != image_features.numel():
+            #     raise ValueError( ... )
+            torch._check(
+                inputs_embeds[special_image_mask].numel() == image_features.numel(),
+                lambda: (
+                    f"Image features and image tokens do not match: tokens: "
+                    f"{n_image_tokens}, features {n_image_features}"
+                ),
+            )
+            return special_image_mask
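
The substantive change in this hunk is replacing the original `if ... raise ValueError` size comparison with `torch._check`. The comparison depends on runtime data (how many image tokens appear in the input), so a plain Python branch over it typically fails under torch.export with a data-dependent guard error, whereas `torch._check` records the condition as a runtime assertion the tracer can keep. A minimal sketch of the same pattern outside the model; the function name, toy shapes, and the `masked_scatter` call are illustrative assumptions, not part of this commit:

import torch


def scatter_image_features(inputs_embeds, special_image_mask, image_features):
    # Same pattern as the patch: assert that the masked positions and the
    # provided features have the same element count via torch._check instead
    # of a Python `if`/`raise` on a data-dependent value.
    torch._check(
        inputs_embeds[special_image_mask].numel() == image_features.numel(),
        lambda: "Image features and image tokens do not match",
    )
    return inputs_embeds.masked_scatter(special_image_mask, image_features)


# Eager sanity check with toy values.
embeds = torch.zeros(1, 4, 8)
mask = torch.zeros(1, 4, 8, dtype=torch.bool)
mask[0, :2] = True              # pretend the first two tokens are image tokens
feats = torch.ones(2, 8)        # two image features of hidden size 8
print(scatter_image_features(embeds, mask, feats).shape)  # torch.Size([1, 4, 8])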
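The `_PATCHES_` and `_PATCHED_CLASS_` attributes suggest the surrounding patching machinery copies the listed methods onto the target transformers class rather than instantiating `patched_Gemma3Model` directly. A hypothetical helper doing that by plain monkeypatching; the names `apply_patch`/`undo_patch` and the restore logic are assumptions, not part of this commit:

def apply_patch(patch_cls):
    # Copy every method named in _PATCHES_ from the patch class onto the
    # class it targets, keeping the originals so they can be restored.
    target = patch_cls._PATCHED_CLASS_
    originals = {name: getattr(target, name) for name in patch_cls._PATCHES_}
    for name in patch_cls._PATCHES_:
        setattr(target, name, getattr(patch_cls, name))
    return originals


def undo_patch(patch_cls, originals):
    # Put the original methods back after export.
    for name, fn in originals.items():
        setattr(patch_cls._PATCHED_CLASS_, name, fn)


# Usage sketch:
# originals = apply_patch(patched_Gemma3Model)
# ... run torch.export / ONNX export on the Gemma3 model ...
# undo_patch(patched_Gemma3Model, originals)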